From 61d7d67c39c90a286a5ece66ae61480210955210 Mon Sep 17 00:00:00 2001
From: Zach Lowry <zach.lowry@watershedlrs.com>
Date: Wed, 23 Oct 2019 12:37:14 -0500
Subject: [PATCH 001/178] Use argument value for 'mat' in call to format for
 vector_mat and vector_mat_template

The hard-coded string value "Mat" was used in the two format strings for vector_mat and vector_mat_template, preventing UMat arguments to functions that have these types from working correctly. Notably, #12231 references this issue.
---
 modules/python/src2/hdr_parser.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/python/src2/hdr_parser.py b/modules/python/src2/hdr_parser.py
index c2b206049b..96919a9d0a 100755
--- a/modules/python/src2/hdr_parser.py
+++ b/modules/python/src2/hdr_parser.py
@@ -571,8 +571,8 @@ class CppHeaderParser(object):
                     arg_type, arg_name, modlist, argno = self.parse_arg(a, argno)
                     if self.wrap_mode:
                         # TODO: Vectors should contain UMat, but this is not very easy to support and not very needed
-                        vector_mat = "vector_{}".format("Mat")
-                        vector_mat_template = "vector<{}>".format("Mat")
+                        vector_mat = "vector_{}".format(mat)
+                        vector_mat_template = "vector<{}>".format(mat)
 
                         if arg_type == "InputArray":
                             arg_type = mat

From 0d32a24cba1d00fe3e305593c3bd7b352cb3763f Mon Sep 17 00:00:00 2001
From: lamm45 <96844552+lamm45@users.noreply.github.com>
Date: Sun, 10 Apr 2022 13:13:30 -0400
Subject: [PATCH 002/178] Fix some doc references in geometric image
 transformations module

---
 modules/imgproc/include/opencv2/imgproc.hpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index 9425ba88d5..a21c9227af 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -118,7 +118,7 @@ sophisticated [interpolation methods](http://en.wikipedia.org/wiki/Multivariate_
 where a polynomial function is fit into some neighborhood of the computed pixel \f$(f_x(x,y),
 f_y(x,y))\f$, and then the value of the polynomial at \f$(f_x(x,y), f_y(x,y))\f$ is taken as the
 interpolated pixel value. In OpenCV, you can choose between several interpolation methods. See
-resize for details.
+#resize for details.
 
 @note The geometrical transformations do not work with `CV_8S` or `CV_32S` images.
 
@@ -2265,7 +2265,7 @@ way:
     resize(src, dst, Size(), 0.5, 0.5, interpolation);
 @endcode
 To shrink an image, it will generally look best with #INTER_AREA interpolation, whereas to
-enlarge an image, it will generally look best with c#INTER_CUBIC (slow) or #INTER_LINEAR
+enlarge an image, it will generally look best with #INTER_CUBIC (slow) or #INTER_LINEAR
 (faster but still looks OK).
 
 @param src input image.
@@ -2357,7 +2357,7 @@ The function remap transforms the source image using the specified map:
 where values of pixels with non-integer coordinates are computed using one of available
 interpolation methods. \f$map_x\f$ and \f$map_y\f$ can be encoded as separate floating-point maps
 in \f$map_1\f$ and \f$map_2\f$ respectively, or interleaved floating-point maps of \f$(x,y)\f$ in
-\f$map_1\f$, or fixed-point maps created by using convertMaps. The reason you might want to
+\f$map_1\f$, or fixed-point maps created by using #convertMaps. The reason you might want to
 convert from floating to fixed-point representations of a map is that they can yield much faster
 (\~2x) remapping operations. In the converted case, \f$map_1\f$ contains pairs (cvFloor(x),
 cvFloor(y)) and \f$map_2\f$ contains indices in a table of interpolation coefficients.
@@ -2367,7 +2367,7 @@ This function cannot operate in-place.
 @param src Source image.
 @param dst Destination image. It has the same size as map1 and the same type as src .
 @param map1 The first map of either (x,y) points or just x values having the type CV_16SC2 ,
-CV_32FC1, or CV_32FC2. See convertMaps for details on converting a floating point
+CV_32FC1, or CV_32FC2. See #convertMaps for details on converting a floating point
 representation to fixed-point for speed.
 @param map2 The second map of y values having the type CV_16UC1, CV_32FC1, or none (empty map
 if map1 is (x,y) points), respectively.
@@ -2392,7 +2392,7 @@ options ( (map1.type(), map2.type()) \f$\rightarrow\f$ (dstmap1.type(), dstmap2.
 supported:
 
 - \f$\texttt{(CV_32FC1, CV_32FC1)} \rightarrow \texttt{(CV_16SC2, CV_16UC1)}\f$. This is the
-most frequently used conversion operation, in which the original floating-point maps (see remap )
+most frequently used conversion operation, in which the original floating-point maps (see #remap)
 are converted to a more compact and much faster fixed-point representation. The first output array
 contains the rounded coordinates and the second array (created only when nninterpolation=false )
 contains indices in the interpolation tables.

From 2958142e315c912fcf9fa6c0d42e9fd8d0f3a2df Mon Sep 17 00:00:00 2001
From: Qingnan Duan <duanqn_own_1@yeah.net>
Date: Mon, 18 Apr 2022 14:18:27 +0800
Subject: [PATCH 003/178] Remove extra not in doc

---
 modules/core/include/opencv2/core/mat.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp
index 919fa0b6f9..54eee9bfa8 100644
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -1030,7 +1030,7 @@ public:
     @param copyData Flag to specify whether the underlying data of the STL vector should be copied
     to (true) or shared with (false) the newly constructed matrix. When the data is copied, the
     allocated buffer is managed using Mat reference counting mechanism. While the data is shared,
-    the reference counter is NULL, and you should not deallocate the data until the matrix is not
+    the reference counter is NULL, and you should not deallocate the data until the matrix is
     destructed.
     */
     template<typename _Tp> explicit Mat(const std::vector<_Tp>& vec, bool copyData=false);

From 87e98e87888bd2089f25ca12dba0ac7a738eeea0 Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <andrey.senyaev@gmail.com>
Date: Sun, 24 Apr 2022 01:44:15 +0300
Subject: [PATCH 004/178] Fixed url for a fork

---
 .github/workflows/PR-3.4-U20.yaml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/PR-3.4-U20.yaml b/.github/workflows/PR-3.4-U20.yaml
index 8c4f0e90e2..1c4cb91b36 100644
--- a/.github/workflows/PR-3.4-U20.yaml
+++ b/.github/workflows/PR-3.4-U20.yaml
@@ -10,6 +10,7 @@ env:
   OPENCV_TEST_DATA_PATH: '/opencv_extra/testdata'
   OPENCV_DOCKER_WORKDIR: '/__w/opencv/opencv'
   PR_AUTHOR: ${{ github.event.pull_request.user.login }}
+  PR_AUTHOR_FORK: ${{ github.event.pull_request.head.repo.full_name }}
   SOURCE_BRANCH_NAME: ${{ github.head_ref }}
   TARGET_BRANCH_NAME: ${{ github.base_ref }}
   ANT_HOME: '/usr/share/ant'
@@ -43,7 +44,7 @@ jobs:
         git config --global --add safe.directory ${{ env.OPENCV_DOCKER_WORKDIR }}
         git config user.email "opencv.ci"
         git config user.name "opencv.ci"
-        git pull -v "https://github.com/${{ env.PR_AUTHOR }}/opencv" "${{ env.SOURCE_BRANCH_NAME }}"
+        git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
     - name: Clone opencv_extra
       run: git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --depth 1 https://github.com/opencv/opencv_extra.git /opencv_extra
     - name: Configure OpenCV
@@ -154,7 +155,7 @@ jobs:
         git config --global --add safe.directory ${{ env.OPENCV_DOCKER_WORKDIR }}
         git config user.email "opencv.ci"
         git config user.name "opencv.ci"
-        git pull -v "https://github.com/${{ env.PR_AUTHOR }}/opencv" "${{ env.SOURCE_BRANCH_NAME }}"
+        git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
     - name: Clone opencv_contrib
       run: git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --depth 1 https://github.com/opencv/opencv_contrib.git /opencv_contrib
     - name: Configure OpenCV Contrib

From 88123e2512e7800cac49d6821cdefa90e4eb9434 Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <andrey.senyaev@gmail.com>
Date: Sun, 24 Apr 2022 01:53:53 +0300
Subject: [PATCH 005/178] Fixed url for a fork (4.x branch)

---
 .github/workflows/PR-4.x-U20.yaml         |  5 +++--
 .github/workflows/timvx_backend_tests.yml | 17 +++++++----------
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/PR-4.x-U20.yaml b/.github/workflows/PR-4.x-U20.yaml
index 978a3a4301..a4e3e2b212 100644
--- a/.github/workflows/PR-4.x-U20.yaml
+++ b/.github/workflows/PR-4.x-U20.yaml
@@ -10,6 +10,7 @@ env:
   OPENCV_TEST_DATA_PATH: '/opencv_extra/testdata'
   OPENCV_DOCKER_WORKDIR: '/__w/opencv/opencv'
   PR_AUTHOR: ${{ github.event.pull_request.user.login }}
+  PR_AUTHOR_FORK: ${{ github.event.pull_request.head.repo.full_name }}
   SOURCE_BRANCH_NAME: ${{ github.head_ref }}
   TARGET_BRANCH_NAME: ${{ github.base_ref }}
   ANT_HOME: '/usr/share/ant'
@@ -43,7 +44,7 @@ jobs:
         git config --global --add safe.directory ${{ env.OPENCV_DOCKER_WORKDIR }}
         git config user.email "opencv.ci"
         git config user.name "opencv.ci"
-        git pull -v "https://github.com/${{ env.PR_AUTHOR }}/opencv" "${{ env.SOURCE_BRANCH_NAME }}"
+        git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
     - name: Clone opencv_extra
       run: git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --depth 1 https://github.com/opencv/opencv_extra.git /opencv_extra
     - name: Configure OpenCV
@@ -150,7 +151,7 @@ jobs:
         git config --global --add safe.directory ${{ env.OPENCV_DOCKER_WORKDIR }}
         git config user.email "opencv.ci"
         git config user.name "opencv.ci"
-        git pull -v "https://github.com/${{ env.PR_AUTHOR }}/opencv" "${{ env.SOURCE_BRANCH_NAME }}"
+        git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
     - name: Clone opencv_contrib
       run: git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --depth 1 https://github.com/opencv/opencv_contrib.git /opencv_contrib
     - name: Configure OpenCV Contrib
diff --git a/.github/workflows/timvx_backend_tests.yml b/.github/workflows/timvx_backend_tests.yml
index c45a86f8b2..2fc3ee63c2 100644
--- a/.github/workflows/timvx_backend_tests.yml
+++ b/.github/workflows/timvx_backend_tests.yml
@@ -5,16 +5,17 @@ on:
     branches: [ 4.x ]
     types: [ labeled, opened, synchronize, reopened ]
 
+env:
+  PR_AUTHOR: ${{ github.event.pull_request.user.login }}
+  PR_AUTHOR_FORK: ${{ github.event.pull_request.head.repo.full_name }}
+  SOURCE_BRANCH_NAME: ${{ github.head_ref }}
+  TARGET_BRANCH_NAME: ${{ github.base_ref }}
 
 jobs:
   x86-simulator-build-test:
     runs-on: ubuntu-20.04
     # Docker image from https://hub.docker.com/r/yuentau/ocv_ubuntu
     container: docker.io/yuentau/ocv_ubuntu:20.04
-    env:
-      PR_AUTHOR: ${{ github.event.pull_request.user.login }}
-      SOURCE_BRANCH_NAME: ${{ github.head_ref }}
-      TARGET_BRANCH_NAME: ${{ github.base_ref }}
     steps:
       - name: info
         run: |
@@ -37,7 +38,7 @@ jobs:
           cd opencv
           git config user.email "opencv.ci"
           git config user.name "opencv.ci"
-          git pull -v "https://github.com/${{ env.PR_AUTHOR }}/opencv" "${{ env.SOURCE_BRANCH_NAME }}" --allow-unrelated-histories
+          git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
       - name: configure
         run: |
           cmake -B build -DWITH_TIMVX=ON -DCMAKE_INSTALL_PREFIX=./install -DBUILD_SHARED_LIBS=ON -DBUILD_PERF_TESTS=ON -DBUILD_TESTS=ON -DBUILD_EXAMPLES=OFF -DBUILD_DOCS=OFF -DWITH_OPENCL=OFF opencv
@@ -50,10 +51,6 @@ jobs:
       group: khadas-vim3
       cancel-in-progress: false
     runs-on: [self-hosted, Linux, ARM64, khadas-vim3]
-    env:
-      PR_AUTHOR: ${{ github.event.pull_request.user.login }}
-      SOURCE_BRANCH_NAME: ${{ github.head_ref }}
-      TARGET_BRANCH_NAME: ${{ github.base_ref }}
     steps:
       - name: info
         run: |
@@ -76,7 +73,7 @@ jobs:
           cd opencv
           git config user.email "opencv.ci"
           git config user.name "opencv.ci"
-          git pull -v "https://github.com/${{ env.PR_AUTHOR }}/opencv" "${{ env.SOURCE_BRANCH_NAME }}" --allow-unrelated-histories
+          git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
       - name: fetch opencv_extra
         uses: actions/checkout@v3
         with:

From 8c42dbf71c424fbd500fa8ada772e4371d52abaf Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <andrey.senyaev@gmail.com>
Date: Tue, 26 Apr 2022 14:05:21 +0300
Subject: [PATCH 006/178] opencv_extra fork usage in Github Actions

---
 .github/workflows/PR-3.4-U20.yaml | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/PR-3.4-U20.yaml b/.github/workflows/PR-3.4-U20.yaml
index 1c4cb91b36..e87deddd9d 100644
--- a/.github/workflows/PR-3.4-U20.yaml
+++ b/.github/workflows/PR-3.4-U20.yaml
@@ -46,7 +46,19 @@ jobs:
         git config user.name "opencv.ci"
         git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
     - name: Clone opencv_extra
-      run: git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --depth 1 https://github.com/opencv/opencv_extra.git /opencv_extra
+      run: git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} https://github.com/opencv/opencv_extra.git /opencv_extra
+    - name: Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch
+      run: |
+        RET=$(git ls-remote --heads "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}") || true
+        if [[ ! -z "$RET" ]]; then
+          echo "Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch"
+          cd /opencv_extra
+          git config user.email "opencv.ci"
+          git config user.name "opencv.ci"
+          git pull -v "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}"
+        else
+          echo "No merge since ${{ env.PR_AUTHOR }}/opencv_extra does not have branch ${{ env.SOURCE_BRANCH_NAME }}"
+        fi
     - name: Configure OpenCV
       run: |
         cd /opencv-build

From ec4015d73c6d6fa1df0a535ea6350bb9ed2cd8d0 Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <andrey.senyaev@gmail.com>
Date: Tue, 26 Apr 2022 14:20:44 +0300
Subject: [PATCH 007/178] opencv_extra fork usage in Github Actions (4.x
 branch)

---
 .github/workflows/PR-4.x-U20.yaml         | 14 +++++++++++++-
 .github/workflows/timvx_backend_tests.yml |  4 ++--
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/PR-4.x-U20.yaml b/.github/workflows/PR-4.x-U20.yaml
index a4e3e2b212..ed8afddd2a 100644
--- a/.github/workflows/PR-4.x-U20.yaml
+++ b/.github/workflows/PR-4.x-U20.yaml
@@ -46,7 +46,19 @@ jobs:
         git config user.name "opencv.ci"
         git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
     - name: Clone opencv_extra
-      run: git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --depth 1 https://github.com/opencv/opencv_extra.git /opencv_extra
+      run: git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} https://github.com/opencv/opencv_extra.git /opencv_extra
+    - name: Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch
+      run: |
+        RET=$(git ls-remote --heads "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}") || true
+        if [[ ! -z "$RET" ]]; then
+          echo "Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch"
+          cd /opencv_extra
+          git config user.email "opencv.ci"
+          git config user.name "opencv.ci"
+          git pull -v "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}"
+        else
+          echo "No merge since ${{ env.PR_AUTHOR }}/opencv_extra does not have branch ${{ env.SOURCE_BRANCH_NAME }}"
+        fi
     - name: Configure OpenCV
       run: |
         cd /opencv-build
diff --git a/.github/workflows/timvx_backend_tests.yml b/.github/workflows/timvx_backend_tests.yml
index 2fc3ee63c2..5007a32755 100644
--- a/.github/workflows/timvx_backend_tests.yml
+++ b/.github/workflows/timvx_backend_tests.yml
@@ -82,12 +82,12 @@ jobs:
       - name: merge opencv_extra with test branch
         shell: bash
         run: |
-          RET=$(git ls-remote --heads "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}")
+          RET=$(git ls-remote --heads "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}") || true
           if [[ ! -z "$RET" ]]; then
             cd opencv_extra
             git config user.email "opencv.ci"
             git config user.name "opencv.ci"
-            git pull -v "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}" --allow-unrelated-histories
+            git pull -v "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}"
           else
             echo "no merge since ${{ env.PR_AUTHOR }}/opencv_extra does not have branch ${{ env.SOURCE_BRANCH_NAME }}"
           fi

From 2402fa482412f4662c63491894c14e240d1a12a3 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov <alexander.smorkalov@xperience.ai>
Date: Thu, 28 Apr 2022 12:18:26 +0300
Subject: [PATCH 008/178] Fix #21894: Wrap constructor to Python to create
 initialized cuda::BufferPool object.

---
 modules/core/include/opencv2/core/cuda.hpp |  2 +-
 modules/python/test/test_cuda.py           | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/modules/core/include/opencv2/core/cuda.hpp b/modules/core/include/opencv2/core/cuda.hpp
index 719003f21f..09220cd6b2 100644
--- a/modules/core/include/opencv2/core/cuda.hpp
+++ b/modules/core/include/opencv2/core/cuda.hpp
@@ -689,7 +689,7 @@ class CV_EXPORTS_W BufferPool
 public:
 
     //! Gets the BufferPool for the given stream.
-    explicit BufferPool(Stream& stream);
+    CV_WRAP explicit BufferPool(Stream& stream);
 
     //! Allocates a new GpuMat of given size and type.
     CV_WRAP GpuMat getBuffer(int rows, int cols, int type);
diff --git a/modules/python/test/test_cuda.py b/modules/python/test/test_cuda.py
index 4b3fc7d278..a047a8b2f4 100644
--- a/modules/python/test/test_cuda.py
+++ b/modules/python/test/test_cuda.py
@@ -45,5 +45,15 @@ class cuda_test(NewOpenCVTests):
         asyncstream = cv.cuda_Stream(1)  # cudaStreamNonBlocking
         self.assertTrue(asyncstream.cudaPtr() != 0)
 
+    def test_cuda_buffer_pool(self):
+        cv.cuda.setBufferPoolUsage(True)
+        cv.cuda.setBufferPoolConfig(cv.cuda.getDevice(), 1024 * 1024 * 64, 2)
+        stream_a = cv.cuda.Stream()
+        pool_a = cv.cuda.BufferPool(stream_a)
+        cuMat = pool_a.getBuffer(1024, 1024, cv.CV_8UC3)
+        cv.cuda.setBufferPoolUsage(False)
+        self.assertEqual(cuMat.size(), (1024, 1024))
+        self.assertEqual(cuMat.type(), cv.CV_8UC3)
+
 if __name__ == '__main__':
     NewOpenCVTests.bootstrap()

From a848eccfc6913bbca1d896a45d8121c8ab6cdd1b Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <andrey.senyaev@gmail.com>
Date: Thu, 28 Apr 2022 16:19:00 +0300
Subject: [PATCH 009/178] opencv_contrib fork usage in Github Actions (3.4
 branch)

---
 .github/workflows/PR-3.4-U20.yaml | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/PR-3.4-U20.yaml b/.github/workflows/PR-3.4-U20.yaml
index e87deddd9d..2446a07478 100644
--- a/.github/workflows/PR-3.4-U20.yaml
+++ b/.github/workflows/PR-3.4-U20.yaml
@@ -49,8 +49,8 @@ jobs:
       run: git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} https://github.com/opencv/opencv_extra.git /opencv_extra
     - name: Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch
       run: |
-        RET=$(git ls-remote --heads "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}") || true
-        if [[ ! -z "$RET" ]]; then
+        OPENCV_EXTRA_FORK=$(git ls-remote --heads "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}") || true
+        if [[ ! -z "$OPENCV_EXTRA_FORK" ]]; then
           echo "Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch"
           cd /opencv_extra
           git config user.email "opencv.ci"
@@ -169,7 +169,19 @@ jobs:
         git config user.name "opencv.ci"
         git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
     - name: Clone opencv_contrib
-      run: git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --depth 1 https://github.com/opencv/opencv_contrib.git /opencv_contrib
+      run: git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} https://github.com/opencv/opencv_contrib.git /opencv_contrib
+    - name: Merge opencv_contrib with ${{ env.SOURCE_BRANCH_NAME }} branch
+      run: |
+        OPENCV_CONTRIB_FORK=$(git ls-remote --heads "https://github.com/${{ env.PR_AUTHOR }}/opencv_contrib" "${{ env.SOURCE_BRANCH_NAME }}") || true
+        if [[ ! -z "$OPENCV_CONTRIB_FORK" ]]; then
+          echo "Merge opencv_contrib with ${{ env.SOURCE_BRANCH_NAME }} branch"
+          cd /opencv_contrib
+          git config user.email "opencv.ci"
+          git config user.name "opencv.ci"
+          git pull -v "https://github.com/${{ env.PR_AUTHOR }}/opencv_contrib" "${{ env.SOURCE_BRANCH_NAME }}"
+        else
+          echo "No merge since ${{ env.PR_AUTHOR }}/opencv_contrib does not have branch ${{ env.SOURCE_BRANCH_NAME }}"
+        fi
     - name: Configure OpenCV Contrib
       run: |
         cd /opencv-contrib-build

From 6d0f1275c2951d9889171243615a93f4d77ab63f Mon Sep 17 00:00:00 2001
From: Kataev Victor <kataev.victor.1234@gmail.com>
Date: Thu, 28 Apr 2022 21:46:52 +0300
Subject: [PATCH 010/178] Add use of R and P parameters to cv::undistortPoints
 test

---
 .../calib3d/test/test_undistort_points.cpp    | 35 +++++++++++++------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/modules/calib3d/test/test_undistort_points.cpp b/modules/calib3d/test/test_undistort_points.cpp
index 1ac18dedba..962ff03cca 100644
--- a/modules/calib3d/test/test_undistort_points.cpp
+++ b/modules/calib3d/test/test_undistort_points.cpp
@@ -14,6 +14,7 @@ protected:
     -1, 5), Point3f pmax = Point3f(1, 1, 10));
     void generateCameraMatrix(Mat& cameraMatrix);
     void generateDistCoeffs(Mat& distCoeffs, int count);
+    cv::Mat generateRotationVector();
 
     double thresh = 1.0e-2;
 };
@@ -50,6 +51,14 @@ void UndistortPointsTest::generateDistCoeffs(Mat& distCoeffs, int count)
         distCoeffs.at<double>(i,0) = theRNG().uniform(-0.1, 0.1);
 }
 
+cv::Mat UndistortPointsTest::generateRotationVector()
+{
+    Mat rvec(1, 3, CV_64F);
+    theRNG().fill(rvec, RNG::UNIFORM, -0.2, 0.2);
+
+    return rvec;
+}
+
 TEST_F(UndistortPointsTest, accuracy)
 {
     Mat intrinsics, distCoeffs;
@@ -58,29 +67,31 @@ TEST_F(UndistortPointsTest, accuracy)
     vector<Point3f> points(500);
     generate3DPointCloud(points);
 
-    vector<Point2f> projectedPoints;
-    projectedPoints.resize(points.size());
+    Mat rvec = generateRotationVector();
+    Mat R;
+    cv::Rodrigues(rvec, R);
+
 
     int modelMembersCount[] = {4,5,8};
     for (int idx = 0; idx < 3; idx++)
     {
         generateDistCoeffs(distCoeffs, modelMembersCount[idx]);
 
+        /* Project points with distortion */
+        vector<Point2f> projectedPoints;
         projectPoints(Mat(points), Mat::zeros(3,1,CV_64FC1),
                       Mat::zeros(3,1,CV_64FC1), intrinsics,
                       distCoeffs, projectedPoints);
 
+        /* Project points without distortion */
         vector<Point2f> realUndistortedPoints;
-        projectPoints(Mat(points), Mat::zeros(3,1,CV_64FC1),
+        projectPoints(Mat(points), rvec,
                       Mat::zeros(3,1,CV_64FC1), intrinsics,
                       Mat::zeros(4,1,CV_64FC1), realUndistortedPoints);
 
+        /* Undistort points */
         Mat undistortedPoints;
-        undistortPoints(Mat(projectedPoints), undistortedPoints, intrinsics, distCoeffs);
-
-        Mat p;
-        perspectiveTransform(undistortedPoints, p, intrinsics);
-        undistortedPoints = p;
+        undistortPoints(Mat(projectedPoints), undistortedPoints, intrinsics, distCoeffs, R, intrinsics);
 
         EXPECT_MAT_NEAR(realUndistortedPoints, undistortedPoints.t(), thresh);
     }
@@ -103,13 +114,17 @@ TEST_F(UndistortPointsTest, stop_criteria)
     TermCriteria criteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 100, maxError);
 
     std::vector<Point2d> pt_undist_vec;
-    undistortPoints(pt_distorted_vec, pt_undist_vec, cameraMatrix, distCoeffs, noArray(), noArray(), criteria);
+    Mat rVec = Mat(Matx31d(0.1, -0.2, 0.2));
+    Mat R;
+    cv::Rodrigues(rVec, R);
+
+    undistortPoints(pt_distorted_vec, pt_undist_vec, cameraMatrix, distCoeffs, R, noArray(), criteria);
 
     std::vector<Point3d> pt_undist_vec_homogeneous;
     pt_undist_vec_homogeneous.emplace_back(pt_undist_vec[0].x, pt_undist_vec[0].y, 1.0 );
 
     std::vector<Point2d> pt_redistorted_vec;
-    projectPoints(pt_undist_vec_homogeneous, Mat::zeros(3,1,CV_64F),
+    projectPoints(pt_undist_vec_homogeneous, -rVec,
                   Mat::zeros(3,1,CV_64F), cameraMatrix, distCoeffs, pt_redistorted_vec);
 
     const double obtainedError = sqrt( pow(pt_distorted.x - pt_redistorted_vec[0].x, 2) + pow(pt_distorted.y - pt_redistorted_vec[0].y, 2) );

From 18ada77d8a24cd1e12ceb220c8530c247cf24bd4 Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <76472231+asenyaev@users.noreply.github.com>
Date: Thu, 28 Apr 2022 22:07:07 +0300
Subject: [PATCH 011/178] Merge pull request #21888 from
 asenyaev:asen/workflow_only_windows_3.4

Added workflow for Github Actions to build and test OpenCV on Windows for 3.4 branch

* Added workflow for Github Actions to build and test OpenCV on Windows

* Updated Github Actions for 3.4 branch on Windows using self-hosted runner

* Fixed url for a fork in Windows workflow (3.4 branch)

* opencv_extra fork usage in Github Actions
---
 .github/workflows/PR-3.4-W10.yaml | 166 ++++++++++++++++++++++++++++++
 1 file changed, 166 insertions(+)
 create mode 100644 .github/workflows/PR-3.4-W10.yaml

diff --git a/.github/workflows/PR-3.4-W10.yaml b/.github/workflows/PR-3.4-W10.yaml
new file mode 100644
index 0000000000..a7e8be71a2
--- /dev/null
+++ b/.github/workflows/PR-3.4-W10.yaml
@@ -0,0 +1,166 @@
+name: PR:3.4 W10
+
+on:
+  pull_request:
+    branches:
+      - 3.4
+
+env:
+  EXTRA_CMAKE_OPTIONS: '-DCL_Z_OPTION=/Z7 -DOPENCV_DOWNLOAD_PATH=c:\Slave\workspace\binaries_cache -DBUILD_EXAMPLES=ON -DOPENCV_ENABLE_NONFREE=ON -DCMAKE_BUILD_TYPE=Release'
+  OPENCV_TEST_DATA_PATH: ${{ github.workspace }}\opencv_extra\testdata
+  PR_AUTHOR: ${{ github.event.pull_request.user.login }}
+  PR_AUTHOR_FORK: ${{ github.event.pull_request.head.repo.full_name }}
+  SOURCE_BRANCH_NAME: ${{ github.head_ref }}
+  TARGET_BRANCH_NAME: ${{ github.base_ref }}
+  GTEST_FILTER_STRING: '-Samples.findFile:ExposureCompensate.SimilarityThreshold:DNNTestNetwork.MobileNet_SSD_v2_TensorFlow*:DNNTestNetwork.Inception_v2_SSD_TensorFlow*:Test_TensorFlow_nets.Inception_v2_SSD*:Test_Darknet_nets.YOLOv4_tiny*:videoio/videocapture_acceleration.read/61:Drawing.ttf_text'
+  GIT_CACHE: c:\Slave\git_cache
+
+jobs:
+  BuildAndTest:
+    runs-on: opencv-cn-win
+    defaults:
+      run:
+        shell: cmd
+    steps:
+    - name: PR info
+      run: |
+        echo "PR Author: ${{ env.PR_AUTHOR }}"
+        echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
+        echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
+    - name: Clean
+      run: cd ${{ github.workspace }} && rm -rf *
+    - name: Fetch opencv
+      run: cd ${{ github.workspace }} && git clone --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE }}\opencv.git https://github.com/opencv/opencv.git
+    - name: Merge opencv with ${{ env.SOURCE_BRANCH_NAME }} branch
+      run: |
+        cd ${{ github.workspace }}\opencv
+        git config user.email "opencv.ci"
+        git config user.name "opencv.ci"
+        git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
+    - name: Fetch opencv_extra
+      run: cd ${{ github.workspace }} && git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE }}\opencv_extra.git https://github.com/opencv/opencv_extra.git
+    - name: Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch
+      shell: bash
+      run: |
+        RET=$(git ls-remote --heads "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}") || true
+        if [[ ! -z "$RET" ]]; then
+          echo "Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch"
+          cd opencv_extra
+          git config user.email "opencv.ci"
+          git config user.name "opencv.ci"
+          git pull -v "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}"
+        else
+          echo "No merge since ${{ env.PR_AUTHOR }}/opencv_extra does not have branch ${{ env.SOURCE_BRANCH_NAME }}"
+        fi
+    - name: Configure OpenCV
+      run: |
+        mkdir ${{ github.workspace }}\opencv-build && cd ${{ github.workspace }}\opencv-build
+        call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat"
+        cmake -G Ninja ${{ env.EXTRA_CMAKE_OPTIONS }} ${{ github.workspace }}\opencv
+    - name: Build OpenCV
+      run: |
+        cd ${{ github.workspace }}\opencv-build
+        call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat"
+        ninja
+    - name: Accuracy:calib3d
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_calib3d.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Accuracy:core
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_core.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Accuracy:dnn
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_dnn.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Accuracy:features2d
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_features2d.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Accuracy:flann
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_flann.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Accuracy:highgui
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_highgui.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Accuracy:imgcodecs
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_imgcodecs.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Accuracy:imgproc
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_imgproc.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Accuracy:ml
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_ml.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Accuracy:objdetect
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_objdetect.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Accuracy:photo
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_photo.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Accuracy:shape
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_shape.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Accuracy:stitching
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_stitching.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Accuracy:superres
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_superres.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Accuracy:video
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_video.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Accuracy:videoio
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_videoio.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Accuracy:videostab
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_videostab.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Performance:calib3d
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_calib3d.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Performance:core
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_core.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Performance:dnn
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_dnn.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Performance:features2d
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_features2d.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Performance:imgcodecs
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_imgcodecs.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Performance:imgproc
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_imgproc.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Performance:objdetect
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_objdetect.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Performance:photo
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_photo.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Performance:stitching
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_stitching.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Performance:superres
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_superres.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Performance:video
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_video.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Performance:videoio
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_videoio.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Python3
+      run: |
+        cd ${{ github.workspace }}\opencv\modules\python\test
+        set PYTHONPATH=%PYTHONPATH%;${{ github.workspace }}\opencv-build\python_loader;${{ github.workspace }}\opencv-build\lib\python3
+        set PATH=%PATH%;${{ github.workspace }}\opencv-build\bin;${{ github.workspace }}\opencv-build\lib\python3
+        python test.py --repo ..\..\..\ -v
+    - name: Java
+      run: |
+        cd ${{ github.workspace }}\opencv-build
+        set PATH=%PATH%;${{ github.workspace }}\opencv-build\bin
+        ${{ github.workspace }}\opencv\modules\ts\misc\run.py . -a -t java
+  BuildContrib:
+    runs-on: opencv-cn-win
+    defaults:
+      run:
+        shell: cmd
+    steps:
+    - name: PR info
+      run: |
+        echo "PR Author: ${{ env.PR_AUTHOR }}"
+        echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
+        echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
+    - name: Clean
+      run: cd ${{ github.workspace }} && rm -rf *
+    - name: Fetch opencv
+      run: cd ${{ github.workspace }} && git clone --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE }}\opencv.git https://github.com/opencv/opencv.git
+    - name: Merge opencv with ${{ env.SOURCE_BRANCH_NAME }} branch
+      run: |
+        cd ${{ github.workspace }}\opencv
+        git config user.email "opencv.ci"
+        git config user.name "opencv.ci"
+        git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
+    - name: Fetch opencv_contrib
+      run: cd ${{ github.workspace }} && git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE }}\opencv_contrib.git --depth 1 https://github.com/opencv/opencv_contrib.git
+    - name: Configure OpenCV Contrib
+      run: |
+        mkdir ${{ github.workspace }}\opencv-contrib-build && cd ${{ github.workspace }}\opencv-contrib-build
+        call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat"
+        cmake -G Ninja ${{ env.EXTRA_CMAKE_OPTIONS }} -DOPENCV_EXTRA_MODULES_PATH=${{ github.workspace }}\opencv_contrib\modules ${{ github.workspace }}\opencv
+    - name: Build OpenCV Contrib
+      run: |
+        cd ${{ github.workspace }}\opencv-contrib-build
+        call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat"
+        ninja

From 64ded50bbf28c0a37a2c0695ae87b0fc08014403 Mon Sep 17 00:00:00 2001
From: Zihao Mu <zihaomu@outlook.com>
Date: Fri, 29 Apr 2022 10:17:02 +0800
Subject: [PATCH 012/178] parsing depth2space and space2depth of ONNX importer

---
 modules/dnn/src/onnx/onnx_importer.cpp        | 81 +++++++++++++++++++
 ..._conformance_layer_parser_denylist.inl.hpp |  6 --
 2 files changed, 81 insertions(+), 6 deletions(-)

diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index 2a440a1284..00160613d1 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -175,6 +175,7 @@ private:
     void parseSoftMax              (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseDetectionOutput      (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseCumSum               (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseDepthToSpace         (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseSimpleLayers         (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
 
     // Domain: com.microsoft
@@ -3183,6 +3184,85 @@ void ONNXImporter::parseCumSum(LayerParams& layerParams, const opencv_onnx::Node
     addLayer(layerParams, node_proto);
 }
 
+void ONNXImporter::parseDepthToSpace(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
+{
+    // We parse "DepthToSpace" and "SpaceToDepth" in this function.
+    opencv_onnx::NodeProto node_proto = node_proto_;
+    const std::string& layer_type = node_proto.op_type();
+    CV_Assert(layer_type == "DepthToSpace" || layer_type == "SpaceToDepth");
+
+    // Get blocksize
+    CV_Assert(layerParams.has("blocksize"));
+    int blocksize = layerParams.get<int>("blocksize");
+    CV_Assert(blocksize > 0);
+
+    // Get mode, only for "DepthToSpace"
+    std::string modeType = layerParams.get<std::string>("mode", "DCR");
+
+    MatShape inpShape = outShapes[node_proto.input(0)];
+    CV_Assert(inpShape.size() == 4);
+    int N = inpShape[0], C = inpShape[1], H = inpShape[2], W = inpShape[3];
+
+    // Implement DepthToSpace and SpaceToDepth by the Reshape and Permute layer.
+    std::array<int, 6> shape0, perm;
+    std::array<int, 4> shape1;
+
+    if (layer_type == "DepthToSpace")
+    {
+        if (modeType == "DCR")
+        {
+            shape0 = {N, blocksize, blocksize, C/(blocksize * blocksize), H, W};
+            perm = {0, 3, 4, 1, 5, 2};
+            shape1 = {N, C/(blocksize * blocksize), H * blocksize, W * blocksize};
+        }
+        else if (modeType == "CRD")
+        {
+            shape0 = {N, C/(blocksize * blocksize), blocksize, blocksize, H, W};
+            perm = {0, 1, 4, 2, 5, 3};
+            shape1 = {N, C/(blocksize * blocksize), H * blocksize, W * blocksize};
+        }
+        else
+            CV_Error(Error::StsNotImplemented, "The mode of " + modeType + " in " + layer_type + " Layer is not supported");
+    }
+    else // SpaceToDepth
+    {
+        shape0 = {N, C, H/blocksize, blocksize, W/blocksize, blocksize};
+        perm = {0, 3, 5, 1, 2, 4};
+        shape1 = {N, C * blocksize * blocksize, H/blocksize, W/blocksize};
+    }
+
+    // Step1: Reshape
+    LayerParams reshapeLp;
+    reshapeLp.name = layerParams.name + "/reshape";
+    reshapeLp.type = "Reshape";
+    CV_Assert(layer_id.find(reshapeLp.name) == layer_id.end());
+    reshapeLp.set("dim", DictValue::arrayInt(shape0.data(), shape0.size()));
+
+    opencv_onnx::NodeProto protoReshape;
+    protoReshape.add_input(node_proto.input(0));
+    protoReshape.add_output(reshapeLp.name);
+    addLayer(reshapeLp, protoReshape);
+
+    // Step2: Transpose
+    LayerParams permuteLp;
+    permuteLp.name = layerParams.name + "/permute";
+    permuteLp.type = "Permute";
+    CV_Assert(layer_id.find(permuteLp.name) == layer_id.end());
+    permuteLp.set("order", DictValue::arrayInt(perm.data(), perm.size()));
+
+    opencv_onnx::NodeProto protoPermute;
+    protoPermute.add_input(reshapeLp.name);
+    protoPermute.add_output(permuteLp.name);
+    addLayer(permuteLp, protoPermute);
+
+    // Step3: Reshape
+    layerParams.type = "Reshape";
+    layerParams.set("dim", DictValue::arrayInt(shape1.data(), shape1.size()));
+
+    node_proto.set_input(0, permuteLp.name);
+    addLayer(layerParams, node_proto);
+}
+
 void ONNXImporter::parseSimpleLayers(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
 {
     for (int j = 0; j < node_proto.input_size(); j++) {
@@ -3672,6 +3752,7 @@ void ONNXImporter::buildDispatchMap_ONNX_AI(int opset_version)
     dispatch["SoftMax"] = dispatch["LogSoftmax"] = &ONNXImporter::parseSoftMax;
     dispatch["DetectionOutput"] = &ONNXImporter::parseDetectionOutput;
     dispatch["CumSum"] = &ONNXImporter::parseCumSum;
+    dispatch["SpaceToDepth"] = dispatch["DepthToSpace"] = &ONNXImporter::parseDepthToSpace;
 
     std::vector<std::string> simpleLayers{"Acos", "Acosh", "Asin", "Asinh", "Atan", "Atanh", "Ceil", "Celu", "Cos",
                                           "Cosh", "Dropout", "Erf", "Exp", "Floor", "HardSigmoid", "HardSwish",
diff --git a/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp
index 49db810c7f..1437e5475b 100644
--- a/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp
+++ b/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp
@@ -96,10 +96,6 @@
 "test_cumsum_2d_axis_0",
 "test_cumsum_2d_axis_1",
 "test_cumsum_2d_negative_axis",
-"test_depthtospace_crd_mode",
-"test_depthtospace_crd_mode_example",
-"test_depthtospace_dcr_mode",
-"test_depthtospace_example",
 "test_dequantizelinear",
 "test_dequantizelinear_axis",
 "test_det_2d",
@@ -490,8 +486,6 @@
 "test_slice_neg_steps",
 "test_slice_negative_axes",
 "test_slice_start_out_of_bounds",
-"test_spacetodepth",
-"test_spacetodepth_example",
 "test_split_variable_parts_1d",
 "test_split_variable_parts_2d",
 "test_split_variable_parts_default_axis",

From ca2ab3387f2ba248b2851c4e33195b422b0b4b94 Mon Sep 17 00:00:00 2001
From: LaurentBerger <laurent.berger@univ-lemans.fr>
Date: Tue, 12 Apr 2022 13:48:33 +0200
Subject: [PATCH 013/178] Try to solve I21853 mouse wheel problem with GTK3

---
 modules/highgui/src/window_gtk.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/highgui/src/window_gtk.cpp b/modules/highgui/src/window_gtk.cpp
index d8125a67ae..5583ab358b 100644
--- a/modules/highgui/src/window_gtk.cpp
+++ b/modules/highgui/src/window_gtk.cpp
@@ -1831,7 +1831,7 @@ static gboolean icvOnMouse( GtkWidget *widget, GdkEvent *event, gpointer user_da
 #if defined(GTK_VERSION3_4)
         // NOTE: in current implementation doesn't possible to put into callback function delta_x and delta_y separately
         double delta = (event->scroll.delta_x + event->scroll.delta_y);
-        cv_event   = (event->scroll.delta_y!=0) ? CV_EVENT_MOUSEHWHEEL : CV_EVENT_MOUSEWHEEL;
+        cv_event   = (event->scroll.delta_x==0) ? CV_EVENT_MOUSEWHEEL : CV_EVENT_MOUSEHWHEEL;
 #else
         cv_event = CV_EVENT_MOUSEWHEEL;
 #endif //GTK_VERSION3_4

From 1cdd8510fd9d3957df4fa513a1fbc2df32bf0346 Mon Sep 17 00:00:00 2001
From: Yulv-git <yulvchi@qq.com>
Date: Sat, 30 Apr 2022 13:29:25 +0800
Subject: [PATCH 014/178] Fix some typos in doc.

---
 3rdparty/openvx/README.md                                     | 2 +-
 .../js_morphological_ops/js_morphological_ops.markdown        | 2 +-
 doc/js_tutorials/js_setup/js_nodejs/js_nodejs.markdown        | 2 +-
 .../py_feature_homography/py_feature_homography.markdown      | 4 ++--
 .../py_morphological_ops/py_morphological_ops.markdown        | 2 +-
 .../imgproc/morph_lines_detection/morph_lines_detection.md    | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/3rdparty/openvx/README.md b/3rdparty/openvx/README.md
index 2f869a42bf..339b5072cc 100644
--- a/3rdparty/openvx/README.md
+++ b/3rdparty/openvx/README.md
@@ -77,7 +77,7 @@ E.g. external ref-counting is implemented for 1.0 version and native OpenVX one
 
 Also there are some **C++ 11** features are used (e.g. rvalue ref-s) when their availability is detected at ***compile time***.
 
-C++ exceptions are used for errors indication instead of return codes. There are two types of exceptions are defined:  `RuntimeError` is thrown when OpenVX C call returned unsuccessful result and `WrapperError` is thrown when a problem is occured in the wrappers code. Both exception calsses are derived from `std::exception` (actually from its inheritants).
+C++ exceptions are used for errors indication instead of return codes. There are two types of exceptions are defined:  `RuntimeError` is thrown when OpenVX C call returned unsuccessful result and `WrapperError` is thrown when a problem is occurred in the wrappers code. Both exception calsses are derived from `std::exception` (actually from its inheritants).
 
 The so called **OpenVX objects** (e.g. `vx_image`) are represented as C++ classes in wrappers.
 All these classes use automatic ref-counting that allows development of exception-safe code.
diff --git a/doc/js_tutorials/js_imgproc/js_morphological_ops/js_morphological_ops.markdown b/doc/js_tutorials/js_imgproc/js_morphological_ops/js_morphological_ops.markdown
index b5e10e0e67..9ffa218bb2 100644
--- a/doc/js_tutorials/js_imgproc/js_morphological_ops/js_morphological_ops.markdown
+++ b/doc/js_tutorials/js_imgproc/js_morphological_ops/js_morphological_ops.markdown
@@ -52,7 +52,7 @@ Try it
 
 ### 2. Dilation
 
-It is just opposite of erosion. Here, a pixel element is '1' if atleast one pixel under the kernel
+It is just opposite of erosion. Here, a pixel element is '1' if at least one pixel under the kernel
 is '1'. So it increases the white region in the image or size of foreground object increases.
 Normally, in cases like noise removal, erosion is followed by dilation. Because, erosion removes
 white noises, but it also shrinks our object. So we dilate it. Since noise is gone, they won't come
diff --git a/doc/js_tutorials/js_setup/js_nodejs/js_nodejs.markdown b/doc/js_tutorials/js_setup/js_nodejs/js_nodejs.markdown
index 2fe17c5db6..17ced36b5a 100644
--- a/doc/js_tutorials/js_setup/js_nodejs/js_nodejs.markdown
+++ b/doc/js_tutorials/js_setup/js_nodejs/js_nodejs.markdown
@@ -223,7 +223,7 @@ Before the example, is worth consider first how files are handled in emscripten
 
 These C++ sources use standard APIs to access the filesystem and the implementation often ends up in system calls that read a file in the hard drive. Since JavaScript applications in the browser don't have access to the local filesystem, [emscripten emulates a standard filesystem](https://emscripten.org/docs/api_reference/Filesystem-API.html) so compiled C++ code works out of the box.
 
-In the browser, this filesystem is emulated in memory while in Node.js there's also the possibility of using the local filesystem directly. This is often preferable since there's no need of copy file's content in memory. This section is explains how to do do just that, this is, configuring emscripten so files are accessed directly from our local filesystem and relative paths match files relative to the current local directory as expected.
+In the browser, this filesystem is emulated in memory while in Node.js there's also the possibility of using the local filesystem directly. This is often preferable since there's no need of copy file's content in memory. This section explains how to do just that, this is, configuring emscripten so files are accessed directly from our local filesystem and relative paths match files relative to the current local directory as expected.
 
 ### The example ###
 
diff --git a/doc/py_tutorials/py_feature2d/py_feature_homography/py_feature_homography.markdown b/doc/py_tutorials/py_feature2d/py_feature_homography/py_feature_homography.markdown
index f8836b095b..6abac2c57b 100644
--- a/doc/py_tutorials/py_feature2d/py_feature_homography/py_feature_homography.markdown
+++ b/doc/py_tutorials/py_feature2d/py_feature_homography/py_feature_homography.markdown
@@ -18,7 +18,7 @@ is sufficient to find the object exactly on the trainImage.
 
 For that, we can use a function from calib3d module, ie **cv.findHomography()**. If we pass the set
 of points from both the images, it will find the perspective transformation of that object. Then we
-can use **cv.perspectiveTransform()** to find the object. It needs atleast four correct points to
+can use **cv.perspectiveTransform()** to find the object. It needs at least four correct points to
 find the transformation.
 
 We have seen that there can be some possible errors while matching which may affect the result. To
@@ -64,7 +64,7 @@ for m,n in matches:
     if m.distance < 0.7*n.distance:
         good.append(m)
 @endcode
-Now we set a condition that atleast 10 matches (defined by MIN_MATCH_COUNT) are to be there to
+Now we set a condition that at least 10 matches (defined by MIN_MATCH_COUNT) are to be there to
 find the object. Otherwise simply show a message saying not enough matches are present.
 
 If enough matches are found, we extract the locations of matched keypoints in both the images. They
diff --git a/doc/py_tutorials/py_imgproc/py_morphological_ops/py_morphological_ops.markdown b/doc/py_tutorials/py_imgproc/py_morphological_ops/py_morphological_ops.markdown
index 35e716d8e9..84a62d14cd 100644
--- a/doc/py_tutorials/py_imgproc/py_morphological_ops/py_morphological_ops.markdown
+++ b/doc/py_tutorials/py_imgproc/py_morphological_ops/py_morphological_ops.markdown
@@ -48,7 +48,7 @@ Result:
 
 ### 2. Dilation
 
-It is just opposite of erosion. Here, a pixel element is '1' if atleast one pixel under the kernel
+It is just opposite of erosion. Here, a pixel element is '1' if at least one pixel under the kernel
 is '1'. So it increases the white region in the image or size of foreground object increases.
 Normally, in cases like noise removal, erosion is followed by dilation. Because, erosion removes
 white noises, but it also shrinks our object. So we dilate it. Since noise is gone, they won't come
diff --git a/doc/tutorials/imgproc/morph_lines_detection/morph_lines_detection.md b/doc/tutorials/imgproc/morph_lines_detection/morph_lines_detection.md
index ccce36270c..39a76a74ce 100644
--- a/doc/tutorials/imgproc/morph_lines_detection/morph_lines_detection.md
+++ b/doc/tutorials/imgproc/morph_lines_detection/morph_lines_detection.md
@@ -123,7 +123,7 @@ Get image from [here](https://raw.githubusercontent.com/opencv/opencv/3.4/doc/tu
 
 #### Output images
 
-Now we are ready to apply morphological operations in order to extract the horizontal and vertical lines and as a consequence to separate the the music notes from the music sheet, but first let's initialize the output images that we will use for that reason:
+Now we are ready to apply morphological operations in order to extract the horizontal and vertical lines and as a consequence to separate the music notes from the music sheet, but first let's initialize the output images that we will use for that reason:
 
 @add_toggle_cpp
 @snippet samples/cpp/tutorial_code/ImgProc/morph_lines_detection/Morphology_3.cpp init

From 15ac54d5d608bce2c6cda3a0a403449d6bd3a9bc Mon Sep 17 00:00:00 2001
From: Yulv-git <yulvchi@qq.com>
Date: Sat, 30 Apr 2022 13:40:07 +0800
Subject: [PATCH 015/178] Fix some typos in modules/.

---
 modules/core/include/opencv2/core/mat.hpp           | 2 +-
 modules/cudafilters/include/opencv2/cudafilters.hpp | 2 +-
 modules/cudafilters/src/cuda/median_filter.cu       | 2 +-
 modules/dnn/src/darknet/darknet_io.cpp              | 2 +-
 modules/dnn/src/onnx/opencv-onnx.proto              | 2 +-
 modules/imgproc/include/opencv2/imgproc.hpp         | 2 +-
 modules/imgproc/src/opencl/medianFilter.cl          | 2 +-
 modules/imgproc/test/test_convhull.cpp              | 2 +-
 modules/ml/src/lr.cpp                               | 2 +-
 modules/videoio/src/cap_gstreamer.cpp               | 2 +-
 modules/videoio/src/cap_intelperc.cpp               | 8 ++++----
 modules/videoio/src/cap_intelperc.hpp               | 2 +-
 modules/videoio/src/cap_libv4l.cpp                  | 2 +-
 modules/videoio/test/test_video_io.cpp              | 2 +-
 14 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp
index 919fa0b6f9..289ba7f7c5 100644
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -2290,7 +2290,7 @@ public:
     const_iterator begin() const;
     const_iterator end() const;
 
-    //! template methods for for operation over all matrix elements.
+    //! template methods for operation over all matrix elements.
     // the operations take care of skipping gaps in the end of rows (if any)
     template<typename Functor> void forEach(const Functor& operation);
     template<typename Functor> void forEach(const Functor& operation) const;
diff --git a/modules/cudafilters/include/opencv2/cudafilters.hpp b/modules/cudafilters/include/opencv2/cudafilters.hpp
index 1e25e5602d..8cd109f775 100644
--- a/modules/cudafilters/include/opencv2/cudafilters.hpp
+++ b/modules/cudafilters/include/opencv2/cudafilters.hpp
@@ -318,7 +318,7 @@ CV_EXPORTS Ptr<Filter> createColumnSumFilter(int srcType, int dstType, int ksize
 
 /** @brief Performs median filtering for each point of the source image.
 
-@param srcType type of of source image. Only CV_8UC1 images are supported for now.
+@param srcType type of source image. Only CV_8UC1 images are supported for now.
 @param windowSize Size of the kernerl used for the filtering. Uses a (windowSize x windowSize) filter.
 @param partition Specifies the parallel granularity of the workload. This parameter should be used GPU experts when optimizing performance.
 
diff --git a/modules/cudafilters/src/cuda/median_filter.cu b/modules/cudafilters/src/cuda/median_filter.cu
index dd43a365c0..0b9f3c718e 100644
--- a/modules/cudafilters/src/cuda/median_filter.cu
+++ b/modules/cudafilters/src/cuda/median_filter.cu
@@ -235,7 +235,7 @@ namespace cv { namespace cuda { namespace device
         }
         __syncthreads();
 
-        // For all remaining rows in the median filter, add the values to the the histogram
+        // For all remaining rows in the median filter, add the values to the histogram
         for (int j=threadIdx.x; j<cols; j+=blockDim.x){
             for(int i=initStartRow; i<initStopRow; i++){
                     int pos=::min(i,rows-1);
diff --git a/modules/dnn/src/darknet/darknet_io.cpp b/modules/dnn/src/darknet/darknet_io.cpp
index 11aad453e3..520f3c94be 100644
--- a/modules/dnn/src/darknet/darknet_io.cpp
+++ b/modules/dnn/src/darknet/darknet_io.cpp
@@ -791,7 +791,7 @@ namespace cv {
                             if (layers_vec.size() > 1)
                             {
                                 // layer ids in layers_vec - inputs of Slice layers
-                                // after adding offset to layers_vec: layer ids - ouputs of Slice layers
+                                // after adding offset to layers_vec: layer ids - outputs of Slice layers
                                 for (size_t k = 0; k < layers_vec.size(); ++k)
                                     layers_vec[k] += layers_vec.size();
 
diff --git a/modules/dnn/src/onnx/opencv-onnx.proto b/modules/dnn/src/onnx/opencv-onnx.proto
index b24220adb9..8dd69cb2d9 100644
--- a/modules/dnn/src/onnx/opencv-onnx.proto
+++ b/modules/dnn/src/onnx/opencv-onnx.proto
@@ -61,7 +61,7 @@ enum Version {
   // The version field is always serialized and we will use it to store the
   // version that the  graph is generated from. This helps us set up version
   // control.
-  // For the IR, we are using simple numbers starting with with 0x00000001,
+  // For the IR, we are using simple numbers starting with 0x00000001,
   // which was the version we published on Oct 10, 2017.
   IR_VERSION_2017_10_10 = 0x0000000000000001;
 
diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index 9425ba88d5..5ba36040c9 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -1537,7 +1537,7 @@ CV_EXPORTS_W void boxFilter( InputArray src, OutputArray dst, int ddepth,
 For every pixel \f$ (x, y) \f$ in the source image, the function calculates the sum of squares of those neighboring
 pixel values which overlap the filter placed over the pixel \f$ (x, y) \f$.
 
-The unnormalized square box filter can be useful in computing local image statistics such as the the local
+The unnormalized square box filter can be useful in computing local image statistics such as the local
 variance and standard deviation around the neighborhood of a pixel.
 
 @param src input image
diff --git a/modules/imgproc/src/opencl/medianFilter.cl b/modules/imgproc/src/opencl/medianFilter.cl
index f9a6c9e8f4..b517a38059 100644
--- a/modules/imgproc/src/opencl/medianFilter.cl
+++ b/modules/imgproc/src/opencl/medianFilter.cl
@@ -43,7 +43,7 @@
 
 #ifdef USE_4OPT
 
-//Utility macros for for 1,2,4 channel images:
+//Utility macros for 1,2,4 channel images:
 //  - LOAD4/STORE4 - load/store 4-pixel groups from/to global memory
 
 //  - SHUFFLE4_3/SHUFFLE4_5 - rearrange scattered border/central pixels into regular 4-pixel variables
diff --git a/modules/imgproc/test/test_convhull.cpp b/modules/imgproc/test/test_convhull.cpp
index aa553e5efe..e5f6bd8e63 100644
--- a/modules/imgproc/test/test_convhull.cpp
+++ b/modules/imgproc/test/test_convhull.cpp
@@ -1015,7 +1015,7 @@ int CV_MinCircleTest::validate_test_results( int test_case_idx )
     if( point_count >= 2 && (j < 2 || (j == 2 && cvTsDist(v[0],v[1]) < (radius-1)*2/eps)) )
     {
         ts->printf( cvtest::TS::LOG,
-            "There should be at at least 3 points near the circle boundary or 2 points on the diameter\n" );
+            "There should be at least 3 points near the circle boundary or 2 points on the diameter\n" );
         code = cvtest::TS::FAIL_BAD_ACCURACY;
         goto _exit_;
     }
diff --git a/modules/ml/src/lr.cpp b/modules/ml/src/lr.cpp
index ed4fb4c720..b43e104045 100644
--- a/modules/ml/src/lr.cpp
+++ b/modules/ml/src/lr.cpp
@@ -126,7 +126,7 @@ bool LogisticRegressionImpl::train(const Ptr<TrainData>& trainData, int)
     int num_classes = (int) this->forward_mapper.size();
     if(num_classes < 2)
     {
-        CV_Error( CV_StsBadArg, "data should have atleast 2 classes" );
+        CV_Error( CV_StsBadArg, "data should have at least 2 classes" );
     }
 
     // add a column of ones to the data (bias/intercept term)
diff --git a/modules/videoio/src/cap_gstreamer.cpp b/modules/videoio/src/cap_gstreamer.cpp
index dd6387d892..34bc98eaba 100644
--- a/modules/videoio/src/cap_gstreamer.cpp
+++ b/modules/videoio/src/cap_gstreamer.cpp
@@ -334,7 +334,7 @@ GStreamerCapture::~GStreamerCapture()
 /*!
  * \brief CvCapture_GStreamer::grabFrame
  * \return
- * Grabs a sample from the pipeline, awaiting consumation by retreiveFrame.
+ * Grabs a sample from the pipeline, awaiting consumation by retrieveFrame.
  * The pipeline is started if it was not running yet
  */
 bool GStreamerCapture::grabFrame()
diff --git a/modules/videoio/src/cap_intelperc.cpp b/modules/videoio/src/cap_intelperc.cpp
index 093b1fd235..63e43ee0da 100644
--- a/modules/videoio/src/cap_intelperc.cpp
+++ b/modules/videoio/src/cap_intelperc.cpp
@@ -507,21 +507,21 @@ bool IntelPerCStreamDepth::setProperty(int propIdx, double propVal)
 }
 bool IntelPerCStreamDepth::retrieveDepthAsOutputArray(cv::OutputArray image)
 {
-    return retriveFrame(CV_16SC1, 0, image);
+    return retrieveFrame(CV_16SC1, 0, image);
 }
 bool IntelPerCStreamDepth::retrieveIRAsOutputArray(cv::OutputArray image)
 {
-    return retriveFrame(CV_16SC1, 1, image);
+    return retrieveFrame(CV_16SC1, 1, image);
 }
 bool IntelPerCStreamDepth::retrieveUVAsOutputArray(cv::OutputArray image)
 {
-    return retriveFrame(CV_32FC2, 2, image);
+    return retrieveFrame(CV_32FC2, 2, image);
 }
 bool IntelPerCStreamDepth::validProfile(const PXCCapture::VideoStream::ProfileInfo& pinfo)
 {
     return (PXCImage::COLOR_FORMAT_DEPTH == pinfo.imageInfo.format);
 }
-bool IntelPerCStreamDepth::retriveFrame(int type, int planeIdx, cv::OutputArray frame)
+bool IntelPerCStreamDepth::retrieveFrame(int type, int planeIdx, cv::OutputArray frame)
 {
     if (!m_pxcImage.IsValid())
         return false;
diff --git a/modules/videoio/src/cap_intelperc.hpp b/modules/videoio/src/cap_intelperc.hpp
index 209cd4736a..66ba3e21ca 100644
--- a/modules/videoio/src/cap_intelperc.hpp
+++ b/modules/videoio/src/cap_intelperc.hpp
@@ -85,7 +85,7 @@ public:
 protected:
     virtual bool validProfile(const PXCCapture::VideoStream::ProfileInfo& pinfo);
 protected:
-    bool retriveFrame(int type, int planeIdx, OutputArray frame);
+    bool retrieveFrame(int type, int planeIdx, OutputArray frame);
 };
 
 class VideoCapture_IntelPerC : public IVideoCapture
diff --git a/modules/videoio/src/cap_libv4l.cpp b/modules/videoio/src/cap_libv4l.cpp
index 6fd0f757a8..1c1129ceaa 100644
--- a/modules/videoio/src/cap_libv4l.cpp
+++ b/modules/videoio/src/cap_libv4l.cpp
@@ -1858,7 +1858,7 @@ static int icvSetPropertyCAM_V4L(CvCaptureCAM_V4L* capture, int property_id, dou
         retval = icvSetControl(capture, property_id, value);
     }
 
-    /* return the the status */
+    /* return the status */
     return retval;
 }
 
diff --git a/modules/videoio/test/test_video_io.cpp b/modules/videoio/test/test_video_io.cpp
index 44947f02f4..e82ef9fd48 100644
--- a/modules/videoio/test/test_video_io.cpp
+++ b/modules/videoio/test/test_video_io.cpp
@@ -261,7 +261,7 @@ public:
             if (cvtest::debugLevel > 0)
                 std::cout << "i = " << i << ": timestamp = " << timestamp << std::endl;
             const double frame_period = 1000.f/bunny_param.getFps();
-            // NOTE: eps == frame_period, because videoCapture returns frame begining timestamp or frame end
+            // NOTE: eps == frame_period, because videoCapture returns frame beginning timestamp or frame end
             // timestamp depending on codec and back-end. So the first frame has timestamp 0 or frame_period.
             EXPECT_NEAR(timestamp, i*frame_period, frame_period) << "i=" << i;
         }

From 82ae9ef54181b64e1f7400001fda3ae72e4c720a Mon Sep 17 00:00:00 2001
From: huangziqing <270704881@qq.com>
Date: Mon, 2 May 2022 00:54:17 +0800
Subject: [PATCH 016/178] Wrap gpuMat::release to Python

---
 modules/core/include/opencv2/core/cuda.hpp | 2 +-
 modules/python/test/test_cuda.py           | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/modules/core/include/opencv2/core/cuda.hpp b/modules/core/include/opencv2/core/cuda.hpp
index 09220cd6b2..1ebea07c0d 100644
--- a/modules/core/include/opencv2/core/cuda.hpp
+++ b/modules/core/include/opencv2/core/cuda.hpp
@@ -155,7 +155,7 @@ public:
     CV_WRAP void create(Size size, int type);
 
     //! decreases reference counter, deallocate the data when reference counter reaches 0
-    void release();
+    CV_WRAP void release();
 
     //! swaps with other smart pointer
     CV_WRAP void swap(GpuMat& mat);
diff --git a/modules/python/test/test_cuda.py b/modules/python/test/test_cuda.py
index a047a8b2f4..a5f3fae847 100644
--- a/modules/python/test/test_cuda.py
+++ b/modules/python/test/test_cuda.py
@@ -55,5 +55,14 @@ class cuda_test(NewOpenCVTests):
         self.assertEqual(cuMat.size(), (1024, 1024))
         self.assertEqual(cuMat.type(), cv.CV_8UC3)
 
+    def test_cuda_release(self):
+        npMat = (np.random.random((128, 128, 3)) * 255).astype(np.uint8)
+        cuMat = cv.cuda_GpuMat()
+        cuMat.upload(npMat)
+        cuMat.release()
+        self.assertTrue(cuMat.cudaPtr() == 0)
+        self.assertTrue(cuMat.step == 0)
+        self.assertTrue(cuMat.size() == (0, 0))
+
 if __name__ == '__main__':
     NewOpenCVTests.bootstrap()

From 9e1e96011657ba61699dd27d4ee06e544623f0c6 Mon Sep 17 00:00:00 2001
From: Christine Poerschke <6458642+cpoerschke@users.noreply.github.com>
Date: Sun, 1 May 2022 19:59:55 +0100
Subject: [PATCH 017/178] imgproc: fix two typos (imput, magnutude)

---
 modules/imgproc/include/opencv2/imgproc/segmentation.hpp | 2 +-
 modules/imgproc/src/intelligent_scissors.cpp             | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/imgproc/include/opencv2/imgproc/segmentation.hpp b/modules/imgproc/include/opencv2/imgproc/segmentation.hpp
index 26882f444e..c40d5011ee 100644
--- a/modules/imgproc/include/opencv2/imgproc/segmentation.hpp
+++ b/modules/imgproc/include/opencv2/imgproc/segmentation.hpp
@@ -92,7 +92,7 @@ public:
     CV_WRAP
     IntelligentScissorsMB& applyImage(InputArray image);
 
-    /** @brief Specify custom features of imput image
+    /** @brief Specify custom features of input image
      *
      * Customized advanced variant of applyImage() call.
      *
diff --git a/modules/imgproc/src/intelligent_scissors.cpp b/modules/imgproc/src/intelligent_scissors.cpp
index 38acfd79e3..1b7e3dd163 100644
--- a/modules/imgproc/src/intelligent_scissors.cpp
+++ b/modules/imgproc/src/intelligent_scissors.cpp
@@ -429,13 +429,13 @@ struct IntelligentScissorsMB::Impl
         gradient_direction.create(src_size);
         for (int y = 0; y < src_size.height; y++)
         {
-            const float* magnutude_row = image_magnitude_.ptr<float>(y);
+            const float* magnitude_row = image_magnitude_.ptr<float>(y);
             const float* Ix_row = Ix_.ptr<float>(y);
             const float* Iy_row = Iy_.ptr<float>(y);
             Point2f* gradient_direction_row = gradient_direction.ptr<Point2f>(y);
             for (int x = 0; x < src_size.width; x++)
             {
-                const float m = magnutude_row[x];
+                const float m = magnitude_row[x];
                 if (m > FLT_EPSILON)
                 {
                     float m_inv = 1.0f / m;

From 667e5e4f890f24a638ca04f638967f1083468bb0 Mon Sep 17 00:00:00 2001
From: Vincent Rabaud <vrabaud@users.noreply.github.com>
Date: Thu, 5 May 2022 09:59:29 +0200
Subject: [PATCH 018/178] Merge pull request #21943 from vrabaud:3.4_proc

* Fix compilation with non glibc.

_SC_NPROCESSORS_ONLN is non standard as defined on https://man7.org/linux/man-pages/man3/sysconf.3.html
It seems to only be on glibc, cf https://www.gnu.org/software/libc/manual/html_node/Processor-Resources.html

* Fix to defined(_SC_NPROCESSORS_ONLN)
---
 modules/core/src/parallel.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/core/src/parallel.cpp b/modules/core/src/parallel.cpp
index 95df5454c2..a4a7baf0f6 100644
--- a/modules/core/src/parallel.cpp
+++ b/modules/core/src/parallel.cpp
@@ -942,7 +942,7 @@ int getNumberOfCPUs_()
 
 #endif
 
-#if !defined(_WIN32) && !defined(__APPLE__)
+#if !defined(_WIN32) && !defined(__APPLE__) && defined(_SC_NPROCESSORS_ONLN)
 
     static unsigned cpu_count_sysconf = (unsigned)sysconf( _SC_NPROCESSORS_ONLN );
     ncpus = minNonZero(ncpus, cpu_count_sysconf);

From c8228e5789510ec26378eceb17311dd7bddf0b33 Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <76472231+asenyaev@users.noreply.github.com>
Date: Thu, 5 May 2022 11:09:20 +0300
Subject: [PATCH 019/178] Merge pull request #21896 from
 asenyaev:asen/worflow_windows_4.x

Added workflow for Github Actions to build and test OpenCV on Windows for 4.x branch
---
 .github/workflows/PR-4.x-W10.yaml | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/PR-4.x-W10.yaml b/.github/workflows/PR-4.x-W10.yaml
index 850f7c5493..95fb962f97 100644
--- a/.github/workflows/PR-4.x-W10.yaml
+++ b/.github/workflows/PR-4.x-W10.yaml
@@ -1,7 +1,9 @@
 name: PR:4.x W10
 
-# TODO: enable pipeline after 4.x update
-on: workflow_dispatch
+on:
+  pull_request:
+    branches:
+      - 4.x
 
 env:
   EXTRA_CMAKE_OPTIONS: '-DCL_Z_OPTION=/Z7 -DOPENCV_DOWNLOAD_PATH=c:\Slave\workspace\binaries_cache -DBUILD_EXAMPLES=ON -DOPENCV_ENABLE_NONFREE=ON -DCMAKE_BUILD_TYPE=Release'
@@ -10,7 +12,7 @@ env:
   PR_AUTHOR_FORK: ${{ github.event.pull_request.head.repo.full_name }}
   SOURCE_BRANCH_NAME: ${{ github.head_ref }}
   TARGET_BRANCH_NAME: ${{ github.base_ref }}
-  GTEST_FILTER_STRING: '-Samples.findFile:ExposureCompensate.SimilarityThreshold:DNNTestNetwork.MobileNet_SSD_v2_TensorFlow*:DNNTestNetwork.Inception_v2_SSD_TensorFlow*:Test_TensorFlow_nets.Inception_v2_SSD*:Test_Darknet_nets.YOLOv4_tiny*:videoio/videocapture_acceleration.read/61:Drawing.ttf_text'
+  GTEST_FILTER_STRING: '-Samples.findFile:ExposureCompensate.SimilarityThreshold:DNNTestNetwork.MobileNet_SSD_v2_TensorFlow*:DNNTestNetwork.Inception_v2_SSD_TensorFlow*:Test_TensorFlow_nets.Inception_v2_SSD*:Test_Darknet_nets.YOLOv4_tiny*:videoio/videocapture_acceleration.read/61:videoio/videocapture_acceleration.read/122:videoio/videocapture_acceleration.read/126:Drawing.ttf_text'
   GIT_CACHE: c:\Slave\git_cache
 
 jobs:
@@ -70,6 +72,8 @@ jobs:
       run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_features2d.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
     - name: Accuracy:flann
       run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_flann.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Accuracy:gapi
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_gapi.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
     - name: Accuracy:highgui
       run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_highgui.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
     - name: Accuracy:imgcodecs
@@ -82,18 +86,12 @@ jobs:
       run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_objdetect.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
     - name: Accuracy:photo
       run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_photo.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:shape
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_shape.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
     - name: Accuracy:stitching
       run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_stitching.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:superres
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_superres.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
     - name: Accuracy:video
       run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_video.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
     - name: Accuracy:videoio
       run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_videoio.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:videostab
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_videostab.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
     - name: Performance:calib3d
       run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_calib3d.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
     - name: Performance:core
@@ -102,6 +100,8 @@ jobs:
       run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_dnn.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
     - name: Performance:features2d
       run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_features2d.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
+    - name: Performance:gapi
+      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_gapi.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
     - name: Performance:imgcodecs
       run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_imgcodecs.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
     - name: Performance:imgproc
@@ -112,8 +112,6 @@ jobs:
       run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_photo.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
     - name: Performance:stitching
       run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_stitching.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Performance:superres
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_superres.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
     - name: Performance:video
       run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_video.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
     - name: Performance:videoio
@@ -129,6 +127,8 @@ jobs:
         cd ${{ github.workspace }}\opencv-build
         set PATH=%PATH%;${{ github.workspace }}\opencv-build\bin
         ${{ github.workspace }}\opencv\modules\ts\misc\run.py . -a -t java
+
+
   BuildContrib:
     runs-on: opencv-cn-win
     defaults:

From e4ed2d2e42d33cfcfa51a77ad696b5f8b8f44cef Mon Sep 17 00:00:00 2001
From: hellodoge <45069008+hellodoge@users.noreply.github.com>
Date: Mon, 9 May 2022 15:52:43 +0300
Subject: [PATCH 020/178] fix JpegEncoder::write

3rdparty/libjpeg-turbo/src/jpeglib.h:
`* NB: you must set up the error-manager BEFORE calling jpeg_create_xxx.`
---
 modules/imgcodecs/src/grfmt_jpeg.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/imgcodecs/src/grfmt_jpeg.cpp b/modules/imgcodecs/src/grfmt_jpeg.cpp
index 758ac512e8..d9e056f1a8 100644
--- a/modules/imgcodecs/src/grfmt_jpeg.cpp
+++ b/modules/imgcodecs/src/grfmt_jpeg.cpp
@@ -602,9 +602,9 @@ bool JpegEncoder::write( const Mat& img, const std::vector<int>& params )
     JpegErrorMgr jerr;
     JpegDestination dest;
 
-    jpeg_create_compress(&cinfo);
     cinfo.err = jpeg_std_error(&jerr.pub);
     jerr.pub.error_exit = error_exit;
+    jpeg_create_compress(&cinfo);
 
     if( !m_buf )
     {

From dea08151994449296a3aac7074703df6779623aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20Pol=C3=A1k?= <darkyenus+gh@gmail.com>
Date: Fri, 6 May 2022 14:41:52 +0200
Subject: [PATCH 021/178] Small imencode documentation improvement

It was previously not clear that the extension should include the period, [and I am not the only person that encountered this problem](https://stackoverflow.com/questions/4254460/opencv-could-not-find-encoder-for-the-specified-extension).

I assume that this is true for all supported extensions, but I haven't checked the source.
---
 modules/imgcodecs/include/opencv2/imgcodecs.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/imgcodecs/include/opencv2/imgcodecs.hpp b/modules/imgcodecs/include/opencv2/imgcodecs.hpp
index 5dbf348463..ba63e9934b 100644
--- a/modules/imgcodecs/include/opencv2/imgcodecs.hpp
+++ b/modules/imgcodecs/include/opencv2/imgcodecs.hpp
@@ -262,7 +262,7 @@ CV_EXPORTS Mat imdecode( InputArray buf, int flags, Mat* dst);
 The function imencode compresses the image and stores it in the memory buffer that is resized to fit the
 result. See cv::imwrite for the list of supported formats and flags description.
 
-@param ext File extension that defines the output format.
+@param ext File extension that defines the output format. Must include a leading period.
 @param img Image to be written.
 @param buf Output buffer resized to fit the compressed image.
 @param params Format-specific parameters. See cv::imwrite and cv::ImwriteFlags.

From 07abb6240ee60053581d529316ac669fcd42f2a3 Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <andrey.senyaev@gmail.com>
Date: Wed, 11 May 2022 17:53:40 +0300
Subject: [PATCH 022/178] GTest filtered list

---
 .github/workflows/PR-3.4-W10.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/PR-3.4-W10.yaml b/.github/workflows/PR-3.4-W10.yaml
index a7e8be71a2..d965982f5a 100644
--- a/.github/workflows/PR-3.4-W10.yaml
+++ b/.github/workflows/PR-3.4-W10.yaml
@@ -12,7 +12,7 @@ env:
   PR_AUTHOR_FORK: ${{ github.event.pull_request.head.repo.full_name }}
   SOURCE_BRANCH_NAME: ${{ github.head_ref }}
   TARGET_BRANCH_NAME: ${{ github.base_ref }}
-  GTEST_FILTER_STRING: '-Samples.findFile:ExposureCompensate.SimilarityThreshold:DNNTestNetwork.MobileNet_SSD_v2_TensorFlow*:DNNTestNetwork.Inception_v2_SSD_TensorFlow*:Test_TensorFlow_nets.Inception_v2_SSD*:Test_Darknet_nets.YOLOv4_tiny*:videoio/videocapture_acceleration.read/61:Drawing.ttf_text'
+  GTEST_FILTER_STRING: '-Samples.findFile'
   GIT_CACHE: c:\Slave\git_cache
 
 jobs:

From 4502003e61ac66b770fb0f905c64d07b2b2526b5 Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <andrey.senyaev@gmail.com>
Date: Wed, 11 May 2022 18:08:44 +0300
Subject: [PATCH 023/178] GTest filtered list for 4.x

---
 .github/workflows/PR-4.x-W10.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/PR-4.x-W10.yaml b/.github/workflows/PR-4.x-W10.yaml
index 95fb962f97..0e313f32e5 100644
--- a/.github/workflows/PR-4.x-W10.yaml
+++ b/.github/workflows/PR-4.x-W10.yaml
@@ -12,7 +12,7 @@ env:
   PR_AUTHOR_FORK: ${{ github.event.pull_request.head.repo.full_name }}
   SOURCE_BRANCH_NAME: ${{ github.head_ref }}
   TARGET_BRANCH_NAME: ${{ github.base_ref }}
-  GTEST_FILTER_STRING: '-Samples.findFile:ExposureCompensate.SimilarityThreshold:DNNTestNetwork.MobileNet_SSD_v2_TensorFlow*:DNNTestNetwork.Inception_v2_SSD_TensorFlow*:Test_TensorFlow_nets.Inception_v2_SSD*:Test_Darknet_nets.YOLOv4_tiny*:videoio/videocapture_acceleration.read/61:videoio/videocapture_acceleration.read/122:videoio/videocapture_acceleration.read/126:Drawing.ttf_text'
+  GTEST_FILTER_STRING: '-Samples.findFile:videoio/videocapture_acceleration.read/122:videoio/videocapture_acceleration.read/126'
   GIT_CACHE: c:\Slave\git_cache
 
 jobs:

From 646025589b73bdf4b55da1604a093c9ab52b616d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Joona=20Heikkil=C3=A4?=
 <21111572+cxcorp@users.noreply.github.com>
Date: Wed, 11 May 2022 23:15:34 +0300
Subject: [PATCH 024/178] Fix global variable assignment in JS test suite

In test_imgproc.js, the test_filter suite's last test assigns a variable
to `size` without declaring it with `let`, polluting the global scope.
This commit adds `let` to the statement, so that the variable is scoped
to the test block.
---
 modules/js/test/test_imgproc.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/js/test/test_imgproc.js b/modules/js/test/test_imgproc.js
index 9ba5cd4e38..e643388412 100644
--- a/modules/js/test/test_imgproc.js
+++ b/modules/js/test/test_imgproc.js
@@ -948,7 +948,7 @@ QUnit.test('test_filter', function(assert) {
 
         cv.rotate(src, dst, cv.ROTATE_90_CLOCKWISE);
 
-        size = dst.size();
+        let size = dst.size();
         assert.equal(size.height, 2, "ROTATE_HEIGHT");
         assert.equal(size.width, 3, "ROTATE_WIGTH");
 

From eff5605be53bc9656e0155583a5c4d12191bb9db Mon Sep 17 00:00:00 2001
From: Sergey <35331225+SergeyIvanov87@users.noreply.github.com>
Date: Thu, 12 May 2022 11:42:38 +0300
Subject: [PATCH 025/178] Merge pull request #21883 from
 SergeyIvanov87:gapi_vpl_linux

G-API: VPL Source turn on Linux CPU version

* Turn on linux compilation

* Apply comments

* Change new files headline

* Add license header
---
 modules/gapi/CMakeLists.txt                   |  21 ++-
 .../gapi/streaming/onevpl/accel_types.hpp     |   1 +
 .../gapi_streaming_source_perf_tests.cpp      |  39 ++++-
 .../onevpl/accelerators/accel_policy_cpu.cpp  |  38 +++--
 .../onevpl/accelerators/accel_policy_dx11.cpp |  53 +++++-
 .../onevpl/accelerators/accel_policy_dx11.hpp |  14 +-
 .../accelerators/accel_policy_va_api.cpp      | 150 +++++++++++++++++
 .../accelerators/accel_policy_va_api.hpp      |  62 +++++++
 .../surface/base_frame_adapter.cpp            |   2 +-
 .../surface/base_frame_adapter.hpp            |   2 +-
 .../surface/dx11_frame_adapter.cpp            |   4 +
 .../onevpl/accelerators/surface/surface.cpp   |   9 +-
 .../accelerators/surface/surface_pool.cpp     |   1 +
 .../onevpl/cfg_param_device_selector.cpp      |  59 ++++---
 .../streaming/onevpl/cfg_params_parser.cpp    |  14 +-
 .../demux/async_mfp_demux_data_provider.cpp   |   3 +-
 .../demux/async_mfp_demux_data_provider.hpp   |   2 +-
 .../onevpl/device_selector_interface.cpp      |   2 +
 .../engine/decode/decode_engine_legacy.cpp    |   4 +-
 .../onevpl/engine/decode/decode_session.hpp   |   3 +
 .../onevpl/engine/engine_session.hpp          |   3 +-
 .../onevpl/engine/preproc/preproc_engine.cpp  |   7 +-
 .../engine/preproc/vpp_preproc_defines.hpp    |   6 +
 .../onevpl/engine/processing_engine_base.hpp  |   1 +
 .../transcode/transcode_engine_legacy.cpp     |   5 +-
 .../engine/transcode/transcode_session.hpp    |   2 +-
 .../src/streaming/onevpl/onevpl_export.hpp    |   2 +
 .../gapi/src/streaming/onevpl/source_priv.cpp |   7 +
 modules/gapi/src/streaming/onevpl/utils.cpp   |   4 +-
 .../test/streaming/gapi_streaming_tests.cpp   |   4 +
 .../gapi_streaming_vpl_core_test.cpp          | 142 +++++++++++++---
 .../gapi_streaming_vpl_data_provider.cpp      |   2 +
 .../gapi_streaming_vpl_device_selector.cpp    |  26 +--
 .../gapi_streaming_vpp_preproc_test.cpp       | 152 ++++++++++--------
 34 files changed, 662 insertions(+), 184 deletions(-)
 create mode 100644 modules/gapi/src/streaming/onevpl/accelerators/accel_policy_va_api.cpp
 create mode 100644 modules/gapi/src/streaming/onevpl/accelerators/accel_policy_va_api.hpp

diff --git a/modules/gapi/CMakeLists.txt b/modules/gapi/CMakeLists.txt
index 579bcfc1be..9b97a1b92a 100644
--- a/modules/gapi/CMakeLists.txt
+++ b/modules/gapi/CMakeLists.txt
@@ -133,7 +133,7 @@ set(gapi_srcs
     src/backends/fluid/gfluidimgproc.cpp
     src/backends/fluid/gfluidimgproc_func.dispatch.cpp
     src/backends/fluid/gfluidcore.cpp
-	  src/backends/fluid/gfluidcore_func.dispatch.cpp
+    src/backends/fluid/gfluidcore_func.dispatch.cpp
 
     # OAK Backend (optional)
     src/backends/oak/goak.cpp
@@ -194,6 +194,7 @@ set(gapi_srcs
     src/streaming/onevpl/accelerators/utils/shared_lock.cpp
     src/streaming/onevpl/accelerators/accel_policy_cpu.cpp
     src/streaming/onevpl/accelerators/accel_policy_dx11.cpp
+    src/streaming/onevpl/accelerators/accel_policy_va_api.cpp
     src/streaming/onevpl/accelerators/dx11_alloc_resource.cpp
     src/streaming/onevpl/engine/engine_session.cpp
     src/streaming/onevpl/engine/processing_engine_base.cpp
@@ -310,12 +311,30 @@ if(HAVE_GAPI_ONEVPL)
         ocv_target_include_directories(opencv_test_gapi SYSTEM PRIVATE ${OPENCL_INCLUDE_DIRS})
     endif()
   endif()
+
   ocv_target_compile_definitions(${the_module} PRIVATE -DHAVE_ONEVPL)
   ocv_target_link_libraries(${the_module} PRIVATE ${VPL_IMPORTED_TARGETS})
 
   if(HAVE_D3D11 AND HAVE_OPENCL)
     ocv_target_include_directories(${the_module} SYSTEM PRIVATE ${OPENCL_INCLUDE_DIRS})
   endif()
+
+  if(UNIX)
+    find_package(PkgConfig)
+    if(PkgConfig_FOUND)
+      pkg_check_modules(PKG_LIBVA libva>=1.2 libva-drm>=1.2)
+      if(PKG_LIBVA_FOUND)
+        set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
+        set(THREADS_PREFER_PTHREAD_FLAG TRUE)
+        find_package(Threads REQUIRED)
+      else()
+        message(FATAL_ERROR "libva not found: building HAVE_GAPI_ONEVPL without libVA support is impossible on UNIX systems")
+      endif()
+    else()
+      message(FATAL_ERROR "PkgConfig not found: building HAVE_GAPI_ONEVPL without libVA support is impossible on UNIX systems")
+    endif()
+    ocv_target_link_libraries(${the_module} PRIVATE ${PKG_LIBVA_LIBRARIES} ${PKG_THREAD_LIBRARIES})
+  endif()
 endif()
 
 ocv_option(OPENCV_GAPI_GSTREAMER "Build G-API with GStreamer support" HAVE_GSTREAMER)
diff --git a/modules/gapi/include/opencv2/gapi/streaming/onevpl/accel_types.hpp b/modules/gapi/include/opencv2/gapi/streaming/onevpl/accel_types.hpp
index 421b592aae..c53b1b31db 100644
--- a/modules/gapi/include/opencv2/gapi/streaming/onevpl/accel_types.hpp
+++ b/modules/gapi/include/opencv2/gapi/streaming/onevpl/accel_types.hpp
@@ -20,6 +20,7 @@ namespace onevpl {
 enum class AccelType: uint8_t {
     HOST,
     DX11,
+    VAAPI,
 
     LAST_VALUE = std::numeric_limits<uint8_t>::max()
 };
diff --git a/modules/gapi/perf/streaming/gapi_streaming_source_perf_tests.cpp b/modules/gapi/perf/streaming/gapi_streaming_source_perf_tests.cpp
index eacdef6eba..8021eed3cf 100644
--- a/modules/gapi/perf/streaming/gapi_streaming_source_perf_tests.cpp
+++ b/modules/gapi/perf/streaming/gapi_streaming_source_perf_tests.cpp
@@ -17,6 +17,7 @@
 #include "streaming/onevpl/cfg_param_device_selector.hpp"
 #include "streaming/onevpl/accelerators/accel_policy_dx11.hpp"
 #include "streaming/onevpl/accelerators/accel_policy_cpu.hpp"
+#include "streaming/onevpl/accelerators/accel_policy_va_api.hpp"
 
 namespace opencv_test
 {
@@ -89,6 +90,7 @@ PERF_TEST_P_(VideoCapSourcePerf_Test, TestPerformance)
     SANITY_CHECK_NOTHING();
 }
 
+#ifdef __WIN32__
 INSTANTIATE_TEST_CASE_P(Streaming, OneVPLSourcePerf_Test,
                         Values(source_description_t(files[0], codec[0], ""),
                                source_description_t(files[0], codec[0], "MFX_ACCEL_MODE_VIA_D3D11"),
@@ -96,6 +98,11 @@ INSTANTIATE_TEST_CASE_P(Streaming, OneVPLSourcePerf_Test,
                                source_description_t(files[1], codec[1], "MFX_ACCEL_MODE_VIA_D3D11"),
                                source_description_t(files[2], codec[2], ""),
                                source_description_t(files[2], codec[2], "MFX_ACCEL_MODE_VIA_D3D11")));
+#elif __linux__
+INSTANTIATE_TEST_CASE_P(Streaming, OneVPLSourcePerf_Test,
+                        Values(source_description_t(files[0], codec[0], "MFX_ACCEL_MODE_VIA_VAAPI"),
+                               source_description_t(files[1], codec[1], "MFX_ACCEL_MODE_VIA_VAAPI")));
+#endif
 
 INSTANTIATE_TEST_CASE_P(Streaming, VideoCapSourcePerf_Test,
                         Values(files[0],
@@ -152,6 +159,8 @@ static pp_out_param_t full_hd = pp_out_param_t {cv::MediaFormat::NV12,
 static pp_out_param_t cif = pp_out_param_t {cv::MediaFormat::NV12,
                                             {352, 288}};
 
+
+#ifdef __WIN32__
 INSTANTIATE_TEST_CASE_P(Streaming_Source_PP, OneVPLSourcePerf_PP_Test,
                         Values(source_description_preproc_t(files[0], codec[0], "", full_hd),
                                source_description_preproc_t(files[0], codec[0], "", cif),
@@ -165,6 +174,13 @@ INSTANTIATE_TEST_CASE_P(Streaming_Source_PP, OneVPLSourcePerf_PP_Test,
                                source_description_preproc_t(files[2], codec[2], "", cif),
                                source_description_preproc_t(files[2], codec[2], "MFX_ACCEL_MODE_VIA_D3D11", full_hd),
                                source_description_preproc_t(files[2], codec[2], "MFX_ACCEL_MODE_VIA_D3D11", cif)));
+#elif __linux__
+INSTANTIATE_TEST_CASE_P(Streaming_Source_PP, OneVPLSourcePerf_PP_Test,
+                        Values(source_description_preproc_t(files[0], codec[0], "MFX_ACCEL_MODE_VIA_VAAPI", full_hd),
+                               source_description_preproc_t(files[0], codec[0], "MFX_ACCEL_MODE_VIA_VAAPI", cif),
+                               source_description_preproc_t(files[1], codec[1], "MFX_ACCEL_MODE_VIA_VAAPI",full_hd),
+                               source_description_preproc_t(files[1], codec[1], "MFX_ACCEL_MODE_VIA_VAAPI",cif)));
+#endif
 
 class OneVPLSourcePerf_PP_Engine_Test : public TestPerfParams<source_description_preproc_t> {};
 
@@ -198,6 +214,8 @@ PERF_TEST_P_(OneVPLSourcePerf_PP_Engine_Test, TestPerformance)
     std::unique_ptr<VPLAccelerationPolicy> policy;
     if (mode == "MFX_ACCEL_MODE_VIA_D3D11") {
         policy.reset(new VPLDX11AccelerationPolicy(device_selector));
+    } else if (mode == "MFX_ACCEL_MODE_VIA_VAAPI") {
+        policy.reset(new VPLVAAPIAccelerationPolicy(device_selector));
     } else if (mode.empty()){
         policy.reset(new VPLCPUAccelerationPolicy(device_selector));
     } else {
@@ -219,6 +237,7 @@ PERF_TEST_P_(OneVPLSourcePerf_PP_Engine_Test, TestPerformance)
     SANITY_CHECK_NOTHING();
 }
 
+#ifdef __WIN32__
 INSTANTIATE_TEST_CASE_P(Streaming_Engine_PP, OneVPLSourcePerf_PP_Engine_Test,
                         Values(source_description_preproc_t(files[0], codec[0], "", full_hd),
                                source_description_preproc_t(files[0], codec[0], "", cif),
@@ -232,6 +251,13 @@ INSTANTIATE_TEST_CASE_P(Streaming_Engine_PP, OneVPLSourcePerf_PP_Engine_Test,
                                source_description_preproc_t(files[2], codec[2], "", cif),
                                source_description_preproc_t(files[2], codec[2], "MFX_ACCEL_MODE_VIA_D3D11", full_hd),
                                source_description_preproc_t(files[2], codec[2], "MFX_ACCEL_MODE_VIA_D3D11", cif)));
+#elif __linux__
+INSTANTIATE_TEST_CASE_P(Streaming_Engine_PP, OneVPLSourcePerf_PP_Engine_Test,
+                        Values(source_description_preproc_t(files[0], codec[0], "MFX_ACCEL_MODE_VIA_VAAPI", full_hd),
+                               source_description_preproc_t(files[0], codec[0], "MFX_ACCEL_MODE_VIA_VAAPI", cif),
+                               source_description_preproc_t(files[1], codec[1], "MFX_ACCEL_MODE_VIA_VAAPI",full_hd),
+                               source_description_preproc_t(files[1], codec[1], "MFX_ACCEL_MODE_VIA_VAAPI",cif)));
+#endif
 
 class OneVPLSourcePerf_PP_Engine_Bypass_Test : public TestPerfParams<source_description_preproc_t> {};
 
@@ -265,8 +291,12 @@ PERF_TEST_P_(OneVPLSourcePerf_PP_Engine_Bypass_Test, TestPerformance)
     std::unique_ptr<VPLAccelerationPolicy> policy;
     if (mode == "MFX_ACCEL_MODE_VIA_D3D11") {
         policy.reset(new VPLDX11AccelerationPolicy(device_selector));
-    } else {
+    } else if (mode == "MFX_ACCEL_MODE_VIA_VAAPI") {
+        policy.reset(new VPLVAAPIAccelerationPolicy(device_selector));
+    } else if (mode.empty()){
         policy.reset(new VPLCPUAccelerationPolicy(device_selector));
+    } else {
+        ASSERT_TRUE(false && "Unsupported acceleration policy type");
     }
     VPPPreprocEngine preproc_engine(std::move(policy));
     cv::gapi::wip::Data out;
@@ -288,6 +318,8 @@ static pp_out_param_t res_672x384 = pp_out_param_t {cv::MediaFormat::NV12,
                                                     {672, 384}};
 static pp_out_param_t res_336x256 = pp_out_param_t {cv::MediaFormat::NV12,
                                                     {336, 256}};
+
+#ifdef __WIN32__
 INSTANTIATE_TEST_CASE_P(Streaming_Engine_PP_Bypass, OneVPLSourcePerf_PP_Engine_Bypass_Test,
                         Values(source_description_preproc_t(files[0], codec[0], "", res_672x384),
                                source_description_preproc_t(files[0], codec[0], "MFX_ACCEL_MODE_VIA_D3D11", res_672x384),
@@ -295,6 +327,11 @@ INSTANTIATE_TEST_CASE_P(Streaming_Engine_PP_Bypass, OneVPLSourcePerf_PP_Engine_B
                                source_description_preproc_t(files[1], codec[1], "MFX_ACCEL_MODE_VIA_D3D11", res_672x384),
                                source_description_preproc_t(files[2], codec[2], "", res_336x256),
                                source_description_preproc_t(files[2], codec[2], "MFX_ACCEL_MODE_VIA_D3D11", res_336x256)));
+#elif __linux__
+INSTANTIATE_TEST_CASE_P(Streaming_Engine_PP_Bypass, OneVPLSourcePerf_PP_Engine_Bypass_Test,
+                        Values(source_description_preproc_t(files[0], codec[0], "MFX_ACCEL_MODE_VIA_VAAPI", res_672x384),
+                               source_description_preproc_t(files[1], codec[1], "MFX_ACCEL_MODE_VIA_VAAPI", res_672x384)));
+#endif
 } // namespace opencv_test
 
 #endif // HAVE_ONEVPL
diff --git a/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_cpu.cpp b/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_cpu.cpp
index 0a5f8f4a35..67ffdf9377 100644
--- a/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_cpu.cpp
+++ b/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_cpu.cpp
@@ -11,6 +11,7 @@
 #include "streaming/onevpl/accelerators/accel_policy_cpu.hpp"
 #include "streaming/onevpl/accelerators/surface/cpu_frame_adapter.hpp"
 #include "streaming/onevpl/accelerators/surface/surface.hpp"
+#include "streaming/onevpl/utils.hpp"
 #include "logger.hpp"
 
 #ifdef _WIN32
@@ -22,7 +23,7 @@ namespace gapi {
 namespace wip {
 namespace onevpl {
 namespace utils {
-mfxU32 GetSurfaceSize_(mfxU32 FourCC, mfxU32 width, mfxU32 height) {
+static mfxU32 GetSurfaceSize_(mfxU32 FourCC, mfxU32 width, mfxU32 height) {
     mfxU32 nbytes = 0;
 
     mfxU32 half_width = width / 2;
@@ -47,10 +48,10 @@ mfxU32 GetSurfaceSize_(mfxU32 FourCC, mfxU32 width, mfxU32 height) {
     return nbytes;
 }
 
-surface_ptr_t create_surface_RGB4_(mfxFrameInfo frameInfo,
-                                   std::shared_ptr<void> out_buf_ptr,
-                                   size_t out_buf_ptr_offset,
-                                   size_t out_buf_size)
+static surface_ptr_t create_surface_RGB4_(mfxFrameInfo frameInfo,
+                                          std::shared_ptr<void> out_buf_ptr,
+                                          size_t out_buf_ptr_offset,
+                                          size_t out_buf_size)
 {
     mfxU8* buf = reinterpret_cast<mfxU8*>(out_buf_ptr.get());
     mfxU16 surfW = frameInfo.Width * 4;
@@ -80,10 +81,10 @@ surface_ptr_t create_surface_RGB4_(mfxFrameInfo frameInfo,
     return Surface::create_surface(std::move(handle), out_buf_ptr);
 }
 
-surface_ptr_t create_surface_other_(mfxFrameInfo frameInfo,
-                                    std::shared_ptr<void> out_buf_ptr,
-                                    size_t out_buf_ptr_offset,
-                                    size_t out_buf_size)
+static surface_ptr_t create_surface_other_(mfxFrameInfo frameInfo,
+                                           std::shared_ptr<void> out_buf_ptr,
+                                           size_t out_buf_ptr_offset,
+                                           size_t out_buf_size)
 {
     mfxU8* buf = reinterpret_cast<mfxU8*>(out_buf_ptr.get());
     mfxU16 surfH = frameInfo.Height;
@@ -155,8 +156,12 @@ VPLCPUAccelerationPolicy::create_surface_pool(size_t pool_size, size_t surface_s
     GAPI_LOG_DEBUG(nullptr, "page size: " << page_size_bytes << ", preallocated_raw_bytes: " << preallocated_raw_bytes);
     preallocated_pool_memory_ptr = _aligned_malloc(preallocated_raw_bytes, page_size_bytes);
 #else
-    GAPI_Assert(false && "Compatibility is not tested for systems differ than \"_WIN32\". "
-                         "Please feel free to set it up under OPENCV contribution policy");
+    int err = posix_memalign(&preallocated_pool_memory_ptr, page_size_bytes, preallocated_raw_bytes);
+    if (err) {
+        GAPI_LOG_WARNING(nullptr, "Cannot allocate aligned memory, size: " << preallocated_raw_bytes <<
+                                  ", alignment: " << page_size_bytes << ", error: " <<
+                                  strerror(err));
+    }
 #endif
 
     if (!preallocated_pool_memory_ptr) {
@@ -173,8 +178,9 @@ VPLCPUAccelerationPolicy::create_surface_pool(size_t pool_size, size_t surface_s
         GAPI_LOG_INFO(nullptr, "Released workspace memory: " << ptr);
         ptr = nullptr;
 #else
-        GAPI_Assert(false && "Not implemented for systems differ than \"_WIN32\". "
-                             "Please feel free to set it up under OPENCV contribution policy");
+        free(ptr);
+        GAPI_LOG_INFO(nullptr, "Released workspace memory: " << ptr);
+        ptr = nullptr;
 #endif
 
         });
@@ -220,10 +226,8 @@ VPLCPUAccelerationPolicy::create_surface_pool(const mfxFrameAllocRequest& alloc_
                                                       info.Width,
                                                       info.Height);
     if (!singleSurfaceSize) {
-        throw std::runtime_error("Cannot determine surface size for: fourCC: " +
-                                 std::to_string(info.FourCC) +
-                                 ", width: " + std::to_string(info.Width) +
-                                 ", height: " + std::to_string(info.Height));
+        throw std::runtime_error("Cannot determine surface size from frame: " +
+                                 mfx_frame_info_to_string(info));
     }
 
     auto surface_creator =
diff --git a/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_dx11.cpp b/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_dx11.cpp
index 932d7d7842..dba05f0169 100644
--- a/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_dx11.cpp
+++ b/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_dx11.cpp
@@ -13,8 +13,7 @@
 #include "streaming/onevpl/utils.hpp"
 #include "logger.hpp"
 
-#ifdef HAVE_DIRECTX
-#ifdef HAVE_D3D11
+#if defined(HAVE_DIRECTX) && defined(HAVE_D3D11)
 #pragma comment(lib,"d3d11.lib")
 
 #define D3D11_NO_HELPERS
@@ -441,6 +440,52 @@ mfxStatus VPLDX11AccelerationPolicy::on_free(mfxFrameAllocResponse *response) {
 } // namespace wip
 } // namespace gapi
 } // namespace cv
-#endif // HAVE_D3D11
-#endif // HAVE_DIRECTX
+
+#else // #if defined(HAVE_DIRECTX) && defined(HAVE_D3D11)
+
+namespace cv {
+namespace gapi {
+namespace wip {
+namespace onevpl {
+VPLDX11AccelerationPolicy::VPLDX11AccelerationPolicy(device_selector_ptr_t selector) :
+    VPLAccelerationPolicy(selector) {
+    GAPI_Assert(false && "VPLDX11AccelerationPolicy unavailable in current configuration");
+}
+
+VPLDX11AccelerationPolicy::~VPLDX11AccelerationPolicy() = default;
+
+void VPLDX11AccelerationPolicy::init(session_t ) {
+    GAPI_Assert(false && "VPLDX11AccelerationPolicy unavailable in current configuration");
+}
+
+void VPLDX11AccelerationPolicy::deinit(session_t) {
+    GAPI_Assert(false && "VPLDX11AccelerationPolicy unavailable in current configuration");
+}
+
+VPLDX11AccelerationPolicy::pool_key_t VPLDX11AccelerationPolicy::create_surface_pool(const mfxFrameAllocRequest&,
+                                                                                     mfxFrameInfo&) {
+    GAPI_Assert(false && "VPLDX11AccelerationPolicy unavailable in current configuration");
+}
+
+VPLDX11AccelerationPolicy::surface_weak_ptr_t VPLDX11AccelerationPolicy::get_free_surface(pool_key_t) {
+    GAPI_Assert(false && "VPLDX11AccelerationPolicy unavailable in current configuration");
+}
+
+size_t VPLDX11AccelerationPolicy::get_free_surface_count(pool_key_t) const {
+    GAPI_Assert(false && "VPLDX11AccelerationPolicy unavailable in current configuration");
+}
+
+size_t VPLDX11AccelerationPolicy::get_surface_count(pool_key_t) const {
+    GAPI_Assert(false && "VPLDX11AccelerationPolicy unavailable in current configuration");
+}
+
+cv::MediaFrame::AdapterPtr VPLDX11AccelerationPolicy::create_frame_adapter(pool_key_t,
+                                                                          const FrameConstructorArgs &) {
+    GAPI_Assert(false && "VPLDX11AccelerationPolicy unavailable in current configuration");
+}
+} // namespace onevpl
+} // namespace wip
+} // namespace gapi
+} // namespace cv
+#endif // #if defined(HAVE_DIRECTX) && defined(HAVE_D3D11)
 #endif // HAVE_ONEVPL
diff --git a/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_dx11.hpp b/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_dx11.hpp
index 61513a45af..f82ae64537 100644
--- a/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_dx11.hpp
+++ b/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_dx11.hpp
@@ -15,8 +15,7 @@
 #include "streaming/onevpl/accelerators/surface/surface_pool.hpp"
 #include "streaming/onevpl/accelerators/dx11_alloc_resource.hpp"
 
-#ifdef HAVE_DIRECTX
-#ifdef HAVE_D3D11
+#if defined(HAVE_DIRECTX) && defined(HAVE_D3D11)
 #define D3D11_NO_HELPERS
 #define NOMINMAX
 #include <d3d11.h>
@@ -24,7 +23,9 @@
 #include "opencv2/core/directx.hpp"
 #ifdef HAVE_OPENCL
 #include <CL/cl_d3d11.h>
-#endif
+#endif // HAVE_OPENCL
+#undef NOMINMAX
+#endif // HAVE_DIRECTX && HAVE_D3D11
 
 namespace cv {
 namespace gapi {
@@ -51,6 +52,8 @@ struct GAPI_EXPORTS VPLDX11AccelerationPolicy final: public VPLAccelerationPolic
     cv::MediaFrame::AdapterPtr create_frame_adapter(pool_key_t key,
                                                     const FrameConstructorArgs &params) override;
 private:
+#ifdef HAVE_DIRECTX
+#ifdef HAVE_D3D11
     ID3D11Device *hw_handle;
     ID3D11DeviceContext* device_context;
 
@@ -75,14 +78,13 @@ private:
     std::map<alloc_id_t, allocation_t> allocation_table;
 
     std::map<pool_key_t, pool_t> pool_table;
+#endif // HAVE_D3D11
+#endif // HAVE_DIRECTX
 };
 } // namespace onevpl
 } // namespace wip
 } // namespace gapi
 } // namespace cv
-#undef NOMINMAX
-#endif // HAVE_D3D11
-#endif // HAVE_DIRECTX
 
 #endif // HAVE_ONEVPL
 #endif // GAPI_STREAMING_ONEVPL_ACCELERATORS_ACCEL_POLICY_DX11_HPP
diff --git a/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_va_api.cpp b/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_va_api.cpp
new file mode 100644
index 0000000000..8fa0be9914
--- /dev/null
+++ b/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_va_api.cpp
@@ -0,0 +1,150 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifdef HAVE_ONEVPL
+#include <cstdlib>
+#include <exception>
+#include <stdint.h>
+
+#ifdef __linux__
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#endif // __linux__
+
+#include "streaming/onevpl/accelerators/accel_policy_va_api.hpp"
+#include "streaming/onevpl/accelerators/accel_policy_cpu.hpp"
+#include "streaming/onevpl/utils.hpp"
+#include "logger.hpp"
+
+namespace cv {
+namespace gapi {
+namespace wip {
+namespace onevpl {
+#ifdef __linux__
+VPLVAAPIAccelerationPolicy::VPLVAAPIAccelerationPolicy(device_selector_ptr_t selector) :
+    VPLAccelerationPolicy(selector),
+    cpu_dispatcher(new VPLCPUAccelerationPolicy(selector)),
+    va_handle(),
+    device_fd(-1) {
+#if defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+    // TODO Move it out in device selector
+    device_fd = open("/dev/dri/renderD128", O_RDWR);
+    if (device_fd < 0) {
+        GAPI_LOG_WARNING(nullptr, "VAAPI device descriptor \"/dev/dri/renderD128\" has not found");
+        throw std::runtime_error("cannot open VAAPI device");
+    }
+    va_handle = vaGetDisplayDRM(device_fd);
+    if (!va_handle) {
+        GAPI_LOG_WARNING(nullptr, "VAAPI device vaGetDisplayDRM failed, error: " << strerror(errno));
+        close(device_fd);
+        throw std::runtime_error("vaGetDisplayDRM failed");
+    }
+    int major_version = 0, minor_version = 0;
+    VAStatus status {};
+    status = vaInitialize(va_handle, &major_version, &minor_version);
+    if (VA_STATUS_SUCCESS != status) {
+        GAPI_LOG_WARNING(nullptr, "Cannot initialize VAAPI device, error: " << vaErrorStr(status));
+        close(device_fd);
+        throw std::runtime_error("vaInitialize failed");
+    }
+    GAPI_LOG_INFO(nullptr, "created");
+#else  // defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+    GAPI_Assert(false && "VPLVAAPIAccelerationPolicy unavailable in current configuration");
+#endif // defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+}
+
+VPLVAAPIAccelerationPolicy::~VPLVAAPIAccelerationPolicy() {
+    vaTerminate(va_handle);
+    close(device_fd);
+    GAPI_LOG_INFO(nullptr, "destroyed");
+}
+
+void VPLVAAPIAccelerationPolicy::init(session_t session) {
+    GAPI_LOG_INFO(nullptr, "session: " << session);
+
+    cpu_dispatcher->init(session);
+    mfxStatus sts = MFXVideoCORE_SetHandle(session,
+                                           static_cast<mfxHandleType>(MFX_HANDLE_VA_DISPLAY),
+                                           va_handle);
+    if (sts != MFX_ERR_NONE)
+    {
+        throw std::logic_error("Cannot create VPLVAAPIAccelerationPolicy, MFXVideoCORE_SetHandle error: " +
+                               mfxstatus_to_string(sts));
+    }
+    GAPI_LOG_INFO(nullptr, "finished successfully, session: " << session);
+}
+
+void VPLVAAPIAccelerationPolicy::deinit(session_t session) {
+    GAPI_LOG_INFO(nullptr, "session: " << session);
+}
+
+VPLVAAPIAccelerationPolicy::pool_key_t
+VPLVAAPIAccelerationPolicy::create_surface_pool(const mfxFrameAllocRequest& alloc_request, mfxFrameInfo& info) {
+
+    return cpu_dispatcher->create_surface_pool(alloc_request, info);
+}
+
+VPLVAAPIAccelerationPolicy::surface_weak_ptr_t VPLVAAPIAccelerationPolicy::get_free_surface(pool_key_t key) {
+    return cpu_dispatcher->get_free_surface(key);
+}
+
+size_t VPLVAAPIAccelerationPolicy::get_free_surface_count(pool_key_t key) const {
+    return cpu_dispatcher->get_free_surface_count(key);
+}
+
+size_t VPLVAAPIAccelerationPolicy::get_surface_count(pool_key_t key) const {
+    return cpu_dispatcher->get_surface_count(key);
+}
+
+cv::MediaFrame::AdapterPtr VPLVAAPIAccelerationPolicy::create_frame_adapter(pool_key_t key,
+                                                                          const FrameConstructorArgs &params) {
+    return cpu_dispatcher->create_frame_adapter(key, params);
+}
+
+#else // __linux__
+
+VPLVAAPIAccelerationPolicy::VPLVAAPIAccelerationPolicy(device_selector_ptr_t selector) :
+    VPLAccelerationPolicy(selector) {
+    GAPI_Assert(false && "VPLVAAPIAccelerationPolicy unavailable in current configuration");
+}
+
+VPLVAAPIAccelerationPolicy::~VPLVAAPIAccelerationPolicy() = default;
+
+void VPLVAAPIAccelerationPolicy::init(session_t ) {
+    GAPI_Assert(false && "VPLVAAPIAccelerationPolicy unavailable in current configuration");
+}
+
+void VPLVAAPIAccelerationPolicy::deinit(session_t) {
+    GAPI_Assert(false && "VPLVAAPIAccelerationPolicy unavailable in current configuration");
+}
+
+VPLVAAPIAccelerationPolicy::pool_key_t VPLVAAPIAccelerationPolicy::create_surface_pool(const mfxFrameAllocRequest&,
+                                                                                     mfxFrameInfo&) {
+    GAPI_Assert(false && "VPLVAAPIAccelerationPolicy unavailable in current configuration");
+}
+
+VPLVAAPIAccelerationPolicy::surface_weak_ptr_t VPLVAAPIAccelerationPolicy::get_free_surface(pool_key_t) {
+    GAPI_Assert(false && "VPLVAAPIAccelerationPolicy unavailable in current configuration");
+}
+
+size_t VPLVAAPIAccelerationPolicy::get_free_surface_count(pool_key_t) const {
+    GAPI_Assert(false && "VPLVAAPIAccelerationPolicy unavailable in current configuration");
+}
+
+size_t VPLVAAPIAccelerationPolicy::get_surface_count(pool_key_t) const {
+    GAPI_Assert(false && "VPLVAAPIAccelerationPolicy unavailable in current configuration");
+}
+
+cv::MediaFrame::AdapterPtr VPLVAAPIAccelerationPolicy::create_frame_adapter(pool_key_t,
+                                                                          const FrameConstructorArgs &) {
+    GAPI_Assert(false && "VPLVAAPIAccelerationPolicy unavailable in current configuration");
+}
+#endif // __linux__
+} // namespace onevpl
+} // namespace wip
+} // namespace gapi
+} // namespace cv
+#endif // HAVE_ONEVPL
diff --git a/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_va_api.hpp b/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_va_api.hpp
new file mode 100644
index 0000000000..ee7453f982
--- /dev/null
+++ b/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_va_api.hpp
@@ -0,0 +1,62 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef GAPI_STREAMING_ONEVPL_ACCELERATORS_ACCEL_POLICY_VA_API_HPP
+#define GAPI_STREAMING_ONEVPL_ACCELERATORS_ACCEL_POLICY_VA_API_HPP
+
+#include <map>
+#include <vector>
+
+#include "opencv2/gapi/own/exports.hpp" // GAPI_EXPORTS
+
+#ifdef HAVE_ONEVPL
+#include "streaming/onevpl/accelerators/accel_policy_interface.hpp"
+#include "streaming/onevpl/accelerators/surface/surface_pool.hpp"
+
+#ifdef __linux__
+#if defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+#include "va/va.h"
+#include "va/va_drm.h"
+#else
+    typedef void* VADisplay;
+#endif // defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+#endif // __linux__
+
+namespace cv {
+namespace gapi {
+namespace wip {
+namespace onevpl {
+
+// GAPI_EXPORTS for tests
+struct GAPI_EXPORTS VPLVAAPIAccelerationPolicy final : public VPLAccelerationPolicy
+{
+    VPLVAAPIAccelerationPolicy(device_selector_ptr_t selector);
+    ~VPLVAAPIAccelerationPolicy();
+
+    using pool_t = CachedPool;
+
+    void init(session_t session) override;
+    void deinit(session_t session) override;
+    pool_key_t create_surface_pool(const mfxFrameAllocRequest& alloc_request, mfxFrameInfo& info) override;
+    surface_weak_ptr_t get_free_surface(pool_key_t key) override;
+    size_t get_free_surface_count(pool_key_t key) const override;
+    size_t get_surface_count(pool_key_t key) const override;
+
+    cv::MediaFrame::AdapterPtr create_frame_adapter(pool_key_t key,
+                                                    const FrameConstructorArgs& args) override;
+
+private:
+    std::unique_ptr<VPLAccelerationPolicy> cpu_dispatcher;
+#ifdef __linux__
+    VADisplay va_handle;
+    int device_fd;  // TODO Move it out in device selector
+#endif // __linux__
+};
+} // namespace onevpl
+} // namespace wip
+} // namespace gapi
+} // namespace cv
+
+#endif // HAVE_ONEVPL
+#endif // GAPI_STREAMING_ONEVPL_ACCELERATORS_ACCEL_POLICY_VA_API_HPP
diff --git a/modules/gapi/src/streaming/onevpl/accelerators/surface/base_frame_adapter.cpp b/modules/gapi/src/streaming/onevpl/accelerators/surface/base_frame_adapter.cpp
index 76da3dbe50..6f2dbd137a 100644
--- a/modules/gapi/src/streaming/onevpl/accelerators/surface/base_frame_adapter.cpp
+++ b/modules/gapi/src/streaming/onevpl/accelerators/surface/base_frame_adapter.cpp
@@ -59,7 +59,7 @@ std::shared_ptr<Surface> BaseFrameAdapter::surface() {
     return parent_surface_ptr;
 }
 
-const BaseFrameAdapter::SessionHandle BaseFrameAdapter::get_session_handle() const {
+BaseFrameAdapter::SessionHandle BaseFrameAdapter::get_session_handle() const {
     return parent_handle;
 }
 
diff --git a/modules/gapi/src/streaming/onevpl/accelerators/surface/base_frame_adapter.hpp b/modules/gapi/src/streaming/onevpl/accelerators/surface/base_frame_adapter.hpp
index a3dfcf542f..4231131b84 100644
--- a/modules/gapi/src/streaming/onevpl/accelerators/surface/base_frame_adapter.hpp
+++ b/modules/gapi/src/streaming/onevpl/accelerators/surface/base_frame_adapter.hpp
@@ -23,7 +23,7 @@ public:
     using SessionHandle = mfxSession;
 
     const std::shared_ptr<Surface>& get_surface() const;
-    const SessionHandle get_session_handle() const;
+    SessionHandle get_session_handle() const;
 
     cv::GFrameDesc meta() const override;
     AccelType accel_type() const;
diff --git a/modules/gapi/src/streaming/onevpl/accelerators/surface/dx11_frame_adapter.cpp b/modules/gapi/src/streaming/onevpl/accelerators/surface/dx11_frame_adapter.cpp
index 885fa1589a..fad26e50a8 100644
--- a/modules/gapi/src/streaming/onevpl/accelerators/surface/dx11_frame_adapter.cpp
+++ b/modules/gapi/src/streaming/onevpl/accelerators/surface/dx11_frame_adapter.cpp
@@ -12,6 +12,8 @@
 #ifdef HAVE_ONEVPL
 #include "streaming/onevpl/onevpl_export.hpp"
 
+#ifdef HAVE_DIRECTX
+#ifdef HAVE_D3D11
 #ifdef HAVE_INF_ENGINE
 // For IE classes (ParamMap, etc)
 #include <inference_engine.hpp>
@@ -202,4 +204,6 @@ DXGI_FORMAT VPLMediaFrameDX11Adapter::get_dx11_color_format(uint32_t mfx_fourcc)
 } // namespace wip
 } // namespace gapi
 } // namespace cv
+#endif // HAVE_D3D11
+#endif // HAVE_DIRECTX
 #endif // HAVE_ONEVPL
diff --git a/modules/gapi/src/streaming/onevpl/accelerators/surface/surface.cpp b/modules/gapi/src/streaming/onevpl/accelerators/surface/surface.cpp
index c09dc80338..88df8b2f94 100644
--- a/modules/gapi/src/streaming/onevpl/accelerators/surface/surface.cpp
+++ b/modules/gapi/src/streaming/onevpl/accelerators/surface/surface.cpp
@@ -4,6 +4,7 @@
 //
 // Copyright (C) 2021 Intel Corporation
 
+#include <opencv2/gapi/own/assert.hpp>
 #include "streaming/onevpl/accelerators/surface/surface.hpp"
 #include "logger.hpp"
 
@@ -20,12 +21,12 @@ Surface::Surface(std::unique_ptr<handle_t>&& surf, std::shared_ptr<void> associa
     mirrored_locked_count() {
 
     GAPI_Assert(mfx_surface && "Surface is nullptr");
-    GAPI_LOG_DEBUG(nullptr, "create surface: " << mfx_surface <<
+    GAPI_LOG_DEBUG(nullptr, "create surface: " << get_handle() <<
                             ", locked count: " << mfx_surface->Data.Locked);
 }
 
 Surface::~Surface() {
-    GAPI_LOG_DEBUG(nullptr, "destroy surface: " << mfx_surface <<
+    GAPI_LOG_DEBUG(nullptr, "destroy surface: " << get_handle() <<
                             ", worspace memory counter: " <<
                             workspace_memory_ptr.use_count());
 }
@@ -60,7 +61,7 @@ size_t Surface::get_locks_count() const {
 
 size_t Surface::obtain_lock() {
     size_t locked_count = mirrored_locked_count.fetch_add(1);
-    GAPI_LOG_DEBUG(nullptr, "surface: " << mfx_surface.get() <<
+    GAPI_LOG_DEBUG(nullptr, "surface: " << get_handle() <<
                             ", locked times: " << locked_count + 1);
     return locked_count; // return preceding value
 }
@@ -68,7 +69,7 @@ size_t Surface::obtain_lock() {
 size_t Surface::release_lock() {
     size_t locked_count = mirrored_locked_count.fetch_sub(1);
     GAPI_Assert(locked_count && "Surface lock counter is invalid");
-    GAPI_LOG_DEBUG(nullptr, "surface: " << mfx_surface.get() <<
+    GAPI_LOG_DEBUG(nullptr, "surface: " << get_handle() <<
                             ", locked times: " << locked_count - 1);
     return locked_count; // return preceding value
 }
diff --git a/modules/gapi/src/streaming/onevpl/accelerators/surface/surface_pool.cpp b/modules/gapi/src/streaming/onevpl/accelerators/surface/surface_pool.cpp
index bd6a0c69d1..4547413693 100644
--- a/modules/gapi/src/streaming/onevpl/accelerators/surface/surface_pool.cpp
+++ b/modules/gapi/src/streaming/onevpl/accelerators/surface/surface_pool.cpp
@@ -1,3 +1,4 @@
+#include <opencv2/gapi/own/assert.hpp>
 #include "streaming/onevpl/accelerators/surface/surface_pool.hpp"
 #include "streaming/onevpl/accelerators/surface/surface.hpp"
 #include "logger.hpp"
diff --git a/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.cpp b/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.cpp
index b3beb71fb1..90bf3e8849 100644
--- a/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.cpp
+++ b/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.cpp
@@ -9,6 +9,7 @@
 #include <opencv2/gapi/own/assert.hpp>
 #include <opencv2/gapi/util/variant.hpp>
 
+#include <opencv2/gapi/streaming/onevpl/device_selector_interface.hpp>
 #include "streaming/onevpl/cfg_param_device_selector.hpp"
 #include "streaming/onevpl/cfg_params_parser.hpp"
 #include "streaming/onevpl/utils.hpp"
@@ -26,25 +27,27 @@
 #pragma comment(lib, "dxgi")
 #undef D3D11_NO_HELPERS
 #undef NOMINMAX
+#endif // HAVE_D3D11
+#endif // HAVE_DIRECTX
 
 #include <codecvt>
 #include "opencv2/core/directx.hpp"
-#ifdef HAVE_OPENCL
-#include <CL/cl_d3d11.h>
-#endif
 
 namespace cv {
 namespace gapi {
 namespace wip {
 namespace onevpl {
 
-std::vector<CfgParam> insertCfgparam(std::vector<CfgParam> &&param_array, AccelType type) {
+static std::vector<CfgParam> insertCfgparam(std::vector<CfgParam> &&param_array, AccelType type) {
     switch (type) {
         case AccelType::HOST:
             break;
         case AccelType::DX11:
             param_array.push_back(CfgParam::create_acceleration_mode(MFX_ACCEL_MODE_VIA_D3D11));
             break;
+        case AccelType::VAAPI:
+            param_array.push_back(CfgParam::create_acceleration_mode(MFX_IMPL_VIA_VAAPI));
+            break;
         default:
             GAPI_DbgAssert(false && "Unexpected AccelType");
             break;
@@ -71,8 +74,7 @@ CfgParamDeviceSelector::CfgParamDeviceSelector(const CfgParams& cfg_params) :
 
     switch(accel_mode.Data.U32) {
         case MFX_ACCEL_MODE_VIA_D3D11: {
-#ifdef HAVE_DIRECTX
-#ifdef HAVE_D3D11
+#if defined(HAVE_DIRECTX) && defined(HAVE_D3D11)
             ID3D11Device *hw_handle = nullptr;
             ID3D11DeviceContext* device_context = nullptr;
 
@@ -141,14 +143,17 @@ CfgParamDeviceSelector::CfgParamDeviceSelector(const CfgParams& cfg_params) :
 
             suggested_device = IDeviceSelector::create<Device>(hw_handle, "GPU", AccelType::DX11);
             suggested_context = IDeviceSelector::create<Context>(device_context, AccelType::DX11);
-#else
+#else  // defined(HAVE_DIRECTX) && defined(HAVE_D3D11)
             GAPI_LOG_WARNING(nullptr, "Unavailable \"" <<  CfgParam::acceleration_mode_name() << ": MFX_ACCEL_MODE_VIA_D3D11\""
                                       "was chosen for current project configuration");
             throw std::logic_error(std::string("Unsupported \"") +
                                    CfgParam::acceleration_mode_name() +
                                    ": MFX_ACCEL_MODE_VIA_D3D11\"");
-#endif // HAVE_DIRECTX
-#endif // HAVE_D3D11
+#endif // defined(HAVE_DIRECTX) && defined(HAVE_D3D11)
+            break;
+        }
+        case MFX_IMPL_VIA_VAAPI : {
+            GAPI_LOG_WARNING(nullptr, "TODO MFX_IMPL_VIA_VAAPI falls back to CPU case")
             break;
         }
         case MFX_ACCEL_MODE_NA: {
@@ -198,10 +203,10 @@ CfgParamDeviceSelector::CfgParamDeviceSelector(Device::Ptr device_ptr,
     }
     mfxVariant accel_mode = cfg_param_to_mfx_variant(*accel_mode_it);
 
+    cv::util::suppress_unused_warning(device_id);
     switch(accel_mode.Data.U32) {
         case MFX_ACCEL_MODE_VIA_D3D11: {
-#ifdef HAVE_DIRECTX
-#ifdef HAVE_D3D11
+#if defined(HAVE_DIRECTX) && defined(HAVE_D3D11)
             suggested_device = IDeviceSelector::create<Device>(device_ptr, device_id, AccelType::DX11);
             ID3D11Device* dx_device_ptr =
                 reinterpret_cast<ID3D11Device*>(suggested_device.get_ptr());
@@ -220,14 +225,13 @@ CfgParamDeviceSelector::CfgParamDeviceSelector(Device::Ptr device_ptr,
             }
 
             dx_ctx_ptr->AddRef();
-#else
+#else  // defined(HAVE_DIRECTX) && defined(HAVE_D3D11)
             GAPI_LOG_WARNING(nullptr, "Unavailable \"" <<  CfgParam::acceleration_mode_name() <<
                                       ": MFX_ACCEL_MODE_VIA_D3D11\""
                                       "was chosen for current project configuration");
             throw std::logic_error(std::string("Unsupported \"") +
                                    CfgParam::acceleration_mode_name() + ": MFX_ACCEL_MODE_VIA_D3D11\"");
-#endif // HAVE_DIRECTX
-#endif // HAVE_D3D11
+#endif // #if defined(HAVE_DIRECTX) && defined(HAVE_D3D11)
             break;
         }
         case MFX_ACCEL_MODE_NA: {
@@ -253,8 +257,7 @@ CfgParamDeviceSelector::CfgParamDeviceSelector(const Device &device,
 
     switch(device.get_type()) {
         case AccelType::DX11: {
-#ifdef HAVE_DIRECTX
-#ifdef HAVE_D3D11
+#if defined(HAVE_DIRECTX) && defined(HAVE_D3D11)
             ID3D11Device* dx_device_ptr =
                 reinterpret_cast<ID3D11Device*>(suggested_device.get_ptr());
             dx_device_ptr->AddRef();
@@ -272,15 +275,17 @@ CfgParamDeviceSelector::CfgParamDeviceSelector(const Device &device,
 
             dx_ctx_ptr->AddRef();
             break;
-#else
+#else // defined(HAVE_DIRECTX) && defined(HAVE_D3D11)
             GAPI_LOG_WARNING(nullptr, "Unavailable \"" <<  CfgParam::acceleration_mode_name() <<
                                       ": MFX_ACCEL_MODE_VIA_D3D11\""
                                       "was chosen for current project configuration");
             throw std::logic_error(std::string("Unsupported \"") +
                                    CfgParam::acceleration_mode_name() + ": MFX_ACCEL_MODE_VIA_D3D11\"");
-#endif // HAVE_DIRECTX
-#endif // HAVE_D3D11
+#endif // defined(HAVE_DIRECTX) && defined(HAVE_D3D11)
         }
+        case AccelType::VAAPI:
+            GAPI_LOG_WARNING(nullptr, "TODO MFX_IMPL_VIA_VAAPI falls back to CPU case")
+            break;
         case AccelType::HOST:
             break;
         default:
@@ -299,14 +304,12 @@ CfgParamDeviceSelector::~CfgParamDeviceSelector() {
             //nothing to do
             break;
         case AccelType::DX11: {
-#ifdef HAVE_DIRECTX
-#ifdef HAVE_D3D11
+#if defined(HAVE_DIRECTX) && defined(HAVE_D3D11)
             ID3D11DeviceContext* device_ctx_ptr =
                 reinterpret_cast<ID3D11DeviceContext*>(suggested_context.get_ptr());
             device_ctx_ptr->Release();
             device_ctx_ptr = nullptr;
-#endif // HAVE_DIRECTX
-#endif // HAVE_D3D11
+#endif // defined(HAVE_DIRECTX) && defined(HAVE_D3D11)
             break;
         }
         default:
@@ -322,13 +325,11 @@ CfgParamDeviceSelector::~CfgParamDeviceSelector() {
             //nothing to do
             break;
         case AccelType::DX11: {
-#ifdef HAVE_DIRECTX
-#ifdef HAVE_D3D11
+#if defined(HAVE_DIRECTX) && defined(HAVE_D3D11)
             ID3D11Device* device_ptr = reinterpret_cast<ID3D11Device*>(suggested_device.get_ptr());
             device_ptr->Release();
             device_ptr = nullptr;
-#endif // HAVE_DIRECTX
-#endif // HAVE_D3D11
+#endif // defined(HAVE_DIRECTX) && defined(HAVE_D3D11)
             break;
         }
         default:
@@ -337,7 +338,7 @@ CfgParamDeviceSelector::~CfgParamDeviceSelector() {
 }
 
 CfgParamDeviceSelector::DeviceScoreTable CfgParamDeviceSelector::select_devices() const {
-    return {std::make_pair(Score::MaxActivePriority, suggested_device)};
+    return {std::make_pair(Score::Type(Score::MaxActivePriority), suggested_device)};
 }
 
 CfgParamDeviceSelector::DeviceContexts CfgParamDeviceSelector::select_context() {
@@ -348,6 +349,4 @@ CfgParamDeviceSelector::DeviceContexts CfgParamDeviceSelector::select_context()
 } // namespace wip
 } // namespace gapi
 } // namespace cv
-#endif // HAVE_D3D11
-#endif // HAVE_DIRECTX
 #endif // HAVE_ONEVPL
diff --git a/modules/gapi/src/streaming/onevpl/cfg_params_parser.cpp b/modules/gapi/src/streaming/onevpl/cfg_params_parser.cpp
index afb92f4443..9f5a68a431 100644
--- a/modules/gapi/src/streaming/onevpl/cfg_params_parser.cpp
+++ b/modules/gapi/src/streaming/onevpl/cfg_params_parser.cpp
@@ -87,15 +87,15 @@ std::vector<ValueType> get_params_from_string(const std::string& str) {
 
         ParamCreator<ValueType> creator;
         if (name == CfgParam::implementation_name()) {
-            ret.push_back(creator.create<mfxU32>(name, cstr_to_mfx_impl(value.c_str())));
+            ret.push_back(creator.template create<mfxU32>(name, cstr_to_mfx_impl(value.c_str())));
         } else if (name == CfgParam::decoder_id_name()) {
-            ret.push_back(creator.create<mfxU32>(name, cstr_to_mfx_codec_id(value.c_str())));
+            ret.push_back(creator.template create<mfxU32>(name, cstr_to_mfx_codec_id(value.c_str())));
         } else if (name == CfgParam::acceleration_mode_name()) {
-            ret.push_back(creator.create<mfxU32>(name, cstr_to_mfx_accel_mode(value.c_str())));
+            ret.push_back(creator.template create<mfxU32>(name, cstr_to_mfx_accel_mode(value.c_str())));
         } else if (name == "mfxImplDescription.ApiVersion.Version") {
-            ret.push_back(creator.create<mfxU32>(name, cstr_to_mfx_version(value.c_str())));
+            ret.push_back(creator.template create<mfxU32>(name, cstr_to_mfx_version(value.c_str())));
         } else if ((name == CfgParam::frames_pool_size_name()) || (name == CfgParam::vpp_frames_pool_size_name())) {
-            ret.push_back(creator.create(name, strtoull_or_throw(value.c_str()), false));
+            ret.push_back(creator.create(name, static_cast<mfxU32>(strtoull_or_throw(value.c_str()), false)));
         } else if ((name == CfgParam::vpp_in_width_name()) || (name == CfgParam::vpp_in_height_name()) ||
                    (name == CfgParam::vpp_in_crop_w_name()) || (name == CfgParam::vpp_in_crop_h_name()) ||
                    (name == CfgParam::vpp_in_crop_x_name()) || (name == CfgParam::vpp_in_crop_y_name()) ||
@@ -186,7 +186,7 @@ unsigned long strtoul_or_throw(const char* str) {
     errno = 0;
     unsigned long ret = strtoul(str, &end_ptr, 10);
     if ((end_ptr == str) ||
-        ((ret == ULONG_MAX || ret == LONG_MIN) && errno == ERANGE)) {
+        ((ret == ULONG_MAX) && errno == ERANGE)) {
             // nothing parsed from the string, handle errors or exit
         GAPI_LOG_WARNING(nullptr, "strtoul failed for: " << str);
         GAPI_Assert(false && "strtoul_or_throw");
@@ -199,7 +199,7 @@ size_t strtoull_or_throw(const char* str) {
     errno = 0;
     size_t ret = strtoull(str, &end_ptr, 10);
     if ((end_ptr == str) ||
-        ((ret == LONG_MAX || ret == LONG_MIN) && errno == ERANGE)) {
+        ((ret == ULLONG_MAX) && errno == ERANGE)) {
             // nothing parsed from the string, handle errors or exit
         GAPI_LOG_WARNING(nullptr, "strtoull failed for: " << str);
         GAPI_Assert(false && "strtoull_or_throw");
diff --git a/modules/gapi/src/streaming/onevpl/demux/async_mfp_demux_data_provider.cpp b/modules/gapi/src/streaming/onevpl/demux/async_mfp_demux_data_provider.cpp
index 85d7d1aaf0..5d139af824 100644
--- a/modules/gapi/src/streaming/onevpl/demux/async_mfp_demux_data_provider.cpp
+++ b/modules/gapi/src/streaming/onevpl/demux/async_mfp_demux_data_provider.cpp
@@ -14,6 +14,7 @@
 #pragma comment(lib, "mfreadwrite.lib")
 #endif // _WIN32
 
+#include <opencv2/gapi/own/assert.hpp>
 #include "streaming/onevpl/demux/async_mfp_demux_data_provider.hpp"
 #include "logger.hpp"
 
@@ -807,7 +808,7 @@ bool MFPAsyncDemuxDataProvider::fetch_bitstream_data(std::shared_ptr<mfx_bitstre
     return false;
 }
 
-bool MFPAsyncDemuxDataProvider::empty() const override {
+bool MFPAsyncDemuxDataProvider::empty() const {
     GAPI_Assert(false && "Unsupported: Microsoft Media Foundation is not available");
     return true;
 }
diff --git a/modules/gapi/src/streaming/onevpl/demux/async_mfp_demux_data_provider.hpp b/modules/gapi/src/streaming/onevpl/demux/async_mfp_demux_data_provider.hpp
index f63593a46e..aab237f413 100644
--- a/modules/gapi/src/streaming/onevpl/demux/async_mfp_demux_data_provider.hpp
+++ b/modules/gapi/src/streaming/onevpl/demux/async_mfp_demux_data_provider.hpp
@@ -14,6 +14,7 @@
 
 #ifdef HAVE_ONEVPL
 #include "streaming/onevpl/onevpl_export.hpp"
+#include <opencv2/gapi/streaming/onevpl/data_provider_interface.hpp>
 
 #ifdef _WIN32
 #define NOMINMAX
@@ -28,7 +29,6 @@
 #include <wmcodecdsp.h>
 #undef NOMINMAX
 
-#include <opencv2/gapi/streaming/onevpl/data_provider_interface.hpp>
 #include "streaming/onevpl/data_provider_defines.hpp"
 #include "streaming/onevpl/utils.hpp"
 
diff --git a/modules/gapi/src/streaming/onevpl/device_selector_interface.cpp b/modules/gapi/src/streaming/onevpl/device_selector_interface.cpp
index 404b2f3872..d58d1d3d3c 100644
--- a/modules/gapi/src/streaming/onevpl/device_selector_interface.cpp
+++ b/modules/gapi/src/streaming/onevpl/device_selector_interface.cpp
@@ -20,6 +20,8 @@ const char* to_cstring(AccelType type) {
             return "HOST";
         case AccelType::DX11:
             return "DX11";
+        case AccelType::VAAPI:
+            return "VAAPI";
         default:
             GAPI_DbgAssert(false && "Unexpected AccelType");
             break;
diff --git a/modules/gapi/src/streaming/onevpl/engine/decode/decode_engine_legacy.cpp b/modules/gapi/src/streaming/onevpl/engine/decode/decode_engine_legacy.cpp
index b80ad2e12e..34db1bebfa 100644
--- a/modules/gapi/src/streaming/onevpl/engine/decode/decode_engine_legacy.cpp
+++ b/modules/gapi/src/streaming/onevpl/engine/decode/decode_engine_legacy.cpp
@@ -175,6 +175,7 @@ VPLLegacyDecodeEngine::SessionParam VPLLegacyDecodeEngine::prepare_session_param
 
     // Prepare video param
     mfxVideoParam mfxDecParams {};
+    memset(&mfxDecParams, 0, sizeof(mfxDecParams));
     mfxDecParams.mfx.CodecId = decoder_id_name;
 
     // set memory stream direction according to acceleration policy device type
@@ -253,7 +254,8 @@ VPLLegacyDecodeEngine::SessionParam VPLLegacyDecodeEngine::prepare_session_param
 
     // Input parameters finished, now initialize decode
     // create decoder for session according to header recovered from source file
-
+    GAPI_LOG_INFO(nullptr, "Initialize decoder for session: " << mfx_session <<
+                           ", frame info: " << mfx_frame_info_to_string(mfxDecParams.mfx.FrameInfo));
     sts = MFXVideoDECODE_Init(mfx_session, &mfxDecParams);
     if (MFX_ERR_NONE != sts) {
         throw std::runtime_error("Error initializing Decode, error: " +
diff --git a/modules/gapi/src/streaming/onevpl/engine/decode/decode_session.hpp b/modules/gapi/src/streaming/onevpl/engine/decode/decode_session.hpp
index e87ce6cb65..676564f066 100644
--- a/modules/gapi/src/streaming/onevpl/engine/decode/decode_session.hpp
+++ b/modules/gapi/src/streaming/onevpl/engine/decode/decode_session.hpp
@@ -21,6 +21,9 @@ namespace gapi {
 namespace wip {
 namespace onevpl {
 class Surface;
+struct VPLAccelerationPolicy;
+class VPLLegacyDecodeEngine;
+
 class GAPI_EXPORTS LegacyDecodeSession : public EngineSession {
 public:
     friend class VPLLegacyDecodeEngine;
diff --git a/modules/gapi/src/streaming/onevpl/engine/engine_session.hpp b/modules/gapi/src/streaming/onevpl/engine/engine_session.hpp
index d38dad3812..c684941069 100644
--- a/modules/gapi/src/streaming/onevpl/engine/engine_session.hpp
+++ b/modules/gapi/src/streaming/onevpl/engine/engine_session.hpp
@@ -14,6 +14,7 @@
 #include <utility>
 #include <vector>
 
+#include "opencv2/gapi/util/optional.hpp"
 #include "opencv2/gapi/own/exports.hpp" // GAPI_EXPORTS
 #include <opencv2/gapi/streaming/onevpl/data_provider_interface.hpp>
 #include "streaming/onevpl/data_provider_defines.hpp"
@@ -31,7 +32,7 @@ namespace onevpl {
 struct GAPI_EXPORTS DecoderParams {
     std::shared_ptr<IDataProvider::mfx_bitstream> stream;
     mfxVideoParam param;
-    cv::optional<size_t> preallocated_frames_count;
+    cv::util::optional<size_t> preallocated_frames_count;
 };
 
 struct GAPI_EXPORTS TranscoderParams {
diff --git a/modules/gapi/src/streaming/onevpl/engine/preproc/preproc_engine.cpp b/modules/gapi/src/streaming/onevpl/engine/preproc/preproc_engine.cpp
index 2e9a9bebce..10ce92e20a 100644
--- a/modules/gapi/src/streaming/onevpl/engine/preproc/preproc_engine.cpp
+++ b/modules/gapi/src/streaming/onevpl/engine/preproc/preproc_engine.cpp
@@ -34,8 +34,8 @@ bool FrameInfoComparator::equal_to(const mfxFrameInfo& lhs, const mfxFrameInfo&
     return lhs == rhs;
 }
 
-void apply_roi(mfxFrameSurface1* surface_handle,
-               const cv::util::optional<cv::Rect> &opt_roi) {
+static void apply_roi(mfxFrameSurface1* surface_handle,
+                      const cv::util::optional<cv::Rect> &opt_roi) {
     if (opt_roi.has_value()) {
         const cv::Rect &roi = opt_roi.value();
         surface_handle->Info.CropX = static_cast<mfxU16>(roi.x);
@@ -176,7 +176,8 @@ pp_session VPPPreprocEngine::initialize_preproc(const pp_params& initial_frame_p
     const vpp_pp_params &params = initial_frame_param.get<vpp_pp_params>();
 
     // adjust preprocessing settings
-    mfxVideoParam mfxVPPParams{0};
+    mfxVideoParam mfxVPPParams{};
+    memset(&mfxVPPParams, 0, sizeof(mfxVideoParam));
     // NB: IN params for VPP session must be equal to decoded surface params
     mfxVPPParams.vpp.In = params.info;
 
diff --git a/modules/gapi/src/streaming/onevpl/engine/preproc/vpp_preproc_defines.hpp b/modules/gapi/src/streaming/onevpl/engine/preproc/vpp_preproc_defines.hpp
index 780c9cf5d7..e500c6466e 100644
--- a/modules/gapi/src/streaming/onevpl/engine/preproc/vpp_preproc_defines.hpp
+++ b/modules/gapi/src/streaming/onevpl/engine/preproc/vpp_preproc_defines.hpp
@@ -16,12 +16,18 @@ namespace gapi {
 namespace wip {
 namespace onevpl {
 struct vpp_pp_params {
+    vpp_pp_params() : handle(), info(), reserved() {}
+    vpp_pp_params(mfxSession s, mfxFrameInfo i, void *r = nullptr) :
+        handle(s), info(i), reserved(r) {}
     mfxSession handle;
     mfxFrameInfo info;
     void *reserved = nullptr;
 };
 
 struct vpp_pp_session {
+    vpp_pp_session() : handle(), reserved() {}
+    vpp_pp_session(std::shared_ptr<EngineSession> h, void *r = nullptr) :
+        handle(h), reserved(r) {}
     std::shared_ptr<EngineSession> handle;
     void *reserved = nullptr;
 };
diff --git a/modules/gapi/src/streaming/onevpl/engine/processing_engine_base.hpp b/modules/gapi/src/streaming/onevpl/engine/processing_engine_base.hpp
index cacc8bd748..54c6f4e2c4 100644
--- a/modules/gapi/src/streaming/onevpl/engine/processing_engine_base.hpp
+++ b/modules/gapi/src/streaming/onevpl/engine/processing_engine_base.hpp
@@ -9,6 +9,7 @@
 
 #include <queue>
 #include <opencv2/gapi/streaming/onevpl/cfg_params.hpp>
+#include <opencv2/gapi/garg.hpp>
 #include "streaming/onevpl/engine/engine_session.hpp"
 #include "opencv2/gapi/own/exports.hpp" // GAPI_EXPORTS
 
diff --git a/modules/gapi/src/streaming/onevpl/engine/transcode/transcode_engine_legacy.cpp b/modules/gapi/src/streaming/onevpl/engine/transcode/transcode_engine_legacy.cpp
index 6226061925..23703bf172 100644
--- a/modules/gapi/src/streaming/onevpl/engine/transcode/transcode_engine_legacy.cpp
+++ b/modules/gapi/src/streaming/onevpl/engine/transcode/transcode_engine_legacy.cpp
@@ -269,7 +269,8 @@ VPLLegacyTranscodeEngine::initialize_session(mfxSession mfx_session,
     const auto& mfxDecParams = decode_params.decoder_params.param;
 
     // NB: create transcode params: Out = In by default, In = initially decoded
-    mfxVideoParam mfxVPPParams{0};
+    mfxVideoParam mfxVPPParams{};
+    memset(&mfxVPPParams, 0, sizeof(mfxVPPParams));
     mfxVPPParams.vpp.In = mfxDecParams.mfx.FrameInfo;
     mfxVPPParams.vpp.Out = mfxVPPParams.vpp.In;
 
@@ -366,6 +367,8 @@ VPLLegacyTranscodeEngine::initialize_session(mfxSession mfx_session,
     VPLAccelerationPolicy::pool_key_t vpp_out_pool_key =
                 acceleration_policy->create_surface_pool(vppRequests[1], mfxVPPParams.vpp.Out);
 
+    GAPI_LOG_INFO(nullptr, "Initialize VPP for session: " << mfx_session <<
+                           ", out frame info: " << mfx_frame_info_to_string(mfxVPPParams.vpp.Out));
     sts = MFXVideoVPP_Init(mfx_session, &mfxVPPParams);
     if (MFX_ERR_NONE != sts) {
         GAPI_LOG_WARNING(nullptr, "cannot Init VPP");
diff --git a/modules/gapi/src/streaming/onevpl/engine/transcode/transcode_session.hpp b/modules/gapi/src/streaming/onevpl/engine/transcode/transcode_session.hpp
index ad9ed8e60b..03df163750 100644
--- a/modules/gapi/src/streaming/onevpl/engine/transcode/transcode_session.hpp
+++ b/modules/gapi/src/streaming/onevpl/engine/transcode/transcode_session.hpp
@@ -15,7 +15,7 @@ namespace gapi {
 namespace wip {
 namespace onevpl {
 class Surface;
-
+class VPLLegacyTranscodeEngine;
 class GAPI_EXPORTS LegacyTranscodeSession : public LegacyDecodeSession {
 public:
     friend class VPLLegacyTranscodeEngine;
diff --git a/modules/gapi/src/streaming/onevpl/onevpl_export.hpp b/modules/gapi/src/streaming/onevpl/onevpl_export.hpp
index e93a30d869..5e2dc04478 100644
--- a/modules/gapi/src/streaming/onevpl/onevpl_export.hpp
+++ b/modules/gapi/src/streaming/onevpl/onevpl_export.hpp
@@ -10,9 +10,11 @@
 #endif // defined(_MSC_VER)
 
 #ifdef HAVE_ONEVPL
+#if defined(MFX_VERSION)
 #if (MFX_VERSION >= 2000)
 #include <vpl/mfxdispatcher.h>
 #endif // MFX_VERSION
+#endif // defined(MFX_VERSION)
 
 #include <vpl/mfx.h>
 #include <vpl/mfxvideo.h>
diff --git a/modules/gapi/src/streaming/onevpl/source_priv.cpp b/modules/gapi/src/streaming/onevpl/source_priv.cpp
index 7afe1cde56..765bdd3b64 100644
--- a/modules/gapi/src/streaming/onevpl/source_priv.cpp
+++ b/modules/gapi/src/streaming/onevpl/source_priv.cpp
@@ -11,6 +11,7 @@
 #include "streaming/onevpl/engine/transcode/transcode_engine_legacy.hpp"
 #include "streaming/onevpl/accelerators/accel_policy_dx11.hpp"
 #include "streaming/onevpl/accelerators/accel_policy_cpu.hpp"
+#include "streaming/onevpl/accelerators/accel_policy_va_api.hpp"
 #include "streaming/onevpl/utils.hpp"
 #include "streaming/onevpl/cfg_params_parser.hpp"
 #include "streaming/onevpl/data_provider_defines.hpp"
@@ -294,6 +295,12 @@ std::unique_ptr<VPLAccelerationPolicy> GSource::Priv::initializeHWAccel(std::sha
             ret = std::move(cand);
             break;
         }
+        case MFX_ACCEL_MODE_VIA_VAAPI:
+        {
+            std::unique_ptr<VPLVAAPIAccelerationPolicy> cand(new VPLVAAPIAccelerationPolicy(selector));
+            ret = std::move(cand);
+            break;
+        }
         case MFX_ACCEL_MODE_NA:
         {
             std::unique_ptr<VPLCPUAccelerationPolicy> cand(new VPLCPUAccelerationPolicy(selector));
diff --git a/modules/gapi/src/streaming/onevpl/utils.cpp b/modules/gapi/src/streaming/onevpl/utils.cpp
index 549ff597cf..efd1618b71 100644
--- a/modules/gapi/src/streaming/onevpl/utils.cpp
+++ b/modules/gapi/src/streaming/onevpl/utils.cpp
@@ -357,7 +357,7 @@ std::string mfxstatus_to_string(mfxStatus err) {
     }
 
     std::string ret("<unknown ");
-    ret += std::to_string(err) + ">";
+    ret += std::to_string(static_cast<size_t>(err)) + ">";
     return ret;
 }
 
@@ -384,7 +384,7 @@ std::string mfx_frame_info_to_string(const mfxFrameInfo &info) {
     return ss.str();
 }
 
-int compare(const mfxFrameInfo &lhs, const mfxFrameInfo &rhs) {
+static int compare(const mfxFrameInfo &lhs, const mfxFrameInfo &rhs) {
     //NB: mfxFrameInfo is a `packed` struct declared in VPL
     return memcmp(&lhs, &rhs, sizeof(mfxFrameInfo));
 }
diff --git a/modules/gapi/test/streaming/gapi_streaming_tests.cpp b/modules/gapi/test/streaming/gapi_streaming_tests.cpp
index ffa1d452c1..5cac4df845 100644
--- a/modules/gapi/test/streaming/gapi_streaming_tests.cpp
+++ b/modules/gapi/test/streaming/gapi_streaming_tests.cpp
@@ -2440,7 +2440,11 @@ TEST(OneVPL_Source, Init)
 
     std::vector<CfgParam> src_params;
     src_params.push_back(CfgParam::create_implementation(MFX_IMPL_TYPE_HARDWARE));
+#ifdef __WIN32
     src_params.push_back(CfgParam::create_acceleration_mode(MFX_ACCEL_MODE_VIA_D3D11));
+#elif defined(__linux__)
+    src_params.push_back(CfgParam::create_acceleration_mode(MFX_ACCEL_MODE_VIA_VAAPI));
+#endif
     src_params.push_back(CfgParam::create_decoder_id(MFX_CODEC_HEVC));
     std::stringstream stream(std::ios_base::in | std::ios_base::out | std::ios_base::binary);
 
diff --git a/modules/gapi/test/streaming/gapi_streaming_vpl_core_test.cpp b/modules/gapi/test/streaming/gapi_streaming_vpl_core_test.cpp
index 59723dbd5e..d83b09d7d3 100644
--- a/modules/gapi/test/streaming/gapi_streaming_vpl_core_test.cpp
+++ b/modules/gapi/test/streaming/gapi_streaming_vpl_core_test.cpp
@@ -36,6 +36,7 @@
 #include "streaming/onevpl/accelerators/surface/cpu_frame_adapter.hpp"
 #include "streaming/onevpl/accelerators/accel_policy_cpu.hpp"
 #include "streaming/onevpl/accelerators/accel_policy_dx11.hpp"
+#include "streaming/onevpl/accelerators/accel_policy_va_api.hpp"
 #include "streaming/onevpl/accelerators/dx11_alloc_resource.hpp"
 #include "streaming/onevpl/accelerators/utils/shared_lock.hpp"
 #define private public
@@ -79,7 +80,7 @@ struct TestProcessingSession : public cv::gapi::wip::onevpl::EngineSession {
 
 struct TestProcessingEngine: public cv::gapi::wip::onevpl::ProcessingEngineBase {
 
-    size_t pipeline_stage_num = 0;
+    int pipeline_stage_num = 0;
 
     TestProcessingEngine(std::unique_ptr<cv::gapi::wip::onevpl::VPLAccelerationPolicy>&& accel) :
         cv::gapi::wip::onevpl::ProcessingEngineBase(std::move(accel)) {
@@ -154,7 +155,7 @@ private:
     mfxFrameAllocator m_allocator;
 };
 template <class LockProcessor, class UnlockProcessor>
-std::map<mfxMemId, UnlockProcessor> TestLockableAllocator<LockProcessor, UnlockProcessor>::lock_processor_table {};
+std::map<mfxMemId, LockProcessor> TestLockableAllocator<LockProcessor, UnlockProcessor>::lock_processor_table {};
 
 template <class LockProcessor, class UnlockProcessor>
 std::map<mfxMemId, UnlockProcessor> TestLockableAllocator<LockProcessor, UnlockProcessor>::unlock_processor_table {};
@@ -194,11 +195,11 @@ TEST(OneVPL_Source_Surface, InitSurface)
     // check self consistency
     EXPECT_EQ(reinterpret_cast<void*>(surf->get_handle()),
               reinterpret_cast<void*>(mfx_core_handle));
-    EXPECT_EQ(0, surf->get_locks_count());
-    EXPECT_EQ(0, surf->obtain_lock());
-    EXPECT_EQ(1, surf->get_locks_count());
-    EXPECT_EQ(1, surf->release_lock());
-    EXPECT_EQ(0, surf->get_locks_count());
+    EXPECT_TRUE(0 == surf->get_locks_count());
+    EXPECT_TRUE(0 == surf->obtain_lock());
+    EXPECT_TRUE(1 == surf->get_locks_count());
+    EXPECT_TRUE(1 == surf->release_lock());
+    EXPECT_TRUE(0 == surf->get_locks_count());
 }
 
 TEST(OneVPL_Source_Surface, ConcurrentLock)
@@ -213,7 +214,7 @@ TEST(OneVPL_Source_Surface, ConcurrentLock)
     auto surf = Surface::create_surface(std::move(handle), associated_memory);
 
     // check self consistency
-    EXPECT_EQ(0, surf->get_locks_count());
+    EXPECT_TRUE(0 == surf->get_locks_count());
 
     // MFX internal limitation: do not exceede U16 range
     // so I16 is using here
@@ -238,7 +239,7 @@ TEST(OneVPL_Source_Surface, ConcurrentLock)
     }
 
     worker_thread.join();
-    EXPECT_EQ(lock_counter * 2, surf->get_locks_count());
+    EXPECT_TRUE(static_cast<size_t>(lock_counter * 2) == surf->get_locks_count());
 }
 
 TEST(OneVPL_Source_Surface, MemoryLifeTime)
@@ -271,7 +272,7 @@ TEST(OneVPL_Source_Surface, MemoryLifeTime)
     }
 
     // workspace memory must be alive
-    EXPECT_EQ(0, surfaces.size());
+    EXPECT_TRUE(0 == surfaces.size());
     EXPECT_TRUE(associated_memory != nullptr);
     EXPECT_TRUE(preallocated_memory_ptr.get() != nullptr);
 
@@ -293,7 +294,7 @@ TEST(OneVPL_Source_Surface, MemoryLifeTime)
     associated_memory.reset();
 
     // workspace memory must be still alive
-    EXPECT_EQ(0, surfaces.size());
+    EXPECT_TRUE(0 == surfaces.size());
     EXPECT_TRUE(associated_memory == nullptr);
     EXPECT_TRUE(preallocated_memory_ptr.get() != nullptr);
 
@@ -316,14 +317,14 @@ TEST(OneVPL_Source_CPU_FrameAdapter, InitFrameAdapter)
     auto surf = Surface::create_surface(std::move(handle), associated_memory);
 
     // check consistency
-    EXPECT_EQ(0, surf->get_locks_count());
+    EXPECT_TRUE(0 == surf->get_locks_count());
 
     {
         mfxSession stub_session = reinterpret_cast<mfxSession>(0x1);
         VPLMediaFrameCPUAdapter adapter(surf, stub_session);
-        EXPECT_EQ(1, surf->get_locks_count());
+        EXPECT_TRUE(1 == surf->get_locks_count());
     }
-    EXPECT_EQ(0, surf->get_locks_count());
+    EXPECT_TRUE(0 == surf->get_locks_count());
 }
 
 TEST(OneVPL_Source_CPU_Accelerator, InitDestroy)
@@ -385,13 +386,13 @@ TEST(OneVPL_Source_CPU_Accelerator, PoolProduceConsume)
     for (size_t i = 0; i < surface_count; i++) {
         std::shared_ptr<Surface> surf = acceleration_policy->get_free_surface(key).lock();
         EXPECT_TRUE(surf.get() != nullptr);
-        EXPECT_EQ(0, surf->obtain_lock());
+        EXPECT_TRUE(0 == surf->obtain_lock());
         surfaces.push_back(std::move(surf));
     }
 
     // check consistency (no free surfaces)
     EXPECT_EQ(acceleration_policy->get_surface_count(key), surface_count);
-    EXPECT_EQ(0, acceleration_policy->get_free_surface_count(key));
+    EXPECT_TRUE(0 == acceleration_policy->get_free_surface_count(key));
 
     // fail consume non-free surfaces
     for (size_t i = 0; i < surface_count; i++) {
@@ -400,7 +401,7 @@ TEST(OneVPL_Source_CPU_Accelerator, PoolProduceConsume)
 
     // release surfaces
     for (auto& surf : surfaces) {
-        EXPECT_EQ(1, surf->release_lock());
+        EXPECT_TRUE(1 == surf->release_lock());
     }
     surfaces.clear();
 
@@ -412,7 +413,7 @@ TEST(OneVPL_Source_CPU_Accelerator, PoolProduceConsume)
     for (size_t i = 0; i < surface_count; i++) {
         std::shared_ptr<Surface> surf = acceleration_policy->get_free_surface(key).lock();
         EXPECT_TRUE(surf.get() != nullptr);
-        EXPECT_EQ(0, surf->obtain_lock());
+        EXPECT_TRUE(0 == surf->obtain_lock());
     }
 }
 
@@ -444,7 +445,7 @@ TEST(OneVPL_Source_CPU_Accelerator, PoolProduceConcurrentConsume)
     for (size_t i = 0; i < surface_count; i++) {
         std::shared_ptr<Surface> surf = acceleration_policy->get_free_surface(key).lock();
         EXPECT_TRUE(surf.get() != nullptr);
-        EXPECT_EQ(0, surf->obtain_lock());
+        EXPECT_TRUE(0 == surf->obtain_lock());
         surfaces.push_back(std::move(surf));
     }
 
@@ -458,7 +459,7 @@ TEST(OneVPL_Source_CPU_Accelerator, PoolProduceConcurrentConsume)
         // concurrent release surfaces
         size_t surfaces_count = surfaces.size();
         for (auto& surf : surfaces) {
-            EXPECT_EQ(1, surf->release_lock());
+            EXPECT_TRUE(1 == surf->release_lock());
             std::this_thread::sleep_for(std::chrono::seconds(1));
         }
         surfaces.clear();
@@ -492,7 +493,7 @@ TEST(OneVPL_Source_ProcessingEngine, Init)
     mfxSession mfx_session{};
     engine.initialize_session(mfx_session, {}, std::shared_ptr<IDataProvider>{});
 
-    EXPECT_EQ(0, engine.get_ready_frames_count());
+    EXPECT_TRUE(0 == engine.get_ready_frames_count());
     ProcessingEngineBase::ExecutionStatus ret = engine.process(mfx_session);
     EXPECT_EQ(ret, ProcessingEngineBase::ExecutionStatus::Continue);
     EXPECT_EQ(0, engine.pipeline_stage_num);
@@ -508,12 +509,12 @@ TEST(OneVPL_Source_ProcessingEngine, Init)
     ret = engine.process(mfx_session);
     EXPECT_EQ(ret, ProcessingEngineBase::ExecutionStatus::Processed);
     EXPECT_EQ(3, engine.pipeline_stage_num);
-    EXPECT_EQ(1, engine.get_ready_frames_count());
+    EXPECT_TRUE(1 == engine.get_ready_frames_count());
 
     ret = engine.process(mfx_session);
     EXPECT_EQ(ret, ProcessingEngineBase::ExecutionStatus::SessionNotFound);
     EXPECT_EQ(3, engine.pipeline_stage_num);
-    EXPECT_EQ(1, engine.get_ready_frames_count());
+    EXPECT_TRUE(1 == engine.get_ready_frames_count());
 
     cv::gapi::wip::Data frame;
     engine.get_frame(frame);
@@ -603,7 +604,98 @@ TEST(OneVPL_Source_DX11_Accel, Init)
     MFXClose(mfx_session);
     MFXUnload(test_mfx_handle);
 }
+#endif // HAVE_DIRECTX
+#endif // HAVE_D3D11
+
+#ifdef __linux__
+#if defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+TEST(OneVPL_Source_VAAPI_Accel, Init)
+{
+    using namespace cv::gapi::wip::onevpl;
+
+    std::vector<CfgParam> cfg_params_w_vaapi;
+    cfg_params_w_vaapi.push_back(CfgParam::create_acceleration_mode(MFX_ACCEL_MODE_VIA_VAAPI));
+    VPLVAAPIAccelerationPolicy accel(std::make_shared<CfgParamDeviceSelector>(cfg_params_w_vaapi));
+
+    mfxLoader test_mfx_handle = MFXLoad();
+
+    mfxConfig cfg_inst_0 = MFXCreateConfig(test_mfx_handle);
+    EXPECT_TRUE(cfg_inst_0);
+    mfxVariant mfx_param_0;
+    mfx_param_0.Type = MFX_VARIANT_TYPE_U32;
+    mfx_param_0.Data.U32 = MFX_IMPL_TYPE_HARDWARE;
+    EXPECT_EQ(MFXSetConfigFilterProperty(cfg_inst_0,(mfxU8 *)CfgParam::implementation_name(),
+                                                    mfx_param_0), MFX_ERR_NONE);
 
+    mfxConfig cfg_inst_1 = MFXCreateConfig(test_mfx_handle);
+    EXPECT_TRUE(cfg_inst_1);
+    mfxVariant mfx_param_1;
+    mfx_param_1.Type = MFX_VARIANT_TYPE_U32;
+    mfx_param_1.Data.U32 = MFX_ACCEL_MODE_VIA_VAAPI;
+    EXPECT_EQ(MFXSetConfigFilterProperty(cfg_inst_1,(mfxU8 *)CfgParam::acceleration_mode_name(),
+                                                    mfx_param_1), MFX_ERR_NONE);
+
+    mfxConfig cfg_inst_2 = MFXCreateConfig(test_mfx_handle);
+    EXPECT_TRUE(cfg_inst_2);
+    mfxVariant mfx_param_2;
+    mfx_param_2.Type = MFX_VARIANT_TYPE_U32;
+    mfx_param_2.Data.U32 = MFX_CODEC_HEVC;
+    EXPECT_EQ(MFXSetConfigFilterProperty(cfg_inst_2,(mfxU8 *)CfgParam::decoder_id_name(),
+                                                    mfx_param_2), MFX_ERR_NONE);
+
+    // create session
+    mfxSession mfx_session{};
+    mfxStatus sts = MFXCreateSession(test_mfx_handle, 0, &mfx_session);
+    EXPECT_EQ(MFX_ERR_NONE, sts);
+
+    // assign acceleration
+    EXPECT_NO_THROW(accel.init(mfx_session));
+
+    // create proper bitstream
+    mfxBitstream bitstream{};
+    const int BITSTREAM_BUFFER_SIZE = 2000000;
+    bitstream.MaxLength = BITSTREAM_BUFFER_SIZE;
+    bitstream.Data = (mfxU8 *)calloc(bitstream.MaxLength, sizeof(mfxU8));
+    EXPECT_TRUE(bitstream.Data);
+
+    // simulate read stream
+    bitstream.DataOffset = 0;
+    bitstream.DataLength = sizeof(streaming::onevpl::hevc_header) * sizeof(streaming::onevpl::hevc_header[0]);
+    memcpy(bitstream.Data, streaming::onevpl::hevc_header, bitstream.DataLength);
+    bitstream.CodecId = MFX_CODEC_HEVC;
+
+    // prepare dec params
+    mfxVideoParam mfxDecParams {};
+    mfxDecParams.mfx.CodecId = bitstream.CodecId;
+    mfxDecParams.IOPattern = MFX_IOPATTERN_OUT_SYSTEM_MEMORY;
+    sts = MFXVideoDECODE_DecodeHeader(mfx_session, &bitstream, &mfxDecParams);
+    EXPECT_EQ(MFX_ERR_NONE, sts);
+
+    mfxFrameAllocRequest request{};
+    memset(&request, 0, sizeof(request));
+    sts = MFXVideoDECODE_QueryIOSurf(mfx_session, &mfxDecParams, &request);
+    EXPECT_EQ(MFX_ERR_NONE, sts);
+
+    // Allocate surfaces for decoder
+    VPLAccelerationPolicy::pool_key_t key = accel.create_surface_pool(request,
+                                                                      mfxDecParams.mfx.FrameInfo);
+    auto cand_surface = accel.get_free_surface(key).lock();
+
+    sts = MFXVideoDECODE_Init(mfx_session, &mfxDecParams);
+    EXPECT_EQ(MFX_ERR_NONE, sts);
+
+    MFXVideoDECODE_Close(mfx_session);
+    EXPECT_EQ(MFX_ERR_NONE, sts);
+
+    EXPECT_NO_THROW(accel.deinit(mfx_session));
+    MFXClose(mfx_session);
+    MFXUnload(test_mfx_handle);
+}
+#endif // defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+#endif // __linux__
+
+#ifdef HAVE_DIRECTX
+#ifdef HAVE_D3D11
 TEST(OneVPL_Source_DX11_Accel_VPL, Init)
 {
     using namespace cv::gapi::wip::onevpl;
@@ -1024,8 +1116,6 @@ TEST(OneVPL_Source_DX11_Accel_VPL, preproc)
         }
     } while(frame_num < min_available_frames_count);
 }
-#endif // HAVE_DIRECTX
-#endif // HAVE_D3D11
 
 TEST(OneVPL_Source_DX11_FrameLockable, LockUnlock_without_Adaptee)
 {
@@ -1114,6 +1204,8 @@ TEST(OneVPL_Source_DX11_FrameLockable, LockUnlock_with_Adaptee)
     EXPECT_EQ(w_lock_counter, exec_count);
     EXPECT_EQ(w_unlock_counter, exec_count);
 }
+#endif // HAVE_DIRECTX
+#endif // HAVE_D3D11
 }
 } // namespace opencv_test
 #endif // HAVE_ONEVPL
diff --git a/modules/gapi/test/streaming/gapi_streaming_vpl_data_provider.cpp b/modules/gapi/test/streaming/gapi_streaming_vpl_data_provider.cpp
index ebafb79695..a76d6537f9 100644
--- a/modules/gapi/test/streaming/gapi_streaming_vpl_data_provider.cpp
+++ b/modules/gapi/test/streaming/gapi_streaming_vpl_data_provider.cpp
@@ -16,6 +16,7 @@
 #include "streaming/onevpl/demux/async_mfp_demux_data_provider.hpp"
 #include "streaming/onevpl/source_priv.hpp"
 
+#ifdef _WIN32
 namespace opencv_test
 {
 namespace
@@ -299,4 +300,5 @@ TEST(OneVPL_Source_MFPAsyncDemux, produce_consume) {
 }
 } // namespace opencv_test
 
+#endif // _WIN32
 #endif // HAVE_ONEVPL
diff --git a/modules/gapi/test/streaming/gapi_streaming_vpl_device_selector.cpp b/modules/gapi/test/streaming/gapi_streaming_vpl_device_selector.cpp
index d484dcec75..ee1be9f433 100644
--- a/modules/gapi/test/streaming/gapi_streaming_vpl_device_selector.cpp
+++ b/modules/gapi/test/streaming/gapi_streaming_vpl_device_selector.cpp
@@ -69,11 +69,11 @@ TEST(OneVPL_Source_Device_Selector_CfgParam, DefaultDevice)
     using namespace cv::gapi::wip::onevpl;
     CfgParamDeviceSelector selector;
     IDeviceSelector::DeviceScoreTable devs = selector.select_devices();
-    EXPECT_EQ(devs.size(), 1);
+    EXPECT_TRUE(devs.size() == 1);
     test_host_dev_eq(*devs.begin(), IDeviceSelector::Score::MaxActivePriority);
 
     IDeviceSelector::DeviceContexts ctxs = selector.select_context();
-    EXPECT_EQ(ctxs.size(), 1);
+    EXPECT_TRUE(ctxs.size() == 1);
     test_host_ctx_eq(*ctxs.begin());
 }
 
@@ -83,10 +83,10 @@ TEST(OneVPL_Source_Device_Selector_CfgParam, DefaultDeviceWithEmptyCfgParam)
     std::vector<CfgParam> empty_params;
     CfgParamDeviceSelector selector(empty_params);
     IDeviceSelector::DeviceScoreTable devs = selector.select_devices();
-    EXPECT_EQ(devs.size(), 1);
+    EXPECT_TRUE(devs.size() == 1);
     test_host_dev_eq(*devs.begin(), IDeviceSelector::Score::MaxActivePriority);
     IDeviceSelector::DeviceContexts ctxs = selector.select_context();
-    EXPECT_EQ(ctxs.size(), 1);
+    EXPECT_TRUE(ctxs.size() == 1);
     test_host_ctx_eq(*ctxs.begin());
 }
 
@@ -97,11 +97,11 @@ TEST(OneVPL_Source_Device_Selector_CfgParam, DefaultDeviceWithAccelNACfgParam)
     cfg_params_w_no_accel.push_back(CfgParam::create_acceleration_mode(MFX_ACCEL_MODE_NA));
     CfgParamDeviceSelector selector(cfg_params_w_no_accel);
     IDeviceSelector::DeviceScoreTable devs = selector.select_devices();
-    EXPECT_EQ(devs.size(), 1);
+    EXPECT_TRUE(devs.size() == 1);
     test_host_dev_eq(*devs.begin(), IDeviceSelector::Score::MaxActivePriority);
 
     IDeviceSelector::DeviceContexts ctxs = selector.select_context();
-    EXPECT_EQ(ctxs.size(), 1);
+    EXPECT_TRUE(ctxs.size() == 1);
     test_host_ctx_eq(*ctxs.begin());
 }
 
@@ -113,11 +113,11 @@ TEST(OneVPL_Source_Device_Selector_CfgParam, DefaultDeviceWithEmptyCfgParam_DX11
     std::vector<CfgParam> empty_params;
     CfgParamDeviceSelector selector(empty_params);
     IDeviceSelector::DeviceScoreTable devs = selector.select_devices();
-    EXPECT_EQ(devs.size(), 1);
+    EXPECT_TRUE(devs.size() == 1);
     test_host_dev_eq(*devs.begin(), IDeviceSelector::Score::MaxActivePriority);
 
     IDeviceSelector::DeviceContexts ctxs = selector.select_context();
-    EXPECT_EQ(ctxs.size(), 1);
+    EXPECT_TRUE(ctxs.size() == 1);
     test_host_ctx_eq(*ctxs.begin());
 }
 
@@ -130,13 +130,13 @@ TEST(OneVPL_Source_Device_Selector_CfgParam, DefaultDeviceWithDX11AccelCfgParam_
     EXPECT_NO_THROW(selector_ptr.reset(new CfgParamDeviceSelector(cfg_params_w_dx11)));
     IDeviceSelector::DeviceScoreTable devs = selector_ptr->select_devices();
 
-    EXPECT_EQ(devs.size(), 1);
+    EXPECT_TRUE(devs.size() == 1);
     test_dev_eq(*devs.begin(), IDeviceSelector::Score::MaxActivePriority,
                 AccelType::DX11,
                 std::get<1>(*devs.begin()).get_ptr() /* compare just type */);
 
     IDeviceSelector::DeviceContexts ctxs = selector_ptr->select_context();
-    EXPECT_EQ(ctxs.size(), 1);
+    EXPECT_TRUE(ctxs.size() == 1);
     EXPECT_TRUE(ctxs.begin()->get_ptr());
 }
 
@@ -182,12 +182,12 @@ TEST(OneVPL_Source_Device_Selector_CfgParam, ExternalDeviceWithDX11AccelCfgParam
                                                                   cfg_params_w_dx11)));
     IDeviceSelector::DeviceScoreTable devs = selector_ptr->select_devices();
 
-    EXPECT_EQ(devs.size(), 1);
+    EXPECT_TRUE(devs.size() == 1);
     test_dev_eq(*devs.begin(), IDeviceSelector::Score::MaxActivePriority,
                 AccelType::DX11, device);
 
     IDeviceSelector::DeviceContexts ctxs = selector_ptr->select_context();
-    EXPECT_EQ(ctxs.size(), 1);
+    EXPECT_TRUE(ctxs.size() == 1);
     EXPECT_EQ(reinterpret_cast<ID3D11DeviceContext*>(ctxs.begin()->get_ptr()),
               device_context);
 }
@@ -201,7 +201,7 @@ TEST(OneVPL_Source_Device_Selector_CfgParam, DX11DeviceFromCfgParamWithDX11Disab
 {
     using namespace cv::gapi::wip::onevpl;
     std::vector<CfgParam> cfg_params_w_non_existed_dx11;
-    cfg_params_w_not_existed_dx11.push_back(CfgParam::create_acceleration_mode(MFX_ACCEL_MODE_VIA_D3D11));
+    cfg_params_w_non_existed_dx11.push_back(CfgParam::create_acceleration_mode(MFX_ACCEL_MODE_VIA_D3D11));
     EXPECT_THROW(CfgParamDeviceSelector{cfg_params_w_non_existed_dx11},
                  std::logic_error);
 }
diff --git a/modules/gapi/test/streaming/gapi_streaming_vpp_preproc_test.cpp b/modules/gapi/test/streaming/gapi_streaming_vpp_preproc_test.cpp
index f33d967c89..9da9221c8f 100644
--- a/modules/gapi/test/streaming/gapi_streaming_vpp_preproc_test.cpp
+++ b/modules/gapi/test/streaming/gapi_streaming_vpp_preproc_test.cpp
@@ -40,6 +40,7 @@
 #include "streaming/onevpl/accelerators/surface/dx11_frame_adapter.hpp"
 #include "streaming/onevpl/accelerators/accel_policy_cpu.hpp"
 #include "streaming/onevpl/accelerators/accel_policy_dx11.hpp"
+#include "streaming/onevpl/accelerators/accel_policy_va_api.hpp"
 #include "streaming/onevpl/accelerators/dx11_alloc_resource.hpp"
 #include "streaming/onevpl/accelerators/utils/shared_lock.hpp"
 #define private public
@@ -120,6 +121,28 @@ std::tuple<mfxLoader, mfxConfig> prepare_mfx(int mfx_codec, int mfx_accel_mode)
     return std::make_tuple(mfx, cfg_inst_3);
 }
 
+static std::unique_ptr<cv::gapi::wip::onevpl::VPLAccelerationPolicy>
+create_accel_policy_from_int(int accel,
+                             std::shared_ptr<cv::gapi::wip::onevpl::IDeviceSelector> selector) {
+    using namespace cv::gapi::wip::onevpl;
+    std::unique_ptr<VPLAccelerationPolicy> decode_accel_policy;
+    if (accel == MFX_ACCEL_MODE_VIA_D3D11) {
+        decode_accel_policy.reset (new VPLDX11AccelerationPolicy(selector));
+    } else if (accel == MFX_ACCEL_MODE_VIA_VAAPI) {
+        decode_accel_policy.reset (new VPLVAAPIAccelerationPolicy(selector));
+    }
+    EXPECT_TRUE(decode_accel_policy.get());
+    return decode_accel_policy;
+}
+
+static std::unique_ptr<cv::gapi::wip::onevpl::VPLAccelerationPolicy>
+create_accel_policy_from_int(int &accel,
+                             std::vector<cv::gapi::wip::onevpl::CfgParam> &out_cfg_params) {
+    using namespace cv::gapi::wip::onevpl;
+    out_cfg_params.push_back(CfgParam::create_acceleration_mode(accel));
+    return create_accel_policy_from_int(accel, std::make_shared<CfgParamDeviceSelector>(out_cfg_params));
+}
+
 class SafeQueue {
 public:
     void push(cv::MediaFrame&& f) {
@@ -186,26 +209,32 @@ static cv::util::optional<cv::Rect> empty_roi;
 
 class VPPPreprocParams : public ::testing::TestWithParam<preproc_args_t> {};
 
+#if defined(HAVE_DIRECTX) && defined(HAVE_D3D11)
+    #define UT_ACCEL_TYPE MFX_ACCEL_MODE_VIA_D3D11
+#elif __linux__
+    #define UT_ACCEL_TYPE MFX_ACCEL_MODE_VIA_VAAPI
+#else
+    #define UT_ACCEL_TYPE -1
+#endif
+
 preproc_args_t files[] = {
     preproc_args_t {"highgui/video/big_buck_bunny.h264",
-                    MFX_CODEC_AVC,     MFX_ACCEL_MODE_VIA_D3D11,
+                    MFX_CODEC_AVC,     UT_ACCEL_TYPE,
                     cv::GFrameDesc {cv::MediaFormat::NV12, {1920, 1080}}},
     preproc_args_t {"highgui/video/big_buck_bunny.h265",
-                    MFX_CODEC_HEVC,     MFX_ACCEL_MODE_VIA_D3D11,
+                    MFX_CODEC_HEVC,     UT_ACCEL_TYPE,
                     cv::GFrameDesc {cv::MediaFormat::NV12, {1920, 1280}}}
 };
 
-#ifdef HAVE_DIRECTX
-#ifdef HAVE_D3D11
-TEST(OneVPL_Source_PreprocEngine, functional_single_thread)
+class OneVPL_PreproEngineTest : public ::testing::TestWithParam<acceleration_t> {};
+TEST_P(OneVPL_PreproEngineTest, functional_single_thread)
 {
     using namespace cv::gapi::wip::onevpl;
     using namespace cv::gapi::wip;
 
-    std::vector<CfgParam> cfg_params_w_dx11;
-    cfg_params_w_dx11.push_back(CfgParam::create_acceleration_mode(MFX_ACCEL_MODE_VIA_D3D11));
-    std::unique_ptr<VPLAccelerationPolicy> decode_accel_policy (
-                    new VPLDX11AccelerationPolicy(std::make_shared<CfgParamDeviceSelector>(cfg_params_w_dx11)));
+    int accel_type = GetParam();
+    std::vector<CfgParam> cfg_params_w_accel;
+    std::unique_ptr<VPLAccelerationPolicy> decode_accel_policy = create_accel_policy_from_int(accel_type, cfg_params_w_accel);
 
     // create file data provider
     std::string file_path = findDataFile("highgui/video/big_buck_bunny.h265");
@@ -214,7 +243,7 @@ TEST(OneVPL_Source_PreprocEngine, functional_single_thread)
 
     mfxLoader mfx{};
     mfxConfig mfx_cfg{};
-    std::tie(mfx, mfx_cfg) = prepare_mfx(MFX_CODEC_HEVC, MFX_ACCEL_MODE_VIA_D3D11);
+    std::tie(mfx, mfx_cfg) = prepare_mfx(MFX_CODEC_HEVC, accel_type);
 
     // create decode session
     mfxSession mfx_decode_session{};
@@ -225,7 +254,7 @@ TEST(OneVPL_Source_PreprocEngine, functional_single_thread)
     auto device_selector = decode_accel_policy->get_device_selector();
     VPLLegacyDecodeEngine decode_engine(std::move(decode_accel_policy));
     auto sess_ptr = decode_engine.initialize_session(mfx_decode_session,
-                                                     cfg_params_w_dx11,
+                                                     cfg_params_w_accel,
                                                      data_provider);
 
     // simulate net info
@@ -233,8 +262,7 @@ TEST(OneVPL_Source_PreprocEngine, functional_single_thread)
                                          {1920, 1080}};
 
     // create VPP preproc engine
-    VPPPreprocEngine preproc_engine(std::unique_ptr<VPLAccelerationPolicy>{
-                                                new VPLDX11AccelerationPolicy(device_selector)});
+    VPPPreprocEngine preproc_engine(create_accel_policy_from_int(accel_type, device_selector));
 
     // launch pipeline
     // 1) decode frame
@@ -261,7 +289,7 @@ TEST(OneVPL_Source_PreprocEngine, functional_single_thread)
 
     // make test in loop
     bool in_progress = false;
-    size_t frames_processed_count = 1;
+    int frames_processed_count = 1;
     const auto &first_pp_param_value_impl =
         cv::util::get<cv::gapi::wip::onevpl::vpp_pp_params>(first_pp_params.value().value);
     try {
@@ -298,9 +326,12 @@ TEST(OneVPL_Source_PreprocEngine, functional_single_thread)
     ASSERT_NE(frames_processed_count, 1);
 }
 
-void decode_function(cv::gapi::wip::onevpl::VPLLegacyDecodeEngine &decode_engine,
-                     cv::gapi::wip::onevpl::ProcessingEngineBase::session_ptr sess_ptr,
-                     SafeQueue &queue, size_t &decoded_number) {
+INSTANTIATE_TEST_CASE_P(OneVPL_Source_PreprocEngine, OneVPL_PreproEngineTest,
+                        testing::Values(UT_ACCEL_TYPE));
+
+static void decode_function(cv::gapi::wip::onevpl::VPLLegacyDecodeEngine &decode_engine,
+                            cv::gapi::wip::onevpl::ProcessingEngineBase::session_ptr sess_ptr,
+                            SafeQueue &queue, int &decoded_number) {
     // decode first frame
     {
         cv::MediaFrame decoded_frame;
@@ -320,9 +351,9 @@ void decode_function(cv::gapi::wip::onevpl::VPLLegacyDecodeEngine &decode_engine
     queue.push_stop();
 }
 
-void preproc_function(cv::gapi::wip::IPreprocEngine &preproc_engine, SafeQueue&queue,
-                      size_t &preproc_number, const out_frame_info_t &required_frame_param,
-                      const cv::util::optional<cv::Rect> &roi_rect = {}) {
+static void preproc_function(cv::gapi::wip::IPreprocEngine &preproc_engine, SafeQueue&queue,
+                             int &preproc_number, const out_frame_info_t &required_frame_param,
+                             const cv::util::optional<cv::Rect> &roi_rect = {}) {
     using namespace cv::gapi::wip;
     using namespace cv::gapi::wip::onevpl;
     // create preproc session based on frame description & network info
@@ -385,10 +416,11 @@ void preproc_function(cv::gapi::wip::IPreprocEngine &preproc_engine, SafeQueue&q
     ASSERT_NE(preproc_number, 1);
 }
 
-void multi_source_preproc_function(size_t source_num,
-                                   cv::gapi::wip::IPreprocEngine &preproc_engine, SafeQueue&queue,
-                                   size_t &preproc_number, const out_frame_info_t &required_frame_param,
-                                   const cv::util::optional<cv::Rect> &roi_rect = {}) {
+#ifdef __WIN32__
+static void multi_source_preproc_function(size_t source_num,
+                                          cv::gapi::wip::IPreprocEngine &preproc_engine, SafeQueue&queue,
+                                          int &preproc_number, const out_frame_info_t &required_frame_param,
+                                          const cv::util::optional<cv::Rect> &roi_rect = {}) {
     using namespace cv::gapi::wip;
     using namespace cv::gapi::wip::onevpl;
     // create preproc session based on frame description & network info
@@ -450,6 +482,8 @@ void multi_source_preproc_function(size_t source_num,
     ASSERT_FALSE(in_progress);
     ASSERT_NE(preproc_number, 1);
 }
+#endif // __WIN32__
+
 using roi_t = cv::util::optional<cv::Rect>;
 using preproc_roi_args_t = decltype(std::tuple_cat(std::declval<preproc_args_t>(),
                                                    std::declval<std::tuple<roi_t>>()));
@@ -467,10 +501,8 @@ TEST_P(VPPPreprocROIParams, functional_roi_different_threads)
 
     file_path = findDataFile(file_path);
 
-    std::vector<CfgParam> cfg_params_w_dx11;
-    cfg_params_w_dx11.push_back(CfgParam::create_acceleration_mode(accel));
-    std::unique_ptr<VPLAccelerationPolicy> decode_accel_policy (
-                    new VPLDX11AccelerationPolicy(std::make_shared<CfgParamDeviceSelector>(cfg_params_w_dx11)));
+    std::vector<CfgParam> cfg_params_w_accel;
+    std::unique_ptr<VPLAccelerationPolicy> decode_accel_policy = create_accel_policy_from_int(accel, cfg_params_w_accel);
 
     // create file data provider
     std::shared_ptr<IDataProvider> data_provider(new FileDataProvider(file_path,
@@ -489,17 +521,16 @@ TEST_P(VPPPreprocROIParams, functional_roi_different_threads)
     auto device_selector = decode_accel_policy->get_device_selector();
     VPLLegacyDecodeEngine decode_engine(std::move(decode_accel_policy));
     auto sess_ptr = decode_engine.initialize_session(mfx_decode_session,
-                                                     cfg_params_w_dx11,
+                                                     cfg_params_w_accel,
                                                      data_provider);
 
     // create VPP preproc engine
-    VPPPreprocEngine preproc_engine(std::unique_ptr<VPLAccelerationPolicy>{
-                                                new VPLDX11AccelerationPolicy(device_selector)});
+    VPPPreprocEngine preproc_engine(create_accel_policy_from_int(accel, device_selector));
 
     // launch threads
     SafeQueue queue;
-    size_t decoded_number = 1;
-    size_t preproc_number = 0;
+    int decoded_number = 1;
+    int preproc_number = 0;
 
     std::thread decode_thread(decode_function, std::ref(decode_engine), sess_ptr,
                               std::ref(queue), std::ref(decoded_number));
@@ -515,31 +546,31 @@ TEST_P(VPPPreprocROIParams, functional_roi_different_threads)
 
 preproc_roi_args_t files_w_roi[] = {
     preproc_roi_args_t {"highgui/video/big_buck_bunny.h264",
-                    MFX_CODEC_AVC,     MFX_ACCEL_MODE_VIA_D3D11,
+                    MFX_CODEC_AVC,     UT_ACCEL_TYPE,
                     out_frame_info_t{cv::GFrameDesc {cv::MediaFormat::NV12, {1920, 1080}}},
                     roi_t{cv::Rect{0,0,50,50}}},
     preproc_roi_args_t {"highgui/video/big_buck_bunny.h264",
-                    MFX_CODEC_AVC,     MFX_ACCEL_MODE_VIA_D3D11,
+                    MFX_CODEC_AVC,     UT_ACCEL_TYPE,
                     out_frame_info_t{cv::GFrameDesc {cv::MediaFormat::NV12, {1920, 1080}}},
                     roi_t{}},
     preproc_roi_args_t {"highgui/video/big_buck_bunny.h264",
-                    MFX_CODEC_AVC,     MFX_ACCEL_MODE_VIA_D3D11,
+                    MFX_CODEC_AVC,     UT_ACCEL_TYPE,
                     out_frame_info_t{cv::GFrameDesc {cv::MediaFormat::NV12, {1920, 1080}}},
                     roi_t{cv::Rect{0,0,100,100}}},
     preproc_roi_args_t {"highgui/video/big_buck_bunny.h264",
-                    MFX_CODEC_AVC,     MFX_ACCEL_MODE_VIA_D3D11,
+                    MFX_CODEC_AVC,     UT_ACCEL_TYPE,
                     out_frame_info_t{cv::GFrameDesc {cv::MediaFormat::NV12, {1920, 1080}}},
                     roi_t{cv::Rect{100,100,200,200}}},
     preproc_roi_args_t {"highgui/video/big_buck_bunny.h265",
-                    MFX_CODEC_HEVC,     MFX_ACCEL_MODE_VIA_D3D11,
+                    MFX_CODEC_HEVC,     UT_ACCEL_TYPE,
                     out_frame_info_t{cv::GFrameDesc {cv::MediaFormat::NV12, {1920, 1280}}},
                     roi_t{cv::Rect{0,0,100,100}}},
     preproc_roi_args_t {"highgui/video/big_buck_bunny.h265",
-                    MFX_CODEC_HEVC,     MFX_ACCEL_MODE_VIA_D3D11,
+                    MFX_CODEC_HEVC,     UT_ACCEL_TYPE,
                     out_frame_info_t{cv::GFrameDesc {cv::MediaFormat::NV12, {1920, 1280}}},
                     roi_t{}},
     preproc_roi_args_t {"highgui/video/big_buck_bunny.h265",
-                    MFX_CODEC_HEVC,     MFX_ACCEL_MODE_VIA_D3D11,
+                    MFX_CODEC_HEVC,     UT_ACCEL_TYPE,
                     out_frame_info_t{cv::GFrameDesc {cv::MediaFormat::NV12, {1920, 1280}}},
                     roi_t{cv::Rect{100,100,200,200}}}
 };
@@ -561,12 +592,10 @@ TEST_P(VPPInnerPreprocParams, functional_inner_preproc_size)
 
     file_path = findDataFile(file_path);
 
-    std::vector<CfgParam> cfg_params_w_dx11_vpp;
+    std::vector<CfgParam> cfg_params_w_accel_vpp;
 
     // create accel policy
-    cfg_params_w_dx11_vpp.push_back(CfgParam::create_acceleration_mode(accel));
-    std::unique_ptr<VPLAccelerationPolicy> accel_policy (
-                    new VPLDX11AccelerationPolicy(std::make_shared<CfgParamDeviceSelector>(cfg_params_w_dx11_vpp)));
+    std::unique_ptr<VPLAccelerationPolicy> accel_policy = create_accel_policy_from_int(accel, cfg_params_w_accel_vpp);
 
     // create file data provider
     std::shared_ptr<IDataProvider> data_provider(new FileDataProvider(file_path,
@@ -582,20 +611,20 @@ TEST_P(VPPInnerPreprocParams, functional_inner_preproc_size)
     EXPECT_EQ(MFX_ERR_NONE, sts);
 
     // fill vpp params beforehand: resolution
-    cfg_params_w_dx11_vpp.push_back(CfgParam::create_vpp_out_width(
+    cfg_params_w_accel_vpp.push_back(CfgParam::create_vpp_out_width(
                                         static_cast<uint16_t>(required_frame_param.size.width)));
-    cfg_params_w_dx11_vpp.push_back(CfgParam::create_vpp_out_height(
+    cfg_params_w_accel_vpp.push_back(CfgParam::create_vpp_out_height(
                                         static_cast<uint16_t>(required_frame_param.size.height)));
 
     // create transcode engine
     auto device_selector = accel_policy->get_device_selector();
     VPLLegacyTranscodeEngine engine(std::move(accel_policy));
     auto sess_ptr = engine.initialize_session(mfx_decode_session,
-                                              cfg_params_w_dx11_vpp,
+                                              cfg_params_w_accel_vpp,
                                               data_provider);
    // make test in loop
     bool in_progress = false;
-    size_t frames_processed_count = 1;
+    int frames_processed_count = 1;
     try {
         while(true) {
             cv::MediaFrame decoded_frame = extract_decoded_frame(sess_ptr->session, engine);
@@ -618,7 +647,8 @@ TEST_P(VPPInnerPreprocParams, functional_inner_preproc_size)
 INSTANTIATE_TEST_CASE_P(OneVPL_Source_PreprocInner, VPPInnerPreprocParams,
                         testing::ValuesIn(files));
 
-// Dispatcher test suite
+// enable only for WIN32 because there are not CPU processing on Linux by default
+#ifdef __WIN32__
 class VPPPreprocDispatcherROIParams : public ::testing::TestWithParam<preproc_roi_args_t> {};
 TEST_P(VPPPreprocDispatcherROIParams, functional_roi_different_threads)
 {
@@ -626,17 +656,15 @@ TEST_P(VPPPreprocDispatcherROIParams, functional_roi_different_threads)
     using namespace cv::gapi::wip::onevpl;
     source_t file_path;
     decoder_t decoder_id;
-    acceleration_t accel = MFX_ACCEL_MODE_VIA_D3D11;
+    acceleration_t accel = 0;
     out_frame_info_t required_frame_param;
     roi_t opt_roi;
-    std::tie(file_path, decoder_id, std::ignore, required_frame_param, opt_roi) = GetParam();
+    std::tie(file_path, decoder_id, accel, required_frame_param, opt_roi) = GetParam();
 
     file_path = findDataFile(file_path);
 
-    std::vector<CfgParam> cfg_params_w_dx11;
-    cfg_params_w_dx11.push_back(CfgParam::create_acceleration_mode(accel));
-    std::unique_ptr<VPLAccelerationPolicy> decode_accel_policy (
-                    new VPLDX11AccelerationPolicy(std::make_shared<CfgParamDeviceSelector>(cfg_params_w_dx11)));
+    std::vector<CfgParam> cfg_params_w_accel;
+    std::unique_ptr<VPLAccelerationPolicy> decode_accel_policy = create_accel_policy_from_int(accel, cfg_params_w_accel);
 
     // create file data provider
     std::shared_ptr<IDataProvider> data_provider(new FileDataProvider(file_path,
@@ -661,7 +689,7 @@ TEST_P(VPPPreprocDispatcherROIParams, functional_roi_different_threads)
     auto device_selector = decode_accel_policy->get_device_selector();
     VPLLegacyDecodeEngine decode_engine(std::move(decode_accel_policy));
     auto sess_ptr = decode_engine.initialize_session(mfx_decode_session,
-                                                     cfg_params_w_dx11,
+                                                     cfg_params_w_accel,
                                                      data_provider);
     std::vector<CfgParam> cfg_params_cpu;
     auto cpu_device_selector = std::make_shared<CfgParamDeviceSelector>(cfg_params_cpu);
@@ -673,16 +701,15 @@ TEST_P(VPPPreprocDispatcherROIParams, functional_roi_different_threads)
 
     // create VPP preproc engines
     VPPPreprocDispatcher preproc_dispatcher;
-    preproc_dispatcher.insert_worker<VPPPreprocEngine>(std::unique_ptr<VPLAccelerationPolicy>{
-                                                            new VPLDX11AccelerationPolicy(device_selector)});
+    preproc_dispatcher.insert_worker<VPPPreprocEngine>(create_accel_policy_from_int(accel, device_selector));
     preproc_dispatcher.insert_worker<VPPPreprocEngine>(std::unique_ptr<VPLAccelerationPolicy>{
                                                             new VPLCPUAccelerationPolicy(cpu_device_selector)});
 
     // launch threads
     SafeQueue queue;
-    size_t decoded_number = 1;
-    size_t cpu_decoded_number = 1;
-    size_t preproc_number = 0;
+    int decoded_number = 1;
+    int cpu_decoded_number = 1;
+    int preproc_number = 0;
 
     std::thread decode_thread(decode_function, std::ref(decode_engine), sess_ptr,
                               std::ref(queue), std::ref(decoded_number));
@@ -704,7 +731,6 @@ TEST_P(VPPPreprocDispatcherROIParams, functional_roi_different_threads)
 INSTANTIATE_TEST_CASE_P(OneVPL_Source_PreprocDispatcherROI, VPPPreprocDispatcherROIParams,
                         testing::ValuesIn(files_w_roi));
 
-#endif // HAVE_DIRECTX
-#endif // HAVE_D3D11
+#endif // __WIN32__
 } // namespace opencv_test
 #endif // HAVE_ONEVPL

From 8d0dae4cec928c67145574d5865d9ac4eb346b55 Mon Sep 17 00:00:00 2001
From: Chen Jun <chen.junn@outlook.com>
Date: Wed, 27 Apr 2022 18:13:57 +0800
Subject: [PATCH 026/178] Fix KalmanFilter comment error

If there will be measurement before the next predict, `statePost` would be assigned to updated value. So I guess these steps are meant to handle when no measurement and KF only do the predict step.
```cpp
statePre.copyTo(statePost);
errorCovPre.copyTo(errorCovPost);
```
---
 modules/video/src/kalman.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/video/src/kalman.cpp b/modules/video/src/kalman.cpp
index f90f9f7b9e..10e7ffbe02 100644
--- a/modules/video/src/kalman.cpp
+++ b/modules/video/src/kalman.cpp
@@ -96,7 +96,7 @@ const Mat& KalmanFilter::predict(const Mat& control)
     // P'(k) = temp1*At + Q
     gemm(temp1, transitionMatrix, 1, processNoiseCov, 1, errorCovPre, GEMM_2_T);
 
-    // handle the case when there will be measurement before the next predict.
+    // handle the case when there will be no measurement before the next predict.
     statePre.copyTo(statePost);
     errorCovPre.copyTo(errorCovPost);
 

From a3f81b79ed5528494a8cf5a007e57290a9c900b8 Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <76472231+asenyaev@users.noreply.github.com>
Date: Thu, 12 May 2022 11:52:23 +0300
Subject: [PATCH 027/178] Merge pull request #21917 from
 asenyaev:asen/self_hosted_runner_linux_3.4

Workflow to use a self-hosted Github runner on Linux x86_64 for 3.4 branch

* Changed a workflow to use a self-hosted Github runner for 3.4 branch
---
 .github/workflows/PR-3.4-U20.yaml | 39 ++++++++++++++++---------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/.github/workflows/PR-3.4-U20.yaml b/.github/workflows/PR-3.4-U20.yaml
index 2446a07478..e400962379 100644
--- a/.github/workflows/PR-3.4-U20.yaml
+++ b/.github/workflows/PR-3.4-U20.yaml
@@ -6,24 +6,29 @@ on:
       - 3.4
 
 env:
-  EXTRA_CMAKE_OPTIONS: '-DBUILD_DOCS=ON -DPYTHON_DEFAULT_EXECUTABLE=/usr/bin/python3 -DBUILD_EXAMPLES=ON -DOPENCV_ENABLE_NONFREE=ON -DENABLE_CCACHE=OFF'
-  OPENCV_TEST_DATA_PATH: '/opencv_extra/testdata'
-  OPENCV_DOCKER_WORKDIR: '/__w/opencv/opencv'
+  EXTRA_CMAKE_OPTIONS: '-DBUILD_DOCS=ON -DPYTHON_DEFAULT_EXECUTABLE=/usr/bin/python3 -DOPENCV_DOWNLOAD_PATH=/home/ci/binaries_cache -DBUILD_EXAMPLES=ON -DOPENCV_ENABLE_NONFREE=ON'
   PR_AUTHOR: ${{ github.event.pull_request.user.login }}
   PR_AUTHOR_FORK: ${{ github.event.pull_request.head.repo.full_name }}
   SOURCE_BRANCH_NAME: ${{ github.head_ref }}
   TARGET_BRANCH_NAME: ${{ github.base_ref }}
-  ANT_HOME: '/usr/share/ant'
+  ANT_HOME: /usr/share/ant
+  GIT_CACHE_DOCKER: /home/ci/git_cache
   PYTHONPATH: /opencv-build/python_loader:$PYTHONPATH
+  OPENCV_TEST_DATA_PATH: /opencv_extra/testdata
+  OPENCV_DOCKER_WORKDIR: /opencv
 
 jobs:
   BuildAndTest:
-    runs-on: ubuntu-20.04
+    runs-on: opencv-cn-lin-x86-64
     defaults:
       run:
         shell: bash
     container:
       image: quay.io/asenyaev/opencv-ubuntu:20.04
+      volumes:
+        - /home/opencv-cn/git_cache:/home/ci/git_cache
+        - /home/opencv-cn/ci_cache/opencv:/home/ci/.ccache
+        - /home/opencv-cn/binaries_cache:/home/ci/binaries_cache
     steps:
     - name: PR info
       run: |
@@ -31,13 +36,9 @@ jobs:
         echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
         echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
     - name: Clean
-      run: find . -mindepth 1 -delete
+      run: find ${{ env.OPENCV_DOCKER_WORKDIR }} -mindepth 1 -delete
     - name: Fetch opencv
-      uses: actions/checkout@v3
-      with:
-        repository: opencv/opencv
-        ref: ${{ env.TARGET_BRANCH_NAME }}
-        fetch-depth: 0
+      run: git clone --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE_DOCKER }}/opencv.git https://github.com/opencv/opencv.git ${{ env.OPENCV_DOCKER_WORKDIR }}
     - name: Merge opencv with ${{ env.SOURCE_BRANCH_NAME }} branch
       run: |
         cd ${{ env.OPENCV_DOCKER_WORKDIR }}
@@ -46,7 +47,7 @@ jobs:
         git config user.name "opencv.ci"
         git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
     - name: Clone opencv_extra
-      run: git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} https://github.com/opencv/opencv_extra.git /opencv_extra
+      run: git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE_DOCKER }}/opencv_extra.git https://github.com/opencv/opencv_extra.git /opencv_extra
     - name: Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch
       run: |
         OPENCV_EXTRA_FORK=$(git ls-remote --heads "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}") || true
@@ -141,12 +142,16 @@ jobs:
       run: cd /opencv-build && cmake --build . --config release --target check_pylint -- -j4
 
   BuildContrib:
-    runs-on: ubuntu-20.04
+    runs-on: opencv-cn-lin-x86-64
     defaults:
       run:
         shell: bash
     container:
       image: quay.io/asenyaev/opencv-ubuntu:20.04
+      volumes:
+        - /home/opencv-cn/git_cache:/home/ci/git_cache
+        - /home/opencv-cn/ci_cache/opencv:/home/ci/.ccache
+        - /home/opencv-cn/binaries_cache:/home/ci/binaries_cache
     steps:
     - name: PR info
       run: |
@@ -154,13 +159,9 @@ jobs:
         echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
         echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
     - name: Clean
-      run: find . -mindepth 1 -delete
+      run: find ${{ env.OPENCV_DOCKER_WORKDIR }} -mindepth 1 -delete
     - name: Fetch opencv
-      uses: actions/checkout@v3
-      with:
-        repository: opencv/opencv
-        ref: ${{ env.TARGET_BRANCH_NAME }}
-        fetch-depth: 0
+      run: git clone --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE_DOCKER }}/opencv.git https://github.com/opencv/opencv.git ${{ env.OPENCV_DOCKER_WORKDIR }}
     - name: Merge opencv with a test branch
       run: |
         cd ${{ env.OPENCV_DOCKER_WORKDIR }}

From 2b67bc448d2b74103a0202b4bfa0342175df8dfb Mon Sep 17 00:00:00 2001
From: Victor <kataev.victor.1234@gmail.com>
Date: Thu, 12 May 2022 11:56:58 +0300
Subject: [PATCH 028/178] Merge pull request #21931 from
 victor1234:calib3d-add-undistortImagePoints

Add undistortImagePoints function

* Add undistortImagePoints function
undistortPoints has unclear interface and additional functionality. New function computes only undistorted image points position

* Add undistortImagePoints test

* Add TermCriteria

* Fix layout
---
 modules/calib3d/include/opencv2/calib3d.hpp   | 16 ++++++++
 modules/calib3d/src/undistort.dispatch.cpp    |  6 +++
 .../calib3d/test/test_undistort_points.cpp    | 38 +++++++++++++++++++
 3 files changed, 60 insertions(+)

diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp
index 5fe7fb9596..ad3527aa42 100644
--- a/modules/calib3d/include/opencv2/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d.hpp
@@ -45,6 +45,7 @@
 #define OPENCV_CALIB3D_HPP
 
 #include "opencv2/core.hpp"
+#include "opencv2/core/types.hpp"
 #include "opencv2/features2d.hpp"
 #include "opencv2/core/affine.hpp"
 
@@ -3721,6 +3722,21 @@ void undistortPoints(InputArray src, OutputArray dst,
                      InputArray cameraMatrix, InputArray distCoeffs,
                      InputArray R, InputArray P, TermCriteria criteria);
 
+/**
+ * @brief Compute undistorted image points position
+ *
+ * @param src Observed points position, 2xN/Nx2 1-channel or 1xN/Nx1 2-channel (CV_32FC2 or
+CV_64FC2) (or vector\<Point2f\> ).
+ * @param dst Output undistorted points position (1xN/Nx1 2-channel or vector\<Point2f\> ).
+ * @param cameraMatrix Camera matrix \f$\vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ .
+ * @param distCoeffs Distortion coefficients
+ */
+CV_EXPORTS_W
+void undistortImagePoints(InputArray src, OutputArray dst, InputArray cameraMatrix,
+                          InputArray distCoeffs,
+                          TermCriteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5,
+                                                      0.01));
+
 //! @} calib3d
 
 /** @brief The methods in this namespace use a so-called fisheye camera model.
diff --git a/modules/calib3d/src/undistort.dispatch.cpp b/modules/calib3d/src/undistort.dispatch.cpp
index 146befd955..6c3d32941c 100644
--- a/modules/calib3d/src/undistort.dispatch.cpp
+++ b/modules/calib3d/src/undistort.dispatch.cpp
@@ -40,6 +40,7 @@
 //
 //M*/
 
+#include "opencv2/core/types.hpp"
 #include "precomp.hpp"
 #include "distortion_model.hpp"
 
@@ -607,6 +608,11 @@ void undistortPoints(InputArray _src, OutputArray _dst,
     cvUndistortPointsInternal(&_csrc, &_cdst, &_ccameraMatrix, pD, pR, pP, criteria);
 }
 
+void undistortImagePoints(InputArray src, OutputArray dst, InputArray cameraMatrix, InputArray distCoeffs, TermCriteria termCriteria)
+{
+    undistortPoints(src, dst, cameraMatrix, distCoeffs, noArray(), cameraMatrix, termCriteria);
+}
+
 static Point2f mapPointSpherical(const Point2f& p, float alpha, Vec4d* J, enum UndistortTypes projType)
 {
     double x = p.x, y = p.y;
diff --git a/modules/calib3d/test/test_undistort_points.cpp b/modules/calib3d/test/test_undistort_points.cpp
index 962ff03cca..f92bec068b 100644
--- a/modules/calib3d/test/test_undistort_points.cpp
+++ b/modules/calib3d/test/test_undistort_points.cpp
@@ -3,6 +3,7 @@
 // of this distribution and at http://opencv.org/license.html.
 
 #include <opencv2/ts/cuda_test.hpp> // EXPECT_MAT_NEAR
+#include "opencv2/core/types.hpp"
 #include "test_precomp.hpp"
 
 namespace opencv_test { namespace {
@@ -97,6 +98,43 @@ TEST_F(UndistortPointsTest, accuracy)
     }
 }
 
+TEST_F(UndistortPointsTest, undistortImagePointsAccuracy)
+{
+    Mat intrinsics, distCoeffs;
+    generateCameraMatrix(intrinsics);
+
+    vector<Point3f> points(500);
+    generate3DPointCloud(points);
+
+
+    int modelMembersCount[] = {4,5,8};
+    for (int idx = 0; idx < 3; idx++)
+    {
+        generateDistCoeffs(distCoeffs, modelMembersCount[idx]);
+
+        /* Project points with distortion */
+        vector<Point2f> projectedPoints;
+        projectPoints(Mat(points), Mat::zeros(3,1,CV_64FC1),
+                      Mat::zeros(3,1,CV_64FC1), intrinsics,
+                      distCoeffs, projectedPoints);
+
+        /* Project points without distortion */
+        vector<Point2f> realUndistortedPoints;
+        projectPoints(Mat(points), Mat::zeros(3, 1, CV_64FC1),
+                      Mat::zeros(3,1,CV_64FC1), intrinsics,
+                      Mat::zeros(4,1,CV_64FC1), realUndistortedPoints);
+
+        /* Undistort points */
+        Mat undistortedPoints;
+        TermCriteria termCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, thresh / 2);
+        undistortImagePoints(Mat(projectedPoints), undistortedPoints, intrinsics, distCoeffs,
+                             termCriteria);
+
+        EXPECT_MAT_NEAR(realUndistortedPoints, undistortedPoints.t(), thresh);
+    }
+}
+
+
 TEST_F(UndistortPointsTest, stop_criteria)
 {
     Mat cameraMatrix = (Mat_<double>(3,3,CV_64F) << 857.48296979, 0, 968.06224829,

From 64e1d23cba84dd6770afeadbf78ba92f1a359eec Mon Sep 17 00:00:00 2001
From: Julian Szigethy <70467158+Julian-Sz@users.noreply.github.com>
Date: Tue, 10 May 2022 18:11:27 +0200
Subject: [PATCH 029/178] Fix: Python and Java switched in Explanation

---
 .../hough_circle/hough_circle.markdown        | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/doc/tutorials/imgproc/imgtrans/hough_circle/hough_circle.markdown b/doc/tutorials/imgproc/imgtrans/hough_circle/hough_circle.markdown
index c4a9989e2b..497215794b 100644
--- a/doc/tutorials/imgproc/imgtrans/hough_circle/hough_circle.markdown
+++ b/doc/tutorials/imgproc/imgtrans/hough_circle/hough_circle.markdown
@@ -74,11 +74,11 @@ The image we used can be found [here](https://raw.githubusercontent.com/opencv/o
 @end_toggle
 
 @add_toggle_java
-@snippet samples/python/tutorial_code/ImgTrans/HoughCircle/hough_circle.py load
+@snippet samples/java/tutorial_code/ImgTrans/HoughCircle/HoughCircles.java load
 @end_toggle
 
 @add_toggle_python
-@snippet samples/java/tutorial_code/ImgTrans/HoughCircle/HoughCircles.java load
+@snippet samples/python/tutorial_code/ImgTrans/HoughCircle/hough_circle.py load
 @end_toggle
 
 ####  Convert it to grayscale:
@@ -88,11 +88,11 @@ The image we used can be found [here](https://raw.githubusercontent.com/opencv/o
 @end_toggle
 
 @add_toggle_java
-@snippet samples/python/tutorial_code/ImgTrans/HoughCircle/hough_circle.py convert_to_gray
+@snippet samples/java/tutorial_code/ImgTrans/HoughCircle/HoughCircles.java convert_to_gray
 @end_toggle
 
 @add_toggle_python
-@snippet samples/java/tutorial_code/ImgTrans/HoughCircle/HoughCircles.java convert_to_gray
+@snippet samples/python/tutorial_code/ImgTrans/HoughCircle/hough_circle.py convert_to_gray
 @end_toggle
 
 #### Apply a Median blur to reduce noise and avoid false circle detection:
@@ -102,11 +102,11 @@ The image we used can be found [here](https://raw.githubusercontent.com/opencv/o
 @end_toggle
 
 @add_toggle_java
-@snippet samples/python/tutorial_code/ImgTrans/HoughCircle/hough_circle.py reduce_noise
+@snippet samples/java/tutorial_code/ImgTrans/HoughCircle/HoughCircles.java reduce_noise
 @end_toggle
 
 @add_toggle_python
-@snippet samples/java/tutorial_code/ImgTrans/HoughCircle/HoughCircles.java reduce_noise
+@snippet samples/python/tutorial_code/ImgTrans/HoughCircle/hough_circle.py reduce_noise
 @end_toggle
 
 #### Proceed to apply Hough Circle Transform:
@@ -116,11 +116,11 @@ The image we used can be found [here](https://raw.githubusercontent.com/opencv/o
 @end_toggle
 
 @add_toggle_java
-@snippet samples/python/tutorial_code/ImgTrans/HoughCircle/hough_circle.py houghcircles
+@snippet samples/java/tutorial_code/ImgTrans/HoughCircle/HoughCircles.java houghcircles
 @end_toggle
 
 @add_toggle_python
-@snippet samples/java/tutorial_code/ImgTrans/HoughCircle/HoughCircles.java houghcircles
+@snippet samples/python/tutorial_code/ImgTrans/HoughCircle/hough_circle.py houghcircles
 @end_toggle
 
 -   with the arguments:
@@ -144,11 +144,11 @@ The image we used can be found [here](https://raw.githubusercontent.com/opencv/o
 @end_toggle
 
 @add_toggle_java
-@snippet samples/python/tutorial_code/ImgTrans/HoughCircle/hough_circle.py draw
+@snippet samples/java/tutorial_code/ImgTrans/HoughCircle/HoughCircles.java draw
 @end_toggle
 
 @add_toggle_python
-@snippet samples/java/tutorial_code/ImgTrans/HoughCircle/HoughCircles.java draw
+@snippet samples/python/tutorial_code/ImgTrans/HoughCircle/hough_circle.py draw
 @end_toggle
 
 You can see that we will draw the circle(s) on red and the center(s) with a small green dot
@@ -160,11 +160,11 @@ You can see that we will draw the circle(s) on red and the center(s) with a smal
 @end_toggle
 
 @add_toggle_java
-@snippet samples/python/tutorial_code/ImgTrans/HoughCircle/hough_circle.py display
+@snippet samples/java/tutorial_code/ImgTrans/HoughCircle/HoughCircles.java display
 @end_toggle
 
 @add_toggle_python
-@snippet samples/java/tutorial_code/ImgTrans/HoughCircle/HoughCircles.java display
+@snippet samples/python/tutorial_code/ImgTrans/HoughCircle/hough_circle.py display
 @end_toggle
 
 Result

From 2b5b192cd728f2d8719cb672591f13d64bfd9f10 Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <andrey.senyaev@gmail.com>
Date: Fri, 13 May 2022 01:37:55 +0300
Subject: [PATCH 030/178] Moved from https to ssh for Windows in Github Actions

---
 .github/workflows/PR-3.4-W10.yaml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/PR-3.4-W10.yaml b/.github/workflows/PR-3.4-W10.yaml
index d965982f5a..ec9614ccd4 100644
--- a/.github/workflows/PR-3.4-W10.yaml
+++ b/.github/workflows/PR-3.4-W10.yaml
@@ -30,25 +30,25 @@ jobs:
     - name: Clean
       run: cd ${{ github.workspace }} && rm -rf *
     - name: Fetch opencv
-      run: cd ${{ github.workspace }} && git clone --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE }}\opencv.git https://github.com/opencv/opencv.git
+      run: cd ${{ github.workspace }} && git clone --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE }}\opencv.git git@github.com:opencv/opencv.git
     - name: Merge opencv with ${{ env.SOURCE_BRANCH_NAME }} branch
       run: |
         cd ${{ github.workspace }}\opencv
         git config user.email "opencv.ci"
         git config user.name "opencv.ci"
-        git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
+        git pull -v "git@github.com:${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
     - name: Fetch opencv_extra
-      run: cd ${{ github.workspace }} && git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE }}\opencv_extra.git https://github.com/opencv/opencv_extra.git
+      run: cd ${{ github.workspace }} && git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE }}\opencv_extra.git git@github.com:opencv/opencv_extra.git
     - name: Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch
       shell: bash
       run: |
-        RET=$(git ls-remote --heads "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}") || true
+        RET=$(git ls-remote --heads "git@github.com:${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}") || true
         if [[ ! -z "$RET" ]]; then
           echo "Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch"
           cd opencv_extra
           git config user.email "opencv.ci"
           git config user.name "opencv.ci"
-          git pull -v "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}"
+          git pull -v "git@github.com:${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}"
         else
           echo "No merge since ${{ env.PR_AUTHOR }}/opencv_extra does not have branch ${{ env.SOURCE_BRANCH_NAME }}"
         fi
@@ -145,15 +145,15 @@ jobs:
     - name: Clean
       run: cd ${{ github.workspace }} && rm -rf *
     - name: Fetch opencv
-      run: cd ${{ github.workspace }} && git clone --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE }}\opencv.git https://github.com/opencv/opencv.git
+      run: cd ${{ github.workspace }} && git clone --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE }}\opencv.git git@github.com:opencv/opencv.git
     - name: Merge opencv with ${{ env.SOURCE_BRANCH_NAME }} branch
       run: |
         cd ${{ github.workspace }}\opencv
         git config user.email "opencv.ci"
         git config user.name "opencv.ci"
-        git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
+        git pull -v "git@github.com:${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
     - name: Fetch opencv_contrib
-      run: cd ${{ github.workspace }} && git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE }}\opencv_contrib.git --depth 1 https://github.com/opencv/opencv_contrib.git
+      run: cd ${{ github.workspace }} && git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE }}\opencv_contrib.git --depth 1 git@github.com:opencv/opencv_contrib.git
     - name: Configure OpenCV Contrib
       run: |
         mkdir ${{ github.workspace }}\opencv-contrib-build && cd ${{ github.workspace }}\opencv-contrib-build

From 2238ac7d596fa94013eaeb5a1797753b141231ea Mon Sep 17 00:00:00 2001
From: xiongzhen <xiongzh@mail2.sysu.edu.cn>
Date: Fri, 13 May 2022 09:44:25 +0300
Subject: [PATCH 031/178] Apply TIFFTAG_PREDICTOR option for LZW, DEFLATE and
 ADOBE_DEFLATE compression cases for smaller files.

---
 modules/imgcodecs/src/grfmt_tiff.cpp |  2 +-
 modules/imgcodecs/test/test_tiff.cpp | 42 ++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/modules/imgcodecs/src/grfmt_tiff.cpp b/modules/imgcodecs/src/grfmt_tiff.cpp
index d6f05f7a65..04df6ff8bb 100644
--- a/modules/imgcodecs/src/grfmt_tiff.cpp
+++ b/modules/imgcodecs/src/grfmt_tiff.cpp
@@ -901,7 +901,7 @@ bool TiffEncoder::writeLibTiff( const std::vector<Mat>& img_vec, const std::vect
 
         CV_TIFF_CHECK_CALL(TIFFSetField(tif, TIFFTAG_SAMPLEFORMAT, depth >= CV_32F ? SAMPLEFORMAT_IEEEFP : SAMPLEFORMAT_UINT));
 
-        if (page_compression != COMPRESSION_NONE)
+        if (page_compression == COMPRESSION_LZW || page_compression == COMPRESSION_ADOBE_DEFLATE || page_compression == COMPRESSION_DEFLATE)
         {
             CV_TIFF_CHECK_CALL(TIFFSetField(tif, TIFFTAG_PREDICTOR, predictor));
         }
diff --git a/modules/imgcodecs/test/test_tiff.cpp b/modules/imgcodecs/test/test_tiff.cpp
index add15ff681..063bd9ae50 100644
--- a/modules/imgcodecs/test/test_tiff.cpp
+++ b/modules/imgcodecs/test/test_tiff.cpp
@@ -219,6 +219,48 @@ TEST(Imgcodecs_Tiff, readWrite_32FC3_RAW)
     EXPECT_EQ(0, remove(filenameOutput.c_str()));
 }
 
+TEST(Imgcodecs_Tiff, readWrite_predictor)
+{
+    /* see issue #21871
+     */
+    const uchar sample_data[160] = {
+        0xff, 0xff, 0xff, 0xff, 0x88, 0x88, 0xff, 0xff, 0x88, 0x88, 0xff, 0xff, 0xff, 0xff, 0xff, 0x88,
+        0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00,
+        0xff, 0x00, 0x00, 0x44, 0xff, 0xff, 0x88, 0xff, 0x33, 0x00, 0x66, 0xff, 0xff, 0x88, 0x00, 0x44,
+        0x88, 0x00, 0x44, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x44, 0xff, 0xff, 0x11, 0x00, 0xff,
+        0x11, 0x00, 0x88, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0xff,
+        0x11, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x33, 0x00, 0x88, 0xff, 0x00, 0x66, 0xff,
+        0x11, 0x00, 0x66, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x44, 0x33, 0x00, 0xff, 0xff,
+        0x88, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff,
+        0xff, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0x33, 0x00, 0x00, 0x66, 0xff, 0xff,
+        0xff, 0xff, 0x88, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0xff, 0xff, 0xff
+    };
+
+    cv::Mat mat(10, 16, CV_8UC1, (void*)sample_data);
+    int methods[] = {
+        COMPRESSION_NONE,     COMPRESSION_LZW,
+        COMPRESSION_PACKBITS, COMPRESSION_DEFLATE,  COMPRESSION_ADOBE_DEFLATE
+    };
+    for (size_t i = 0; i < sizeof(methods) / sizeof(int); i++)
+    {
+        string out = cv::tempfile(".tif");
+
+        std::vector<int> params;
+        params.push_back(TIFFTAG_COMPRESSION);
+        params.push_back(methods[i]);
+        params.push_back(TIFFTAG_PREDICTOR);
+        params.push_back(PREDICTOR_HORIZONTAL);
+
+        EXPECT_NO_THROW(cv::imwrite(out, mat, params));
+
+        const Mat img = cv::imread(out, IMREAD_UNCHANGED);
+        ASSERT_FALSE(img.empty());
+
+        ASSERT_EQ(0, cv::norm(mat, img, cv::NORM_INF));
+
+        EXPECT_EQ(0, remove(out.c_str()));
+    }
+}
 
 //==================================================================================================
 

From 2cc4309bf8399586399c683eb23d95156814d2d8 Mon Sep 17 00:00:00 2001
From: Yuantao Feng <yuantao.feng@opencv.org.cn>
Date: Sat, 14 May 2022 01:27:01 +0800
Subject: [PATCH 032/178] Merge pull request #21924 from
 fengyuentau:workflow_arm64_3.4

Workflow to build and test on ARM64 for 3.4 branch

* add ARM64 workflow for 3.4

* add opencv_contrib fork check

* rename RET to OPENCV_EXTRA_FORK
---
 .github/workflows/PR-3.4-ARM64.yaml | 189 ++++++++++++++++++++++++++++
 1 file changed, 189 insertions(+)
 create mode 100644 .github/workflows/PR-3.4-ARM64.yaml

diff --git a/.github/workflows/PR-3.4-ARM64.yaml b/.github/workflows/PR-3.4-ARM64.yaml
new file mode 100644
index 0000000000..affb2cb9ec
--- /dev/null
+++ b/.github/workflows/PR-3.4-ARM64.yaml
@@ -0,0 +1,189 @@
+name: PR:3.4 ARM64
+
+on:
+  pull_request:
+    branches:
+      - 3.4
+
+env:
+  EXTRA_CMAKE_OPTIONS: '-DBUILD_DOCS=ON -DPYTHON_DEFAULT_EXECUTABLE=/usr/bin/python3 -DBUILD_opencv_xfeatures2d=OFF -DBUILD_EXAMPLES=ON -DOPENCV_ENABLE_NONFREE=ON -DENABLE_CCACHE=OFF'
+  OPENCV_TEST_DATA_PATH: '/opencv_extra/testdata'
+  OPENCV_DOCKER_WORKDIR: '/__w/opencv/opencv'
+  PR_AUTHOR: ${{ github.event.pull_request.user.login }}
+  PR_AUTHOR_FORK: ${{ github.event.pull_request.head.repo.full_name }}
+  SOURCE_BRANCH_NAME: ${{ github.head_ref }}
+  TARGET_BRANCH_NAME: ${{ github.base_ref }}
+  ANT_HOME: '/usr/share/ant'
+  PYTHONPATH: /opencv-build/python_loader:$PYTHONPATH
+
+jobs:
+  BuildAndTest:
+    runs-on: opencv-cn-lin-arm64
+    defaults:
+      run:
+        shell: bash
+    container:
+      image: docker.io/yuentau/ocv_ubuntu:20.04-arm64
+    steps:
+    - name: PR info
+      run: |
+        echo "PR Author: ${{ env.PR_AUTHOR }}"
+        echo "PR Author fork: ${{ env.PR_AUTHOR_FORK }}"
+        echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
+        echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
+    - name: Clean
+      run: find . -mindepth 1 -delete
+    - name: Fetch opencv
+      uses: actions/checkout@v3
+      with:
+        repository: opencv/opencv
+        ref: ${{ env.TARGET_BRANCH_NAME }}
+        fetch-depth: 0
+    - name: Merge opencv with ${{ env.SOURCE_BRANCH_NAME }} branch
+      run: |
+        cd ${{ env.OPENCV_DOCKER_WORKDIR }}
+        git config --global --add safe.directory ${{ env.OPENCV_DOCKER_WORKDIR }}
+        git config user.email "opencv.ci"
+        git config user.name "opencv.ci"
+        git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
+    - name: Clone opencv_extra
+      run: git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} https://github.com/opencv/opencv_extra.git /opencv_extra
+    - name: Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch
+      run: |
+        OPENCV_EXTRA_FORK=$(git ls-remote --heads "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}") || true
+        if [[ ! -z "$OPENCV_EXTRA_FORK" ]]; then
+          echo "Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch"
+          cd /opencv_extra
+          git config user.email "opencv.ci"
+          git config user.name "opencv.ci"
+          git pull -v "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}"
+        else
+          echo "No merge since ${{ env.PR_AUTHOR }}/opencv_extra does not have branch ${{ env.SOURCE_BRANCH_NAME }}"
+        fi
+    - name: Configure OpenCV
+      run: |
+        cmake -G Ninja -B /opencv-build ${{ env.EXTRA_CMAKE_OPTIONS }} ${{ env.OPENCV_DOCKER_WORKDIR }}
+    - name: Build OpenCV
+      run: |
+        cd /opencv-build
+        ninja
+    - name: Accuracy:calib3d
+      run: cd /opencv-build && xvfb-run -a bin/opencv_test_calib3d
+    - name: Accuracy:core
+      run: cd /opencv-build && xvfb-run -a bin/opencv_test_core
+    - name: Accuracy:dnn
+      run: cd /opencv-build && xvfb-run -a bin/opencv_test_dnn
+    - name: Accuracy:features2d
+      run: cd /opencv-build && xvfb-run -a bin/opencv_test_features2d
+    - name: Accuracy:flann
+      run: cd /opencv-build && xvfb-run -a bin/opencv_test_flann
+    - name: Accuracy:highgui
+      run: cd /opencv-build && xvfb-run -a bin/opencv_test_highgui
+    - name: Accuracy:imgcodecs
+      run: cd /opencv-build && xvfb-run -a bin/opencv_test_imgcodecs
+    - name: Accuracy:imgproc
+      run: cd /opencv-build && xvfb-run -a bin/opencv_test_imgproc
+    - name: Accuracy:ml
+      run: cd /opencv-build && xvfb-run -a bin/opencv_test_ml
+    - name: Accuracy:objdetect
+      run: cd /opencv-build && xvfb-run -a bin/opencv_test_objdetect --gtest_filter="-Objdetect_QRCode_Close.regression/0:Objdetect_QRCode_Close.regression/4"
+    - name: Accuracy:photo
+      run: cd /opencv-build && xvfb-run -a bin/opencv_test_photo --gtest_filter="-Photo_CalibrateDebevec.regression"
+    - name: Accuracy:shape
+      run: cd /opencv-build && xvfb-run -a bin/opencv_test_shape
+    - name: Accuracy:stitching
+      run: cd /opencv-build && xvfb-run -a bin/opencv_test_stitching
+    - name: Accuracy:superres
+      run: cd /opencv-build && xvfb-run -a bin/opencv_test_superres
+    - name: Accuracy:video
+      run: cd /opencv-build && xvfb-run -a bin/opencv_test_video
+    - name: Accuracy:videoio
+      run: cd /opencv-build && xvfb-run -a bin/opencv_test_videoio
+    - name: Accuracy:videostab
+      run: cd /opencv-build && xvfb-run -a bin/opencv_test_videostab
+    - name: Performance:calib3d
+      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_calib3d --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
+    - name: Performance:core
+      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_core --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
+    - name: Performance:dnn
+      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_dnn --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
+    - name: Performance:features2d
+      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_features2d --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
+    - name: Performance:imgcodecs
+      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_imgcodecs --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
+    - name: Performance:imgproc
+      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_imgproc --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
+    - name: Performance:objdetect
+      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_objdetect --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter="-Perf_Objdetect_QRCode.detect/2:Perf_Objdetect_QRCode_Multi.decodeMulti*:Perf_Objdetect_QRCode_Multi.detectMulti*"
+    - name: Performance:photo
+      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_photo --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
+    - name: Performance:stitching
+      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_stitching --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
+    - name: Performance:superres
+      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_superres --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
+    - name: Performance:video
+      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_video --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
+    - name: Performance:videoio
+      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_videoio --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
+    - name: Python3
+      run: |
+        cd ${{ env.OPENCV_DOCKER_WORKDIR }}/modules/python/test
+        python3 ./test.py --repo ../../../ -v
+    - name: Java
+      run: cd /opencv-build && xvfb-run -a python3 ${{ env.OPENCV_DOCKER_WORKDIR }}/modules/ts/misc/run.py . -a -t java
+    - name: Save Unit Test Results
+      uses: actions/upload-artifact@v3
+      if: always()
+      with:
+        name: junit-html
+        path: /opencv-build/java_test/testResults/junit-noframes.html
+
+  BuildContrib:
+    runs-on: opencv-cn-lin-arm64
+    defaults:
+      run:
+        shell: bash
+    container:
+      image: docker.io/yuentau/ocv_ubuntu:20.04-arm64
+    steps:
+    - name: PR info
+      run: |
+        echo "PR Author: ${{ env.PR_AUTHOR }}"
+        echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
+        echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
+    - name: Clean
+      run: find . -mindepth 1 -delete
+    - name: Fetch opencv
+      uses: actions/checkout@v3
+      with:
+        repository: opencv/opencv
+        ref: ${{ env.TARGET_BRANCH_NAME }}
+        fetch-depth: 0
+    - name: Merge opencv with a test branch
+      run: |
+        cd ${{ env.OPENCV_DOCKER_WORKDIR }}
+        git config --global --add safe.directory ${{ env.OPENCV_DOCKER_WORKDIR }}
+        git config user.email "opencv.ci"
+        git config user.name "opencv.ci"
+        git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
+    - name: Clone opencv_contrib
+      run: git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} https://github.com/opencv/opencv_contrib.git /opencv_contrib
+    - name: Merge opencv_contrib with ${{ env.SOURCE_BRANCH_NAME }} branch
+      run: |
+        OPENCV_CONTRIB_FORK=$(git ls-remote --heads "https://github.com/${{ env.PR_AUTHOR }}/opencv_contrib" "${{ env.SOURCE_BRANCH_NAME }}") || true
+        if [[ ! -z "$OPENCV_CONTRIB_FORK" ]]; then
+          echo "Merge opencv_contrib with ${{ env.SOURCE_BRANCH_NAME }} branch"
+          cd /opencv_contrib
+          git config user.email "opencv.ci"
+          git config user.name "opencv.ci"
+          git pull -v "https://github.com/${{ env.PR_AUTHOR }}/opencv_contrib" "${{ env.SOURCE_BRANCH_NAME }}"
+        else
+          echo "No merge since ${{ env.PR_AUTHOR }}/opencv_contrib does not have branch ${{ env.SOURCE_BRANCH_NAME }}"
+        fi
+    - name: Configure OpenCV Contrib
+      run: |
+        cmake -G Ninja -B /opencv-contrib-build ${{ env.EXTRA_CMAKE_OPTIONS }} -DOPENCV_EXTRA_MODULES_PATH=/opencv_contrib/modules ${{ env.OPENCV_DOCKER_WORKDIR }}
+    - name: Build OpenCV Contrib
+      run: |
+        cd /opencv-contrib-build
+        ninja

From 602caa9cd61a7dd88e86d049d0e5e27be0e68ad7 Mon Sep 17 00:00:00 2001
From: Kumataro <Kumataro@users.noreply.github.com>
Date: Sat, 14 May 2022 02:32:05 +0900
Subject: [PATCH 033/178] Merge pull request #21937 from Kumataro:4.x-fix-21911

* Fix warnings for clang15

* Fix warnings: Remove unnecessary code

* Fix warnings: Remove unnecessary code
---
 3rdparty/ittnotify/CMakeLists.txt             |  1 +
 3rdparty/libpng/CMakeLists.txt                |  2 ++
 3rdparty/libtiff/CMakeLists.txt               |  1 +
 3rdparty/libwebp/CMakeLists.txt               |  1 +
 3rdparty/openjpeg/CMakeLists.txt              |  1 +
 modules/calib3d/src/calibinit.cpp             |  2 +-
 .../calib3d/test/test_cameracalibration.cpp   | 20 -------------------
 modules/calib3d/test/test_fisheye.cpp         |  3 +--
 modules/core/perf/opencl/perf_matop.cpp       |  1 +
 modules/core/src/datastructs.cpp              |  4 +---
 modules/core/src/lpsolver.cpp                 |  5 +----
 modules/core/src/trace.cpp                    |  3 +--
 modules/features2d/src/evaluation.cpp         |  2 ++
 modules/features2d/src/kaze/KAZEFeatures.cpp  |  3 +--
 modules/features2d/src/keypoint.cpp           |  4 +++-
 .../include/opencv2/flann/index_testing.h     |  2 --
 modules/gapi/src/compiler/passes/exec.cpp     |  4 +++-
 modules/gapi/test/s11n/gapi_s11n_tests.cpp    |  2 --
 modules/imgproc/test/test_approxpoly.cpp      |  1 +
 modules/ml/src/kdtree.cpp                     |  3 ---
 modules/stitching/src/motion_estimators.cpp   |  4 ++++
 modules/video/test/test_optflowpyrlk.cpp      |  9 ++-------
 modules/video/test/test_trackers.impl.hpp     |  2 --
 23 files changed, 28 insertions(+), 52 deletions(-)

diff --git a/3rdparty/ittnotify/CMakeLists.txt b/3rdparty/ittnotify/CMakeLists.txt
index a227aff88e..0f39adcd4b 100644
--- a/3rdparty/ittnotify/CMakeLists.txt
+++ b/3rdparty/ittnotify/CMakeLists.txt
@@ -54,6 +54,7 @@ set_target_properties(${ITT_LIBRARY} PROPERTIES
     )
 
 ocv_warnings_disable(CMAKE_C_FLAGS -Wundef -Wsign-compare)
+ocv_warnings_disable(CMAKE_C_FLAGS -Wstrict-prototypes) # clang15
 
 if(ENABLE_SOLUTION_FOLDERS)
   set_target_properties(${ITT_LIBRARY} PROPERTIES FOLDER "3rdparty")
diff --git a/3rdparty/libpng/CMakeLists.txt b/3rdparty/libpng/CMakeLists.txt
index efa59627eb..bb0f3dd2c5 100644
--- a/3rdparty/libpng/CMakeLists.txt
+++ b/3rdparty/libpng/CMakeLists.txt
@@ -78,6 +78,8 @@ add_library(${PNG_LIBRARY} STATIC ${OPENCV_3RDPARTY_EXCLUDE_FROM_ALL} ${lib_srcs
 target_link_libraries(${PNG_LIBRARY} ${ZLIB_LIBRARIES})
 
 ocv_warnings_disable(CMAKE_C_FLAGS -Wundef -Wcast-align -Wimplicit-fallthrough -Wunused-parameter -Wsign-compare)
+ocv_warnings_disable(CMAKE_C_FLAGS -Wnull-pointer-subtraction)  # clang15
+ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-but-set-variable)  # clang15
 
 set_target_properties(${PNG_LIBRARY}
   PROPERTIES OUTPUT_NAME ${PNG_LIBRARY}
diff --git a/3rdparty/libtiff/CMakeLists.txt b/3rdparty/libtiff/CMakeLists.txt
index 2074888a52..d37cfff337 100644
--- a/3rdparty/libtiff/CMakeLists.txt
+++ b/3rdparty/libtiff/CMakeLists.txt
@@ -454,6 +454,7 @@ ocv_warnings_disable(CMAKE_C_FLAGS -Wno-unused-but-set-variable -Wmissing-protot
                                    -Wimplicit-fallthrough
 )
 ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-parameter) # clang
+ocv_warnings_disable(CMAKE_C_FLAGS -Wstrict-prototypes) # clang15
 ocv_warnings_disable(CMAKE_CXX_FLAGS -Wmissing-declarations -Wunused-parameter -Wmissing-prototypes
     -Wundef  # tiffiop.h: #if __clang_major__ >= 4
 )
diff --git a/3rdparty/libwebp/CMakeLists.txt b/3rdparty/libwebp/CMakeLists.txt
index 9160e2024c..723575c8db 100644
--- a/3rdparty/libwebp/CMakeLists.txt
+++ b/3rdparty/libwebp/CMakeLists.txt
@@ -45,6 +45,7 @@ ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-variable -Wunused-function -Wshadow
     -Wmissing-prototypes  # clang
     -Wmissing-declarations # gcc
     -Wimplicit-fallthrough
+    -Wunused-but-set-variable # clang15
 )
 ocv_warnings_disable(CMAKE_C_FLAGS /wd4244 /wd4267) # vs2005
 
diff --git a/3rdparty/openjpeg/CMakeLists.txt b/3rdparty/openjpeg/CMakeLists.txt
index fe766101d0..42730bab2b 100644
--- a/3rdparty/openjpeg/CMakeLists.txt
+++ b/3rdparty/openjpeg/CMakeLists.txt
@@ -13,6 +13,7 @@ project(openjpeg C)
 
 ocv_warnings_disable(CMAKE_C_FLAGS
     -Wimplicit-const-int-float-conversion  # clang
+    -Wunused-but-set-variable # clang15
 )
 
 #-----------------------------------------------------------------------------
diff --git a/modules/calib3d/src/calibinit.cpp b/modules/calib3d/src/calibinit.cpp
index e25dd26d21..9e96802388 100644
--- a/modules/calib3d/src/calibinit.cpp
+++ b/modules/calib3d/src/calibinit.cpp
@@ -1469,7 +1469,7 @@ int ChessBoardDetector::checkQuadGroup(std::vector<ChessBoardQuad*>& quad_group,
     first = below; // remember the first corner in the next row
 
     // find and store the first row (or column)
-    for (int j = 1; ; ++j)
+    while( 1 )
     {
         right->row = 0;
         out_corners.push_back(right);
diff --git a/modules/calib3d/test/test_cameracalibration.cpp b/modules/calib3d/test/test_cameracalibration.cpp
index c1704ad928..1c04bb78dd 100644
--- a/modules/calib3d/test/test_cameracalibration.cpp
+++ b/modules/calib3d/test/test_cameracalibration.cpp
@@ -568,12 +568,7 @@ void CV_CameraCalibrationTest::run( int start_from )
         /* ----- Compute reprojection error ----- */
         double dx,dy;
         double rx,ry;
-        double meanDx,meanDy;
-        double maxDx = 0.0;
-        double maxDy = 0.0;
 
-        meanDx = 0;
-        meanDy = 0;
         for( currImage = 0; currImage < numImages; currImage++ )
         {
             double imageMeanDx = 0;
@@ -585,20 +580,8 @@ void CV_CameraCalibrationTest::run( int start_from )
                 dx = rx - imagePoints[currImage][currPoint].x;
                 dy = ry - imagePoints[currImage][currPoint].y;
 
-                meanDx += dx;
-                meanDy += dy;
-
                 imageMeanDx += dx*dx;
                 imageMeanDy += dy*dy;
-
-                dx = fabs(dx);
-                dy = fabs(dy);
-
-                if( dx > maxDx )
-                    maxDx = dx;
-
-                if( dy > maxDy )
-                    maxDy = dy;
             }
             goodPerViewErrors[currImage] = sqrt( (imageMeanDx + imageMeanDy) /
                                            (etalonSize.width * etalonSize.height));
@@ -609,9 +592,6 @@ void CV_CameraCalibrationTest::run( int start_from )
                 perViewErrors[currImage] = goodPerViewErrors[currImage];
         }
 
-        meanDx /= numImages * etalonSize.width * etalonSize.height;
-        meanDy /= numImages * etalonSize.width * etalonSize.height;
-
         /* ========= Compare parameters ========= */
         CV_Assert(cameraMatrix.type() == CV_64F && cameraMatrix.size() == Size(3, 3));
         CV_Assert(distortion.type() == CV_64F);
diff --git a/modules/calib3d/test/test_fisheye.cpp b/modules/calib3d/test/test_fisheye.cpp
index ad3b066a6a..7e40493db3 100644
--- a/modules/calib3d/test/test_fisheye.cpp
+++ b/modules/calib3d/test/test_fisheye.cpp
@@ -227,7 +227,7 @@ TEST_F(fisheyeTest, undistortAndDistortImage)
     cv::Mat undPointsGt(imageHeight, imageWidth, CV_32FC2);
     cv::Mat imageGt(imageHeight, imageWidth, CV_8UC3);
 
-    for(int y = 0, k = 0; y < imageHeight; ++y)
+    for(int y = 0; y < imageHeight; ++y)
     {
         for(int x = 0; x < imageWidth; ++x)
         {
@@ -261,7 +261,6 @@ TEST_F(fisheyeTest, undistortAndDistortImage)
                 pixel_gt[2] = pixel[2];
             }
 
-            k++;
         }
     }
 
diff --git a/modules/core/perf/opencl/perf_matop.cpp b/modules/core/perf/opencl/perf_matop.cpp
index 5be1f431f6..b763a98e2a 100644
--- a/modules/core/perf/opencl/perf_matop.cpp
+++ b/modules/core/perf/opencl/perf_matop.cpp
@@ -233,6 +233,7 @@ PERF_TEST_P_(OpenCLBuffer, cpu_read)
             for (size_t x_bytes = 0; x_bytes < width_bytes; x_bytes++)
                 counter += (unsigned)(ptr[x_bytes]);
         }
+        (void)counter; // To avoid -Wunused-but-set-variable
     }
 
     SANITY_CHECK_NOTHING();
diff --git a/modules/core/src/datastructs.cpp b/modules/core/src/datastructs.cpp
index eec7ade02b..80b02283dc 100644
--- a/modules/core/src/datastructs.cpp
+++ b/modules/core/src/datastructs.cpp
@@ -133,8 +133,6 @@ cvCreateChildMemStorage( CvMemStorage * parent )
 static void
 icvDestroyMemStorage( CvMemStorage* storage )
 {
-    int k = 0;
-
     CvMemBlock *block;
     CvMemBlock *dst_top = 0;
 
@@ -144,7 +142,7 @@ icvDestroyMemStorage( CvMemStorage* storage )
     if( storage->parent )
         dst_top = storage->parent->top;
 
-    for( block = storage->bottom; block != 0; k++ )
+    for( block = storage->bottom; block != 0; )
     {
         CvMemBlock *temp = block;
 
diff --git a/modules/core/src/lpsolver.cpp b/modules/core/src/lpsolver.cpp
index 951da3fd7f..43fa73a025 100644
--- a/modules/core/src/lpsolver.cpp
+++ b/modules/core/src/lpsolver.cpp
@@ -253,11 +253,8 @@ static int initialize_simplex(Mat_<double>& c, Mat_<double>& b,double& v,vector<
 }
 
 static int inner_simplex(Mat_<double>& c, Mat_<double>& b,double& v,vector<int>& N,vector<int>& B,vector<unsigned int>& indexToRow){
-    int count=0;
-    for(;;){
-        dprintf(("iteration #%d\n",count));
-        count++;
 
+    for(;;){
         static MatIterator_<double> pos_ptr;
         int e=-1,pos_ctr=0,min_var=INT_MAX;
         bool all_nonzero=true;
diff --git a/modules/core/src/trace.cpp b/modules/core/src/trace.cpp
index 1011db5e3b..05acbde7c9 100644
--- a/modules/core/src/trace.cpp
+++ b/modules/core/src/trace.cpp
@@ -988,7 +988,6 @@ void parallelForFinalize(const Region& rootRegion)
     std::vector<TraceManagerThreadLocal*> threads_ctx;
     getTraceManager().tls.gather(threads_ctx);
     RegionStatistics parallel_for_stat;
-    int threads = 0;
     for (size_t i = 0; i < threads_ctx.size(); i++)
     {
         TraceManagerThreadLocal* child_ctx = threads_ctx[i];
@@ -996,7 +995,6 @@ void parallelForFinalize(const Region& rootRegion)
         if (child_ctx && child_ctx->stackTopRegion() == &rootRegion)
         {
             CV_LOG_PARALLEL(NULL, "Thread=" << child_ctx->threadID << " " << child_ctx->stat);
-            threads++;
             RegionStatistics child_stat;
             child_ctx->stat.grab(child_stat);
             parallel_for_stat.append(child_stat);
@@ -1012,6 +1010,7 @@ void parallelForFinalize(const Region& rootRegion)
             }
         }
     }
+
     float parallel_coeff = std::min(1.0f, duration / (float)(parallel_for_stat.duration));
     CV_LOG_PARALLEL(NULL, "parallel_coeff=" << 1.0f / parallel_coeff);
     CV_LOG_PARALLEL(NULL, parallel_for_stat);
diff --git a/modules/features2d/src/evaluation.cpp b/modules/features2d/src/evaluation.cpp
index ca7ab14500..7135891e0d 100644
--- a/modules/features2d/src/evaluation.cpp
+++ b/modules/features2d/src/evaluation.cpp
@@ -315,6 +315,8 @@ struct SIdx
         const SIdx& used;
         bool operator()(const SIdx& v) const { return  (v.i1 == used.i1 || v.i2 == used.i2); }
         UsedFinder& operator=(const UsedFinder&) = delete;
+        // To avoid -Wdeprecated-copy warning, copy constructor is needed.
+        UsedFinder(const UsedFinder&) = default;
     };
 };
 
diff --git a/modules/features2d/src/kaze/KAZEFeatures.cpp b/modules/features2d/src/kaze/KAZEFeatures.cpp
index ab591d417c..7bd0ba6dcd 100644
--- a/modules/features2d/src/kaze/KAZEFeatures.cpp
+++ b/modules/features2d/src/kaze/KAZEFeatures.cpp
@@ -312,7 +312,7 @@ void KAZEFeatures::Determinant_Hessian(std::vector<KeyPoint>& kpts)
 {
     int level = 0;
     float smax = 3.0;
-    int npoints = 0, id_repeated = 0;
+    int id_repeated = 0;
     int left_x = 0, right_x = 0, up_y = 0, down_y = 0;
     bool is_extremum = false, is_repeated = false, is_out = false;
 
@@ -383,7 +383,6 @@ void KAZEFeatures::Determinant_Hessian(std::vector<KeyPoint>& kpts)
                 if (is_out == false) {
                     if (is_repeated == false) {
                         kpts.push_back(kpts_par_ij);
-                        npoints++;
                     }
                     else {
                         kpts[id_repeated] = kpts_par_ij;
diff --git a/modules/features2d/src/keypoint.cpp b/modules/features2d/src/keypoint.cpp
index e14c9da94c..21d9eb30f7 100644
--- a/modules/features2d/src/keypoint.cpp
+++ b/modules/features2d/src/keypoint.cpp
@@ -148,10 +148,12 @@ public:
     {
         return mask.at<uchar>( (int)(key_pt.pt.y + 0.5f), (int)(key_pt.pt.x + 0.5f) ) == 0;
     }
+    MaskPredicate& operator=(const MaskPredicate&) = delete;
+    // To avoid -Wdeprecated-copy warning, copy constructor is needed.
+    MaskPredicate(const MaskPredicate&) = default;
 
 private:
     const Mat mask;
-    MaskPredicate& operator=(const MaskPredicate&) = delete;
 };
 
 void KeyPointsFilter::runByPixelsMask( std::vector<KeyPoint>& keypoints, const Mat& mask )
diff --git a/modules/flann/include/opencv2/flann/index_testing.h b/modules/flann/include/opencv2/flann/index_testing.h
index 207adef449..4c00143326 100644
--- a/modules/flann/include/opencv2/flann/index_testing.h
+++ b/modules/flann/include/opencv2/flann/index_testing.h
@@ -246,7 +246,6 @@ void test_index_precisions(NNIndex<Distance>& index, const Matrix<typename Dista
     float p2;
 
     int c1 = 1;
-    float p1;
 
     float time;
     DistanceType dist;
@@ -270,7 +269,6 @@ void test_index_precisions(NNIndex<Distance>& index, const Matrix<typename Dista
         precision = precisions[i];
         while (p2<precision) {
             c1 = c2;
-            p1 = p2;
             c2 *=2;
             p2 = search_with_ground_truth(index, inputData, testData, matches, nn, c2, time, dist, distance, skipMatches);
             if ((maxTime> 0)&&(time > maxTime)&&(p2<precision)) return;
diff --git a/modules/gapi/src/compiler/passes/exec.cpp b/modules/gapi/src/compiler/passes/exec.cpp
index f6a73489eb..93d833d602 100644
--- a/modules/gapi/src/compiler/passes/exec.cpp
+++ b/modules/gapi/src/compiler/passes/exec.cpp
@@ -606,7 +606,9 @@ namespace
         MergeContext mc;
 
         bool there_was_a_merge = false;
+#ifdef DEBUG_MERGE
         std::size_t iteration = 0u;
+#endif
         do
         {
             there_was_a_merge = false;
@@ -615,8 +617,8 @@ namespace
 #ifdef DEBUG_MERGE
             GAPI_LOG_INFO(NULL, "Before next merge attempt " << iteration << "...");
             merge_debug(g, iteration);
-#endif
             iteration++;
+#endif
             auto sorted = pass_helpers::topoSort(im);
             for (auto nh : sorted)
             {
diff --git a/modules/gapi/test/s11n/gapi_s11n_tests.cpp b/modules/gapi/test/s11n/gapi_s11n_tests.cpp
index 94b99f877a..22398ec92c 100644
--- a/modules/gapi/test/s11n/gapi_s11n_tests.cpp
+++ b/modules/gapi/test/s11n/gapi_s11n_tests.cpp
@@ -569,7 +569,6 @@ TEST_F(S11N_Basic, Test_Bind_RunArgs_MatScalar) {
     v[0] = cv::GRunArg{ mat };
     v[1] = cv::GRunArg{ scalar };
     GRunArgsP output = cv::gapi::bind(v);
-    unsigned int i = 0;
     for (auto it : output)
     {
         using T = cv::GRunArgP;
@@ -591,7 +590,6 @@ TEST_F(S11N_Basic, Test_Bind_RunArgs_MatScalar) {
             GAPI_Assert(false && "This value type is not supported!"); // ...maybe because of STANDALONE mode.
             break;
         }
-        i++;
     }
 }
 
diff --git a/modules/imgproc/test/test_approxpoly.cpp b/modules/imgproc/test/test_approxpoly.cpp
index aace416b97..f09475c9fc 100644
--- a/modules/imgproc/test/test_approxpoly.cpp
+++ b/modules/imgproc/test/test_approxpoly.cpp
@@ -220,6 +220,7 @@ int CV_ApproxPolyTest::check_slice( CvPoint StartPt, CvPoint EndPt,
 
     *_j = j;
 
+    (void) TotalErrors; // To avoid -Wunused-but-set-variable warning
     //return TotalErrors;
     return 0;
 }
diff --git a/modules/ml/src/kdtree.cpp b/modules/ml/src/kdtree.cpp
index a80e12964a..8cdab98f73 100644
--- a/modules/ml/src/kdtree.cpp
+++ b/modules/ml/src/kdtree.cpp
@@ -120,16 +120,13 @@ medianPartition( size_t* ofs, int a, int b, const float* vals )
     }
 
     float pivot = vals[ofs[middle]];
-    int less = 0, more = 0;
     for( k = a0; k < middle; k++ )
     {
         CV_Assert(vals[ofs[k]] <= pivot);
-        less += vals[ofs[k]] < pivot;
     }
     for( k = b0; k > middle; k-- )
     {
         CV_Assert(vals[ofs[k]] >= pivot);
-        more += vals[ofs[k]] > pivot;
     }
 
     return vals[ofs[middle]];
diff --git a/modules/stitching/src/motion_estimators.cpp b/modules/stitching/src/motion_estimators.cpp
index c0b46b101d..29616a4915 100644
--- a/modules/stitching/src/motion_estimators.cpp
+++ b/modules/stitching/src/motion_estimators.cpp
@@ -262,7 +262,9 @@ bool BundleAdjusterBase::estimate(const std::vector<ImageFeatures> &features,
     CvMat matParams = cvMat(cam_params_);
     cvCopy(&matParams, solver.param);
 
+#if ENABLE_LOG
     int iter = 0;
+#endif
     for(;;)
     {
         const CvMat* _param = 0;
@@ -287,7 +289,9 @@ bool BundleAdjusterBase::estimate(const std::vector<ImageFeatures> &features,
         {
             calcError(err);
             LOG_CHAT(".");
+#if ENABLE_LOG
             iter++;
+#endif
             CvMat tmp = cvMat(err);
             cvCopy(&tmp, _err);
         }
diff --git a/modules/video/test/test_optflowpyrlk.cpp b/modules/video/test/test_optflowpyrlk.cpp
index 905cfebafa..a79a0ff4e4 100644
--- a/modules/video/test/test_optflowpyrlk.cpp
+++ b/modules/video/test/test_optflowpyrlk.cpp
@@ -64,10 +64,9 @@ void CV_OptFlowPyrLKTest::run( int )
     const int bad_points_max = 8;
 
     /* test parameters */
-    double  max_err = 0., sum_err = 0;
-    int     pt_cmpd = 0;
+    double  max_err = 0.;
     int     pt_exceed = 0;
-    int     merr_i = 0, merr_j = 0, merr_k = 0, merr_nan = 0;
+    int     merr_i = 0, merr_nan = 0;
     char    filename[1000];
 
     cv::Point2f *v = 0, *v2 = 0;
@@ -155,7 +154,6 @@ void CV_OptFlowPyrLKTest::run( int )
             double err;
             if( cvIsNaN(v[i].x) || cvIsNaN(v[i].y) )
             {
-                merr_j++;
                 continue;
             }
 
@@ -173,15 +171,12 @@ void CV_OptFlowPyrLKTest::run( int )
             }
 
             pt_exceed += err > success_error_level;
-            sum_err += err;
-            pt_cmpd++;
         }
         else
         {
             if( !cvIsNaN( v[i].x ))
             {
                 merr_i = i;
-                merr_k++;
                 ts->printf( cvtest::TS::LOG, "The algorithm lost the point #%d\n", i );
                 code = cvtest::TS::FAIL_BAD_ACCURACY;
                 break;
diff --git a/modules/video/test/test_trackers.impl.hpp b/modules/video/test/test_trackers.impl.hpp
index 5ae98d1eb8..fc2315ced0 100644
--- a/modules/video/test/test_trackers.impl.hpp
+++ b/modules/video/test/test_trackers.impl.hpp
@@ -336,7 +336,6 @@ void TrackerTest<Tracker, ROI_t>::checkDataTest()
     gt2.open(gtFile.c_str());
     ASSERT_TRUE(gt2.is_open()) << gtFile;
     string line2;
-    int bbCounter2 = 0;
     while (getline(gt2, line2))
     {
         vector<string> tokens = splitString(line2, ",");
@@ -344,7 +343,6 @@ void TrackerTest<Tracker, ROI_t>::checkDataTest()
         ASSERT_EQ((size_t)4, tokens.size()) << "Incorrect ground truth file " << gtFile;
 
         bbs.push_back(bb);
-        bbCounter2++;
     }
     gt2.close();
 

From affcea8822b92967cb7c213f5f3cefb3171b286e Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <andrey.senyaev@gmail.com>
Date: Fri, 13 May 2022 15:38:51 +0300
Subject: [PATCH 034/178] Added opencv_contrib branch check on Windows

---
 .github/workflows/PR-3.4-W10.yaml | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/PR-3.4-W10.yaml b/.github/workflows/PR-3.4-W10.yaml
index ec9614ccd4..2cfed6a32e 100644
--- a/.github/workflows/PR-3.4-W10.yaml
+++ b/.github/workflows/PR-3.4-W10.yaml
@@ -42,8 +42,8 @@ jobs:
     - name: Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch
       shell: bash
       run: |
-        RET=$(git ls-remote --heads "git@github.com:${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}") || true
-        if [[ ! -z "$RET" ]]; then
+        OPENCV_EXTRA_FORK=$(git ls-remote --heads "git@github.com:/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}") || true
+        if [[ ! -z "$OPENCV_EXTRA_FORK" ]]; then
           echo "Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch"
           cd opencv_extra
           git config user.email "opencv.ci"
@@ -154,6 +154,19 @@ jobs:
         git pull -v "git@github.com:${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
     - name: Fetch opencv_contrib
       run: cd ${{ github.workspace }} && git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE }}\opencv_contrib.git --depth 1 git@github.com:opencv/opencv_contrib.git
+    - name: Merge opencv_contrib with ${{ env.SOURCE_BRANCH_NAME }} branch
+      shell: bash
+      run: |
+        OPENCV_CONTRIB_FORK=$(git ls-remote --heads "git@github.com:${{ env.PR_AUTHOR }}/opencv_contrib" "${{ env.SOURCE_BRANCH_NAME }}") || true
+        if [[ ! -z "$OPENCV_CONTRIB_FORK" ]]; then
+          echo "Merge opencv_contrib with ${{ env.SOURCE_BRANCH_NAME }} branch"
+          cd /opencv_contrib
+          git config user.email "opencv.ci"
+          git config user.name "opencv.ci"
+          git pull -v "git@github.com:${{ env.PR_AUTHOR }}/opencv_contrib" "${{ env.SOURCE_BRANCH_NAME }}"
+        else
+          echo "No merge since ${{ env.PR_AUTHOR }}/opencv_contrib does not have branch ${{ env.SOURCE_BRANCH_NAME }}"
+        fi
     - name: Configure OpenCV Contrib
       run: |
         mkdir ${{ github.workspace }}\opencv-contrib-build && cd ${{ github.workspace }}\opencv-contrib-build

From 8d0fbc6a1e9f20c822921e8076551a01e58cd632 Mon Sep 17 00:00:00 2001
From: Manolis Lourakis <60357220+mlourakis@users.noreply.github.com>
Date: Fri, 13 May 2022 21:11:14 +0300
Subject: [PATCH 035/178] Merge pull request #21702 from mlourakis:4.x

Fixes and optimizations for the SQPnP solver

* Fixes and optimizations

- optimized the calculation of qa_sum by moving equal elements outside the loop
- unrolled copying of the lower triangle of omega
- substituted SVD with eigendecomposition in the factorization of omega (2-3 times faster)
- fixed the initialization of lambda in FOAM
- added a cheirality test that checks a solution on all 3D points rather than on their mean. The old test rejected valid poses in some cases
- fixed some typos & errors in comments

* reverted to SVD

Eigen decomposition seems to yield larger errors in certain tests, reverted to SVD

* nearestRotationMatrixSVD

Added nearestRotationMatrixSVD()
Previous nearestRotationMatrix() renamed to nearestRotationMatrixFOAM() and reverts to nearestRotationMatrixSVD() for singular matrices

* fixed checks order

Fixed the order of checks in PoseSolver::solveInternal()
---
 modules/calib3d/src/sqpnp.cpp | 134 +++++++++++++++++++++++-----------
 modules/calib3d/src/sqpnp.hpp |  47 ++++++++----
 2 files changed, 125 insertions(+), 56 deletions(-)

diff --git a/modules/calib3d/src/sqpnp.cpp b/modules/calib3d/src/sqpnp.cpp
index 7117e61c96..3e0d7ace49 100644
--- a/modules/calib3d/src/sqpnp.cpp
+++ b/modules/calib3d/src/sqpnp.cpp
@@ -118,7 +118,7 @@ void PoseSolver::solve(InputArray objectPoints, InputArray imagePoints, OutputAr
     num_solutions_ = 0;
 
     computeOmega(_objectPoints, _imagePoints);
-    solveInternal();
+    solveInternal(_objectPoints);
 
     int depthRot = rvecs.fixedType() ? rvecs.depth() : CV_64F;
     int depthTrans = tvecs.fixedType() ? tvecs.depth() : CV_64F;
@@ -194,37 +194,41 @@ void PoseSolver::computeOmega(InputArray objectPoints, InputArray imagePoints)
         omega_(7, 7) += sq_norm * Y2; omega_(7, 8) += sq_norm * YZ;
         omega_(8, 8) += sq_norm * Z2;
 
-        //Compute qa_sum
+        //Compute qa_sum. Certain pairs of elements are equal, so filling them outside the loop saves some operations
         qa_sum(0, 0) += X; qa_sum(0, 1) += Y; qa_sum(0, 2) += Z;
-        qa_sum(1, 3) += X; qa_sum(1, 4) += Y; qa_sum(1, 5) += Z;
 
         qa_sum(0, 6) += -x * X; qa_sum(0, 7) += -x * Y; qa_sum(0, 8) += -x * Z;
         qa_sum(1, 6) += -y * X; qa_sum(1, 7) += -y * Y; qa_sum(1, 8) += -y * Z;
 
-        qa_sum(2, 0) += -x * X; qa_sum(2, 1) += -x * Y; qa_sum(2, 2) += -x * Z;
-        qa_sum(2, 3) += -y * X; qa_sum(2, 4) += -y * Y; qa_sum(2, 5) += -y * Z;
-
         qa_sum(2, 6) += sq_norm * X; qa_sum(2, 7) += sq_norm * Y; qa_sum(2, 8) += sq_norm * Z;
     }
 
+    //Complete qa_sum
+    qa_sum(1, 3) = qa_sum(0, 0); qa_sum(1, 4) = qa_sum(0, 1); qa_sum(1, 5) = qa_sum(0, 2);
+    qa_sum(2, 0) = qa_sum(0, 6); qa_sum(2, 1) = qa_sum(0, 7); qa_sum(2, 2) = qa_sum(0, 8);
+    qa_sum(2, 3) = qa_sum(1, 6); qa_sum(2, 4) = qa_sum(1, 7); qa_sum(2, 5) = qa_sum(1, 8);
+
 
+    //lower triangles of omega_'s off-diagonal blocks (0:2, 6:8), (3:5, 6:8) and (6:8, 6:8)
     omega_(1, 6) = omega_(0, 7); omega_(2, 6) = omega_(0, 8); omega_(2, 7) = omega_(1, 8);
     omega_(4, 6) = omega_(3, 7); omega_(5, 6) = omega_(3, 8); omega_(5, 7) = omega_(4, 8);
     omega_(7, 6) = omega_(6, 7); omega_(8, 6) = omega_(6, 8); omega_(8, 7) = omega_(7, 8);
 
-
+    //upper triangle of omega_'s block (3:5, 3:5)
     omega_(3, 3) = omega_(0, 0); omega_(3, 4) = omega_(0, 1); omega_(3, 5) = omega_(0, 2);
-    omega_(4, 4) = omega_(1, 1); omega_(4, 5) = omega_(1, 2);
-    omega_(5, 5) = omega_(2, 2);
-
-    //Mirror upper triangle to lower triangle
-    for (int r = 0; r < 9; r++)
-    {
-        for (int c = 0; c < r; c++)
-        {
-            omega_(r, c) = omega_(c, r);
-        }
-    }
+                                 omega_(4, 4) = omega_(1, 1); omega_(4, 5) = omega_(1, 2);
+                                                              omega_(5, 5) = omega_(2, 2);
+
+    //Mirror omega_'s upper triangle to lower triangle
+    //Note that elements (7, 6), (8, 6) & (8, 7) have already been assigned above
+    omega_(1, 0) = omega_(0, 1);
+    omega_(2, 0) = omega_(0, 2); omega_(2, 1) = omega_(1, 2);
+    omega_(3, 0) = omega_(0, 3); omega_(3, 1) = omega_(1, 3); omega_(3, 2) = omega_(2, 3);
+    omega_(4, 0) = omega_(0, 4); omega_(4, 1) = omega_(1, 4); omega_(4, 2) = omega_(2, 4); omega_(4, 3) = omega_(3, 4);
+    omega_(5, 0) = omega_(0, 5); omega_(5, 1) = omega_(1, 5); omega_(5, 2) = omega_(2, 5); omega_(5, 3) = omega_(3, 5); omega_(5, 4) = omega_(4, 5);
+    omega_(6, 0) = omega_(0, 6); omega_(6, 1) = omega_(1, 6); omega_(6, 2) = omega_(2, 6); omega_(6, 3) = omega_(3, 6); omega_(6, 4) = omega_(4, 6); omega_(6, 5) = omega_(5, 6);
+    omega_(7, 0) = omega_(0, 7); omega_(7, 1) = omega_(1, 7); omega_(7, 2) = omega_(2, 7); omega_(7, 3) = omega_(3, 7); omega_(7, 4) = omega_(4, 7); omega_(7, 5) = omega_(5, 7);
+    omega_(8, 0) = omega_(0, 8); omega_(8, 1) = omega_(1, 8); omega_(8, 2) = omega_(2, 8); omega_(8, 3) = omega_(3, 8); omega_(8, 4) = omega_(4, 8); omega_(8, 5) = omega_(5, 8);
 
     cv::Matx<double, 3, 3> q;
     q(0, 0) = n; q(0, 1) = 0; q(0, 2) = -sum_img.x;
@@ -247,6 +251,11 @@ void PoseSolver::computeOmega(InputArray objectPoints, InputArray imagePoints)
     cv::SVD omega_svd(omega_, cv::SVD::FULL_UV);
     s_ = omega_svd.w;
     u_ = cv::Mat(omega_svd.vt.t());
+#if 0
+    // EVD equivalent of the SVD; less accurate
+    cv::eigen(omega_, s_, u_);
+    u_ = u_.t(); // eigenvectors were returned as rows
+#endif
 
     CV_Assert(s_(0) >= 1e-7);
 
@@ -257,7 +266,7 @@ void PoseSolver::computeOmega(InputArray objectPoints, InputArray imagePoints)
     point_mean_ = cv::Vec3d(sum_obj.x / n, sum_obj.y / n, sum_obj.z / n);
 }
 
-void PoseSolver::solveInternal()
+void PoseSolver::solveInternal(InputArray objectPoints)
 {
     double min_sq_err = std::numeric_limits<double>::max();
     int num_eigen_points = num_null_vectors_ > 0 ? num_null_vectors_ : 1;
@@ -274,42 +283,39 @@ void PoseSolver::solveInternal()
         {
             solutions[0].r_hat = det3x3(e) * e;
             solutions[0].t = p_ * solutions[0].r_hat;
-            checkSolution(solutions[0], min_sq_err);
+            checkSolution(solutions[0], objectPoints, min_sq_err);
         }
         else
         {
             Matx<double, 9, 1> r;
-            nearestRotationMatrix(e, r);
+            nearestRotationMatrixFOAM(e, r);
             solutions[0] = runSQP(r);
             solutions[0].t = p_ * solutions[0].r_hat;
-            checkSolution(solutions[0], min_sq_err);
+            checkSolution(solutions[0], objectPoints, min_sq_err);
 
-            nearestRotationMatrix(-e, r);
+            nearestRotationMatrixFOAM(-e, r);
             solutions[1] = runSQP(r);
             solutions[1].t = p_ * solutions[1].r_hat;
-            checkSolution(solutions[1], min_sq_err);
+            checkSolution(solutions[1], objectPoints, min_sq_err);
         }
     }
 
-    int c = 1;
-
-    while (min_sq_err > 3 * s_[9 - num_eigen_points - c] && 9 - num_eigen_points - c > 0)
+    int index, c = 1;
+    while ((index = 9 - num_eigen_points - c) > 0 && min_sq_err > 3 * s_[index])
     {
-        int index = 9 - num_eigen_points - c;
-
         const cv::Matx<double, 9, 1> e = u_.col(index);
         SQPSolution solutions[2];
 
         Matx<double, 9, 1> r;
-        nearestRotationMatrix(e, r);
+        nearestRotationMatrixFOAM(e, r);
         solutions[0] = runSQP(r);
         solutions[0].t = p_ * solutions[0].r_hat;
-        checkSolution(solutions[0], min_sq_err);
+        checkSolution(solutions[0], objectPoints, min_sq_err);
 
-        nearestRotationMatrix(-e, r);
+        nearestRotationMatrixFOAM(-e, r);
         solutions[1] = runSQP(r);
         solutions[1].t = p_ * solutions[1].r_hat;
-        checkSolution(solutions[1], min_sq_err);
+        checkSolution(solutions[1], objectPoints, min_sq_err);
 
         c++;
     }
@@ -341,7 +347,7 @@ PoseSolver::SQPSolution PoseSolver::runSQP(const cv::Matx<double, 9, 1>& r0)
 
     if (det_r > SQP_DET_THRESHOLD)
     {
-        nearestRotationMatrix(r, solution.r_hat);
+        nearestRotationMatrixFOAM(r, solution.r_hat);
     }
     else
     {
@@ -615,12 +621,26 @@ void PoseSolver::computeRowAndNullspace(const cv::Matx<double, 9, 1>& r,
 
 }
 
-// faster nearest rotation computation based on FOAM (see: http://users.ics.forth.gr/~lourakis/publ/2018_iros.pdf )
+// if e = u*w*vt then r=u*diag([1, 1, det(u)*det(v)])*vt
+void PoseSolver::nearestRotationMatrixSVD(const cv::Matx<double, 9, 1>& e,
+    cv::Matx<double, 9, 1>& r)
+{
+    cv::Matx<double, 3, 3> e33 = e.reshape<3, 3>();
+    cv::SVD e33_svd(e33, cv::SVD::FULL_UV);
+    double detuv = cv::determinant(e33_svd.u)*cv::determinant(e33_svd.vt);
+    cv::Matx<double, 3, 3> diag = cv::Matx33d::eye();
+    diag(2, 2) = detuv;
+    cv::Matx<double, 3, 3> r33 = cv::Mat(e33_svd.u*diag*e33_svd.vt);
+    r = r33.reshape<9, 1>();
+}
+
+// Faster nearest rotation computation based on FOAM. See M. Lourakis: "An Efficient Solution to Absolute Orientation", ICPR 2016
+// and M. Lourakis, G. Terzakis: "Efficient Absolute Orientation Revisited", IROS 2018.
 /* Solve the nearest orthogonal approximation problem
     * i.e., given e, find R minimizing ||R-e||_F
     *
     * The computation borrows from Markley's FOAM algorithm
-    * "Attitude Determination Using Vector Observations: A Fast Optimal Matrix Algorithm", J. Astronaut. Sci.
+    * "Attitude Determination Using Vector Observations: A Fast Optimal Matrix Algorithm", J. Astronaut. Sci. 1993.
     *
     * See also M. Lourakis: "An Efficient Solution to Absolute Orientation", ICPR 2016
     *
@@ -628,24 +648,32 @@ void PoseSolver::computeRowAndNullspace(const cv::Matx<double, 9, 1>& r,
     *  Institute of Computer Science, Foundation for Research & Technology - Hellas
     *  Heraklion, Crete, Greece.
     */
-void PoseSolver::nearestRotationMatrix(const cv::Matx<double, 9, 1>& e,
+void PoseSolver::nearestRotationMatrixFOAM(const cv::Matx<double, 9, 1>& e,
     cv::Matx<double, 9, 1>& r)
 {
     int i;
     double l, lprev, det_e, e_sq, adj_e_sq, adj_e[9];
 
+    // det(e)
+    det_e = e(0) * e(4) * e(8) - e(0) * e(5) * e(7) - e(1) * e(3) * e(8) + e(2) * e(3) * e(7) + e(1) * e(6) * e(5) - e(2) * e(6) * e(4);
+    if (fabs(det_e) < 1E-04) { // singular, handle it with SVD
+        PoseSolver::nearestRotationMatrixSVD(e, r);
+        return;
+    }
+
     // e's adjoint
     adj_e[0] = e(4) * e(8) - e(5) * e(7); adj_e[1] = e(2) * e(7) - e(1) * e(8); adj_e[2] = e(1) * e(5) - e(2) * e(4);
     adj_e[3] = e(5) * e(6) - e(3) * e(8); adj_e[4] = e(0) * e(8) - e(2) * e(6); adj_e[5] = e(2) * e(3) - e(0) * e(5);
     adj_e[6] = e(3) * e(7) - e(4) * e(6); adj_e[7] = e(1) * e(6) - e(0) * e(7); adj_e[8] = e(0) * e(4) - e(1) * e(3);
 
-    // det(e), ||e||^2, ||adj(e)||^2
-    det_e = e(0) * e(4) * e(8) - e(0) * e(5) * e(7) - e(1) * e(3) * e(8) + e(2) * e(3) * e(7) + e(1) * e(6) * e(5) - e(2) * e(6) * e(4);
+    // ||e||^2, ||adj(e)||^2
     e_sq = e(0) * e(0) + e(1) * e(1) + e(2) * e(2) + e(3) * e(3) + e(4) * e(4) + e(5) * e(5) + e(6) * e(6) + e(7) * e(7) + e(8) * e(8);
     adj_e_sq = adj_e[0] * adj_e[0] + adj_e[1] * adj_e[1] + adj_e[2] * adj_e[2] + adj_e[3] * adj_e[3] + adj_e[4] * adj_e[4] + adj_e[5] * adj_e[5] + adj_e[6] * adj_e[6] + adj_e[7] * adj_e[7] + adj_e[8] * adj_e[8];
 
     // compute l_max with Newton-Raphson from FOAM's characteristic polynomial, i.e. eq.(23) - (26)
-    for (i = 200, l = 2.0, lprev = 0.0; fabs(l - lprev) > 1E-12 * fabs(lprev) && i > 0; --i) {
+    l = 0.5*(e_sq + 3.0); // 1/2*(trace(mat(e)*mat(e)') + trace(eye(3)))
+    if (det_e < 0.0) l = -l;
+    for (i = 15, lprev = 0.0; fabs(l - lprev) > 1E-12 * fabs(lprev) && i > 0; --i) {
         double tmp, p, pp;
 
         tmp = (l * l - e_sq);
@@ -719,9 +747,31 @@ inline bool PoseSolver::positiveDepth(const SQPSolution& solution) const
     return (r(6) * mean(0) + r(7) * mean(1) + r(8) * mean(2) + t(2) > 0);
 }
 
-void PoseSolver::checkSolution(SQPSolution& solution, double& min_error)
+inline bool PoseSolver::positiveMajorityDepths(const SQPSolution& solution, InputArray objectPoints) const
+{
+    const cv::Matx<double, 9, 1>& r = solution.r_hat;
+    const cv::Matx<double, 3, 1>& t = solution.t;
+    int npos = 0, nneg = 0;
+
+    Mat _objectPoints = objectPoints.getMat();
+
+    int n = _objectPoints.cols * _objectPoints.rows;
+
+    for (int i = 0; i < n; i++)
+    {
+        const cv::Point3d& obj_pt = _objectPoints.at<cv::Point3d>(i);
+        if (r(6) * obj_pt.x + r(7) * obj_pt.y + r(8) * obj_pt.z + t(2) > 0) ++npos;
+        else ++nneg;
+    }
+
+    return npos >= nneg;
+}
+
+
+void PoseSolver::checkSolution(SQPSolution& solution, InputArray objectPoints, double& min_error)
 {
-    if (positiveDepth(solution))
+    bool cheirok = positiveDepth(solution) || positiveMajorityDepths(solution, objectPoints); // check the majority if the check with centroid fails
+    if (cheirok)
     {
         solution.sq_error = (omega_ * solution.r_hat).ddot(solution.r_hat);
         if (fabs(min_error - solution.sq_error) > EQUAL_SQUARED_ERRORS_DIFF)
diff --git a/modules/calib3d/src/sqpnp.hpp b/modules/calib3d/src/sqpnp.hpp
index 97c10e34e7..078c07e906 100644
--- a/modules/calib3d/src/sqpnp.hpp
+++ b/modules/calib3d/src/sqpnp.hpp
@@ -85,13 +85,14 @@ private:
 
     /*
     * @brief                Computes the 9x9 PSD Omega matrix and supporting matrices.
+    * @param objectPoints   The 3D points in object coordinates.
     */
-    void solveInternal();
+    void solveInternal(InputArray objectPoints);
 
     /*
     * @brief                Produces the distance from being orthogonal for a given 3x3 matrix
-    *                       in row-major form.
-    * @param e              The vector to test representing a 3x3 matrix in row major form.
+    *                       in row-major order.
+    * @param e              The vector to test representing a 3x3 matrix in row-major order.
     * @return               The distance the matrix is from being orthogonal.
     */
     static double orthogonalityError(const cv::Matx<double, 9, 1>& e);
@@ -99,31 +100,49 @@ private:
     /*
     * @brief                Processes a solution and sorts it by error.
     * @param solution       The solution to evaluate.
-    * @param min_error          The current minimum error.
+    * @param objectPoints   The 3D points in object coordinates.
+    * @param min_error      The current minimum error.
     */
-    void checkSolution(SQPSolution& solution, double& min_error);
+    void checkSolution(SQPSolution& solution, InputArray objectPoints, double& min_error);
 
     /*
-    * @brief                Computes the determinant of a matrix stored in row-major format.
-    * @param e              Vector representing a 3x3 matrix stored in row-major format.
+    * @brief                Computes the determinant of a matrix stored in row-major order.
+    * @param e              Vector representing a 3x3 matrix stored in row-major order.
     * @return               The determinant of the matrix.
     */
     static double det3x3(const cv::Matx<double, 9, 1>& e);
 
     /*
-    * @brief                Tests the cheirality for a given solution.
+    * @brief                Tests the cheirality on the mean object point for a given solution.
     * @param solution       The solution to evaluate.
     */
     inline bool positiveDepth(const SQPSolution& solution) const;
 
     /*
-    * @brief                Determines the nearest rotation matrix to a given rotaiton matrix.
-    *                       Input and output are 9x1 vector representing a vector stored in row-major
-    *                       form.
-    * @param e              The input 3x3 matrix stored in a vector in row-major form.
-    * @param r              The nearest rotation matrix to the input e (again in row-major form).
+    * @brief                Tests the cheirality on all object points for a given solution.
+    * @param solution       The solution to evaluate.
+    * @param objectPoints   The 3D points in object coordinates.
+    */
+    inline bool positiveMajorityDepths(const SQPSolution& solution, InputArray objectPoints) const;
+
+    /*
+    * @brief                Determines the nearest rotation matrix to a given rotation matrix using SVD.
+    *                       Input and output are 9x1 vector representing a matrix stored in row-major
+    *                       order.
+    * @param e              The input 3x3 matrix stored in a vector in row-major order.
+    * @param r              The nearest rotation matrix to the input e (again in row-major order).
+    */
+    static void nearestRotationMatrixSVD(const cv::Matx<double, 9, 1>& e,
+        cv::Matx<double, 9, 1>& r);
+
+    /*
+    * @brief                Determines the nearest rotation matrix to a given rotation matrix using the FOAM algorithm.
+    *                       Input and output are 9x1 vector representing a matrix stored in row-major
+    *                       order.
+    * @param e              The input 3x3 matrix stored in a vector in row-major order.
+    * @param r              The nearest rotation matrix to the input e (again in row-major order).
     */
-    static void nearestRotationMatrix(const cv::Matx<double, 9, 1>& e,
+    static void nearestRotationMatrixFOAM(const cv::Matx<double, 9, 1>& e,
         cv::Matx<double, 9, 1>& r);
 
     /*

From 1099b4c881c458e302698a272f4f16dd46cac414 Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <andrey.senyaev@gmail.com>
Date: Fri, 13 May 2022 18:14:45 +0300
Subject: [PATCH 036/178] Move environment variables on a host side (GHA
 Windows)

---
 .github/workflows/PR-3.4-W10.yaml | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/PR-3.4-W10.yaml b/.github/workflows/PR-3.4-W10.yaml
index 2cfed6a32e..528cd596d9 100644
--- a/.github/workflows/PR-3.4-W10.yaml
+++ b/.github/workflows/PR-3.4-W10.yaml
@@ -6,14 +6,13 @@ on:
       - 3.4
 
 env:
-  EXTRA_CMAKE_OPTIONS: '-DCL_Z_OPTION=/Z7 -DOPENCV_DOWNLOAD_PATH=c:\Slave\workspace\binaries_cache -DBUILD_EXAMPLES=ON -DOPENCV_ENABLE_NONFREE=ON -DCMAKE_BUILD_TYPE=Release'
+  EXTRA_CMAKE_OPTIONS: '-DCL_Z_OPTION=/Z7 -DOPENCV_DOWNLOAD_PATH=%BINARIES_CACHE% -DBUILD_EXAMPLES=ON -DOPENCV_ENABLE_NONFREE=ON -DCMAKE_BUILD_TYPE=Release'
   OPENCV_TEST_DATA_PATH: ${{ github.workspace }}\opencv_extra\testdata
   PR_AUTHOR: ${{ github.event.pull_request.user.login }}
   PR_AUTHOR_FORK: ${{ github.event.pull_request.head.repo.full_name }}
   SOURCE_BRANCH_NAME: ${{ github.head_ref }}
   TARGET_BRANCH_NAME: ${{ github.base_ref }}
   GTEST_FILTER_STRING: '-Samples.findFile'
-  GIT_CACHE: c:\Slave\git_cache
 
 jobs:
   BuildAndTest:
@@ -30,7 +29,7 @@ jobs:
     - name: Clean
       run: cd ${{ github.workspace }} && rm -rf *
     - name: Fetch opencv
-      run: cd ${{ github.workspace }} && git clone --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE }}\opencv.git git@github.com:opencv/opencv.git
+      run: cd ${{ github.workspace }} && git clone --branch ${{ env.TARGET_BRANCH_NAME }} --reference %GIT_CACHE%\opencv.git git@github.com:opencv/opencv.git
     - name: Merge opencv with ${{ env.SOURCE_BRANCH_NAME }} branch
       run: |
         cd ${{ github.workspace }}\opencv
@@ -38,7 +37,7 @@ jobs:
         git config user.name "opencv.ci"
         git pull -v "git@github.com:${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
     - name: Fetch opencv_extra
-      run: cd ${{ github.workspace }} && git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE }}\opencv_extra.git git@github.com:opencv/opencv_extra.git
+      run: cd ${{ github.workspace }} && git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --reference %GIT_CACHE%\opencv_extra.git git@github.com:opencv/opencv_extra.git
     - name: Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch
       shell: bash
       run: |
@@ -145,7 +144,7 @@ jobs:
     - name: Clean
       run: cd ${{ github.workspace }} && rm -rf *
     - name: Fetch opencv
-      run: cd ${{ github.workspace }} && git clone --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE }}\opencv.git git@github.com:opencv/opencv.git
+      run: cd ${{ github.workspace }} && git clone --branch ${{ env.TARGET_BRANCH_NAME }} --reference %GIT_CACHE%\opencv.git git@github.com:opencv/opencv.git
     - name: Merge opencv with ${{ env.SOURCE_BRANCH_NAME }} branch
       run: |
         cd ${{ github.workspace }}\opencv
@@ -153,7 +152,7 @@ jobs:
         git config user.name "opencv.ci"
         git pull -v "git@github.com:${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
     - name: Fetch opencv_contrib
-      run: cd ${{ github.workspace }} && git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE }}\opencv_contrib.git --depth 1 git@github.com:opencv/opencv_contrib.git
+      run: cd ${{ github.workspace }} && git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --reference %GIT_CACHE%\opencv_contrib.git --depth 1 git@github.com:opencv/opencv_contrib.git
     - name: Merge opencv_contrib with ${{ env.SOURCE_BRANCH_NAME }} branch
       shell: bash
       run: |

From 07ad6a437e809c4275b8da5c992c20307e2b1ea0 Mon Sep 17 00:00:00 2001
From: fengyuentau <yuantao.feng@opencv.org.cn>
Date: Fri, 29 Apr 2022 11:04:06 +0800
Subject: [PATCH 037/178] add ARM64 workflow for 4.x

add opencv_contrib fork check

use image hosted on quay.io
---
 .github/workflows/PR-4.x-ARM64.yaml | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/PR-4.x-ARM64.yaml b/.github/workflows/PR-4.x-ARM64.yaml
index b2f5de1025..72af61f412 100644
--- a/.github/workflows/PR-4.x-ARM64.yaml
+++ b/.github/workflows/PR-4.x-ARM64.yaml
@@ -1,7 +1,9 @@
 name: PR:4.x ARM64
 
-# TODO: enable pipeline after 4.x update
-on: workflow_dispatch
+on:
+  pull_request:
+    branches:
+      - 4.x
 
 env:
   EXTRA_CMAKE_OPTIONS: '-DBUILD_DOCS=ON -DPYTHON_DEFAULT_EXECUTABLE=/usr/bin/python3 -DBUILD_opencv_xfeatures2d=OFF -DBUILD_EXAMPLES=ON -DOPENCV_ENABLE_NONFREE=ON -DENABLE_CCACHE=OFF'
@@ -21,7 +23,7 @@ jobs:
       run:
         shell: bash
     container:
-      image: docker.io/yuentau/ocv_ubuntu:20.04-arm64
+      image: quay.io/opencv-ci/opencv-ubuntu:20.04-arm64
     steps:
     - name: PR info
       run: |
@@ -87,18 +89,12 @@ jobs:
       run: cd /opencv-build && xvfb-run -a bin/opencv_test_objdetect --gtest_filter="-Objdetect_QRCode_Close.regression/0:Objdetect_QRCode_Close.regression/4"
     - name: Accuracy:photo
       run: cd /opencv-build && xvfb-run -a bin/opencv_test_photo --gtest_filter="-Photo_CalibrateDebevec.regression"
-    - name: Accuracy:shape
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_shape
     - name: Accuracy:stitching
       run: cd /opencv-build && xvfb-run -a bin/opencv_test_stitching
-    - name: Accuracy:superres
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_superres
     - name: Accuracy:video
       run: cd /opencv-build && xvfb-run -a bin/opencv_test_video
     - name: Accuracy:videoio
       run: cd /opencv-build && xvfb-run -a bin/opencv_test_videoio
-    - name: Accuracy:videostab
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_videostab
     - name: Performance:calib3d
       run: cd /opencv-build && xvfb-run -a bin/opencv_perf_calib3d --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
     - name: Performance:core
@@ -117,8 +113,6 @@ jobs:
       run: cd /opencv-build && xvfb-run -a bin/opencv_perf_photo --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
     - name: Performance:stitching
       run: cd /opencv-build && xvfb-run -a bin/opencv_perf_stitching --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:superres
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_superres --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
     - name: Performance:video
       run: cd /opencv-build && xvfb-run -a bin/opencv_perf_video --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
     - name: Performance:videoio

From 2aac0a5a26fa8f63cba8f23d01690791006fa7fb Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Sun, 15 May 2022 09:35:38 +0000
Subject: [PATCH 038/178] GHA: fix git merge

---
 .github/workflows/PR-3.4-W10.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/PR-3.4-W10.yaml b/.github/workflows/PR-3.4-W10.yaml
index 528cd596d9..b8068cd05f 100644
--- a/.github/workflows/PR-3.4-W10.yaml
+++ b/.github/workflows/PR-3.4-W10.yaml
@@ -44,7 +44,7 @@ jobs:
         OPENCV_EXTRA_FORK=$(git ls-remote --heads "git@github.com:/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}") || true
         if [[ ! -z "$OPENCV_EXTRA_FORK" ]]; then
           echo "Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch"
-          cd opencv_extra
+          cd ${{ github.workspace }}\opencv_extra
           git config user.email "opencv.ci"
           git config user.name "opencv.ci"
           git pull -v "git@github.com:${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}"
@@ -159,7 +159,7 @@ jobs:
         OPENCV_CONTRIB_FORK=$(git ls-remote --heads "git@github.com:${{ env.PR_AUTHOR }}/opencv_contrib" "${{ env.SOURCE_BRANCH_NAME }}") || true
         if [[ ! -z "$OPENCV_CONTRIB_FORK" ]]; then
           echo "Merge opencv_contrib with ${{ env.SOURCE_BRANCH_NAME }} branch"
-          cd /opencv_contrib
+          cd ${{ github.workspace }}\opencv_contrib
           git config user.email "opencv.ci"
           git config user.name "opencv.ci"
           git pull -v "git@github.com:${{ env.PR_AUTHOR }}/opencv_contrib" "${{ env.SOURCE_BRANCH_NAME }}"

From 22a00036e298daf3623ac6a7bc4d971d6058795e Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Sun, 15 May 2022 09:41:25 +0000
Subject: [PATCH 039/178] gapi(test): reduce used amount of memory

---
 modules/gapi/test/rmat/rmat_view_tests.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/gapi/test/rmat/rmat_view_tests.cpp b/modules/gapi/test/rmat/rmat_view_tests.cpp
index d829b6c655..f7458e8d94 100644
--- a/modules/gapi/test/rmat/rmat_view_tests.cpp
+++ b/modules/gapi/test/rmat/rmat_view_tests.cpp
@@ -132,7 +132,7 @@ TEST_P(RMatViewNDTest, StepFromView) {
 
 INSTANTIATE_TEST_CASE_P(Test, RMatViewNDTest,
                         Combine(Values(CV_8U, CV_32F), // depth
-                                Values(1,2,3,4,7)));   // ndims
+                                Values(1,2,3,4,5)));   // ndims
 
 struct RMatViewNDTestNegative : public TestWithParam<
     std::tuple<int /*depth*/, int /*chan*/, int /*ndims*/>>{};
@@ -153,7 +153,7 @@ TEST_P(RMatViewNDTestNegative, DefaultStep) {
 INSTANTIATE_TEST_CASE_P(Test, RMatViewNDTestNegative,
                         Combine(Values(CV_8U, CV_32F), // depth
                                 Values(1,2,3,4),       // chan
-                                Values(2,4,7)));       // ndims
+                                Values(2,4,5)));       // ndims
 
 TEST_P(RMatViewTest, NonDefaultStepInput) {
     auto type = GetParam();

From 3f4ffe7844cead4e406bfb0067e9dae2ff9247f3 Mon Sep 17 00:00:00 2001
From: dbudnikov <103500503+dbudnikov@users.noreply.github.com>
Date: Sun, 15 May 2022 13:07:01 +0300
Subject: [PATCH 040/178] Merge pull request #21909 from
 dbudnikov:dbudnikov/reduce_gapi_tests_number

Reduce G-API tests number

* reduce number of G-API tests

* reduce number of G-API tests

* Address more comments from Dmitry.
---
 modules/gapi/test/cpu/gapi_core_tests_cpu.cpp | 190 +++++-------------
 .../gapi/test/cpu/gapi_core_tests_fluid.cpp   | 167 ++++-----------
 .../gapi/test/cpu/gapi_imgproc_tests_cpu.cpp  | 153 +++++---------
 .../test/cpu/gapi_imgproc_tests_fluid.cpp     |  70 ++-----
 .../test/cpu/gapi_operators_tests_cpu.cpp     |  31 +--
 .../test/cpu/gapi_operators_tests_fluid.cpp   |  31 +--
 modules/gapi/test/gpu/gapi_core_tests_gpu.cpp | 169 ++++------------
 .../gapi/test/gpu/gapi_imgproc_tests_gpu.cpp  |  88 +++-----
 .../test/gpu/gapi_operators_tests_gpu.cpp     |  21 +-
 9 files changed, 257 insertions(+), 663 deletions(-)

diff --git a/modules/gapi/test/cpu/gapi_core_tests_cpu.cpp b/modules/gapi/test/cpu/gapi_core_tests_cpu.cpp
index 59b9eba4cb..d8bf970bb8 100644
--- a/modules/gapi/test/cpu/gapi_core_tests_cpu.cpp
+++ b/modules/gapi/test/cpu/gapi_core_tests_cpu.cpp
@@ -12,6 +12,7 @@
 namespace
 {
 #define CORE_CPU [] () { return cv::compile_args(cv::gapi::use_only{cv::gapi::core::cpu::kernels()}); }
+    const std::vector <cv::Size> in_sizes{ cv::Size(1280, 720), cv::Size(128, 128) };
 }  // anonymous namespace
 
 namespace opencv_test
@@ -20,9 +21,7 @@ namespace opencv_test
 // FIXME: Wut? See MulTestCPU/MathOpTest below (duplicate?)
 INSTANTIATE_TEST_CASE_P(AddTestCPU, MathOpTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values( -1, CV_8U, CV_16U, CV_32F ),
                                 Values(CORE_CPU),
                                 Values(ADD, MUL),
@@ -32,9 +31,7 @@ INSTANTIATE_TEST_CASE_P(AddTestCPU, MathOpTest,
 
 INSTANTIATE_TEST_CASE_P(MulTestCPU, MathOpTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values( -1, CV_8U, CV_16U, CV_32F ),
                                 Values(CORE_CPU),
                                 Values(MUL),
@@ -44,9 +41,7 @@ INSTANTIATE_TEST_CASE_P(MulTestCPU, MathOpTest,
 
 INSTANTIATE_TEST_CASE_P(SubTestCPU, MathOpTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values( -1, CV_8U, CV_16U, CV_32F ),
                                 Values(CORE_CPU),
                                 Values(SUB),
@@ -56,9 +51,7 @@ INSTANTIATE_TEST_CASE_P(SubTestCPU, MathOpTest,
 
 INSTANTIATE_TEST_CASE_P(DivTestCPU, MathOpTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values( -1, CV_8U, CV_16U, CV_32F ),
                                 Values(CORE_CPU),
                                 Values(DIV),
@@ -68,90 +61,68 @@ INSTANTIATE_TEST_CASE_P(DivTestCPU, MathOpTest,
 
 INSTANTIATE_TEST_CASE_P(MulTestCPU, MulDoubleTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values( -1, CV_8U, CV_16U, CV_32F ),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(DivTestCPU, DivTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values( -1, CV_8U, CV_16U, CV_32F ),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(DivCTestCPU, DivCTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values( -1, CV_8U, CV_16U, CV_32F ),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(MeanTestCPU, MeanTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(MaskTestCPU, MaskTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(SelectTestCPU, SelectTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(Polar2CartCPU, Polar2CartTest,
                         Combine(Values(CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_32FC1),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(Cart2PolarCPU, Cart2PolarTest,
                         Combine(Values(CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_32FC1),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(PhaseCPU, PhaseTest,
                         Combine(Values(CV_32F, CV_32FC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU),
          /* angle_in_degrees */ testing::Bool()));
 
 INSTANTIATE_TEST_CASE_P(SqrtCPU, SqrtTest,
                         Combine(Values(CV_32F, CV_32FC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(CompareTestCPU, CmpTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_8U),
                                 Values(CORE_CPU),
                                 Values(CMP_EQ, CMP_GE, CMP_NE, CMP_GT, CMP_LT, CMP_LE),
@@ -160,9 +131,7 @@ INSTANTIATE_TEST_CASE_P(CompareTestCPU, CmpTest,
 
 INSTANTIATE_TEST_CASE_P(BitwiseTestCPU, BitwiseTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU),
                                 Values(AND, OR, XOR),
@@ -170,33 +139,25 @@ INSTANTIATE_TEST_CASE_P(BitwiseTestCPU, BitwiseTest,
 
 INSTANTIATE_TEST_CASE_P(BitwiseNotTestCPU, NotTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(MinTestCPU, MinTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(MaxTestCPU, MaxTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(SumTestCPU, SumTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 //Values(1e-5),
                                 Values(CORE_CPU),
@@ -204,43 +165,33 @@ INSTANTIATE_TEST_CASE_P(SumTestCPU, SumTest,
 
 INSTANTIATE_TEST_CASE_P(CountNonZeroTestCPU, CountNonZeroTest,
                         Combine(Values( CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU),
                                 Values(AbsToleranceScalar(1e-5).to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(AbsDiffTestCPU, AbsDiffTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(AbsDiffCTestCPU, AbsDiffCTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(AddWeightedTestCPU, AddWeightedTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values( -1, CV_8U, CV_16U, CV_32F ),
                                 Values(CORE_CPU),
                                 Values(Tolerance_FloatRel_IntAbs(1e-6, 1).to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(NormTestCPU, NormTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU),
                                 Values(AbsToleranceScalar(1e-5).to_compare_obj()),
@@ -248,17 +199,13 @@ INSTANTIATE_TEST_CASE_P(NormTestCPU, NormTest,
 
 INSTANTIATE_TEST_CASE_P(IntegralTestCPU, IntegralTest,
                         Combine(Values( CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(ThresholdTestCPU, ThresholdTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU),
                                 Values(cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC,
@@ -269,9 +216,7 @@ INSTANTIATE_TEST_CASE_P(ThresholdTestCPU, ThresholdTest,
 
 INSTANTIATE_TEST_CASE_P(ThresholdTestCPU, ThresholdOTTest,
                         Combine(Values(CV_8UC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU),
                                 Values(cv::THRESH_OTSU, cv::THRESH_TRIANGLE)));
@@ -279,99 +224,75 @@ INSTANTIATE_TEST_CASE_P(ThresholdTestCPU, ThresholdOTTest,
 
 INSTANTIATE_TEST_CASE_P(InRangeTestCPU, InRangeTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(Split3TestCPU, Split3Test,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_8UC1),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(Split4TestCPU, Split4Test,
                         Combine(Values(CV_8UC4),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_8UC1),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(Merge3TestCPU, Merge3Test,
                         Combine(Values(CV_8UC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_8UC3),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(Merge4TestCPU, Merge4Test,
                         Combine(Values(CV_8UC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_8UC4),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(RemapTestCPU, RemapTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(FlipTestCPU, FlipTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU),
                                 Values(0,1,-1)));
 
 INSTANTIATE_TEST_CASE_P(CropTestCPU, CropTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU),
                                 Values(cv::Rect(10, 8, 20, 35), cv::Rect(4, 10, 37, 50))));
 
 INSTANTIATE_TEST_CASE_P(CopyTestCPU, CopyTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(LUTTestCPU, LUTTest,
                         Combine(Values(CV_8UC1, CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_8UC1),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(LUTTestCustomCPU, LUTTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_8UC3),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(ConvertToCPU, ConvertToTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_8U, CV_16U, CV_16S, CV_32F),
                                 Values(CORE_CPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -380,40 +301,31 @@ INSTANTIATE_TEST_CASE_P(ConvertToCPU, ConvertToTest,
 
 INSTANTIATE_TEST_CASE_P(ConcatHorTestCPU, ConcatHorTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(ConcatVertTestCPU, ConcatVertTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(ConcatVertVecTestCPU, ConcatVertVecTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(ConcatHorVecTestCPU, ConcatHorVecTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU)));
 
 INSTANTIATE_TEST_CASE_P(WarpPerspectiveTestCPU, WarpPerspectiveTest,
                         Combine(Values(CV_8UC1, CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(CORE_CPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -425,8 +337,7 @@ INSTANTIATE_TEST_CASE_P(WarpPerspectiveTestCPU, WarpPerspectiveTest,
 
 INSTANTIATE_TEST_CASE_P(WarpAffineTestCPU, WarpAffineTest,
                         Combine(Values(CV_8UC1, CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(CORE_CPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -438,8 +349,7 @@ INSTANTIATE_TEST_CASE_P(WarpAffineTestCPU, WarpAffineTest,
 
 INSTANTIATE_TEST_CASE_P(NormalizeTestCPU, NormalizeTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(CORE_CPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -519,9 +429,7 @@ INSTANTIATE_TEST_CASE_P(TransposeTestCPU, TransposeTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1,
                                        CV_8UC2, CV_16UC2, CV_16SC2, CV_32FC2,
                                        CV_8UC3, CV_16UC3, CV_16SC3, CV_32FC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU),
                                 Values(AbsExact().to_compare_obj())));
diff --git a/modules/gapi/test/cpu/gapi_core_tests_fluid.cpp b/modules/gapi/test/cpu/gapi_core_tests_fluid.cpp
index f273da5bdb..b61ff95147 100644
--- a/modules/gapi/test/cpu/gapi_core_tests_fluid.cpp
+++ b/modules/gapi/test/cpu/gapi_core_tests_fluid.cpp
@@ -11,6 +11,7 @@
 namespace
 {
 #define CORE_FLUID [] () { return cv::compile_args(cv::gapi::use_only{cv::gapi::core::fluid::kernels()}); }
+    const std::vector <cv::Size> in_sizes{ cv::Size(1280, 720), cv::Size(128, 128) };
 }  // anonymous namespace
 
 namespace opencv_test
@@ -18,10 +19,7 @@ namespace opencv_test
 // FIXME: Windows accuracy problems after recent update!
 INSTANTIATE_TEST_CASE_P(MathOpTestFluid, MathOpTest,
                         Combine(Values(CV_8UC3, CV_8UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1920, 1080),
-                                       cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1, CV_8U, CV_32F),
                                 Values(CORE_FLUID),
                                 Values(DIV, MUL),
@@ -37,10 +35,7 @@ INSTANTIATE_TEST_CASE_P(MathOpTestFluid, MathOpTest,
 //        Github ticket: https://github.com/opencv/opencv/issues/18373.
 INSTANTIATE_TEST_CASE_P(DISABLED_MathOpTestFluid, MathOpTest,
                         Combine(Values(CV_8UC3, CV_8UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1920, 1080),
-                                       cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1, CV_8U, CV_32F),
                                 Values(CORE_FLUID),
                                 Values(ADD, SUB),
@@ -54,9 +49,7 @@ INSTANTIATE_TEST_CASE_P(DISABLED_MathOpTestFluid, MathOpTest,
 //        Github ticket: https://github.com/opencv/opencv/issues/18373.
 INSTANTIATE_TEST_CASE_P(DISABLED_SubTestFluid, MathOpTest,
                         Combine(Values(CV_8UC1, CV_16SC1 , CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1, CV_8U, CV_32F),
                                 Values(CORE_FLUID),
                                 Values(SUB),
@@ -66,41 +59,31 @@ INSTANTIATE_TEST_CASE_P(DISABLED_SubTestFluid, MathOpTest,
 
 INSTANTIATE_TEST_CASE_P(MulSTestFluid, MulDoubleTest,
                         Combine(Values(CV_8UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1), // FIXME: extend with more types
                                 Values(CORE_FLUID)));
 
 INSTANTIATE_TEST_CASE_P(DivCTestFluid, DivCTest,
                         Combine(Values(CV_8UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_8U, CV_32F),
                                 Values(CORE_FLUID)));
 
 INSTANTIATE_TEST_CASE_P(DISABLED_MeanTestFluid, MeanTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID)));
 
 INSTANTIATE_TEST_CASE_P(MaskTestFluid, MaskTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID)));
 
 INSTANTIATE_TEST_CASE_P(AbsDiffTestFluid, AbsDiffTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID)));
 
@@ -108,18 +91,13 @@ INSTANTIATE_TEST_CASE_P(AbsDiffCTestFluid, AbsDiffCTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_8UC2,
                                        CV_16UC2, CV_16SC2, CV_8UC3, CV_16UC3,
                                        CV_16SC3, CV_8UC4, CV_16UC4, CV_16SC4),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID)));
 
 INSTANTIATE_TEST_CASE_P(BitwiseTestFluid, BitwiseTest,
                         Combine(Values(CV_8UC3, CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1920, 1080),
-                                       cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID),
                                 Values(AND, OR, XOR),
@@ -127,46 +105,32 @@ INSTANTIATE_TEST_CASE_P(BitwiseTestFluid, BitwiseTest,
 
 INSTANTIATE_TEST_CASE_P(BitwiseNotTestFluid, NotTest,
                         Combine(Values(CV_8UC3, CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1920, 1080),
-                                       cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID)));
 
 INSTANTIATE_TEST_CASE_P(MinTestFluid, MinTest,
                         Combine(Values(CV_8UC3, CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1920, 1080),
-                                       cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID)));
 
 INSTANTIATE_TEST_CASE_P(MaxTestFluid, MaxTest,
                         Combine(Values(CV_8UC3, CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1920, 1080),
-                                       cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID)));
 
 INSTANTIATE_TEST_CASE_P(DISABLED_SumTestFluid, SumTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID),
                                 Values(AbsToleranceScalar(1e-5).to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(CompareTestFluid, CmpTest,
                         Combine(Values(CV_8UC3, CV_8UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1920, 1080),
-                                       cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_8U),
                                 Values(CORE_FLUID),
                                 Values(CMP_EQ, CMP_GE, CMP_NE, CMP_GT, CMP_LT, CMP_LE),
@@ -176,10 +140,7 @@ INSTANTIATE_TEST_CASE_P(CompareTestFluid, CmpTest,
 // FIXME: solve comparison error to unite with the test above
 INSTANTIATE_TEST_CASE_P(CompareTestFluidScalar, CmpTest,
                         Combine(Values(CV_8UC3, CV_8UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1920, 1080),
-                                       cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_8U),
                                 Values(CORE_FLUID),
                                 Values(CMP_EQ, CMP_GE, CMP_NE, CMP_GT, CMP_LT, CMP_LE),
@@ -188,18 +149,14 @@ INSTANTIATE_TEST_CASE_P(CompareTestFluidScalar, CmpTest,
 
 INSTANTIATE_TEST_CASE_P(AddWeightedTestFluid, AddWeightedTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1, CV_8U, CV_32F),
                                 Values(CORE_FLUID),
                                 Values(Tolerance_FloatRel_IntAbs(1e-5, 2).to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(DISABLED_NormTestFluid, NormTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID),
                                 Values(AbsToleranceScalar(1e-5).to_compare_obj()),
@@ -207,26 +164,19 @@ INSTANTIATE_TEST_CASE_P(DISABLED_NormTestFluid, NormTest,
 
 INSTANTIATE_TEST_CASE_P(DISABLED_IntegralTestFluid, IntegralTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID)));
 
 INSTANTIATE_TEST_CASE_P(LUTTestFluid, LUTTest,
                         Combine(Values(CV_8UC1, CV_8UC3),
-                                Values(cv::Size(1920, 1080),
-                                       cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_8UC1),
                                 Values(CORE_FLUID)));
 
 INSTANTIATE_TEST_CASE_P(ConvertToFluid, ConvertToTest,
                         Combine(Values(CV_8UC3, CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_8U, CV_16U, CV_16S, CV_32F),
                                 Values(CORE_FLUID),
                                 Values(Tolerance_FloatRel_IntAbs(1e-5, 2).to_compare_obj()),
@@ -235,128 +185,94 @@ INSTANTIATE_TEST_CASE_P(ConvertToFluid, ConvertToTest,
 
 INSTANTIATE_TEST_CASE_P(DISABLED_ConcatHorTestFluid, ConcatHorTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID)));
 
 INSTANTIATE_TEST_CASE_P(DISABLED_ConcatVertTestFluid, ConcatVertTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID)));
 
 INSTANTIATE_TEST_CASE_P(Split3TestFluid, Split3Test,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_8UC1),
                                 Values(CORE_FLUID)));
 
 INSTANTIATE_TEST_CASE_P(Split4TestFluid, Split4Test,
                         Combine(Values(CV_8UC4),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_8UC1),
                                 Values(CORE_FLUID)));
 
 INSTANTIATE_TEST_CASE_P(Merge3TestFluid, Merge3Test,
                         Combine(Values(CV_8UC1),
-                                Values(cv::Size(1920, 1080),
-                                       cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_8UC3),
                                 Values(CORE_FLUID)));
 
 INSTANTIATE_TEST_CASE_P(Merge4TestFluid, Merge4Test,
                         Combine(Values(CV_8UC1),
-                                Values(cv::Size(1920, 1080),
-                                       cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_8UC4),
                                 Values(CORE_FLUID)));
 
 INSTANTIATE_TEST_CASE_P(DISABLED_RemapTestFluid, RemapTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID)));
 
 INSTANTIATE_TEST_CASE_P(DISABLED_FlipTestFluid, FlipTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID),
                                 Values(0,1,-1)));
 
 INSTANTIATE_TEST_CASE_P(DISABLED_CropTestFluid, CropTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID),
                                 Values(cv::Rect(10, 8, 20, 35), cv::Rect(4, 10, 37, 50))));
 
 INSTANTIATE_TEST_CASE_P(SelectTestFluid, SelectTest,
                         Combine(Values(CV_8UC3, CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1920, 1080),
-                                       cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID)));
 
 INSTANTIATE_TEST_CASE_P(Polar2CartFluid, Polar2CartTest,
                         Combine(Values(CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_32FC1),
                                 Values(CORE_FLUID)));
 
 INSTANTIATE_TEST_CASE_P(Cart2PolarFluid, Cart2PolarTest,
                         Combine(Values(CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_32FC1),
                                 Values(CORE_FLUID)));
 
 INSTANTIATE_TEST_CASE_P(PhaseFluid, PhaseTest,
                         Combine(Values(CV_32F, CV_32FC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID),
          /* angle_in_degrees */ testing::Bool()));
 
 INSTANTIATE_TEST_CASE_P(SqrtFluid, SqrtTest,
                         Combine(Values(CV_32F, CV_32FC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID)));
 
 INSTANTIATE_TEST_CASE_P(ThresholdTestFluid, ThresholdTest,
                         Combine(Values(CV_8UC3, CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1920, 1080),
-                                       cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID),
                                 Values(cv::THRESH_BINARY, cv::THRESH_BINARY_INV,
@@ -368,19 +284,14 @@ INSTANTIATE_TEST_CASE_P(ThresholdTestFluid, ThresholdTest,
 
 INSTANTIATE_TEST_CASE_P(DISABLED_ThresholdTestFluid, ThresholdOTTest,
                         Combine(Values(CV_8UC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID),
                                 Values(cv::THRESH_OTSU, cv::THRESH_TRIANGLE)));
 
 INSTANTIATE_TEST_CASE_P(InRangeTestFluid, InRangeTest,
                         Combine(Values(CV_8UC3, CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1920, 1080),
-                                       cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID)));
 
diff --git a/modules/gapi/test/cpu/gapi_imgproc_tests_cpu.cpp b/modules/gapi/test/cpu/gapi_imgproc_tests_cpu.cpp
index 1fa9231d19..93df74e98f 100644
--- a/modules/gapi/test/cpu/gapi_imgproc_tests_cpu.cpp
+++ b/modules/gapi/test/cpu/gapi_imgproc_tests_cpu.cpp
@@ -13,6 +13,7 @@
 namespace
 {
 #define IMGPROC_CPU [] () { return cv::compile_args(cv::gapi::use_only{cv::gapi::imgproc::cpu::kernels()}); }
+    const std::vector <cv::Size> in_sizes{ cv::Size(1280, 720), cv::Size(128, 128) };
 }  // anonymous namespace
 
 namespace opencv_test
@@ -20,9 +21,7 @@ namespace opencv_test
 
 INSTANTIATE_TEST_CASE_P(ResizeTestCPU, ResizeTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(IMGPROC_CPU),
                                 Values(AbsSimilarPoints(2, 0.05).to_compare_obj()),
@@ -32,9 +31,7 @@ INSTANTIATE_TEST_CASE_P(ResizeTestCPU, ResizeTest,
 
 INSTANTIATE_TEST_CASE_P(ResizePTestCPU, ResizePTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(IMGPROC_CPU),
                                 Values(AbsSimilarPoints(2, 0.05).to_compare_obj()),
@@ -44,9 +41,7 @@ INSTANTIATE_TEST_CASE_P(ResizePTestCPU, ResizePTest,
 
 INSTANTIATE_TEST_CASE_P(ResizeTestCPU, ResizeTestFxFy,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(IMGPROC_CPU),
                                 Values(AbsSimilarPoints(2, 0.05).to_compare_obj()),
@@ -56,9 +51,7 @@ INSTANTIATE_TEST_CASE_P(ResizeTestCPU, ResizeTestFxFy,
 
 INSTANTIATE_TEST_CASE_P(Filter2DTestCPU, Filter2DTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                        cv::Size(640, 480),
-                                        cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1, CV_32F),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -70,8 +63,7 @@ INSTANTIATE_TEST_CASE_P(Filter2DTestCPU, Filter2DTest,
 
 INSTANTIATE_TEST_CASE_P(BoxFilterTestCPU, BoxFilterTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1, CV_32F),
                                 Values(IMGPROC_CPU),
                                 Values(AbsTolerance(0).to_compare_obj()),
@@ -80,8 +72,7 @@ INSTANTIATE_TEST_CASE_P(BoxFilterTestCPU, BoxFilterTest,
 
 INSTANTIATE_TEST_CASE_P(SepFilterTestCPU_8U, SepFilterTest,
                         Combine(Values(CV_8UC1, CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                        cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1, CV_16S, CV_32F),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -89,8 +80,7 @@ INSTANTIATE_TEST_CASE_P(SepFilterTestCPU_8U, SepFilterTest,
 
 INSTANTIATE_TEST_CASE_P(SepFilterTestCPU_other, SepFilterTest,
                         Combine(Values(CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1, CV_32F),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -98,8 +88,7 @@ INSTANTIATE_TEST_CASE_P(SepFilterTestCPU_other, SepFilterTest,
 
 INSTANTIATE_TEST_CASE_P(BlurTestCPU, BlurTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_CPU),
                                 Values(AbsTolerance(0.0).to_compare_obj()),
@@ -108,8 +97,7 @@ INSTANTIATE_TEST_CASE_P(BlurTestCPU, BlurTest,
 
 INSTANTIATE_TEST_CASE_P(gaussBlurTestCPU, GaussianBlurTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -117,8 +105,7 @@ INSTANTIATE_TEST_CASE_P(gaussBlurTestCPU, GaussianBlurTest,
 
 INSTANTIATE_TEST_CASE_P(MedianBlurTestCPU, MedianBlurTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -126,8 +113,7 @@ INSTANTIATE_TEST_CASE_P(MedianBlurTestCPU, MedianBlurTest,
 
 INSTANTIATE_TEST_CASE_P(ErodeTestCPU, ErodeTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -138,8 +124,7 @@ INSTANTIATE_TEST_CASE_P(ErodeTestCPU, ErodeTest,
 
 INSTANTIATE_TEST_CASE_P(Erode3x3TestCPU, Erode3x3Test,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -147,8 +132,7 @@ INSTANTIATE_TEST_CASE_P(Erode3x3TestCPU, Erode3x3Test,
 
 INSTANTIATE_TEST_CASE_P(DilateTestCPU, DilateTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -159,8 +143,7 @@ INSTANTIATE_TEST_CASE_P(DilateTestCPU, DilateTest,
 
 INSTANTIATE_TEST_CASE_P(Dilate3x3TestCPU, Dilate3x3Test,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -168,8 +151,7 @@ INSTANTIATE_TEST_CASE_P(Dilate3x3TestCPU, Dilate3x3Test,
 
 INSTANTIATE_TEST_CASE_P(MorphologyExTestCPU, MorphologyExTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -183,8 +165,7 @@ INSTANTIATE_TEST_CASE_P(MorphologyExTestCPU, MorphologyExTest,
 
 INSTANTIATE_TEST_CASE_P(MorphologyExHitMissTestCPU, MorphologyExTest,
                         Combine(Values(CV_8UC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -192,8 +173,7 @@ INSTANTIATE_TEST_CASE_P(MorphologyExHitMissTestCPU, MorphologyExTest,
 
 INSTANTIATE_TEST_CASE_P(SobelTestCPU, SobelTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1, CV_16S, CV_32F),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -203,8 +183,7 @@ INSTANTIATE_TEST_CASE_P(SobelTestCPU, SobelTest,
 
 INSTANTIATE_TEST_CASE_P(SobelTestCPU32F, SobelTest,
                         Combine(Values(CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_32F),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -214,8 +193,7 @@ INSTANTIATE_TEST_CASE_P(SobelTestCPU32F, SobelTest,
 
 INSTANTIATE_TEST_CASE_P(SobelXYTestCPU, SobelXYTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1, CV_16S, CV_32F),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -226,8 +204,7 @@ INSTANTIATE_TEST_CASE_P(SobelXYTestCPU, SobelXYTest,
 
 INSTANTIATE_TEST_CASE_P(SobelXYTestCPU32F, SobelXYTest,
                         Combine(Values(CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_32F),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -238,8 +215,7 @@ INSTANTIATE_TEST_CASE_P(SobelXYTestCPU32F, SobelXYTest,
 
 INSTANTIATE_TEST_CASE_P(LaplacianTestCPU, LaplacianTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -249,8 +225,7 @@ INSTANTIATE_TEST_CASE_P(LaplacianTestCPU, LaplacianTest,
 
 INSTANTIATE_TEST_CASE_P(BilateralFilterTestCPU, BilateralFilterTest,
                         Combine(Values(CV_32FC1, CV_32FC3, CV_8UC1, CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -261,16 +236,14 @@ INSTANTIATE_TEST_CASE_P(BilateralFilterTestCPU, BilateralFilterTest,
 
 INSTANTIATE_TEST_CASE_P(EqHistTestCPU, EqHistTest,
                         Combine(Values(CV_8UC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC1),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(CannyTestCPU, CannyTest,
                         Combine(Values(CV_8UC1, CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC1),
                                 Values(IMGPROC_CPU),
                                 Values(AbsSimilarPoints(0, 0.05).to_compare_obj()),
@@ -314,8 +287,7 @@ INSTANTIATE_TEST_CASE_P(FindContoursOffsetTestCPU, FindContoursOffsetTest,
 
 INSTANTIATE_TEST_CASE_P(FindContoursHNoOffsetTestCPU, FindContoursHNoOffsetTest,
                         Combine(Values(IMGPROC_CPU),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC1),
                                 Values(RETR_EXTERNAL, RETR_LIST, RETR_CCOMP, RETR_TREE),
                                 Values(CHAIN_APPROX_NONE, CHAIN_APPROX_SIMPLE,
@@ -324,8 +296,7 @@ INSTANTIATE_TEST_CASE_P(FindContoursHNoOffsetTestCPU, FindContoursHNoOffsetTest,
 
 INSTANTIATE_TEST_CASE_P(FindContoursHNoOffset32STestCPU, FindContoursHNoOffsetTest,
                         Combine(Values(IMGPROC_CPU),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_32SC1),
                                 Values(RETR_CCOMP, RETR_FLOODFILL),
                                 Values(CHAIN_APPROX_NONE, CHAIN_APPROX_SIMPLE,
@@ -337,9 +308,7 @@ INSTANTIATE_TEST_CASE_P(FindContoursHOffsetTestCPU, FindContoursHOffsetTest,
 
 INSTANTIATE_TEST_CASE_P(BoundingRectMatTestCPU, BoundingRectMatTest,
                         Combine(Values( CV_8UC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(IMGPROC_CPU),
                                 Values(IoUToleranceRect(0).to_compare_obj()),
@@ -440,176 +409,154 @@ INSTANTIATE_TEST_CASE_P(FitLine3DVector64FTestCPU, FitLine3DVector64FTest,
 
 INSTANTIATE_TEST_CASE_P(BGR2RGBTestCPU, BGR2RGBTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(RGB2GrayTestCPU, RGB2GrayTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC1),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(BGR2GrayTestCPU, BGR2GrayTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC1),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(RGB2YUVTestCPU, RGB2YUVTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(YUV2RGBTestCPU, YUV2RGBTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(BGR2I420TestCPU, BGR2I420Test,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC1),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(RGB2I420TestCPU, RGB2I420Test,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC1),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(I4202BGRTestCPU, I4202BGRTest,
                         Combine(Values(CV_8UC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(I4202RGBTestCPU, I4202RGBTest,
                         Combine(Values(CV_8UC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(NV12toRGBTestCPU, NV12toRGBTest,
                         Combine(Values(CV_8UC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(NV12toBGRTestCPU, NV12toBGRTest,
                         Combine(Values(CV_8UC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(NV12toGrayTestCPU, NV12toGrayTest,
                         Combine(Values(CV_8UC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC1),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(NV12toRGBpTestCPU, NV12toRGBpTest,
                         Combine(Values(CV_8UC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC1),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(NV12toBGRpTestCPU, NV12toBGRpTest,
                         Combine(Values(CV_8UC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(RGB2LabTestCPU, RGB2LabTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(BGR2LUVTestCPU, BGR2LUVTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(LUV2BGRTestCPU, LUV2BGRTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(BGR2YUVTestCPU, BGR2YUVTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(YUV2BGRTestCPU, YUV2BGRTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(RGB2HSVTestCPU, RGB2HSVTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(BayerGR2RGBTestCPU, BayerGR2RGBTest,
                         Combine(Values(CV_8UC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_CPU),
                                 Values(AbsExact().to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(RGB2YUV422TestCPU, RGB2YUV422Test,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC2),
                                 Values(IMGPROC_CPU),
                                 Values(AbsTolerance(1).to_compare_obj())));
diff --git a/modules/gapi/test/cpu/gapi_imgproc_tests_fluid.cpp b/modules/gapi/test/cpu/gapi_imgproc_tests_fluid.cpp
index 29297fc5e6..1b4c351232 100644
--- a/modules/gapi/test/cpu/gapi_imgproc_tests_fluid.cpp
+++ b/modules/gapi/test/cpu/gapi_imgproc_tests_fluid.cpp
@@ -19,26 +19,17 @@ namespace opencv_test
 INSTANTIATE_TEST_CASE_P(ResizeTestFluid, ResizeTest,
                         Combine(Values(CV_8UC3/*CV_8UC1, CV_16UC1, CV_16SC1*/),
                                 Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128),
-                                       cv::Size(64, 64),
                                        cv::Size(30, 30)),
                                 Values(-1),
                                 Values(IMGPROC_FLUID),
                                 Values(Tolerance_FloatRel_IntAbs(1e-5, 1).to_compare_obj()),
                                 Values(/*cv::INTER_NEAREST,*/ cv::INTER_LINEAR/*, cv::INTER_AREA*/),
                                 Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128),
-                                       cv::Size(64, 64),
                                        cv::Size(30, 30))));
 
 INSTANTIATE_TEST_CASE_P(ResizeTestFxFyFluid, ResizeTestFxFy,
                         Combine(Values(CV_8UC3/*CV_8UC1, CV_16UC1, CV_16SC1*/),
                                 Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128),
-                                       cv::Size(64, 64),
                                        cv::Size(30, 30)),
                                 Values(-1),
                                 Values(IMGPROC_FLUID),
@@ -49,40 +40,35 @@ INSTANTIATE_TEST_CASE_P(ResizeTestFxFyFluid, ResizeTestFxFy,
 
 INSTANTIATE_TEST_CASE_P(RGB2GrayTestFluid, RGB2GrayTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC1),
                                 Values(IMGPROC_FLUID),
                                 Values(ToleranceColor(1e-3).to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(BGR2GrayTestFluid, BGR2GrayTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC1),
                                 Values(IMGPROC_FLUID),
                                 Values(ToleranceColor(1e-3).to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(RGB2YUVTestFluid, RGB2YUVTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_FLUID),
                                 Values(ToleranceColor(1e-3).to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(YUV2RGBTestFluid, YUV2RGBTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_FLUID),
                                 Values(ToleranceColor(1e-3).to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(RGB2LabTestFluid, RGB2LabTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_FLUID),
                                 Values(AbsSimilarPoints(1, 0.05).to_compare_obj())));
@@ -90,40 +76,35 @@ INSTANTIATE_TEST_CASE_P(RGB2LabTestFluid, RGB2LabTest,
 // FIXME: Not supported by Fluid yet (no kernel implemented)
 INSTANTIATE_TEST_CASE_P(BGR2LUVTestFluid, BGR2LUVTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_FLUID),
                                 Values(ToleranceColor(5e-3, 6).to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(RGB2HSVTestFluid, RGB2HSVTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_FLUID),
                                 Values(ToleranceColor(1e-3).to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(BayerGR2RGBTestFluid, BayerGR2RGBTest,
                         Combine(Values(CV_8UC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_FLUID),
                                 Values(ToleranceColor(1e-3).to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(RGB2YUV422TestFluid, RGB2YUV422Test,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC2),
                                 Values(IMGPROC_FLUID),
                                 Values(AbsTolerance(1).to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(blurTestFluid, BlurTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_FLUID),
                                 Values(ToleranceFilter(1e-4f, 0.01).to_compare_obj()),
@@ -132,8 +113,7 @@ INSTANTIATE_TEST_CASE_P(blurTestFluid, BlurTest,
 
 INSTANTIATE_TEST_CASE_P(gaussBlurTestFluid, GaussianBlurTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_FLUID),
                                 Values(ToleranceFilter(1e-3f, 0.01).to_compare_obj()),
@@ -141,8 +121,7 @@ INSTANTIATE_TEST_CASE_P(gaussBlurTestFluid, GaussianBlurTest,
 
 INSTANTIATE_TEST_CASE_P(medianBlurTestFluid, MedianBlurTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_FLUID),
                                 Values(AbsExact().to_compare_obj()),
@@ -150,8 +129,7 @@ INSTANTIATE_TEST_CASE_P(medianBlurTestFluid, MedianBlurTest,
 
 INSTANTIATE_TEST_CASE_P(erodeTestFluid, ErodeTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_FLUID),
                                 Values(AbsExact().to_compare_obj()),
@@ -162,8 +140,7 @@ INSTANTIATE_TEST_CASE_P(erodeTestFluid, ErodeTest,
 
 INSTANTIATE_TEST_CASE_P(dilateTestFluid, DilateTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_FLUID),
                                 Values(AbsExact().to_compare_obj()),
@@ -174,8 +151,7 @@ INSTANTIATE_TEST_CASE_P(dilateTestFluid, DilateTest,
 
 INSTANTIATE_TEST_CASE_P(SobelTestFluid, SobelTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1, CV_16S, CV_32F),
                                 Values(IMGPROC_FLUID),
                                 Values(AbsExact().to_compare_obj()),
@@ -185,8 +161,7 @@ INSTANTIATE_TEST_CASE_P(SobelTestFluid, SobelTest,
 
 INSTANTIATE_TEST_CASE_P(SobelTestFluid32F, SobelTest,
                         Combine(Values(CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_32F),
                                 Values(IMGPROC_FLUID),
                                 Values(ToleranceFilter(1e-4f, 0.01).to_compare_obj()),
@@ -196,8 +171,7 @@ INSTANTIATE_TEST_CASE_P(SobelTestFluid32F, SobelTest,
 
 INSTANTIATE_TEST_CASE_P(SobelXYTestFluid, SobelXYTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1, CV_16S, CV_32F),
                                 Values(IMGPROC_FLUID),
                                 Values(AbsExact().to_compare_obj()),
@@ -208,8 +182,7 @@ INSTANTIATE_TEST_CASE_P(SobelXYTestFluid, SobelXYTest,
 
 INSTANTIATE_TEST_CASE_P(SobelXYTestFluid32F, SobelXYTest,
                         Combine(Values(CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_32F),
                                 Values(IMGPROC_FLUID),
                                 Values(ToleranceFilter(1e-4f, 0.01).to_compare_obj()),
@@ -220,8 +193,7 @@ INSTANTIATE_TEST_CASE_P(SobelXYTestFluid32F, SobelXYTest,
 
 INSTANTIATE_TEST_CASE_P(boxFilterTestFluid32, BoxFilterTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1, CV_32F),
                                 Values(IMGPROC_FLUID),
                                 Values(ToleranceFilter(1e-4f, 0.01).to_compare_obj()),
@@ -230,8 +202,7 @@ INSTANTIATE_TEST_CASE_P(boxFilterTestFluid32, BoxFilterTest,
 
 INSTANTIATE_TEST_CASE_P(sepFilterTestFluid, SepFilterTest,
                         Combine(Values(CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1, CV_32F),
                                 Values(IMGPROC_FLUID),
                                 Values(ToleranceFilter(1e-4f, 0.01).to_compare_obj()),
@@ -240,7 +211,6 @@ INSTANTIATE_TEST_CASE_P(sepFilterTestFluid, SepFilterTest,
 INSTANTIATE_TEST_CASE_P(filter2DTestFluid, Filter2DTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
                                 Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
                                        cv::Size(128, 128)),
                                 Values(-1, CV_32F),
                                 Values(IMGPROC_FLUID),
diff --git a/modules/gapi/test/cpu/gapi_operators_tests_cpu.cpp b/modules/gapi/test/cpu/gapi_operators_tests_cpu.cpp
index df74a046cd..5e070f8ce5 100644
--- a/modules/gapi/test/cpu/gapi_operators_tests_cpu.cpp
+++ b/modules/gapi/test/cpu/gapi_operators_tests_cpu.cpp
@@ -12,6 +12,7 @@
 namespace
 {
 #define CORE_CPU [] () { return cv::compile_args(cv::gapi::use_only{cv::gapi::core::cpu::kernels()}); }
+    const std::vector <cv::Size> in_sizes{ cv::Size(1280, 720), cv::Size(128, 128) };
 }  // anonymous namespace
 
 namespace opencv_test
@@ -19,21 +20,17 @@ namespace opencv_test
 
 // FIXME: CPU test runs are disabled since Fluid is an exclusive plugin now!
 INSTANTIATE_TEST_CASE_P(MathOperatorTestCPU, MathOperatorMatMatTest,
-                    Combine(Values(CV_8UC1, CV_16SC1, CV_32FC1),
-                            Values(cv::Size(1280, 720),
-                                   cv::Size(640, 480),
-                                   cv::Size(128, 128)),
-                            Values(-1),
-                            Values(CORE_CPU),
-                            Values(AbsExact().to_compare_obj()),
-                            Values( ADD, SUB, DIV,
+                        Combine(Values(CV_8UC1, CV_16SC1, CV_32FC1),
+                                ValuesIn(in_sizes),
+                                Values(-1),
+                                Values(CORE_CPU),
+                                Values(AbsExact().to_compare_obj()),
+                                Values( ADD, SUB, DIV,
                                     GT, LT, GE, LE, EQ, NE)));
 
 INSTANTIATE_TEST_CASE_P(MathOperatorTestCPU, MathOperatorMatScalarTest,
                         Combine(Values(CV_8UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -44,9 +41,7 @@ INSTANTIATE_TEST_CASE_P(MathOperatorTestCPU, MathOperatorMatScalarTest,
 
 INSTANTIATE_TEST_CASE_P(BitwiseOperatorTestCPU, MathOperatorMatMatTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -54,9 +49,7 @@ INSTANTIATE_TEST_CASE_P(BitwiseOperatorTestCPU, MathOperatorMatMatTest,
 
 INSTANTIATE_TEST_CASE_P(BitwiseOperatorTestCPU, MathOperatorMatScalarTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -65,9 +58,7 @@ INSTANTIATE_TEST_CASE_P(BitwiseOperatorTestCPU, MathOperatorMatScalarTest,
 
 INSTANTIATE_TEST_CASE_P(BitwiseNotOperatorTestCPU, NotOperatorTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_CPU)));
 }
diff --git a/modules/gapi/test/cpu/gapi_operators_tests_fluid.cpp b/modules/gapi/test/cpu/gapi_operators_tests_fluid.cpp
index d076ecdd8e..af0f2f136b 100644
--- a/modules/gapi/test/cpu/gapi_operators_tests_fluid.cpp
+++ b/modules/gapi/test/cpu/gapi_operators_tests_fluid.cpp
@@ -11,6 +11,7 @@
 namespace
 {
 #define CORE_FLUID [] () { return cv::compile_args(cv::gapi::use_only{cv::gapi::core::fluid::kernels()}); }
+    const std::vector <cv::Size> in_sizes{ cv::Size(1280, 720), cv::Size(128, 128) };
 }  // anonymous namespace
 
 namespace opencv_test
@@ -18,9 +19,7 @@ namespace opencv_test
 
 INSTANTIATE_TEST_CASE_P(MathOperatorTestFluid, MathOperatorMatMatTest,
                         Combine(Values(CV_8UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID),
                                 Values(AbsExact().to_compare_obj()),
@@ -29,9 +28,7 @@ INSTANTIATE_TEST_CASE_P(MathOperatorTestFluid, MathOperatorMatMatTest,
 
 INSTANTIATE_TEST_CASE_P(MathOperatorArithmeticTestFluid, MathOperatorMatScalarTest,
                         Combine(Values(CV_8UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID),
                                 Values(AbsExact().to_compare_obj()),
@@ -41,9 +38,7 @@ INSTANTIATE_TEST_CASE_P(MathOperatorArithmeticTestFluid, MathOperatorMatScalarTe
   // FIXME: solve comparison error
 INSTANTIATE_TEST_CASE_P(MathOperatorCompareTestFluid, MathOperatorMatScalarTest,
                         Combine(Values(CV_8UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID),
                                 Values(AbsSimilarPoints(1, 0.01).to_compare_obj()),
@@ -52,9 +47,7 @@ INSTANTIATE_TEST_CASE_P(MathOperatorCompareTestFluid, MathOperatorMatScalarTest,
 
 INSTANTIATE_TEST_CASE_P(BitwiseOperatorTestFluid, MathOperatorMatMatTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID),
                                 Values(AbsExact().to_compare_obj()),
@@ -62,9 +55,7 @@ INSTANTIATE_TEST_CASE_P(BitwiseOperatorTestFluid, MathOperatorMatMatTest,
 
 INSTANTIATE_TEST_CASE_P(BitwiseOperatorTestFluid, MathOperatorMatScalarTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_FLUID),
                                 Values(AbsExact().to_compare_obj()),
@@ -72,10 +63,8 @@ INSTANTIATE_TEST_CASE_P(BitwiseOperatorTestFluid, MathOperatorMatScalarTest,
                                         ANDR, ORR, XORR )));
 
 INSTANTIATE_TEST_CASE_P(BitwiseNotOperatorTestFluid, NotOperatorTest,
-                    Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                            Values(cv::Size(1280, 720),
-                                   cv::Size(640, 480),
-                                   cv::Size(128, 128)),
-                            Values(-1),
-                            Values(CORE_FLUID)));
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                ValuesIn(in_sizes),
+                                Values(-1),
+                                Values(CORE_FLUID)));
 }
diff --git a/modules/gapi/test/gpu/gapi_core_tests_gpu.cpp b/modules/gapi/test/gpu/gapi_core_tests_gpu.cpp
index d3e7cc6ddc..f32f1adee9 100644
--- a/modules/gapi/test/gpu/gapi_core_tests_gpu.cpp
+++ b/modules/gapi/test/gpu/gapi_core_tests_gpu.cpp
@@ -11,6 +11,7 @@
 namespace
 {
 #define CORE_GPU [] () { return cv::compile_args(cv::gapi::use_only{cv::gapi::core::gpu::kernels()}); }
+    const std::vector <cv::Size> in_sizes{ cv::Size(1280, 720), cv::Size(128, 128) };
 }  // anonymous namespace
 
 namespace opencv_test
@@ -19,9 +20,7 @@ namespace opencv_test
 // FIXME: Wut? See MulTestGPU/MathOpTest below (duplicate?)
 INSTANTIATE_TEST_CASE_P(AddTestGPU, MathOpTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values( -1, CV_8U, CV_16U, CV_32F ),
                                 Values(CORE_GPU),
                                 Values(ADD, MUL),
@@ -31,9 +30,7 @@ INSTANTIATE_TEST_CASE_P(AddTestGPU, MathOpTest,
 
 INSTANTIATE_TEST_CASE_P(MulTestGPU, MathOpTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values( -1, CV_8U, CV_16U, CV_32F ),
                                 Values(CORE_GPU),
                                 Values(MUL),
@@ -43,9 +40,7 @@ INSTANTIATE_TEST_CASE_P(MulTestGPU, MathOpTest,
 
 INSTANTIATE_TEST_CASE_P(SubTestGPU, MathOpTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values( -1, CV_8U, CV_16U, CV_32F ),
                                 Values(CORE_GPU),
                                 Values(SUB),
@@ -65,9 +60,7 @@ INSTANTIATE_TEST_CASE_P(SubTestGPU, MathOpTest,
 //        Github ticket: https://github.com/opencv/opencv/issues/18373.
 INSTANTIATE_TEST_CASE_P(DISABLED_DivTestGPU, MathOpTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values( -1, CV_8U, CV_16U, CV_32F ),
                                 Values(CORE_GPU),
                                 Values(DIV),
@@ -77,74 +70,56 @@ INSTANTIATE_TEST_CASE_P(DISABLED_DivTestGPU, MathOpTest,
 
 INSTANTIATE_TEST_CASE_P(MulTestGPU, MulDoubleTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values( -1, CV_8U, CV_16U, CV_32F ),
                                 Values(CORE_GPU)));
 
 INSTANTIATE_TEST_CASE_P(DivTestGPU, DivTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values( -1, CV_8U, CV_16U, CV_32F ),
                                 Values(CORE_GPU)));
 
 INSTANTIATE_TEST_CASE_P(DivCTestGPU, DivCTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values( -1, CV_8U, CV_16U, CV_32F ),
                                 Values(CORE_GPU)));
 
 INSTANTIATE_TEST_CASE_P(MeanTestGPU, MeanTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU)));
 
 //TODO: mask test doesn't work
 INSTANTIATE_TEST_CASE_P(DISABLED_MaskTestGPU, MaskTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU)));
 
 INSTANTIATE_TEST_CASE_P(SelectTestGPU, SelectTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU)));
 
 INSTANTIATE_TEST_CASE_P(Polar2CartGPU, Polar2CartTest,
                         Combine(Values(CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_32FC1),
                                 Values(CORE_GPU)));
 
 INSTANTIATE_TEST_CASE_P(Cart2PolarGPU, Cart2PolarTest,
                         Combine(Values(CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_32FC1),
                                 Values(CORE_GPU)));
 
 INSTANTIATE_TEST_CASE_P(CompareTestGPU, CmpTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_8U),
                                 Values(CORE_GPU),
                                 Values(CMP_EQ, CMP_GE, CMP_NE, CMP_GT, CMP_LT, CMP_LE),
@@ -153,9 +128,7 @@ INSTANTIATE_TEST_CASE_P(CompareTestGPU, CmpTest,
 
 INSTANTIATE_TEST_CASE_P(BitwiseTestGPU, BitwiseTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU),
                                 Values(AND, OR, XOR),
@@ -163,76 +136,58 @@ INSTANTIATE_TEST_CASE_P(BitwiseTestGPU, BitwiseTest,
 
 INSTANTIATE_TEST_CASE_P(BitwiseNotTestGPU, NotTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU)));
 
 INSTANTIATE_TEST_CASE_P(DISABLED_MinTestGPU, MinTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU)));
 
 INSTANTIATE_TEST_CASE_P(DISABLED_MaxTestGPU, MaxTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU)));
 
 INSTANTIATE_TEST_CASE_P(SumTestGPU, SumTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU),
                                 Values(AbsToleranceScalar(1e-5).to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(CountNonZeroTestGPU, CountNonZeroTest,
                         Combine(Values( CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU),
                                 Values(AbsToleranceScalar(1e-5).to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(AbsDiffTestGPU, AbsDiffTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU)));
 
 INSTANTIATE_TEST_CASE_P(AbsDiffCTestGPU, AbsDiffCTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU)));
 
 INSTANTIATE_TEST_CASE_P(AddWeightedTestGPU, AddWeightedTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values( -1, CV_8U, CV_16U, CV_32F ),
                                 Values(CORE_GPU),
                                 Values(Tolerance_FloatRel_IntAbs(1e-6, 1).to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(NormTestGPU, NormTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU),
                                 Values(AbsToleranceScalar(1e-3).to_compare_obj()), //TODO: too relaxed?
@@ -240,17 +195,13 @@ INSTANTIATE_TEST_CASE_P(NormTestGPU, NormTest,
 
 INSTANTIATE_TEST_CASE_P(IntegralTestGPU, IntegralTest,
                         Combine(Values( CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU)));
 
 INSTANTIATE_TEST_CASE_P(ThresholdTestGPU, ThresholdTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU),
                                 Values(cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC,
@@ -261,9 +212,7 @@ INSTANTIATE_TEST_CASE_P(ThresholdTestGPU, ThresholdTest,
 
 INSTANTIATE_TEST_CASE_P(ThresholdTestGPU, ThresholdOTTest,
                         Combine(Values(CV_8UC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU),
                                 Values(cv::THRESH_OTSU, cv::THRESH_TRIANGLE)));
@@ -271,91 +220,69 @@ INSTANTIATE_TEST_CASE_P(ThresholdTestGPU, ThresholdOTTest,
 
 INSTANTIATE_TEST_CASE_P(InRangeTestGPU, InRangeTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU)));
 
 INSTANTIATE_TEST_CASE_P(Split3TestGPU, Split3Test,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_8UC1),
                                 Values(CORE_GPU)));
 
 INSTANTIATE_TEST_CASE_P(Split4TestGPU, Split4Test,
                         Combine(Values(CV_8UC4),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_8UC1),
                                 Values(CORE_GPU)));
 
 INSTANTIATE_TEST_CASE_P(Merge3TestGPU, Merge3Test,
                         Combine(Values(CV_8UC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_8UC3),
                                 Values(CORE_GPU)));
 
 INSTANTIATE_TEST_CASE_P(Merge4TestGPU, Merge4Test,
                         Combine(Values(CV_8UC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_8UC4),
                                 Values(CORE_GPU)));
 
 INSTANTIATE_TEST_CASE_P(RemapTestGPU, RemapTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU)));
 
 INSTANTIATE_TEST_CASE_P(FlipTestGPU, FlipTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU),
                                 Values(0,1,-1)));
 
 INSTANTIATE_TEST_CASE_P(CropTestGPU, CropTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU),
                                 Values(cv::Rect(10, 8, 20, 35), cv::Rect(4, 10, 37, 50))));
 
 INSTANTIATE_TEST_CASE_P(LUTTestGPU, LUTTest,
                         Combine(Values(CV_8UC1, CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_8UC1),
                                 Values(CORE_GPU)));
 
 INSTANTIATE_TEST_CASE_P(LUTTestCustomGPU, LUTTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_8UC3),
                                 Values(CORE_GPU)));
 
 INSTANTIATE_TEST_CASE_P(ConvertToGPU, ConvertToTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(CV_8U, CV_16U, CV_16S, CV_32F),
                                 Values(CORE_GPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -364,17 +291,13 @@ INSTANTIATE_TEST_CASE_P(ConvertToGPU, ConvertToTest,
 
 INSTANTIATE_TEST_CASE_P(ConcatHorTestGPU, ConcatHorTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU)));
 
 INSTANTIATE_TEST_CASE_P(ConcatVertTestGPU, ConcatVertTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU)));
 
@@ -382,9 +305,7 @@ INSTANTIATE_TEST_CASE_P(TransposeTestGPU, TransposeTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1,
                                        CV_8UC2, CV_16UC2, CV_16SC2, CV_32FC2,
                                        CV_8UC3, CV_16UC3, CV_16SC3, CV_32FC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU),
                                 Values(AbsExact().to_compare_obj())));
@@ -415,17 +336,13 @@ INSTANTIATE_TEST_CASE_P(ReInitOutTestGPU, ReInitOutTest,
 //TODO: fix this backend to allow ConcatVertVec ConcatHorVec
 INSTANTIATE_TEST_CASE_P(DISABLED_ConcatVertVecTestGPU, ConcatVertVecTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU)));
 
 INSTANTIATE_TEST_CASE_P(DISABLED_ConcatHorVecTestGPU, ConcatHorVecTest,
                         Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU)));
 }
diff --git a/modules/gapi/test/gpu/gapi_imgproc_tests_gpu.cpp b/modules/gapi/test/gpu/gapi_imgproc_tests_gpu.cpp
index ba5d67e517..bd9452a795 100644
--- a/modules/gapi/test/gpu/gapi_imgproc_tests_gpu.cpp
+++ b/modules/gapi/test/gpu/gapi_imgproc_tests_gpu.cpp
@@ -12,6 +12,7 @@
 namespace
 {
 #define IMGPROC_GPU [] () { return cv::compile_args(cv::gapi::use_only{cv::gapi::imgproc::gpu::kernels()}); }
+    const std::vector <cv::Size> in_sizes{ cv::Size(1280, 720), cv::Size(128, 128) };
 }  // anonymous namespace
 
 namespace opencv_test
@@ -19,9 +20,7 @@ namespace opencv_test
 
 INSTANTIATE_TEST_CASE_P(ResizeTestGPU, ResizeTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(IMGPROC_GPU),
                                 Values(AbsSimilarPoints(2, 0.05).to_compare_obj()),
@@ -31,9 +30,7 @@ INSTANTIATE_TEST_CASE_P(ResizeTestGPU, ResizeTest,
 
 INSTANTIATE_TEST_CASE_P(ResizeTestGPU, ResizeTestFxFy,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(IMGPROC_GPU),
                                 Values(AbsSimilarPoints(2, 0.05).to_compare_obj()),
@@ -43,9 +40,7 @@ INSTANTIATE_TEST_CASE_P(ResizeTestGPU, ResizeTestFxFy,
 
 INSTANTIATE_TEST_CASE_P(Filter2DTestGPU, Filter2DTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1, CV_32F),
                                 Values(IMGPROC_GPU),
                                 Values(Tolerance_FloatRel_IntAbs(1e-5, 2).to_compare_obj()),
@@ -57,8 +52,7 @@ INSTANTIATE_TEST_CASE_P(Filter2DTestGPU, Filter2DTest,
 
 INSTANTIATE_TEST_CASE_P(BoxFilterTestGPU, BoxFilterTest,
                         Combine(Values(/*CV_8UC1,*/ CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1, CV_32F),
                                 Values(IMGPROC_GPU),
                                 Values(Tolerance_FloatRel_IntAbs(1e-5, 2).to_compare_obj()),
@@ -68,8 +62,7 @@ INSTANTIATE_TEST_CASE_P(BoxFilterTestGPU, BoxFilterTest,
 
 INSTANTIATE_TEST_CASE_P(SepFilterTestGPU_8U, SepFilterTest,
                         Combine(Values(CV_8UC1, CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1, CV_16S, CV_32F),
                                 Values(IMGPROC_GPU),
                                 Values(ToleranceFilter(1e-4f, 0.01).to_compare_obj()),
@@ -77,8 +70,7 @@ INSTANTIATE_TEST_CASE_P(SepFilterTestGPU_8U, SepFilterTest,
 
 INSTANTIATE_TEST_CASE_P(SepFilterTestGPU_other, SepFilterTest,
                         Combine(Values(CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1, CV_32F),
                                 Values(IMGPROC_GPU),
                                 Values(ToleranceFilter(1e-4f, 0.01).to_compare_obj()),
@@ -86,8 +78,7 @@ INSTANTIATE_TEST_CASE_P(SepFilterTestGPU_other, SepFilterTest,
 
 INSTANTIATE_TEST_CASE_P(BlurTestGPU, BlurTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_GPU),
                                 Values(Tolerance_FloatRel_IntAbs(1e-4, 2).to_compare_obj()),
@@ -96,8 +87,7 @@ INSTANTIATE_TEST_CASE_P(BlurTestGPU, BlurTest,
 
 INSTANTIATE_TEST_CASE_P(gaussBlurTestGPU, GaussianBlurTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_GPU),
                                 Values(ToleranceFilter(1e-5f, 0.01).to_compare_obj()),
@@ -105,8 +95,7 @@ INSTANTIATE_TEST_CASE_P(gaussBlurTestGPU, GaussianBlurTest,
 
 INSTANTIATE_TEST_CASE_P(MedianBlurTestGPU, MedianBlurTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_GPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -114,8 +103,7 @@ INSTANTIATE_TEST_CASE_P(MedianBlurTestGPU, MedianBlurTest,
 
 INSTANTIATE_TEST_CASE_P(ErodeTestGPU, ErodeTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_GPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -126,8 +114,7 @@ INSTANTIATE_TEST_CASE_P(ErodeTestGPU, ErodeTest,
 
 INSTANTIATE_TEST_CASE_P(Erode3x3TestGPU, Erode3x3Test,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_GPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -135,8 +122,7 @@ INSTANTIATE_TEST_CASE_P(Erode3x3TestGPU, Erode3x3Test,
 
 INSTANTIATE_TEST_CASE_P(DilateTestGPU, DilateTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_GPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -147,8 +133,7 @@ INSTANTIATE_TEST_CASE_P(DilateTestGPU, DilateTest,
 
 INSTANTIATE_TEST_CASE_P(Dilate3x3TestGPU, Dilate3x3Test,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_GPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -156,8 +141,7 @@ INSTANTIATE_TEST_CASE_P(Dilate3x3TestGPU, Dilate3x3Test,
 
 INSTANTIATE_TEST_CASE_P(SobelTestGPU, SobelTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1, CV_16S, CV_32F),
                                 Values(IMGPROC_GPU),
                                 Values(Tolerance_FloatRel_IntAbs(1e-4, 2).to_compare_obj()),
@@ -167,8 +151,7 @@ INSTANTIATE_TEST_CASE_P(SobelTestGPU, SobelTest,
 
 INSTANTIATE_TEST_CASE_P(SobelTestGPU32F, SobelTest,
                         Combine(Values(CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_32F),
                                 Values(IMGPROC_GPU),
                                 Values(Tolerance_FloatRel_IntAbs(1e-4, 2).to_compare_obj()),
@@ -178,8 +161,7 @@ INSTANTIATE_TEST_CASE_P(SobelTestGPU32F, SobelTest,
 
 INSTANTIATE_TEST_CASE_P(LaplacianTestGPU, LaplacianTest,
                         Combine(Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_GPU),
                                 Values(Tolerance_FloatRel_IntAbs(1e-4, 2).to_compare_obj()),
@@ -189,8 +171,7 @@ INSTANTIATE_TEST_CASE_P(LaplacianTestGPU, LaplacianTest,
 
 INSTANTIATE_TEST_CASE_P(BilateralFilterTestGPU, BilateralFilterTest,
                         Combine(Values(CV_32FC1, CV_32FC3, CV_8UC1, CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_GPU),
                                 Values(Tolerance_FloatRel_IntAbs(1e-4, 2).to_compare_obj()),
@@ -201,16 +182,14 @@ INSTANTIATE_TEST_CASE_P(BilateralFilterTestGPU, BilateralFilterTest,
 
 INSTANTIATE_TEST_CASE_P(EqHistTestGPU, EqHistTest,
                         Combine(Values(CV_8UC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(-1),
                                 Values(IMGPROC_GPU),
                                 Values(AbsExact().to_compare_obj())));  // FIXIT Non reliable check
 
 INSTANTIATE_TEST_CASE_P(CannyTestGPU, CannyTest,
                         Combine(Values(CV_8UC1, CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC1),
                                 Values(IMGPROC_GPU),
                                 Values(AbsSimilarPoints(0, 0.05).to_compare_obj()),
@@ -221,72 +200,63 @@ INSTANTIATE_TEST_CASE_P(CannyTestGPU, CannyTest,
 
 INSTANTIATE_TEST_CASE_P(RGB2GrayTestGPU, RGB2GrayTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC1),
                                 Values(IMGPROC_GPU),
                                 Values(ToleranceColor(1e-3).to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(BGR2GrayTestGPU, BGR2GrayTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC1),
                                 Values(IMGPROC_GPU),
                                 Values(ToleranceColor(1e-3).to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(RGB2YUVTestGPU, RGB2YUVTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_GPU),
                                 Values(ToleranceColor(1e-3).to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(YUV2RGBTestGPU, YUV2RGBTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_GPU),
                                 Values(ToleranceColor(1e-3).to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(RGB2LabTestGPU, RGB2LabTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_GPU),
                                 Values(AbsSimilarPoints(1, 0.05).to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(BGR2LUVTestGPU, BGR2LUVTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_GPU),
                                 Values(ToleranceColor(5e-3, 6).to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(LUV2BGRTestGPU, LUV2BGRTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_GPU),
                                 Values(ToleranceColor(1e-3).to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(BGR2YUVTestGPU, BGR2YUVTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_GPU),
                                 Values(ToleranceColor(1e-3).to_compare_obj())));
 
 INSTANTIATE_TEST_CASE_P(YUV2BGRTestGPU, YUV2BGRTest,
                         Combine(Values(CV_8UC3),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480)),
+                                Values(cv::Size(1280, 720)),
                                 Values(CV_8UC3),
                                 Values(IMGPROC_GPU),
                                 Values(ToleranceColor(1e-3).to_compare_obj())));
diff --git a/modules/gapi/test/gpu/gapi_operators_tests_gpu.cpp b/modules/gapi/test/gpu/gapi_operators_tests_gpu.cpp
index 46a2155be4..8184ff8c45 100644
--- a/modules/gapi/test/gpu/gapi_operators_tests_gpu.cpp
+++ b/modules/gapi/test/gpu/gapi_operators_tests_gpu.cpp
@@ -11,6 +11,7 @@
 namespace
 {
 #define CORE_GPU [] () { return cv::compile_args(cv::gapi::use_only{cv::gapi::core::gpu::kernels()}); }
+    const std::vector <cv::Size> in_sizes{ cv::Size(1280, 720), cv::Size(128, 128) };
 }  // anonymous namespace
 
 namespace opencv_test
@@ -18,9 +19,7 @@ namespace opencv_test
 
 INSTANTIATE_TEST_CASE_P(MathOperatorTestGPU, MathOperatorMatMatTest,
                     Combine(Values(CV_8UC1, CV_16SC1, CV_32FC1),
-                            Values(cv::Size(1280, 720),
-                                   cv::Size(640, 480),
-                                   cv::Size(128, 128)),
+                            ValuesIn(in_sizes),
                             Values(-1),
                             Values(CORE_GPU),
                             Values(Tolerance_FloatRel_IntAbs(1e-5, 2).to_compare_obj()),
@@ -29,9 +28,7 @@ INSTANTIATE_TEST_CASE_P(MathOperatorTestGPU, MathOperatorMatMatTest,
 
 INSTANTIATE_TEST_CASE_P(MathOperatorTestGPU, MathOperatorMatScalarTest,
                         Combine(Values(CV_8UC1, CV_16SC1, CV_32FC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU),
                                 Values(Tolerance_FloatRel_IntAbs(1e-4, 2).to_compare_obj()),
@@ -42,9 +39,7 @@ INSTANTIATE_TEST_CASE_P(MathOperatorTestGPU, MathOperatorMatScalarTest,
 
 INSTANTIATE_TEST_CASE_P(BitwiseOperatorTestGPU, MathOperatorMatMatTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -52,9 +47,7 @@ INSTANTIATE_TEST_CASE_P(BitwiseOperatorTestGPU, MathOperatorMatMatTest,
 
 INSTANTIATE_TEST_CASE_P(BitwiseOperatorTestGPU, MathOperatorMatScalarTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU),
                                 Values(AbsExact().to_compare_obj()),
@@ -63,9 +56,7 @@ INSTANTIATE_TEST_CASE_P(BitwiseOperatorTestGPU, MathOperatorMatScalarTest,
 
 INSTANTIATE_TEST_CASE_P(BitwiseNotOperatorTestGPU, NotOperatorTest,
                         Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
-                                Values(cv::Size(1280, 720),
-                                       cv::Size(640, 480),
-                                       cv::Size(128, 128)),
+                                ValuesIn(in_sizes),
                                 Values(-1),
                                 Values(CORE_GPU)));
 }

From 89f8d4ae1235df19f49ba3167f0e4f2b4acc5c0a Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Sun, 15 May 2022 16:20:45 +0000
Subject: [PATCH 041/178] build: GCC12 warnings

---
 3rdparty/libpng/CMakeLists.txt          | 4 +++-
 3rdparty/libtiff/CMakeLists.txt         | 3 ++-
 apps/traincascade/cascadeclassifier.cpp | 2 +-
 modules/ts/src/ts_gtest.cpp             | 2 +-
 4 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/3rdparty/libpng/CMakeLists.txt b/3rdparty/libpng/CMakeLists.txt
index efa59627eb..f72b966079 100644
--- a/3rdparty/libpng/CMakeLists.txt
+++ b/3rdparty/libpng/CMakeLists.txt
@@ -77,7 +77,9 @@ endif(MSVC)
 add_library(${PNG_LIBRARY} STATIC ${OPENCV_3RDPARTY_EXCLUDE_FROM_ALL} ${lib_srcs} ${lib_hdrs})
 target_link_libraries(${PNG_LIBRARY} ${ZLIB_LIBRARIES})
 
-ocv_warnings_disable(CMAKE_C_FLAGS -Wundef -Wcast-align -Wimplicit-fallthrough -Wunused-parameter -Wsign-compare)
+ocv_warnings_disable(CMAKE_C_FLAGS -Wundef -Wcast-align -Wimplicit-fallthrough -Wunused-parameter -Wsign-compare
+    -Wmaybe-uninitialized
+)
 
 set_target_properties(${PNG_LIBRARY}
   PROPERTIES OUTPUT_NAME ${PNG_LIBRARY}
diff --git a/3rdparty/libtiff/CMakeLists.txt b/3rdparty/libtiff/CMakeLists.txt
index 2074888a52..99c1933d22 100644
--- a/3rdparty/libtiff/CMakeLists.txt
+++ b/3rdparty/libtiff/CMakeLists.txt
@@ -452,8 +452,9 @@ ocv_warnings_disable(CMAKE_C_FLAGS -Wno-unused-but-set-variable -Wmissing-protot
                                    -Wcast-align -Wshadow -Wno-maybe-uninitialized -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast
                                    -Wmisleading-indentation
                                    -Wimplicit-fallthrough
+                                   -Wunused-parameter  # clang
+                                   -Warray-parameter
 )
-ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-parameter) # clang
 ocv_warnings_disable(CMAKE_CXX_FLAGS -Wmissing-declarations -Wunused-parameter -Wmissing-prototypes
     -Wundef  # tiffiop.h: #if __clang_major__ >= 4
 )
diff --git a/apps/traincascade/cascadeclassifier.cpp b/apps/traincascade/cascadeclassifier.cpp
index 3d7b383258..5a83746bc4 100644
--- a/apps/traincascade/cascadeclassifier.cpp
+++ b/apps/traincascade/cascadeclassifier.cpp
@@ -252,7 +252,7 @@ bool CvCascadeClassifier::train( const string _cascadeDirName,
             fs << "}";
         }
         // save current stage
-        char buf[10];
+        char buf[32];
         sprintf(buf, "%s%d", "stage", i );
         string stageFilename = dirName + buf + ".xml";
         FileStorage fs( stageFilename, FileStorage::WRITE );
diff --git a/modules/ts/src/ts_gtest.cpp b/modules/ts/src/ts_gtest.cpp
index a65ef721a2..030c56bb4f 100644
--- a/modules/ts/src/ts_gtest.cpp
+++ b/modules/ts/src/ts_gtest.cpp
@@ -8716,7 +8716,7 @@ static void StackLowerThanAddress(const void* ptr, bool* result) {
 // Make sure AddressSanitizer does not tamper with the stack here.
 GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
 static bool StackGrowsDown() {
-  int dummy;
+  int dummy = 0;
   bool result;
   StackLowerThanAddress(&dummy, &result);
   return result;

From ff88132620edeb33ec80d73cbdc6bb3cd3668a61 Mon Sep 17 00:00:00 2001
From: fengyuentau <yuantao.feng@opencv.org.cn>
Date: Mon, 16 May 2022 19:01:37 +0800
Subject: [PATCH 042/178] support asymmetric paddings for qconv

---
 modules/dnn/src/onnx/onnx_importer.cpp  | 42 ++++++++++++++++++++++++-
 modules/dnn/test/test_onnx_importer.cpp |  2 ++
 2 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index 2a440a1284..133a1117ae 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -3230,14 +3230,54 @@ void ONNXImporter::parseQuantDequant(LayerParams& layerParams, const opencv_onnx
     addLayer(layerParams, node_proto);
 }
 
-void ONNXImporter::parseQConv(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+void ONNXImporter::parseQConv(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
 {
+    opencv_onnx::NodeProto node_proto = node_proto_;
     int ninputs = node_proto.input_size();
     CV_Assert(ninputs == 8 || ninputs == 9);
 
     Mat inp_sc = getBlob(node_proto, 1);
     Mat inp_zp = getBlob(node_proto, 2);
 
+    if (layerParams.has("pad"))
+    {
+        bool asymmetricPadding = false;
+        DictValue pads = layerParams.get("pad");
+        const int dims = pads.size() / 2;
+
+        for (int i = 0; i < dims; ++i)
+        {
+            if (pads.get<int>(i) != pads.get<int>(i + dims))
+            {
+                asymmetricPadding = true;
+                break;
+            }
+        }
+        if (asymmetricPadding && pads.size() == 4)
+        {
+            layerParams.erase("pad");
+            std::vector<int> paddings(4, 0);
+            for (int i = 0; i < dims; ++i)
+            {
+                paddings.push_back(pads.get<int>(i));
+                paddings.push_back(pads.get<int>(dims + i));
+            }
+            LayerParams padLp;
+            padLp.name = layerParams.name + "/pad";
+            padLp.type = "PaddingInt8";
+            padLp.set("paddings", DictValue::arrayInt(&paddings[0], paddings.size()));
+            padLp.set("depth", CV_8S);
+            padLp.set("value", inp_zp.at<int8_t>(0));
+
+            opencv_onnx::NodeProto proto;
+            proto.add_input(node_proto.input(0));
+            proto.add_output(padLp.name);
+
+            addLayer(padLp, proto);
+            node_proto.set_input(0, padLp.name);
+        }
+    }
+
     Mat weights = getBlob(node_proto, 3);
     int outCn = weights.size[0];
     Mat w_scale = getBlob(node_proto, 4);
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index 8503f55c25..f222f8683d 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -1752,6 +1752,8 @@ TEST_P(Test_ONNX_layers, Quantized_Convolution)
     testONNXModels("quantized_conv_uint8_weights", npy, 0.004, 0.02);
     testONNXModels("quantized_conv_int8_weights", npy, 0.03, 0.5);
     testONNXModels("quantized_conv_per_channel_weights", npy, 0.06, 0.4);
+
+    testONNXModels("quantized_conv_asymmetric_pads_int8_weights");
 }
 
 TEST_P(Test_ONNX_layers, Quantized_MatMul)

From c929f1b62f65f3d28445e5af328ca6ffc96774f4 Mon Sep 17 00:00:00 2001
From: xiong-jie-y <fwvillage@gmail.com>
Date: Sun, 15 May 2022 13:00:01 +0900
Subject: [PATCH 043/178] Add make_capture_src for video stream to Python

---
 modules/gapi/include/opencv2/gapi/streaming/cap.hpp | 6 ++++++
 modules/gapi/misc/python/samples/gaze_estimation.py | 8 ++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/modules/gapi/include/opencv2/gapi/streaming/cap.hpp b/modules/gapi/include/opencv2/gapi/streaming/cap.hpp
index aad6af618c..73d5bfcbeb 100644
--- a/modules/gapi/include/opencv2/gapi/streaming/cap.hpp
+++ b/modules/gapi/include/opencv2/gapi/streaming/cap.hpp
@@ -119,6 +119,12 @@ GAPI_EXPORTS_W cv::Ptr<IStreamSource> inline make_capture_src(const std::string&
     return make_src<GCaptureSource>(path);
 }
 
+// NB: Overload for using from python
+GAPI_EXPORTS_W cv::Ptr<IStreamSource> inline make_capture_src(const int id)
+{
+    return make_src<GCaptureSource>(id);
+}
+
 } // namespace wip
 } // namespace gapi
 } // namespace cv
diff --git a/modules/gapi/misc/python/samples/gaze_estimation.py b/modules/gapi/misc/python/samples/gaze_estimation.py
index 5536787e60..bdcc7851ee 100644
--- a/modules/gapi/misc/python/samples/gaze_estimation.py
+++ b/modules/gapi/misc/python/samples/gaze_estimation.py
@@ -27,7 +27,7 @@ def build_argparser():
     parser = argparse.ArgumentParser(description='This is an OpenCV-based version of Gaze Estimation example')
 
     parser.add_argument('--input',
-                        help='Path to the input video file')
+                        help='Path to the input video file or camera device number')
     parser.add_argument('--out',
                         help='Path to the output video file')
     parser.add_argument('--facem',
@@ -323,7 +323,11 @@ if __name__ == '__main__':
 
     # ------------------------Execution part------------------------
     ccomp = comp.compileStreaming(args=cv.gapi.compile_args(kernels, nets))
-    source = cv.gapi.wip.make_capture_src(ARGUMENTS.input)
+    if ARGUMENTS.input.isdigit():
+        source = cv.gapi.wip.make_capture_src(int(ARGUMENTS.input))
+    else:
+        source = cv.gapi.wip.make_capture_src(ARGUMENTS.input)
+
     ccomp.setSource(cv.gin(source))
     ccomp.start()
 

From ce859edba81a97f344a15b13c42c3331ab36df75 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Tue, 17 May 2022 14:53:04 +0000
Subject: [PATCH 044/178] GHA: fix git merge (part 2)

---
 .github/workflows/PR-3.4-W10.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/PR-3.4-W10.yaml b/.github/workflows/PR-3.4-W10.yaml
index b8068cd05f..3bbd6a543b 100644
--- a/.github/workflows/PR-3.4-W10.yaml
+++ b/.github/workflows/PR-3.4-W10.yaml
@@ -44,7 +44,7 @@ jobs:
         OPENCV_EXTRA_FORK=$(git ls-remote --heads "git@github.com:/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}") || true
         if [[ ! -z "$OPENCV_EXTRA_FORK" ]]; then
           echo "Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch"
-          cd ${{ github.workspace }}\opencv_extra
+          cd opencv_extra
           git config user.email "opencv.ci"
           git config user.name "opencv.ci"
           git pull -v "git@github.com:${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}"
@@ -159,7 +159,7 @@ jobs:
         OPENCV_CONTRIB_FORK=$(git ls-remote --heads "git@github.com:${{ env.PR_AUTHOR }}/opencv_contrib" "${{ env.SOURCE_BRANCH_NAME }}") || true
         if [[ ! -z "$OPENCV_CONTRIB_FORK" ]]; then
           echo "Merge opencv_contrib with ${{ env.SOURCE_BRANCH_NAME }} branch"
-          cd ${{ github.workspace }}\opencv_contrib
+          cd opencv_contrib
           git config user.email "opencv.ci"
           git config user.name "opencv.ci"
           git pull -v "git@github.com:${{ env.PR_AUTHOR }}/opencv_contrib" "${{ env.SOURCE_BRANCH_NAME }}"

From 7be8a71c608d3d09473a4c13bea67833c81b2297 Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <76472231+asenyaev@users.noreply.github.com>
Date: Tue, 17 May 2022 19:23:32 +0300
Subject: [PATCH 045/178] Merge pull request #21994 from
 asenyaev:asen/extra_dnn_testing

Extended DNN testing in GHA

* Extended DNN testing in GHA for 3.4 branch

* Updated docker images in Linux GitHub Actions

* Added OPENCV_DOWNLOAD_PATH flag for ARM build to use pre-downloaded binaries
---
 .github/workflows/PR-3.4-ARM64.yaml | 56 +++++++++++++++++++----------
 .github/workflows/PR-3.4-U20.yaml   | 32 +++++++++++++----
 .github/workflows/PR-3.4-W10.yaml   |  2 ++
 3 files changed, 66 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/PR-3.4-ARM64.yaml b/.github/workflows/PR-3.4-ARM64.yaml
index affb2cb9ec..9d8a1037bd 100644
--- a/.github/workflows/PR-3.4-ARM64.yaml
+++ b/.github/workflows/PR-3.4-ARM64.yaml
@@ -6,15 +6,17 @@ on:
       - 3.4
 
 env:
-  EXTRA_CMAKE_OPTIONS: '-DBUILD_DOCS=ON -DPYTHON_DEFAULT_EXECUTABLE=/usr/bin/python3 -DBUILD_opencv_xfeatures2d=OFF -DBUILD_EXAMPLES=ON -DOPENCV_ENABLE_NONFREE=ON -DENABLE_CCACHE=OFF'
-  OPENCV_TEST_DATA_PATH: '/opencv_extra/testdata'
-  OPENCV_DOCKER_WORKDIR: '/__w/opencv/opencv'
+  EXTRA_CMAKE_OPTIONS: '-DBUILD_DOCS=ON -DPYTHON_DEFAULT_EXECUTABLE=/usr/bin/python3 -DOPENCV_DOWNLOAD_PATH=/home/ci/binaries_cache -DBUILD_opencv_xfeatures2d=OFF -DBUILD_EXAMPLES=ON -DOPENCV_ENABLE_NONFREE=ON'
   PR_AUTHOR: ${{ github.event.pull_request.user.login }}
   PR_AUTHOR_FORK: ${{ github.event.pull_request.head.repo.full_name }}
   SOURCE_BRANCH_NAME: ${{ github.head_ref }}
   TARGET_BRANCH_NAME: ${{ github.base_ref }}
   ANT_HOME: '/usr/share/ant'
+  GIT_CACHE_DOCKER: '/home/ci/git_cache'
   PYTHONPATH: /opencv-build/python_loader:$PYTHONPATH
+  OPENCV_TEST_DATA_PATH: '/opencv_extra/testdata'
+  OPENCV_DOCKER_WORKDIR: '/opencv'
+  DNN_MODELS: '/home/ci/dnn-models'
 
 jobs:
   BuildAndTest:
@@ -23,7 +25,12 @@ jobs:
       run:
         shell: bash
     container:
-      image: docker.io/yuentau/ocv_ubuntu:20.04-arm64
+      image: quay.io/opencv-ci/opencv-ubuntu:20.04-arm64
+      volumes:
+        - /home/opencv-cn/git_cache:/home/ci/git_cache
+        - /home/opencv-cn/ci_cache/opencv:/home/ci/.ccache
+        - /home/opencv-cn/binaries_cache:/home/ci/binaries_cache
+        - /home/opencv-cn/dnn-models:/home/ci/dnn-models
     steps:
     - name: PR info
       run: |
@@ -32,13 +39,9 @@ jobs:
         echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
         echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
     - name: Clean
-      run: find . -mindepth 1 -delete
+      run: find ${{ env.OPENCV_DOCKER_WORKDIR }} -mindepth 1 -delete
     - name: Fetch opencv
-      uses: actions/checkout@v3
-      with:
-        repository: opencv/opencv
-        ref: ${{ env.TARGET_BRANCH_NAME }}
-        fetch-depth: 0
+      run: git clone --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE_DOCKER }}/opencv.git https://github.com/opencv/opencv.git ${{ env.OPENCV_DOCKER_WORKDIR }}
     - name: Merge opencv with ${{ env.SOURCE_BRANCH_NAME }} branch
       run: |
         cd ${{ env.OPENCV_DOCKER_WORKDIR }}
@@ -47,7 +50,7 @@ jobs:
         git config user.name "opencv.ci"
         git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
     - name: Clone opencv_extra
-      run: git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} https://github.com/opencv/opencv_extra.git /opencv_extra
+      run: git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE_DOCKER }}/opencv_extra.git https://github.com/opencv/opencv_extra.git /opencv_extra
     - name: Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch
       run: |
         OPENCV_EXTRA_FORK=$(git ls-remote --heads "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}") || true
@@ -60,6 +63,22 @@ jobs:
         else
           echo "No merge since ${{ env.PR_AUTHOR }}/opencv_extra does not have branch ${{ env.SOURCE_BRANCH_NAME }}"
         fi
+    - name: Extra DNN models update
+      run: |
+        DOWNLOAD_MODELS_FILE='download_models.py'
+        LATEST_SAVED_HASH=$(cat ${{ env.DNN_MODELS }}/dnn/latest-hash-${{ env.TARGET_BRANCH_NAME }}.txt)
+        LATEST_HASH=$(sha256sum /opencv_extra/testdata/dnn/$DOWNLOAD_MODELS_FILE | awk '{print $1}')
+        if [[ $LATEST_HASH == $LATEST_SAVED_HASH ]]; then
+          echo "DNN models are up to date"
+          echo "OPENCV_DNN_TEST_DATA_PATH=${{ env.DNN_MODELS }}" >> $GITHUB_ENV
+        else
+          echo "Updating DNN models list"
+          echo "OPENCV_DNN_TEST_DATA_PATH=/home/ci/new-dnn-models" >> $GITHUB_ENV && OPENCV_DNN_TEST_DATA_PATH='/home/ci/new-dnn-models'
+          mkdir -p "$OPENCV_DNN_TEST_DATA_PATH"
+          rsync -a --exclude=$DOWNLOAD_MODELS_FILE ${{ env.DNN_MODELS }}/* $OPENCV_DNN_TEST_DATA_PATH
+          cp /opencv_extra/testdata/dnn/download_models.py $OPENCV_DNN_TEST_DATA_PATH/dnn
+          cd $OPENCV_DNN_TEST_DATA_PATH/dnn && python3 download_models.py
+        fi
     - name: Configure OpenCV
       run: |
         cmake -G Ninja -B /opencv-build ${{ env.EXTRA_CMAKE_OPTIONS }} ${{ env.OPENCV_DOCKER_WORKDIR }}
@@ -144,21 +163,22 @@ jobs:
       run:
         shell: bash
     container:
-      image: docker.io/yuentau/ocv_ubuntu:20.04-arm64
+      image: quay.io/opencv-ci/opencv-ubuntu:20.04-arm64
+      volumes:
+        - /home/opencv-cn/git_cache:/home/ci/git_cache
+        - /home/opencv-cn/ci_cache/opencv:/home/ci/.ccache
+        - /home/opencv-cn/binaries_cache:/home/ci/binaries_cache
     steps:
     - name: PR info
       run: |
         echo "PR Author: ${{ env.PR_AUTHOR }}"
+        echo "PR Author fork: ${{ env.PR_AUTHOR_FORK }}"
         echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
         echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
     - name: Clean
-      run: find . -mindepth 1 -delete
+      run: find ${{ env.OPENCV_DOCKER_WORKDIR }} -mindepth 1 -delete
     - name: Fetch opencv
-      uses: actions/checkout@v3
-      with:
-        repository: opencv/opencv
-        ref: ${{ env.TARGET_BRANCH_NAME }}
-        fetch-depth: 0
+      run: git clone --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE_DOCKER }}/opencv.git https://github.com/opencv/opencv.git ${{ env.OPENCV_DOCKER_WORKDIR }}
     - name: Merge opencv with a test branch
       run: |
         cd ${{ env.OPENCV_DOCKER_WORKDIR }}
diff --git a/.github/workflows/PR-3.4-U20.yaml b/.github/workflows/PR-3.4-U20.yaml
index e400962379..268d9dae8c 100644
--- a/.github/workflows/PR-3.4-U20.yaml
+++ b/.github/workflows/PR-3.4-U20.yaml
@@ -11,11 +11,12 @@ env:
   PR_AUTHOR_FORK: ${{ github.event.pull_request.head.repo.full_name }}
   SOURCE_BRANCH_NAME: ${{ github.head_ref }}
   TARGET_BRANCH_NAME: ${{ github.base_ref }}
-  ANT_HOME: /usr/share/ant
-  GIT_CACHE_DOCKER: /home/ci/git_cache
+  ANT_HOME: '/usr/share/ant'
+  GIT_CACHE_DOCKER: '/home/ci/git_cache'
   PYTHONPATH: /opencv-build/python_loader:$PYTHONPATH
-  OPENCV_TEST_DATA_PATH: /opencv_extra/testdata
-  OPENCV_DOCKER_WORKDIR: /opencv
+  OPENCV_TEST_DATA_PATH: '/opencv_extra/testdata'
+  OPENCV_DOCKER_WORKDIR: '/opencv'
+  DNN_MODELS: '/home/ci/dnn-models'
 
 jobs:
   BuildAndTest:
@@ -24,15 +25,17 @@ jobs:
       run:
         shell: bash
     container:
-      image: quay.io/asenyaev/opencv-ubuntu:20.04
+      image: quay.io/opencv-ci/opencv-ubuntu:20.04
       volumes:
         - /home/opencv-cn/git_cache:/home/ci/git_cache
         - /home/opencv-cn/ci_cache/opencv:/home/ci/.ccache
         - /home/opencv-cn/binaries_cache:/home/ci/binaries_cache
+        - /home/opencv-cn/dnn-models:/home/ci/dnn-models
     steps:
     - name: PR info
       run: |
         echo "PR Author: ${{ env.PR_AUTHOR }}"
+        echo "PR Author fork: ${{ env.PR_AUTHOR_FORK }}"
         echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
         echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
     - name: Clean
@@ -60,6 +63,22 @@ jobs:
         else
           echo "No merge since ${{ env.PR_AUTHOR }}/opencv_extra does not have branch ${{ env.SOURCE_BRANCH_NAME }}"
         fi
+    - name: Extra DNN models update
+      run: |
+        DOWNLOAD_MODELS_FILE='download_models.py'
+        LATEST_SAVED_HASH=$(cat ${{ env.DNN_MODELS }}/dnn/latest-hash-${{ env.TARGET_BRANCH_NAME }}.txt)
+        LATEST_HASH=$(sha256sum /opencv_extra/testdata/dnn/$DOWNLOAD_MODELS_FILE | awk '{print $1}')
+        if [[ $LATEST_HASH == $LATEST_SAVED_HASH ]]; then
+          echo "DNN models are up to date"
+          echo "OPENCV_DNN_TEST_DATA_PATH=${{ env.DNN_MODELS }}" >> $GITHUB_ENV
+        else
+          echo "Updating DNN models list"
+          echo "OPENCV_DNN_TEST_DATA_PATH=/home/ci/new-dnn-models" >> $GITHUB_ENV && OPENCV_DNN_TEST_DATA_PATH='/home/ci/new-dnn-models'
+          mkdir -p "$OPENCV_DNN_TEST_DATA_PATH"
+          rsync -a --exclude=$DOWNLOAD_MODELS_FILE ${{ env.DNN_MODELS }}/* $OPENCV_DNN_TEST_DATA_PATH
+          cp /opencv_extra/testdata/dnn/download_models.py $OPENCV_DNN_TEST_DATA_PATH/dnn
+          cd $OPENCV_DNN_TEST_DATA_PATH/dnn && python3 download_models.py
+        fi
     - name: Configure OpenCV
       run: |
         cd /opencv-build
@@ -147,7 +166,7 @@ jobs:
       run:
         shell: bash
     container:
-      image: quay.io/asenyaev/opencv-ubuntu:20.04
+      image: quay.io/opencv-ci/opencv-ubuntu:20.04
       volumes:
         - /home/opencv-cn/git_cache:/home/ci/git_cache
         - /home/opencv-cn/ci_cache/opencv:/home/ci/.ccache
@@ -156,6 +175,7 @@ jobs:
     - name: PR info
       run: |
         echo "PR Author: ${{ env.PR_AUTHOR }}"
+        echo "PR Author fork: ${{ env.PR_AUTHOR_FORK }}"
         echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
         echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
     - name: Clean
diff --git a/.github/workflows/PR-3.4-W10.yaml b/.github/workflows/PR-3.4-W10.yaml
index b8068cd05f..2323b2df79 100644
--- a/.github/workflows/PR-3.4-W10.yaml
+++ b/.github/workflows/PR-3.4-W10.yaml
@@ -24,6 +24,7 @@ jobs:
     - name: PR info
       run: |
         echo "PR Author: ${{ env.PR_AUTHOR }}"
+        echo "PR Author fork: ${{ env.PR_AUTHOR_FORK }}"
         echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
         echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
     - name: Clean
@@ -139,6 +140,7 @@ jobs:
     - name: PR info
       run: |
         echo "PR Author: ${{ env.PR_AUTHOR }}"
+        echo "PR Author fork: ${{ env.PR_AUTHOR_FORK }}"
         echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
         echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
     - name: Clean

From df2e7fa6eb65c27aab0f249f3559916624d518d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20T=C5=AFma?=
 <105645916+tumicdq@users.noreply.github.com>
Date: Thu, 19 May 2022 14:37:10 +0200
Subject: [PATCH 046/178] Merge pull request #21992 from tumicdq:v4l2

* Added support for 4B RGB V4L2 pixel formats

Added support for V4L2_PIX_FMT_XBGR32 and V4L2_PIX_FMT_ABGR32 pixel
formats.

* Added workaround for missing V4L2_PIX_FMT_ABGR32 and V4L2_PIX_FMT_XBGR32
defines
---
 modules/videoio/src/cap_v4l.cpp | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/modules/videoio/src/cap_v4l.cpp b/modules/videoio/src/cap_v4l.cpp
index a9f68e19a5..2fc41ce05e 100644
--- a/modules/videoio/src/cap_v4l.cpp
+++ b/modules/videoio/src/cap_v4l.cpp
@@ -268,6 +268,13 @@ typedef uint32_t __u32;
 #define V4L2_PIX_FMT_Y12 v4l2_fourcc('Y', '1', '2', ' ')
 #endif
 
+#ifndef V4L2_PIX_FMT_ABGR32
+#define V4L2_PIX_FMT_ABGR32  v4l2_fourcc('A', 'R', '2', '4')
+#endif
+#ifndef V4L2_PIX_FMT_XBGR32
+#define V4L2_PIX_FMT_XBGR32  v4l2_fourcc('X', 'R', '2', '4')
+#endif
+
 /* Defaults - If your board can do better, set it here.  Set for the most common type inputs. */
 #define DEFAULT_V4L_WIDTH  640
 #define DEFAULT_V4L_HEIGHT 480
@@ -564,6 +571,8 @@ bool CvCaptureCAM_V4L::autosetup_capture_mode_v4l2()
             V4L2_PIX_FMT_NV21,
             V4L2_PIX_FMT_SBGGR8,
             V4L2_PIX_FMT_SGBRG8,
+            V4L2_PIX_FMT_XBGR32,
+            V4L2_PIX_FMT_ABGR32,
             V4L2_PIX_FMT_SN9C10X,
 #ifdef HAVE_JPEG
             V4L2_PIX_FMT_MJPEG,
@@ -632,6 +641,8 @@ bool CvCaptureCAM_V4L::convertableToRgb() const
     case V4L2_PIX_FMT_Y10:
     case V4L2_PIX_FMT_GREY:
     case V4L2_PIX_FMT_BGR24:
+    case V4L2_PIX_FMT_XBGR32:
+    case V4L2_PIX_FMT_ABGR32:
         return true;
     default:
         break;
@@ -651,6 +662,8 @@ void CvCaptureCAM_V4L::v4l2_create_frame()
         switch (palette) {
         case V4L2_PIX_FMT_BGR24:
         case V4L2_PIX_FMT_RGB24:
+        case V4L2_PIX_FMT_XBGR32:
+        case V4L2_PIX_FMT_ABGR32:
             break;
         case V4L2_PIX_FMT_YUYV:
         case V4L2_PIX_FMT_UYVY:
@@ -1613,6 +1626,10 @@ void CvCaptureCAM_V4L::convertToRgb(const Buffer &currentBuffer)
     case V4L2_PIX_FMT_GREY:
         cv::cvtColor(cv::Mat(imageSize, CV_8UC1, currentBuffer.start), destination, COLOR_GRAY2BGR);
         break;
+    case V4L2_PIX_FMT_XBGR32:
+    case V4L2_PIX_FMT_ABGR32:
+        cv::cvtColor(cv::Mat(imageSize, CV_8UC4, currentBuffer.start), destination, COLOR_BGRA2BGR);
+        break;
     case V4L2_PIX_FMT_BGR24:
     default:
         memcpy((char *)frame.imageData, (char *)currentBuffer.start,

From aaf7f5ae02468795e46b1dd90ad6caf64bbb968f Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <andrey.senyaev@gmail.com>
Date: Fri, 20 May 2022 14:29:43 +0300
Subject: [PATCH 047/178] Move workflows to a dedicated repository

---
 .github/workflows/PR-3.4-ARM64.yaml | 209 ---------------------------
 .github/workflows/PR-3.4-U20.yaml   | 213 ----------------------------
 .github/workflows/PR-3.4-W10.yaml   | 180 -----------------------
 .github/workflows/PR-3.4.yaml       |  16 +++
 4 files changed, 16 insertions(+), 602 deletions(-)
 delete mode 100644 .github/workflows/PR-3.4-ARM64.yaml
 delete mode 100644 .github/workflows/PR-3.4-U20.yaml
 delete mode 100644 .github/workflows/PR-3.4-W10.yaml
 create mode 100644 .github/workflows/PR-3.4.yaml

diff --git a/.github/workflows/PR-3.4-ARM64.yaml b/.github/workflows/PR-3.4-ARM64.yaml
deleted file mode 100644
index 9d8a1037bd..0000000000
--- a/.github/workflows/PR-3.4-ARM64.yaml
+++ /dev/null
@@ -1,209 +0,0 @@
-name: PR:3.4 ARM64
-
-on:
-  pull_request:
-    branches:
-      - 3.4
-
-env:
-  EXTRA_CMAKE_OPTIONS: '-DBUILD_DOCS=ON -DPYTHON_DEFAULT_EXECUTABLE=/usr/bin/python3 -DOPENCV_DOWNLOAD_PATH=/home/ci/binaries_cache -DBUILD_opencv_xfeatures2d=OFF -DBUILD_EXAMPLES=ON -DOPENCV_ENABLE_NONFREE=ON'
-  PR_AUTHOR: ${{ github.event.pull_request.user.login }}
-  PR_AUTHOR_FORK: ${{ github.event.pull_request.head.repo.full_name }}
-  SOURCE_BRANCH_NAME: ${{ github.head_ref }}
-  TARGET_BRANCH_NAME: ${{ github.base_ref }}
-  ANT_HOME: '/usr/share/ant'
-  GIT_CACHE_DOCKER: '/home/ci/git_cache'
-  PYTHONPATH: /opencv-build/python_loader:$PYTHONPATH
-  OPENCV_TEST_DATA_PATH: '/opencv_extra/testdata'
-  OPENCV_DOCKER_WORKDIR: '/opencv'
-  DNN_MODELS: '/home/ci/dnn-models'
-
-jobs:
-  BuildAndTest:
-    runs-on: opencv-cn-lin-arm64
-    defaults:
-      run:
-        shell: bash
-    container:
-      image: quay.io/opencv-ci/opencv-ubuntu:20.04-arm64
-      volumes:
-        - /home/opencv-cn/git_cache:/home/ci/git_cache
-        - /home/opencv-cn/ci_cache/opencv:/home/ci/.ccache
-        - /home/opencv-cn/binaries_cache:/home/ci/binaries_cache
-        - /home/opencv-cn/dnn-models:/home/ci/dnn-models
-    steps:
-    - name: PR info
-      run: |
-        echo "PR Author: ${{ env.PR_AUTHOR }}"
-        echo "PR Author fork: ${{ env.PR_AUTHOR_FORK }}"
-        echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
-        echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
-    - name: Clean
-      run: find ${{ env.OPENCV_DOCKER_WORKDIR }} -mindepth 1 -delete
-    - name: Fetch opencv
-      run: git clone --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE_DOCKER }}/opencv.git https://github.com/opencv/opencv.git ${{ env.OPENCV_DOCKER_WORKDIR }}
-    - name: Merge opencv with ${{ env.SOURCE_BRANCH_NAME }} branch
-      run: |
-        cd ${{ env.OPENCV_DOCKER_WORKDIR }}
-        git config --global --add safe.directory ${{ env.OPENCV_DOCKER_WORKDIR }}
-        git config user.email "opencv.ci"
-        git config user.name "opencv.ci"
-        git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
-    - name: Clone opencv_extra
-      run: git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE_DOCKER }}/opencv_extra.git https://github.com/opencv/opencv_extra.git /opencv_extra
-    - name: Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch
-      run: |
-        OPENCV_EXTRA_FORK=$(git ls-remote --heads "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}") || true
-        if [[ ! -z "$OPENCV_EXTRA_FORK" ]]; then
-          echo "Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch"
-          cd /opencv_extra
-          git config user.email "opencv.ci"
-          git config user.name "opencv.ci"
-          git pull -v "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}"
-        else
-          echo "No merge since ${{ env.PR_AUTHOR }}/opencv_extra does not have branch ${{ env.SOURCE_BRANCH_NAME }}"
-        fi
-    - name: Extra DNN models update
-      run: |
-        DOWNLOAD_MODELS_FILE='download_models.py'
-        LATEST_SAVED_HASH=$(cat ${{ env.DNN_MODELS }}/dnn/latest-hash-${{ env.TARGET_BRANCH_NAME }}.txt)
-        LATEST_HASH=$(sha256sum /opencv_extra/testdata/dnn/$DOWNLOAD_MODELS_FILE | awk '{print $1}')
-        if [[ $LATEST_HASH == $LATEST_SAVED_HASH ]]; then
-          echo "DNN models are up to date"
-          echo "OPENCV_DNN_TEST_DATA_PATH=${{ env.DNN_MODELS }}" >> $GITHUB_ENV
-        else
-          echo "Updating DNN models list"
-          echo "OPENCV_DNN_TEST_DATA_PATH=/home/ci/new-dnn-models" >> $GITHUB_ENV && OPENCV_DNN_TEST_DATA_PATH='/home/ci/new-dnn-models'
-          mkdir -p "$OPENCV_DNN_TEST_DATA_PATH"
-          rsync -a --exclude=$DOWNLOAD_MODELS_FILE ${{ env.DNN_MODELS }}/* $OPENCV_DNN_TEST_DATA_PATH
-          cp /opencv_extra/testdata/dnn/download_models.py $OPENCV_DNN_TEST_DATA_PATH/dnn
-          cd $OPENCV_DNN_TEST_DATA_PATH/dnn && python3 download_models.py
-        fi
-    - name: Configure OpenCV
-      run: |
-        cmake -G Ninja -B /opencv-build ${{ env.EXTRA_CMAKE_OPTIONS }} ${{ env.OPENCV_DOCKER_WORKDIR }}
-    - name: Build OpenCV
-      run: |
-        cd /opencv-build
-        ninja
-    - name: Accuracy:calib3d
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_calib3d
-    - name: Accuracy:core
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_core
-    - name: Accuracy:dnn
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_dnn
-    - name: Accuracy:features2d
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_features2d
-    - name: Accuracy:flann
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_flann
-    - name: Accuracy:highgui
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_highgui
-    - name: Accuracy:imgcodecs
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_imgcodecs
-    - name: Accuracy:imgproc
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_imgproc
-    - name: Accuracy:ml
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_ml
-    - name: Accuracy:objdetect
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_objdetect --gtest_filter="-Objdetect_QRCode_Close.regression/0:Objdetect_QRCode_Close.regression/4"
-    - name: Accuracy:photo
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_photo --gtest_filter="-Photo_CalibrateDebevec.regression"
-    - name: Accuracy:shape
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_shape
-    - name: Accuracy:stitching
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_stitching
-    - name: Accuracy:superres
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_superres
-    - name: Accuracy:video
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_video
-    - name: Accuracy:videoio
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_videoio
-    - name: Accuracy:videostab
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_videostab
-    - name: Performance:calib3d
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_calib3d --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:core
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_core --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:dnn
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_dnn --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:features2d
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_features2d --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:imgcodecs
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_imgcodecs --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:imgproc
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_imgproc --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:objdetect
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_objdetect --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter="-Perf_Objdetect_QRCode.detect/2:Perf_Objdetect_QRCode_Multi.decodeMulti*:Perf_Objdetect_QRCode_Multi.detectMulti*"
-    - name: Performance:photo
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_photo --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:stitching
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_stitching --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:superres
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_superres --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:video
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_video --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:videoio
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_videoio --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Python3
-      run: |
-        cd ${{ env.OPENCV_DOCKER_WORKDIR }}/modules/python/test
-        python3 ./test.py --repo ../../../ -v
-    - name: Java
-      run: cd /opencv-build && xvfb-run -a python3 ${{ env.OPENCV_DOCKER_WORKDIR }}/modules/ts/misc/run.py . -a -t java
-    - name: Save Unit Test Results
-      uses: actions/upload-artifact@v3
-      if: always()
-      with:
-        name: junit-html
-        path: /opencv-build/java_test/testResults/junit-noframes.html
-
-  BuildContrib:
-    runs-on: opencv-cn-lin-arm64
-    defaults:
-      run:
-        shell: bash
-    container:
-      image: quay.io/opencv-ci/opencv-ubuntu:20.04-arm64
-      volumes:
-        - /home/opencv-cn/git_cache:/home/ci/git_cache
-        - /home/opencv-cn/ci_cache/opencv:/home/ci/.ccache
-        - /home/opencv-cn/binaries_cache:/home/ci/binaries_cache
-    steps:
-    - name: PR info
-      run: |
-        echo "PR Author: ${{ env.PR_AUTHOR }}"
-        echo "PR Author fork: ${{ env.PR_AUTHOR_FORK }}"
-        echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
-        echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
-    - name: Clean
-      run: find ${{ env.OPENCV_DOCKER_WORKDIR }} -mindepth 1 -delete
-    - name: Fetch opencv
-      run: git clone --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE_DOCKER }}/opencv.git https://github.com/opencv/opencv.git ${{ env.OPENCV_DOCKER_WORKDIR }}
-    - name: Merge opencv with a test branch
-      run: |
-        cd ${{ env.OPENCV_DOCKER_WORKDIR }}
-        git config --global --add safe.directory ${{ env.OPENCV_DOCKER_WORKDIR }}
-        git config user.email "opencv.ci"
-        git config user.name "opencv.ci"
-        git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
-    - name: Clone opencv_contrib
-      run: git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} https://github.com/opencv/opencv_contrib.git /opencv_contrib
-    - name: Merge opencv_contrib with ${{ env.SOURCE_BRANCH_NAME }} branch
-      run: |
-        OPENCV_CONTRIB_FORK=$(git ls-remote --heads "https://github.com/${{ env.PR_AUTHOR }}/opencv_contrib" "${{ env.SOURCE_BRANCH_NAME }}") || true
-        if [[ ! -z "$OPENCV_CONTRIB_FORK" ]]; then
-          echo "Merge opencv_contrib with ${{ env.SOURCE_BRANCH_NAME }} branch"
-          cd /opencv_contrib
-          git config user.email "opencv.ci"
-          git config user.name "opencv.ci"
-          git pull -v "https://github.com/${{ env.PR_AUTHOR }}/opencv_contrib" "${{ env.SOURCE_BRANCH_NAME }}"
-        else
-          echo "No merge since ${{ env.PR_AUTHOR }}/opencv_contrib does not have branch ${{ env.SOURCE_BRANCH_NAME }}"
-        fi
-    - name: Configure OpenCV Contrib
-      run: |
-        cmake -G Ninja -B /opencv-contrib-build ${{ env.EXTRA_CMAKE_OPTIONS }} -DOPENCV_EXTRA_MODULES_PATH=/opencv_contrib/modules ${{ env.OPENCV_DOCKER_WORKDIR }}
-    - name: Build OpenCV Contrib
-      run: |
-        cd /opencv-contrib-build
-        ninja
diff --git a/.github/workflows/PR-3.4-U20.yaml b/.github/workflows/PR-3.4-U20.yaml
deleted file mode 100644
index 268d9dae8c..0000000000
--- a/.github/workflows/PR-3.4-U20.yaml
+++ /dev/null
@@ -1,213 +0,0 @@
-name: PR:3.4 U20
-
-on:
-  pull_request:
-    branches:
-      - 3.4
-
-env:
-  EXTRA_CMAKE_OPTIONS: '-DBUILD_DOCS=ON -DPYTHON_DEFAULT_EXECUTABLE=/usr/bin/python3 -DOPENCV_DOWNLOAD_PATH=/home/ci/binaries_cache -DBUILD_EXAMPLES=ON -DOPENCV_ENABLE_NONFREE=ON'
-  PR_AUTHOR: ${{ github.event.pull_request.user.login }}
-  PR_AUTHOR_FORK: ${{ github.event.pull_request.head.repo.full_name }}
-  SOURCE_BRANCH_NAME: ${{ github.head_ref }}
-  TARGET_BRANCH_NAME: ${{ github.base_ref }}
-  ANT_HOME: '/usr/share/ant'
-  GIT_CACHE_DOCKER: '/home/ci/git_cache'
-  PYTHONPATH: /opencv-build/python_loader:$PYTHONPATH
-  OPENCV_TEST_DATA_PATH: '/opencv_extra/testdata'
-  OPENCV_DOCKER_WORKDIR: '/opencv'
-  DNN_MODELS: '/home/ci/dnn-models'
-
-jobs:
-  BuildAndTest:
-    runs-on: opencv-cn-lin-x86-64
-    defaults:
-      run:
-        shell: bash
-    container:
-      image: quay.io/opencv-ci/opencv-ubuntu:20.04
-      volumes:
-        - /home/opencv-cn/git_cache:/home/ci/git_cache
-        - /home/opencv-cn/ci_cache/opencv:/home/ci/.ccache
-        - /home/opencv-cn/binaries_cache:/home/ci/binaries_cache
-        - /home/opencv-cn/dnn-models:/home/ci/dnn-models
-    steps:
-    - name: PR info
-      run: |
-        echo "PR Author: ${{ env.PR_AUTHOR }}"
-        echo "PR Author fork: ${{ env.PR_AUTHOR_FORK }}"
-        echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
-        echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
-    - name: Clean
-      run: find ${{ env.OPENCV_DOCKER_WORKDIR }} -mindepth 1 -delete
-    - name: Fetch opencv
-      run: git clone --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE_DOCKER }}/opencv.git https://github.com/opencv/opencv.git ${{ env.OPENCV_DOCKER_WORKDIR }}
-    - name: Merge opencv with ${{ env.SOURCE_BRANCH_NAME }} branch
-      run: |
-        cd ${{ env.OPENCV_DOCKER_WORKDIR }}
-        git config --global --add safe.directory ${{ env.OPENCV_DOCKER_WORKDIR }}
-        git config user.email "opencv.ci"
-        git config user.name "opencv.ci"
-        git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
-    - name: Clone opencv_extra
-      run: git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE_DOCKER }}/opencv_extra.git https://github.com/opencv/opencv_extra.git /opencv_extra
-    - name: Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch
-      run: |
-        OPENCV_EXTRA_FORK=$(git ls-remote --heads "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}") || true
-        if [[ ! -z "$OPENCV_EXTRA_FORK" ]]; then
-          echo "Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch"
-          cd /opencv_extra
-          git config user.email "opencv.ci"
-          git config user.name "opencv.ci"
-          git pull -v "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}"
-        else
-          echo "No merge since ${{ env.PR_AUTHOR }}/opencv_extra does not have branch ${{ env.SOURCE_BRANCH_NAME }}"
-        fi
-    - name: Extra DNN models update
-      run: |
-        DOWNLOAD_MODELS_FILE='download_models.py'
-        LATEST_SAVED_HASH=$(cat ${{ env.DNN_MODELS }}/dnn/latest-hash-${{ env.TARGET_BRANCH_NAME }}.txt)
-        LATEST_HASH=$(sha256sum /opencv_extra/testdata/dnn/$DOWNLOAD_MODELS_FILE | awk '{print $1}')
-        if [[ $LATEST_HASH == $LATEST_SAVED_HASH ]]; then
-          echo "DNN models are up to date"
-          echo "OPENCV_DNN_TEST_DATA_PATH=${{ env.DNN_MODELS }}" >> $GITHUB_ENV
-        else
-          echo "Updating DNN models list"
-          echo "OPENCV_DNN_TEST_DATA_PATH=/home/ci/new-dnn-models" >> $GITHUB_ENV && OPENCV_DNN_TEST_DATA_PATH='/home/ci/new-dnn-models'
-          mkdir -p "$OPENCV_DNN_TEST_DATA_PATH"
-          rsync -a --exclude=$DOWNLOAD_MODELS_FILE ${{ env.DNN_MODELS }}/* $OPENCV_DNN_TEST_DATA_PATH
-          cp /opencv_extra/testdata/dnn/download_models.py $OPENCV_DNN_TEST_DATA_PATH/dnn
-          cd $OPENCV_DNN_TEST_DATA_PATH/dnn && python3 download_models.py
-        fi
-    - name: Configure OpenCV
-      run: |
-        cd /opencv-build
-        cmake -G Ninja ${{ env.EXTRA_CMAKE_OPTIONS }} ${{ env.OPENCV_DOCKER_WORKDIR }}
-    - name: Build OpenCV
-      run: |
-        cd /opencv-build
-        ninja
-    - name: Accuracy:calib3d
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_calib3d
-    - name: Accuracy:core
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_core
-    - name: Accuracy:dnn
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_dnn
-    - name: Accuracy:features2d
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_features2d
-    - name: Accuracy:flann
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_flann
-    - name: Accuracy:highgui
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_highgui
-    - name: Accuracy:imgcodecs
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_imgcodecs
-    - name: Accuracy:imgproc
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_imgproc
-    - name: Accuracy:ml
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_ml
-    - name: Accuracy:objdetect
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_objdetect
-    - name: Accuracy:photo
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_photo
-    - name: Accuracy:shape
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_shape
-    - name: Accuracy:stitching
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_stitching
-    - name: Accuracy:superres
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_superres
-    - name: Accuracy:video
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_video
-    - name: Accuracy:videoio
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_videoio
-    - name: Accuracy:videostab
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_videostab
-    - name: Performance:calib3d
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_calib3d --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:core
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_core --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:dnn
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_dnn --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:features2d
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_features2d --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:imgcodecs
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_imgcodecs --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:imgproc
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_imgproc --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:objdetect
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_objdetect --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:photo
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_photo --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:stitching
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_stitching --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:superres
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_superres --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:video
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_video --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:videoio
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_videoio --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Python3
-      run: |
-        cd ${{ env.OPENCV_DOCKER_WORKDIR }}/modules/python/test
-        python3 ./test.py --repo ../../../ -v
-    - name: Java
-      run: cd /opencv-build && xvfb-run -a python3 ${{ env.OPENCV_DOCKER_WORKDIR }}/modules/ts/misc/run.py . -a -t java
-    - name: Save Unit Test Results
-      uses: actions/upload-artifact@v3
-      if: always()
-      with:
-        name: junit-html
-        path: /opencv-build/java_test/testResults/junit-noframes.html
-    - name: Pylint
-      run: cd /opencv-build && cmake --build . --config release --target check_pylint -- -j4
-
-  BuildContrib:
-    runs-on: opencv-cn-lin-x86-64
-    defaults:
-      run:
-        shell: bash
-    container:
-      image: quay.io/opencv-ci/opencv-ubuntu:20.04
-      volumes:
-        - /home/opencv-cn/git_cache:/home/ci/git_cache
-        - /home/opencv-cn/ci_cache/opencv:/home/ci/.ccache
-        - /home/opencv-cn/binaries_cache:/home/ci/binaries_cache
-    steps:
-    - name: PR info
-      run: |
-        echo "PR Author: ${{ env.PR_AUTHOR }}"
-        echo "PR Author fork: ${{ env.PR_AUTHOR_FORK }}"
-        echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
-        echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
-    - name: Clean
-      run: find ${{ env.OPENCV_DOCKER_WORKDIR }} -mindepth 1 -delete
-    - name: Fetch opencv
-      run: git clone --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE_DOCKER }}/opencv.git https://github.com/opencv/opencv.git ${{ env.OPENCV_DOCKER_WORKDIR }}
-    - name: Merge opencv with a test branch
-      run: |
-        cd ${{ env.OPENCV_DOCKER_WORKDIR }}
-        git config --global --add safe.directory ${{ env.OPENCV_DOCKER_WORKDIR }}
-        git config user.email "opencv.ci"
-        git config user.name "opencv.ci"
-        git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
-    - name: Clone opencv_contrib
-      run: git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} https://github.com/opencv/opencv_contrib.git /opencv_contrib
-    - name: Merge opencv_contrib with ${{ env.SOURCE_BRANCH_NAME }} branch
-      run: |
-        OPENCV_CONTRIB_FORK=$(git ls-remote --heads "https://github.com/${{ env.PR_AUTHOR }}/opencv_contrib" "${{ env.SOURCE_BRANCH_NAME }}") || true
-        if [[ ! -z "$OPENCV_CONTRIB_FORK" ]]; then
-          echo "Merge opencv_contrib with ${{ env.SOURCE_BRANCH_NAME }} branch"
-          cd /opencv_contrib
-          git config user.email "opencv.ci"
-          git config user.name "opencv.ci"
-          git pull -v "https://github.com/${{ env.PR_AUTHOR }}/opencv_contrib" "${{ env.SOURCE_BRANCH_NAME }}"
-        else
-          echo "No merge since ${{ env.PR_AUTHOR }}/opencv_contrib does not have branch ${{ env.SOURCE_BRANCH_NAME }}"
-        fi
-    - name: Configure OpenCV Contrib
-      run: |
-        cd /opencv-contrib-build
-        cmake -G Ninja ${{ env.EXTRA_CMAKE_OPTIONS }} -DOPENCV_EXTRA_MODULES_PATH=/opencv_contrib/modules ${{ env.OPENCV_DOCKER_WORKDIR }}
-    - name: Build OpenCV Contrib
-      run: |
-        cd /opencv-contrib-build
-        ninja
diff --git a/.github/workflows/PR-3.4-W10.yaml b/.github/workflows/PR-3.4-W10.yaml
deleted file mode 100644
index 4aad117d0f..0000000000
--- a/.github/workflows/PR-3.4-W10.yaml
+++ /dev/null
@@ -1,180 +0,0 @@
-name: PR:3.4 W10
-
-on:
-  pull_request:
-    branches:
-      - 3.4
-
-env:
-  EXTRA_CMAKE_OPTIONS: '-DCL_Z_OPTION=/Z7 -DOPENCV_DOWNLOAD_PATH=%BINARIES_CACHE% -DBUILD_EXAMPLES=ON -DOPENCV_ENABLE_NONFREE=ON -DCMAKE_BUILD_TYPE=Release'
-  OPENCV_TEST_DATA_PATH: ${{ github.workspace }}\opencv_extra\testdata
-  PR_AUTHOR: ${{ github.event.pull_request.user.login }}
-  PR_AUTHOR_FORK: ${{ github.event.pull_request.head.repo.full_name }}
-  SOURCE_BRANCH_NAME: ${{ github.head_ref }}
-  TARGET_BRANCH_NAME: ${{ github.base_ref }}
-  GTEST_FILTER_STRING: '-Samples.findFile'
-
-jobs:
-  BuildAndTest:
-    runs-on: opencv-cn-win
-    defaults:
-      run:
-        shell: cmd
-    steps:
-    - name: PR info
-      run: |
-        echo "PR Author: ${{ env.PR_AUTHOR }}"
-        echo "PR Author fork: ${{ env.PR_AUTHOR_FORK }}"
-        echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
-        echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
-    - name: Clean
-      run: cd ${{ github.workspace }} && rm -rf *
-    - name: Fetch opencv
-      run: cd ${{ github.workspace }} && git clone --branch ${{ env.TARGET_BRANCH_NAME }} --reference %GIT_CACHE%\opencv.git git@github.com:opencv/opencv.git
-    - name: Merge opencv with ${{ env.SOURCE_BRANCH_NAME }} branch
-      run: |
-        cd ${{ github.workspace }}\opencv
-        git config user.email "opencv.ci"
-        git config user.name "opencv.ci"
-        git pull -v "git@github.com:${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
-    - name: Fetch opencv_extra
-      run: cd ${{ github.workspace }} && git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --reference %GIT_CACHE%\opencv_extra.git git@github.com:opencv/opencv_extra.git
-    - name: Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch
-      shell: bash
-      run: |
-        OPENCV_EXTRA_FORK=$(git ls-remote --heads "git@github.com:/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}") || true
-        if [[ ! -z "$OPENCV_EXTRA_FORK" ]]; then
-          echo "Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch"
-          cd opencv_extra
-          git config user.email "opencv.ci"
-          git config user.name "opencv.ci"
-          git pull -v "git@github.com:${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}"
-        else
-          echo "No merge since ${{ env.PR_AUTHOR }}/opencv_extra does not have branch ${{ env.SOURCE_BRANCH_NAME }}"
-        fi
-    - name: Configure OpenCV
-      run: |
-        mkdir ${{ github.workspace }}\opencv-build && cd ${{ github.workspace }}\opencv-build
-        call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat"
-        cmake -G Ninja ${{ env.EXTRA_CMAKE_OPTIONS }} ${{ github.workspace }}\opencv
-    - name: Build OpenCV
-      run: |
-        cd ${{ github.workspace }}\opencv-build
-        call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat"
-        ninja
-    - name: Accuracy:calib3d
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_calib3d.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:core
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_core.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:dnn
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_dnn.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:features2d
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_features2d.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:flann
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_flann.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:highgui
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_highgui.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:imgcodecs
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_imgcodecs.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:imgproc
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_imgproc.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:ml
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_ml.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:objdetect
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_objdetect.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:photo
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_photo.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:shape
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_shape.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:stitching
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_stitching.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:superres
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_superres.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:video
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_video.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:videoio
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_videoio.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:videostab
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_videostab.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Performance:calib3d
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_calib3d.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Performance:core
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_core.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Performance:dnn
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_dnn.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Performance:features2d
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_features2d.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Performance:imgcodecs
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_imgcodecs.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Performance:imgproc
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_imgproc.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Performance:objdetect
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_objdetect.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Performance:photo
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_photo.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Performance:stitching
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_stitching.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Performance:superres
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_superres.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Performance:video
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_video.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Performance:videoio
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_videoio.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Python3
-      run: |
-        cd ${{ github.workspace }}\opencv\modules\python\test
-        set PYTHONPATH=%PYTHONPATH%;${{ github.workspace }}\opencv-build\python_loader;${{ github.workspace }}\opencv-build\lib\python3
-        set PATH=%PATH%;${{ github.workspace }}\opencv-build\bin;${{ github.workspace }}\opencv-build\lib\python3
-        python test.py --repo ..\..\..\ -v
-    - name: Java
-      run: |
-        cd ${{ github.workspace }}\opencv-build
-        set PATH=%PATH%;${{ github.workspace }}\opencv-build\bin
-        ${{ github.workspace }}\opencv\modules\ts\misc\run.py . -a -t java
-  BuildContrib:
-    runs-on: opencv-cn-win
-    defaults:
-      run:
-        shell: cmd
-    steps:
-    - name: PR info
-      run: |
-        echo "PR Author: ${{ env.PR_AUTHOR }}"
-        echo "PR Author fork: ${{ env.PR_AUTHOR_FORK }}"
-        echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
-        echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
-    - name: Clean
-      run: cd ${{ github.workspace }} && rm -rf *
-    - name: Fetch opencv
-      run: cd ${{ github.workspace }} && git clone --branch ${{ env.TARGET_BRANCH_NAME }} --reference %GIT_CACHE%\opencv.git git@github.com:opencv/opencv.git
-    - name: Merge opencv with ${{ env.SOURCE_BRANCH_NAME }} branch
-      run: |
-        cd ${{ github.workspace }}\opencv
-        git config user.email "opencv.ci"
-        git config user.name "opencv.ci"
-        git pull -v "git@github.com:${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
-    - name: Fetch opencv_contrib
-      run: cd ${{ github.workspace }} && git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --reference %GIT_CACHE%\opencv_contrib.git --depth 1 git@github.com:opencv/opencv_contrib.git
-    - name: Merge opencv_contrib with ${{ env.SOURCE_BRANCH_NAME }} branch
-      shell: bash
-      run: |
-        OPENCV_CONTRIB_FORK=$(git ls-remote --heads "git@github.com:${{ env.PR_AUTHOR }}/opencv_contrib" "${{ env.SOURCE_BRANCH_NAME }}") || true
-        if [[ ! -z "$OPENCV_CONTRIB_FORK" ]]; then
-          echo "Merge opencv_contrib with ${{ env.SOURCE_BRANCH_NAME }} branch"
-          cd opencv_contrib
-          git config user.email "opencv.ci"
-          git config user.name "opencv.ci"
-          git pull -v "git@github.com:${{ env.PR_AUTHOR }}/opencv_contrib" "${{ env.SOURCE_BRANCH_NAME }}"
-        else
-          echo "No merge since ${{ env.PR_AUTHOR }}/opencv_contrib does not have branch ${{ env.SOURCE_BRANCH_NAME }}"
-        fi
-    - name: Configure OpenCV Contrib
-      run: |
-        mkdir ${{ github.workspace }}\opencv-contrib-build && cd ${{ github.workspace }}\opencv-contrib-build
-        call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat"
-        cmake -G Ninja ${{ env.EXTRA_CMAKE_OPTIONS }} -DOPENCV_EXTRA_MODULES_PATH=${{ github.workspace }}\opencv_contrib\modules ${{ github.workspace }}\opencv
-    - name: Build OpenCV Contrib
-      run: |
-        cd ${{ github.workspace }}\opencv-contrib-build
-        call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat"
-        ninja
diff --git a/.github/workflows/PR-3.4.yaml b/.github/workflows/PR-3.4.yaml
new file mode 100644
index 0000000000..a1efca8c26
--- /dev/null
+++ b/.github/workflows/PR-3.4.yaml
@@ -0,0 +1,16 @@
+name: PR:3.4
+
+on:
+  pull_request:
+    branches:
+      - 3.4
+
+jobs:
+  ARM64:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-3.4-ARM64.yaml@main
+
+  U20:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-3.4-U20.yaml@main
+
+  W10:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-3.4-W10.yaml@main
\ No newline at end of file

From 3d207ccf11196f0ee349bb48f318be1f3ca53517 Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <andrey.senyaev@gmail.com>
Date: Fri, 20 May 2022 19:46:56 +0300
Subject: [PATCH 048/178] Move workflows to a dedicated repository for 4.x
 branch

---
 .github/workflows/PR-4.x-ARM64.yaml       | 181 ---------------------
 .github/workflows/PR-4.x-U20.yaml         | 189 ----------------------
 .github/workflows/PR-4.x-W10.yaml         | 176 --------------------
 .github/workflows/PR-4.x.yaml             |  19 +++
 .github/workflows/timvx_backend_tests.yml | 101 ------------
 5 files changed, 19 insertions(+), 647 deletions(-)
 delete mode 100644 .github/workflows/PR-4.x-ARM64.yaml
 delete mode 100644 .github/workflows/PR-4.x-U20.yaml
 delete mode 100644 .github/workflows/PR-4.x-W10.yaml
 create mode 100644 .github/workflows/PR-4.x.yaml
 delete mode 100644 .github/workflows/timvx_backend_tests.yml

diff --git a/.github/workflows/PR-4.x-ARM64.yaml b/.github/workflows/PR-4.x-ARM64.yaml
deleted file mode 100644
index 72af61f412..0000000000
--- a/.github/workflows/PR-4.x-ARM64.yaml
+++ /dev/null
@@ -1,181 +0,0 @@
-name: PR:4.x ARM64
-
-on:
-  pull_request:
-    branches:
-      - 4.x
-
-env:
-  EXTRA_CMAKE_OPTIONS: '-DBUILD_DOCS=ON -DPYTHON_DEFAULT_EXECUTABLE=/usr/bin/python3 -DBUILD_opencv_xfeatures2d=OFF -DBUILD_EXAMPLES=ON -DOPENCV_ENABLE_NONFREE=ON -DENABLE_CCACHE=OFF'
-  OPENCV_TEST_DATA_PATH: '/opencv_extra/testdata'
-  OPENCV_DOCKER_WORKDIR: '/__w/opencv/opencv'
-  PR_AUTHOR: ${{ github.event.pull_request.user.login }}
-  PR_AUTHOR_FORK: ${{ github.event.pull_request.head.repo.full_name }}
-  SOURCE_BRANCH_NAME: ${{ github.head_ref }}
-  TARGET_BRANCH_NAME: ${{ github.base_ref }}
-  ANT_HOME: '/usr/share/ant'
-  PYTHONPATH: /opencv-build/python_loader:$PYTHONPATH
-
-jobs:
-  BuildAndTest:
-    runs-on: opencv-cn-lin-arm64
-    defaults:
-      run:
-        shell: bash
-    container:
-      image: quay.io/opencv-ci/opencv-ubuntu:20.04-arm64
-    steps:
-    - name: PR info
-      run: |
-        echo "PR Author: ${{ env.PR_AUTHOR }}"
-        echo "PR Author fork: ${{ env.PR_AUTHOR_FORK }}"
-        echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
-        echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
-    - name: Clean
-      run: find . -mindepth 1 -delete
-    - name: Fetch opencv
-      uses: actions/checkout@v3
-      with:
-        repository: opencv/opencv
-        ref: ${{ env.TARGET_BRANCH_NAME }}
-        fetch-depth: 0
-    - name: Merge opencv with ${{ env.SOURCE_BRANCH_NAME }} branch
-      run: |
-        cd ${{ env.OPENCV_DOCKER_WORKDIR }}
-        git config --global --add safe.directory ${{ env.OPENCV_DOCKER_WORKDIR }}
-        git config user.email "opencv.ci"
-        git config user.name "opencv.ci"
-        git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
-    - name: Clone opencv_extra
-      run: git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} https://github.com/opencv/opencv_extra.git /opencv_extra
-    - name: Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch
-      run: |
-        OPENCV_EXTRA_FORK=$(git ls-remote --heads "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}") || true
-        if [[ ! -z "$OPENCV_EXTRA_FORK" ]]; then
-          echo "Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch"
-          cd /opencv_extra
-          git config user.email "opencv.ci"
-          git config user.name "opencv.ci"
-          git pull -v "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}"
-        else
-          echo "No merge since ${{ env.PR_AUTHOR }}/opencv_extra does not have branch ${{ env.SOURCE_BRANCH_NAME }}"
-        fi
-    - name: Configure OpenCV
-      run: |
-        cmake -G Ninja -B /opencv-build ${{ env.EXTRA_CMAKE_OPTIONS }} ${{ env.OPENCV_DOCKER_WORKDIR }}
-    - name: Build OpenCV
-      run: |
-        cd /opencv-build
-        ninja
-    - name: Accuracy:calib3d
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_calib3d
-    - name: Accuracy:core
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_core
-    - name: Accuracy:dnn
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_dnn
-    - name: Accuracy:features2d
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_features2d
-    - name: Accuracy:flann
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_flann
-    - name: Accuracy:highgui
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_highgui
-    - name: Accuracy:imgcodecs
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_imgcodecs
-    - name: Accuracy:imgproc
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_imgproc
-    - name: Accuracy:ml
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_ml
-    - name: Accuracy:objdetect
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_objdetect --gtest_filter="-Objdetect_QRCode_Close.regression/0:Objdetect_QRCode_Close.regression/4"
-    - name: Accuracy:photo
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_photo --gtest_filter="-Photo_CalibrateDebevec.regression"
-    - name: Accuracy:stitching
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_stitching
-    - name: Accuracy:video
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_video
-    - name: Accuracy:videoio
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_videoio
-    - name: Performance:calib3d
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_calib3d --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:core
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_core --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:dnn
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_dnn --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:features2d
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_features2d --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:imgcodecs
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_imgcodecs --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:imgproc
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_imgproc --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:objdetect
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_objdetect --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter="-Perf_Objdetect_QRCode.detect/2:Perf_Objdetect_QRCode_Multi.decodeMulti*:Perf_Objdetect_QRCode_Multi.detectMulti*"
-    - name: Performance:photo
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_photo --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:stitching
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_stitching --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:video
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_video --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:videoio
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_videoio --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Python3
-      run: |
-        cd ${{ env.OPENCV_DOCKER_WORKDIR }}/modules/python/test
-        python3 ./test.py --repo ../../../ -v
-    - name: Java
-      run: cd /opencv-build && xvfb-run -a python3 ${{ env.OPENCV_DOCKER_WORKDIR }}/modules/ts/misc/run.py . -a -t java
-    - name: Save Unit Test Results
-      uses: actions/upload-artifact@v3
-      if: always()
-      with:
-        name: junit-html
-        path: /opencv-build/java_test/testResults/junit-noframes.html
-
-  BuildContrib:
-    runs-on: opencv-cn-lin-arm64
-    defaults:
-      run:
-        shell: bash
-    container:
-      image: docker.io/yuentau/ocv_ubuntu:20.04-arm64
-    steps:
-    - name: PR info
-      run: |
-        echo "PR Author: ${{ env.PR_AUTHOR }}"
-        echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
-        echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
-    - name: Clean
-      run: find . -mindepth 1 -delete
-    - name: Fetch opencv
-      uses: actions/checkout@v3
-      with:
-        repository: opencv/opencv
-        ref: ${{ env.TARGET_BRANCH_NAME }}
-        fetch-depth: 0
-    - name: Merge opencv with a test branch
-      run: |
-        cd ${{ env.OPENCV_DOCKER_WORKDIR }}
-        git config --global --add safe.directory ${{ env.OPENCV_DOCKER_WORKDIR }}
-        git config user.email "opencv.ci"
-        git config user.name "opencv.ci"
-        git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
-    - name: Clone opencv_contrib
-      run: git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} https://github.com/opencv/opencv_contrib.git /opencv_contrib
-    - name: Merge opencv_contrib with ${{ env.SOURCE_BRANCH_NAME }} branch
-      run: |
-        OPENCV_CONTRIB_FORK=$(git ls-remote --heads "https://github.com/${{ env.PR_AUTHOR }}/opencv_contrib" "${{ env.SOURCE_BRANCH_NAME }}") || true
-        if [[ ! -z "$OPENCV_CONTRIB_FORK" ]]; then
-          echo "Merge opencv_contrib with ${{ env.SOURCE_BRANCH_NAME }} branch"
-          cd /opencv_contrib
-          git config user.email "opencv.ci"
-          git config user.name "opencv.ci"
-          git pull -v "https://github.com/${{ env.PR_AUTHOR }}/opencv_contrib" "${{ env.SOURCE_BRANCH_NAME }}"
-        else
-          echo "No merge since ${{ env.PR_AUTHOR }}/opencv_contrib does not have branch ${{ env.SOURCE_BRANCH_NAME }}"
-        fi
-    - name: Configure OpenCV Contrib
-      run: |
-        cmake -G Ninja -B /opencv-contrib-build ${{ env.EXTRA_CMAKE_OPTIONS }} -DOPENCV_EXTRA_MODULES_PATH=/opencv_contrib/modules ${{ env.OPENCV_DOCKER_WORKDIR }}
-    - name: Build OpenCV Contrib
-      run: |
-        cd /opencv-contrib-build
-        ninja
diff --git a/.github/workflows/PR-4.x-U20.yaml b/.github/workflows/PR-4.x-U20.yaml
deleted file mode 100644
index 10b0847bab..0000000000
--- a/.github/workflows/PR-4.x-U20.yaml
+++ /dev/null
@@ -1,189 +0,0 @@
-name: PR:4.x U20
-
-on:
-  pull_request:
-    branches:
-      - 4.x
-
-env:
-  EXTRA_CMAKE_OPTIONS: '-DBUILD_DOCS=ON -DPYTHON_DEFAULT_EXECUTABLE=/usr/bin/python3 -DOPENCV_DOWNLOAD_PATH=/home/ci/binaries_cache -DBUILD_EXAMPLES=ON -DOPENCV_ENABLE_NONFREE=ON'
-  PR_AUTHOR: ${{ github.event.pull_request.user.login }}
-  PR_AUTHOR_FORK: ${{ github.event.pull_request.head.repo.full_name }}
-  SOURCE_BRANCH_NAME: ${{ github.head_ref }}
-  TARGET_BRANCH_NAME: ${{ github.base_ref }}
-  ANT_HOME: /usr/share/ant
-  GIT_CACHE_DOCKER: /home/ci/git_cache
-  PYTHONPATH: /opencv-build/python_loader:$PYTHONPATH
-  OPENCV_TEST_DATA_PATH: /opencv_extra/testdata
-  OPENCV_DOCKER_WORKDIR: /opencv
-
-jobs:
-  BuildAndTest:
-    runs-on: opencv-cn-lin-x86-64
-    defaults:
-      run:
-        shell: bash
-    container:
-      image: quay.io/asenyaev/opencv-ubuntu:20.04
-      volumes:
-        - /home/opencv-cn/git_cache:/home/ci/git_cache
-        - /home/opencv-cn/ci_cache/opencv:/home/ci/.ccache
-        - /home/opencv-cn/binaries_cache:/home/ci/binaries_cache
-    steps:
-    - name: PR info
-      run: |
-        echo "PR Author: ${{ env.PR_AUTHOR }}"
-        echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
-        echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
-    - name: Clean
-      run: find ${{ env.OPENCV_DOCKER_WORKDIR }} -mindepth 1 -delete
-    - name: Fetch opencv
-      run: git clone --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE_DOCKER }}/opencv.git https://github.com/opencv/opencv.git ${{ env.OPENCV_DOCKER_WORKDIR }}
-    - name: Merge opencv with ${{ env.SOURCE_BRANCH_NAME }} branch
-      run: |
-        cd ${{ env.OPENCV_DOCKER_WORKDIR }}
-        git config --global --add safe.directory ${{ env.OPENCV_DOCKER_WORKDIR }}
-        git config user.email "opencv.ci"
-        git config user.name "opencv.ci"
-        git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
-    - name: Clone opencv_extra
-      run: git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE_DOCKER }}/opencv_extra.git https://github.com/opencv/opencv_extra.git /opencv_extra
-    - name: Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch
-      run: |
-        OPENCV_EXTRA_FORK=$(git ls-remote --heads "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}") || true
-        if [[ ! -z "$OPENCV_EXTRA_FORK" ]]; then
-          echo "Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch"
-          cd /opencv_extra
-          git config user.email "opencv.ci"
-          git config user.name "opencv.ci"
-          git pull -v "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}"
-        else
-          echo "No merge since ${{ env.PR_AUTHOR }}/opencv_extra does not have branch ${{ env.SOURCE_BRANCH_NAME }}"
-        fi
-    - name: Configure OpenCV
-      run: |
-        cd /opencv-build
-        cmake -G Ninja ${{ env.EXTRA_CMAKE_OPTIONS }} ${{ env.OPENCV_DOCKER_WORKDIR }}
-    - name: Build OpenCV
-      run: |
-        cd /opencv-build
-        ninja
-    - name: Accuracy:calib3d
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_calib3d
-    - name: Accuracy:core
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_core
-    - name: Accuracy:dnn
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_dnn
-    - name: Accuracy:features2d
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_features2d
-    - name: Accuracy:flann
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_flann
-    - name: Accuracy:gapi
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_gapi
-    - name: Accuracy:highgui
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_highgui
-    - name: Accuracy:imgcodecs
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_imgcodecs
-    - name: Accuracy:imgproc
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_imgproc
-    - name: Accuracy:ml
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_ml
-    - name: Accuracy:objdetect
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_objdetect
-    - name: Accuracy:photo
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_photo
-    - name: Accuracy:stitching
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_stitching
-    - name: Accuracy:video
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_video
-    - name: Accuracy:videoio
-      run: cd /opencv-build && xvfb-run -a bin/opencv_test_videoio
-    - name: Performance:calib3d
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_calib3d --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:core
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_core --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:dnn
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_dnn --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:features2d
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_features2d --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:gapi
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_gapi --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:imgcodecs
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_imgcodecs --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:imgproc
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_imgproc --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:objdetect
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_objdetect --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:photo
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_photo --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:stitching
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_stitching --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:video
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_video --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Performance:videoio
-      run: cd /opencv-build && xvfb-run -a bin/opencv_perf_videoio --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1
-    - name: Python3
-      run: |
-        cd ${{ env.OPENCV_DOCKER_WORKDIR }}/modules/python/test
-        python3 ./test.py --repo ../../../ -v
-    - name: Java
-      run: cd /opencv-build && xvfb-run -a python3 ${{ env.OPENCV_DOCKER_WORKDIR }}/modules/ts/misc/run.py . -a -t java
-    - name: Save Unit Test Results
-      uses: actions/upload-artifact@v3
-      if: always()
-      with:
-        name: junit-html
-        path: /opencv-build/java_test/testResults/junit-noframes.html
-    - name: Pylint
-      run: cd /opencv-build && cmake --build . --config release --target check_pylint -- -j4
-
-  BuildContrib:
-    runs-on: opencv-cn-lin-x86-64
-    defaults:
-      run:
-        shell: bash
-    container:
-      image: quay.io/asenyaev/opencv-ubuntu:20.04
-      volumes:
-        - /home/opencv-cn/git_cache:/home/ci/git_cache
-        - /home/opencv-cn/ci_cache/opencv:/home/ci/.ccache
-        - /home/opencv-cn/binaries_cache:/home/ci/binaries_cache
-    steps:
-    - name: PR info
-      run: |
-        echo "PR Author: ${{ env.PR_AUTHOR }}"
-        echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
-        echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
-    - name: Clean
-      run: find ${{ env.OPENCV_DOCKER_WORKDIR }} -mindepth 1 -delete
-    - name: Fetch opencv
-      run: git clone --branch ${{ env.TARGET_BRANCH_NAME }} --reference ${{ env.GIT_CACHE_DOCKER }}/opencv.git https://github.com/opencv/opencv.git ${{ env.OPENCV_DOCKER_WORKDIR }}
-    - name: Merge opencv with a test branch
-      run: |
-        cd ${{ env.OPENCV_DOCKER_WORKDIR }}
-        git config --global --add safe.directory ${{ env.OPENCV_DOCKER_WORKDIR }}
-        git config user.email "opencv.ci"
-        git config user.name "opencv.ci"
-        git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
-    - name: Clone opencv_contrib
-      run: git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} https://github.com/opencv/opencv_contrib.git /opencv_contrib
-    - name: Merge opencv_contrib with ${{ env.SOURCE_BRANCH_NAME }} branch
-      run: |
-        OPENCV_CONTRIB_FORK=$(git ls-remote --heads "https://github.com/${{ env.PR_AUTHOR }}/opencv_contrib" "${{ env.SOURCE_BRANCH_NAME }}") || true
-        if [[ ! -z "$OPENCV_CONTRIB_FORK" ]]; then
-          echo "Merge opencv_contrib with ${{ env.SOURCE_BRANCH_NAME }} branch"
-          cd /opencv_contrib
-          git config user.email "opencv.ci"
-          git config user.name "opencv.ci"
-          git pull -v "https://github.com/${{ env.PR_AUTHOR }}/opencv_contrib" "${{ env.SOURCE_BRANCH_NAME }}"
-        else
-          echo "No merge since ${{ env.PR_AUTHOR }}/opencv_contrib does not have branch ${{ env.SOURCE_BRANCH_NAME }}"
-        fi
-    - name: Configure OpenCV Contrib
-      run: |
-        cd /opencv-contrib-build
-        cmake -G Ninja ${{ env.EXTRA_CMAKE_OPTIONS }} -DOPENCV_EXTRA_MODULES_PATH=/opencv_contrib/modules ${{ env.OPENCV_DOCKER_WORKDIR }}
-    - name: Build OpenCV Contrib
-      run: |
-        cd /opencv-contrib-build
-        ninja
diff --git a/.github/workflows/PR-4.x-W10.yaml b/.github/workflows/PR-4.x-W10.yaml
deleted file mode 100644
index 8ebd8c782e..0000000000
--- a/.github/workflows/PR-4.x-W10.yaml
+++ /dev/null
@@ -1,176 +0,0 @@
-name: PR:4.x W10
-
-on:
-  pull_request:
-    branches:
-      - 4.x
-
-env:
-  EXTRA_CMAKE_OPTIONS: '-DCL_Z_OPTION=/Z7 -DOPENCV_DOWNLOAD_PATH=%BINARIES_CACHE% -DBUILD_EXAMPLES=ON -DOPENCV_ENABLE_NONFREE=ON -DCMAKE_BUILD_TYPE=Release'
-  OPENCV_TEST_DATA_PATH: ${{ github.workspace }}\opencv_extra\testdata
-  PR_AUTHOR: ${{ github.event.pull_request.user.login }}
-  PR_AUTHOR_FORK: ${{ github.event.pull_request.head.repo.full_name }}
-  SOURCE_BRANCH_NAME: ${{ github.head_ref }}
-  TARGET_BRANCH_NAME: ${{ github.base_ref }}
-  GTEST_FILTER_STRING: '-Samples.findFile:videoio/videocapture_acceleration.read/122:videoio/videocapture_acceleration.read/126'
-
-jobs:
-  BuildAndTest:
-    runs-on: opencv-cn-win
-    defaults:
-      run:
-        shell: cmd
-    steps:
-    - name: PR info
-      run: |
-        echo "PR Author: ${{ env.PR_AUTHOR }}"
-        echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
-        echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
-    - name: Clean
-      run: cd ${{ github.workspace }} && rm -rf *
-    - name: Fetch opencv
-      run: cd ${{ github.workspace }} && git clone --branch ${{ env.TARGET_BRANCH_NAME }} --reference %GIT_CACHE%\opencv.git git@github.com:opencv/opencv.git
-    - name: Merge opencv with ${{ env.SOURCE_BRANCH_NAME }} branch
-      run: |
-        cd ${{ github.workspace }}\opencv
-        git config user.email "opencv.ci"
-        git config user.name "opencv.ci"
-        git pull -v "git@github.com:${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
-    - name: Fetch opencv_extra
-      run: cd ${{ github.workspace }} && git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --reference %GIT_CACHE%\opencv_extra.git git@github.com:opencv/opencv_extra.git
-    - name: Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch
-      shell: bash
-      run: |
-        OPENCV_EXTRA_FORK=$(git ls-remote --heads "git@github.com:/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}") || true
-        if [[ ! -z "$OPENCV_EXTRA_FORK" ]]; then
-          echo "Merge opencv_extra with ${{ env.SOURCE_BRANCH_NAME }} branch"
-          cd opencv_extra
-          git config user.email "opencv.ci"
-          git config user.name "opencv.ci"
-          git pull -v "git@github.com:${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}"
-        else
-          echo "No merge since ${{ env.PR_AUTHOR }}/opencv_extra does not have branch ${{ env.SOURCE_BRANCH_NAME }}"
-        fi
-    - name: Configure OpenCV
-      run: |
-        mkdir ${{ github.workspace }}\opencv-build && cd ${{ github.workspace }}\opencv-build
-        call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat"
-        cmake -G Ninja ${{ env.EXTRA_CMAKE_OPTIONS }} ${{ github.workspace }}\opencv
-    - name: Build OpenCV
-      run: |
-        cd ${{ github.workspace }}\opencv-build
-        call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat"
-        ninja
-    - name: Accuracy:calib3d
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_calib3d.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:core
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_core.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:dnn
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_dnn.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:features2d
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_features2d.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:flann
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_flann.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:gapi
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_gapi.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:highgui
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_highgui.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:imgcodecs
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_imgcodecs.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:imgproc
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_imgproc.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:ml
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_ml.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:objdetect
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_objdetect.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:photo
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_photo.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:stitching
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_stitching.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:video
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_video.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Accuracy:videoio
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_test_videoio.exe --skip_unstable --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Performance:calib3d
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_calib3d.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Performance:core
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_core.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Performance:dnn
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_dnn.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Performance:features2d
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_features2d.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Performance:gapi
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_gapi.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Performance:imgcodecs
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_imgcodecs.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Performance:imgproc
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_imgproc.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Performance:objdetect
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_objdetect.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Performance:photo
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_photo.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Performance:stitching
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_stitching.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Performance:video
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_video.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Performance:videoio
-      run: cd ${{ github.workspace }}\opencv-build && bin\opencv_perf_videoio.exe --perf_impl=plain --perf_min_samples=1 --perf_force_samples=1 --perf_verify_sanity --skip_unstable=1 --gtest_filter=${{ env.GTEST_FILTER_STRING }}
-    - name: Python3
-      run: |
-        cd ${{ github.workspace }}\opencv\modules\python\test
-        set PYTHONPATH=%PYTHONPATH%;${{ github.workspace }}\opencv-build\python_loader;${{ github.workspace }}\opencv-build\lib\python3
-        set PATH=%PATH%;${{ github.workspace }}\opencv-build\bin;${{ github.workspace }}\opencv-build\lib\python3
-        python test.py --repo ..\..\..\ -v
-    - name: Java
-      run: |
-        cd ${{ github.workspace }}\opencv-build
-        set PATH=%PATH%;${{ github.workspace }}\opencv-build\bin
-        ${{ github.workspace }}\opencv\modules\ts\misc\run.py . -a -t java
-
-
-  BuildContrib:
-    runs-on: opencv-cn-win
-    defaults:
-      run:
-        shell: cmd
-    steps:
-    - name: PR info
-      run: |
-        echo "PR Author: ${{ env.PR_AUTHOR }}"
-        echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
-        echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
-    - name: Clean
-      run: cd ${{ github.workspace }} && rm -rf *
-    - name: Fetch opencv
-      run: cd ${{ github.workspace }} && git clone --branch ${{ env.TARGET_BRANCH_NAME }} --reference %GIT_CACHE%\opencv.git git@github.com:opencv/opencv.git
-    - name: Merge opencv with ${{ env.SOURCE_BRANCH_NAME }} branch
-      run: |
-        cd ${{ github.workspace }}\opencv
-        git config user.email "opencv.ci"
-        git config user.name "opencv.ci"
-        git pull -v "git@github.com:${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
-    - name: Fetch opencv_contrib
-      run: cd ${{ github.workspace }} && git clone --single-branch --branch ${{ env.TARGET_BRANCH_NAME }} --reference %GIT_CACHE%\opencv_contrib.git --depth 1 git@github.com:opencv/opencv_contrib.git
-    - name: Merge opencv_contrib with ${{ env.SOURCE_BRANCH_NAME }} branch
-      shell: bash
-      run: |
-        OPENCV_CONTRIB_FORK=$(git ls-remote --heads "git@github.com:${{ env.PR_AUTHOR }}/opencv_contrib" "${{ env.SOURCE_BRANCH_NAME }}") || true
-        if [[ ! -z "$OPENCV_CONTRIB_FORK" ]]; then
-          echo "Merge opencv_contrib with ${{ env.SOURCE_BRANCH_NAME }} branch"
-          cd /opencv_contrib
-          git config user.email "opencv.ci"
-          git config user.name "opencv.ci"
-          git pull -v "git@github.com:${{ env.PR_AUTHOR }}/opencv_contrib" "${{ env.SOURCE_BRANCH_NAME }}"
-        else
-          echo "No merge since ${{ env.PR_AUTHOR }}/opencv_contrib does not have branch ${{ env.SOURCE_BRANCH_NAME }}"
-        fi
-    - name: Configure OpenCV Contrib
-      run: |
-        mkdir ${{ github.workspace }}\opencv-contrib-build && cd ${{ github.workspace }}\opencv-contrib-build
-        call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat"
-        cmake -G Ninja ${{ env.EXTRA_CMAKE_OPTIONS }} -DOPENCV_EXTRA_MODULES_PATH=${{ github.workspace }}\opencv_contrib\modules ${{ github.workspace }}\opencv
-    - name: Build OpenCV Contrib
-      run: |
-        cd ${{ github.workspace }}\opencv-contrib-build
-        call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat"
-        ninja
diff --git a/.github/workflows/PR-4.x.yaml b/.github/workflows/PR-4.x.yaml
new file mode 100644
index 0000000000..980bf753de
--- /dev/null
+++ b/.github/workflows/PR-4.x.yaml
@@ -0,0 +1,19 @@
+name: PR:4.x
+
+on:
+  pull_request:
+    branches:
+      - 4.x
+
+jobs:
+  ARM64:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-ARM64.yaml@main
+
+  U20:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-U20.yaml@main
+
+  W10:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-W10.yaml@main
+
+  TIM-VX:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-timvx-backend-tests-4.x.yml@main
\ No newline at end of file
diff --git a/.github/workflows/timvx_backend_tests.yml b/.github/workflows/timvx_backend_tests.yml
deleted file mode 100644
index 5007a32755..0000000000
--- a/.github/workflows/timvx_backend_tests.yml
+++ /dev/null
@@ -1,101 +0,0 @@
-name: TIM-VX Backend
-
-on:
-  pull_request:
-    branches: [ 4.x ]
-    types: [ labeled, opened, synchronize, reopened ]
-
-env:
-  PR_AUTHOR: ${{ github.event.pull_request.user.login }}
-  PR_AUTHOR_FORK: ${{ github.event.pull_request.head.repo.full_name }}
-  SOURCE_BRANCH_NAME: ${{ github.head_ref }}
-  TARGET_BRANCH_NAME: ${{ github.base_ref }}
-
-jobs:
-  x86-simulator-build-test:
-    runs-on: ubuntu-20.04
-    # Docker image from https://hub.docker.com/r/yuentau/ocv_ubuntu
-    container: docker.io/yuentau/ocv_ubuntu:20.04
-    steps:
-      - name: info
-        run: |
-          echo "PR Author: ${{ env.PR_AUTHOR }}"
-          echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
-          echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
-      - name: clean
-        shell: bash
-        run: find . -mindepth 1 -delete
-      - name: fetch opencv
-        uses: actions/checkout@v3
-        with:
-          repository: opencv/opencv
-          ref: ${{ env.TARGET_BRANCH_NAME }}
-          fetch-depth: 0
-          path: opencv
-      - name: merge opencv with test branch
-        shell: bash
-        run: |
-          cd opencv
-          git config user.email "opencv.ci"
-          git config user.name "opencv.ci"
-          git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
-      - name: configure
-        run: |
-          cmake -B build -DWITH_TIMVX=ON -DCMAKE_INSTALL_PREFIX=./install -DBUILD_SHARED_LIBS=ON -DBUILD_PERF_TESTS=ON -DBUILD_TESTS=ON -DBUILD_EXAMPLES=OFF -DBUILD_DOCS=OFF -DWITH_OPENCL=OFF opencv
-      - name: build
-        run: cmake --build build --target install -j $(nproc)
-
-  khadas-vim3-tests:
-    if: contains(github.event.pull_request.labels.*.name, 'category:dnn_timvx')
-    concurrency:
-      group: khadas-vim3
-      cancel-in-progress: false
-    runs-on: [self-hosted, Linux, ARM64, khadas-vim3]
-    steps:
-      - name: info
-        run: |
-          echo "PR Author: ${{ env.PR_AUTHOR }}"
-          echo "Source branch name: ${{ env.SOURCE_BRANCH_NAME }}"
-          echo "Target branch name: ${{ env.TARGET_BRANCH_NAME }}"
-      - name: clean
-        shell: bash
-        run: find . -mindepth 1 -delete
-      - name: fetch opencv
-        uses: actions/checkout@v3
-        with:
-          repository: opencv/opencv
-          ref: ${{ env.TARGET_BRANCH_NAME }}
-          fetch-depth: 0
-          path: opencv
-      - name: merge opencv with test branch
-        shell: bash
-        run: |
-          cd opencv
-          git config user.email "opencv.ci"
-          git config user.name "opencv.ci"
-          git pull -v "https://github.com/${{ env.PR_AUTHOR_FORK }}" "${{ env.SOURCE_BRANCH_NAME }}"
-      - name: fetch opencv_extra
-        uses: actions/checkout@v3
-        with:
-          repository: opencv/opencv_extra
-          path: opencv_extra
-      - name: merge opencv_extra with test branch
-        shell: bash
-        run: |
-          RET=$(git ls-remote --heads "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}") || true
-          if [[ ! -z "$RET" ]]; then
-            cd opencv_extra
-            git config user.email "opencv.ci"
-            git config user.name "opencv.ci"
-            git pull -v "https://github.com/${{ env.PR_AUTHOR }}/opencv_extra" "${{ env.SOURCE_BRANCH_NAME }}"
-          else
-            echo "no merge since ${{ env.PR_AUTHOR }}/opencv_extra does not have branch ${{ env.SOURCE_BRANCH_NAME }}"
-          fi
-      - name: configure
-        run: |
-          cmake -B build -D CMAKE_BUILD_TYPE=RELEASE -DCMAKE_INSTALL_PREFIX=./install -DWITH_TIMVX=ON -DWITH_OPENCL=OFF -DWITH_EIGEN=OFF opencv
-      - name: build
-        run: cmake --build build --target opencv_test_dnn -j 4
-      - name: unit tests for int8 layers
-        run: |
-          OPENCV_TEST_DATA_PATH=./opencv_extra/testdata ./build/bin/opencv_test_dnn --gtest_filter="Test_Int8_layers.*/1"

From 6a22c5b2b59bdd325d5759cebbf6e67e33c3a8b9 Mon Sep 17 00:00:00 2001
From: Michael Gruner <michael.gruner@ridgerun.com>
Date: Sun, 22 May 2022 20:08:38 -0600
Subject: [PATCH 049/178] Consider video meta on GStreamer video capture

Some GStreamer elements may produce buffers with very non
standard strides, offsets and/or even transport each plane
in different, non-contiguous pointers. This non-standard
layout is communicated via GstVideoMeta structures attached
to the buffers. Given this, when a GstVideoMeta is available,
one should parse the layout from it instead of generating
a generic one from the caps.

The GstVideoFrame utility does precisely this: if the buffer
contains a video meta, it uses that to fill the format and
memory layout. If there is no meta available, the layout is
inferred from the caps.
---
 modules/videoio/src/cap_gstreamer.cpp | 135 ++++++++++++++------------
 1 file changed, 71 insertions(+), 64 deletions(-)

diff --git a/modules/videoio/src/cap_gstreamer.cpp b/modules/videoio/src/cap_gstreamer.cpp
index cdf8d47249..a871b98650 100644
--- a/modules/videoio/src/cap_gstreamer.cpp
+++ b/modules/videoio/src/cap_gstreamer.cpp
@@ -192,6 +192,19 @@ public:
     }
 };
 
+class ScopeGuardGstVideoFrame
+{
+    GstVideoFrame* frame_;
+public:
+    ScopeGuardGstVideoFrame(GstVideoFrame* frame)
+        : frame_(frame)
+    {}
+    ~ScopeGuardGstVideoFrame()
+    {
+        gst_video_frame_unmap(frame_);
+    }
+};
+
 } // namespace
 
 /*!
@@ -651,8 +664,28 @@ bool GStreamerCapture::retrieveVideoFrame(int, OutputArray dst)
         CV_Error(Error::StsError, "GStreamer: gst_video_info_from_caps() is failed. Can't handle unknown layout");
     }
 
-    int frame_width = GST_VIDEO_INFO_WIDTH(&info);
-    int frame_height = GST_VIDEO_INFO_HEIGHT(&info);
+    // gstreamer expects us to handle the memory at this point
+    // so we can just wrap the raw buffer and be done with it
+    GstBuffer* buf = gst_sample_get_buffer(sample);  // no lifetime transfer
+    if (!buf)
+        return false;
+
+    // at this point, the gstreamer buffer may contain a video meta with special
+    // stride and plane locations. We __must__ consider in order to correctly parse
+    // the data. The gst_video_frame_map will parse the meta for us, or default to
+    // regular strides/offsets if no meta is present.
+    GstVideoFrame frame = {};
+    GstMapFlags flags = static_cast<GstMapFlags>(GST_MAP_READ | GST_VIDEO_FRAME_MAP_FLAG_NO_REF);
+    if (!gst_video_frame_map(&frame, &info, buf, flags))
+    {
+        CV_LOG_ERROR(NULL, "GStreamer: Failed to map GStreamer buffer to system memory");
+        return false;
+    }
+
+    ScopeGuardGstVideoFrame frame_guard(&frame);  // call gst_video_frame_unmap(&frame) on scope leave
+
+    int frame_width = GST_VIDEO_FRAME_COMP_WIDTH(&frame, 0);
+    int frame_height = GST_VIDEO_FRAME_COMP_HEIGHT(&frame, 0);
     if (frame_width <= 0 || frame_height <= 0)
     {
         CV_LOG_ERROR(NULL, "GStreamer: Can't query frame size from GStreamer sample");
@@ -674,19 +707,6 @@ bool GStreamerCapture::retrieveVideoFrame(int, OutputArray dst)
     }
     std::string name = toLowerCase(std::string(name_));
 
-    // gstreamer expects us to handle the memory at this point
-    // so we can just wrap the raw buffer and be done with it
-    GstBuffer* buf = gst_sample_get_buffer(sample);  // no lifetime transfer
-    if (!buf)
-        return false;
-    GstMapInfo map_info = {};
-    if (!gst_buffer_map(buf, &map_info, GST_MAP_READ))
-    {
-        CV_LOG_ERROR(NULL, "GStreamer: Failed to map GStreamer buffer to system memory");
-        return false;
-    }
-    ScopeGuardGstMapInfo map_guard(buf, &map_info);  // call gst_buffer_unmap(buf, &map_info) on scope leave
-
     // we support these types of data:
     //     video/x-raw, format=BGR   -> 8bit, 3 channels
     //     video/x-raw, format=GRAY8 -> 8bit, 1 channel
@@ -703,10 +723,11 @@ bool GStreamerCapture::retrieveVideoFrame(int, OutputArray dst)
     //     video/x-raw, format={BGRA, RGBA, BGRx, RGBx} -> 8bit, 4 channels
     // bayer data is never decoded, the user is responsible for that
     Size sz = Size(frame_width, frame_height);
-    guint n_planes = GST_VIDEO_INFO_N_PLANES(&info);
+    guint n_planes = GST_VIDEO_FRAME_N_PLANES(&frame);
+
     if (name == "video/x-raw")
     {
-        const gchar* format_ = gst_structure_get_string(structure, "format");
+        const gchar* format_ = frame.info.finfo->name;
         if (!format_)
         {
             CV_LOG_ERROR(NULL, "GStreamer: Can't query 'format' of 'video/x-raw'");
@@ -717,97 +738,83 @@ bool GStreamerCapture::retrieveVideoFrame(int, OutputArray dst)
         if (format == "BGR")
         {
             CV_CheckEQ((int)n_planes, 1, "");
-            size_t step = GST_VIDEO_INFO_PLANE_STRIDE(&info, 0);
+            size_t step = GST_VIDEO_FRAME_PLANE_STRIDE(&frame, 0);
             CV_CheckGE(step, (size_t)frame_width * 3, "");
-            Mat src(sz, CV_8UC3, map_info.data + GST_VIDEO_INFO_PLANE_OFFSET(&info, 0), step);
+            Mat src(sz, CV_8UC3, GST_VIDEO_FRAME_PLANE_DATA(&frame, 0), step);
             src.copyTo(dst);
             return true;
         }
         else if (format == "GRAY8")
         {
             CV_CheckEQ((int)n_planes, 1, "");
-            size_t step = GST_VIDEO_INFO_PLANE_STRIDE(&info, 0);
+            size_t step = GST_VIDEO_FRAME_PLANE_STRIDE(&frame, 0);
             CV_CheckGE(step, (size_t)frame_width, "");
-            Mat src(sz, CV_8UC1, map_info.data + GST_VIDEO_INFO_PLANE_OFFSET(&info, 0), step);
+            Mat src(sz, CV_8UC1, GST_VIDEO_FRAME_PLANE_DATA(&frame, 0), step);
             src.copyTo(dst);
             return true;
         }
         else if (format == "GRAY16_LE" || format == "GRAY16_BE")
         {
             CV_CheckEQ((int)n_planes, 1, "");
-            size_t step = GST_VIDEO_INFO_PLANE_STRIDE(&info, 0);
+            size_t step = GST_VIDEO_FRAME_PLANE_STRIDE(&frame, 0);
             CV_CheckGE(step, (size_t)frame_width, "");
-            Mat src(sz, CV_16UC1, map_info.data + GST_VIDEO_INFO_PLANE_OFFSET(&info, 0), step);
+            Mat src(sz, CV_16UC1, GST_VIDEO_FRAME_PLANE_DATA(&frame, 0), step);
             src.copyTo(dst);
             return true;
         }
         else if (format == "BGRA" || format == "RGBA" || format == "BGRX" || format == "RGBX")
         {
             CV_CheckEQ((int)n_planes, 1, "");
-            size_t step = GST_VIDEO_INFO_PLANE_STRIDE(&info, 0);
+            size_t step = GST_VIDEO_FRAME_PLANE_STRIDE(&frame, 0);
             CV_CheckGE(step, (size_t)frame_width, "");
-            Mat src(sz, CV_8UC4, map_info.data + GST_VIDEO_INFO_PLANE_OFFSET(&info, 0), step);
+            Mat src(sz, CV_8UC4, GST_VIDEO_FRAME_PLANE_DATA(&frame, 0), step);
             src.copyTo(dst);
             return true;
         }
         else if (format == "UYVY" || format == "YUY2" || format == "YVYU")
         {
             CV_CheckEQ((int)n_planes, 1, "");
-            size_t step = GST_VIDEO_INFO_PLANE_STRIDE(&info, 0);
+            size_t step = GST_VIDEO_FRAME_PLANE_STRIDE(&frame, 0);
             CV_CheckGE(step, (size_t)frame_width * 2, "");
-            Mat src(sz, CV_8UC2, map_info.data + GST_VIDEO_INFO_PLANE_OFFSET(&info, 0), step);
+            Mat src(sz, CV_8UC2, GST_VIDEO_FRAME_PLANE_DATA(&frame, 0), step);
             src.copyTo(dst);
             return true;
         }
         else if (format == "NV12" || format == "NV21")
         {
             CV_CheckEQ((int)n_planes, 2, "");
-            size_t stepY = GST_VIDEO_INFO_PLANE_STRIDE(&info, 0);
+            size_t stepY = GST_VIDEO_FRAME_PLANE_STRIDE(&frame, 0);
             CV_CheckGE(stepY, (size_t)frame_width, "");
-            size_t stepUV = GST_VIDEO_INFO_PLANE_STRIDE(&info, 1);
+            size_t stepUV = GST_VIDEO_FRAME_PLANE_STRIDE(&frame, 1);
             CV_CheckGE(stepUV, (size_t)frame_width, "");
-            size_t offsetY = GST_VIDEO_INFO_PLANE_OFFSET(&info, 0);
-            size_t offsetUV = GST_VIDEO_INFO_PLANE_OFFSET(&info, 1);
-            if (stepY != stepUV || (offsetUV - offsetY) != (stepY * frame_height))
-            {
-                dst.create(Size(frame_width, frame_height * 3 / 2), CV_8UC1);
-                Mat dst_ = dst.getMat();
-                Mat srcY(sz, CV_8UC1, map_info.data + offsetY, stepY);
-                Mat srcUV(Size(frame_width, frame_height / 2), CV_8UC1, map_info.data + offsetUV, stepUV);
-                srcY.copyTo(dst_(Rect(0, 0, frame_width, frame_height)));
-                srcUV.copyTo(dst_(Rect(0, frame_height, frame_width, frame_height / 2)));
-            }
-            else
-            {
-                Mat src(Size(frame_width, frame_height * 3 / 2), CV_8UC1, map_info.data + offsetY, stepY);
-                src.copyTo(dst);
-            }
+
+            dst.create(Size(frame_width, frame_height * 3 / 2), CV_8UC1);
+            Mat dst_ = dst.getMat();
+            Mat srcY(sz, CV_8UC1, GST_VIDEO_FRAME_PLANE_DATA(&frame,0), stepY);
+            Mat srcUV(Size(frame_width, frame_height / 2), CV_8UC1, GST_VIDEO_FRAME_PLANE_DATA(&frame,1), stepUV);
+            srcY.copyTo(dst_(Rect(0, 0, frame_width, frame_height)));
+            srcUV.copyTo(dst_(Rect(0, frame_height, frame_width, frame_height / 2)));
             return true;
         }
         else if (format == "YV12" || format == "I420")
         {
             CV_CheckEQ((int)n_planes, 3, "");
-            size_t step0 = GST_VIDEO_INFO_PLANE_STRIDE(&info, 0);
+            size_t step0 = GST_VIDEO_FRAME_PLANE_STRIDE(&frame, 0);
             CV_CheckGE(step0, (size_t)frame_width, "");
-            size_t step1 = GST_VIDEO_INFO_PLANE_STRIDE(&info, 1);
+            size_t step1 = GST_VIDEO_FRAME_PLANE_STRIDE(&frame, 1);
             CV_CheckGE(step1, (size_t)frame_width / 2, "");
-            size_t step2 = GST_VIDEO_INFO_PLANE_STRIDE(&info, 2);
+            size_t step2 = GST_VIDEO_FRAME_PLANE_STRIDE(&frame, 2);
             CV_CheckGE(step2, (size_t)frame_width / 2, "");
 
-            size_t offset0 = GST_VIDEO_INFO_PLANE_OFFSET(&info, 0);
-            size_t offset1 = GST_VIDEO_INFO_PLANE_OFFSET(&info, 1);
-            size_t offset2 = GST_VIDEO_INFO_PLANE_OFFSET(&info, 2);
-            {
-                dst.create(Size(frame_width, frame_height * 3 / 2), CV_8UC1);
-                Mat dst_ = dst.getMat();
-                Mat srcY(sz, CV_8UC1, map_info.data + offset0, step0);
-                Size sz2(frame_width / 2, frame_height / 2);
-                Mat src1(sz2, CV_8UC1, map_info.data + offset1, step1);
-                Mat src2(sz2, CV_8UC1, map_info.data + offset2, step2);
-                srcY.copyTo(dst_(Rect(0, 0, frame_width, frame_height)));
-                src1.copyTo(Mat(sz2, CV_8UC1, dst_.ptr<uchar>(frame_height)));
-                src2.copyTo(Mat(sz2, CV_8UC1, dst_.ptr<uchar>(frame_height) + src1.total()));
-            }
+            dst.create(Size(frame_width, frame_height * 3 / 2), CV_8UC1);
+            Mat dst_ = dst.getMat();
+            Mat srcY(sz, CV_8UC1, GST_VIDEO_FRAME_PLANE_DATA(&frame,0), step0);
+            Size sz2(frame_width / 2, frame_height / 2);
+            Mat src1(sz2, CV_8UC1, GST_VIDEO_FRAME_PLANE_DATA(&frame,1), step1);
+            Mat src2(sz2, CV_8UC1, GST_VIDEO_FRAME_PLANE_DATA(&frame,2), step2);
+            srcY.copyTo(dst_(Rect(0, 0, frame_width, frame_height)));
+            src1.copyTo(Mat(sz2, CV_8UC1, dst_.ptr<uchar>(frame_height)));
+            src2.copyTo(Mat(sz2, CV_8UC1, dst_.ptr<uchar>(frame_height) + src1.total()));
             return true;
         }
         else
@@ -818,14 +825,14 @@ bool GStreamerCapture::retrieveVideoFrame(int, OutputArray dst)
     else if (name == "video/x-bayer")
     {
         CV_CheckEQ((int)n_planes, 0, "");
-        Mat src = Mat(sz, CV_8UC1, map_info.data);
+        Mat src = Mat(sz, CV_8UC1, frame.map[0].data);
         src.copyTo(dst);
         return true;
     }
     else if (name == "image/jpeg")
     {
         CV_CheckEQ((int)n_planes, 0, "");
-        Mat src = Mat(Size(map_info.size, 1), CV_8UC1, map_info.data);
+        Mat src = Mat(Size(frame.map[0].size, 1), CV_8UC1, frame.map[0].data);
         src.copyTo(dst);
         return true;
     }

From d1ccb7e47f5bc6d569e76b773f019eb07ab42065 Mon Sep 17 00:00:00 2001
From: Phil Ruffwind <rf@rufflewind.com>
Date: Fri, 8 Apr 2022 22:48:03 -0700
Subject: [PATCH 050/178] Update js_usage.markdown to use onRuntimeInitialized

The Emscripten library is not guaranteed to be fully loaded during the
script element's onload event. Module.onRuntimeInitialized seems to be
more reliable.
---
 doc/js_tutorials/js_setup/js_usage/js_usage.markdown | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/doc/js_tutorials/js_setup/js_usage/js_usage.markdown b/doc/js_tutorials/js_setup/js_usage/js_usage.markdown
index 7992fd4096..c543b5dd79 100644
--- a/doc/js_tutorials/js_setup/js_usage/js_usage.markdown
+++ b/doc/js_tutorials/js_setup/js_usage/js_usage.markdown
@@ -122,11 +122,14 @@ imgElement.onload = function() {
   mat.delete();
 };
 
-function onOpenCvReady() {
-  document.getElementById('status').innerHTML = 'OpenCV.js is ready.';
-}
+var Module = {
+  // https://emscripten.org/docs/api_reference/module.html#Module.onRuntimeInitialized
+  onRuntimeInitialized() {
+    document.getElementById('status').innerHTML = 'OpenCV.js is ready.';
+  }
+};
 </script>
-<script async src="opencv.js" onload="onOpenCvReady();" type="text/javascript"></script>
+<script async src="opencv.js" type="text/javascript"></script>
 </body>
 </html>
 @endcode

From 50bf0e10f52b9c6000a9edbcdae9fdd1d8f68a11 Mon Sep 17 00:00:00 2001
From: Kataev Victor <kataev.victor.1234@gmail.com>
Date: Mon, 23 May 2022 14:06:41 +0300
Subject: [PATCH 051/178] Fix cameracalibration test

* Remove transpose for rvec after calibration
* Change compare threshold from 110% to 10% for perViewErrors
* Add transpose to goodRotMatrs load
---
 modules/calib3d/test/test_cameracalibration.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/calib3d/test/test_cameracalibration.cpp b/modules/calib3d/test/test_cameracalibration.cpp
index 1c04bb78dd..27e0e83b8f 100644
--- a/modules/calib3d/test/test_cameracalibration.cpp
+++ b/modules/calib3d/test/test_cameracalibration.cpp
@@ -484,7 +484,8 @@ void CV_CameraCalibrationTest::run( int start_from )
             for( i = 0; i < 3; i++ )
                 for( j = 0; j < 3; j++ )
                 {
-                    values_read = fscanf(file, "%lf", &goodRotMatrs[currImage].val[i*3+j]);
+                    // Yes, load with transpose
+                    values_read = fscanf(file, "%lf", &goodRotMatrs[currImage].val[j*3+i]);
                     CV_Assert(values_read == 1);
                 }
         }
@@ -661,7 +662,7 @@ void CV_CameraCalibrationTest::run( int start_from )
 
         /* ----- Compare per view re-projection errors ----- */
         CV_Assert(perViewErrors.size() == (size_t)numImages);
-        code = compare(&perViewErrors[0], &goodPerViewErrors[0], numImages, 1.1, "per view errors vector");
+        code = compare(&perViewErrors[0], &goodPerViewErrors[0], numImages, 0.1, "per view errors vector");
         if( code < 0 )
             break;
 
@@ -792,7 +793,6 @@ void CV_CameraCalibrationTest_CPP::calibrate(Size imageSize,
     {
         Mat r9;
         cvtest::Rodrigues( rvecs[i], r9 );
-        cv::transpose(r9, r9);
         r9.convertTo(rotationMatrices[i], CV_64F);
         tvecs[i].convertTo(translationVectors[i], CV_64F);
     }

From 1ddc51cfd1cf19581206e7200c781d12326c1313 Mon Sep 17 00:00:00 2001
From: Kataev Victor <kataev.victor.1234@gmail.com>
Date: Mon, 23 May 2022 14:30:09 +0300
Subject: [PATCH 052/178] Update fisheye::distortPoints() documentation

---
 modules/calib3d/include/opencv2/calib3d.hpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp
index ad3527aa42..6bf10a418f 100644
--- a/modules/calib3d/include/opencv2/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d.hpp
@@ -3799,8 +3799,7 @@ namespace fisheye
     @param distorted Output array of image points, 1xN/Nx1 2-channel, or vector\<Point2f\> .
 
     Note that the function assumes the camera intrinsic matrix of the undistorted points to be identity.
-    This means if you want to transform back points undistorted with #fisheye::undistortPoints you have to
-    multiply them with \f$P^{-1}\f$.
+    This means if you want to distort image points you have to multiply them with \f$K^{-1}\f$.
      */
     CV_EXPORTS_W void distortPoints(InputArray undistorted, OutputArray distorted, InputArray K, InputArray D, double alpha = 0);
 

From 93dc0679ec93fe6a152f4f1c238da9cdc6aa00ed Mon Sep 17 00:00:00 2001
From: rogday <s.e.a.98@yandex.ru>
Date: Mon, 23 May 2022 17:50:42 +0300
Subject: [PATCH 053/178] Merge pull request #21818 from rogday:revert_renaming

* add prefixes to layer names and layer output names

* dnn: OPENCV_DNN_ONNX_USE_LEGACY_NAMES runtime parameter

Co-authored-by: Alexander Alekhin <alexander.a.alekhin@gmail.com>
---
 modules/dnn/src/onnx/onnx_importer.cpp  | 27 +++++++++++++++++++++----
 modules/dnn/test/test_onnx_importer.cpp |  5 +++++
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index e755226535..e4cbe02840 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -15,6 +15,9 @@
 #define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_VERBOSE + 1
 #include <opencv2/core/utils/logger.hpp>
 
+#include <opencv2/core/utils/configuration.private.hpp>
+
+
 #ifdef HAVE_PROTOBUF
 
 #include <iostream>
@@ -78,6 +81,7 @@ public:
     ONNXImporter(Net& net, const char *onnxFile)
         : dstNet(net), dispatch(buildDispatchMap())
         , onnx_opset(0)
+        , useLegacyNames(getParamUseLegacyNames())
     {
         hasDynamicShapes = false;
         CV_Assert(onnxFile);
@@ -100,6 +104,7 @@ public:
     ONNXImporter(Net& net, const char* buffer, size_t sizeBuffer)
         : dstNet(net), dispatch(buildDispatchMap())
         , onnx_opset(0)
+        , useLegacyNames(getParamUseLegacyNames())
     {
         hasDynamicShapes = false;
         CV_LOG_DEBUG(NULL, "DNN/ONNX: processing in-memory ONNX model (" << sizeBuffer << " bytes)");
@@ -196,8 +201,18 @@ private:
 
     int onnx_opset;  // OperatorSetIdProto for 'onnx' domain
     void parseOperatorSet();
+
+
+    bool useLegacyNames;
+    bool getParamUseLegacyNames()
+    {
+        bool param = utils::getConfigurationParameterBool("OPENCV_DNN_ONNX_USE_LEGACY_NAMES", false);
+        return param;
+    }
+    const std::string extractNodeName(const opencv_onnx::NodeProto& node_proto);
 };
 
+
 inline void replaceLayerParam(LayerParams& layerParams, const String& oldKey, const String& newKey)
 {
     if (layerParams.has(oldKey)) {
@@ -720,12 +735,14 @@ void ONNXImporter::populateNet()
     CV_LOG_DEBUG(NULL, "DNN/ONNX: import completed!");
 }
 
-static
-const std::string& extractNodeName(const opencv_onnx::NodeProto& node_proto)
+const std::string ONNXImporter::extractNodeName(const opencv_onnx::NodeProto& node_proto)
 {
+    // We need to rework DNN outputs API, this is a workaround for #21698
     if (node_proto.has_name() && !node_proto.name().empty())
     {
-        return node_proto.name();
+        if (useLegacyNames)
+            return node_proto.name();
+        return cv::format("onnx_node!%s", node_proto.name().c_str());
     }
     for (int i = 0; i < node_proto.output_size(); ++i)
     {
@@ -735,7 +752,9 @@ const std::string& extractNodeName(const opencv_onnx::NodeProto& node_proto)
         // the second method is to use an empty string in place of an input or output name.
         if (!name.empty())
         {
-            return name;
+            if (useLegacyNames)
+                return name.c_str();
+            return cv::format("onnx_node_output_%d!%s", i, name.c_str());
         }
     }
     CV_Error(Error::StsAssert, "Couldn't deduce Node name.");
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index afe80efaf1..60473ede58 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -1389,6 +1389,11 @@ TEST_P(Test_ONNX_layers, DivConst)
     testONNXModels("div_const");
 }
 
+TEST_P(Test_ONNX_layers, OutputRegistration)
+{
+    testONNXModels("output_registration", npy, 0, 0, false, true, 2);
+}
+
 INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_ONNX_layers, dnnBackendsAndTargets());
 
 class Test_ONNX_nets : public Test_ONNX_layers

From 50d7c61c010dfeaf2c124aa6a4f98d9a214b9fb1 Mon Sep 17 00:00:00 2001
From: berak <px1704@web.de>
Date: Mon, 23 May 2022 19:18:31 +0200
Subject: [PATCH 054/178] Update darknet_importer.cpp

make it more obvious, that this is a '404', not a 'parsing' problem
---
 modules/dnn/src/darknet/darknet_importer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/dnn/src/darknet/darknet_importer.cpp b/modules/dnn/src/darknet/darknet_importer.cpp
index 5d28dbd2e2..adbdd35b5d 100644
--- a/modules/dnn/src/darknet/darknet_importer.cpp
+++ b/modules/dnn/src/darknet/darknet_importer.cpp
@@ -207,7 +207,7 @@ Net readNetFromDarknet(const String &cfgFile, const String &darknetModel /*= Str
     std::ifstream cfgStream(cfgFile.c_str());
     if (!cfgStream.is_open())
     {
-        CV_Error(cv::Error::StsParseError, "Failed to parse NetParameter file: " + std::string(cfgFile));
+        CV_Error(cv::Error::StsParseError, "Failed to open NetParameter file: " + std::string(cfgFile));
     }
     if (darknetModel != String())
     {

From 400906b43314750931cc039b1cca8da4a5ab47f8 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Mon, 23 May 2022 19:18:02 +0000
Subject: [PATCH 055/178] pre: OpenCV 3.4.18 (version++)

---
 .../cross_referencing/tutorial_cross_referencing.markdown     | 4 ++--
 modules/core/include/opencv2/core/version.hpp                 | 4 ++--
 modules/dnn/include/opencv2/dnn/dnn.hpp                       | 4 ++--
 modules/python/package/setup.py                               | 2 +-
 platforms/android/build_sdk.py                                | 2 +-
 platforms/android/service/readme.txt                          | 2 +-
 platforms/maven/opencv-it/pom.xml                             | 2 +-
 platforms/maven/opencv/pom.xml                                | 2 +-
 platforms/maven/pom.xml                                       | 2 +-
 9 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/doc/tutorials/introduction/cross_referencing/tutorial_cross_referencing.markdown b/doc/tutorials/introduction/cross_referencing/tutorial_cross_referencing.markdown
index 3e4f222ff6..740f455c2b 100644
--- a/doc/tutorials/introduction/cross_referencing/tutorial_cross_referencing.markdown
+++ b/doc/tutorials/introduction/cross_referencing/tutorial_cross_referencing.markdown
@@ -39,14 +39,14 @@ Open your Doxyfile using your favorite text editor and search for the key
 `TAGFILES`. Change it as follows:
 
 @code
-TAGFILES = ./docs/doxygen-tags/opencv.tag=http://docs.opencv.org/3.4.17
+TAGFILES = ./docs/doxygen-tags/opencv.tag=http://docs.opencv.org/3.4.18
 @endcode
 
 If you had other definitions already, you can append the line using a `\`:
 
 @code
 TAGFILES = ./docs/doxygen-tags/libstdc++.tag=https://gcc.gnu.org/onlinedocs/libstdc++/latest-doxygen \
-           ./docs/doxygen-tags/opencv.tag=http://docs.opencv.org/3.4.17
+           ./docs/doxygen-tags/opencv.tag=http://docs.opencv.org/3.4.18
 @endcode
 
 Doxygen can now use the information from the tag file to link to the OpenCV
diff --git a/modules/core/include/opencv2/core/version.hpp b/modules/core/include/opencv2/core/version.hpp
index e938a20408..4c08d4e3cb 100644
--- a/modules/core/include/opencv2/core/version.hpp
+++ b/modules/core/include/opencv2/core/version.hpp
@@ -7,8 +7,8 @@
 
 #define CV_VERSION_MAJOR    3
 #define CV_VERSION_MINOR    4
-#define CV_VERSION_REVISION 17
-#define CV_VERSION_STATUS   "-dev"
+#define CV_VERSION_REVISION 18
+#define CV_VERSION_STATUS   "-pre"
 
 #define CVAUX_STR_EXP(__A)  #__A
 #define CVAUX_STR(__A)      CVAUX_STR_EXP(__A)
diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp
index 00a147e260..2368313c64 100644
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@@ -47,9 +47,9 @@
 #include "opencv2/core/async.hpp"
 
 #if !defined CV_DOXYGEN && !defined CV_STATIC_ANALYSIS && !defined CV_DNN_DONT_ADD_EXPERIMENTAL_NS
-#define CV__DNN_EXPERIMENTAL_NS_BEGIN namespace experimental_dnn_34_v24 {
+#define CV__DNN_EXPERIMENTAL_NS_BEGIN namespace experimental_dnn_34_v25 {
 #define CV__DNN_EXPERIMENTAL_NS_END }
-namespace cv { namespace dnn { namespace experimental_dnn_34_v24 { } using namespace experimental_dnn_34_v24; }}
+namespace cv { namespace dnn { namespace experimental_dnn_34_v25 { } using namespace experimental_dnn_34_v25; }}
 #else
 #define CV__DNN_EXPERIMENTAL_NS_BEGIN
 #define CV__DNN_EXPERIMENTAL_NS_END
diff --git a/modules/python/package/setup.py b/modules/python/package/setup.py
index bd69de8fde..ca8fe9138f 100644
--- a/modules/python/package/setup.py
+++ b/modules/python/package/setup.py
@@ -9,7 +9,7 @@ def main():
     os.chdir(SCRIPT_DIR)
 
     package_name = 'opencv'
-    package_version = os.environ.get('OPENCV_VERSION', '3.4.17')  # TODO
+    package_version = os.environ.get('OPENCV_VERSION', '3.4.18')  # TODO
 
     long_description = 'Open Source Computer Vision Library Python bindings'  # TODO
 
diff --git a/platforms/android/build_sdk.py b/platforms/android/build_sdk.py
index 75fc680615..7974909f05 100755
--- a/platforms/android/build_sdk.py
+++ b/platforms/android/build_sdk.py
@@ -269,7 +269,7 @@ class Builder:
         # Add extra data
         apkxmldest = check_dir(os.path.join(apkdest, "res", "xml"), create=True)
         apklibdest = check_dir(os.path.join(apkdest, "libs", abi.name), create=True)
-        for ver, d in self.extra_packs + [("3.4.17", os.path.join(self.libdest, "lib"))]:
+        for ver, d in self.extra_packs + [("3.4.18", os.path.join(self.libdest, "lib"))]:
             r = ET.Element("library", attrib={"version": ver})
             log.info("Adding libraries from %s", d)
 
diff --git a/platforms/android/service/readme.txt b/platforms/android/service/readme.txt
index 7abc897b03..7be50dd205 100644
--- a/platforms/android/service/readme.txt
+++ b/platforms/android/service/readme.txt
@@ -12,7 +12,7 @@ manually using adb tool:
 
     adb install <path-to-OpenCV-sdk>/apk/OpenCV_<version>_Manager_<app_version>_<platform>.apk
 
-Example: OpenCV_3.4.17-dev_Manager_3.49_armeabi-v7a.apk
+Example: OpenCV_3.4.18-dev_Manager_3.49_armeabi-v7a.apk
 
 Use the list of platforms below to determine proper OpenCV Manager package for your device:
 
diff --git a/platforms/maven/opencv-it/pom.xml b/platforms/maven/opencv-it/pom.xml
index ae9b61da4c..a344b91e90 100644
--- a/platforms/maven/opencv-it/pom.xml
+++ b/platforms/maven/opencv-it/pom.xml
@@ -4,7 +4,7 @@
     <parent>
         <groupId>org.opencv</groupId>
         <artifactId>opencv-parent</artifactId>
-        <version>3.4.17</version>
+        <version>3.4.18</version>
     </parent>
     <groupId>org.opencv</groupId>
     <artifactId>opencv-it</artifactId>
diff --git a/platforms/maven/opencv/pom.xml b/platforms/maven/opencv/pom.xml
index a974019d64..feb3e97aaa 100644
--- a/platforms/maven/opencv/pom.xml
+++ b/platforms/maven/opencv/pom.xml
@@ -4,7 +4,7 @@
     <parent>
         <groupId>org.opencv</groupId>
         <artifactId>opencv-parent</artifactId>
-        <version>3.4.17</version>
+        <version>3.4.18</version>
     </parent>
     <groupId>org.opencv</groupId>
     <artifactId>opencv</artifactId>
diff --git a/platforms/maven/pom.xml b/platforms/maven/pom.xml
index ac554de429..e5bf9264d3 100644
--- a/platforms/maven/pom.xml
+++ b/platforms/maven/pom.xml
@@ -3,7 +3,7 @@
     <modelVersion>4.0.0</modelVersion>
     <groupId>org.opencv</groupId>
     <artifactId>opencv-parent</artifactId>
-    <version>3.4.17</version>
+    <version>3.4.18</version>
     <packaging>pom</packaging>
     <name>OpenCV Parent POM</name>
     <licenses>

From e9428726ca140d41a8f68017d3b0cf3d85479d76 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Mon, 23 May 2022 19:25:16 +0000
Subject: [PATCH 056/178] pre: OpenCV 4.6.0 (version++)

---
 .../cross_referencing/tutorial_cross_referencing.markdown   | 4 ++--
 modules/core/include/opencv2/core/version.hpp               | 6 +++---
 modules/dnn/include/opencv2/dnn/version.hpp                 | 2 +-
 modules/python/package/setup.py                             | 2 +-
 platforms/maven/opencv-it/pom.xml                           | 2 +-
 platforms/maven/opencv/pom.xml                              | 2 +-
 platforms/maven/pom.xml                                     | 2 +-
 7 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/doc/tutorials/introduction/cross_referencing/tutorial_cross_referencing.markdown b/doc/tutorials/introduction/cross_referencing/tutorial_cross_referencing.markdown
index dfd244fbed..ccef42a482 100644
--- a/doc/tutorials/introduction/cross_referencing/tutorial_cross_referencing.markdown
+++ b/doc/tutorials/introduction/cross_referencing/tutorial_cross_referencing.markdown
@@ -46,14 +46,14 @@ Open your Doxyfile using your favorite text editor and search for the key
 `TAGFILES`. Change it as follows:
 
 @code
-TAGFILES = ./docs/doxygen-tags/opencv.tag=http://docs.opencv.org/4.5.5
+TAGFILES = ./docs/doxygen-tags/opencv.tag=http://docs.opencv.org/4.6.0
 @endcode
 
 If you had other definitions already, you can append the line using a `\`:
 
 @code
 TAGFILES = ./docs/doxygen-tags/libstdc++.tag=https://gcc.gnu.org/onlinedocs/libstdc++/latest-doxygen \
-           ./docs/doxygen-tags/opencv.tag=http://docs.opencv.org/4.5.5
+           ./docs/doxygen-tags/opencv.tag=http://docs.opencv.org/4.6.0
 @endcode
 
 Doxygen can now use the information from the tag file to link to the OpenCV
diff --git a/modules/core/include/opencv2/core/version.hpp b/modules/core/include/opencv2/core/version.hpp
index d0c01e8fea..1832aa75b8 100644
--- a/modules/core/include/opencv2/core/version.hpp
+++ b/modules/core/include/opencv2/core/version.hpp
@@ -6,9 +6,9 @@
 #define OPENCV_VERSION_HPP
 
 #define CV_VERSION_MAJOR    4
-#define CV_VERSION_MINOR    5
-#define CV_VERSION_REVISION 5
-#define CV_VERSION_STATUS   "-dev"
+#define CV_VERSION_MINOR    6
+#define CV_VERSION_REVISION 0
+#define CV_VERSION_STATUS   "-pre"
 
 #define CVAUX_STR_EXP(__A)  #__A
 #define CVAUX_STR(__A)      CVAUX_STR_EXP(__A)
diff --git a/modules/dnn/include/opencv2/dnn/version.hpp b/modules/dnn/include/opencv2/dnn/version.hpp
index a0a1754901..630cccf758 100644
--- a/modules/dnn/include/opencv2/dnn/version.hpp
+++ b/modules/dnn/include/opencv2/dnn/version.hpp
@@ -6,7 +6,7 @@
 #define OPENCV_DNN_VERSION_HPP
 
 /// Use with major OpenCV version only.
-#define OPENCV_DNN_API_VERSION 20211220
+#define OPENCV_DNN_API_VERSION 20220524
 
 #if !defined CV_DOXYGEN && !defined CV_STATIC_ANALYSIS && !defined CV_DNN_DONT_ADD_INLINE_NS
 #define CV__DNN_INLINE_NS __CV_CAT(dnn4_v, OPENCV_DNN_API_VERSION)
diff --git a/modules/python/package/setup.py b/modules/python/package/setup.py
index 191ab4e77d..9736e78db8 100644
--- a/modules/python/package/setup.py
+++ b/modules/python/package/setup.py
@@ -9,7 +9,7 @@ def main():
     os.chdir(SCRIPT_DIR)
 
     package_name = 'opencv'
-    package_version = os.environ.get('OPENCV_VERSION', '4.5.5')  # TODO
+    package_version = os.environ.get('OPENCV_VERSION', '4.6.0')  # TODO
 
     long_description = 'Open Source Computer Vision Library Python bindings'  # TODO
 
diff --git a/platforms/maven/opencv-it/pom.xml b/platforms/maven/opencv-it/pom.xml
index a7dc090f38..1e66e6979a 100644
--- a/platforms/maven/opencv-it/pom.xml
+++ b/platforms/maven/opencv-it/pom.xml
@@ -4,7 +4,7 @@
     <parent>
         <groupId>org.opencv</groupId>
         <artifactId>opencv-parent</artifactId>
-        <version>4.5.5</version>
+        <version>4.6.0</version>
     </parent>
     <groupId>org.opencv</groupId>
     <artifactId>opencv-it</artifactId>
diff --git a/platforms/maven/opencv/pom.xml b/platforms/maven/opencv/pom.xml
index 9b081e4d1f..c135d4b75f 100644
--- a/platforms/maven/opencv/pom.xml
+++ b/platforms/maven/opencv/pom.xml
@@ -4,7 +4,7 @@
     <parent>
         <groupId>org.opencv</groupId>
         <artifactId>opencv-parent</artifactId>
-        <version>4.5.5</version>
+        <version>4.6.0</version>
     </parent>
     <groupId>org.opencv</groupId>
     <artifactId>opencv</artifactId>
diff --git a/platforms/maven/pom.xml b/platforms/maven/pom.xml
index 453b79f4ec..de42826a07 100644
--- a/platforms/maven/pom.xml
+++ b/platforms/maven/pom.xml
@@ -3,7 +3,7 @@
     <modelVersion>4.0.0</modelVersion>
     <groupId>org.opencv</groupId>
     <artifactId>opencv-parent</artifactId>
-    <version>4.5.5</version>
+    <version>4.6.0</version>
     <packaging>pom</packaging>
     <name>OpenCV Parent POM</name>
     <licenses>

From a2ad997e972dd6b0535633996221b0dc897d7415 Mon Sep 17 00:00:00 2001
From: rogday <s.e.a.98@yandex.ru>
Date: Wed, 18 May 2022 23:52:56 +0300
Subject: [PATCH 057/178] fix vector access in TF::sortByExecutionOrder

---
 .../src/tensorflow/tf_graph_simplifier.cpp    | 22 +++++++--------
 modules/dnn/test/test_tf_importer.cpp         | 28 ++++++++++++++-----
 2 files changed, 32 insertions(+), 18 deletions(-)

diff --git a/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp b/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp
index f81ff83ab0..fc0fc94d76 100644
--- a/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp
+++ b/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp
@@ -990,6 +990,7 @@ void sortByExecutionOrder(tensorflow::GraphDef& net)
         nodesMap.insert(std::make_pair(node.name(), i));
     }
 
+    CV_CheckEQ(nodesMap.size(), (size_t)net.node_size(), "Node names must be unique");
     // Indices of nodes which use specific node as input.
     std::vector<std::vector<int> > edges(nodesMap.size());
     std::vector<int> numRefsToAdd(nodesMap.size(), 0);
@@ -1007,7 +1008,7 @@ void sortByExecutionOrder(tensorflow::GraphDef& net)
             nodesMapIt = nodesMap.find(inpName);
             if (nodesMapIt != nodesMap.end())
             {
-                edges[nodesMapIt->second].push_back(i);
+                edges.at(nodesMapIt->second).push_back(i);
                 numInputsInGraph += 1;
             }
         }
@@ -1019,11 +1020,11 @@ void sortByExecutionOrder(tensorflow::GraphDef& net)
             {
                 int numControlEdges = 0;
                 for (int j = 0; j < numInputsInGraph; ++j)
-                    numControlEdges += node.input(j)[0] == '^';
-                numRefsToAdd[i] = numControlEdges + 1;
+                    numControlEdges += node.input(j).at(0) == '^';
+                numRefsToAdd.at(i) = numControlEdges + 1;
             }
             else
-                numRefsToAdd[i] = numInputsInGraph;
+                numRefsToAdd.at(i) = numInputsInGraph;
         }
     }
 
@@ -1035,17 +1036,16 @@ void sortByExecutionOrder(tensorflow::GraphDef& net)
         nodesToAdd.pop_back();
 
         permIds.push_back(nodeToAdd);
-        CV_Assert(nodeToAdd < edges.size());
-        for (int i = 0; i < edges[nodeToAdd].size(); ++i)
+        for (int i = 0; i < edges.at(nodeToAdd).size(); ++i)
         {
-            int consumerId = edges[nodeToAdd][i];
-            if (numRefsToAdd[consumerId] > 0)
+            int consumerId = edges.at(nodeToAdd).at(i);
+            if (numRefsToAdd.at(consumerId) > 0)
             {
-                if (numRefsToAdd[consumerId] == 1)
+                if (numRefsToAdd.at(consumerId) == 1)
                     nodesToAdd.push_back(consumerId);
                 else
-                    CV_Assert(numRefsToAdd[consumerId] >= 0);
-                numRefsToAdd[consumerId] -= 1;
+                    CV_Assert(numRefsToAdd.at(consumerId) >= 0);
+                numRefsToAdd.at(consumerId) -= 1;
             }
         }
     }
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index 81880fb66c..d21a5589f9 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -1571,13 +1571,6 @@ TEST_P(Test_TensorFlow_layers, tf2_permute_nhwc_ncwh)
     runTensorFlowNet("tf2_permute_nhwc_ncwh");
 }
 
-// issue #21852
-TEST_P(Test_TensorFlow_layers, tf_graph_simplifier_buffer_overflow)
-{
-    // This just shouldn't segfault, otherwise it's fine
-    EXPECT_ANY_THROW(readNetFromTensorflow(path("tf_graph_simplifier_buffer_overflow_net.pb")));
-}
-
 TEST_P(Test_TensorFlow_layers, squeeze)
 {
 #if defined(INF_ENGINE_RELEASE)
@@ -1743,4 +1736,25 @@ TEST_P(Test_TensorFlow_nets, EfficientDet)
     expectNoFallbacksFromIE(net);
 }
 
+TEST(Test_TensorFlow_Importer, tf_graph_simplifier_buffer_overflow_21852)
+{
+    uint8_t payload[] = {0x08, 0x08, 0x0a, 0x00, 0x0a, 0x00};
+    EXPECT_ANY_THROW(readNetFromTensorflow(reinterpret_cast<const char*>(payload), sizeof(payload) / sizeof(payload[0])));
+}
+
+// can be triggered with -fsanitize=address
+TEST(Test_TensorFlow_Importer, tf_graph_simplifier_buffer_overflow_21947)
+{
+    uint8_t payload[] = {0x0a, 0x00, 0x0a, 0x00, 0x0a, 0x00, 0x0a, 0x00, 0x0a, 0x00, 0x0a, 0x00,
+                                0xba, 0x0a, 0x00, 0x0a, 0x00, 0x0a, 0x00, 0x0a, 0x00, 0x0a, 0x00,
+                                0x0a, 0xbd, 0x00, 0x1a, 0x00, 0x0a, 0x00, 0x0a, 0x00, 0xba,
+                                0x0a, 0x00, 0x0a, 0x00, 0x0a, 0x00, 0x0a, 0x00, 0x0a, 0x00, 0x0a, 0x00,
+                                0x0a, 0x00, 0x0a, 0x00, 0x0a, 0x00, 0x0a, 0x00, 0xba, 0x0a, 0x00,
+                                0x0a, 0x00, 0x0a, 0x00, 0x0a, 0x00, 0x0a, 0x00, 0x0a, 0x00, 0xba,
+                                0x0a, 0x00, 0x0a, 0x00, 0x0a, 0x00, 0x0a, 0x00, 0x0a, 0x00, 0x0a, 0x00,
+                                0x0a, 0x00, 0x0a, 0x00, 0x0a, 0x00, 0x2a, 0x00, 0xba, 0x0a, 0x00,
+                                0x0a, 0x00, 0x5d, 0x00, 0x0a, 0x00, 0x0a, 0x00, 0x0a, 0x00, 0x0a, 0x40};
+    EXPECT_ANY_THROW(readNetFromTensorflow(reinterpret_cast<const char*>(payload), sizeof(payload) / sizeof(payload[0])));
+}
+
 }

From 434c96e625f0b2b0681cf63b91c31f3da5ac8559 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Mon, 23 May 2022 19:37:16 +0000
Subject: [PATCH 058/178] build: eliminate warnings

---
 modules/calib3d/perf/perf_undistort.cpp | 22 +++++++++-------------
 modules/calib3d/src/fisheye.cpp         |  2 +-
 modules/calib3d/test/test_fisheye.cpp   | 12 ++++++------
 3 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/modules/calib3d/perf/perf_undistort.cpp b/modules/calib3d/perf/perf_undistort.cpp
index 86f622a92f..fa0d7ea2ec 100644
--- a/modules/calib3d/perf/perf_undistort.cpp
+++ b/modules/calib3d/perf/perf_undistort.cpp
@@ -27,31 +27,27 @@ PERF_TEST(Undistort, DISABLED_InitInverseRectificationMap)
     SANITY_CHECK_NOTHING();
 }
 
-using PerfIntType = perf::TestBaseWithParam<std::tuple<int>>;
-PERF_TEST_P(PerfIntType, fisheye_undistortPoints,
-                                            (testing::Values(1e2, 1e3, 1e4)))
+PERF_TEST(Undistort, fisheye_undistortPoints_100k_10iter)
 {
-    const cv::Size imageSize(1280, 800);
+    const int pointsNumber = 100000;
+    const Size imageSize(1280, 800);
 
     /* Set camera matrix */
-    const cv::Matx33d K(558.478087865323,  0, 620.458515360843,
+    const Matx33d K(558.478087865323,  0, 620.458515360843,
                          0, 560.506767351568, 381.939424848348,
                          0,               0,                1);
 
     /* Set distortion coefficients */
-    Mat D(1, 4, CV_64F);
-    theRNG().fill(D, RNG::UNIFORM, -1.e-5, 1.e-5);
-
-    int pointsNumber = std::get<0>(GetParam());
+    const Matx14d D(2.81e-06, 1.31e-06, -4.42e-06, -1.25e-06);
 
     /* Create two-channel points matrix */
-    cv::Mat xy[2] = {};
+    Mat xy[2] = {};
     xy[0].create(pointsNumber, 1, CV_64F);
-    theRNG().fill(xy[0], cv::RNG::UNIFORM, 0, imageSize.width); // x
+    theRNG().fill(xy[0], RNG::UNIFORM, 0, imageSize.width); // x
     xy[1].create(pointsNumber, 1, CV_64F);
-    theRNG().fill(xy[1], cv::RNG::UNIFORM, 0, imageSize.height); // y
+    theRNG().fill(xy[1], RNG::UNIFORM, 0, imageSize.height); // y
 
-    cv::Mat points;
+    Mat points;
     merge(xy, 2, points);
 
     /* Set fixed iteration number to check only c++ code, not algo convergence */
diff --git a/modules/calib3d/src/fisheye.cpp b/modules/calib3d/src/fisheye.cpp
index 9a73132cbe..346638d4c1 100644
--- a/modules/calib3d/src/fisheye.cpp
+++ b/modules/calib3d/src/fisheye.cpp
@@ -375,7 +375,7 @@ void cv::fisheye::undistortPoints( InputArray distorted, OutputArray undistorted
     size_t n = distorted.total();
     int sdepth = distorted.depth();
 
-    const bool isEps = criteria.type & TermCriteria::EPS;
+    const bool isEps = (criteria.type & TermCriteria::EPS) != 0;
 
     /* Define max count for solver iterations */
     int maxCount = std::numeric_limits<int>::max();
diff --git a/modules/calib3d/test/test_fisheye.cpp b/modules/calib3d/test/test_fisheye.cpp
index 7e40493db3..23cfa98889 100644
--- a/modules/calib3d/test/test_fisheye.cpp
+++ b/modules/calib3d/test/test_fisheye.cpp
@@ -131,20 +131,20 @@ TEST_F(fisheyeTest, distortUndistortPoints)
 
     /* Test with random D set */
     for (size_t i = 0; i < 10; ++i) {
-        cv::Mat D(1, 4, CV_64F);
-        theRNG().fill(D, cv::RNG::UNIFORM, -0.00001, 0.00001);
+        cv::Mat distortion(1, 4, CV_64F);
+        theRNG().fill(distortion, cv::RNG::UNIFORM, -0.00001, 0.00001);
 
         /* Distort -> Undistort */
         cv::Mat distortedPoints;
-        cv::fisheye::distortPoints(points0, distortedPoints, K, D);
+        cv::fisheye::distortPoints(points0, distortedPoints, K, distortion);
         cv::Mat undistortedPoints;
-        cv::fisheye::undistortPoints(distortedPoints, undistortedPoints, K, D);
+        cv::fisheye::undistortPoints(distortedPoints, undistortedPoints, K, distortion);
 
         EXPECT_MAT_NEAR(points0, undistortedPoints, 1e-8);
 
         /* Undistort -> Distort */
-        cv::fisheye::undistortPoints(points0, undistortedPoints, K, D);
-        cv::fisheye::distortPoints(undistortedPoints, distortedPoints, K, D);
+        cv::fisheye::undistortPoints(points0, undistortedPoints, K, distortion);
+        cv::fisheye::distortPoints(undistortedPoints, distortedPoints, K, distortion);
 
         EXPECT_MAT_NEAR(points0, distortedPoints, 1e-8);
     }

From 6d098cc2304834d59ca74c1651c76193f74bf490 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Tue, 24 May 2022 23:44:10 +0000
Subject: [PATCH 059/178] ffmpeg/4.x: update FFmpeg wrapper 2022.05

- FFmpeg 4.4.2
- libvpx 1.11
---
 3rdparty/ffmpeg/ffmpeg.cmake | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/3rdparty/ffmpeg/ffmpeg.cmake b/3rdparty/ffmpeg/ffmpeg.cmake
index f6c1486598..6dcb24db6b 100644
--- a/3rdparty/ffmpeg/ffmpeg.cmake
+++ b/3rdparty/ffmpeg/ffmpeg.cmake
@@ -1,8 +1,8 @@
-# Binaries branch name: ffmpeg/4.x_20211220
-# Binaries were created for OpenCV: 0e274fc4bed8a64ba4f1c201a21304286e217afc
-ocv_update(FFMPEG_BINARIES_COMMIT "4d348507d156ec797a88a887cfa7f9129a35afac")
-ocv_update(FFMPEG_FILE_HASH_BIN32 "eece4ec8304188117ffc7d5dfd0fc0ae")
-ocv_update(FFMPEG_FILE_HASH_BIN64 "20deefbfe023c8b8d11a52e5a6527c6a")
+# Binaries branch name: ffmpeg/4.x_20220524
+# Binaries were created for OpenCV: d6e9616256b46bd59be0a93d397f6ab958d39cd2
+ocv_update(FFMPEG_BINARIES_COMMIT "65ec04d4573dcdfa4531f0b9e67f35d8ffff873e")
+ocv_update(FFMPEG_FILE_HASH_BIN32 "5573e2262ad1298e603122b7759fc2f6")
+ocv_update(FFMPEG_FILE_HASH_BIN64 "5f9e2b2e04c15f080f40e844de80c867")
 ocv_update(FFMPEG_FILE_HASH_CMAKE "8862c87496e2e8c375965e1277dee1c7")
 
 function(download_win_ffmpeg script_var)

From e585453c2ec4a636f3cf0face3c33bf1d254297d Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Tue, 24 May 2022 23:35:45 +0000
Subject: [PATCH 060/178] videoio(test): skip failed tests in misconfigured
 environments

---
 modules/videoio/test/test_audio.cpp    |  4 ++++
 modules/videoio/test/test_video_io.cpp | 14 ++++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/modules/videoio/test/test_audio.cpp b/modules/videoio/test/test_audio.cpp
index b1eb0ed4b7..e077b5bff9 100644
--- a/modules/videoio/test/test_audio.cpp
+++ b/modules/videoio/test/test_audio.cpp
@@ -39,6 +39,10 @@ protected:
         ASSERT_EQ(expectedNumAudioCh, (int)audioData.size());
         for (unsigned int nCh = 0; nCh < audioData.size(); nCh++)
         {
+#ifdef _WIN32
+            if (audioData[nCh].size() == 132924 && numberOfSamples == 131819 && fileName == "test_audio.mp4")
+                throw SkipTestException("Detected failure observed on legacy Windows versions. SKIP");
+#endif
             ASSERT_EQ(numberOfSamples, audioData[nCh].size()) << "nCh=" << nCh;
             for (unsigned int i = 0; i < numberOfSamples; i++)
             {
diff --git a/modules/videoio/test/test_video_io.cpp b/modules/videoio/test/test_video_io.cpp
index f1ead09c23..6661b13c5a 100644
--- a/modules/videoio/test/test_video_io.cpp
+++ b/modules/videoio/test/test_video_io.cpp
@@ -713,6 +713,13 @@ TEST_P(videocapture_acceleration, read)
                 if (filename == "sample_322x242_15frames.yuv420p.libaom-av1.mp4")
                     throw SkipTestException("Unable to read the first frame with AV1 codec (missing support)");
             }
+#ifdef _WIN32
+            if (!read_umat_result && i == 1)
+            {
+                if (filename == "sample_322x242_15frames.yuv420p.libvpx-vp9.mp4")
+                    throw SkipTestException("Unable to read the second frame with VP9 codec (media stack misconfiguration / outdated MSMF version)");
+            }
+#endif
             EXPECT_TRUE(read_umat_result);
             ASSERT_FALSE(umat.empty());
             umat.copyTo(frame);
@@ -728,6 +735,13 @@ TEST_P(videocapture_acceleration, read)
                 if (filename == "sample_322x242_15frames.yuv420p.libaom-av1.mp4")
                     throw SkipTestException("Unable to read the first frame with AV1 codec (missing support)");
             }
+#ifdef _WIN32
+            if (!read_result && i == 1)
+            {
+                if (filename == "sample_322x242_15frames.yuv420p.libvpx-vp9.mp4")
+                    throw SkipTestException("Unable to read the second frame with VP9 codec (media stack misconfiguration / outdated MSMF version)");
+            }
+#endif
             EXPECT_TRUE(read_result);
         }
         ASSERT_FALSE(frame.empty());

From 72debee12504713bb3a03b259a6754e830a4f96a Mon Sep 17 00:00:00 2001
From: mohawk2 <mohawk2@users.noreply.github.com>
Date: Wed, 25 May 2022 15:23:10 +0100
Subject: [PATCH 061/178] typo: Minkowsky is actually Minkowski

---
 modules/flann/include/opencv2/flann.hpp    | 2 +-
 modules/flann/include/opencv2/flann/dist.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/flann/include/opencv2/flann.hpp b/modules/flann/include/opencv2/flann.hpp
index 5c52bfeeb5..049685aecb 100644
--- a/modules/flann/include/opencv2/flann.hpp
+++ b/modules/flann/include/opencv2/flann.hpp
@@ -116,7 +116,7 @@ cv::flann::L2 - Squared Euclidean distance functor, optimized version.
 
 cv::flann::L1 - Manhattan distance functor, optimized version.
 
-cv::flann::MinkowskiDistance -  The Minkowsky distance functor.
+cv::flann::MinkowskiDistance -  The Minkowski distance functor.
 This is highly optimised with loop unrolling.
 The computation of squared root at the end is omitted for efficiency.
 
diff --git a/modules/flann/include/opencv2/flann/dist.h b/modules/flann/include/opencv2/flann/dist.h
index 608f8a507a..2d7cbf13de 100644
--- a/modules/flann/include/opencv2/flann/dist.h
+++ b/modules/flann/include/opencv2/flann/dist.h
@@ -375,7 +375,7 @@ struct MinkowskiDistance
     MinkowskiDistance(int order_) : order(order_) {}
 
     /**
-     *  Compute the Minkowsky (L_p) distance between two vectors.
+     *  Compute the Minkowski (L_p) distance between two vectors.
      *
      *	This is highly optimised, with loop unrolling, as it is one
      *	of the most expensive inner loops.

From 24547f40ff6fb69ae87948369df335593a1b31f8 Mon Sep 17 00:00:00 2001
From: Namgoo Lee <devnglee@gmail.com>
Date: Thu, 26 May 2022 21:30:41 +0900
Subject: [PATCH 062/178] remove const from functions returning by value

---
 modules/core/include/opencv2/core/check.hpp                   | 4 ++--
 .../core/include/opencv2/core/utils/plugin_loader.private.hpp | 2 +-
 modules/core/src/check.cpp                                    | 4 ++--
 modules/core/src/ocl.cpp                                      | 2 +-
 modules/core/src/utils/plugin_loader.impl.hpp                 | 2 +-
 modules/dnn/src/onnx/onnx_importer.cpp                        | 4 ++--
 modules/dnn/src/tensorflow/tf_importer.cpp                    | 4 ++--
 modules/features2d/include/opencv2/features2d.hpp             | 4 ++--
 modules/features2d/src/matchers.cpp                           | 4 ++--
 modules/gapi/include/opencv2/gapi/fluid/gfluidkernel.hpp      | 2 +-
 modules/gapi/include/opencv2/gapi/own/types.hpp               | 2 +-
 modules/stitching/include/opencv2/stitching.hpp               | 4 ++--
 .../include/opencv2/stitching/detail/motion_estimators.hpp    | 2 +-
 13 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/modules/core/include/opencv2/core/check.hpp b/modules/core/include/opencv2/core/check.hpp
index d975223cc5..a32b8111ce 100644
--- a/modules/core/include/opencv2/core/check.hpp
+++ b/modules/core/include/opencv2/core/check.hpp
@@ -13,7 +13,7 @@ namespace cv {
 CV_EXPORTS const char* depthToString(int depth);
 
 /** Returns string of cv::Mat depth value: CV_8UC3 -> "CV_8UC3" or "<invalid type>" */
-CV_EXPORTS const String typeToString(int type);
+CV_EXPORTS String typeToString(int type);
 
 
 //! @cond IGNORED
@@ -23,7 +23,7 @@ namespace detail {
 CV_EXPORTS const char* depthToString_(int depth);
 
 /** Returns string of cv::Mat depth value: CV_8UC3 -> "CV_8UC3" or cv::String() */
-CV_EXPORTS const cv::String typeToString_(int type);
+CV_EXPORTS cv::String typeToString_(int type);
 
 enum TestOp {
   TEST_CUSTOM = 0,
diff --git a/modules/core/include/opencv2/core/utils/plugin_loader.private.hpp b/modules/core/include/opencv2/core/utils/plugin_loader.private.hpp
index d6390fc74a..53b8c48c38 100644
--- a/modules/core/include/opencv2/core/utils/plugin_loader.private.hpp
+++ b/modules/core/include/opencv2/core/utils/plugin_loader.private.hpp
@@ -147,7 +147,7 @@ public:
         return handle != NULL;
     }
     void* getSymbol(const char* symbolName) const;
-    const std::string getName() const;
+    std::string getName() const;
 private:
     void libraryLoad(const FileSystemPath_t& filename);
     void libraryRelease();
diff --git a/modules/core/src/check.cpp b/modules/core/src/check.cpp
index 90df841e63..7f1310ad0a 100644
--- a/modules/core/src/check.cpp
+++ b/modules/core/src/check.cpp
@@ -14,7 +14,7 @@ const char* depthToString(int depth)
     return s ? s : "<invalid depth>";
 }
 
-const cv::String typeToString(int type)
+cv::String typeToString(int type)
 {
     cv::String s = detail::typeToString_(type);
     if (s.empty())
@@ -47,7 +47,7 @@ const char* depthToString_(int depth)
     return (depth <= CV_16F && depth >= 0) ? depthNames[depth] : NULL;
 }
 
-const cv::String typeToString_(int type)
+cv::String typeToString_(int type)
 {
     int depth = CV_MAT_DEPTH(type);
     int cn = CV_MAT_CN(type);
diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
index 1c8d578fda..51acc68879 100644
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -231,7 +231,7 @@ static const bool CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS = utils::getConfigura
 #endif
 );
 
-static const String getBuildExtraOptions()
+static String getBuildExtraOptions()
 {
     static String param_buildExtraOptions;
     static bool initialized = false;
diff --git a/modules/core/src/utils/plugin_loader.impl.hpp b/modules/core/src/utils/plugin_loader.impl.hpp
index 4173c9d802..be04b4c131 100644
--- a/modules/core/src/utils/plugin_loader.impl.hpp
+++ b/modules/core/src/utils/plugin_loader.impl.hpp
@@ -56,7 +56,7 @@ void* DynamicLib::getSymbol(const char* symbolName) const
     return res;
 }
 
-const std::string DynamicLib::getName() const
+std::string DynamicLib::getName() const
 {
     return toPrintablePath(fname);
 }
diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index 4d4783805b..f06ff32dbe 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -205,7 +205,7 @@ private:
         bool param = utils::getConfigurationParameterBool("OPENCV_DNN_ONNX_USE_LEGACY_NAMES", false);
         return param;
     }
-    const std::string extractNodeName(const opencv_onnx::NodeProto& node_proto);
+    std::string extractNodeName(const opencv_onnx::NodeProto& node_proto);
 };
 
 
@@ -922,7 +922,7 @@ const ONNXImporter::DispatchMap& ONNXImporter::getDispatchMap(const opencv_onnx:
     return it->second;
 }
 
-const std::string ONNXImporter::extractNodeName(const opencv_onnx::NodeProto& node_proto)
+std::string ONNXImporter::extractNodeName(const opencv_onnx::NodeProto& node_proto)
 {
     // We need to rework DNN outputs API, this is a workaround for #21698
     if (node_proto.has_name() && !node_proto.name().empty())
diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp
index b6d6a73013..cd413c1ad7 100644
--- a/modules/dnn/src/tensorflow/tf_importer.cpp
+++ b/modules/dnn/src/tensorflow/tf_importer.cpp
@@ -568,7 +568,7 @@ private:
     typedef std::map<std::string, TFImporterNodeParser> DispatchMap;
 
     const DispatchMap dispatch;
-    static const DispatchMap buildDispatchMap();
+    static DispatchMap buildDispatchMap();
 
     void parseConvolution        (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
     void parseBias               (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
@@ -645,7 +645,7 @@ protected:
     TFImporter* importer;
 };
 
-const TFImporter::DispatchMap TFImporter::buildDispatchMap()
+TFImporter::DispatchMap TFImporter::buildDispatchMap()
 {
     static DispatchMap dispatch;
     dispatch["Conv2D"] = dispatch["SpaceToBatchND"] = dispatch["DepthwiseConv2dNative"] =
diff --git a/modules/features2d/include/opencv2/features2d.hpp b/modules/features2d/include/opencv2/features2d.hpp
index 26c2b950c9..952d24ca0c 100644
--- a/modules/features2d/include/opencv2/features2d.hpp
+++ b/modules/features2d/include/opencv2/features2d.hpp
@@ -1143,8 +1143,8 @@ protected:
         virtual void clear();
 
         const Mat& getDescriptors() const;
-        const Mat getDescriptor( int imgIdx, int localDescIdx ) const;
-        const Mat getDescriptor( int globalDescIdx ) const;
+        Mat getDescriptor( int imgIdx, int localDescIdx ) const;
+        Mat getDescriptor( int globalDescIdx ) const;
         void getLocalIdx( int globalDescIdx, int& imgIdx, int& localDescIdx ) const;
 
         int size() const;
diff --git a/modules/features2d/src/matchers.cpp b/modules/features2d/src/matchers.cpp
index 97875d4095..97d0a02417 100644
--- a/modules/features2d/src/matchers.cpp
+++ b/modules/features2d/src/matchers.cpp
@@ -475,7 +475,7 @@ void DescriptorMatcher::DescriptorCollection::clear()
     mergedDescriptors.release();
 }
 
-const Mat DescriptorMatcher::DescriptorCollection::getDescriptor( int imgIdx, int localDescIdx ) const
+Mat DescriptorMatcher::DescriptorCollection::getDescriptor( int imgIdx, int localDescIdx ) const
 {
     CV_Assert( imgIdx < (int)startIdxs.size() );
     int globalIdx = startIdxs[imgIdx] + localDescIdx;
@@ -489,7 +489,7 @@ const Mat& DescriptorMatcher::DescriptorCollection::getDescriptors() const
     return mergedDescriptors;
 }
 
-const Mat DescriptorMatcher::DescriptorCollection::getDescriptor( int globalDescIdx ) const
+Mat DescriptorMatcher::DescriptorCollection::getDescriptor( int globalDescIdx ) const
 {
     CV_Assert( globalDescIdx < size() );
     return mergedDescriptors.row( globalDescIdx );
diff --git a/modules/gapi/include/opencv2/gapi/fluid/gfluidkernel.hpp b/modules/gapi/include/opencv2/gapi/fluid/gfluidkernel.hpp
index 53b68c4e21..92f1ccc87f 100644
--- a/modules/gapi/include/opencv2/gapi/fluid/gfluidkernel.hpp
+++ b/modules/gapi/include/opencv2/gapi/fluid/gfluidkernel.hpp
@@ -181,7 +181,7 @@ template<> struct fluid_get_in<cv::GMat>
 template<> struct fluid_get_in<cv::GScalar>
 {
     // FIXME: change to return by reference when moved to own::Scalar
-    static const cv::Scalar get(const cv::GArgs &in_args, int idx)
+    static cv::Scalar get(const cv::GArgs &in_args, int idx)
     {
         return in_args[idx].unsafe_get<cv::Scalar>();
     }
diff --git a/modules/gapi/include/opencv2/gapi/own/types.hpp b/modules/gapi/include/opencv2/gapi/own/types.hpp
index 38143660bc..3ec9787839 100644
--- a/modules/gapi/include/opencv2/gapi/own/types.hpp
+++ b/modules/gapi/include/opencv2/gapi/own/types.hpp
@@ -89,7 +89,7 @@ inline Rect& operator&=(Rect& lhs, const Rect& rhs)
     return lhs;
 }
 
-inline const Rect operator&(const Rect& lhs, const Rect& rhs)
+inline Rect operator&(const Rect& lhs, const Rect& rhs)
 {
     Rect result = lhs;
     return result &= rhs;
diff --git a/modules/stitching/include/opencv2/stitching.hpp b/modules/stitching/include/opencv2/stitching.hpp
index fb0ebe929d..3d1dbbd044 100644
--- a/modules/stitching/include/opencv2/stitching.hpp
+++ b/modules/stitching/include/opencv2/stitching.hpp
@@ -205,12 +205,12 @@ public:
     void setWaveCorrectKind(detail::WaveCorrectKind kind) { wave_correct_kind_ = kind; }
 
     Ptr<Feature2D> featuresFinder() { return features_finder_; }
-    const Ptr<Feature2D> featuresFinder() const { return features_finder_; }
+    Ptr<Feature2D> featuresFinder() const { return features_finder_; }
     void setFeaturesFinder(Ptr<Feature2D> features_finder)
         { features_finder_ = features_finder; }
 
     Ptr<detail::FeaturesMatcher> featuresMatcher() { return features_matcher_; }
-    const Ptr<detail::FeaturesMatcher> featuresMatcher() const { return features_matcher_; }
+    Ptr<detail::FeaturesMatcher> featuresMatcher() const { return features_matcher_; }
     void setFeaturesMatcher(Ptr<detail::FeaturesMatcher> features_matcher)
         { features_matcher_ = features_matcher; }
 
diff --git a/modules/stitching/include/opencv2/stitching/detail/motion_estimators.hpp b/modules/stitching/include/opencv2/stitching/detail/motion_estimators.hpp
index ad21ee1277..95919ea009 100644
--- a/modules/stitching/include/opencv2/stitching/detail/motion_estimators.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/motion_estimators.hpp
@@ -133,7 +133,7 @@ private:
 class CV_EXPORTS_W BundleAdjusterBase : public Estimator
 {
 public:
-    CV_WRAP const Mat refinementMask() const { return refinement_mask_.clone(); }
+    CV_WRAP Mat refinementMask() const { return refinement_mask_.clone(); }
     CV_WRAP void setRefinementMask(const Mat &mask)
     {
         CV_Assert(mask.type() == CV_8U && mask.size() == Size(3, 3));

From 8ca394efaf89c10753aa85af589560ea024c12c0 Mon Sep 17 00:00:00 2001
From: Lukas-Alexander Weber
 <32765578+lukasalexanderweber@users.noreply.github.com>
Date: Thu, 26 May 2022 21:30:24 +0200
Subject: [PATCH 063/178] Merge pull request #22005 from
 lukasalexanderweber:delete_stitching_tool

Move stitching package and tool to a dedicated repository

* deleted moved files

* Update README.md
---
 apps/opencv_stitching_tool/README.md          |   4 +-
 .../opencv_stitching/.gitignore               |   4 -
 .../opencv_stitching/__init__.py              |   0
 .../opencv_stitching/blender.py               |  56 ---
 .../opencv_stitching/camera_adjuster.py       |  49 ---
 .../opencv_stitching/camera_estimator.py      |  27 --
 .../opencv_stitching/camera_wave_corrector.py |  28 --
 .../opencv_stitching/cropper.py               | 149 -------
 .../exposure_error_compensator.py             |  40 --
 .../opencv_stitching/feature_detector.py      |  44 --
 .../opencv_stitching/feature_matcher.py       |  98 -----
 .../opencv_stitching/image_handler.py         | 105 -----
 .../largest_interior_rectangle.py             | 303 -------------
 .../opencv_stitching/megapix_scaler.py        |  38 --
 .../opencv_stitching/seam_finder.py           | 126 ------
 .../opencv_stitching/stitcher.py              | 236 ----------
 .../opencv_stitching/stitching_error.py       |   2 -
 .../opencv_stitching/subsetter.py             |  94 ----
 .../opencv_stitching/test/.gitignore          |  13 -
 .../test/SAMPLE_IMAGES_TO_DOWNLOAD.txt        |   5 -
 .../test/stitching_detailed.py                | 406 ------------------
 .../opencv_stitching/test/test_composition.py |  67 ---
 .../opencv_stitching/test/test_matcher.py     |  47 --
 .../test/test_megapix_scaler.py               |  58 ---
 .../opencv_stitching/test/test_performance.py |  65 ---
 .../test/test_registration.py                 | 100 -----
 .../opencv_stitching/test/test_stitcher.py    | 108 -----
 .../opencv_stitching/timelapser.py            |  50 ---
 .../opencv_stitching/warper.py                |  79 ----
 .../opencv_stitching_tool.py                  | 238 ----------
 30 files changed, 2 insertions(+), 2637 deletions(-)
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/.gitignore
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/__init__.py
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/blender.py
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/camera_adjuster.py
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/camera_estimator.py
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/camera_wave_corrector.py
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/cropper.py
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/exposure_error_compensator.py
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/feature_detector.py
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/feature_matcher.py
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/image_handler.py
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/largest_interior_rectangle.py
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/megapix_scaler.py
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/seam_finder.py
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/stitcher.py
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/stitching_error.py
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/subsetter.py
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/test/.gitignore
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/test/SAMPLE_IMAGES_TO_DOWNLOAD.txt
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/test/stitching_detailed.py
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/test/test_composition.py
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/test/test_matcher.py
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/test/test_megapix_scaler.py
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/test/test_performance.py
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/test/test_registration.py
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/test/test_stitcher.py
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/timelapser.py
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching/warper.py
 delete mode 100644 apps/opencv_stitching_tool/opencv_stitching_tool.py

diff --git a/apps/opencv_stitching_tool/README.md b/apps/opencv_stitching_tool/README.md
index 1cf3f019d0..2f4ce23625 100644
--- a/apps/opencv_stitching_tool/README.md
+++ b/apps/opencv_stitching_tool/README.md
@@ -1,3 +1,3 @@
-## In-Depth Stitching Tool for experiments and research
+## MOVED: opencv_stitching_tool
 
-Visit [opencv_stitching_tutorial](https://github.com/lukasalexanderweber/opencv_stitching_tutorial) for a detailed Tutorial
+As the stitching package is now available on [PyPI](https://pypi.org/project/stitching/) the tool and belonging package are now maintained [here](https://github.com/lukasalexanderweber/stitching). The Tutorial is maintained [here](https://github.com/lukasalexanderweber/stitching_tutorial).
diff --git a/apps/opencv_stitching_tool/opencv_stitching/.gitignore b/apps/opencv_stitching_tool/opencv_stitching/.gitignore
deleted file mode 100644
index 1f4d07f716..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-# python binary files
-*.pyc
-__pycache__
-.pylint*
diff --git a/apps/opencv_stitching_tool/opencv_stitching/__init__.py b/apps/opencv_stitching_tool/opencv_stitching/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/apps/opencv_stitching_tool/opencv_stitching/blender.py b/apps/opencv_stitching_tool/opencv_stitching/blender.py
deleted file mode 100644
index 8c6d2c5f33..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/blender.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import cv2 as cv
-import numpy as np
-
-
-class Blender:
-
-    BLENDER_CHOICES = ('multiband', 'feather', 'no',)
-    DEFAULT_BLENDER = 'multiband'
-    DEFAULT_BLEND_STRENGTH = 5
-
-    def __init__(self, blender_type=DEFAULT_BLENDER,
-                 blend_strength=DEFAULT_BLEND_STRENGTH):
-        self.blender_type = blender_type
-        self.blend_strength = blend_strength
-        self.blender = None
-
-    def prepare(self, corners, sizes):
-        dst_sz = cv.detail.resultRoi(corners=corners, sizes=sizes)
-        blend_width = (np.sqrt(dst_sz[2] * dst_sz[3]) *
-                       self.blend_strength / 100)
-
-        if self.blender_type == 'no' or blend_width < 1:
-            self.blender = cv.detail.Blender_createDefault(
-                cv.detail.Blender_NO
-                )
-
-        elif self.blender_type == "multiband":
-            self.blender = cv.detail_MultiBandBlender()
-            self.blender.setNumBands(int((np.log(blend_width) /
-                                          np.log(2.) - 1.)))
-
-        elif self.blender_type == "feather":
-            self.blender = cv.detail_FeatherBlender()
-            self.blender.setSharpness(1. / blend_width)
-
-        self.blender.prepare(dst_sz)
-
-    def feed(self, img, mask, corner):
-        """https://docs.opencv.org/4.x/d6/d4a/classcv_1_1detail_1_1Blender.html#a64837308bcf4e414a6219beff6cbe37a"""  # noqa
-        self.blender.feed(cv.UMat(img.astype(np.int16)), mask, corner)
-
-    def blend(self):
-        """https://docs.opencv.org/4.x/d6/d4a/classcv_1_1detail_1_1Blender.html#aa0a91ce0d6046d3a63e0123cbb1b5c00"""  # noqa
-        result = None
-        result_mask = None
-        result, result_mask = self.blender.blend(result, result_mask)
-        result = cv.convertScaleAbs(result)
-        return result, result_mask
-
-    @classmethod
-    def create_panorama(cls, imgs, masks, corners, sizes):
-        blender = cls("no")
-        blender.prepare(corners, sizes)
-        for img, mask, corner in zip(imgs, masks, corners):
-            blender.feed(img, mask, corner)
-        return blender.blend()
diff --git a/apps/opencv_stitching_tool/opencv_stitching/camera_adjuster.py b/apps/opencv_stitching_tool/opencv_stitching/camera_adjuster.py
deleted file mode 100644
index 684fd3d4fa..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/camera_adjuster.py
+++ /dev/null
@@ -1,49 +0,0 @@
-from collections import OrderedDict
-import cv2 as cv
-import numpy as np
-
-from .stitching_error import StitchingError
-
-
-class CameraAdjuster:
-    """https://docs.opencv.org/4.x/d5/d56/classcv_1_1detail_1_1BundleAdjusterBase.html"""  # noqa
-
-    CAMERA_ADJUSTER_CHOICES = OrderedDict()
-    CAMERA_ADJUSTER_CHOICES['ray'] = cv.detail_BundleAdjusterRay
-    CAMERA_ADJUSTER_CHOICES['reproj'] = cv.detail_BundleAdjusterReproj
-    CAMERA_ADJUSTER_CHOICES['affine'] = cv.detail_BundleAdjusterAffinePartial
-    CAMERA_ADJUSTER_CHOICES['no'] = cv.detail_NoBundleAdjuster
-
-    DEFAULT_CAMERA_ADJUSTER = list(CAMERA_ADJUSTER_CHOICES.keys())[0]
-    DEFAULT_REFINEMENT_MASK = "xxxxx"
-
-    def __init__(self,
-                 adjuster=DEFAULT_CAMERA_ADJUSTER,
-                 refinement_mask=DEFAULT_REFINEMENT_MASK):
-
-        self.adjuster = CameraAdjuster.CAMERA_ADJUSTER_CHOICES[adjuster]()
-        self.set_refinement_mask(refinement_mask)
-        self.adjuster.setConfThresh(1)
-
-    def set_refinement_mask(self, refinement_mask):
-        mask_matrix = np.zeros((3, 3), np.uint8)
-        if refinement_mask[0] == 'x':
-            mask_matrix[0, 0] = 1
-        if refinement_mask[1] == 'x':
-            mask_matrix[0, 1] = 1
-        if refinement_mask[2] == 'x':
-            mask_matrix[0, 2] = 1
-        if refinement_mask[3] == 'x':
-            mask_matrix[1, 1] = 1
-        if refinement_mask[4] == 'x':
-            mask_matrix[1, 2] = 1
-        self.adjuster.setRefinementMask(mask_matrix)
-
-    def adjust(self, features, pairwise_matches, estimated_cameras):
-        b, cameras = self.adjuster.apply(features,
-                                         pairwise_matches,
-                                         estimated_cameras)
-        if not b:
-            raise StitchingError("Camera parameters adjusting failed.")
-
-        return cameras
diff --git a/apps/opencv_stitching_tool/opencv_stitching/camera_estimator.py b/apps/opencv_stitching_tool/opencv_stitching/camera_estimator.py
deleted file mode 100644
index 8520eb0ddf..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/camera_estimator.py
+++ /dev/null
@@ -1,27 +0,0 @@
-from collections import OrderedDict
-import cv2 as cv
-import numpy as np
-
-from .stitching_error import StitchingError
-
-
-class CameraEstimator:
-
-    CAMERA_ESTIMATOR_CHOICES = OrderedDict()
-    CAMERA_ESTIMATOR_CHOICES['homography'] = cv.detail_HomographyBasedEstimator
-    CAMERA_ESTIMATOR_CHOICES['affine'] = cv.detail_AffineBasedEstimator
-
-    DEFAULT_CAMERA_ESTIMATOR = list(CAMERA_ESTIMATOR_CHOICES.keys())[0]
-
-    def __init__(self, estimator=DEFAULT_CAMERA_ESTIMATOR, **kwargs):
-        self.estimator = CameraEstimator.CAMERA_ESTIMATOR_CHOICES[estimator](
-            **kwargs
-            )
-
-    def estimate(self, features, pairwise_matches):
-        b, cameras = self.estimator.apply(features, pairwise_matches, None)
-        if not b:
-            raise StitchingError("Homography estimation failed.")
-        for cam in cameras:
-            cam.R = cam.R.astype(np.float32)
-        return cameras
diff --git a/apps/opencv_stitching_tool/opencv_stitching/camera_wave_corrector.py b/apps/opencv_stitching_tool/opencv_stitching/camera_wave_corrector.py
deleted file mode 100644
index 97b821b955..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/camera_wave_corrector.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from collections import OrderedDict
-import cv2 as cv
-import numpy as np
-
-
-class WaveCorrector:
-    """https://docs.opencv.org/4.x/d7/d74/group__stitching__rotation.html#ga83b24d4c3e93584986a56d9e43b9cf7f"""  # noqa
-    WAVE_CORRECT_CHOICES = OrderedDict()
-    WAVE_CORRECT_CHOICES['horiz'] = cv.detail.WAVE_CORRECT_HORIZ
-    WAVE_CORRECT_CHOICES['vert'] = cv.detail.WAVE_CORRECT_VERT
-    WAVE_CORRECT_CHOICES['auto'] = cv.detail.WAVE_CORRECT_AUTO
-    WAVE_CORRECT_CHOICES['no'] = None
-
-    DEFAULT_WAVE_CORRECTION = list(WAVE_CORRECT_CHOICES.keys())[0]
-
-    def __init__(self, wave_correct_kind=DEFAULT_WAVE_CORRECTION):
-        self.wave_correct_kind = WaveCorrector.WAVE_CORRECT_CHOICES[
-            wave_correct_kind
-            ]
-
-    def correct(self, cameras):
-        if self.wave_correct_kind is not None:
-            rmats = [np.copy(cam.R) for cam in cameras]
-            rmats = cv.detail.waveCorrect(rmats, self.wave_correct_kind)
-            for idx, cam in enumerate(cameras):
-                cam.R = rmats[idx]
-            return cameras
-        return cameras
diff --git a/apps/opencv_stitching_tool/opencv_stitching/cropper.py b/apps/opencv_stitching_tool/opencv_stitching/cropper.py
deleted file mode 100644
index 243a6dc7b0..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/cropper.py
+++ /dev/null
@@ -1,149 +0,0 @@
-from collections import namedtuple
-import cv2 as cv
-
-from .blender import Blender
-from .stitching_error import StitchingError
-
-
-class Rectangle(namedtuple('Rectangle', 'x y width height')):
-    __slots__ = ()
-
-    @property
-    def area(self):
-        return self.width * self.height
-
-    @property
-    def corner(self):
-        return (self.x, self.y)
-
-    @property
-    def size(self):
-        return (self.width, self.height)
-
-    @property
-    def x2(self):
-        return self.x + self.width
-
-    @property
-    def y2(self):
-        return self.y + self.height
-
-    def times(self, x):
-        return Rectangle(*(int(round(i*x)) for i in self))
-
-    def draw_on(self, img, color=(0, 0, 255), size=1):
-        if len(img.shape) == 2:
-            img = cv.cvtColor(img, cv.COLOR_GRAY2RGB)
-        start_point = (self.x, self.y)
-        end_point = (self.x2-1, self.y2-1)
-        cv.rectangle(img, start_point, end_point, color, size)
-        return img
-
-
-class Cropper:
-
-    DEFAULT_CROP = False
-
-    def __init__(self, crop=DEFAULT_CROP):
-        self.do_crop = crop
-        self.overlapping_rectangles = []
-        self.cropping_rectangles = []
-
-    def prepare(self, imgs, masks, corners, sizes):
-        if self.do_crop:
-            mask = self.estimate_panorama_mask(imgs, masks, corners, sizes)
-            self.compile_numba_functionality()
-            lir = self.estimate_largest_interior_rectangle(mask)
-            corners = self.get_zero_center_corners(corners)
-            rectangles = self.get_rectangles(corners, sizes)
-            self.overlapping_rectangles = self.get_overlaps(
-                rectangles, lir)
-            self.intersection_rectangles = self.get_intersections(
-                rectangles, self.overlapping_rectangles)
-
-    def crop_images(self, imgs, aspect=1):
-        for idx, img in enumerate(imgs):
-            yield self.crop_img(img, idx, aspect)
-
-    def crop_img(self, img, idx, aspect=1):
-        if self.do_crop:
-            intersection_rect = self.intersection_rectangles[idx]
-            scaled_intersection_rect = intersection_rect.times(aspect)
-            cropped_img = self.crop_rectangle(img, scaled_intersection_rect)
-            return cropped_img
-        return img
-
-    def crop_rois(self, corners, sizes, aspect=1):
-        if self.do_crop:
-            scaled_overlaps = \
-                [r.times(aspect) for r in self.overlapping_rectangles]
-            cropped_corners = [r.corner for r in scaled_overlaps]
-            cropped_corners = self.get_zero_center_corners(cropped_corners)
-            cropped_sizes = [r.size for r in scaled_overlaps]
-            return cropped_corners, cropped_sizes
-        return corners, sizes
-
-    @staticmethod
-    def estimate_panorama_mask(imgs, masks, corners, sizes):
-        _, mask = Blender.create_panorama(imgs, masks, corners, sizes)
-        return mask
-
-    def compile_numba_functionality(self):
-        # numba functionality is only imported if cropping
-        # is explicitely desired
-        try:
-            import numba
-        except ModuleNotFoundError:
-            raise StitchingError("Numba is needed for cropping but not installed")
-        from .largest_interior_rectangle import largest_interior_rectangle
-        self.largest_interior_rectangle = largest_interior_rectangle
-
-    def estimate_largest_interior_rectangle(self, mask):
-        lir = self.largest_interior_rectangle(mask)
-        lir = Rectangle(*lir)
-        return lir
-
-    @staticmethod
-    def get_zero_center_corners(corners):
-        min_corner_x = min([corner[0] for corner in corners])
-        min_corner_y = min([corner[1] for corner in corners])
-        return [(x - min_corner_x, y - min_corner_y) for x, y in corners]
-
-    @staticmethod
-    def get_rectangles(corners, sizes):
-        rectangles = []
-        for corner, size in zip(corners, sizes):
-            rectangle = Rectangle(*corner, *size)
-            rectangles.append(rectangle)
-        return rectangles
-
-    @staticmethod
-    def get_overlaps(rectangles, lir):
-        return [Cropper.get_overlap(r, lir) for r in rectangles]
-
-    @staticmethod
-    def get_overlap(rectangle1, rectangle2):
-        x1 = max(rectangle1.x, rectangle2.x)
-        y1 = max(rectangle1.y, rectangle2.y)
-        x2 = min(rectangle1.x2, rectangle2.x2)
-        y2 = min(rectangle1.y2, rectangle2.y2)
-        if x2 < x1 or y2 < y1:
-            raise StitchingError("Rectangles do not overlap!")
-        return Rectangle(x1, y1, x2-x1, y2-y1)
-
-    @staticmethod
-    def get_intersections(rectangles, overlapping_rectangles):
-        return [Cropper.get_intersection(r, overlap_r) for r, overlap_r
-                in zip(rectangles, overlapping_rectangles)]
-
-    @staticmethod
-    def get_intersection(rectangle, overlapping_rectangle):
-        x = abs(overlapping_rectangle.x - rectangle.x)
-        y = abs(overlapping_rectangle.y - rectangle.y)
-        width = overlapping_rectangle.width
-        height = overlapping_rectangle.height
-        return Rectangle(x, y, width, height)
-
-    @staticmethod
-    def crop_rectangle(img, rectangle):
-        return img[rectangle.y:rectangle.y2, rectangle.x:rectangle.x2]
diff --git a/apps/opencv_stitching_tool/opencv_stitching/exposure_error_compensator.py b/apps/opencv_stitching_tool/opencv_stitching/exposure_error_compensator.py
deleted file mode 100644
index f28fd83f14..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/exposure_error_compensator.py
+++ /dev/null
@@ -1,40 +0,0 @@
-from collections import OrderedDict
-import cv2 as cv
-
-
-class ExposureErrorCompensator:
-
-    COMPENSATOR_CHOICES = OrderedDict()
-    COMPENSATOR_CHOICES['gain_blocks'] = cv.detail.ExposureCompensator_GAIN_BLOCKS  # noqa
-    COMPENSATOR_CHOICES['gain'] = cv.detail.ExposureCompensator_GAIN
-    COMPENSATOR_CHOICES['channel'] = cv.detail.ExposureCompensator_CHANNELS
-    COMPENSATOR_CHOICES['channel_blocks'] = cv.detail.ExposureCompensator_CHANNELS_BLOCKS  # noqa
-    COMPENSATOR_CHOICES['no'] = cv.detail.ExposureCompensator_NO
-
-    DEFAULT_COMPENSATOR = list(COMPENSATOR_CHOICES.keys())[0]
-    DEFAULT_NR_FEEDS = 1
-    DEFAULT_BLOCK_SIZE = 32
-
-    def __init__(self,
-                 compensator=DEFAULT_COMPENSATOR,
-                 nr_feeds=DEFAULT_NR_FEEDS,
-                 block_size=DEFAULT_BLOCK_SIZE):
-
-        if compensator == 'channel':
-            self.compensator = cv.detail_ChannelsCompensator(nr_feeds)
-        elif compensator == 'channel_blocks':
-            self.compensator = cv.detail_BlocksChannelsCompensator(
-                block_size, block_size, nr_feeds
-                )
-        else:
-            self.compensator = cv.detail.ExposureCompensator_createDefault(
-                ExposureErrorCompensator.COMPENSATOR_CHOICES[compensator]
-                )
-
-    def feed(self, *args):
-        """https://docs.opencv.org/4.x/d2/d37/classcv_1_1detail_1_1ExposureCompensator.html#ae6b0cc69a7bc53818ddea53eddb6bdba"""  # noqa
-        self.compensator.feed(*args)
-
-    def apply(self, *args):
-        """https://docs.opencv.org/4.x/d2/d37/classcv_1_1detail_1_1ExposureCompensator.html#a473eaf1e585804c08d77c91e004f93aa"""  # noqa
-        return self.compensator.apply(*args)
diff --git a/apps/opencv_stitching_tool/opencv_stitching/feature_detector.py b/apps/opencv_stitching_tool/opencv_stitching/feature_detector.py
deleted file mode 100644
index 995517b01b..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/feature_detector.py
+++ /dev/null
@@ -1,44 +0,0 @@
-from collections import OrderedDict
-import cv2 as cv
-
-
-class FeatureDetector:
-    DETECTOR_CHOICES = OrderedDict()
-    try:
-        cv.xfeatures2d_SURF.create()  # check if the function can be called
-        DETECTOR_CHOICES['surf'] = cv.xfeatures2d_SURF.create
-    except (AttributeError, cv.error):
-        print("SURF not available")
-
-    # if SURF not available, ORB is default
-    DETECTOR_CHOICES['orb'] = cv.ORB.create
-
-    try:
-        DETECTOR_CHOICES['sift'] = cv.SIFT_create
-    except AttributeError:
-        print("SIFT not available")
-
-    try:
-        DETECTOR_CHOICES['brisk'] = cv.BRISK_create
-    except AttributeError:
-        print("BRISK not available")
-
-    try:
-        DETECTOR_CHOICES['akaze'] = cv.AKAZE_create
-    except AttributeError:
-        print("AKAZE not available")
-
-    DEFAULT_DETECTOR = list(DETECTOR_CHOICES.keys())[0]
-
-    def __init__(self, detector=DEFAULT_DETECTOR, **kwargs):
-        self.detector = FeatureDetector.DETECTOR_CHOICES[detector](**kwargs)
-
-    def detect_features(self, img, *args, **kwargs):
-        return cv.detail.computeImageFeatures2(self.detector, img,
-                                               *args, **kwargs)
-
-    @staticmethod
-    def draw_keypoints(img, features, **kwargs):
-        kwargs.setdefault('color', (0, 255, 0))
-        keypoints = features.getKeypoints()
-        return cv.drawKeypoints(img, keypoints, None, **kwargs)
diff --git a/apps/opencv_stitching_tool/opencv_stitching/feature_matcher.py b/apps/opencv_stitching_tool/opencv_stitching/feature_matcher.py
deleted file mode 100644
index 82ff153d4e..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/feature_matcher.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import math
-import cv2 as cv
-import numpy as np
-
-
-class FeatureMatcher:
-
-    MATCHER_CHOICES = ('homography', 'affine')
-    DEFAULT_MATCHER = 'homography'
-    DEFAULT_RANGE_WIDTH = -1
-
-    def __init__(self,
-                 matcher_type=DEFAULT_MATCHER,
-                 range_width=DEFAULT_RANGE_WIDTH,
-                 **kwargs):
-
-        if matcher_type == "affine":
-            """https://docs.opencv.org/4.x/d3/dda/classcv_1_1detail_1_1AffineBestOf2NearestMatcher.html"""  # noqa
-            self.matcher = cv.detail_AffineBestOf2NearestMatcher(**kwargs)
-        elif range_width == -1:
-            """https://docs.opencv.org/4.x/d4/d26/classcv_1_1detail_1_1BestOf2NearestMatcher.html"""  # noqa
-            self.matcher = cv.detail_BestOf2NearestMatcher(**kwargs)
-        else:
-            """https://docs.opencv.org/4.x/d8/d72/classcv_1_1detail_1_1BestOf2NearestRangeMatcher.html"""  # noqa
-            self.matcher = cv.detail_BestOf2NearestRangeMatcher(
-                range_width, **kwargs
-                )
-
-    def match_features(self, features, *args, **kwargs):
-        pairwise_matches = self.matcher.apply2(features, *args, **kwargs)
-        self.matcher.collectGarbage()
-        return pairwise_matches
-
-    @staticmethod
-    def draw_matches_matrix(imgs, features, matches, conf_thresh=1,
-                            inliers=False, **kwargs):
-        matches_matrix = FeatureMatcher.get_matches_matrix(matches)
-        for idx1, idx2 in FeatureMatcher.get_all_img_combinations(len(imgs)):
-            match = matches_matrix[idx1, idx2]
-            if match.confidence < conf_thresh:
-                continue
-            if inliers:
-                kwargs['matchesMask'] = match.getInliers()
-            yield idx1, idx2, FeatureMatcher.draw_matches(
-                imgs[idx1], features[idx1],
-                imgs[idx2], features[idx2],
-                match,
-                **kwargs
-                )
-
-    @staticmethod
-    def draw_matches(img1, features1, img2, features2, match1to2, **kwargs):
-        kwargs.setdefault('flags', cv.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS)
-
-        keypoints1 = features1.getKeypoints()
-        keypoints2 = features2.getKeypoints()
-        matches = match1to2.getMatches()
-
-        return cv.drawMatches(
-            img1, keypoints1, img2, keypoints2, matches, None, **kwargs
-            )
-
-    @staticmethod
-    def get_matches_matrix(pairwise_matches):
-        return FeatureMatcher.array_in_sqare_matrix(pairwise_matches)
-
-    @staticmethod
-    def get_confidence_matrix(pairwise_matches):
-        matches_matrix = FeatureMatcher.get_matches_matrix(pairwise_matches)
-        match_confs = [[m.confidence for m in row] for row in matches_matrix]
-        match_conf_matrix = np.array(match_confs)
-        return match_conf_matrix
-
-    @staticmethod
-    def array_in_sqare_matrix(array):
-        matrix_dimension = int(math.sqrt(len(array)))
-        rows = []
-        for i in range(0, len(array), matrix_dimension):
-            rows.append(array[i:i+matrix_dimension])
-        return np.array(rows)
-
-    def get_all_img_combinations(number_imgs):
-        ii, jj = np.triu_indices(number_imgs, k=1)
-        for i, j in zip(ii, jj):
-            yield i, j
-
-    @staticmethod
-    def get_match_conf(match_conf, feature_detector_type):
-        if match_conf is None:
-            match_conf = \
-                FeatureMatcher.get_default_match_conf(feature_detector_type)
-        return match_conf
-
-    @staticmethod
-    def get_default_match_conf(feature_detector_type):
-        if feature_detector_type == 'orb':
-            return 0.3
-        return 0.65
diff --git a/apps/opencv_stitching_tool/opencv_stitching/image_handler.py b/apps/opencv_stitching_tool/opencv_stitching/image_handler.py
deleted file mode 100644
index 3be9ff4817..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/image_handler.py
+++ /dev/null
@@ -1,105 +0,0 @@
-import cv2 as cv
-
-from .megapix_scaler import MegapixDownscaler
-from .stitching_error import StitchingError
-
-class ImageHandler:
-
-    DEFAULT_MEDIUM_MEGAPIX = 0.6
-    DEFAULT_LOW_MEGAPIX = 0.1
-    DEFAULT_FINAL_MEGAPIX = -1
-
-    def __init__(self,
-                 medium_megapix=DEFAULT_MEDIUM_MEGAPIX,
-                 low_megapix=DEFAULT_LOW_MEGAPIX,
-                 final_megapix=DEFAULT_FINAL_MEGAPIX):
-
-        if medium_megapix < low_megapix:
-            raise StitchingError("Medium resolution megapix need to be "
-                                 "greater or equal than low resolution "
-                                 "megapix")
-
-        self.medium_scaler = MegapixDownscaler(medium_megapix)
-        self.low_scaler = MegapixDownscaler(low_megapix)
-        self.final_scaler = MegapixDownscaler(final_megapix)
-
-        self.scales_set = False
-        self.img_names = []
-        self.img_sizes = []
-
-    def set_img_names(self, img_names):
-        self.img_names = img_names
-
-    def resize_to_medium_resolution(self):
-        return self.read_and_resize_imgs(self.medium_scaler)
-
-    def resize_to_low_resolution(self, medium_imgs=None):
-        if medium_imgs and self.scales_set:
-            return self.resize_imgs_by_scaler(medium_imgs, self.low_scaler)
-        return self.read_and_resize_imgs(self.low_scaler)
-
-    def resize_to_final_resolution(self):
-        return self.read_and_resize_imgs(self.final_scaler)
-
-    def read_and_resize_imgs(self, scaler):
-        for img, size in self.input_images():
-            yield self.resize_img_by_scaler(scaler, size, img)
-
-    def resize_imgs_by_scaler(self, medium_imgs, scaler):
-        for img, size in zip(medium_imgs, self.img_sizes):
-            yield self.resize_img_by_scaler(scaler, size, img)
-
-    @staticmethod
-    def resize_img_by_scaler(scaler, size, img):
-        desired_size = scaler.get_scaled_img_size(size)
-        return cv.resize(img, desired_size,
-                         interpolation=cv.INTER_LINEAR_EXACT)
-
-    def input_images(self):
-        self.img_sizes = []
-        for name in self.img_names:
-            img = self.read_image(name)
-            size = self.get_image_size(img)
-            self.img_sizes.append(size)
-            self.set_scaler_scales()
-            yield img, size
-
-    @staticmethod
-    def get_image_size(img):
-        """(width, height)"""
-        return (img.shape[1], img.shape[0])
-
-    @staticmethod
-    def read_image(img_name):
-        img = cv.imread(img_name)
-        if img is None:
-            raise StitchingError("Cannot read image " + img_name)
-        return img
-
-    def set_scaler_scales(self):
-        if not self.scales_set:
-            first_img_size = self.img_sizes[0]
-            self.medium_scaler.set_scale_by_img_size(first_img_size)
-            self.low_scaler.set_scale_by_img_size(first_img_size)
-            self.final_scaler.set_scale_by_img_size(first_img_size)
-        self.scales_set = True
-
-    def get_medium_to_final_ratio(self):
-        return self.final_scaler.scale / self.medium_scaler.scale
-
-    def get_medium_to_low_ratio(self):
-        return self.low_scaler.scale / self.medium_scaler.scale
-
-    def get_final_to_low_ratio(self):
-        return self.low_scaler.scale / self.final_scaler.scale
-
-    def get_low_to_final_ratio(self):
-        return self.final_scaler.scale / self.low_scaler.scale
-
-    def get_final_img_sizes(self):
-        return [self.final_scaler.get_scaled_img_size(sz)
-                for sz in self.img_sizes]
-
-    def get_low_img_sizes(self):
-        return [self.low_scaler.get_scaled_img_size(sz)
-                for sz in self.img_sizes]
diff --git a/apps/opencv_stitching_tool/opencv_stitching/largest_interior_rectangle.py b/apps/opencv_stitching_tool/opencv_stitching/largest_interior_rectangle.py
deleted file mode 100644
index 5f0a82f7b9..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/largest_interior_rectangle.py
+++ /dev/null
@@ -1,303 +0,0 @@
-import numpy as np
-import numba as nb
-import cv2 as cv
-
-from .stitching_error import StitchingError
-
-
-def largest_interior_rectangle(cells):
-    outline = get_outline(cells)
-    adjacencies = adjacencies_all_directions(cells)
-    s_map, _, saddle_candidates_map = create_maps(outline, adjacencies)
-    lir1 = biggest_span_in_span_map(s_map)
-
-    candidate_cells = cells_of_interest(saddle_candidates_map)
-    s_map = span_map(adjacencies[0], adjacencies[2], candidate_cells)
-    lir2 = biggest_span_in_span_map(s_map)
-
-    lir = biggest_rectangle(lir1, lir2)
-    return lir
-
-
-def get_outline(cells):
-    contours, hierarchy = \
-        cv.findContours(cells, cv.RETR_TREE, cv.CHAIN_APPROX_NONE)
-    # TODO support multiple contours
-    # test that only one regular contour exists
-    if not hierarchy.shape == (1, 1, 4) or not np.all(hierarchy == -1):
-        raise StitchingError("Invalid Contour. Try without cropping.")
-    contour = contours[0][:, 0, :]
-    x_values = contour[:, 0].astype("uint32", order="C")
-    y_values = contour[:, 1].astype("uint32", order="C")
-    return x_values, y_values
-
-
-@nb.njit('uint32[:,::1](uint8[:,::1], boolean)', parallel=True, cache=True)
-def horizontal_adjacency(cells, direction):
-    result = np.zeros(cells.shape, dtype=np.uint32)
-    for y in nb.prange(cells.shape[0]):
-        span = 0
-        if direction:
-            iterator = range(cells.shape[1]-1, -1, -1)
-        else:
-            iterator = range(cells.shape[1])
-        for x in iterator:
-            if cells[y, x] > 0:
-                span += 1
-            else:
-                span = 0
-            result[y, x] = span
-    return result
-
-
-@nb.njit('uint32[:,::1](uint8[:,::1], boolean)', parallel=True, cache=True)
-def vertical_adjacency(cells, direction):
-    result = np.zeros(cells.shape, dtype=np.uint32)
-    for x in nb.prange(cells.shape[1]):
-        span = 0
-        if direction:
-            iterator = range(cells.shape[0]-1, -1, -1)
-        else:
-            iterator = range(cells.shape[0])
-        for y in iterator:
-            if cells[y, x] > 0:
-                span += 1
-            else:
-                span = 0
-            result[y, x] = span
-    return result
-
-
-@nb.njit(cache=True)
-def adjacencies_all_directions(cells):
-    h_left2right = horizontal_adjacency(cells, 1)
-    h_right2left = horizontal_adjacency(cells, 0)
-    v_top2bottom = vertical_adjacency(cells, 1)
-    v_bottom2top = vertical_adjacency(cells, 0)
-    return h_left2right, h_right2left, v_top2bottom, v_bottom2top
-
-
-@nb.njit('uint32(uint32[:])', cache=True)
-def predict_vector_size(array):
-    zero_indices = np.where(array == 0)[0]
-    if len(zero_indices) == 0:
-        if len(array) == 0:
-            return 0
-        return len(array)
-    return zero_indices[0]
-
-
-@nb.njit('uint32[:](uint32[:,::1], uint32, uint32)', cache=True)
-def h_vector_top2bottom(h_adjacency, x, y):
-    vector_size = predict_vector_size(h_adjacency[y:, x])
-    h_vector = np.zeros(vector_size, dtype=np.uint32)
-    h = np.Inf
-    for p in range(vector_size):
-        h = np.minimum(h_adjacency[y+p, x], h)
-        h_vector[p] = h
-    h_vector = np.unique(h_vector)[::-1]
-    return h_vector
-
-
-@nb.njit('uint32[:](uint32[:,::1], uint32, uint32)', cache=True)
-def h_vector_bottom2top(h_adjacency, x, y):
-    vector_size = predict_vector_size(np.flip(h_adjacency[:y+1, x]))
-    h_vector = np.zeros(vector_size, dtype=np.uint32)
-    h = np.Inf
-    for p in range(vector_size):
-        h = np.minimum(h_adjacency[y-p, x], h)
-        h_vector[p] = h
-    h_vector = np.unique(h_vector)[::-1]
-    return h_vector
-
-
-@nb.njit(cache=True)
-def h_vectors_all_directions(h_left2right, h_right2left, x, y):
-    h_l2r_t2b = h_vector_top2bottom(h_left2right, x, y)
-    h_r2l_t2b = h_vector_top2bottom(h_right2left, x, y)
-    h_l2r_b2t = h_vector_bottom2top(h_left2right, x, y)
-    h_r2l_b2t = h_vector_bottom2top(h_right2left, x, y)
-    return h_l2r_t2b, h_r2l_t2b, h_l2r_b2t, h_r2l_b2t
-
-
-@nb.njit('uint32[:](uint32[:,::1], uint32, uint32)', cache=True)
-def v_vector_left2right(v_adjacency, x, y):
-    vector_size = predict_vector_size(v_adjacency[y, x:])
-    v_vector = np.zeros(vector_size, dtype=np.uint32)
-    v = np.Inf
-    for q in range(vector_size):
-        v = np.minimum(v_adjacency[y, x+q], v)
-        v_vector[q] = v
-    v_vector = np.unique(v_vector)[::-1]
-    return v_vector
-
-
-@nb.njit('uint32[:](uint32[:,::1], uint32, uint32)', cache=True)
-def v_vector_right2left(v_adjacency, x, y):
-    vector_size = predict_vector_size(np.flip(v_adjacency[y, :x+1]))
-    v_vector = np.zeros(vector_size, dtype=np.uint32)
-    v = np.Inf
-    for q in range(vector_size):
-        v = np.minimum(v_adjacency[y, x-q], v)
-        v_vector[q] = v
-    v_vector = np.unique(v_vector)[::-1]
-    return v_vector
-
-
-@nb.njit(cache=True)
-def v_vectors_all_directions(v_top2bottom, v_bottom2top, x, y):
-    v_l2r_t2b = v_vector_left2right(v_top2bottom, x, y)
-    v_r2l_t2b = v_vector_right2left(v_top2bottom, x, y)
-    v_l2r_b2t = v_vector_left2right(v_bottom2top, x, y)
-    v_r2l_b2t = v_vector_right2left(v_bottom2top, x, y)
-    return v_l2r_t2b, v_r2l_t2b, v_l2r_b2t, v_r2l_b2t
-
-
-@nb.njit('uint32[:,:](uint32[:], uint32[:])', cache=True)
-def spans(h_vector, v_vector):
-    spans = np.stack((h_vector, v_vector[::-1]), axis=1)
-    return spans
-
-
-@nb.njit('uint32[:](uint32[:,:])', cache=True)
-def biggest_span(spans):
-    if len(spans) == 0:
-        return np.array([0, 0], dtype=np.uint32)
-    areas = spans[:, 0] * spans[:, 1]
-    biggest_span_index = np.where(areas == np.amax(areas))[0][0]
-    return spans[biggest_span_index]
-
-
-@nb.njit(cache=True)
-def spans_all_directions(h_vectors, v_vectors):
-    span_l2r_t2b = spans(h_vectors[0], v_vectors[0])
-    span_r2l_t2b = spans(h_vectors[1], v_vectors[1])
-    span_l2r_b2t = spans(h_vectors[2], v_vectors[2])
-    span_r2l_b2t = spans(h_vectors[3], v_vectors[3])
-    return span_l2r_t2b, span_r2l_t2b, span_l2r_b2t, span_r2l_b2t
-
-
-@nb.njit(cache=True)
-def get_n_directions(spans_all_directions):
-    n_directions = 1
-    for spans in spans_all_directions:
-        all_x_1 = np.all(spans[:, 0] == 1)
-        all_y_1 = np.all(spans[:, 1] == 1)
-        if not all_x_1 and not all_y_1:
-            n_directions += 1
-    return n_directions
-
-
-@nb.njit(cache=True)
-def get_xy_array(x, y, spans, mode=0):
-    """0 - flip none, 1 - flip x, 2 - flip y, 3 - flip both"""
-    xy = spans.copy()
-    xy[:, 0] = x
-    xy[:, 1] = y
-    if mode == 1:
-        xy[:, 0] = xy[:, 0] - spans[:, 0] + 1
-    if mode == 2:
-        xy[:, 1] = xy[:, 1] - spans[:, 1] + 1
-    if mode == 3:
-        xy[:, 0] = xy[:, 0] - spans[:, 0] + 1
-        xy[:, 1] = xy[:, 1] - spans[:, 1] + 1
-    return xy
-
-
-@nb.njit(cache=True)
-def get_xy_arrays(x, y, spans_all_directions):
-    xy_l2r_t2b = get_xy_array(x, y, spans_all_directions[0], 0)
-    xy_r2l_t2b = get_xy_array(x, y, spans_all_directions[1], 1)
-    xy_l2r_b2t = get_xy_array(x, y, spans_all_directions[2], 2)
-    xy_r2l_b2t = get_xy_array(x, y, spans_all_directions[3], 3)
-    return xy_l2r_t2b, xy_r2l_t2b, xy_l2r_b2t, xy_r2l_b2t
-
-
-@nb.njit(cache=True)
-def point_on_outline(x, y, outline):
-    x_vals, y_vals = outline
-    x_true = x_vals == x
-    y_true = y_vals == y
-    both_true = np.logical_and(x_true, y_true)
-    return np.any(both_true)
-
-
-@nb.njit('Tuple((uint32[:,:,::1], uint8[:,::1], uint8[:,::1]))'
-         '(UniTuple(uint32[:], 2), UniTuple(uint32[:,::1], 4))',
-         parallel=True, cache=True)
-def create_maps(outline, adjacencies):
-    x_values, y_values = outline
-    h_left2right, h_right2left, v_top2bottom, v_bottom2top = adjacencies
-
-    shape = h_left2right.shape
-    span_map = np.zeros(shape + (2,), "uint32")
-    direction_map = np.zeros(shape, "uint8")
-    saddle_candidates_map = np.zeros(shape, "uint8")
-
-    for idx in nb.prange(len(x_values)):
-        x, y = x_values[idx], y_values[idx]
-        h_vectors = h_vectors_all_directions(h_left2right, h_right2left, x, y)
-        v_vectors = v_vectors_all_directions(v_top2bottom, v_bottom2top, x, y)
-        span_arrays = spans_all_directions(h_vectors, v_vectors)
-        n = get_n_directions(span_arrays)
-        direction_map[y, x] = n
-        xy_arrays = get_xy_arrays(x, y, span_arrays)
-        for direction_idx in range(4):
-            xy_array = xy_arrays[direction_idx]
-            span_array = span_arrays[direction_idx]
-            for span_idx in range(span_array.shape[0]):
-                x, y = xy_array[span_idx][0], xy_array[span_idx][1]
-                w, h = span_array[span_idx][0], span_array[span_idx][1]
-                if w*h > span_map[y, x, 0] * span_map[y, x, 1]:
-                    span_map[y, x, :] = np.array([w, h], "uint32")
-                if n == 3 and not point_on_outline(x, y, outline):
-                    saddle_candidates_map[y, x] = np.uint8(255)
-
-    return span_map, direction_map, saddle_candidates_map
-
-
-def cells_of_interest(cells):
-    y_vals, x_vals = cells.nonzero()
-    x_vals = x_vals.astype("uint32", order="C")
-    y_vals = y_vals.astype("uint32", order="C")
-    return x_vals, y_vals
-
-
-@nb.njit('uint32[:, :, :]'
-         '(uint32[:,::1], uint32[:,::1], UniTuple(uint32[:], 2))',
-         parallel=True, cache=True)
-def span_map(h_adjacency_left2right,
-             v_adjacency_top2bottom,
-             cells_of_interest):
-
-    x_values, y_values = cells_of_interest
-
-    span_map = np.zeros(h_adjacency_left2right.shape + (2,), dtype=np.uint32)
-
-    for idx in nb.prange(len(x_values)):
-        x, y = x_values[idx], y_values[idx]
-        h_vector = h_vector_top2bottom(h_adjacency_left2right, x, y)
-        v_vector = v_vector_left2right(v_adjacency_top2bottom, x, y)
-        s = spans(h_vector, v_vector)
-        s = biggest_span(s)
-        span_map[y, x, :] = s
-
-    return span_map
-
-
-@nb.njit('uint32[:](uint32[:, :, :])', cache=True)
-def biggest_span_in_span_map(span_map):
-    areas = span_map[:, :, 0] * span_map[:, :, 1]
-    largest_rectangle_indices = np.where(areas == np.amax(areas))
-    x = largest_rectangle_indices[1][0]
-    y = largest_rectangle_indices[0][0]
-    span = span_map[y, x]
-    return np.array([x, y, span[0], span[1]], dtype=np.uint32)
-
-
-def biggest_rectangle(*args):
-    biggest_rect = np.array([0, 0, 0, 0], dtype=np.uint32)
-    for rect in args:
-        if rect[2] * rect[3] > biggest_rect[2] * biggest_rect[3]:
-            biggest_rect = rect
-    return biggest_rect
diff --git a/apps/opencv_stitching_tool/opencv_stitching/megapix_scaler.py b/apps/opencv_stitching_tool/opencv_stitching/megapix_scaler.py
deleted file mode 100644
index a7be8ad3dc..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/megapix_scaler.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import numpy as np
-
-
-class MegapixScaler:
-    def __init__(self, megapix):
-        self.megapix = megapix
-        self.is_scale_set = False
-        self.scale = None
-
-    def set_scale_by_img_size(self, img_size):
-        self.set_scale(
-            self.get_scale_by_resolution(img_size[0] * img_size[1])
-            )
-
-    def set_scale(self, scale):
-        self.scale = scale
-        self.is_scale_set = True
-
-    def get_scale_by_resolution(self, resolution):
-        if self.megapix > 0:
-            return np.sqrt(self.megapix * 1e6 / resolution)
-        return 1.0
-
-    def get_scaled_img_size(self, img_size):
-        width = int(round(img_size[0] * self.scale))
-        height = int(round(img_size[1] * self.scale))
-        return (width, height)
-
-
-class MegapixDownscaler(MegapixScaler):
-
-    @staticmethod
-    def force_downscale(scale):
-        return min(1.0, scale)
-
-    def set_scale(self, scale):
-        scale = self.force_downscale(scale)
-        super().set_scale(scale)
diff --git a/apps/opencv_stitching_tool/opencv_stitching/seam_finder.py b/apps/opencv_stitching_tool/opencv_stitching/seam_finder.py
deleted file mode 100644
index 2b09264ad6..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/seam_finder.py
+++ /dev/null
@@ -1,126 +0,0 @@
-from collections import OrderedDict
-import cv2 as cv
-import numpy as np
-
-from .blender import Blender
-
-
-class SeamFinder:
-    """https://docs.opencv.org/4.x/d7/d09/classcv_1_1detail_1_1SeamFinder.html"""  # noqa
-    SEAM_FINDER_CHOICES = OrderedDict()
-    SEAM_FINDER_CHOICES['dp_color'] = cv.detail_DpSeamFinder('COLOR')
-    SEAM_FINDER_CHOICES['dp_colorgrad'] = cv.detail_DpSeamFinder('COLOR_GRAD')
-    SEAM_FINDER_CHOICES['voronoi'] = cv.detail.SeamFinder_createDefault(cv.detail.SeamFinder_VORONOI_SEAM)  # noqa
-    SEAM_FINDER_CHOICES['no'] = cv.detail.SeamFinder_createDefault(cv.detail.SeamFinder_NO)  # noqa
-
-    DEFAULT_SEAM_FINDER = list(SEAM_FINDER_CHOICES.keys())[0]
-
-    def __init__(self, finder=DEFAULT_SEAM_FINDER):
-        self.finder = SeamFinder.SEAM_FINDER_CHOICES[finder]
-
-    def find(self, imgs, corners, masks):
-        """https://docs.opencv.org/4.x/d0/dd5/classcv_1_1detail_1_1DpSeamFinder.html#a7914624907986f7a94dd424209a8a609"""  # noqa
-        imgs_float = [img.astype(np.float32) for img in imgs]
-        return self.finder.find(imgs_float, corners, masks)
-
-    @staticmethod
-    def resize(seam_mask, mask):
-        dilated_mask = cv.dilate(seam_mask, None)
-        resized_seam_mask = cv.resize(dilated_mask, (mask.shape[1],
-                                                     mask.shape[0]),
-                                      0, 0, cv.INTER_LINEAR_EXACT)
-        return cv.bitwise_and(resized_seam_mask, mask)
-
-    @staticmethod
-    def draw_seam_mask(img, seam_mask, color=(0, 0, 0)):
-        seam_mask = cv.UMat.get(seam_mask)
-        overlayed_img = np.copy(img)
-        overlayed_img[seam_mask == 0] = color
-        return overlayed_img
-
-    @staticmethod
-    def draw_seam_polygons(panorama, blended_seam_masks, alpha=0.5):
-        return add_weighted_image(panorama, blended_seam_masks, alpha)
-
-    @staticmethod
-    def draw_seam_lines(panorama, blended_seam_masks,
-                        linesize=1, color=(0, 0, 255)):
-        seam_lines = \
-            SeamFinder.exctract_seam_lines(blended_seam_masks, linesize)
-        panorama_with_seam_lines = panorama.copy()
-        panorama_with_seam_lines[seam_lines == 255] = color
-        return panorama_with_seam_lines
-
-    @staticmethod
-    def exctract_seam_lines(blended_seam_masks, linesize=1):
-        seam_lines = cv.Canny(np.uint8(blended_seam_masks), 100, 200)
-        seam_indices = (seam_lines == 255).nonzero()
-        seam_lines = remove_invalid_line_pixels(
-            seam_indices, seam_lines, blended_seam_masks
-            )
-        kernelsize = linesize + linesize - 1
-        kernel = np.ones((kernelsize, kernelsize), np.uint8)
-        return cv.dilate(seam_lines, kernel)
-
-    @staticmethod
-    def blend_seam_masks(seam_masks, corners, sizes):
-        imgs = colored_img_generator(sizes)
-        blended_seam_masks, _ = \
-            Blender.create_panorama(imgs, seam_masks, corners, sizes)
-        return blended_seam_masks
-
-
-def colored_img_generator(sizes, colors=(
-            (255, 000, 000),      # Blue
-            (000, 000, 255),      # Red
-            (000, 255, 000),      # Green
-            (000, 255, 255),      # Yellow
-            (255, 000, 255),      # Magenta
-            (128, 128, 255),      # Pink
-            (128, 128, 128),      # Gray
-            (000, 000, 128),      # Brown
-            (000, 128, 255))      # Orange
-            ):
-    for idx, size in enumerate(sizes):
-        if idx+1 > len(colors):
-            raise ValueError("Not enough default colors! Pass additional "
-                             "colors to \"colors\" parameter")
-        yield create_img_by_size(size, colors[idx])
-
-
-def create_img_by_size(size, color=(0, 0, 0)):
-    width, height = size
-    img = np.zeros((height, width, 3), np.uint8)
-    img[:] = color
-    return img
-
-
-def add_weighted_image(img1, img2, alpha):
-    return cv.addWeighted(
-        img1, alpha, img2, (1.0 - alpha), 0.0
-        )
-
-
-def remove_invalid_line_pixels(indices, lines, mask):
-    for x, y in zip(*indices):
-        if check_if_pixel_or_neighbor_is_black(mask, x, y):
-            lines[x, y] = 0
-    return lines
-
-
-def check_if_pixel_or_neighbor_is_black(img, x, y):
-    check = [is_pixel_black(img, x, y),
-             is_pixel_black(img, x+1, y), is_pixel_black(img, x-1, y),
-             is_pixel_black(img, x, y+1), is_pixel_black(img, x, y-1)]
-    return any(check)
-
-
-def is_pixel_black(img, x, y):
-    return np.all(get_pixel_value(img, x, y) == 0)
-
-
-def get_pixel_value(img, x, y):
-    try:
-        return img[x, y]
-    except IndexError:
-        pass
diff --git a/apps/opencv_stitching_tool/opencv_stitching/stitcher.py b/apps/opencv_stitching_tool/opencv_stitching/stitcher.py
deleted file mode 100644
index 2419092420..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/stitcher.py
+++ /dev/null
@@ -1,236 +0,0 @@
-from types import SimpleNamespace
-
-from .image_handler import ImageHandler
-from .feature_detector import FeatureDetector
-from .feature_matcher import FeatureMatcher
-from .subsetter import Subsetter
-from .camera_estimator import CameraEstimator
-from .camera_adjuster import CameraAdjuster
-from .camera_wave_corrector import WaveCorrector
-from .warper import Warper
-from .cropper import Cropper
-from .exposure_error_compensator import ExposureErrorCompensator
-from .seam_finder import SeamFinder
-from .blender import Blender
-from .timelapser import Timelapser
-from .stitching_error import StitchingError
-
-
-class Stitcher:
-    DEFAULT_SETTINGS = {
-         "medium_megapix": ImageHandler.DEFAULT_MEDIUM_MEGAPIX,
-         "detector": FeatureDetector.DEFAULT_DETECTOR,
-         "nfeatures": 500,
-         "matcher_type": FeatureMatcher.DEFAULT_MATCHER,
-         "range_width": FeatureMatcher.DEFAULT_RANGE_WIDTH,
-         "try_use_gpu": False,
-         "match_conf": None,
-         "confidence_threshold": Subsetter.DEFAULT_CONFIDENCE_THRESHOLD,
-         "matches_graph_dot_file": Subsetter.DEFAULT_MATCHES_GRAPH_DOT_FILE,
-         "estimator": CameraEstimator.DEFAULT_CAMERA_ESTIMATOR,
-         "adjuster": CameraAdjuster.DEFAULT_CAMERA_ADJUSTER,
-         "refinement_mask": CameraAdjuster.DEFAULT_REFINEMENT_MASK,
-         "wave_correct_kind": WaveCorrector.DEFAULT_WAVE_CORRECTION,
-         "warper_type": Warper.DEFAULT_WARP_TYPE,
-         "low_megapix": ImageHandler.DEFAULT_LOW_MEGAPIX,
-         "crop": Cropper.DEFAULT_CROP,
-         "compensator": ExposureErrorCompensator.DEFAULT_COMPENSATOR,
-         "nr_feeds": ExposureErrorCompensator.DEFAULT_NR_FEEDS,
-         "block_size": ExposureErrorCompensator.DEFAULT_BLOCK_SIZE,
-         "finder": SeamFinder.DEFAULT_SEAM_FINDER,
-         "final_megapix": ImageHandler.DEFAULT_FINAL_MEGAPIX,
-         "blender_type": Blender.DEFAULT_BLENDER,
-         "blend_strength": Blender.DEFAULT_BLEND_STRENGTH,
-         "timelapse": Timelapser.DEFAULT_TIMELAPSE}
-
-    def __init__(self, **kwargs):
-        self.initialize_stitcher(**kwargs)
-
-    def initialize_stitcher(self, **kwargs):
-        self.settings = Stitcher.DEFAULT_SETTINGS.copy()
-        self.validate_kwargs(kwargs)
-        self.settings.update(kwargs)
-
-        args = SimpleNamespace(**self.settings)
-        self.img_handler = ImageHandler(args.medium_megapix,
-                                        args.low_megapix,
-                                        args.final_megapix)
-        self.detector = \
-            FeatureDetector(args.detector, nfeatures=args.nfeatures)
-        match_conf = \
-            FeatureMatcher.get_match_conf(args.match_conf, args.detector)
-        self.matcher = FeatureMatcher(args.matcher_type, args.range_width,
-                                      try_use_gpu=args.try_use_gpu,
-                                      match_conf=match_conf)
-        self.subsetter = \
-            Subsetter(args.confidence_threshold, args.matches_graph_dot_file)
-        self.camera_estimator = CameraEstimator(args.estimator)
-        self.camera_adjuster = \
-            CameraAdjuster(args.adjuster, args.refinement_mask)
-        self.wave_corrector = WaveCorrector(args.wave_correct_kind)
-        self.warper = Warper(args.warper_type)
-        self.cropper = Cropper(args.crop)
-        self.compensator = \
-            ExposureErrorCompensator(args.compensator, args.nr_feeds,
-                                     args.block_size)
-        self.seam_finder = SeamFinder(args.finder)
-        self.blender = Blender(args.blender_type, args.blend_strength)
-        self.timelapser = Timelapser(args.timelapse)
-
-    def stitch(self, img_names):
-        self.initialize_registration(img_names)
-        imgs = self.resize_medium_resolution()
-        features = self.find_features(imgs)
-        matches = self.match_features(features)
-        imgs, features, matches = self.subset(imgs, features, matches)
-        cameras = self.estimate_camera_parameters(features, matches)
-        cameras = self.refine_camera_parameters(features, matches, cameras)
-        cameras = self.perform_wave_correction(cameras)
-        self.estimate_scale(cameras)
-
-        imgs = self.resize_low_resolution(imgs)
-        imgs, masks, corners, sizes = self.warp_low_resolution(imgs, cameras)
-        self.prepare_cropper(imgs, masks, corners, sizes)
-        imgs, masks, corners, sizes = \
-            self.crop_low_resolution(imgs, masks, corners, sizes)
-        self.estimate_exposure_errors(corners, imgs, masks)
-        seam_masks = self.find_seam_masks(imgs, corners, masks)
-
-        imgs = self.resize_final_resolution()
-        imgs, masks, corners, sizes = self.warp_final_resolution(imgs, cameras)
-        imgs, masks, corners, sizes = \
-            self.crop_final_resolution(imgs, masks, corners, sizes)
-        self.set_masks(masks)
-        imgs = self.compensate_exposure_errors(corners, imgs)
-        seam_masks = self.resize_seam_masks(seam_masks)
-
-        self.initialize_composition(corners, sizes)
-        self.blend_images(imgs, seam_masks, corners)
-        return self.create_final_panorama()
-
-    def initialize_registration(self, img_names):
-        self.img_handler.set_img_names(img_names)
-
-    def resize_medium_resolution(self):
-        return list(self.img_handler.resize_to_medium_resolution())
-
-    def find_features(self, imgs):
-        return [self.detector.detect_features(img) for img in imgs]
-
-    def match_features(self, features):
-        return self.matcher.match_features(features)
-
-    def subset(self, imgs, features, matches):
-        names, sizes, imgs, features, matches = \
-            self.subsetter.subset(self.img_handler.img_names,
-                                  self.img_handler.img_sizes,
-                                  imgs, features, matches)
-        self.img_handler.img_names, self.img_handler.img_sizes = names, sizes
-        return imgs, features, matches
-
-    def estimate_camera_parameters(self, features, matches):
-        return self.camera_estimator.estimate(features, matches)
-
-    def refine_camera_parameters(self, features, matches, cameras):
-        return self.camera_adjuster.adjust(features, matches, cameras)
-
-    def perform_wave_correction(self, cameras):
-        return self.wave_corrector.correct(cameras)
-
-    def estimate_scale(self, cameras):
-        self.warper.set_scale(cameras)
-
-    def resize_low_resolution(self, imgs=None):
-        return list(self.img_handler.resize_to_low_resolution(imgs))
-
-    def warp_low_resolution(self, imgs, cameras):
-        sizes = self.img_handler.get_low_img_sizes()
-        camera_aspect = self.img_handler.get_medium_to_low_ratio()
-        imgs, masks, corners, sizes = \
-            self.warp(imgs, cameras, sizes, camera_aspect)
-        return list(imgs), list(masks), corners, sizes
-
-    def warp_final_resolution(self, imgs, cameras):
-        sizes = self.img_handler.get_final_img_sizes()
-        camera_aspect = self.img_handler.get_medium_to_final_ratio()
-        return self.warp(imgs, cameras, sizes, camera_aspect)
-
-    def warp(self, imgs, cameras, sizes, aspect=1):
-        imgs = self.warper.warp_images(imgs, cameras, aspect)
-        masks = self.warper.create_and_warp_masks(sizes, cameras, aspect)
-        corners, sizes = self.warper.warp_rois(sizes, cameras, aspect)
-        return imgs, masks, corners, sizes
-
-    def prepare_cropper(self, imgs, masks, corners, sizes):
-        self.cropper.prepare(imgs, masks, corners, sizes)
-
-    def crop_low_resolution(self, imgs, masks, corners, sizes):
-        imgs, masks, corners, sizes = self.crop(imgs, masks, corners, sizes)
-        return list(imgs), list(masks), corners, sizes
-
-    def crop_final_resolution(self, imgs, masks, corners, sizes):
-        lir_aspect = self.img_handler.get_low_to_final_ratio()
-        return self.crop(imgs, masks, corners, sizes, lir_aspect)
-
-    def crop(self, imgs, masks, corners, sizes, aspect=1):
-        masks = self.cropper.crop_images(masks, aspect)
-        imgs = self.cropper.crop_images(imgs, aspect)
-        corners, sizes = self.cropper.crop_rois(corners, sizes, aspect)
-        return imgs, masks, corners, sizes
-
-    def estimate_exposure_errors(self, corners, imgs, masks):
-        self.compensator.feed(corners, imgs, masks)
-
-    def find_seam_masks(self, imgs, corners, masks):
-        return self.seam_finder.find(imgs, corners, masks)
-
-    def resize_final_resolution(self):
-        return self.img_handler.resize_to_final_resolution()
-
-    def compensate_exposure_errors(self, corners, imgs):
-        for idx, (corner, img) in enumerate(zip(corners, imgs)):
-            yield self.compensator.apply(idx, corner, img, self.get_mask(idx))
-
-    def resize_seam_masks(self, seam_masks):
-        for idx, seam_mask in enumerate(seam_masks):
-            yield SeamFinder.resize(seam_mask, self.get_mask(idx))
-
-    def set_masks(self, mask_generator):
-        self.masks = mask_generator
-        self.mask_index = -1
-
-    def get_mask(self, idx):
-        if idx == self.mask_index + 1:
-            self.mask_index += 1
-            self.mask = next(self.masks)
-            return self.mask
-        elif idx == self.mask_index:
-            return self.mask
-        else:
-            raise StitchingError("Invalid Mask Index!")
-
-    def initialize_composition(self, corners, sizes):
-        if self.timelapser.do_timelapse:
-            self.timelapser.initialize(corners, sizes)
-        else:
-            self.blender.prepare(corners, sizes)
-
-    def blend_images(self, imgs, masks, corners):
-        for idx, (img, mask, corner) in enumerate(zip(imgs, masks, corners)):
-            if self.timelapser.do_timelapse:
-                self.timelapser.process_and_save_frame(
-                    self.img_handler.img_names[idx], img, corner
-                    )
-            else:
-                self.blender.feed(img, mask, corner)
-
-    def create_final_panorama(self):
-        if not self.timelapser.do_timelapse:
-            panorama, _ = self.blender.blend()
-            return panorama
-
-    @staticmethod
-    def validate_kwargs(kwargs):
-        for arg in kwargs:
-            if arg not in Stitcher.DEFAULT_SETTINGS:
-                raise StitchingError("Invalid Argument: " + arg)
diff --git a/apps/opencv_stitching_tool/opencv_stitching/stitching_error.py b/apps/opencv_stitching_tool/opencv_stitching/stitching_error.py
deleted file mode 100644
index 6d42e95437..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/stitching_error.py
+++ /dev/null
@@ -1,2 +0,0 @@
-class StitchingError(Exception):
-    pass
diff --git a/apps/opencv_stitching_tool/opencv_stitching/subsetter.py b/apps/opencv_stitching_tool/opencv_stitching/subsetter.py
deleted file mode 100644
index e037984530..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/subsetter.py
+++ /dev/null
@@ -1,94 +0,0 @@
-from itertools import chain
-import math
-import cv2 as cv
-import numpy as np
-
-from .feature_matcher import FeatureMatcher
-from .stitching_error import StitchingError
-
-
-class Subsetter:
-
-    DEFAULT_CONFIDENCE_THRESHOLD = 1
-    DEFAULT_MATCHES_GRAPH_DOT_FILE = None
-
-    def __init__(self,
-                 confidence_threshold=DEFAULT_CONFIDENCE_THRESHOLD,
-                 matches_graph_dot_file=DEFAULT_MATCHES_GRAPH_DOT_FILE):
-        self.confidence_threshold = confidence_threshold
-        self.save_file = matches_graph_dot_file
-
-    def subset(self, img_names, img_sizes, imgs, features, matches):
-        self.save_matches_graph_dot_file(img_names, matches)
-        indices = self.get_indices_to_keep(features, matches)
-
-        img_names = Subsetter.subset_list(img_names, indices)
-        img_sizes = Subsetter.subset_list(img_sizes, indices)
-        imgs = Subsetter.subset_list(imgs, indices)
-        features = Subsetter.subset_list(features, indices)
-        matches = Subsetter.subset_matches(matches, indices)
-        return img_names, img_sizes, imgs, features, matches
-
-    def save_matches_graph_dot_file(self, img_names, pairwise_matches):
-        if self.save_file:
-            with open(self.save_file, 'w') as filehandler:
-                filehandler.write(self.get_matches_graph(img_names,
-                                                         pairwise_matches)
-                                  )
-
-    def get_matches_graph(self, img_names, pairwise_matches):
-        return cv.detail.matchesGraphAsString(img_names, pairwise_matches,
-                                              self.confidence_threshold)
-
-    def get_indices_to_keep(self, features, pairwise_matches):
-        indices = cv.detail.leaveBiggestComponent(features,
-                                                  pairwise_matches,
-                                                  self.confidence_threshold)
-
-        if len(indices) < 2:
-            raise StitchingError("No match exceeds the "
-                                 "given confidence theshold.")
-
-        return indices
-
-    @staticmethod
-    def subset_list(list_to_subset, indices):
-        return [list_to_subset[i] for i in indices]
-
-    @staticmethod
-    def subset_matches(pairwise_matches, indices):
-        indices_to_delete = Subsetter.get_indices_to_delete(
-            math.sqrt(len(pairwise_matches)),
-            indices
-            )
-
-        matches_matrix = FeatureMatcher.get_matches_matrix(pairwise_matches)
-        matches_matrix_subset = Subsetter.subset_matrix(matches_matrix,
-                                                        indices_to_delete)
-        matches_subset = Subsetter.matrix_rows_to_list(matches_matrix_subset)
-
-        return matches_subset
-
-    @staticmethod
-    def get_indices_to_delete(nr_elements, indices_to_keep):
-        return list(set(range(int(nr_elements))) - set(indices_to_keep))
-
-    @staticmethod
-    def subset_matrix(matrix_to_subset, indices_to_delete):
-        for idx, idx_to_delete in enumerate(indices_to_delete):
-            matrix_to_subset = Subsetter.delete_index_from_matrix(
-                matrix_to_subset,
-                idx_to_delete-idx  # matrix shape reduced by one at each step
-                )
-
-        return matrix_to_subset
-
-    @staticmethod
-    def delete_index_from_matrix(matrix, idx):
-        mask = np.ones(matrix.shape[0], bool)
-        mask[idx] = 0
-        return matrix[mask, :][:, mask]
-
-    @staticmethod
-    def matrix_rows_to_list(matrix):
-        return list(chain.from_iterable(matrix.tolist()))
diff --git a/apps/opencv_stitching_tool/opencv_stitching/test/.gitignore b/apps/opencv_stitching_tool/opencv_stitching/test/.gitignore
deleted file mode 100644
index 93426ff2b0..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/test/.gitignore
+++ /dev/null
@@ -1,13 +0,0 @@
-# Ignore everything
-*
-
-# But not these files...
-!.gitignore
-!test_matcher.py
-!test_stitcher.py
-!test_megapix_scaler.py
-!test_registration.py
-!test_composition.py
-!test_performance.py
-!stitching_detailed.py
-!SAMPLE_IMAGES_TO_DOWNLOAD.txt
\ No newline at end of file
diff --git a/apps/opencv_stitching_tool/opencv_stitching/test/SAMPLE_IMAGES_TO_DOWNLOAD.txt b/apps/opencv_stitching_tool/opencv_stitching/test/SAMPLE_IMAGES_TO_DOWNLOAD.txt
deleted file mode 100644
index 236d3607de..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/test/SAMPLE_IMAGES_TO_DOWNLOAD.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-https://github.com/opencv/opencv_extra/tree/4.x/testdata/stitching
-
-s1.jpg s2.jpg
-boat1.jpg boat2.jpg boat3.jpg boat4.jpg boat5.jpg boat6.jpg
-budapest1.jpg budapest2.jpg budapest3.jpg budapest4.jpg budapest5.jpg budapest6.jpg
\ No newline at end of file
diff --git a/apps/opencv_stitching_tool/opencv_stitching/test/stitching_detailed.py b/apps/opencv_stitching_tool/opencv_stitching/test/stitching_detailed.py
deleted file mode 100644
index ef9d78fe73..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/test/stitching_detailed.py
+++ /dev/null
@@ -1,406 +0,0 @@
-"""
-Stitching sample (advanced)
-===========================
-Show how to use Stitcher API from python.
-"""
-
-# Python 2/3 compatibility
-from __future__ import print_function
-
-from types import SimpleNamespace
-from collections import OrderedDict
-
-import cv2 as cv
-import numpy as np
-
-EXPOS_COMP_CHOICES = OrderedDict()
-EXPOS_COMP_CHOICES['gain_blocks'] = cv.detail.ExposureCompensator_GAIN_BLOCKS
-EXPOS_COMP_CHOICES['gain'] = cv.detail.ExposureCompensator_GAIN
-EXPOS_COMP_CHOICES['channel'] = cv.detail.ExposureCompensator_CHANNELS
-EXPOS_COMP_CHOICES['channel_blocks'] = cv.detail.ExposureCompensator_CHANNELS_BLOCKS
-EXPOS_COMP_CHOICES['no'] = cv.detail.ExposureCompensator_NO
-
-BA_COST_CHOICES = OrderedDict()
-BA_COST_CHOICES['ray'] = cv.detail_BundleAdjusterRay
-BA_COST_CHOICES['reproj'] = cv.detail_BundleAdjusterReproj
-BA_COST_CHOICES['affine'] = cv.detail_BundleAdjusterAffinePartial
-BA_COST_CHOICES['no'] = cv.detail_NoBundleAdjuster
-
-FEATURES_FIND_CHOICES = OrderedDict()
-try:
-    cv.xfeatures2d_SURF.create() # check if the function can be called
-    FEATURES_FIND_CHOICES['surf'] = cv.xfeatures2d_SURF.create
-except (AttributeError, cv.error) as e:
-    print("SURF not available")
-# if SURF not available, ORB is default
-FEATURES_FIND_CHOICES['orb'] = cv.ORB.create
-try:
-    FEATURES_FIND_CHOICES['sift'] = cv.xfeatures2d_SIFT.create
-except AttributeError:
-    print("SIFT not available")
-try:
-    FEATURES_FIND_CHOICES['brisk'] = cv.BRISK_create
-except AttributeError:
-    print("BRISK not available")
-try:
-    FEATURES_FIND_CHOICES['akaze'] = cv.AKAZE_create
-except AttributeError:
-    print("AKAZE not available")
-
-SEAM_FIND_CHOICES = OrderedDict()
-SEAM_FIND_CHOICES['dp_color'] = cv.detail_DpSeamFinder('COLOR')
-SEAM_FIND_CHOICES['dp_colorgrad'] = cv.detail_DpSeamFinder('COLOR_GRAD')
-SEAM_FIND_CHOICES['voronoi'] = cv.detail.SeamFinder_createDefault(cv.detail.SeamFinder_VORONOI_SEAM)
-SEAM_FIND_CHOICES['no'] = cv.detail.SeamFinder_createDefault(cv.detail.SeamFinder_NO)
-
-ESTIMATOR_CHOICES = OrderedDict()
-ESTIMATOR_CHOICES['homography'] = cv.detail_HomographyBasedEstimator
-ESTIMATOR_CHOICES['affine'] = cv.detail_AffineBasedEstimator
-
-WARP_CHOICES = (
-    'spherical',
-    'plane',
-    'affine',
-    'cylindrical',
-    'fisheye',
-    'stereographic',
-    'compressedPlaneA2B1',
-    'compressedPlaneA1.5B1',
-    'compressedPlanePortraitA2B1',
-    'compressedPlanePortraitA1.5B1',
-    'paniniA2B1',
-    'paniniA1.5B1',
-    'paniniPortraitA2B1',
-    'paniniPortraitA1.5B1',
-    'mercator',
-    'transverseMercator',
-)
-
-WAVE_CORRECT_CHOICES = OrderedDict()
-WAVE_CORRECT_CHOICES['horiz'] = cv.detail.WAVE_CORRECT_HORIZ
-WAVE_CORRECT_CHOICES['no'] = None
-WAVE_CORRECT_CHOICES['vert'] = cv.detail.WAVE_CORRECT_VERT
-
-BLEND_CHOICES = ('multiband', 'feather', 'no',)
-
-def get_matcher(args):
-    try_cuda = args.try_cuda
-    matcher_type = args.matcher
-    if args.match_conf is None:
-        if args.features == 'orb':
-            match_conf = 0.3
-        else:
-            match_conf = 0.65
-    else:
-        match_conf = args.match_conf
-    range_width = args.rangewidth
-    if matcher_type == "affine":
-        matcher = cv.detail_AffineBestOf2NearestMatcher(False, try_cuda, match_conf)
-    elif range_width == -1:
-        matcher = cv.detail.BestOf2NearestMatcher_create(try_cuda, match_conf)
-    else:
-        matcher = cv.detail.BestOf2NearestRangeMatcher_create(range_width, try_cuda, match_conf)
-    return matcher
-
-
-def get_compensator(args):
-    expos_comp_type = EXPOS_COMP_CHOICES[args.expos_comp]
-    expos_comp_nr_feeds = args.expos_comp_nr_feeds
-    expos_comp_block_size = args.expos_comp_block_size
-    # expos_comp_nr_filtering = args.expos_comp_nr_filtering
-    if expos_comp_type == cv.detail.ExposureCompensator_CHANNELS:
-        compensator = cv.detail_ChannelsCompensator(expos_comp_nr_feeds)
-        # compensator.setNrGainsFilteringIterations(expos_comp_nr_filtering)
-    elif expos_comp_type == cv.detail.ExposureCompensator_CHANNELS_BLOCKS:
-        compensator = cv.detail_BlocksChannelsCompensator(
-            expos_comp_block_size, expos_comp_block_size,
-            expos_comp_nr_feeds
-        )
-        # compensator.setNrGainsFilteringIterations(expos_comp_nr_filtering)
-    else:
-        compensator = cv.detail.ExposureCompensator_createDefault(expos_comp_type)
-    return compensator
-
-
-def main():
-
-    args = {
-        "img_names":["boat5.jpg", "boat2.jpg",
-                     "boat3.jpg", "boat4.jpg",
-                     "boat1.jpg", "boat6.jpg"],
-        "try_cuda": False,
-        "work_megapix": 0.6,
-        "features": "orb",
-        "matcher": "homography",
-        "estimator": "homography",
-        "match_conf": None,
-        "conf_thresh": 1.0,
-        "ba": "ray",
-        "ba_refine_mask": "xxxxx",
-        "wave_correct": "horiz",
-        "save_graph": None,
-        "warp": "spherical",
-        "seam_megapix": 0.1,
-        "seam": "dp_color",
-        "compose_megapix": 3,
-        "expos_comp": "gain_blocks",
-        "expos_comp_nr_feeds": 1,
-        "expos_comp_nr_filtering": 2,
-        "expos_comp_block_size": 32,
-        "blend": "multiband",
-        "blend_strength": 5,
-        "output": "time_test.jpg",
-        "timelapse": None,
-        "rangewidth": -1
-    }
-
-    args = SimpleNamespace(**args)
-    img_names = args.img_names
-    work_megapix = args.work_megapix
-    seam_megapix = args.seam_megapix
-    compose_megapix = args.compose_megapix
-    conf_thresh = args.conf_thresh
-    ba_refine_mask = args.ba_refine_mask
-    wave_correct = WAVE_CORRECT_CHOICES[args.wave_correct]
-    if args.save_graph is None:
-        save_graph = False
-    else:
-        save_graph = True
-    warp_type = args.warp
-    blend_type = args.blend
-    blend_strength = args.blend_strength
-    result_name = args.output
-    if args.timelapse is not None:
-        timelapse = True
-        if args.timelapse == "as_is":
-            timelapse_type = cv.detail.Timelapser_AS_IS
-        elif args.timelapse == "crop":
-            timelapse_type = cv.detail.Timelapser_CROP
-        else:
-            print("Bad timelapse method")
-            exit()
-    else:
-        timelapse = False
-    finder = FEATURES_FIND_CHOICES[args.features]()
-    seam_work_aspect = 1
-    full_img_sizes = []
-    features = []
-    images = []
-    is_work_scale_set = False
-    is_seam_scale_set = False
-    is_compose_scale_set = False
-    for name in img_names:
-        full_img = cv.imread(cv.samples.findFile(name))
-        if full_img is None:
-            print("Cannot read image ", name)
-            exit()
-        full_img_sizes.append((full_img.shape[1], full_img.shape[0]))
-        if work_megapix < 0:
-            img = full_img
-            work_scale = 1
-            is_work_scale_set = True
-        else:
-            if is_work_scale_set is False:
-                work_scale = min(1.0, np.sqrt(work_megapix * 1e6 / (full_img.shape[0] * full_img.shape[1])))
-                is_work_scale_set = True
-            img = cv.resize(src=full_img, dsize=None, fx=work_scale, fy=work_scale, interpolation=cv.INTER_LINEAR_EXACT)
-        if is_seam_scale_set is False:
-            seam_scale = min(1.0, np.sqrt(seam_megapix * 1e6 / (full_img.shape[0] * full_img.shape[1])))
-            seam_work_aspect = seam_scale / work_scale
-            is_seam_scale_set = True
-        img_feat = cv.detail.computeImageFeatures2(finder, img)
-        features.append(img_feat)
-        img = cv.resize(src=full_img, dsize=None, fx=seam_scale, fy=seam_scale, interpolation=cv.INTER_LINEAR_EXACT)
-        images.append(img)
-
-    matcher = get_matcher(args)
-    p = matcher.apply2(features)
-    matcher.collectGarbage()
-
-    if save_graph:
-        with open(args.save_graph, 'w') as fh:
-            fh.write(cv.detail.matchesGraphAsString(img_names, p, conf_thresh))
-
-    indices = cv.detail.leaveBiggestComponent(features, p, conf_thresh)
-    img_subset = []
-    img_names_subset = []
-    full_img_sizes_subset = []
-    for i in range(len(indices)):
-        img_names_subset.append(img_names[indices[i, 0]])
-        img_subset.append(images[indices[i, 0]])
-        full_img_sizes_subset.append(full_img_sizes[indices[i, 0]])
-    images = img_subset
-    img_names = img_names_subset
-    full_img_sizes = full_img_sizes_subset
-    num_images = len(img_names)
-    if num_images < 2:
-        print("Need more images")
-        exit()
-
-    estimator = ESTIMATOR_CHOICES[args.estimator]()
-    b, cameras = estimator.apply(features, p, None)
-    if not b:
-        print("Homography estimation failed.")
-        exit()
-    for cam in cameras:
-        cam.R = cam.R.astype(np.float32)
-
-    adjuster = BA_COST_CHOICES[args.ba]()
-    adjuster.setConfThresh(1)
-    refine_mask = np.zeros((3, 3), np.uint8)
-    if ba_refine_mask[0] == 'x':
-        refine_mask[0, 0] = 1
-    if ba_refine_mask[1] == 'x':
-        refine_mask[0, 1] = 1
-    if ba_refine_mask[2] == 'x':
-        refine_mask[0, 2] = 1
-    if ba_refine_mask[3] == 'x':
-        refine_mask[1, 1] = 1
-    if ba_refine_mask[4] == 'x':
-        refine_mask[1, 2] = 1
-    adjuster.setRefinementMask(refine_mask)
-    b, cameras = adjuster.apply(features, p, cameras)
-    if not b:
-        print("Camera parameters adjusting failed.")
-        exit()
-    focals = []
-    for cam in cameras:
-        focals.append(cam.focal)
-    focals.sort()
-    if len(focals) % 2 == 1:
-        warped_image_scale = focals[len(focals) // 2]
-    else:
-        warped_image_scale = (focals[len(focals) // 2] + focals[len(focals) // 2 - 1]) / 2
-    if wave_correct is not None:
-        rmats = []
-        for cam in cameras:
-            rmats.append(np.copy(cam.R))
-        rmats = cv.detail.waveCorrect(rmats, wave_correct)
-        for idx, cam in enumerate(cameras):
-            cam.R = rmats[idx]
-    corners = []
-    masks_warped = []
-    images_warped = []
-    sizes = []
-    masks = []
-    for i in range(0, num_images):
-        um = cv.UMat(255 * np.ones((images[i].shape[0], images[i].shape[1]), np.uint8))
-        masks.append(um)
-
-    warper = cv.PyRotationWarper(warp_type, warped_image_scale * seam_work_aspect)  # warper could be nullptr?
-    for idx in range(0, num_images):
-        K = cameras[idx].K().astype(np.float32)
-        swa = seam_work_aspect
-        K[0, 0] *= swa
-        K[0, 2] *= swa
-        K[1, 1] *= swa
-        K[1, 2] *= swa
-        corner, image_wp = warper.warp(images[idx], K, cameras[idx].R, cv.INTER_LINEAR, cv.BORDER_REFLECT)
-        corners.append(corner)
-        sizes.append((image_wp.shape[1], image_wp.shape[0]))
-        images_warped.append(image_wp)
-        p, mask_wp = warper.warp(masks[idx], K, cameras[idx].R, cv.INTER_NEAREST, cv.BORDER_CONSTANT)
-        masks_warped.append(mask_wp.get())
-
-    images_warped_f = []
-    for img in images_warped:
-        imgf = img.astype(np.float32)
-        images_warped_f.append(imgf)
-
-    compensator = get_compensator(args)
-    compensator.feed(corners=corners, images=images_warped, masks=masks_warped)
-
-    seam_finder = SEAM_FIND_CHOICES[args.seam]
-    masks_warped = seam_finder.find(images_warped_f, corners, masks_warped)
-    compose_scale = 1
-    corners = []
-    sizes = []
-    blender = None
-    timelapser = None
-    # https://github.com/opencv/opencv/blob/4.x/samples/cpp/stitching_detailed.cpp#L725 ?
-    for idx, name in enumerate(img_names):
-        full_img = cv.imread(name)
-        if not is_compose_scale_set:
-            if compose_megapix > 0:
-                compose_scale = min(1.0, np.sqrt(compose_megapix * 1e6 / (full_img.shape[0] * full_img.shape[1])))
-            is_compose_scale_set = True
-            compose_work_aspect = compose_scale / work_scale
-            warped_image_scale *= compose_work_aspect
-            warper = cv.PyRotationWarper(warp_type, warped_image_scale)
-            for i in range(0, len(img_names)):
-                cameras[i].focal *= compose_work_aspect
-                cameras[i].ppx *= compose_work_aspect
-                cameras[i].ppy *= compose_work_aspect
-                sz = (int(round(full_img_sizes[i][0] * compose_scale)),
-                      int(round(full_img_sizes[i][1] * compose_scale)))
-                K = cameras[i].K().astype(np.float32)
-                roi = warper.warpRoi(sz, K, cameras[i].R)
-                corners.append(roi[0:2])
-                sizes.append(roi[2:4])
-        if abs(compose_scale - 1) > 1e-1:
-            img = cv.resize(src=full_img, dsize=None, fx=compose_scale, fy=compose_scale,
-                            interpolation=cv.INTER_LINEAR_EXACT)
-        else:
-            img = full_img
-        _img_size = (img.shape[1], img.shape[0])
-        K = cameras[idx].K().astype(np.float32)
-        corner, image_warped = warper.warp(img, K, cameras[idx].R, cv.INTER_LINEAR, cv.BORDER_REFLECT)
-        mask = 255 * np.ones((img.shape[0], img.shape[1]), np.uint8)
-        p, mask_warped = warper.warp(mask, K, cameras[idx].R, cv.INTER_NEAREST, cv.BORDER_CONSTANT)
-        compensator.apply(idx, corners[idx], image_warped, mask_warped)
-        image_warped_s = image_warped.astype(np.int16)
-        dilated_mask = cv.dilate(masks_warped[idx], None)
-        seam_mask = cv.resize(dilated_mask, (mask_warped.shape[1], mask_warped.shape[0]), 0, 0, cv.INTER_LINEAR_EXACT)
-        mask_warped = cv.bitwise_and(seam_mask, mask_warped)
-        if blender is None and not timelapse:
-            blender = cv.detail.Blender_createDefault(cv.detail.Blender_NO)
-            dst_sz = cv.detail.resultRoi(corners=corners, sizes=sizes)
-            blend_width = np.sqrt(dst_sz[2] * dst_sz[3]) * blend_strength / 100
-            if blend_width < 1:
-                blender = cv.detail.Blender_createDefault(cv.detail.Blender_NO)
-            elif blend_type == "multiband":
-                blender = cv.detail_MultiBandBlender()
-                blender.setNumBands((np.log(blend_width) / np.log(2.) - 1.).astype(np.int64))
-            elif blend_type == "feather":
-                blender = cv.detail_FeatherBlender()
-                blender.setSharpness(1. / blend_width)
-            blender.prepare(dst_sz)
-        elif timelapser is None and timelapse:
-            timelapser = cv.detail.Timelapser_createDefault(timelapse_type)
-            timelapser.initialize(corners, sizes)
-        if timelapse:
-            ma_tones = np.ones((image_warped_s.shape[0], image_warped_s.shape[1]), np.uint8)
-            timelapser.process(image_warped_s, ma_tones, corners[idx])
-            pos_s = img_names[idx].rfind("/")
-            if pos_s == -1:
-                fixed_file_name = "fixed_" + img_names[idx]
-            else:
-                fixed_file_name = img_names[idx][:pos_s + 1] + "fixed_" + img_names[idx][pos_s + 1:]
-            cv.imwrite(fixed_file_name, timelapser.getDst())
-        else:
-            blender.feed(cv.UMat(image_warped_s), mask_warped, corners[idx])
-    if not timelapse:
-        result = None
-        result_mask = None
-        result, result_mask = blender.blend(result, result_mask)
-        # cv.imwrite(result_name, result)
-        return result
-        # zoom_x = 600.0 / result.shape[1]
-        # dst = cv.normalize(src=result, dst=None, alpha=255., norm_type=cv.NORM_MINMAX, dtype=cv.CV_8U)
-        # dst = cv.resize(dst, dsize=None, fx=zoom_x, fy=zoom_x)
-        # cv.imshow(result_name, dst)
-        # cv.waitKey()
-
-
-
-if __name__ == '__main__':
-    import tracemalloc
-    import time
-    tracemalloc.start()
-    start = time.time()
-    result = main()
-    current, peak = tracemalloc.get_traced_memory()
-    print(f"Current memory usage is {current / 10**6}MB; Peak was {peak / 10**6}MB")
-    tracemalloc.stop()
-    end = time.time()
-    print(end - start)
diff --git a/apps/opencv_stitching_tool/opencv_stitching/test/test_composition.py b/apps/opencv_stitching_tool/opencv_stitching/test/test_composition.py
deleted file mode 100644
index b0b4d76c87..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/test/test_composition.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import unittest
-import os
-import sys
-
-import numpy as np
-import cv2 as cv
-
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__),
-                                                '..', '..')))
-
-from opencv_stitching.stitcher import Stitcher
-
-
-class TestImageComposition(unittest.TestCase):
-
-    #  visual test: look especially in the sky
-    def test_exposure_compensation(self):
-        img = cv.imread("s1.jpg")
-        img = increase_brightness(img, value=25)
-        cv.imwrite("s1_bright.jpg", img)
-
-        stitcher = Stitcher(compensator="no", blender_type="no")
-        result = stitcher.stitch(["s1_bright.jpg", "s2.jpg"])
-
-        cv.imwrite("without_exposure_comp.jpg", result)
-
-        stitcher = Stitcher(blender_type="no")
-        result = stitcher.stitch(["s1_bright.jpg", "s2.jpg"])
-
-        cv.imwrite("with_exposure_comp.jpg", result)
-
-    def test_timelapse(self):
-        stitcher = Stitcher(timelapse='as_is')
-        _ = stitcher.stitch(["s1.jpg", "s2.jpg"])
-        frame1 = cv.imread("fixed_s1.jpg")
-
-        max_image_shape_derivation = 3
-        np.testing.assert_allclose(frame1.shape[:2],
-                                   (700, 1811),
-                                   atol=max_image_shape_derivation)
-
-        left = cv.cvtColor(frame1[:, :1300, ], cv.COLOR_BGR2GRAY)
-        right = cv.cvtColor(frame1[:, 1300:, ], cv.COLOR_BGR2GRAY)
-
-        self.assertGreater(cv.countNonZero(left), 800000)
-        self.assertEqual(cv.countNonZero(right), 0)
-
-
-def increase_brightness(img, value=30):
-    hsv = cv.cvtColor(img, cv.COLOR_BGR2HSV)
-    h, s, v = cv.split(hsv)
-
-    lim = 255 - value
-    v[v > lim] = 255
-    v[v <= lim] += value
-
-    final_hsv = cv.merge((h, s, v))
-    img = cv.cvtColor(final_hsv, cv.COLOR_HSV2BGR)
-    return img
-
-
-def starttest():
-    unittest.main()
-
-
-if __name__ == "__main__":
-    starttest()
diff --git a/apps/opencv_stitching_tool/opencv_stitching/test/test_matcher.py b/apps/opencv_stitching_tool/opencv_stitching/test/test_matcher.py
deleted file mode 100644
index a2424ec9ce..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/test/test_matcher.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import unittest
-import os
-import sys
-
-import numpy as np
-
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__),
-                                                '..', '..')))
-
-from opencv_stitching.feature_matcher import FeatureMatcher
-# %%
-
-
-class TestMatcher(unittest.TestCase):
-
-    def test_array_in_sqare_matrix(self):
-        array = np.zeros(9)
-
-        matrix = FeatureMatcher.array_in_sqare_matrix(array)
-
-        np.testing.assert_array_equal(matrix, np.array([[0., 0., 0.],
-                                                        [0., 0., 0.],
-                                                        [0., 0., 0.]]))
-
-    def test_get_all_img_combinations(self):
-        nimgs = 3
-
-        combinations = list(FeatureMatcher.get_all_img_combinations(nimgs))
-
-        self.assertEqual(combinations, [(0, 1), (0, 2), (1, 2)])
-
-    def test_get_match_conf(self):
-        explicit_match_conf = FeatureMatcher.get_match_conf(1, 'orb')
-        implicit_match_conf_orb = FeatureMatcher.get_match_conf(None, 'orb')
-        implicit_match_conf_other = FeatureMatcher.get_match_conf(None, 'surf')
-
-        self.assertEqual(explicit_match_conf, 1)
-        self.assertEqual(implicit_match_conf_orb, 0.3)
-        self.assertEqual(implicit_match_conf_other, 0.65)
-
-
-def starttest():
-    unittest.main()
-
-
-if __name__ == "__main__":
-    starttest()
diff --git a/apps/opencv_stitching_tool/opencv_stitching/test/test_megapix_scaler.py b/apps/opencv_stitching_tool/opencv_stitching/test/test_megapix_scaler.py
deleted file mode 100644
index 0dc5b8fbbf..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/test/test_megapix_scaler.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import unittest
-import os
-import sys
-
-import cv2 as cv
-
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__),
-                                                '..', '..')))
-
-from opencv_stitching.megapix_scaler import MegapixScaler, MegapixDownscaler
-# %%
-
-
-class TestScaler(unittest.TestCase):
-
-    def setUp(self):
-        self.img = cv.imread("s1.jpg")
-        self.size = (self.img.shape[1], self.img.shape[0])
-
-    def test_get_scale_by_resolution(self):
-        scaler = MegapixScaler(0.6)
-
-        scale = scaler.get_scale_by_resolution(1_200_000)
-
-        self.assertEqual(scale, 0.7071067811865476)
-
-    def test_get_scale_by_image(self):
-        scaler = MegapixScaler(0.6)
-
-        scaler.set_scale_by_img_size(self.size)
-
-        self.assertEqual(scaler.scale, 0.8294067854101966)
-
-    def test_get_scaled_img_size(self):
-        scaler = MegapixScaler(0.6)
-        scaler.set_scale_by_img_size(self.size)
-
-        size = scaler.get_scaled_img_size(self.size)
-        self.assertEqual(size, (1033, 581))
-        # 581*1033 = 600173 px = ~0.6 MP
-
-    def test_force_of_downscaling(self):
-        normal_scaler = MegapixScaler(2)
-        downscaler = MegapixDownscaler(2)
-
-        normal_scaler.set_scale_by_img_size(self.size)
-        downscaler.set_scale_by_img_size(self.size)
-
-        self.assertEqual(normal_scaler.scale, 1.5142826857233715)
-        self.assertEqual(downscaler.scale, 1.0)
-
-
-def starttest():
-    unittest.main()
-
-
-if __name__ == "__main__":
-    starttest()
diff --git a/apps/opencv_stitching_tool/opencv_stitching/test/test_performance.py b/apps/opencv_stitching_tool/opencv_stitching/test/test_performance.py
deleted file mode 100644
index 2028ed8b5c..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/test/test_performance.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import unittest
-import os
-import sys
-import time
-import tracemalloc
-
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__),
-                                                '..', '..')))
-
-from opencv_stitching.stitcher import Stitcher
-from stitching_detailed import main
-# %%
-
-
-class TestStitcher(unittest.TestCase):
-
-    @unittest.skip("skip performance test (not needed in every run)")
-    def test_performance(self):
-
-        print("Run new Stitcher class:")
-
-        start = time.time()
-        tracemalloc.start()
-
-        stitcher = Stitcher(final_megapix=3)
-        stitcher.stitch(["boat5.jpg", "boat2.jpg",
-                         "boat3.jpg", "boat4.jpg",
-                         "boat1.jpg", "boat6.jpg"])
-
-        _, peak_memory = tracemalloc.get_traced_memory()
-        tracemalloc.stop()
-        end = time.time()
-        time_needed = end - start
-
-        print(f"Peak was {peak_memory / 10**6} MB")
-        print(f"Time was {time_needed} s")
-
-        print("Run original stitching_detailed.py:")
-
-        start = time.time()
-        tracemalloc.start()
-
-        main()
-
-        _, peak_memory_detailed = tracemalloc.get_traced_memory()
-        tracemalloc.stop()
-        end = time.time()
-        time_needed_detailed = end - start
-
-        print(f"Peak was {peak_memory_detailed / 10**6} MB")
-        print(f"Time was {time_needed_detailed} s")
-
-        self.assertLessEqual(peak_memory / 10**6,
-                             peak_memory_detailed / 10**6)
-        uncertainty_based_on_run = 0.25
-        self.assertLessEqual(time_needed - uncertainty_based_on_run,
-                             time_needed_detailed)
-
-
-def starttest():
-    unittest.main()
-
-
-if __name__ == "__main__":
-    starttest()
diff --git a/apps/opencv_stitching_tool/opencv_stitching/test/test_registration.py b/apps/opencv_stitching_tool/opencv_stitching/test/test_registration.py
deleted file mode 100644
index 15b851e433..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/test/test_registration.py
+++ /dev/null
@@ -1,100 +0,0 @@
-import unittest
-import os
-import sys
-
-import numpy as np
-import cv2 as cv
-
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__),
-                                                '..', '..')))
-
-from opencv_stitching.feature_detector import FeatureDetector
-from opencv_stitching.feature_matcher import FeatureMatcher
-from opencv_stitching.subsetter import Subsetter
-
-
-class TestImageRegistration(unittest.TestCase):
-
-    def test_feature_detector(self):
-        img1 = cv.imread("s1.jpg")
-
-        default_number_of_keypoints = 500
-        detector = FeatureDetector("orb")
-        features = detector.detect_features(img1)
-        self.assertEqual(len(features.getKeypoints()),
-                         default_number_of_keypoints)
-
-        other_keypoints = 1000
-        detector = FeatureDetector("orb", nfeatures=other_keypoints)
-        features = detector.detect_features(img1)
-        self.assertEqual(len(features.getKeypoints()), other_keypoints)
-
-    def test_feature_matcher(self):
-        img1, img2 = cv.imread("s1.jpg"), cv.imread("s2.jpg")
-
-        detector = FeatureDetector("orb")
-        features = [detector.detect_features(img1),
-                    detector.detect_features(img2)]
-
-        matcher = FeatureMatcher()
-        pairwise_matches = matcher.match_features(features)
-        self.assertEqual(len(pairwise_matches), len(features)**2)
-        self.assertGreater(pairwise_matches[1].confidence, 2)
-
-        matches_matrix = FeatureMatcher.get_matches_matrix(pairwise_matches)
-        self.assertEqual(matches_matrix.shape, (2, 2))
-        conf_matrix = FeatureMatcher.get_confidence_matrix(pairwise_matches)
-        self.assertTrue(np.array_equal(
-            conf_matrix > 2,
-            np.array([[False,  True], [True, False]])
-            ))
-
-    def test_subsetting(self):
-        img1, img2 = cv.imread("s1.jpg"), cv.imread("s2.jpg")
-        img3, img4 = cv.imread("boat1.jpg"), cv.imread("boat2.jpg")
-        img5 = cv.imread("boat3.jpg")
-        img_names = ["s1.jpg", "s2.jpg", "boat1.jpg", "boat2.jpg", "boat3.jpg"]
-
-        detector = FeatureDetector("orb")
-        features = [detector.detect_features(img1),
-                    detector.detect_features(img2),
-                    detector.detect_features(img3),
-                    detector.detect_features(img4),
-                    detector.detect_features(img5)]
-        matcher = FeatureMatcher()
-        pairwise_matches = matcher.match_features(features)
-        subsetter = Subsetter(confidence_threshold=1,
-                              matches_graph_dot_file="dot_graph.txt")  # view in https://dreampuf.github.io  # noqa
-
-        indices = subsetter.get_indices_to_keep(features, pairwise_matches)
-        indices_to_delete = subsetter.get_indices_to_delete(len(img_names),
-                                                            indices)
-
-        np.testing.assert_array_equal(indices, np.array([2, 3, 4]))
-        np.testing.assert_array_equal(indices_to_delete, np.array([0, 1]))
-
-        subsetted_image_names = subsetter.subset_list(img_names, indices)
-        self.assertEqual(subsetted_image_names,
-                         ['boat1.jpg', 'boat2.jpg', 'boat3.jpg'])
-
-        matches_subset = subsetter.subset_matches(pairwise_matches, indices)
-        # FeatureMatcher.get_confidence_matrix(pairwise_matches)
-        # FeatureMatcher.get_confidence_matrix(subsetted_matches)
-        self.assertEqual(pairwise_matches[13].confidence,
-                         matches_subset[1].confidence)
-
-        graph = subsetter.get_matches_graph(img_names, pairwise_matches)
-        self.assertTrue(graph.startswith("graph matches_graph{"))
-
-        subsetter.save_matches_graph_dot_file(img_names, pairwise_matches)
-        with open('dot_graph.txt', 'r') as file:
-            graph = file.read()
-            self.assertTrue(graph.startswith("graph matches_graph{"))
-
-
-def starttest():
-    unittest.main()
-
-
-if __name__ == "__main__":
-    starttest()
diff --git a/apps/opencv_stitching_tool/opencv_stitching/test/test_stitcher.py b/apps/opencv_stitching_tool/opencv_stitching/test/test_stitcher.py
deleted file mode 100644
index d97300dadd..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/test/test_stitcher.py
+++ /dev/null
@@ -1,108 +0,0 @@
-import unittest
-import os
-import sys
-
-import numpy as np
-import cv2 as cv
-
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__),
-                                                '..', '..')))
-
-from opencv_stitching.stitcher import Stitcher
-# %%
-
-
-class TestStitcher(unittest.TestCase):
-
-    def test_stitcher_aquaduct(self):
-        stitcher = Stitcher(nfeatures=250)
-        result = stitcher.stitch(["s1.jpg", "s2.jpg"])
-        cv.imwrite("result.jpg", result)
-
-        max_image_shape_derivation = 3
-        np.testing.assert_allclose(result.shape[:2],
-                                   (700, 1811),
-                                   atol=max_image_shape_derivation)
-
-    @unittest.skip("skip boat test (high resuolution ran >30s)")
-    def test_stitcher_boat1(self):
-        settings = {"warper_type": "fisheye",
-                    "wave_correct_kind": "no",
-                    "finder": "dp_colorgrad",
-                    "compensator": "no",
-                    "confidence_threshold": 0.3}
-
-        stitcher = Stitcher(**settings)
-        result = stitcher.stitch(["boat5.jpg", "boat2.jpg",
-                                  "boat3.jpg", "boat4.jpg",
-                                  "boat1.jpg", "boat6.jpg"])
-
-        cv.imwrite("boat_fisheye.jpg", result)
-
-        max_image_shape_derivation = 600
-        np.testing.assert_allclose(result.shape[:2],
-                                   (14488,  7556),
-                                   atol=max_image_shape_derivation)
-
-    @unittest.skip("skip boat test (high resuolution ran >30s)")
-    def test_stitcher_boat2(self):
-        settings = {"warper_type": "compressedPlaneA2B1",
-                    "finder": "dp_colorgrad",
-                    "compensator": "channel_blocks",
-                    "confidence_threshold": 0.3}
-
-        stitcher = Stitcher(**settings)
-        result = stitcher.stitch(["boat5.jpg", "boat2.jpg",
-                                  "boat3.jpg", "boat4.jpg",
-                                  "boat1.jpg", "boat6.jpg"])
-
-        cv.imwrite("boat_plane.jpg", result)
-
-        max_image_shape_derivation = 600
-        np.testing.assert_allclose(result.shape[:2],
-                                   (7400, 12340),
-                                   atol=max_image_shape_derivation)
-
-    def test_stitcher_boat_aquaduct_subset(self):
-        settings = {"final_megapix": 1, "crop": True}
-
-        stitcher = Stitcher(**settings)
-        result = stitcher.stitch(["boat5.jpg",
-                                  "s1.jpg", "s2.jpg",
-                                  "boat2.jpg",
-                                  "boat3.jpg", "boat4.jpg",
-                                  "boat1.jpg", "boat6.jpg"])
-        cv.imwrite("subset_low_res.jpg", result)
-
-        max_image_shape_derivation = 100
-        np.testing.assert_allclose(result.shape[:2],
-                                   (705, 3374),
-                                   atol=max_image_shape_derivation)
-
-    def test_stitcher_budapest(self):
-        settings = {"matcher_type": "affine",
-                    "estimator": "affine",
-                    "adjuster": "affine",
-                    "warper_type": "affine",
-                    "wave_correct_kind": "no",
-                    "confidence_threshold": 0.3}
-
-        stitcher = Stitcher(**settings)
-        result = stitcher.stitch(["budapest1.jpg", "budapest2.jpg",
-                                  "budapest3.jpg", "budapest4.jpg",
-                                  "budapest5.jpg", "budapest6.jpg"])
-
-        cv.imwrite("budapest.jpg", result)
-
-        max_image_shape_derivation = 50
-        np.testing.assert_allclose(result.shape[:2],
-                                   (1155, 2310),
-                                   atol=max_image_shape_derivation)
-
-
-def starttest():
-    unittest.main()
-
-
-if __name__ == "__main__":
-    starttest()
diff --git a/apps/opencv_stitching_tool/opencv_stitching/timelapser.py b/apps/opencv_stitching_tool/opencv_stitching/timelapser.py
deleted file mode 100644
index 894294bd42..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/timelapser.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import os
-import cv2 as cv
-import numpy as np
-
-
-class Timelapser:
-
-    TIMELAPSE_CHOICES = ('no', 'as_is', 'crop',)
-    DEFAULT_TIMELAPSE = 'no'
-
-    def __init__(self, timelapse=DEFAULT_TIMELAPSE):
-        self.do_timelapse = True
-        self.timelapse_type = None
-        self.timelapser = None
-
-        if timelapse == "as_is":
-            self.timelapse_type = cv.detail.Timelapser_AS_IS
-        elif timelapse == "crop":
-            self.timelapse_type = cv.detail.Timelapser_CROP
-        else:
-            self.do_timelapse = False
-
-        if self.do_timelapse:
-            self.timelapser = cv.detail.Timelapser_createDefault(
-                self.timelapse_type
-                )
-
-    def initialize(self, *args):
-        """https://docs.opencv.org/4.x/dd/dac/classcv_1_1detail_1_1Timelapser.html#aaf0f7c4128009f02473332a0c41f6345"""  # noqa
-        self.timelapser.initialize(*args)
-
-    def process_and_save_frame(self, img_name, img, corner):
-        self.process_frame(img, corner)
-        cv.imwrite(self.get_fixed_filename(img_name), self.get_frame())
-
-    def process_frame(self, img, corner):
-        mask = np.ones((img.shape[0], img.shape[1]), np.uint8)
-        img = img.astype(np.int16)
-        self.timelapser.process(img, mask, corner)
-
-    def get_frame(self):
-        frame = self.timelapser.getDst()
-        frame = np.float32(cv.UMat.get(frame))
-        frame = cv.convertScaleAbs(frame)
-        return frame
-
-    @staticmethod
-    def get_fixed_filename(img_name):
-        dirname, filename = os.path.split(img_name)
-        return os.path.join(dirname, "fixed_" + filename)
diff --git a/apps/opencv_stitching_tool/opencv_stitching/warper.py b/apps/opencv_stitching_tool/opencv_stitching/warper.py
deleted file mode 100644
index 0d915e76ea..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching/warper.py
+++ /dev/null
@@ -1,79 +0,0 @@
-from statistics import median
-
-import cv2 as cv
-import numpy as np
-
-
-class Warper:
-
-    WARP_TYPE_CHOICES = ('spherical', 'plane', 'affine', 'cylindrical',
-                         'fisheye', 'stereographic', 'compressedPlaneA2B1',
-                         'compressedPlaneA1.5B1',
-                         'compressedPlanePortraitA2B1',
-                         'compressedPlanePortraitA1.5B1',
-                         'paniniA2B1', 'paniniA1.5B1', 'paniniPortraitA2B1',
-                         'paniniPortraitA1.5B1', 'mercator',
-                         'transverseMercator')
-
-    DEFAULT_WARP_TYPE = 'spherical'
-
-    def __init__(self, warper_type=DEFAULT_WARP_TYPE):
-        self.warper_type = warper_type
-        self.scale = None
-
-    def set_scale(self, cameras):
-        focals = [cam.focal for cam in cameras]
-        self.scale = median(focals)
-
-    def warp_images(self, imgs, cameras, aspect=1):
-        for img, camera in zip(imgs, cameras):
-            yield self.warp_image(img, camera, aspect)
-
-    def warp_image(self, img, camera, aspect=1):
-        warper = cv.PyRotationWarper(self.warper_type, self.scale*aspect)
-        _, warped_image = warper.warp(img,
-                                      Warper.get_K(camera, aspect),
-                                      camera.R,
-                                      cv.INTER_LINEAR,
-                                      cv.BORDER_REFLECT)
-        return warped_image
-
-    def create_and_warp_masks(self, sizes, cameras, aspect=1):
-        for size, camera in zip(sizes, cameras):
-            yield self.create_and_warp_mask(size, camera, aspect)
-
-    def create_and_warp_mask(self, size, camera, aspect=1):
-        warper = cv.PyRotationWarper(self.warper_type, self.scale*aspect)
-        mask = 255 * np.ones((size[1], size[0]), np.uint8)
-        _, warped_mask = warper.warp(mask,
-                                     Warper.get_K(camera, aspect),
-                                     camera.R,
-                                     cv.INTER_NEAREST,
-                                     cv.BORDER_CONSTANT)
-        return warped_mask
-
-    def warp_rois(self, sizes, cameras, aspect=1):
-        roi_corners = []
-        roi_sizes = []
-        for size, camera in zip(sizes, cameras):
-            roi = self.warp_roi(size, camera, aspect)
-            roi_corners.append(roi[0:2])
-            roi_sizes.append(roi[2:4])
-        return roi_corners, roi_sizes
-
-    def warp_roi(self, size, camera, aspect=1):
-        warper = cv.PyRotationWarper(self.warper_type, self.scale*aspect)
-        K = Warper.get_K(camera, aspect)
-        return warper.warpRoi(size, K, camera.R)
-
-    @staticmethod
-    def get_K(camera, aspect=1):
-        K = camera.K().astype(np.float32)
-        """ Modification of intrinsic parameters needed if cameras were
-        obtained on different scale than the scale of the Images which should
-        be warped """
-        K[0, 0] *= aspect
-        K[0, 2] *= aspect
-        K[1, 1] *= aspect
-        K[1, 2] *= aspect
-        return K
diff --git a/apps/opencv_stitching_tool/opencv_stitching_tool.py b/apps/opencv_stitching_tool/opencv_stitching_tool.py
deleted file mode 100644
index 2e41c11b87..0000000000
--- a/apps/opencv_stitching_tool/opencv_stitching_tool.py
+++ /dev/null
@@ -1,238 +0,0 @@
-"""
-Stitching sample (advanced)
-===========================
-
-Show how to use Stitcher API from python.
-"""
-
-# Python 2/3 compatibility
-from __future__ import print_function
-
-import argparse
-
-import cv2 as cv
-import numpy as np
-
-from opencv_stitching.stitcher import Stitcher
-
-from opencv_stitching.image_handler import ImageHandler
-from opencv_stitching.feature_detector import FeatureDetector
-from opencv_stitching.feature_matcher import FeatureMatcher
-from opencv_stitching.subsetter import Subsetter
-from opencv_stitching.camera_estimator import CameraEstimator
-from opencv_stitching.camera_adjuster import CameraAdjuster
-from opencv_stitching.camera_wave_corrector import WaveCorrector
-from opencv_stitching.warper import Warper
-from opencv_stitching.cropper import Cropper
-from opencv_stitching.exposure_error_compensator import ExposureErrorCompensator  # noqa
-from opencv_stitching.seam_finder import SeamFinder
-from opencv_stitching.blender import Blender
-from opencv_stitching.timelapser import Timelapser
-
-parser = argparse.ArgumentParser(
-    prog="opencv_stitching_tool.py",
-    description="Rotation model images stitcher"
-)
-parser.add_argument(
-    'img_names', nargs='+',
-    help="Files to stitch", type=str
-)
-parser.add_argument(
-    '--medium_megapix', action='store',
-    default=ImageHandler.DEFAULT_MEDIUM_MEGAPIX,
-    help="Resolution for image registration step. "
-    "The default is %s Mpx" % ImageHandler.DEFAULT_MEDIUM_MEGAPIX,
-    type=float, dest='medium_megapix'
-)
-parser.add_argument(
-    '--detector', action='store',
-    default=FeatureDetector.DEFAULT_DETECTOR,
-    help="Type of features used for images matching. "
-         "The default is '%s'." % FeatureDetector.DEFAULT_DETECTOR,
-    choices=FeatureDetector.DETECTOR_CHOICES.keys(),
-    type=str, dest='detector'
-)
-parser.add_argument(
-    '--nfeatures', action='store',
-    default=500,
-    help="Type of features used for images matching. "
-         "The default is 500.",
-    type=int, dest='nfeatures'
-)
-parser.add_argument(
-    '--matcher_type', action='store', default=FeatureMatcher.DEFAULT_MATCHER,
-    help="Matcher used for pairwise image matching. "
-         "The default is '%s'." % FeatureMatcher.DEFAULT_MATCHER,
-    choices=FeatureMatcher.MATCHER_CHOICES,
-    type=str, dest='matcher_type'
-)
-parser.add_argument(
-    '--range_width', action='store',
-    default=FeatureMatcher.DEFAULT_RANGE_WIDTH,
-    help="uses range_width to limit number of images to match with.",
-    type=int, dest='range_width'
-)
-parser.add_argument(
-    '--try_use_gpu', action='store', default=False,
-    help="Try to use CUDA. The default value is no. "
-         "All default values are for CPU mode.",
-    type=bool, dest='try_use_gpu'
-)
-parser.add_argument(
-    '--match_conf', action='store',
-    help="Confidence for feature matching step. "
-         "The default is 0.3 for ORB and 0.65 for other feature types.",
-    type=float, dest='match_conf'
-)
-parser.add_argument(
-    '--confidence_threshold', action='store',
-    default=Subsetter.DEFAULT_CONFIDENCE_THRESHOLD,
-    help="Threshold for two images are from the same panorama confidence. "
-         "The default is '%s'." % Subsetter.DEFAULT_CONFIDENCE_THRESHOLD,
-    type=float, dest='confidence_threshold'
-)
-parser.add_argument(
-    '--matches_graph_dot_file', action='store',
-    default=Subsetter.DEFAULT_MATCHES_GRAPH_DOT_FILE,
-    help="Save matches graph represented in DOT language to <file_name> file.",
-    type=str, dest='matches_graph_dot_file'
-)
-parser.add_argument(
-    '--estimator', action='store',
-    default=CameraEstimator.DEFAULT_CAMERA_ESTIMATOR,
-    help="Type of estimator used for transformation estimation. "
-         "The default is '%s'." % CameraEstimator.DEFAULT_CAMERA_ESTIMATOR,
-    choices=CameraEstimator.CAMERA_ESTIMATOR_CHOICES.keys(),
-    type=str, dest='estimator'
-)
-parser.add_argument(
-    '--adjuster', action='store',
-    default=CameraAdjuster.DEFAULT_CAMERA_ADJUSTER,
-    help="Bundle adjustment cost function. "
-         "The default is '%s'." % CameraAdjuster.DEFAULT_CAMERA_ADJUSTER,
-    choices=CameraAdjuster.CAMERA_ADJUSTER_CHOICES.keys(),
-    type=str, dest='adjuster'
-)
-parser.add_argument(
-    '--refinement_mask', action='store',
-    default=CameraAdjuster.DEFAULT_REFINEMENT_MASK,
-    help="Set refinement mask for bundle adjustment. It looks like 'x_xxx', "
-         "where 'x' means refine respective parameter and '_' means don't "
-         "refine, and has the following format:<fx><skew><ppx><aspect><ppy>. "
-         "The default mask is '%s'. "
-         "If bundle adjustment doesn't support estimation of selected "
-         "parameter then the respective flag is ignored."
-         "" % CameraAdjuster.DEFAULT_REFINEMENT_MASK,
-    type=str, dest='refinement_mask'
-)
-parser.add_argument(
-    '--wave_correct_kind', action='store',
-    default=WaveCorrector.DEFAULT_WAVE_CORRECTION,
-    help="Perform wave effect correction. "
-         "The default is '%s'" % WaveCorrector.DEFAULT_WAVE_CORRECTION,
-    choices=WaveCorrector.WAVE_CORRECT_CHOICES.keys(),
-    type=str, dest='wave_correct_kind'
-)
-parser.add_argument(
-    '--warper_type', action='store', default=Warper.DEFAULT_WARP_TYPE,
-    help="Warp surface type. The default is '%s'." % Warper.DEFAULT_WARP_TYPE,
-    choices=Warper.WARP_TYPE_CHOICES,
-    type=str, dest='warper_type'
-)
-parser.add_argument(
-    '--low_megapix', action='store', default=ImageHandler.DEFAULT_LOW_MEGAPIX,
-    help="Resolution for seam estimation and exposure estimation step. "
-    "The default is %s Mpx." % ImageHandler.DEFAULT_LOW_MEGAPIX,
-    type=float, dest='low_megapix'
-)
-parser.add_argument(
-    '--crop', action='store', default=Cropper.DEFAULT_CROP,
-    help="Crop black borders around images caused by warping using the "
-    "largest interior rectangle. "
-    "Default is '%s'." % Cropper.DEFAULT_CROP,
-    type=bool, dest='crop'
-)
-parser.add_argument(
-    '--compensator', action='store',
-    default=ExposureErrorCompensator.DEFAULT_COMPENSATOR,
-    help="Exposure compensation method. "
-         "The default is '%s'." % ExposureErrorCompensator.DEFAULT_COMPENSATOR,
-    choices=ExposureErrorCompensator.COMPENSATOR_CHOICES.keys(),
-    type=str, dest='compensator'
-)
-parser.add_argument(
-    '--nr_feeds', action='store',
-    default=ExposureErrorCompensator.DEFAULT_NR_FEEDS,
-    help="Number of exposure compensation feed.",
-    type=np.int32, dest='nr_feeds'
-)
-parser.add_argument(
-    '--block_size', action='store',
-    default=ExposureErrorCompensator.DEFAULT_BLOCK_SIZE,
-    help="BLock size in pixels used by the exposure compensator. "
-         "The default is '%s'." % ExposureErrorCompensator.DEFAULT_BLOCK_SIZE,
-    type=np.int32, dest='block_size'
-)
-parser.add_argument(
-    '--finder', action='store', default=SeamFinder.DEFAULT_SEAM_FINDER,
-    help="Seam estimation method. "
-         "The default is '%s'." % SeamFinder.DEFAULT_SEAM_FINDER,
-    choices=SeamFinder.SEAM_FINDER_CHOICES.keys(),
-    type=str, dest='finder'
-)
-parser.add_argument(
-    '--final_megapix', action='store',
-    default=ImageHandler.DEFAULT_FINAL_MEGAPIX,
-    help="Resolution for compositing step. Use -1 for original resolution. "
-         "The default is %s" % ImageHandler.DEFAULT_FINAL_MEGAPIX,
-    type=float, dest='final_megapix'
-)
-parser.add_argument(
-    '--blender_type', action='store', default=Blender.DEFAULT_BLENDER,
-    help="Blending method. The default is '%s'." % Blender.DEFAULT_BLENDER,
-    choices=Blender.BLENDER_CHOICES,
-    type=str, dest='blender_type'
-)
-parser.add_argument(
-    '--blend_strength', action='store', default=Blender.DEFAULT_BLEND_STRENGTH,
-    help="Blending strength from [0,100] range. "
-         "The default is '%s'." % Blender.DEFAULT_BLEND_STRENGTH,
-    type=np.int32, dest='blend_strength'
-)
-parser.add_argument(
-    '--timelapse', action='store', default=Timelapser.DEFAULT_TIMELAPSE,
-    help="Output warped images separately as frames of a time lapse movie, "
-         "with 'fixed_' prepended to input file names. "
-         "The default is '%s'." % Timelapser.DEFAULT_TIMELAPSE,
-    choices=Timelapser.TIMELAPSE_CHOICES,
-    type=str, dest='timelapse'
-)
-parser.add_argument(
-    '--output', action='store', default='result.jpg',
-    help="The default is 'result.jpg'",
-    type=str, dest='output'
-)
-
-__doc__ += '\n' + parser.format_help()
-
-if __name__ == '__main__':
-    print(__doc__)
-    args = parser.parse_args()
-    args_dict = vars(args)
-
-    # Extract In- and Output
-    img_names = args_dict.pop("img_names")
-    img_names = [cv.samples.findFile(img_name) for img_name in img_names]
-    output = args_dict.pop("output")
-
-    stitcher = Stitcher(**args_dict)
-    panorama = stitcher.stitch(img_names)
-
-    cv.imwrite(output, panorama)
-
-    zoom_x = 600.0 / panorama.shape[1]
-    preview = cv.resize(panorama, dsize=None, fx=zoom_x, fy=zoom_x)
-
-    cv.imshow(output, preview)
-    cv.waitKey()
-    cv.destroyAllWindows()

From 12300750113b7bfd82cf5d39be8b6a3fd08f0fca Mon Sep 17 00:00:00 2001
From: Victor <kataev.victor.1234@gmail.com>
Date: Mon, 30 May 2022 21:03:08 +0300
Subject: [PATCH 064/178] Merge pull request #22045 from
 victor1234:calib3d-fisheye-model-reference

* Add fisheye camera model summary description with reference

* Fix layout

* Move reference to bibtex
---
 doc/opencv.bib                              | 10 ++++++++++
 modules/calib3d/include/opencv2/calib3d.hpp |  5 ++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/doc/opencv.bib b/doc/opencv.bib
index 800f57cafa..13fe71a1fb 100644
--- a/doc/opencv.bib
+++ b/doc/opencv.bib
@@ -1292,3 +1292,13 @@
   publisher={IEEE},
   doi = {10.1109/TPAMI.2021.3055337}
 }
+@article{Kannala2006,
+  author = {Kannala, Juho and Brandt, Sami},
+  year = {2006},
+  month = {09},
+  pages = {1335-40},
+  title = {A Generic Camera Model and Calibration Method for Conventional, Wide-Angle, and Fish-Eye Lenses},
+  volume = {28},
+  journal = {IEEE transactions on pattern analysis and machine intelligence},
+  doi = {10.1109/TPAMI.2006.153}
+}
diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp
index d6b23d9e3e..63eded027f 100644
--- a/modules/calib3d/include/opencv2/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d.hpp
@@ -429,6 +429,9 @@ R & t \\
     \f[u = f_x (x' + \alpha y') + c_x \\
     v = f_y y' + c_y\f]
 
+    Summary:
+    Generic camera model @cite Kannala2006 with perspective projection and without distortion correction
+
     @defgroup calib3d_c C API
 
   @}
@@ -2921,7 +2924,7 @@ namespace fisheye
     CV_EXPORTS_W void estimateNewCameraMatrixForUndistortRectify(InputArray K, InputArray D, const Size &image_size, InputArray R,
         OutputArray P, double balance = 0.0, const Size& new_size = Size(), double fov_scale = 1.0);
 
-    /** @brief Performs camera calibaration
+    /** @brief Performs camera calibration
 
     @param objectPoints vector of vectors of calibration pattern points in the calibration pattern
     coordinate space.

From 9cfae823a7e518ca73a6fb303114ab7050ab0802 Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <andrey.senyaev@gmail.com>
Date: Tue, 31 May 2022 01:58:50 +0300
Subject: [PATCH 065/178] Fix Xcode version matching

---
 platforms/ios/build_framework.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/platforms/ios/build_framework.py b/platforms/ios/build_framework.py
index bc765872a0..4895493a54 100755
--- a/platforms/ios/build_framework.py
+++ b/platforms/ios/build_framework.py
@@ -41,7 +41,7 @@ def execute(cmd, cwd = None):
         raise Exception("Child returned:", retcode)
 
 def getXCodeMajor():
-    ret = check_output(["xcodebuild", "-version"])
+    ret = check_output(["xcodebuild", "-version"]).decode("utf-8")
     m = re.match(r'Xcode\s+(\d+)\..*', ret, flags=re.IGNORECASE)
     if m:
         return int(m.group(1))

From 1a24e316d587aea18a712748fef60f75a3703ff4 Mon Sep 17 00:00:00 2001
From: Kumataro <Kumataro@users.noreply.github.com>
Date: Tue, 31 May 2022 19:58:17 +0900
Subject: [PATCH 066/178] Merge pull request #22030 from Kumataro:3.4-fix22029

* objdetect: qrcode_encoder: fix to missing timing pattern

* objdetect: qrcode_encoder: Add SCOPED_TRACE() and replace CV_Assert() to ASSERT_EQ().

- Add SCOPED_TRACE() for version loop.

- Replace CV_Assert() to ASSERT_EQ().

- Rename expect_msg to msg.
---
 modules/objdetect/src/qrcode_encoder.cpp      |  2 +-
 modules/objdetect/test/test_qrcode_encode.cpp | 78 +++++++++++++++++++
 2 files changed, 79 insertions(+), 1 deletion(-)

diff --git a/modules/objdetect/src/qrcode_encoder.cpp b/modules/objdetect/src/qrcode_encoder.cpp
index 2b363b607d..1016a17936 100644
--- a/modules/objdetect/src/qrcode_encoder.cpp
+++ b/modules/objdetect/src/qrcode_encoder.cpp
@@ -975,7 +975,7 @@ void QRCodeEncoderImpl::writeReservedArea()
     original.at<uint8_t>(x, y) = INVALID_REGION_VALUE;
     if (version_level >= 7)
     {
-        for (int i = 0; i <= 6; i++)
+        for (int i = 0; i <= 5; i++)
         {
             for (int j = version_size - 11; j <= version_size - 8; j++)
             {
diff --git a/modules/objdetect/test/test_qrcode_encode.cpp b/modules/objdetect/test/test_qrcode_encode.cpp
index 92ab73cff1..351e6abc83 100644
--- a/modules/objdetect/test/test_qrcode_encode.cpp
+++ b/modules/objdetect/test/test_qrcode_encode.cpp
@@ -430,4 +430,82 @@ TEST(Objdetect_QRCode_Encode_Decode_Structured_Append, DISABLED_regression)
 
 #endif // UPDATE_QRCODE_TEST_DATA
 
+TEST(Objdetect_QRCode_Encode_Decode, regression_issue22029)
+{
+    const cv::String msg = "OpenCV";
+    const int min_version = 1;
+    const int max_version = 40;
+
+    for ( int v = min_version ; v <= max_version ; v++ )
+    {
+        SCOPED_TRACE(cv::format("version=%d",v));
+
+        Mat qrimg;
+        QRCodeEncoder::Params params;
+        params.version = v;
+        Ptr<QRCodeEncoder> qrcode_enc = cv::QRCodeEncoder::create(params);
+        qrcode_enc->encode(msg, qrimg);
+
+        const int white_margin = 2;
+        const int finder_width = 7;
+
+        const int timing_pos = white_margin + 6;
+        int i;
+
+        // Horizontal Check
+        // (1) White margin(Left)
+        for(i = 0; i < white_margin ; i++ )
+        {
+            ASSERT_EQ((uint8_t)255, qrimg.at<uint8_t>(i, timing_pos)) << "i=" << i;
+        }
+        // (2) Finder pattern(Left)
+        for(     ; i < white_margin + finder_width ; i++ )
+        {
+            ASSERT_EQ((uint8_t)0, qrimg.at<uint8_t>(i, timing_pos)) << "i=" << i;
+        }
+        // (3) Timing pattern
+        for(     ; i < qrimg.rows - finder_width - white_margin; i++ )
+        {
+            ASSERT_EQ((uint8_t)(i % 2 == 0)?0:255, qrimg.at<uint8_t>(i, timing_pos)) << "i=" << i;
+        }
+        // (4) Finder pattern(Right)
+        for(     ; i < qrimg.rows - white_margin; i++ )
+        {
+            ASSERT_EQ((uint8_t)0, qrimg.at<uint8_t>(i, timing_pos)) << "i=" << i;
+        }
+        // (5) White margin(Right)
+        for(     ; i < qrimg.rows ; i++ )
+        {
+            ASSERT_EQ((uint8_t)255, qrimg.at<uint8_t>(i, timing_pos)) << "i=" << i;
+        }
+
+        // Vertical Check
+        // (1) White margin(Top)
+        for(i = 0; i < white_margin ; i++ )
+        {
+            ASSERT_EQ((uint8_t)255, qrimg.at<uint8_t>(timing_pos, i)) << "i=" << i;
+        }
+        // (2) Finder pattern(Top)
+        for(     ; i < white_margin + finder_width ; i++ )
+        {
+            ASSERT_EQ((uint8_t)0, qrimg.at<uint8_t>(timing_pos, i)) << "i=" << i;
+        }
+        // (3) Timing pattern
+        for(     ; i < qrimg.rows - finder_width - white_margin; i++ )
+        {
+            ASSERT_EQ((uint8_t)(i % 2 == 0)?0:255, qrimg.at<uint8_t>(timing_pos, i)) << "i=" << i;
+        }
+        // (4) Finder pattern(Bottom)
+        for(     ; i < qrimg.rows - white_margin; i++ )
+        {
+            ASSERT_EQ((uint8_t)0, qrimg.at<uint8_t>(timing_pos, i)) << "i=" << i;
+        }
+        // (5) White margin(Bottom)
+        for(     ; i < qrimg.rows ; i++ )
+        {
+            ASSERT_EQ((uint8_t)255, qrimg.at<uint8_t>(timing_pos, i)) << "i=" << i;
+        }
+    }
+}
+
 }} // namespace

From 38b6c44b4c78077e5a96b1f159db26693197d649 Mon Sep 17 00:00:00 2001
From: Ed J <mohawk2@users.noreply.github.com>
Date: Wed, 1 Jun 2022 00:51:55 +0100
Subject: [PATCH 067/178] for then given -> for the given

---
 modules/calib3d/include/opencv2/calib3d.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp
index 63eded027f..da27f50869 100644
--- a/modules/calib3d/include/opencv2/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d.hpp
@@ -2117,7 +2117,7 @@ described below.
 therefore is only known up to scale, i.e. t is the direction of the translation vector and has unit
 length.
 @param mask Input/output mask for inliers in points1 and points2. If it is not empty, then it marks
-inliers in points1 and points2 for then given essential matrix E. Only these inliers will be used to
+inliers in points1 and points2 for the given essential matrix E. Only these inliers will be used to
 recover pose. In the output mask only inliers which pass the cheirality check.
 
 This function decomposes an essential matrix using @ref decomposeEssentialMat and then verifies
@@ -2168,7 +2168,7 @@ length.
 are feature points from cameras with same focal length and principal point.
 @param pp principal point of the camera.
 @param mask Input/output mask for inliers in points1 and points2. If it is not empty, then it marks
-inliers in points1 and points2 for then given essential matrix E. Only these inliers will be used to
+inliers in points1 and points2 for the given essential matrix E. Only these inliers will be used to
 recover pose. In the output mask only inliers which pass the cheirality check.
 
 This function differs from the one above that it computes camera intrinsic matrix from focal length and
@@ -2204,7 +2204,7 @@ length.
 @param distanceThresh threshold distance which is used to filter out far away points (i.e. infinite
 points).
 @param mask Input/output mask for inliers in points1 and points2. If it is not empty, then it marks
-inliers in points1 and points2 for then given essential matrix E. Only these inliers will be used to
+inliers in points1 and points2 for the given essential matrix E. Only these inliers will be used to
 recover pose. In the output mask only inliers which pass the cheirality check.
 @param triangulatedPoints 3D points which were reconstructed by triangulation.
 

From f28e191d709a7483859d21084ab5708348e9da1c Mon Sep 17 00:00:00 2001
From: Ed J <mohawk2@users.noreply.github.com>
Date: Wed, 1 Jun 2022 00:53:33 +0100
Subject: [PATCH 068/178] cheirality -> chirality

---
 modules/calib3d/include/opencv2/calib3d.hpp | 12 ++++++------
 modules/calib3d/src/five-point.cpp          |  2 +-
 modules/calib3d/src/sqpnp.hpp               |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp
index da27f50869..c2d23e92f3 100644
--- a/modules/calib3d/include/opencv2/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d.hpp
@@ -2099,7 +2099,7 @@ unit length.
 CV_EXPORTS_W void decomposeEssentialMat( InputArray E, OutputArray R1, OutputArray R2, OutputArray t );
 
 /** @brief Recovers the relative camera rotation and the translation from an estimated essential
-matrix and the corresponding points in two images, using cheirality check. Returns the number of
+matrix and the corresponding points in two images, using chirality check. Returns the number of
 inliers that pass the check.
 
 @param E The input essential matrix.
@@ -2118,10 +2118,10 @@ therefore is only known up to scale, i.e. t is the direction of the translation
 length.
 @param mask Input/output mask for inliers in points1 and points2. If it is not empty, then it marks
 inliers in points1 and points2 for the given essential matrix E. Only these inliers will be used to
-recover pose. In the output mask only inliers which pass the cheirality check.
+recover pose. In the output mask only inliers which pass the chirality check.
 
 This function decomposes an essential matrix using @ref decomposeEssentialMat and then verifies
-possible pose hypotheses by doing cheirality check. The cheirality check means that the
+possible pose hypotheses by doing chirality check. The chirality check means that the
 triangulated 3D points should have positive depth. Some details can be found in @cite Nister03.
 
 This function can be used to process the output E and mask from @ref findEssentialMat. In this
@@ -2169,7 +2169,7 @@ are feature points from cameras with same focal length and principal point.
 @param pp principal point of the camera.
 @param mask Input/output mask for inliers in points1 and points2. If it is not empty, then it marks
 inliers in points1 and points2 for the given essential matrix E. Only these inliers will be used to
-recover pose. In the output mask only inliers which pass the cheirality check.
+recover pose. In the output mask only inliers which pass the chirality check.
 
 This function differs from the one above that it computes camera intrinsic matrix from focal length and
 principal point:
@@ -2205,11 +2205,11 @@ length.
 points).
 @param mask Input/output mask for inliers in points1 and points2. If it is not empty, then it marks
 inliers in points1 and points2 for the given essential matrix E. Only these inliers will be used to
-recover pose. In the output mask only inliers which pass the cheirality check.
+recover pose. In the output mask only inliers which pass the chirality check.
 @param triangulatedPoints 3D points which were reconstructed by triangulation.
 
 This function differs from the one above that it outputs the triangulated 3D point that are used for
-the cheirality check.
+the chirality check.
  */
 CV_EXPORTS_W int recoverPose( InputArray E, InputArray points1, InputArray points2,
                             InputArray cameraMatrix, OutputArray R, OutputArray t, double distanceThresh, InputOutputArray mask = noArray(),
diff --git a/modules/calib3d/src/five-point.cpp b/modules/calib3d/src/five-point.cpp
index dbdd294c7e..f78fab0b97 100644
--- a/modules/calib3d/src/five-point.cpp
+++ b/modules/calib3d/src/five-point.cpp
@@ -520,7 +520,7 @@ int cv::recoverPose( InputArray E, InputArray _points1, InputArray _points2,
     P3(Range::all(), Range(0, 3)) = R1 * 1.0; P3.col(3) = -t * 1.0;
     P4(Range::all(), Range(0, 3)) = R2 * 1.0; P4.col(3) = -t * 1.0;
 
-    // Do the cheirality check.
+    // Do the chirality check.
     // Notice here a threshold dist is used to filter
     // out far away points (i.e. infinite points) since
     // their depth may vary between positive and negative.
diff --git a/modules/calib3d/src/sqpnp.hpp b/modules/calib3d/src/sqpnp.hpp
index 97c10e34e7..66c478f371 100644
--- a/modules/calib3d/src/sqpnp.hpp
+++ b/modules/calib3d/src/sqpnp.hpp
@@ -111,7 +111,7 @@ private:
     static double det3x3(const cv::Matx<double, 9, 1>& e);
 
     /*
-    * @brief                Tests the cheirality for a given solution.
+    * @brief                Tests the chirality for a given solution.
     * @param solution       The solution to evaluate.
     */
     inline bool positiveDepth(const SQPSolution& solution) const;

From 3577265508ee5a39720284034380e9164d624d2d Mon Sep 17 00:00:00 2001
From: Kumataro <Kumataro@users.noreply.github.com>
Date: Wed, 1 Jun 2022 21:45:53 +0900
Subject: [PATCH 069/178] highgui: Qt: waitKey() supports Cyrillic/Japanese
 keyboard layout

---
 modules/highgui/src/window_QT.cpp | 53 +++++++++++++++++++++++++------
 1 file changed, 44 insertions(+), 9 deletions(-)

diff --git a/modules/highgui/src/window_QT.cpp b/modules/highgui/src/window_QT.cpp
index e7ee43cf5e..555ba8ca2b 100644
--- a/modules/highgui/src/window_QT.cpp
+++ b/modules/highgui/src/window_QT.cpp
@@ -2194,23 +2194,58 @@ void CvWindow::displayPropertiesWin()
         global_control_panel->hide();
 }
 
+static bool isTranslatableKey(Qt::Key key)
+{
+    // https://github.com/opencv/opencv/issues/21899
+    // https://doc.qt.io/qt-5/qt.html#Key-enum
+    // https://doc.qt.io/qt-6/qt.html#Key-enum
+    // https://github.com/qt/qtbase/blob/dev/src/testlib/qasciikey.cpp
+
+    bool ret = false;
+
+    switch ( key )
+    {
+        // Special keys
+        case Qt::Key_Escape:
+        case Qt::Key_Tab:
+        case Qt::Key_Backtab:
+        case Qt::Key_Backspace:
+        case Qt::Key_Enter:
+        case Qt::Key_Return:
+            ret = true;
+            break;
+
+        // latin-1 keys.
+        default:
+        ret = (
+            ( ( Qt::Key_Space        <= key ) && ( key <= Qt::Key_AsciiTilde ) ) // 0x20--0x7e
+            ||
+            ( ( Qt::Key_nobreakspace <= key ) && ( key <= Qt::Key_ssharp     ) ) // 0x0a0--0x0de
+            ||
+            ( key == Qt::Key_division )                                          // 0x0f7
+            ||
+            ( key == Qt::Key_ydiaeresis )                                        // 0x0ff
+        );
+        break;
+    }
+
+    return ret;
+}
 
 //Need more test here !
 void CvWindow::keyPressEvent(QKeyEvent *evnt)
 {
-    //see http://doc.trolltech.com/4.6/qt.html#Key-enum
     int key = evnt->key();
+    const Qt::Key qtkey = static_cast<Qt::Key>(key);
 
-        Qt::Key qtkey = static_cast<Qt::Key>(key);
-        char asciiCode = QTest::keyToAscii(qtkey);
-        if (asciiCode != 0)
-            key = static_cast<int>(asciiCode);
-        else
-            key = evnt->nativeVirtualKey(); //same codes as returned by GTK-based backend
+    if ( isTranslatableKey( qtkey ) )
+        key = static_cast<int>( QTest::keyToAscii( qtkey ) );
+    else
+        key = evnt->nativeVirtualKey(); //same codes as returned by GTK-based backend
 
     //control plus (Z, +, -, up, down, left, right) are used for zoom/panning functions
-        if (evnt->modifiers() != Qt::ControlModifier)
-        {
+    if (evnt->modifiers() != Qt::ControlModifier)
+    {
         mutexKey.lock();
         last_key = key;
         mutexKey.unlock();

From 65fcf2267096996ac8f790a699bea1b20f477d1f Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Wed, 1 Jun 2022 19:02:56 +0000
Subject: [PATCH 070/178] imgproc: use singleton in color_hsv.simd.hpp

---
 modules/core/src/precomp.hpp           |  1 +
 modules/imgproc/src/color_hsv.simd.hpp | 66 +++++++++++++++++++-------
 modules/imgproc/src/precomp.hpp        | 10 +++-
 3 files changed, 59 insertions(+), 18 deletions(-)

diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp
index d30855ef1e..0efe4db5b1 100644
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@@ -372,6 +372,7 @@ extern CV_EXPORTS
 bool __termination;  // skip some cleanups, because process is terminating
                      // (for example, if ExitProcess() was already called)
 
+CV_EXPORTS
 cv::Mutex& getInitializationMutex();
 
 /// @brief Returns timestamp in nanoseconds since program launch
diff --git a/modules/imgproc/src/color_hsv.simd.hpp b/modules/imgproc/src/color_hsv.simd.hpp
index b1eb50d7a4..424a6d1494 100644
--- a/modules/imgproc/src/color_hsv.simd.hpp
+++ b/modules/imgproc/src/color_hsv.simd.hpp
@@ -39,36 +39,65 @@ struct RGB2HSV_b
     : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange)
     {
         CV_Assert( hrange == 180 || hrange == 256 );
+
+        const TablesSingleton& global_tables = TablesSingleton::getInstance();
+        hdiv_table_ = hrange == 180 ? global_tables.hdiv_table180 : global_tables.hdiv_table256;
+        sdiv_table_ = global_tables.sdiv_table;
     }
 
-    void operator()(const uchar* src, uchar* dst, int n) const
+    struct TablesSingleton
     {
-        CV_INSTRUMENT_REGION();
+        int sdiv_table[256];
+        int hdiv_table180[256];
+        int hdiv_table256[256];
 
-        int i, bidx = blueIdx, scn = srccn;
-        const int hsv_shift = 12;
-
-        static int sdiv_table[256];
-        static int hdiv_table180[256];
-        static int hdiv_table256[256];
-        static volatile bool initialized = false;
-
-        int hr = hrange;
-        const int* hdiv_table = hr == 180 ? hdiv_table180 : hdiv_table256;
-
-        if( !initialized )
+    protected:
+        TablesSingleton()
         {
+            const int hsv_shift = 12;
+
             sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0;
-            for( i = 1; i < 256; i++ )
+            for (int i = 1; i < 256; i++)
             {
                 sdiv_table[i] = saturate_cast<int>((255 << hsv_shift)/(1.*i));
                 hdiv_table180[i] = saturate_cast<int>((180 << hsv_shift)/(6.*i));
                 hdiv_table256[i] = saturate_cast<int>((256 << hsv_shift)/(6.*i));
             }
-            initialized = true;
         }
+    public:
+        static TablesSingleton& getInstance()
+        {
+#ifdef CV_CXX11
+            static TablesSingleton g_tables;
+            return g_tables;
+#else
+            static TablesSingleton* volatile g_tables = NULL;
+            if (!g_tables)
+            {
+                AutoLock lock(getInitializationMutex());
+                if (!g_tables)
+                {
+                    static TablesSingleton g_tablesInstance;
+                    g_tables = &g_tablesInstance;
+                }
+            }
+            return *g_tables;
+#endif
+        }
+    };
 
-        i = 0;
+    void operator()(const uchar* src, uchar* dst, int n) const
+    {
+        CV_INSTRUMENT_REGION();
+
+        int bidx = blueIdx, scn = srccn;
+        const int hsv_shift = 12;
+
+        int hr = hrange;
+        const int* hdiv_table/*[256]*/ = hdiv_table_;
+        const int* sdiv_table/*[256]*/ = sdiv_table_;
+
+        int i = 0;
 
 #if CV_SIMD
         const int vsize = v_uint8::nlanes;
@@ -231,6 +260,9 @@ struct RGB2HSV_b
     }
 
     int srccn, blueIdx, hrange;
+
+    const int* hdiv_table_/*[256]*/;
+    const int* sdiv_table_/*[256]*/;
 };
 
 
diff --git a/modules/imgproc/src/precomp.hpp b/modules/imgproc/src/precomp.hpp
index b300192e9c..d69da268d6 100644
--- a/modules/imgproc/src/precomp.hpp
+++ b/modules/imgproc/src/precomp.hpp
@@ -119,4 +119,12 @@ inline bool isStorageOrMat(void * arr)
     CV_Error( CV_StsBadArg, "Destination is not CvMemStorage* nor CvMat*" );
 }
 
-#endif /*__OPENCV_CV_INTERNAL_H_*/
+
+namespace cv {
+
+CV_EXPORTS
+cv::Mutex& getInitializationMutex();  // defined in core module
+
+}  // namespace cv
+
+#endif /*__OPENCV_PRECOMP_H__*/

From 53eda42da7b5cc87a1049609acd709dde596da39 Mon Sep 17 00:00:00 2001
From: Alexander Panov <alexander.panov@xperience.ai>
Date: Sat, 4 Jun 2022 17:33:08 +0300
Subject: [PATCH 071/178] Merge pull request #22025 from
 AleksandrPanov:fix_samplingForVersion_multiplyingFactor

Fix sampling for version multiplying factor

* reduce experimentalFrequencyElem and listFrequencyElem

* fix large resize

* fix tile in postIntermediate

* add getMinSideLen(), add corrected_index

* add test decode_regression_21929 author Kumataro, add test decode_regression_version_25
---
 .../objdetect/perf/perf_qrcode_pipeline.cpp   |  3 +-
 modules/objdetect/src/qrcode.cpp              | 76 +++++++++---------
 modules/objdetect/test/test_qrcode.cpp        | 77 ++++++++++++++++++-
 3 files changed, 116 insertions(+), 40 deletions(-)

diff --git a/modules/objdetect/perf/perf_qrcode_pipeline.cpp b/modules/objdetect/perf/perf_qrcode_pipeline.cpp
index 716eb2d779..9e7960d819 100644
--- a/modules/objdetect/perf/perf_qrcode_pipeline.cpp
+++ b/modules/objdetect/perf/perf_qrcode_pipeline.cpp
@@ -106,10 +106,11 @@ PERF_TEST_P_(Perf_Objdetect_QRCode_Multi, decodeMulti)
 INSTANTIATE_TEST_CASE_P(/*nothing*/, Perf_Objdetect_QRCode,
     ::testing::Values(
         "version_1_down.jpg", "version_1_left.jpg", "version_1_right.jpg", "version_1_up.jpg", "version_1_top.jpg",
-        "version_5_down.jpg", "version_5_left.jpg", "version_5_right.jpg", "version_5_up.jpg", "version_5_top.jpg",
+        "version_5_down.jpg", "version_5_left.jpg",/*version_5_right.jpg*/ "version_5_up.jpg", "version_5_top.jpg",
         "russian.jpg", "kanji.jpg", "link_github_ocv.jpg", "link_ocv.jpg", "link_wiki_cv.jpg"
     )
 );
+// version_5_right.jpg DISABLED after tile fix, PR #22025
 
 INSTANTIATE_TEST_CASE_P(/*nothing*/, Perf_Objdetect_QRCode_Multi,
     ::testing::Values(
diff --git a/modules/objdetect/src/qrcode.cpp b/modules/objdetect/src/qrcode.cpp
index 1479aabf08..d720f1b80b 100644
--- a/modules/objdetect/src/qrcode.cpp
+++ b/modules/objdetect/src/qrcode.cpp
@@ -1070,6 +1070,15 @@ protected:
     };
 };
 
+float static getMinSideLen(const vector<Point2f> &points) {
+    CV_Assert(points.size() == 4ull);
+    double res = norm(points[1]-points[0]);
+    for (size_t i = 1ull; i < points.size(); i++) {
+        res = min(res, norm(points[i]-points[(i+1ull) % points.size()]));
+    }
+    return static_cast<float>(res);
+}
+
 void QRDecode::init(const Mat &src, const vector<Point2f> &points)
 {
     CV_TRACE_FUNCTION();
@@ -1081,7 +1090,7 @@ void QRDecode::init(const Mat &src, const vector<Point2f> &points)
     original_points = bbox;
     version = 0;
     version_size = 0;
-    test_perspective_size = 251;
+    test_perspective_size = max(getMinSideLen(points)+1.f, 251.f);
     result_info = "";
 }
 
@@ -2096,7 +2105,7 @@ bool QRDecode::straightenQRCodeInParts()
     {
         return false;
     }
-    float perspective_curved_size = 251.0;
+    float perspective_curved_size = max(getMinSideLen(original_points)+1.f, 251.f);;
     const Size temporary_size(cvRound(perspective_curved_size), cvRound(perspective_curved_size));
 
     float dist = perspective_curved_size / (number_pnts_to_cut - 1);
@@ -2367,9 +2376,9 @@ bool QRDecode::versionDefinition()
 bool QRDecode::samplingForVersion()
 {
     CV_TRACE_FUNCTION();
-    const double multiplyingFactor = (version < 3)  ? 1 :
-                                     (version == 3) ? 1.5 :
-                                     version * (version + 1);
+    const double multiplyingFactor = (version < 3)  ? 1. :
+                                     (version == 3) ? 2. :
+                                     3.;
     const Size newFactorSize(
                   cvRound(no_border_intermediate.size().width  * multiplyingFactor),
                   cvRound(no_border_intermediate.size().height * multiplyingFactor));
@@ -2378,45 +2387,38 @@ bool QRDecode::samplingForVersion()
 
     const int delta_rows = cvRound((postIntermediate.rows * 1.0) / version_size);
     const int delta_cols = cvRound((postIntermediate.cols * 1.0) / version_size);
+    // number of elements in the tail
+    const int skipped_rows = postIntermediate.rows - delta_rows * version_size;
+    const int skipped_cols = postIntermediate.cols - delta_cols * version_size;
 
-    vector<double> listFrequencyElem;
-    for (int r = 0; r < postIntermediate.rows; r += delta_rows)
-    {
-        for (int c = 0; c < postIntermediate.cols; c += delta_cols)
-        {
+    vector<int> deltas_rows(version_size, delta_rows);
+    vector<int> deltas_cols(version_size, delta_cols);
+
+    for (int i = 0; i < abs(skipped_rows); i++) {
+        // fix deltas_rows at each skip_step
+        const double skip_step = static_cast<double>(version_size)/abs(skipped_rows);
+        const int corrected_index = static_cast<int>(i*skip_step + skip_step/2);
+        deltas_rows[corrected_index] += skipped_rows > 0 ? 1 : -1;
+    }
+    for (int i = 0; i < abs(skipped_cols); i++) {
+        // fix deltas_cols at each skip_step
+        const double skip_step = static_cast<double>(version_size)/abs(skipped_cols);
+        const int corrected_index = static_cast<int>(i*skip_step + skip_step/2);
+        deltas_cols[corrected_index] += skipped_cols > 0 ? 1 : -1;
+    }
+
+    const double totalFrequencyElem = countNonZero(postIntermediate) / static_cast<double>(postIntermediate.total());
+    straight = Mat(Size(version_size, version_size), CV_8UC1, Scalar(0));
+
+    for (int r = 0, i = 0; i < version_size; r += deltas_rows[i], i++) {
+        for (int c = 0, j = 0; j < version_size; c += deltas_cols[j], j++) {
             Mat tile = postIntermediate(
                            Range(r, min(r + delta_rows, postIntermediate.rows)),
                            Range(c, min(c + delta_cols, postIntermediate.cols)));
             const double frequencyElem = (countNonZero(tile) * 1.0) / tile.total();
-            listFrequencyElem.push_back(frequencyElem);
+            straight.ptr<uint8_t>(i)[j] = (frequencyElem < totalFrequencyElem) ? 0 : 255;
         }
     }
-
-    double dispersionEFE = std::numeric_limits<double>::max();
-    double experimentalFrequencyElem = 0;
-    for (double expVal = 0; expVal < 1; expVal+=0.001)
-    {
-        double testDispersionEFE = 0.0;
-        for (size_t i = 0; i < listFrequencyElem.size(); i++)
-        {
-            testDispersionEFE += (listFrequencyElem[i] - expVal) *
-                                 (listFrequencyElem[i] - expVal);
-        }
-        testDispersionEFE /= (listFrequencyElem.size() - 1);
-        if (dispersionEFE > testDispersionEFE)
-        {
-            dispersionEFE = testDispersionEFE;
-            experimentalFrequencyElem = expVal;
-        }
-    }
-
-    straight = Mat(Size(version_size, version_size), CV_8UC1, Scalar(0));
-    for (int r = 0; r < version_size * version_size; r++)
-    {
-        int i   = r / straight.cols;
-        int j   = r % straight.cols;
-        straight.ptr<uint8_t>(i)[j] = (listFrequencyElem[r] < experimentalFrequencyElem) ? 0 : 255;
-    }
     return true;
 }
 
diff --git a/modules/objdetect/test/test_qrcode.cpp b/modules/objdetect/test/test_qrcode.cpp
index 19a9f76260..b5680387cb 100644
--- a/modules/objdetect/test/test_qrcode.cpp
+++ b/modules/objdetect/test/test_qrcode.cpp
@@ -11,8 +11,9 @@ std::string qrcode_images_name[] = {
   "version_2_down.jpg", "version_2_left.jpg", "version_2_right.jpg", "version_2_up.jpg", "version_2_top.jpg",
   "version_3_down.jpg", "version_3_left.jpg", "version_3_right.jpg", "version_3_up.jpg", "version_3_top.jpg",
   "version_4_down.jpg", "version_4_left.jpg", "version_4_right.jpg", "version_4_up.jpg", "version_4_top.jpg",
-  "version_5_down.jpg", "version_5_left.jpg", "version_5_right.jpg", "version_5_up.jpg", "version_5_top.jpg",
+  "version_5_down.jpg", "version_5_left.jpg"/*"version_5_right.jpg"*/,
   "russian.jpg", "kanji.jpg", "link_github_ocv.jpg", "link_ocv.jpg", "link_wiki_cv.jpg"
+// version_5_right.jpg DISABLED after tile fix, PR #22025
 };
 
 std::string qrcode_images_close[] = {
@@ -22,8 +23,9 @@ std::string qrcode_images_monitor[] = {
   "monitor_1.png", "monitor_2.png", "monitor_3.png", "monitor_4.png", "monitor_5.png"
 };
 std::string qrcode_images_curved[] = {
-  "curved_1.jpg", "curved_2.jpg", "curved_3.jpg", "curved_4.jpg", "curved_5.jpg", "curved_6.jpg", "curved_7.jpg", "curved_8.jpg"
+  "curved_1.jpg", "curved_2.jpg", "curved_3.jpg", /*"curved_4.jpg",*/ "curved_5.jpg", /*"curved_6.jpg",*/ "curved_7.jpg", "curved_8.jpg"
 };
+// curved_4.jpg, curved_6.jpg DISABLED after tile fix, PR #22025
 std::string qrcode_images_multiple[] = {
   "2_qrcodes.png", "3_close_qrcodes.png", "3_qrcodes.png", "4_qrcodes.png",
   "5_qrcodes.png", "6_qrcodes.png", "7_qrcodes.png", "8_close_qrcodes.png"
@@ -683,7 +685,78 @@ TEST(Objdetect_QRCode_basic, not_found_qrcode)
 #endif
 }
 
+TEST(Objdetect_QRCode_detect, detect_regression_21287)
+{
+    const std::string name_current_image = "issue_21287.png";
+    const std::string root = "qrcode/";
+
+    std::string image_path = findDataFile(root + name_current_image);
+    Mat src = imread(image_path);
+    ASSERT_FALSE(src.empty()) << "Can't read image: " << image_path;
+
+    QRCodeDetector qrcode;
+    std::vector<Point> corners;
+    Mat straight_barcode;
+    cv::String decoded_info;
+    EXPECT_TRUE(qrcode.detect(src, corners));
+    EXPECT_TRUE(!corners.empty());
+#ifdef HAVE_QUIRC
+    EXPECT_NO_THROW(qrcode.decode(src, corners, straight_barcode));
+#endif
+}
+
+// @author Kumataro, https://github.com/Kumataro
+TEST(Objdetect_QRCode_decode, decode_regression_21929)
+{
+    const cv::String expect_msg = "OpenCV";
+    Mat qrImg;
+    QRCodeEncoder::Params params;
+    params.version = 8; // 49x49
+    Ptr<QRCodeEncoder> qrcode_enc = cv::QRCodeEncoder::create(params);;
+    qrcode_enc->encode(expect_msg, qrImg);
+
+    Mat src;
+    cv::resize(qrImg, src, Size(200,200), 1.0, 1.0, INTER_NEAREST);
+
+    QRCodeDetector qrcode;
+    std::vector<Point> corners;
+    Mat straight_barcode;
+
+    EXPECT_TRUE(qrcode.detect(src, corners));
+    EXPECT_TRUE(!corners.empty());
+#ifdef HAVE_QUIRC
+    cv::String decoded_msg;
+    EXPECT_NO_THROW(decoded_msg = qrcode.decode(src, corners, straight_barcode));
+    ASSERT_FALSE(straight_barcode.empty()) << "Can't decode qrimage.";
+    EXPECT_EQ(expect_msg, decoded_msg);
+#endif
+}
+
+TEST(Objdetect_QRCode_decode, decode_regression_version_25)
+{
+    const cv::String expect_msg = "OpenCV";
+    Mat qrImg;
+    QRCodeEncoder::Params params;
+    params.version = 25; // 117x117
+    Ptr<QRCodeEncoder> qrcode_enc = cv::QRCodeEncoder::create(params);;
+    qrcode_enc->encode(expect_msg, qrImg);
 
+    Mat src;
+    cv::resize(qrImg, src, qrImg.size()*3, 1.0, 1.0, INTER_NEAREST);
+
+    QRCodeDetector qrcode;
+    std::vector<Point> corners;
+    Mat straight_barcode;
+
+    EXPECT_TRUE(qrcode.detect(src, corners));
+    EXPECT_TRUE(!corners.empty());
+#ifdef HAVE_QUIRC
+    cv::String decoded_msg;
+    EXPECT_NO_THROW(decoded_msg = qrcode.decode(src, corners, straight_barcode));
+    ASSERT_FALSE(straight_barcode.empty()) << "Can't decode qrimage.";
+    EXPECT_EQ(expect_msg, decoded_msg);
+#endif
+}
 
 #endif // UPDATE_QRCODE_TEST_DATA
 

From a3d088231745d147681d90579ef6e5380d7e5cb5 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Sun, 5 Jun 2022 07:52:44 +0000
Subject: [PATCH 072/178] release: OpenCV 3.4.18

---
 modules/core/include/opencv2/core/version.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/core/include/opencv2/core/version.hpp b/modules/core/include/opencv2/core/version.hpp
index 4c08d4e3cb..4aa50024b0 100644
--- a/modules/core/include/opencv2/core/version.hpp
+++ b/modules/core/include/opencv2/core/version.hpp
@@ -8,7 +8,7 @@
 #define CV_VERSION_MAJOR    3
 #define CV_VERSION_MINOR    4
 #define CV_VERSION_REVISION 18
-#define CV_VERSION_STATUS   "-pre"
+#define CV_VERSION_STATUS   ""
 
 #define CVAUX_STR_EXP(__A)  #__A
 #define CVAUX_STR(__A)      CVAUX_STR_EXP(__A)

From adf89bbb33c6429f6e6d47433ee6af3f93fa35f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20L=C3=B3pez=20Rubio?= <j.lopez.r@gmail.com>
Date: Sun, 5 Jun 2022 17:30:16 +0200
Subject: [PATCH 073/178] Merge pull request #22047 from
 jlopezr:trackbar-ordered-in-mac

* In Mac highgui now shows trackbars in creation order

* In Mac highgui trackbars show current value

* Remove trailing spaces in objectivec code
---
 modules/highgui/src/window_cocoa.mm | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/modules/highgui/src/window_cocoa.mm b/modules/highgui/src/window_cocoa.mm
index 97a6831f1a..e9cf5a3c1b 100644
--- a/modules/highgui/src/window_cocoa.mm
+++ b/modules/highgui/src/window_cocoa.mm
@@ -92,6 +92,7 @@ static bool wasInitialized = false;
 @interface CVSlider : NSView {
     NSSlider *slider;
     NSTextField *name;
+    NSString *initialName;
     int *value;
     void *userData;
     CvTrackbarCallback callback;
@@ -99,6 +100,7 @@ static bool wasInitialized = false;
 }
 @property(retain) NSSlider *slider;
 @property(retain) NSTextField *name;
+@property(retain) NSString *initialName;
 @property(assign) int *value;
 @property(assign) void *userData;
 @property(assign) CvTrackbarCallback callback;
@@ -107,6 +109,7 @@ static bool wasInitialized = false;
 
 @interface CVWindow : NSWindow {
     NSMutableDictionary *sliders;
+    NSMutableArray *slidersKeys;
     CvMouseCallback mouseCallback;
     void *mouseParam;
     BOOL autosize;
@@ -121,6 +124,7 @@ static bool wasInitialized = false;
 @property(assign) int x0;
 @property(assign) int y0;
 @property(retain) NSMutableDictionary *sliders;
+@property(retain) NSMutableArray *slidersKeys;
 @property(readwrite) int status;
 - (CVView *)contentView;
 - (void)cvSendMouseEvent:(NSEvent *)event type:(int)type flags:(int)flags;
@@ -842,6 +846,7 @@ static NSSize constrainAspectRatio(NSSize base, NSSize constraint) {
 @synthesize x0;
 @synthesize y0;
 @synthesize sliders;
+@synthesize slidersKeys;
 @synthesize status;
 
 - (void)cvSendMouseEvent:(NSEvent *)event type:(int)type flags:(int)flags {
@@ -933,6 +938,9 @@ static NSSize constrainAspectRatio(NSSize base, NSSize constraint) {
     if(sliders == nil)
         sliders = [[NSMutableDictionary alloc] init];
 
+    if(slidersKeys == nil)
+        slidersKeys = [[NSMutableArray alloc] init];
+
     NSString *cvname = [NSString stringWithFormat:@"%s", name];
 
     // Avoid overwriting slider
@@ -942,18 +950,23 @@ static NSSize constrainAspectRatio(NSSize base, NSSize constraint) {
     // Create slider
     CVSlider *slider = [[CVSlider alloc] init];
     [[slider name] setStringValue:cvname];
+    slider.initialName = [NSString stringWithFormat:@"%s", name];
     [[slider slider] setMaxValue:max];
     [[slider slider] setMinValue:0];
     if(value)
     {
         [[slider slider] setIntValue:*value];
         [slider setValue:value];
+        NSString *temp = [slider initialName];
+        NSString *text = [NSString stringWithFormat:@"%@ %d", temp, *value];
+        [[slider name] setStringValue: text];
     }
     if(callback)
         [slider setCallback:callback];
 
     // Save slider
     [sliders setValue:slider forKey:cvname];
+    [slidersKeys addObject:cvname];
     [[self contentView] addSubview:slider];
 
 
@@ -1092,7 +1105,7 @@ static NSSize constrainAspectRatio(NSSize base, NSSize constraint) {
 
     CVWindow *cvwindow = (CVWindow *)[self window];
     if ([cvwindow respondsToSelector:@selector(sliders)]) {
-        for(NSString *key in [cvwindow sliders]) {
+        for(NSString *key in [cvwindow slidersKeys]) {
             CVSlider *slider = [[cvwindow sliders] valueForKey:key];
             NSRect r = [slider frame];
             r.origin.y = height - r.size.height;
@@ -1144,6 +1157,7 @@ static NSSize constrainAspectRatio(NSSize base, NSSize constraint) {
 
 @synthesize slider;
 @synthesize name;
+@synthesize initialName;
 @synthesize value;
 @synthesize userData;
 @synthesize callback;
@@ -1186,6 +1200,9 @@ static NSSize constrainAspectRatio(NSSize base, NSSize constraint) {
 - (void)sliderChanged:(NSNotification *)notification {
     (void)notification;
     int pos = [slider intValue];
+    NSString *temp = [self initialName];
+    NSString *text = [NSString stringWithFormat:@"%@ %d", temp, *value];
+    [name setStringValue: text];
     if(value)
         *value = pos;
     if(callback)

From b0dc474160e389b9c9045da5db49d03ae17c6a6b Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Sun, 5 Jun 2022 15:32:44 +0000
Subject: [PATCH 074/178] release: OpenCV 4.6.0

---
 modules/core/include/opencv2/core/version.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/core/include/opencv2/core/version.hpp b/modules/core/include/opencv2/core/version.hpp
index 1832aa75b8..7b129b6dd3 100644
--- a/modules/core/include/opencv2/core/version.hpp
+++ b/modules/core/include/opencv2/core/version.hpp
@@ -8,7 +8,7 @@
 #define CV_VERSION_MAJOR    4
 #define CV_VERSION_MINOR    6
 #define CV_VERSION_REVISION 0
-#define CV_VERSION_STATUS   "-pre"
+#define CV_VERSION_STATUS   ""
 
 #define CVAUX_STR_EXP(__A)  #__A
 #define CVAUX_STR(__A)      CVAUX_STR_EXP(__A)

From 533bb035cfac6b32d8cb94009202df0c13b65e09 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov <alexander.smorkalov@xperience.ai>
Date: Wed, 8 Jun 2022 13:59:55 +0300
Subject: [PATCH 075/178] Save Frames option for interactive calibration tool

The option to save all frames that contribute to final calibration result.
Useful for dataset collection and further offline tuning.
---
 apps/interactive-calibration/calibCommon.hpp       |  4 ++++
 apps/interactive-calibration/calibController.cpp   | 14 ++++++++++++++
 apps/interactive-calibration/frameProcessor.cpp    |  9 +++++++++
 apps/interactive-calibration/frameProcessor.hpp    |  1 +
 apps/interactive-calibration/main.cpp              |  1 +
 .../parametersController.cpp                       |  1 +
 6 files changed, 30 insertions(+)

diff --git a/apps/interactive-calibration/calibCommon.hpp b/apps/interactive-calibration/calibCommon.hpp
index f422ff0531..617c17dbcf 100644
--- a/apps/interactive-calibration/calibCommon.hpp
+++ b/apps/interactive-calibration/calibCommon.hpp
@@ -45,6 +45,8 @@ namespace calib
         double totalAvgErr;
         cv::Size imageSize;
 
+        std::vector<cv::Mat> allFrames;
+
         std::vector<std::vector<cv::Point2f> > imagePoints;
         std::vector< std::vector<cv::Point3f> > objectPoints;
 
@@ -91,6 +93,7 @@ namespace calib
         cv::Size cameraResolution;
         int maxFramesNum;
         int minFramesNum;
+        bool saveFrames;
 
         captureParameters()
         {
@@ -100,6 +103,7 @@ namespace calib
             minFramesNum = 10;
             fps = 30;
             cameraResolution = cv::Size(IMAGE_MAX_WIDTH, IMAGE_MAX_HEIGHT);
+            saveFrames = false;
         }
     };
 
diff --git a/apps/interactive-calibration/calibController.cpp b/apps/interactive-calibration/calibController.cpp
index efe937bfd9..9efb9d054a 100644
--- a/apps/interactive-calibration/calibController.cpp
+++ b/apps/interactive-calibration/calibController.cpp
@@ -10,6 +10,7 @@
 
 #include <opencv2/calib3d.hpp>
 #include <opencv2/imgproc.hpp>
+#include <opencv2/imgcodecs.hpp>
 
 double calib::calibController::estimateCoverageQuality()
 {
@@ -212,6 +213,9 @@ void calib::calibDataController::filterFrames()
         }
         showOverlayMessage(cv::format("Frame %zu is worst", worstElemIndex + 1));
 
+        if(mCalibData->allFrames.size())
+            mCalibData->allFrames.erase(mCalibData->allFrames.begin() + worstElemIndex);
+
         if(mCalibData->imagePoints.size()) {
             mCalibData->imagePoints.erase(mCalibData->imagePoints.begin() + worstElemIndex);
             mCalibData->objectPoints.erase(mCalibData->objectPoints.begin() + worstElemIndex);
@@ -239,6 +243,11 @@ void calib::calibDataController::setParametersFileName(const std::string &name)
 
 void calib::calibDataController::deleteLastFrame()
 {
+    if(!mCalibData->allFrames.empty())
+    {
+        mCalibData->allFrames.pop_back();
+    }
+
     if( !mCalibData->imagePoints.empty()) {
         mCalibData->imagePoints.pop_back();
         mCalibData->objectPoints.pop_back();
@@ -269,6 +278,7 @@ void calib::calibDataController::rememberCurrentParameters()
 
 void calib::calibDataController::deleteAllData()
 {
+    mCalibData->allFrames.clear();
     mCalibData->imagePoints.clear();
     mCalibData->objectPoints.clear();
     mCalibData->allCharucoCorners.clear();
@@ -280,6 +290,10 @@ void calib::calibDataController::deleteAllData()
 
 bool calib::calibDataController::saveCurrentCameraParameters() const
 {
+
+    for(size_t i = 0; i < mCalibData->allFrames.size(); i++)
+        cv::imwrite(cv::format("calibration_%zu.png", i), mCalibData->allFrames[i]);
+
     bool success = false;
     if(mCalibData->cameraMatrix.total()) {
             cv::FileStorage parametersWriter(mParamsFileName, cv::FileStorage::WRITE);
diff --git a/apps/interactive-calibration/frameProcessor.cpp b/apps/interactive-calibration/frameProcessor.cpp
index 90904a42cd..2c2a1cd5aa 100644
--- a/apps/interactive-calibration/frameProcessor.cpp
+++ b/apps/interactive-calibration/frameProcessor.cpp
@@ -266,6 +266,7 @@ CalibProcessor::CalibProcessor(cv::Ptr<calibrationData> data, captureParameters
                                    static_cast<float>(mCalibData->imageSize.width * mCalibData->imageSize.width)) / 20.0;
     mSquareSize = capParams.squareSize;
     mTemplDist = capParams.templDst;
+    mSaveFrames = capParams.saveFrames;
 
     switch(mBoardType)
     {
@@ -291,10 +292,14 @@ CalibProcessor::CalibProcessor(cv::Ptr<calibrationData> data, captureParameters
 cv::Mat CalibProcessor::processFrame(const cv::Mat &frame)
 {
     cv::Mat frameCopy;
+    cv::Mat frameCopyToSave;
     frame.copyTo(frameCopy);
     bool isTemplateFound = false;
     mCurrentImagePoints.clear();
 
+    if(mSaveFrames)
+        frame.copyTo(frameCopyToSave);
+
     switch(mBoardType)
     {
     case Chessboard:
@@ -322,6 +327,10 @@ cv::Mat CalibProcessor::processFrame(const cv::Mat &frame)
                                                                                         mCalibData->allCharucoCorners.size()));
                 if(!showOverlayMessage(displayMessage))
                     showCaptureMessage(frame, displayMessage);
+
+                if(mSaveFrames)
+                    mCalibData->allFrames.push_back(frameCopyToSave);
+
                 mCapuredFrames++;
             }
             else {
diff --git a/apps/interactive-calibration/frameProcessor.hpp b/apps/interactive-calibration/frameProcessor.hpp
index 6fd788dc7f..88e87f7b98 100644
--- a/apps/interactive-calibration/frameProcessor.hpp
+++ b/apps/interactive-calibration/frameProcessor.hpp
@@ -50,6 +50,7 @@ protected:
     double mMaxTemplateOffset;
     float mSquareSize;
     float mTemplDist;
+    bool mSaveFrames;
 
     bool detectAndParseChessboard(const cv::Mat& frame);
     bool detectAndParseChAruco(const cv::Mat& frame);
diff --git a/apps/interactive-calibration/main.cpp b/apps/interactive-calibration/main.cpp
index 6468c88ce7..b5c3642bb6 100644
--- a/apps/interactive-calibration/main.cpp
+++ b/apps/interactive-calibration/main.cpp
@@ -40,6 +40,7 @@ const std::string keys  =
         "{vis      | grid    | Captured boards visualisation (grid, window)}"
         "{d        | 0.8     | Min delay between captures}"
         "{pf       | defaultConfig.xml| Advanced application parameters}"
+        "{save_frames | false   | Save frames that contribute to final calibration}"
         "{help     |         | Print help}";
 
 bool calib::showOverlayMessage(const std::string& message)
diff --git a/apps/interactive-calibration/parametersController.cpp b/apps/interactive-calibration/parametersController.cpp
index 3bcf5b86e9..5659b0e469 100644
--- a/apps/interactive-calibration/parametersController.cpp
+++ b/apps/interactive-calibration/parametersController.cpp
@@ -89,6 +89,7 @@ bool calib::parametersController::loadFromParser(cv::CommandLineParser &parser)
     mCapParams.captureDelay = parser.get<float>("d");
     mCapParams.squareSize = parser.get<float>("sz");
     mCapParams.templDst = parser.get<float>("dst");
+    mCapParams.saveFrames = parser.has("save_frames");
 
     if(!checkAssertion(mCapParams.squareSize > 0, "Distance between corners or circles must be positive"))
         return false;

From ed9524e125ed4f60063e581f336a7d5d67a8a01c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=B4taro=20Mukai?= <mukai.k1011k@outlook.jp>
Date: Sat, 11 Jun 2022 18:42:30 +0900
Subject: [PATCH 076/178] Missing import

---
 doc/js_tutorials/js_setup/js_nodejs/js_nodejs.markdown | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/js_tutorials/js_setup/js_nodejs/js_nodejs.markdown b/doc/js_tutorials/js_setup/js_nodejs/js_nodejs.markdown
index 17ced36b5a..535ada430a 100644
--- a/doc/js_tutorials/js_setup/js_nodejs/js_nodejs.markdown
+++ b/doc/js_tutorials/js_setup/js_nodejs/js_nodejs.markdown
@@ -232,7 +232,7 @@ The following is an adaptation of @ref tutorial_js_face_detection.
 @code{.js}
 const { Canvas, createCanvas, Image, ImageData, loadImage } = require('canvas');
 const { JSDOM } = require('jsdom');
-const { writeFileSync, readFileSync } = require('fs');
+const { writeFileSync, existsSync, mkdirSync } = require('fs');
 
 (async () => {
   await loadOpenCV();

From d09bd6f8622a770a8f1eff35a68b0aaa8b93ec32 Mon Sep 17 00:00:00 2001
From: Stefan Dragnev <stefan@scanbot.io>
Date: Fri, 10 Jun 2022 15:32:18 +0200
Subject: [PATCH 077/178] correct direction of displaymatrix rotation

---
 modules/videoio/src/cap_ffmpeg_impl.hpp | 2 +-
 modules/videoio/test/test_ffmpeg.cpp    | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/modules/videoio/src/cap_ffmpeg_impl.hpp b/modules/videoio/src/cap_ffmpeg_impl.hpp
index b7fa0b745c..98913ed839 100644
--- a/modules/videoio/src/cap_ffmpeg_impl.hpp
+++ b/modules/videoio/src/cap_ffmpeg_impl.hpp
@@ -1840,7 +1840,7 @@ void CvCapture_FFMPEG::get_rotation_angle()
     data = av_stream_get_side_data(video_st, AV_PKT_DATA_DISPLAYMATRIX, NULL);
     if (data)
     {
-        rotation_angle = cvRound(av_display_rotation_get((const int32_t*)data));
+        rotation_angle = -cvRound(av_display_rotation_get((const int32_t*)data));
         if (rotation_angle < 0)
             rotation_angle += 360;
     }
diff --git a/modules/videoio/test/test_ffmpeg.cpp b/modules/videoio/test/test_ffmpeg.cpp
index 40ef586fd6..115705760c 100644
--- a/modules/videoio/test/test_ffmpeg.cpp
+++ b/modules/videoio/test/test_ffmpeg.cpp
@@ -413,6 +413,11 @@ TEST(videoio, mp4_orientation_meta_auto)
     EXPECT_NO_THROW(cap.open(video_file, CAP_FFMPEG));
     ASSERT_TRUE(cap.isOpened()) << "Can't open the video: " << video_file << " with backend " << CAP_FFMPEG << std::endl;
 
+#ifndef _WIN32 // TODO: FFmpeg wrapper update
+    // related issue: https://github.com/opencv/opencv/issues/22088
+    EXPECT_EQ(90, cap.get(CAP_PROP_ORIENTATION_META));
+#endif
+
     cap.set(CAP_PROP_ORIENTATION_AUTO, true);
     if (cap.get(CAP_PROP_ORIENTATION_AUTO) == 0)
         throw SkipTestException("FFmpeg frame rotation metadata is not supported");

From 7a46d7efde74b145f63e40b47ea8bd4d5be22097 Mon Sep 17 00:00:00 2001
From: Vincent Rabaud <vrabaud@google.com>
Date: Wed, 8 Jun 2022 16:29:47 +0200
Subject: [PATCH 078/178] Fix compilation on some ARM architecture.

This condition is the same as the line above.
---
 modules/core/src/system.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index 9bade08177..adb957908d 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -598,13 +598,20 @@ struct HWFeatures
             close(cpufile);
         }
     #endif
-    #elif (defined __clang__ || defined __APPLE__)
+    #elif (defined __APPLE__)
     #if (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__))
         have[CV_CPU_NEON] = true;
     #endif
     #if (defined __ARM_FP  && (((__ARM_FP & 0x2) != 0) && defined __ARM_NEON__))
         have[CV_CPU_FP16] = true;
     #endif
+    #elif (defined __clang__)
+    #if (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__))
+        have[CV_CPU_NEON] = true;
+        #if (defined __ARM_FP  && ((__ARM_FP & 0x2) != 0))
+        have[CV_CPU_FP16] = true;
+        #endif
+    #endif
     #endif
     #if defined _ARM_ && (defined(_WIN32_WCE) && _WIN32_WCE >= 0x800)
         have[CV_CPU_NEON] = true;

From f4a6c3e7ea5b2f7dcddf9db71a1f39a0f1161fa3 Mon Sep 17 00:00:00 2001
From: fengyuentau <yuantao.feng@opencv.org.cn>
Date: Wed, 15 Jun 2022 11:02:07 +0800
Subject: [PATCH 079/178] add workflows for macOS for 4.x

---
 .github/workflows/PR-4.x.yaml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/PR-4.x.yaml b/.github/workflows/PR-4.x.yaml
index 980bf753de..dc44e501a6 100644
--- a/.github/workflows/PR-4.x.yaml
+++ b/.github/workflows/PR-4.x.yaml
@@ -15,5 +15,11 @@ jobs:
   W10:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-W10.yaml@main
 
+  macOS-ARM64:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-macOS-ARM64.yaml@main
+
+  macOS-X64:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-macOS-x86_64.yaml@main
+
   TIM-VX:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-timvx-backend-tests-4.x.yml@main
\ No newline at end of file

From 572812217b193faef0e28ccc7854c6beee52b55c Mon Sep 17 00:00:00 2001
From: fengyuentau <yuantao.feng@opencv.org.cn>
Date: Wed, 15 Jun 2022 18:25:35 +0800
Subject: [PATCH 080/178] re-commit: add workflows for macOS for 3.4

---
 .github/workflows/PR-3.4.yaml | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/PR-3.4.yaml b/.github/workflows/PR-3.4.yaml
index a1efca8c26..1e3282310e 100644
--- a/.github/workflows/PR-3.4.yaml
+++ b/.github/workflows/PR-3.4.yaml
@@ -13,4 +13,10 @@ jobs:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-3.4-U20.yaml@main
 
   W10:
-    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-3.4-W10.yaml@main
\ No newline at end of file
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-3.4-W10.yaml@main
+
+  macOS-ARM64:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-3.4-macOS-ARM64.yaml@main
+
+  macOS-X64:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-3.4-macOS-x86_64.yaml@main
\ No newline at end of file

From 0d52c37e11121226eb7c8c356c369d202f00be5d Mon Sep 17 00:00:00 2001
From: Vincent Rabaud <vrabaud@google.com>
Date: Fri, 17 Jun 2022 11:39:50 +0200
Subject: [PATCH 081/178] Fix typo that prevents compilation with sanitizer.

---
 modules/core/src/hal_internal.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/core/src/hal_internal.cpp b/modules/core/src/hal_internal.cpp
index 5239acc585..d2648fae81 100644
--- a/modules/core/src/hal_internal.cpp
+++ b/modules/core/src/hal_internal.cpp
@@ -67,7 +67,7 @@
 #if defined(__clang__) && defined(__has_feature)
 #if __has_feature(memory_sanitizer)
 #define CV_ANNOTATE_MEMORY_IS_INITIALIZED(address, size) \
-__msan_unpoison(adresse, size)
+__msan_unpoison(address, size)
 #endif
 #endif
 #ifndef CV_ANNOTATE_MEMORY_IS_INITIALIZED

From db5b22e89549687dfd28dc3f9005545df4c96f9b Mon Sep 17 00:00:00 2001
From: Christine Poerschke <cpoerschke@apache.org>
Date: Fri, 17 Jun 2022 12:48:30 +0100
Subject: [PATCH 082/178] Merge pull request #22065 from
 cpoerschke:3.4-apps-visualisation-configurable-codec

* apps/opencv_visualisation: configurable video codec

* Update apps/visualisation/opencv_visualisation.cpp
---
 apps/visualisation/opencv_visualisation.cpp | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/apps/visualisation/opencv_visualisation.cpp b/apps/visualisation/opencv_visualisation.cpp
index be540efa62..85e9697aad 100644
--- a/apps/visualisation/opencv_visualisation.cpp
+++ b/apps/visualisation/opencv_visualisation.cpp
@@ -86,6 +86,9 @@ int main( int argc, const char** argv )
         "{ image i        |      | (required) path to reference image }"
         "{ model m        |      | (required) path to cascade xml file }"
         "{ data d         |      | (optional) path to video output folder }"
+        "{ ext            | avi  | (optional) output video file extension e.g. avi (default) or mp4 }"
+        "{ fourcc         | XVID | (optional) output video file's 4-character codec e.g. XVID (default) or H264 }"
+        "{ fps            |   15 | (optional) output video file's frames-per-second rate }"
     );
     // Read in the input arguments
     if (parser.has("help")){
@@ -96,7 +99,9 @@ int main( int argc, const char** argv )
     string model(parser.get<string>("model"));
     string output_folder(parser.get<string>("data"));
     string image_ref = (parser.get<string>("image"));
-    if (model.empty() || image_ref.empty()){
+    string fourcc = (parser.get<string>("fourcc"));
+    int fps = parser.get<int>("fps");
+    if (model.empty() || image_ref.empty() || fourcc.size()!=4 || fps<1){
         parser.printMessage();
         printLimits();
         return -1;
@@ -166,11 +171,19 @@ int main( int argc, const char** argv )
     // each stage, containing all weak classifiers for that stage.
     bool draw_planes = false;
     stringstream output_video;
-    output_video << output_folder << "model_visualization.avi";
+    output_video << output_folder << "model_visualization." << parser.get<string>("ext");
     VideoWriter result_video;
     if( output_folder.compare("") != 0 ){
         draw_planes = true;
-        result_video.open(output_video.str(), VideoWriter::fourcc('X','V','I','D'), 15, Size(reference_image.cols * resize_factor, reference_image.rows * resize_factor), false);
+        result_video.open(output_video.str(), VideoWriter::fourcc(fourcc[0],fourcc[1],fourcc[2],fourcc[3]), fps, visualization.size(), false);
+        if (!result_video.isOpened()){
+            cerr << "the output video '" << output_video.str() << "' could not be opened."
+                 << " fourcc=" << fourcc
+                 << " fps=" << fps
+                 << " frameSize=" << visualization.size()
+                 << endl;
+            return -1;
+        }
     }
 
     if(haar){

From a04f9e7a59b83008255bdae40e3499efe0af9fb2 Mon Sep 17 00:00:00 2001
From: catree <catree.catreus@outlook.com>
Date: Sun, 19 Jun 2022 01:27:32 +0200
Subject: [PATCH 083/178] Add more references. Update missing references with
 webarchive. Use mathbf for matrices. Check that the determinant of the
 rotation matrix is not negative (reflection), and correct it if it is the
 case.

---
 doc/opencv.bib                                | 20 +++--
 .../features2d/homography/homography.markdown | 84 ++++++++++---------
 modules/calib3d/include/opencv2/calib3d.hpp   |  4 +-
 .../Homography/pose_from_homography.cpp       | 11 ++-
 4 files changed, 70 insertions(+), 49 deletions(-)

diff --git a/doc/opencv.bib b/doc/opencv.bib
index 13fe71a1fb..dddb957995 100644
--- a/doc/opencv.bib
+++ b/doc/opencv.bib
@@ -543,7 +543,7 @@
   title = {Multiple view geometry in computer vision},
   year = {2003},
   publisher = {Cambridge university press},
-  url = {http://cds.cern.ch/record/1598612/files/0521540518_TOC.pdf}
+  url = {https://www.robots.ox.ac.uk/~vgg/hzbook/}
 }
 @article{Horaud95,
   author = {Horaud, Radu and Dornaika, Fadi},
@@ -745,10 +745,17 @@
   isbn = {0387008934},
   publisher = {Springer}
 }
-@article{Malis,
-  author = {Malis, Ezio and Vargas, Manuel and others},
-  title = {Deeper understanding of the homography decomposition for vision-based control},
-  year = {2007}
+@article{Malis2007,
+  author = {Malis, Ezio and Vargas, Manuel},
+  title = {{Deeper understanding of the homography decomposition for vision-based control}},
+  year = {2007},
+  url = {https://hal.inria.fr/inria-00174036},
+  type = {Research Report},
+  number = {RR-6303},
+  pages = {90},
+  institution = {{INRIA}},
+  keywords = {Visual servoing ; planar objects ; homography ; decomposition ; camera calibration errors ; structure from motion ; Euclidean},
+  pdf = {https://hal.inria.fr/inria-00174036v3/file/RR-6303.pdf},
 }
 @article{Marchand16,
   author = {Marchand, Eric and Uchiyama, Hideaki and Spindler, Fabien},
@@ -905,7 +912,8 @@
   author = {Szeliski, Richard},
   title = {Computer vision: algorithms and applications},
   year = {2010},
-  publisher = {Springer}
+  publisher = {Springer},
+  url = {https://szeliski.org/Book/}
 }
 @article{Rafael12,
   author = {von Gioi, Rafael Grompone and Jakubowicz, J{\'e}r{\'e}mie and Morel, Jean-Michel and Randall, Gregory},
diff --git a/doc/tutorials/features2d/homography/homography.markdown b/doc/tutorials/features2d/homography/homography.markdown
index 3d8f97c28b..e5009798c3 100644
--- a/doc/tutorials/features2d/homography/homography.markdown
+++ b/doc/tutorials/features2d/homography/homography.markdown
@@ -10,9 +10,11 @@ Introduction {#tutorial_homography_Introduction}
 
 This tutorial will demonstrate the basic concepts of the homography with some codes.
 For detailed explanations about the theory, please refer to a computer vision course or a computer vision book, e.g.:
-*   Multiple View Geometry in Computer Vision, @cite HartleyZ00.
-*   An Invitation to 3-D Vision: From Images to Geometric Models, @cite Ma:2003:IVI
-*   Computer Vision: Algorithms and Applications, @cite RS10
+*   Multiple View Geometry in Computer Vision, Richard Hartley and Andrew Zisserman, @cite HartleyZ00 (some sample chapters are available [here](https://www.robots.ox.ac.uk/~vgg/hzbook/), CVPR Tutorials are available [here](https://www.robots.ox.ac.uk/~az/tutorials/))
+*   An Invitation to 3-D Vision: From Images to Geometric Models, Yi Ma, Stefano Soatto, Jana Kosecka, and S. Shankar Sastry, @cite Ma:2003:IVI (a computer vision book handout is available [here](https://cs.gmu.edu/%7Ekosecka/cs685/VisionBookHandout.pdf))
+*   Computer Vision: Algorithms and Applications, Richard Szeliski, @cite RS10 (an electronic version is available [here](https://szeliski.org/Book/))
+*   Deeper understanding of the homography decomposition for vision-based control, Ezio Malis, Manuel Vargas, @cite Malis2007 (open access [here](https://hal.inria.fr/inria-00174036))
+*   Pose Estimation for Augmented Reality: A Hands-On Survey, Eric Marchand, Hideaki Uchiyama, Fabien Spindler, @cite Marchand16 (open access [here](https://hal.inria.fr/hal-01246370))
 
 The tutorial code can be found here [C++](https://github.com/opencv/opencv/tree/3.4/samples/cpp/tutorial_code/features2D/Homography),
 [Python](https://github.com/opencv/opencv/tree/3.4/samples/python/tutorial_code/features2D/Homography),
@@ -32,7 +34,7 @@ Briefly, the planar homography relates the transformation between two planes (up
   x^{'} \\
   y^{'} \\
   1
-  \end{bmatrix} = H
+  \end{bmatrix} = \mathbf{H}
   \begin{bmatrix}
   x \\
   y \\
@@ -123,22 +125,22 @@ A quick solution to retrieve the pose from the homography matrix is (see \ref po
 
 \f[
   \begin{align*}
-  \boldsymbol{X} &= \left( X, Y, 0, 1 \right ) \\
-  \boldsymbol{x} &= \boldsymbol{P}\boldsymbol{X} \\
-                 &= \boldsymbol{K} \left[ \boldsymbol{r_1} \hspace{0.5em} \boldsymbol{r_2} \hspace{0.5em} \boldsymbol{r_3} \hspace{0.5em} \boldsymbol{t} \right ]
+  \mathbf{X} &= \left( X, Y, 0, 1 \right ) \\
+  \mathbf{x} &= \mathbf{P}\mathbf{X} \\
+                 &= \mathbf{K} \left[ \mathbf{r_1} \hspace{0.5em} \mathbf{r_2} \hspace{0.5em} \mathbf{r_3} \hspace{0.5em} \mathbf{t} \right ]
   \begin{pmatrix}
   X \\
   Y \\
   0 \\
   1
   \end{pmatrix} \\
-             &= \boldsymbol{K} \left[ \boldsymbol{r_1} \hspace{0.5em} \boldsymbol{r_2} \hspace{0.5em} \boldsymbol{t} \right ]
+             &= \mathbf{K} \left[ \mathbf{r_1} \hspace{0.5em} \mathbf{r_2} \hspace{0.5em} \mathbf{t} \right ]
   \begin{pmatrix}
   X \\
   Y \\
   1
   \end{pmatrix} \\
-  &= \boldsymbol{H}
+  &= \mathbf{H}
   \begin{pmatrix}
   X \\
   Y \\
@@ -149,16 +151,16 @@ A quick solution to retrieve the pose from the homography matrix is (see \ref po
 
 \f[
   \begin{align*}
-  \boldsymbol{H} &= \lambda \boldsymbol{K} \left[ \boldsymbol{r_1} \hspace{0.5em} \boldsymbol{r_2} \hspace{0.5em} \boldsymbol{t} \right ] \\
-  \boldsymbol{K}^{-1} \boldsymbol{H} &= \lambda \left[ \boldsymbol{r_1} \hspace{0.5em} \boldsymbol{r_2} \hspace{0.5em} \boldsymbol{t} \right ] \\
-  \boldsymbol{P} &= \boldsymbol{K} \left[ \boldsymbol{r_1} \hspace{0.5em} \boldsymbol{r_2} \hspace{0.5em} \left( \boldsymbol{r_1} \times \boldsymbol{r_2} \right ) \hspace{0.5em} \boldsymbol{t} \right ]
+  \mathbf{H} &= \lambda \mathbf{K} \left[ \mathbf{r_1} \hspace{0.5em} \mathbf{r_2} \hspace{0.5em} \mathbf{t} \right ] \\
+  \mathbf{K}^{-1} \mathbf{H} &= \lambda \left[ \mathbf{r_1} \hspace{0.5em} \mathbf{r_2} \hspace{0.5em} \mathbf{t} \right ] \\
+  \mathbf{P} &= \mathbf{K} \left[ \mathbf{r_1} \hspace{0.5em} \mathbf{r_2} \hspace{0.5em} \left( \mathbf{r_1} \times \mathbf{r_2} \right ) \hspace{0.5em} \mathbf{t} \right ]
   \end{align*}
 \f]
 
 This is a quick solution (see also \ref projective_transformations "2") as this does not ensure that the resulting rotation matrix will be orthogonal and the scale is estimated roughly by normalize the first column to 1.
 
-A solution to have a proper rotation matrix (with the properties of a rotation matrix) consists to apply a polar decomposition
-(see \ref polar_decomposition "6" or \ref polar_decomposition_svd "7" for some information):
+A solution to have a proper rotation matrix (with the properties of a rotation matrix) consists to apply a polar decomposition, or orthogonalization of the rotation matrix
+(see \ref polar_decomposition "6" or \ref polar_decomposition_svd "7" or \ref polar_decomposition_svd_2 "8" or \ref Kabsch_algorithm "9" for some information):
 
 @snippet pose_from_homography.cpp polar-decomposition-of-the-rotation-matrix
 
@@ -239,7 +241,7 @@ To check the correctness of the calculation, the matching lines are displayed:
 
 ### Demo 3: Homography from the camera displacement {#tutorial_homography_Demo3}
 
-The homography relates the transformation between two planes and it is possible to retrieve the corresponding camera displacement that allows to go from the first to the second plane view (see @cite Malis for more information).
+The homography relates the transformation between two planes and it is possible to retrieve the corresponding camera displacement that allows to go from the first to the second plane view (see @cite Malis2007 for more information).
 Before going into the details that allow to compute the homography from the camera displacement, some recalls about camera pose and homogeneous transformation.
 
 The function @ref cv::solvePnP allows to compute the camera pose from the correspondences 3D object points (points expressed in the object frame) and the projected 2D image points (object points viewed in the image).
@@ -269,7 +271,7 @@ The intrinsic parameters and the distortion coefficients are required (see the c
   Z_o \\
   1
   \end{bmatrix} \\
-  &= \boldsymbol{K} \hspace{0.2em} ^{c}\textrm{M}_o
+  &= \mathbf{K} \hspace{0.2em} ^{c}\mathbf{M}_o
   \begin{bmatrix}
   X_o \\
   Y_o \\
@@ -279,9 +281,9 @@ The intrinsic parameters and the distortion coefficients are required (see the c
   \end{align*}
 \f]
 
-\f$ \boldsymbol{K} \f$ is the intrinsic matrix and \f$ ^{c}\textrm{M}_o \f$ is the camera pose. The output of @ref cv::solvePnP is exactly this: `rvec` is the Rodrigues rotation vector and `tvec` the translation vector.
+\f$ \mathbf{K} \f$ is the intrinsic matrix and \f$ ^{c}\mathbf{M}_o \f$ is the camera pose. The output of @ref cv::solvePnP is exactly this: `rvec` is the Rodrigues rotation vector and `tvec` the translation vector.
 
-\f$ ^{c}\textrm{M}_o \f$ can be represented in a homogeneous form and allows to transform a point expressed in the object frame into the camera frame:
+\f$ ^{c}\mathbf{M}_o \f$ can be represented in a homogeneous form and allows to transform a point expressed in the object frame into the camera frame:
 
 \f[
   \begin{align*}
@@ -291,7 +293,7 @@ The intrinsic parameters and the distortion coefficients are required (see the c
   Z_c \\
   1
   \end{bmatrix} &=
-  \hspace{0.2em} ^{c}\textrm{M}_o
+  \hspace{0.2em} ^{c}\mathbf{M}_o
   \begin{bmatrix}
   X_o \\
   Y_o \\
@@ -300,7 +302,7 @@ The intrinsic parameters and the distortion coefficients are required (see the c
   \end{bmatrix} \\
   &=
   \begin{bmatrix}
-  ^{c}\textrm{R}_o & ^{c}\textrm{t}_o \\
+  ^{c}\mathbf{R}_o & ^{c}\mathbf{t}_o \\
   0_{1\times3} & 1
   \end{bmatrix}
   \begin{bmatrix}
@@ -327,19 +329,19 @@ The intrinsic parameters and the distortion coefficients are required (see the c
 
 Transform a point expressed in one frame to another frame can be easily done with matrix multiplication:
 
-*   \f$ ^{c_1}\textrm{M}_o \f$ is the camera pose for the camera 1
-*   \f$ ^{c_2}\textrm{M}_o \f$ is the camera pose for the camera 2
+*   \f$ ^{c_1}\mathbf{M}_o \f$ is the camera pose for the camera 1
+*   \f$ ^{c_2}\mathbf{M}_o \f$ is the camera pose for the camera 2
 
 To transform a 3D point expressed in the camera 1 frame to the camera 2 frame:
 
 \f[
-  ^{c_2}\textrm{M}_{c_1} = \hspace{0.2em} ^{c_2}\textrm{M}_{o} \cdot \hspace{0.1em} ^{o}\textrm{M}_{c_1} = \hspace{0.2em} ^{c_2}\textrm{M}_{o} \cdot \hspace{0.1em} \left( ^{c_1}\textrm{M}_{o} \right )^{-1} =
+  ^{c_2}\mathbf{M}_{c_1} = \hspace{0.2em} ^{c_2}\mathbf{M}_{o} \cdot \hspace{0.1em} ^{o}\mathbf{M}_{c_1} = \hspace{0.2em} ^{c_2}\mathbf{M}_{o} \cdot \hspace{0.1em} \left( ^{c_1}\mathbf{M}_{o} \right )^{-1} =
   \begin{bmatrix}
-  ^{c_2}\textrm{R}_{o} & ^{c_2}\textrm{t}_{o} \\
+  ^{c_2}\mathbf{R}_{o} & ^{c_2}\mathbf{t}_{o} \\
   0_{3 \times 1} & 1
   \end{bmatrix} \cdot
   \begin{bmatrix}
-  ^{c_1}\textrm{R}_{o}^T & - \hspace{0.2em} ^{c_1}\textrm{R}_{o}^T \cdot \hspace{0.2em} ^{c_1}\textrm{t}_{o} \\
+  ^{c_1}\mathbf{R}_{o}^T & - \hspace{0.2em} ^{c_1}\mathbf{R}_{o}^T \cdot \hspace{0.2em} ^{c_1}\mathbf{t}_{o} \\
   0_{1 \times 3} & 1
   \end{bmatrix}
 \f]
@@ -362,11 +364,11 @@ On this figure, `n` is the normal vector of the plane and `d` the distance betwe
 The [equation](https://en.wikipedia.org/wiki/Homography_(computer_vision)#3D_plane_to_plane_equation) to compute the homography from the camera displacement is:
 
 \f[
-  ^{2}\textrm{H}_{1} = \hspace{0.2em} ^{2}\textrm{R}_{1} - \hspace{0.1em} \frac{^{2}\textrm{t}_{1} \cdot n^T}{d}
+  ^{2}\mathbf{H}_{1} = \hspace{0.2em} ^{2}\mathbf{R}_{1} - \hspace{0.1em} \frac{^{2}\mathbf{t}_{1} \cdot \hspace{0.1em} ^{1}\mathbf{n}^\top}{^1d}
 \f]
 
-Where \f$ ^{2}\textrm{H}_{1} \f$ is the homography matrix that maps the points in the first camera frame to the corresponding points in the second camera frame, \f$ ^{2}\textrm{R}_{1} = \hspace{0.2em} ^{c_2}\textrm{R}_{o} \cdot \hspace{0.1em} ^{c_1}\textrm{R}_{o}^{T} \f$
-is the rotation matrix that represents the rotation between the two camera frames and \f$ ^{2}\textrm{t}_{1} = \hspace{0.2em} ^{c_2}\textrm{R}_{o} \cdot \left( - \hspace{0.1em} ^{c_1}\textrm{R}_{o}^{T} \cdot \hspace{0.1em} ^{c_1}\textrm{t}_{o} \right ) + \hspace{0.1em} ^{c_2}\textrm{t}_{o} \f$
+Where \f$ ^{2}\mathbf{H}_{1} \f$ is the homography matrix that maps the points in the first camera frame to the corresponding points in the second camera frame, \f$ ^{2}\mathbf{R}_{1} = \hspace{0.2em} ^{c_2}\mathbf{R}_{o} \cdot \hspace{0.1em} ^{c_1}\mathbf{R}_{o}^{\top} \f$
+is the rotation matrix that represents the rotation between the two camera frames and \f$ ^{2}\mathbf{t}_{1} = \hspace{0.2em} ^{c_2}\mathbf{R}_{o} \cdot \left( - \hspace{0.1em} ^{c_1}\mathbf{R}_{o}^{\top} \cdot \hspace{0.1em} ^{c_1}\mathbf{t}_{o} \right ) + \hspace{0.1em} ^{c_2}\mathbf{t}_{o} \f$
 the translation vector between the two camera frames.
 
 Here the normal vector `n` is the plane normal expressed in the camera frame 1 and can be computed as the cross product of 2 vectors (using 3 non collinear points that lie on the plane) or in our case directly with:
@@ -377,7 +379,7 @@ The distance `d` can be computed as the dot product between the plane normal and
 
 @snippet homography_from_camera_displacement.cpp compute-plane-distance-to-the-camera-frame-1
 
-The projective homography matrix \f$ \textbf{G} \f$ can be computed from the Euclidean homography \f$ \textbf{H} \f$ using the intrinsic matrix \f$ \textbf{K} \f$ (see @cite Malis), here assuming the same camera between the two plane views:
+The projective homography matrix \f$ \textbf{G} \f$ can be computed from the Euclidean homography \f$ \textbf{H} \f$ using the intrinsic matrix \f$ \textbf{K} \f$ (see @cite Malis2007), here assuming the same camera between the two plane views:
 
 \f[
   \textbf{G} = \gamma \textbf{K} \textbf{H} \textbf{K}^{-1}
@@ -388,7 +390,7 @@ The projective homography matrix \f$ \textbf{G} \f$ can be computed from the Euc
 In our case, the Z-axis of the chessboard goes inside the object whereas in the homography figure it goes outside. This is just a matter of sign:
 
 \f[
-  ^{2}\textrm{H}_{1} = \hspace{0.2em} ^{2}\textrm{R}_{1} + \hspace{0.1em} \frac{^{2}\textrm{t}_{1} \cdot n^T}{d}
+  ^{2}\mathbf{H}_{1} = \hspace{0.2em} ^{2}\mathbf{R}_{1} + \hspace{0.1em} \frac{^{2}\mathbf{t}_{1} \cdot \hspace{0.1em} ^{1}\mathbf{n}^\top}{^1d}
 \f]
 
 @snippet homography_from_camera_displacement.cpp compute-homography-from-camera-displacement
@@ -466,8 +468,8 @@ As you can see, there is one solution that matches almost perfectly with the com
 At least two of the solutions may further be invalidated if point correspondences are available by applying positive depth constraint (all points must be in front of the camera).
 ```
 
-As the result of the decomposition is a camera displacement, if we have the initial camera pose \f$ ^{c_1}\textrm{M}_{o} \f$, we can compute the current camera pose
-\f$ ^{c_2}\textrm{M}_{o} = \hspace{0.2em} ^{c_2}\textrm{M}_{c_1} \cdot \hspace{0.1em} ^{c_1}\textrm{M}_{o} \f$ and test if the 3D object points that belong to the plane are projected in front of the camera or not.
+As the result of the decomposition is a camera displacement, if we have the initial camera pose \f$ ^{c_1}\mathbf{M}_{o} \f$, we can compute the current camera pose
+\f$ ^{c_2}\mathbf{M}_{o} = \hspace{0.2em} ^{c_2}\mathbf{M}_{c_1} \cdot \hspace{0.1em} ^{c_1}\mathbf{M}_{o} \f$ and test if the 3D object points that belong to the plane are projected in front of the camera or not.
 Another solution could be to retain the solution with the closest normal if we know the plane normal expressed at the camera 1 pose.
 
 The same thing but with the homography matrix estimated with @ref cv::findHomography
@@ -516,7 +518,7 @@ The [stitching module](@ref stitching) provides a complete pipeline to stitch im
 The homography transformation applies only for planar structure. But in the case of a rotating camera (pure rotation around the camera axis of projection, no translation), an arbitrary world can be considered
 ([see previously](@ref tutorial_homography_What_is_the_homography_matrix)).
 
-The homography can then be computed using the rotation transformation and the camera intrinsic parameters as (see for instance \ref homography_course "8"):
+The homography can then be computed using the rotation transformation and the camera intrinsic parameters as (see for instance \ref homography_course "10"):
 
 \f[
   s
@@ -534,7 +536,7 @@ The homography can then be computed using the rotation transformation and the ca
 \f]
 
 To illustrate, we used Blender, a free and open-source 3D computer graphics software, to generate two camera views with only a rotation transformation between each other.
-More information about how to retrieve the camera intrinsic parameters and the `3x4` extrinsic matrix with respect to the world can be found in \ref answer_blender "9" (an additional transformation
+More information about how to retrieve the camera intrinsic parameters and the `3x4` extrinsic matrix with respect to the world can be found in \ref answer_blender "11" (an additional transformation
 is needed to get the transformation between the camera and the object frames) with Blender.
 
 The figure below shows the two generated views of the Suzanne model, with only a rotation transformation:
@@ -603,11 +605,13 @@ Additional references {#tutorial_homography_Additional_references}
 ---------------------
 
 *   \anchor lecture_16 1. [Lecture 16: Planar Homographies](http://www.cse.psu.edu/~rtc12/CSE486/lecture16.pdf), Robert Collins
-*   \anchor projective_transformations 2. [2D projective transformations (homographies)](https://ags.cs.uni-kl.de/fileadmin/inf_ags/3dcv-ws11-12/3DCV_WS11-12_lec04.pdf), Christiano Gava, Gabriele Bleser
-*   \anchor szeliski 3. [Computer Vision: Algorithms and Applications](http://szeliski.org/Book/drafts/SzeliskiBook_20100903_draft.pdf), Richard Szeliski
+*   \anchor projective_transformations 2. [2D projective transformations (homographies)](https://web.archive.org/web/20171226115739/https://ags.cs.uni-kl.de/fileadmin/inf_ags/3dcv-ws11-12/3DCV_WS11-12_lec04.pdf), Christiano Gava, Gabriele Bleser
+*   \anchor szeliski 3. [Computer Vision: Algorithms and Applications](https://szeliski.org/Book/), Richard Szeliski
 *   \anchor answer_dsp 4. [Step by Step Camera Pose Estimation for Visual Tracking and Planar Markers](https://dsp.stackexchange.com/a/2737)
-*   \anchor pose_ar 5. [Pose from homography estimation](https://team.inria.fr/lagadic/camera_localization/tutorial-pose-dlt-planar-opencv.html)
+*   \anchor pose_ar 5. [Pose from homography estimation](https://visp-doc.inria.fr/doxygen/camera_localization/tutorial-pose-dlt-planar-opencv.html)
 *   \anchor polar_decomposition 6. [Polar Decomposition (in Continuum Mechanics)](http://www.continuummechanics.org/polardecomposition.html)
-*   \anchor polar_decomposition_svd 7. [A Personal Interview with the Singular Value Decomposition](https://web.stanford.edu/~gavish/documents/SVD_ans_you.pdf), Matan Gavish
-*   \anchor homography_course 8. [Homography](http://people.scs.carleton.ca/~c_shu/Courses/comp4900d/notes/homography.pdf), Dr. Gerhard Roth
-*   \anchor answer_blender 9. [3x4 camera matrix from blender camera](https://blender.stackexchange.com/a/38210)
+*   \anchor polar_decomposition_svd 7. [Chapter 3 - 3.1.2 From matrices to rotations - Theorem 3.1 (Least-squares estimation of a rotation from a matrix K)](https://www-sop.inria.fr/asclepios/cours/MVA/Rotations.pdf)
+*   \anchor polar_decomposition_svd_2 8. [A Personal Interview with the Singular Value Decomposition](https://web.stanford.edu/~gavish/documents/SVD_ans_you.pdf), Matan Gavish
+*   \anchor Kabsch_algorithm 9. [Kabsch algorithm, Computation of the optimal rotation matrix](https://en.wikipedia.org/wiki/Kabsch_algorithm#Computation_of_the_optimal_rotation_matrix)
+*   \anchor homography_course 10. [Homography](http://people.scs.carleton.ca/~c_shu/Courses/comp4900d/notes/homography.pdf), Dr. Gerhard Roth
+*   \anchor answer_blender 11. [3x4 camera matrix from blender camera](https://blender.stackexchange.com/a/38210)
diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp
index 63eded027f..99458e86fa 100644
--- a/modules/calib3d/include/opencv2/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d.hpp
@@ -2559,7 +2559,7 @@ Check @ref tutorial_homography "the corresponding tutorial" for more details.
 
 This function extracts relative camera motion between two views of a planar object and returns up to
 four mathematical solution tuples of rotation, translation, and plane normal. The decomposition of
-the homography matrix H is described in detail in @cite Malis.
+the homography matrix H is described in detail in @cite Malis2007.
 
 If the homography H, induced by the plane, gives the constraint
 \f[s_i \vecthree{x'_i}{y'_i}{1} \sim H \vecthree{x_i}{y_i}{1}\f] on the source image points
@@ -2587,7 +2587,7 @@ CV_EXPORTS_W int decomposeHomographyMat(InputArray H,
 @param pointsMask optional Mat/Vector of 8u type representing the mask for the inliers as given by the findHomography function
 
 This function is intended to filter the output of the decomposeHomographyMat based on additional
-information as described in @cite Malis . The summary of the method: the decomposeHomographyMat function
+information as described in @cite Malis2007 . The summary of the method: the decomposeHomographyMat function
 returns 2 unique solutions and their "opposites" for a total of 4 solutions. If we have access to the
 sets of points visible in the camera frame before and after the homography transformation is applied,
 we can determine which are the true potential solutions and which are the opposites by verifying which
diff --git a/samples/cpp/tutorial_code/features2D/Homography/pose_from_homography.cpp b/samples/cpp/tutorial_code/features2D/Homography/pose_from_homography.cpp
index 37ebcac7be..ae86fcfaa2 100644
--- a/samples/cpp/tutorial_code/features2D/Homography/pose_from_homography.cpp
+++ b/samples/cpp/tutorial_code/features2D/Homography/pose_from_homography.cpp
@@ -109,9 +109,18 @@ void poseEstimationFromCoplanarPoints(const string &imgPath, const string &intri
 
     //! [polar-decomposition-of-the-rotation-matrix]
     cout << "R (before polar decomposition):\n" << R << "\ndet(R): " << determinant(R) << endl;
-    Mat W, U, Vt;
+    Mat_<double> W, U, Vt;
     SVDecomp(R, W, U, Vt);
     R = U*Vt;
+    double det = determinant(R);
+    if (det < 0)
+    {
+        Vt.at<double>(2,0) *= -1;
+        Vt.at<double>(2,1) *= -1;
+        Vt.at<double>(2,2) *= -1;
+
+        R = U*Vt;
+    }
     cout << "R (after polar decomposition):\n" << R << "\ndet(R): " << determinant(R) << endl;
     //! [polar-decomposition-of-the-rotation-matrix]
 

From dd7b9000ada172fa3c8954a89339dcf6249b8a59 Mon Sep 17 00:00:00 2001
From: Kumataro <Kumataro@users.noreply.github.com>
Date: Tue, 21 Jun 2022 03:42:50 +0900
Subject: [PATCH 084/178] Merge pull request #22064 from Kumataro:3.4-fix22052

* imgcodecs: jpeg: add IMWRITE_JPEG_SAMPLING_FACTOR parameter

* fix compile error

* imgcodecs: jpeg: add CV_LOG_WARNING() and fix how to initilize Mat

* imgcodecs: jpeg: fix for C++98 mode.

* samples: imgcodec_jpeg: Remove license
---
 .../imgcodecs/include/opencv2/imgcodecs.hpp   | 11 +++
 modules/imgcodecs/src/grfmt_jpeg.cpp          | 32 +++++++
 modules/imgcodecs/test/test_jpeg.cpp          | 92 +++++++++++++++++++
 samples/cpp/imgcodecs_jpeg.cpp                | 82 +++++++++++++++++
 4 files changed, 217 insertions(+)
 create mode 100644 samples/cpp/imgcodecs_jpeg.cpp

diff --git a/modules/imgcodecs/include/opencv2/imgcodecs.hpp b/modules/imgcodecs/include/opencv2/imgcodecs.hpp
index ba63e9934b..49f76826b9 100644
--- a/modules/imgcodecs/include/opencv2/imgcodecs.hpp
+++ b/modules/imgcodecs/include/opencv2/imgcodecs.hpp
@@ -89,6 +89,7 @@ enum ImwriteFlags {
        IMWRITE_JPEG_RST_INTERVAL   = 4,  //!< JPEG restart interval, 0 - 65535, default is 0 - no restart.
        IMWRITE_JPEG_LUMA_QUALITY   = 5,  //!< Separate luma quality level, 0 - 100, default is -1 - don't use.
        IMWRITE_JPEG_CHROMA_QUALITY = 6,  //!< Separate chroma quality level, 0 - 100, default is -1 - don't use.
+       IMWRITE_JPEG_SAMPLING_FACTOR = 7, //!< For JPEG, set sampling factor. See cv::ImwriteJPEGSamplingFactorParams.
        IMWRITE_PNG_COMPRESSION     = 16, //!< For PNG, it can be the compression level from 0 to 9. A higher value means a smaller size and longer compression time. If specified, strategy is changed to IMWRITE_PNG_STRATEGY_DEFAULT (Z_DEFAULT_STRATEGY). Default value is 1 (best speed setting).
        IMWRITE_PNG_STRATEGY        = 17, //!< One of cv::ImwritePNGFlags, default is IMWRITE_PNG_STRATEGY_RLE.
        IMWRITE_PNG_BILEVEL         = 18, //!< Binary level PNG, 0 or 1, default is 0.
@@ -102,12 +103,22 @@ enum ImwriteFlags {
        IMWRITE_TIFF_COMPRESSION    = 259 //!< For TIFF, use to specify the image compression scheme. See libtiff for integer constants corresponding to compression formats. Note, for images whose depth is CV_32F, only libtiff's SGILOG compression scheme is used. For other supported depths, the compression scheme can be specified by this flag; LZW compression is the default.
      };
 
+enum ImwriteJPEGSamplingFactorParams {
+       IMWRITE_JPEG_SAMPLING_FACTOR_411 = 0x411111, //!< 4x1,1x1,1x1
+       IMWRITE_JPEG_SAMPLING_FACTOR_420 = 0x221111, //!< 2x2,1x1,1x1(Default)
+       IMWRITE_JPEG_SAMPLING_FACTOR_422 = 0x211111, //!< 2x1,1x1,1x1
+       IMWRITE_JPEG_SAMPLING_FACTOR_440 = 0x121111, //!< 1x2,1x1,1x1
+       IMWRITE_JPEG_SAMPLING_FACTOR_444 = 0x111111  //!< 1x1,1x1,1x1(No subsampling)
+     };
+
+
 enum ImwriteEXRTypeFlags {
        /*IMWRITE_EXR_TYPE_UNIT = 0, //!< not supported */
        IMWRITE_EXR_TYPE_HALF   = 1, //!< store as HALF (FP16)
        IMWRITE_EXR_TYPE_FLOAT  = 2  //!< store as FP32 (default)
      };
 
+
 //! Imwrite PNG specific flags used to tune the compression algorithm.
 /** These flags will be modify the way of PNG image compression and will be passed to the underlying zlib processing stage.
 
diff --git a/modules/imgcodecs/src/grfmt_jpeg.cpp b/modules/imgcodecs/src/grfmt_jpeg.cpp
index d9e056f1a8..9500e196f5 100644
--- a/modules/imgcodecs/src/grfmt_jpeg.cpp
+++ b/modules/imgcodecs/src/grfmt_jpeg.cpp
@@ -44,6 +44,8 @@
 
 #ifdef HAVE_JPEG
 
+#include <opencv2/core/utils/logger.hpp>
+
 #ifdef _MSC_VER
 //interaction between '_setjmp' and C++ object destruction is non-portable
 #pragma warning(disable: 4611)
@@ -640,6 +642,7 @@ bool JpegEncoder::write( const Mat& img, const std::vector<int>& params )
         int rst_interval = 0;
         int luma_quality = -1;
         int chroma_quality = -1;
+        uint32_t sampling_factor = 0; // same as 0x221111
 
         for( size_t i = 0; i < params.size(); i += 2 )
         {
@@ -687,6 +690,27 @@ bool JpegEncoder::write( const Mat& img, const std::vector<int>& params )
                 rst_interval = params[i+1];
                 rst_interval = MIN(MAX(rst_interval, 0), 65535L);
             }
+
+            if( params[i] == IMWRITE_JPEG_SAMPLING_FACTOR )
+            {
+                sampling_factor = static_cast<uint32_t>(params[i+1]);
+
+                switch ( sampling_factor )
+                {
+                    case IMWRITE_JPEG_SAMPLING_FACTOR_411:
+                    case IMWRITE_JPEG_SAMPLING_FACTOR_420:
+                    case IMWRITE_JPEG_SAMPLING_FACTOR_422:
+                    case IMWRITE_JPEG_SAMPLING_FACTOR_440:
+                    case IMWRITE_JPEG_SAMPLING_FACTOR_444:
+                    // OK.
+                    break;
+
+                    default:
+                    CV_LOG_WARNING(NULL, cv::format("Unknown value for IMWRITE_JPEG_SAMPLING_FACTOR: 0x%06x", sampling_factor ) );
+                    sampling_factor = 0;
+                    break;
+                }
+            }
         }
 
         jpeg_set_defaults( &cinfo );
@@ -699,6 +723,14 @@ bool JpegEncoder::write( const Mat& img, const std::vector<int>& params )
         if( optimize )
             cinfo.optimize_coding = TRUE;
 
+        if( (channels > 1) && ( sampling_factor != 0 ) )
+        {
+            cinfo.comp_info[0].v_samp_factor = (sampling_factor >> 16 ) & 0xF;
+            cinfo.comp_info[0].h_samp_factor = (sampling_factor >> 20 ) & 0xF;
+            cinfo.comp_info[1].v_samp_factor = 1;
+            cinfo.comp_info[1].h_samp_factor = 1;
+        }
+
 #if JPEG_LIB_VERSION >= 70
         if (luma_quality >= 0 && chroma_quality >= 0)
         {
diff --git a/modules/imgcodecs/test/test_jpeg.cpp b/modules/imgcodecs/test/test_jpeg.cpp
index 9b516a9aba..b7932a0020 100644
--- a/modules/imgcodecs/test/test_jpeg.cpp
+++ b/modules/imgcodecs/test/test_jpeg.cpp
@@ -178,6 +178,98 @@ TEST(Imgcodecs_Jpeg, encode_decode_rst_jpeg)
     EXPECT_EQ(0, remove(output_normal.c_str()));
 }
 
+//==================================================================================================
+
+static const uint32_t default_sampling_factor = static_cast<uint32_t>(0x221111);
+
+static uint32_t test_jpeg_subsampling( const Mat src, const vector<int> param )
+{
+    vector<uint8_t> jpeg;
+
+    if ( cv::imencode(".jpg", src, jpeg, param ) == false )
+    {
+        return 0;
+    }
+
+    if ( src.channels() != 3 )
+    {
+        return 0;
+    }
+
+    // Find SOF Marker(FFC0)
+    int sof_offset = 0; // not found.
+    int jpeg_size = static_cast<int>( jpeg.size() );
+    for ( int i = 0 ; i < jpeg_size - 1; i++ )
+    {
+        if ( (jpeg[i] == 0xff ) && ( jpeg[i+1] == 0xC0 ) )
+        {
+            sof_offset = i;
+            break;
+        }
+    }
+    if ( sof_offset == 0 )
+    {
+        return 0;
+    }
+
+    // Extract Subsampling Factor from SOF.
+    return ( jpeg[sof_offset + 0x0A + 3 * 0 + 1] << 16 ) +
+           ( jpeg[sof_offset + 0x0A + 3 * 1 + 1] << 8  ) +
+           ( jpeg[sof_offset + 0x0A + 3 * 2 + 1]       ) ;
+}
+
+TEST(Imgcodecs_Jpeg, encode_subsamplingfactor_default)
+{
+    vector<int> param;
+    Mat src( 48, 64, CV_8UC3, cv::Scalar::all(0) );
+    EXPECT_EQ( default_sampling_factor, test_jpeg_subsampling(src, param) );
+}
+
+TEST(Imgcodecs_Jpeg, encode_subsamplingfactor_usersetting_valid)
+{
+    Mat src( 48, 64, CV_8UC3, cv::Scalar::all(0) );
+    const uint32_t sampling_factor_list[] = {
+        IMWRITE_JPEG_SAMPLING_FACTOR_411,
+        IMWRITE_JPEG_SAMPLING_FACTOR_420,
+        IMWRITE_JPEG_SAMPLING_FACTOR_422,
+        IMWRITE_JPEG_SAMPLING_FACTOR_440,
+        IMWRITE_JPEG_SAMPLING_FACTOR_444,
+    };
+    const int sampling_factor_list_num = 5;
+
+    for ( int i = 0 ; i < sampling_factor_list_num; i ++ )
+    {
+        vector<int> param;
+        param.push_back( IMWRITE_JPEG_SAMPLING_FACTOR );
+        param.push_back( sampling_factor_list[i] );
+        EXPECT_EQ( sampling_factor_list[i], test_jpeg_subsampling(src, param) );
+    }
+}
+
+TEST(Imgcodecs_Jpeg, encode_subsamplingfactor_usersetting_invalid)
+{
+    Mat src( 48, 64, CV_8UC3, cv::Scalar::all(0) );
+    const uint32_t sampling_factor_list[] = { // Invalid list
+        0x111112,
+        0x000000,
+        0x001111,
+        0xFF1111,
+        0x141111, // 1x4,1x1,1x1 - unknown
+        0x241111, // 2x4,1x1,1x1 - unknown
+        0x421111, // 4x2,1x1,1x1 - unknown
+        0x441111, // 4x4,1x1,1x1 - 410(libjpeg cannot handle it)
+    };
+    const int sampling_factor_list_num = 8;
+
+    for ( int i = 0 ; i < sampling_factor_list_num; i ++ )
+    {
+        vector<int> param;
+        param.push_back( IMWRITE_JPEG_SAMPLING_FACTOR );
+        param.push_back( sampling_factor_list[i] );
+        EXPECT_EQ( default_sampling_factor, test_jpeg_subsampling(src, param) );
+    }
+}
+
 #endif // HAVE_JPEG
 
 }} // namespace
diff --git a/samples/cpp/imgcodecs_jpeg.cpp b/samples/cpp/imgcodecs_jpeg.cpp
new file mode 100644
index 0000000000..b3abc49286
--- /dev/null
+++ b/samples/cpp/imgcodecs_jpeg.cpp
@@ -0,0 +1,82 @@
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/imgcodecs.hpp>
+#include <iostream>
+#include <vector>
+
+using namespace std;
+using namespace cv;
+
+int main(int /*argc*/, const char** /* argv */ )
+{
+    Mat framebuffer( 160 * 2, 160 * 5, CV_8UC3, cv::Scalar::all(255) );
+
+    Mat img( 160, 160, CV_8UC3, cv::Scalar::all(255) );
+
+    // Create test image.
+    {
+        const Point center( img.rows / 2 , img.cols /2 );
+
+        for( int radius = 5; radius < img.rows ; radius += 3.5 )
+        {
+            cv::circle( img, center, radius, Scalar(255,0,255) );
+        }
+        cv::rectangle( img, Point(0,0), Point(img.rows-1, img.cols-1), Scalar::all(0), 2 );
+    }
+
+    // Draw original image(s).
+    int top = 0; // Upper images
+    {
+        for( int left = 0 ; left < img.rows * 5 ; left += img.rows ){
+            Mat roi = framebuffer( Rect( left, top, img.rows, img.cols ) );
+            img.copyTo(roi);
+
+            cv::putText( roi, "original", Point(5,15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar::all(0), 2, 4, false );
+        }
+    }
+
+    // Draw lossy images
+    top += img.cols; // Lower images
+    {
+        struct test_config{
+            string comment;
+            uint32_t sampling_factor;
+        } config [] = {
+            { "411", IMWRITE_JPEG_SAMPLING_FACTOR_411 },
+            { "420", IMWRITE_JPEG_SAMPLING_FACTOR_420 },
+            { "422", IMWRITE_JPEG_SAMPLING_FACTOR_422 },
+            { "440", IMWRITE_JPEG_SAMPLING_FACTOR_440 },
+            { "444", IMWRITE_JPEG_SAMPLING_FACTOR_444 },
+        };
+
+        const int config_num = 5;
+
+        int left = 0;
+
+        for ( int i = 0 ; i < config_num; i++ )
+        {
+            // Compress images with sampling factor parameter.
+            vector<int> param;
+            param.push_back( IMWRITE_JPEG_SAMPLING_FACTOR );
+            param.push_back( config[i].sampling_factor );
+            vector<uint8_t> jpeg;
+            (void) imencode(".jpg", img, jpeg, param );
+
+            // Decompress it.
+            Mat jpegMat(jpeg);
+            Mat lossy_img = imdecode(jpegMat, -1);
+
+            // Copy into framebuffer and comment
+            Mat roi = framebuffer( Rect( left, top, lossy_img.rows, lossy_img.cols ) );
+            lossy_img.copyTo(roi);
+            cv::putText( roi, config[i].comment, Point(5,155), FONT_HERSHEY_SIMPLEX, 0.5, Scalar::all(0), 2, 4, false );
+
+            left += lossy_img.rows;
+        }
+    }
+
+    // Output framebuffer(as lossless).
+    imwrite( "imgcodecs_jpeg_samplingfactor_result.png", framebuffer );
+
+    return 0;
+}

From 2411b825b45d91440b25cce5d0b99dcf115cc8d8 Mon Sep 17 00:00:00 2001
From: Zihao Mu <zihaomu@outlook.com>
Date: Wed, 22 Jun 2022 15:00:17 +0800
Subject: [PATCH 085/178] bug fixed of GEMM node in ONNX_importer

---
 modules/dnn/src/onnx/onnx_importer.cpp  | 10 ++++++----
 modules/dnn/test/test_onnx_importer.cpp |  5 +++++
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index f06ff32dbe..7e035b3f23 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -2080,15 +2080,17 @@ void ONNXImporter::parseBatchNormalization(LayerParams& layerParams, const openc
     addLayer(layerParams, node_proto);
 }
 
+// A * B + C = Y, we require that the dimension of A is [m, k], and the dimension of B is [n, k].
+// And the dim of output Y is [m, n]
 void ONNXImporter::parseGemm(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
 {
     CV_Assert(node_proto.input_size() >= 2);
     layerParams.type = "InnerProduct";
     Mat weights = getBlob(node_proto, 1);
-    int ind_num_out = 0;
-    if (layerParams.has("transB") && !layerParams.get<int>("transB")) {
+
+    if (!layerParams.get<int>("transB", 0))
+    {
         transpose(weights, weights);
-        ind_num_out = 1;
     }
     layerParams.blobs.push_back(weights);
 
@@ -2110,7 +2112,7 @@ void ONNXImporter::parseGemm(LayerParams& layerParams, const opencv_onnx::NodePr
         addLayer(constParams, proto);
     }
 
-    layerParams.set("num_output", layerParams.blobs[0].size[ind_num_out]);
+    layerParams.set("num_output", layerParams.blobs[0].size[0]);
     layerParams.set("bias_term", node_proto.input_size() == 3);
     addLayer(layerParams, node_proto);
 }
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index 50540cd043..bf59cbba97 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -1746,6 +1746,11 @@ TEST_P(Test_ONNX_layers, DivConst)
     testONNXModels("div_const");
 }
 
+TEST_P(Test_ONNX_layers, Gemm)
+{
+    testONNXModels("gemm_no_transB");
+    testONNXModels("gemm_transB_0");
+}
 
 TEST_P(Test_ONNX_layers, Quantized_Convolution)
 {

From a6ca48a1c22a2f0e7e3b493b78bd2ac4a5a7acf0 Mon Sep 17 00:00:00 2001
From: Wanli <zhongwl2018@mail.sustech.edu.cn>
Date: Wed, 22 Jun 2022 19:21:16 +0800
Subject: [PATCH 086/178] Merge pull request #22100 from WanliZhong:issue_22015

Fix issue 22015, let Clip layer support 1-3 inputs

* Fix issue 22015.
Let layer Clip support 1-3 inputs.

* Resolve other problems caused by modifications

* Update onnx_importer.cpp

added extra checks to min/max handling in Clip

* Add assertions to check the size of the input

* Add test for clip with min and max initializers

* Separate test for "clip_init_min_max". Change the check method for input_size to provide a clearer message in case of problem.

* Add tests for clip with min or max initializers

* Change the implementation of getting input

Co-authored-by: Vadim Pisarevsky <vadim.pisarevsky@gmail.com>
---
 modules/dnn/src/onnx/onnx_importer.cpp  | 25 ++++++++++++++++++++++---
 modules/dnn/test/test_onnx_importer.cpp |  7 +++++++
 2 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index 7e035b3f23..15e6919f19 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -1931,10 +1931,29 @@ void ONNXImporter::parseImageScaler(LayerParams& layerParams, const opencv_onnx:
 
 void ONNXImporter::parseClip(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
 {
-    CV_CheckEQ(node_proto.input_size(), 1, "");
     layerParams.type = "ReLU6";
-    layerParams.set("min_value", layerParams.get<float>("min", -FLT_MAX));
-    layerParams.set("max_value", layerParams.get<float>("max", FLT_MAX));
+    float min_value = -FLT_MAX, max_value = FLT_MAX;
+    int input_size = node_proto.input_size();
+    CV_Check(input_size, 1 <= input_size && input_size <= 3, "");
+
+    if (input_size >= 2 && !node_proto.input(1).empty())
+    {
+        if (constBlobs.find(node_proto.input(1)) != constBlobs.end())
+            min_value = getBlob(node_proto, 1).at<float>(0);
+        else
+            CV_Error(Error::StsNotImplemented, "Non-constant min values in Clip are not supported");
+    }
+
+    if (input_size == 3 && !node_proto.input(2).empty())
+    {
+        if (constBlobs.find(node_proto.input(2)) != constBlobs.end())
+            max_value = getBlob(node_proto, 2).at<float>(0);
+        else
+            CV_Error(Error::StsNotImplemented, "Non-constant max values in Clip are not supported");
+    }
+
+    layerParams.set("min_value", layerParams.get<float>("min", min_value));
+    layerParams.set("max_value", layerParams.get<float>("max", max_value));
     addLayer(layerParams, node_proto);
 }
 
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index bf59cbba97..578e0442b2 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -389,6 +389,13 @@ TEST_P(Test_ONNX_layers, Clip)
     testONNXModels("clip", npy);
 }
 
+TEST_P(Test_ONNX_layers, Clip_init)
+{
+    testONNXModels("clip_init_min_max");
+    testONNXModels("clip_init_min");
+    testONNXModels("clip_init_max");
+}
+
 TEST_P(Test_ONNX_layers, Shape)
 {
     testONNXModels("shape_of_constant");

From ef94275eb67bbf662f2d2597f1d513b5b772756b Mon Sep 17 00:00:00 2001
From: Zihao Mu <zihaomu@outlook.com>
Date: Wed, 22 Jun 2022 21:08:48 +0800
Subject: [PATCH 087/178] bug fixed of GEMM node in ONNX_importer

---
 modules/dnn/src/onnx/onnx_importer.cpp  | 8 ++++----
 modules/dnn/test/test_onnx_importer.cpp | 6 ++++++
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index e4cbe02840..7a0532fcf4 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -1759,15 +1759,15 @@ void ONNXImporter::parseBatchNormalization(LayerParams& layerParams, const openc
     addLayer(layerParams, node_proto);
 }
 
+// A * B + C = Y, we require that the dimension of A is [m, k], and the dimension of B is [n, k].
+// And the dim of output Y is [m, n]
 void ONNXImporter::parseGemm(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
 {
     CV_Assert(node_proto.input_size() >= 2);
     layerParams.type = "InnerProduct";
     Mat weights = getBlob(node_proto, 1);
-    int ind_num_out = 0;
-    if (layerParams.has("transB") && !layerParams.get<int>("transB")) {
+    if (!layerParams.get<int>("transB", 0)) {
         transpose(weights, weights);
-        ind_num_out = 1;
     }
     layerParams.blobs.push_back(weights);
 
@@ -1789,7 +1789,7 @@ void ONNXImporter::parseGemm(LayerParams& layerParams, const opencv_onnx::NodePr
         addLayer(constParams, proto);
     }
 
-    layerParams.set("num_output", layerParams.blobs[0].size[ind_num_out]);
+    layerParams.set("num_output", layerParams.blobs[0].size[0]);
     layerParams.set("bias_term", node_proto.input_size() == 3);
     addLayer(layerParams, node_proto);
 }
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index 60473ede58..56203cba56 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -1389,6 +1389,12 @@ TEST_P(Test_ONNX_layers, DivConst)
     testONNXModels("div_const");
 }
 
+TEST_P(Test_ONNX_layers, Gemm)
+{
+    testONNXModels("gemm_no_transB");
+    testONNXModels("gemm_transB_0");
+}
+
 TEST_P(Test_ONNX_layers, OutputRegistration)
 {
     testONNXModels("output_registration", npy, 0, 0, false, true, 2);

From 82010bf5c105a3e6f1c2fbf73e7ec164c7a96235 Mon Sep 17 00:00:00 2001
From: Vincent Rabaud <vrabaud@google.com>
Date: Fri, 24 Jun 2022 11:31:54 +0200
Subject: [PATCH 088/178] Fix some overflows in drawing.

---
 modules/imgproc/src/drawing.cpp | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/modules/imgproc/src/drawing.cpp b/modules/imgproc/src/drawing.cpp
index 65735bd9c3..81d56075ca 100644
--- a/modules/imgproc/src/drawing.cpp
+++ b/modules/imgproc/src/drawing.cpp
@@ -1159,7 +1159,7 @@ FillConvexPoly( Mat& img, const Point2l* v, int npts, const void* color, int lin
     edge[0].x = edge[1].x = -XY_ONE;
     edge[0].dx = edge[1].dx = 0;
 
-    ptr += img.step*y;
+    ptr += (int64_t)img.step*y;
 
     do
     {
@@ -1188,7 +1188,7 @@ FillConvexPoly( Mat& img, const Point2l* v, int npts, const void* color, int lin
                             }
 
                             edge[i].ye = ty;
-                            edge[i].dx = ((xe - xs)*2 + (ty - y)) / (2 * (ty - y));
+                            edge[i].dx = ((xe - xs)*2 + ((int64_t)ty - y)) / (2 * ((int64_t)ty - y));
                             edge[i].x = xs;
                             edge[i].idx = idx;
                             break;
@@ -1461,7 +1461,7 @@ Circle( Mat& img, Point center, int radius, const void* color, int fill )
     size_t step = img.step;
     int pix_size = (int)img.elemSize();
     uchar* ptr = img.ptr();
-    int err = 0, dx = radius, dy = 0, plus = 1, minus = (radius << 1) - 1;
+    int64_t err = 0, dx = radius, dy = 0, plus = 1, minus = (radius << 1) - 1;
     int inside = center.x >= radius && center.x < size.width - radius &&
         center.y >= radius && center.y < size.height - radius;
 
@@ -1471,8 +1471,8 @@ Circle( Mat& img, Point center, int radius, const void* color, int fill )
     while( dx >= dy )
     {
         int mask;
-        int y11 = center.y - dy, y12 = center.y + dy, y21 = center.y - dx, y22 = center.y + dx;
-        int x11 = center.x - dx, x12 = center.x + dx, x21 = center.x - dy, x22 = center.x + dy;
+        int64_t y11 = center.y - dy, y12 = center.y + dy, y21 = center.y - dx, y22 = center.y + dx;
+        int64_t x11 = center.x - dx, x12 = center.x + dx, x21 = center.x - dy, x22 = center.x + dy;
 
         if( inside )
         {
@@ -1512,7 +1512,7 @@ Circle( Mat& img, Point center, int radius, const void* color, int fill )
         {
             if( fill )
             {
-                x11 = std::max( x11, 0 );
+                x11 = std::max( x11, (int64_t)0 );
                 x12 = MIN( x12, size.width - 1 );
             }
 
@@ -1550,7 +1550,7 @@ Circle( Mat& img, Point center, int radius, const void* color, int fill )
             {
                 if( fill )
                 {
-                    x21 = std::max( x21, 0 );
+                    x21 = std::max( x21, (int64_t)0 );
                     x22 = MIN( x22, size.width - 1 );
                 }
 
@@ -1848,6 +1848,11 @@ void rectangle( Mat& img, Rect rec,
     CV_INSTRUMENT_REGION();
 
     CV_Assert( 0 <= shift && shift <= XY_SHIFT );
+
+    // Crop the rectangle to right around the mat.
+    rec &= Rect(-(1 << shift), -(1 << shift), ((img.cols + 2) << shift),
+                ((img.rows + 2) << shift));
+
     if( !rec.empty() )
         rectangle( img, rec.tl(), rec.br() - Point(1<<shift,1<<shift),
                    color, thickness, lineType, shift );

From b5adffd5c22e9c749b89864a4273bbea99c385ba Mon Sep 17 00:00:00 2001
From: Vadim Pisarevsky <vadim.pisarevsky@gmail.com>
Date: Fri, 24 Jun 2022 14:58:32 +0300
Subject: [PATCH 089/178] * cleaned cvRound(), cvFloor() and cvCeil()
 implementations, removed the old non-banking rounding branch completely *
 enable the use of GCC/clang __builtin_*() functions more broadly

---
 .../core/include/opencv2/core/fast_math.hpp   | 66 +++++++------------
 1 file changed, 24 insertions(+), 42 deletions(-)

diff --git a/modules/core/include/opencv2/core/fast_math.hpp b/modules/core/include/opencv2/core/fast_math.hpp
index eb4fbe213b..cd5de0b546 100644
--- a/modules/core/include/opencv2/core/fast_math.hpp
+++ b/modules/core/include/opencv2/core/fast_math.hpp
@@ -128,12 +128,8 @@
     #define CV_INLINE_ISNAN_FLT(value) CV_INLINE_ISNAN_DBL(value)
   #endif
 
-  #if !defined(OPENCV_USE_FASTMATH_BUILTINS) \
-    && ( \
-        defined(__x86_64__) || defined(__i686__) \
-        || defined(__arm__) \
-        || defined(__PPC64__) \
-    )
+  #if !defined(OPENCV_USE_FASTMATH_BUILTINS) && \
+      (defined __GNUC__ || defined __clang__ || defined _MSC_VER)
     /* Let builtin C math functions when available. Dedicated hardware is available to
        round and convert FP values. */
     #define OPENCV_USE_FASTMATH_BUILTINS 1
@@ -201,9 +197,7 @@ cvRound( double value )
 {
 #if defined CV_INLINE_ROUND_DBL
     CV_INLINE_ROUND_DBL(value);
-#elif ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \
-    && defined __SSE2__ && !defined __APPLE__) || CV_SSE2) \
-    && !defined(__CUDACC__)
+#elif (defined _MSC_VER && defined _M_X64) && !defined(__CUDACC__)
     __m128d t = _mm_set_sd( value );
     return _mm_cvtsd_si32(t);
 #elif defined _MSC_VER && defined _M_IX86
@@ -214,12 +208,11 @@ cvRound( double value )
         fistp t;
     }
     return t;
-#elif defined CV_ICC || defined __GNUC__
-    return (int)(lrint(value));
+#elif defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || \
+      defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS
+    return (int)__builtin_lrint(value);
 #else
-    /* it's ok if round does not comply with IEEE754 standard;
-       the tests should allow +/-1 difference when the tested functions use round */
-    return (int)(value + (value >= 0 ? 0.5 : -0.5));
+    return (int)lrint(value);
 #endif
 }
 
@@ -233,11 +226,9 @@ cvRound( double value )
  */
 CV_INLINE int cvFloor( double value )
 {
-#if (defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS) \
-    && ( \
-        defined(__PPC64__) \
-    )
-    return __builtin_floor(value);
+#if defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || \
+    defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS
+    return (int)__builtin_floor(value);
 #else
     int i = (int)value;
     return i - (i > value);
@@ -253,11 +244,9 @@ CV_INLINE int cvFloor( double value )
  */
 CV_INLINE int cvCeil( double value )
 {
-#if (defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS) \
-    && ( \
-        defined(__PPC64__) \
-    )
-    return __builtin_ceil(value);
+#if defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || \
+    defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS
+    return (int)__builtin_ceil(value);
 #else
     int i = (int)value;
     return i + (i < value);
@@ -312,9 +301,7 @@ CV_INLINE int cvRound(float value)
 {
 #if defined CV_INLINE_ROUND_FLT
     CV_INLINE_ROUND_FLT(value);
-#elif ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \
-    && defined __SSE2__ && !defined __APPLE__) || CV_SSE2) \
-    && !defined(__CUDACC__)
+#elif (defined _MSC_VER && defined _M_X64) && !defined(__CUDACC__)
     __m128 t = _mm_set_ss( value );
     return _mm_cvtss_si32(t);
 #elif defined _MSC_VER && defined _M_IX86
@@ -325,12 +312,11 @@ CV_INLINE int cvRound(float value)
         fistp t;
     }
     return t;
-#elif defined CV_ICC || defined __GNUC__
-    return (int)(lrintf(value));
+#elif defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || \
+      defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS
+    return (int)__builtin_lrintf(value);
 #else
-    /* it's ok if round does not comply with IEEE754 standard;
-     the tests should allow +/-1 difference when the tested functions use round */
-    return (int)(value + (value >= 0 ? 0.5f : -0.5f));
+    return (int)lrintf(value);
 #endif
 }
 
@@ -343,11 +329,9 @@ CV_INLINE int cvRound( int value )
 /** @overload */
 CV_INLINE int cvFloor( float value )
 {
-#if (defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS) \
-    && ( \
-        defined(__PPC64__) \
-    )
-    return __builtin_floorf(value);
+#if defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || \
+    defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS
+    return (int)__builtin_floorf(value);
 #else
     int i = (int)value;
     return i - (i > value);
@@ -363,11 +347,9 @@ CV_INLINE int cvFloor( int value )
 /** @overload */
 CV_INLINE int cvCeil( float value )
 {
-#if (defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS) \
-    && ( \
-        defined(__PPC64__) \
-    )
-    return __builtin_ceilf(value);
+#if defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || \
+    defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS
+    return (int)__builtin_ceilf(value);
 #else
     int i = (int)value;
     return i + (i < value);

From 9faefa0c96960dee08e13d50ee1257797861058c Mon Sep 17 00:00:00 2001
From: lamm45 <96844552+lamm45@users.noreply.github.com>
Date: Fri, 24 Jun 2022 13:49:40 -0400
Subject: [PATCH 090/178] Fix minor errors in the first documentation page
 (Introduction)

This markdown file corresponds to the first link on the landing
page of OpenCV documentation.
---
 modules/core/doc/intro.markdown | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/modules/core/doc/intro.markdown b/modules/core/doc/intro.markdown
index 901c11d706..c78ccbfbb1 100644
--- a/modules/core/doc/intro.markdown
+++ b/modules/core/doc/intro.markdown
@@ -71,7 +71,7 @@ data sharing. A destructor decrements the reference counter associated with the
 The buffer is deallocated if and only if the reference counter reaches zero, that is, when no other
 structures refer to the same buffer. Similarly, when a Mat instance is copied, no actual data is
 really copied. Instead, the reference counter is incremented to memorize that there is another owner
-of the same data. There is also the Mat::clone method that creates a full copy of the matrix data.
+of the same data. There is also the cv::Mat::clone method that creates a full copy of the matrix data.
 See the example below:
 ```.cpp
     // create a big 8Mb matrix
@@ -159,11 +159,11 @@ grayscale conversion. Note that frame and edges are allocated only once during t
 of the loop body since all the next video frames have the same resolution. If you somehow change the
 video resolution, the arrays are automatically reallocated.
 
-The key component of this technology is the Mat::create method. It takes the desired array size and
-type. If the array already has the specified size and type, the method does nothing. Otherwise, it
-releases the previously allocated data, if any (this part involves decrementing the reference
+The key component of this technology is the cv::Mat::create method. It takes the desired array size
+and type. If the array already has the specified size and type, the method does nothing. Otherwise,
+it releases the previously allocated data, if any (this part involves decrementing the reference
 counter and comparing it with zero), and then allocates a new buffer of the required size. Most
-functions call the Mat::create method for each output array, and so the automatic output data
+functions call the cv::Mat::create method for each output array, and so the automatic output data
 allocation is implemented.
 
 Some notable exceptions from this scheme are cv::mixChannels, cv::RNG::fill, and a few other
@@ -247,9 +247,9 @@ Examples:
                                // matrix (10-element complex vector)
     Mat img(Size(1920, 1080), CV_8UC3); // make a 3-channel (color) image
                                         // of 1920 columns and 1080 rows.
-    Mat grayscale(image.size(), CV_MAKETYPE(image.depth(), 1)); // make a 1-channel image of
-                                                                // the same size and same
-                                                                // channel type as img
+    Mat grayscale(img.size(), CV_MAKETYPE(img.depth(), 1)); // make a 1-channel image of
+                                                            // the same size and same
+                                                            // channel type as img
 ```
 Arrays with more complex elements cannot be constructed or processed using OpenCV. Furthermore, each
 function or method can handle only a subset of all possible array types. Usually, the more complex
@@ -269,14 +269,14 @@ extended in future based on user requests.
 ### InputArray and OutputArray
 
 Many OpenCV functions process dense 2-dimensional or multi-dimensional numerical arrays. Usually,
-such functions take cppMat as parameters, but in some cases it's more convenient to use
+such functions take `cv::Mat` as parameters, but in some cases it's more convenient to use
 `std::vector<>` (for a point set, for example) or `cv::Matx<>` (for 3x3 homography matrix and such). To
 avoid many duplicates in the API, special "proxy" classes have been introduced. The base "proxy"
 class is cv::InputArray. It is used for passing read-only arrays on a function input. The derived from
 InputArray class cv::OutputArray is used to specify an output array for a function. Normally, you should
 not care of those intermediate types (and you should not declare variables of those types
 explicitly) - it will all just work automatically. You can assume that instead of
-InputArray/OutputArray you can always use `Mat`, `std::vector<>`, `cv::Matx<>`, `cv::Vec<>` or `cv::Scalar`. When a
+InputArray/OutputArray you can always use `cv::Mat`, `std::vector<>`, `cv::Matx<>`, `cv::Vec<>` or `cv::Scalar`. When a
 function has an optional input or output array, and you do not have or do not want one, pass
 cv::noArray().
 
@@ -288,7 +288,7 @@ the optimization algorithm did not converge), it returns a special error code (t
 boolean variable).
 
 The exceptions can be instances of the cv::Exception class or its derivatives. In its turn,
-cv::Exception is a derivative of std::exception. So it can be gracefully handled in the code using
+cv::Exception is a derivative of `std::exception`. So it can be gracefully handled in the code using
 other standard C++ library components.
 
 The exception is typically thrown either using the `#CV_Error(errcode, description)` macro, or its
@@ -297,7 +297,7 @@ printf-like `#CV_Error_(errcode, (printf-spec, printf-args))` variant, or using
 satisfied. For performance-critical code, there is #CV_DbgAssert(condition) that is only retained in
 the Debug configuration. Due to the automatic memory management, all the intermediate buffers are
 automatically deallocated in case of a sudden error. You only need to add a try statement to catch
-exceptions, if needed: :
+exceptions, if needed:
 ```.cpp
     try
     {

From 35f1a90df7e5a9b3b275a74868759efd787a8c70 Mon Sep 17 00:00:00 2001
From: Sean McBride <sean@rogue-research.com>
Date: Fri, 24 Jun 2022 23:48:22 -0400
Subject: [PATCH 091/178] Merge pull request #22149 from seanm:sprintf

Replaced sprintf with safer snprintf

* Straightforward replacement of sprintf with safer snprintf

* Trickier replacement of sprintf with safer snprintf

Some functions were changed to take another parameter: the size of the buffer, so that they can pass that size on to snprintf.
---
 apps/traincascade/boost.cpp                   |  2 +-
 apps/traincascade/cascadeclassifier.cpp       | 12 +--
 apps/traincascade/old_ml_inner_functions.cpp  |  2 +-
 apps/traincascade/old_ml_tree.cpp             |  6 +-
 modules/core/src/arithm.cpp                   |  4 +-
 modules/core/src/out.cpp                      | 20 ++---
 modules/core/src/persistence.cpp              | 26 +++---
 modules/core/src/persistence.hpp              |  4 +-
 modules/core/src/persistence_json.cpp         |  2 +-
 modules/core/src/persistence_xml.cpp          |  2 +-
 modules/core/src/persistence_yml.cpp          |  8 +-
 modules/core/src/system.cpp                   |  2 +-
 modules/core/test/test_io.cpp                 |  6 +-
 modules/core/test/test_mat.cpp                |  2 +-
 modules/imgproc/src/box_filter.dispatch.cpp   |  4 +-
 modules/imgproc/src/deriv.cpp                 |  4 +-
 modules/imgproc/src/filter.dispatch.cpp       |  2 +-
 modules/imgproc/src/smooth.dispatch.cpp       |  2 +-
 modules/imgproc/test/test_imgwarp_strict.cpp  |  2 +-
 modules/python/src2/pycompat.hpp              |  4 +-
 modules/ts/src/ts.cpp                         |  4 +-
 modules/ts/src/ts_func.cpp                    |  8 +-
 modules/video/test/test_optflowpyrlk.cpp      |  8 +-
 modules/videoio/src/cap_dshow.cpp             | 85 +++++++++----------
 modules/videoio/src/cap_gphoto2.cpp           |  4 +-
 modules/videoio/src/cap_ximea.cpp             |  2 +-
 modules/videoio/src/cap_xine.cpp              |  2 +-
 samples/cpp/calibration.cpp                   |  2 +-
 samples/cpp/select3dobj.cpp                   |  4 +-
 .../HighGUI/AddingImagesTrackbar.cpp          |  2 +-
 .../ImgTrans/HoughLines_Demo.cpp              |  2 +-
 samples/cpp/videocapture_starter.cpp          |  2 +-
 samples/directx/d3d9_interop.cpp              |  8 +-
 samples/directx/d3d9ex_interop.cpp            |  8 +-
 34 files changed, 126 insertions(+), 131 deletions(-)

diff --git a/apps/traincascade/boost.cpp b/apps/traincascade/boost.cpp
index ee0486ec90..8f0d394a0e 100644
--- a/apps/traincascade/boost.cpp
+++ b/apps/traincascade/boost.cpp
@@ -1677,7 +1677,7 @@ void CvCascadeBoost::write( FileStorage &fs, const Mat& featureMap ) const
     fs << CC_WEAK_CLASSIFIERS << "[";
     for( int wi = 0; wi < weak->total; wi++)
     {
-        /*sprintf( cmnt, "tree %i", wi );
+        /*snprintf( cmnt, sizeof(cmnt), "tree %i", wi );
         cvWriteComment( fs, cmnt, 0 );*/
         weakTree = *((CvCascadeBoostTree**) cvGetSeqElem( weak, wi ));
         weakTree->write( fs, featureMap );
diff --git a/apps/traincascade/cascadeclassifier.cpp b/apps/traincascade/cascadeclassifier.cpp
index 5a83746bc4..d43625d879 100644
--- a/apps/traincascade/cascadeclassifier.cpp
+++ b/apps/traincascade/cascadeclassifier.cpp
@@ -253,7 +253,7 @@ bool CvCascadeClassifier::train( const string _cascadeDirName,
         }
         // save current stage
         char buf[32];
-        sprintf(buf, "%s%d", "stage", i );
+        snprintf(buf, sizeof(buf), "%s%d", "stage", i );
         string stageFilename = dirName + buf + ".xml";
         FileStorage fs( stageFilename, FileStorage::WRITE );
         if ( !fs.isOpened() )
@@ -369,7 +369,7 @@ void CvCascadeClassifier::writeStages( FileStorage &fs, const Mat& featureMap )
     for( vector< Ptr<CvCascadeBoost> >::const_iterator it = stageClassifiers.begin();
         it != stageClassifiers.end();++it, ++i )
     {
-        sprintf( cmnt, "stage %d", i );
+        snprintf( cmnt, sizeof(cmnt), "stage %d", i );
         cvWriteComment( fs.fs, cmnt, 0 );
         fs << "{";
         (*it)->write( fs, featureMap );
@@ -460,7 +460,7 @@ void CvCascadeClassifier::save( const string filename, bool baseFormat )
         for( size_t si = 0; si < stageClassifiers.size(); si++ )
         {
             fs << "{"; //stage
-            /*sprintf( buf, "stage %d", si );
+            /*snprintf( buf, sizeof(buf), "stage %d", si );
             CV_CALL( cvWriteComment( fs, buf, 1 ) );*/
             weak = stageClassifiers[si]->get_weak_predictors();
             fs << ICV_HAAR_TREES_NAME << "[";
@@ -471,7 +471,7 @@ void CvCascadeClassifier::save( const string filename, bool baseFormat )
                 CvCascadeBoostTree* tree = *((CvCascadeBoostTree**) cvGetSeqElem( weak, wi ));
 
                 fs << "[";
-                /*sprintf( buf, "tree %d", wi );
+                /*snprintf( buf, sizeof(buf), "tree %d", wi );
                 CV_CALL( cvWriteComment( fs, buf, 1 ) );*/
 
                 const CvDTreeNode* tempNode;
@@ -535,10 +535,10 @@ bool CvCascadeClassifier::load( const string cascadeDirName )
     featureEvaluator->init( featureParams, numPos + numNeg, cascadeParams.winSize );
     fs.release();
 
-    char buf[16] = {0};
+    char buf[5+10+1] = {0};
     for ( int si = 0; si < numStages; si++ )
     {
-        sprintf( buf, "%s%d", "stage", si);
+        snprintf( buf, sizeof(buf), "%s%d", "stage", si);
         fs.open( cascadeDirName + buf + ".xml", FileStorage::READ );
         node = fs.getFirstTopLevelNode();
         if ( !fs.isOpened() )
diff --git a/apps/traincascade/old_ml_inner_functions.cpp b/apps/traincascade/old_ml_inner_functions.cpp
index 5858a4014a..3657c7f8b5 100644
--- a/apps/traincascade/old_ml_inner_functions.cpp
+++ b/apps/traincascade/old_ml_inner_functions.cpp
@@ -520,7 +520,7 @@ cvPreprocessCategoricalResponses( const CvMat* responses,
             if( ri != rf )
             {
                 char buf[100];
-                sprintf( buf, "response #%d is not integral", idx );
+                snprintf( buf, sizeof(buf), "response #%d is not integral", idx );
                 CV_ERROR( CV_StsBadArg, buf );
             }
             dst[i] = ri;
diff --git a/apps/traincascade/old_ml_tree.cpp b/apps/traincascade/old_ml_tree.cpp
index cfba9ec952..96ea93351d 100644
--- a/apps/traincascade/old_ml_tree.cpp
+++ b/apps/traincascade/old_ml_tree.cpp
@@ -442,7 +442,7 @@ void CvDTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
                         val = cvRound(t);
                         if( fabs(t - val) > FLT_EPSILON )
                         {
-                            sprintf( err, "%d-th value of %d-th (categorical) "
+                            snprintf( err, sizeof(err), "%d-th value of %d-th (categorical) "
                                 "variable is not an integer", i, vi );
                             CV_ERROR( CV_StsBadArg, err );
                         }
@@ -450,7 +450,7 @@ void CvDTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
 
                     if( val == INT_MAX )
                     {
-                        sprintf( err, "%d-th value of %d-th (categorical) "
+                        snprintf( err, sizeof(err), "%d-th value of %d-th (categorical) "
                             "variable is too large", i, vi );
                         CV_ERROR( CV_StsBadArg, err );
                     }
@@ -557,7 +557,7 @@ void CvDTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
 
                     if( fabs(val) >= ord_nan )
                     {
-                        sprintf( err, "%d-th value of %d-th (ordered) "
+                        snprintf( err, sizeof(err), "%d-th value of %d-th (ordered) "
                             "variable (=%g) is too large", i, vi, val );
                         CV_ERROR( CV_StsBadArg, err );
                     }
diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index 96bfb25290..604b13101d 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -92,7 +92,7 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
     const int dstType1 = CV_MAKETYPE(dstDepth, 1);
     const int scalarType = CV_MAKETYPE(srcdepth, scalarcn);
 
-    sprintf(opts, "-D %s%s -D %s%s -D dstT=%s -D DEPTH_dst=%d -D dstT_C1=%s -D workST=%s -D cn=%d -D rowsPerWI=%d",
+    snprintf(opts, sizeof(opts), "-D %s%s -D %s%s -D dstT=%s -D DEPTH_dst=%d -D dstT_C1=%s -D workST=%s -D cn=%d -D rowsPerWI=%d",
             haveMask ? "MASK_" : "", haveScalar ? "UNARY_OP" : "BINARY_OP", oclop2str[oclop],
             doubleSupport ? " -D DOUBLE_SUPPORT" : "",
             bitwise ? ocl::memopTypeToStr(dstType) : ocl::typeToStr(dstType),
@@ -490,7 +490,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
     int scalarcn = kercn == 3 ? 4 : kercn, rowsPerWI = d.isIntel() ? 4 : 1;
 
     char cvtstr[4][32], opts[1024];
-    sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s "
+    snprintf(opts, sizeof(opts), "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s "
             "-D dstT=%s -D DEPTH_dst=%d -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D wdepth=%d -D convertToWT1=%s "
             "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d -D rowsPerWI=%d -D convertFromU=%s",
             (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"),
diff --git a/modules/core/src/out.cpp b/modules/core/src/out.cpp
index aad8429e2d..8a7d7e1636 100644
--- a/modules/core/src/out.cpp
+++ b/modules/core/src/out.cpp
@@ -70,14 +70,14 @@ namespace cv
         char braces[5];
 
         void (FormattedImpl::*valueToStr)();
-        void valueToStr8u()  { sprintf(buf, "%3d", (int)mtx.ptr<uchar>(row, col)[cn]); }
-        void valueToStr8s()  { sprintf(buf, "%3d", (int)mtx.ptr<schar>(row, col)[cn]); }
-        void valueToStr16u() { sprintf(buf, "%d", (int)mtx.ptr<ushort>(row, col)[cn]); }
-        void valueToStr16s() { sprintf(buf, "%d", (int)mtx.ptr<short>(row, col)[cn]); }
-        void valueToStr32s() { sprintf(buf, "%d", mtx.ptr<int>(row, col)[cn]); }
-        void valueToStr32f() { sprintf(buf, floatFormat, mtx.ptr<float>(row, col)[cn]); }
-        void valueToStr64f() { sprintf(buf, floatFormat, mtx.ptr<double>(row, col)[cn]); }
-        void valueToStr16f() { sprintf(buf, floatFormat, (float)mtx.ptr<float16_t>(row, col)[cn]); }
+        void valueToStr8u()  { snprintf(buf, sizeof(buf), "%3d", (int)mtx.ptr<uchar>(row, col)[cn]); }
+        void valueToStr8s()  { snprintf(buf, sizeof(buf), "%3d", (int)mtx.ptr<schar>(row, col)[cn]); }
+        void valueToStr16u() { snprintf(buf, sizeof(buf), "%d", (int)mtx.ptr<ushort>(row, col)[cn]); }
+        void valueToStr16s() { snprintf(buf, sizeof(buf), "%d", (int)mtx.ptr<short>(row, col)[cn]); }
+        void valueToStr32s() { snprintf(buf, sizeof(buf), "%d", mtx.ptr<int>(row, col)[cn]); }
+        void valueToStr32f() { snprintf(buf, sizeof(buf), floatFormat, mtx.ptr<float>(row, col)[cn]); }
+        void valueToStr64f() { snprintf(buf, sizeof(buf), floatFormat, mtx.ptr<double>(row, col)[cn]); }
+        void valueToStr16f() { snprintf(buf, sizeof(buf), floatFormat, (float)mtx.ptr<float16_t>(row, col)[cn]); }
         void valueToStrOther() { buf[0] = 0; }
 
     public:
@@ -151,10 +151,10 @@ namespace cv
                         }
                         else
                             row = 0;
-                        sprintf(buf, "\n(:, :, %d) = \n", cn+1);
+                        snprintf(buf, sizeof(buf), "\n(:, :, %d) = \n", cn+1);
                         return buf;
                     }
-                    sprintf(buf, "(:, :, %d) = \n", cn+1);
+                    snprintf(buf, sizeof(buf), "(:, :, %d) = \n", cn+1);
                     return buf;
                 case STATE_EPILOGUE:
                     state = STATE_FINISHED;
diff --git a/modules/core/src/persistence.cpp b/modules/core/src/persistence.cpp
index 09435b5850..59f843d5bc 100644
--- a/modules/core/src/persistence.cpp
+++ b/modules/core/src/persistence.cpp
@@ -56,7 +56,7 @@ char* itoa( int _val, char* buffer, int /*radix*/ )
     return ptr;
 }
 
-char* doubleToString( char* buf, double value, bool explicitZero )
+char* doubleToString( char* buf, size_t bufSize, double value, bool explicitZero )
 {
     Cv64suf val;
     unsigned ieee754_hi;
@@ -70,15 +70,15 @@ char* doubleToString( char* buf, double value, bool explicitZero )
         if( ivalue == value )
         {
             if( explicitZero )
-                sprintf( buf, "%d.0", ivalue );
+                snprintf( buf, bufSize, "%d.0", ivalue );
             else
-                sprintf( buf, "%d.", ivalue );
+                snprintf( buf, bufSize, "%d.", ivalue );
         }
         else
         {
             static const char* fmt = "%.16e";
             char* ptr = buf;
-            sprintf( buf, fmt, value );
+            snprintf( buf, bufSize, fmt, value );
             if( *ptr == '+' || *ptr == '-' )
                 ptr++;
             for( ; cv_isdigit(*ptr); ptr++ )
@@ -99,7 +99,7 @@ char* doubleToString( char* buf, double value, bool explicitZero )
     return buf;
 }
 
-char* floatToString( char* buf, float value, bool halfprecision, bool explicitZero )
+char* floatToString( char* buf, size_t bufSize, float value, bool halfprecision, bool explicitZero )
 {
     Cv32suf val;
     unsigned ieee754;
@@ -112,17 +112,17 @@ char* floatToString( char* buf, float value, bool halfprecision, bool explicitZe
         if( ivalue == value )
         {
             if( explicitZero )
-                sprintf( buf, "%d.0", ivalue );
+                snprintf( buf, bufSize, "%d.0", ivalue );
             else
-                sprintf( buf, "%d.", ivalue );
+                snprintf( buf, bufSize, "%d.", ivalue );
         }
         else
         {
             char* ptr = buf;
             if (halfprecision)
-                sprintf(buf, "%.4e", value);
+                snprintf(buf, bufSize, "%.4e", value);
             else
-                sprintf(buf, "%.8e", value);
+                snprintf(buf, bufSize, "%.8e", value);
             if( *ptr == '+' || *ptr == '-' )
                 ptr++;
             for( ; cv_isdigit(*ptr); ptr++ )
@@ -585,7 +585,7 @@ bool FileStorage::Impl::open(const char *filename_or_buf, int _flags, const char
 
                     CV_Assert(strlen(encoding) < 1000);
                     char buf[1100];
-                    sprintf(buf, "<?xml version=\"1.0\" encoding=\"%s\"?>\n", encoding);
+                    snprintf(buf, sizeof(buf), "<?xml version=\"1.0\" encoding=\"%s\"?>\n", encoding);
                     puts(buf);
                 } else
                     puts("<?xml version=\"1.0\"?>\n");
@@ -1107,15 +1107,15 @@ void FileStorage::Impl::writeRawData(const std::string &dt, const void *_data, s
                         data += sizeof(int);
                         break;
                     case CV_32F:
-                        ptr = fs::floatToString(buf, *(float *) data, false, explicitZero);
+                        ptr = fs::floatToString(buf, sizeof(buf), *(float *) data, false, explicitZero);
                         data += sizeof(float);
                         break;
                     case CV_64F:
-                        ptr = fs::doubleToString(buf, *(double *) data, explicitZero);
+                        ptr = fs::doubleToString(buf, sizeof(buf), *(double *) data, explicitZero);
                         data += sizeof(double);
                         break;
                     case CV_16F: /* reference */
-                        ptr = fs::floatToString(buf, (float) *(float16_t *) data, true, explicitZero);
+                        ptr = fs::floatToString(buf, sizeof(buf), (float) *(float16_t *) data, true, explicitZero);
                         data += sizeof(float16_t);
                         break;
                     default:
diff --git a/modules/core/src/persistence.hpp b/modules/core/src/persistence.hpp
index 1a9dbecf7c..fb7f5acf45 100644
--- a/modules/core/src/persistence.hpp
+++ b/modules/core/src/persistence.hpp
@@ -86,8 +86,8 @@ namespace fs
 {
 int strcasecmp(const char* str1, const char* str2);
 char* itoa( int _val, char* buffer, int /*radix*/ );
-char* floatToString( char* buf, float value, bool halfprecision, bool explicitZero );
-char* doubleToString( char* buf, double value, bool explicitZero );
+char* floatToString( char* buf, size_t bufSize, float value, bool halfprecision, bool explicitZero );
+char* doubleToString( char* buf, size_t bufSize, double value, bool explicitZero );
 
 int calcStructSize( const char* dt, int initial_size );
 int calcElemSize( const char* dt, int initial_size );
diff --git a/modules/core/src/persistence_json.cpp b/modules/core/src/persistence_json.cpp
index 12a58e80bf..e201215401 100644
--- a/modules/core/src/persistence_json.cpp
+++ b/modules/core/src/persistence_json.cpp
@@ -84,7 +84,7 @@ public:
     void write( const char* key, double value )
     {
         char buf[128];
-        writeScalar( key, fs::doubleToString( buf, value, true ));
+        writeScalar( key, fs::doubleToString( buf, sizeof(buf), value, true ));
     }
 
     void write(const char* key, const char* str, bool quote)
diff --git a/modules/core/src/persistence_xml.cpp b/modules/core/src/persistence_xml.cpp
index f664476fbc..2264d16baa 100644
--- a/modules/core/src/persistence_xml.cpp
+++ b/modules/core/src/persistence_xml.cpp
@@ -148,7 +148,7 @@ public:
     void write( const char* key, double value )
     {
         char buf[128];
-        writeScalar( key, fs::doubleToString( buf, value, false ) );
+        writeScalar( key, fs::doubleToString( buf, sizeof(buf), value, false ) );
     }
 
     void write(const char* key, const char* str, bool quote)
diff --git a/modules/core/src/persistence_yml.cpp b/modules/core/src/persistence_yml.cpp
index 5219ee076b..c8c7c7a892 100644
--- a/modules/core/src/persistence_yml.cpp
+++ b/modules/core/src/persistence_yml.cpp
@@ -40,7 +40,7 @@ public:
         {
             /* reset struct flag. in order not to print ']' */
             struct_flags = FileNode::SEQ;
-            sprintf(buf, "!!binary |");
+            snprintf(buf, sizeof(buf), "!!binary |");
             data = buf;
         }
         else if( FileNode::isFlow(struct_flags))
@@ -49,7 +49,7 @@ public:
             struct_flags |= FileNode::FLOW;
 
             if( type_name )
-                sprintf( buf, "!!%s %c", type_name, c );
+                snprintf( buf, sizeof(buf), "!!%s %c", type_name, c );
             else
             {
                 buf[0] = c;
@@ -59,7 +59,7 @@ public:
         }
         else if( type_name )
         {
-            sprintf( buf, "!!%s", type_name );
+            snprintf( buf, sizeof(buf), "!!%s", type_name );
             data = buf;
         }
 
@@ -110,7 +110,7 @@ public:
     void write( const char* key, double value )
     {
         char buf[128];
-        writeScalar( key, fs::doubleToString( buf, value, false ));
+        writeScalar( key, fs::doubleToString( buf, sizeof(buf), value, false ));
     }
 
     void write(const char* key, const char* str, bool quote)
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index 4c79c357b5..cac5cd11f1 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -1385,7 +1385,7 @@ CV_IMPL const char* cvErrorStr( int status )
     case CV_OpenGlApiCallError :     return "OpenGL API call";
     };
 
-    sprintf(buf, "Unknown %s code %d", status >= 0 ? "status":"error", status);
+    snprintf(buf, sizeof(buf), "Unknown %s code %d", status >= 0 ? "status":"error", status);
     return buf;
 }
 
diff --git a/modules/core/test/test_io.cpp b/modules/core/test/test_io.cpp
index 04426e58f7..5e1f6d7a8e 100644
--- a/modules/core/test/test_io.cpp
+++ b/modules/core/test/test_io.cpp
@@ -487,7 +487,7 @@ TEST(Core_InputOutput, FileStorage)
     cv::FileStorage f(file, cv::FileStorage::WRITE);
 
     char arr[66];
-    sprintf(arr, "sprintf is hell %d", 666);
+    snprintf(arr, sizeof(arr), "snprintf is hell %d", 666);
     EXPECT_NO_THROW(f << arr);
 }
 
@@ -1765,8 +1765,8 @@ TEST(Core_InputOutput, FileStorage_JSON_VeryLongLines)
         std::string val;
         for(int i = 0; i < 52500; i += 100)
         {
-            sprintf(key, "KEY%d", i);
-            sprintf(val0, "VALUE%d", i);
+            snprintf(key, sizeof(key), "KEY%d", i);
+            snprintf(val0, sizeof(val0), "VALUE%d", i);
             fs[key] >> val;
             ASSERT_EQ(val, val0);
         }
diff --git a/modules/core/test/test_mat.cpp b/modules/core/test/test_mat.cpp
index 61ba306ba6..68de691260 100644
--- a/modules/core/test/test_mat.cpp
+++ b/modules/core/test/test_mat.cpp
@@ -161,7 +161,7 @@ int Core_ReduceTest::checkOp( const Mat& src, int dstType, int opType, const Mat
         getMatTypeStr( dstType, dstTypeStr );
         const char* dimStr = dim == 0 ? "ROWS" : "COLS";
 
-        sprintf( msg, "bad accuracy with srcType = %s, dstType = %s, opType = %s, dim = %s",
+        snprintf( msg, sizeof(msg), "bad accuracy with srcType = %s, dstType = %s, opType = %s, dim = %s",
                 srcTypeStr.c_str(), dstTypeStr.c_str(), opTypeStr, dimStr );
         ts->printf( cvtest::TS::LOG, msg );
         return cvtest::TS::FAIL_BAD_ACCURACY;
diff --git a/modules/imgproc/src/box_filter.dispatch.cpp b/modules/imgproc/src/box_filter.dispatch.cpp
index c9ec693385..698f2ae2fb 100644
--- a/modules/imgproc/src/box_filter.dispatch.cpp
+++ b/modules/imgproc/src/box_filter.dispatch.cpp
@@ -89,7 +89,7 @@ static bool ocl_boxFilter3x3_8UC1( InputArray _src, OutputArray _dst, int ddepth
     globalsize[1] = size.height / 2;
 
     char build_opts[1024];
-    sprintf(build_opts, "-D %s %s", borderMap[borderType], normalize ? "-D NORMALIZE" : "");
+    snprintf(build_opts, sizeof(build_opts), "-D %s %s", borderMap[borderType], normalize ? "-D NORMALIZE" : "");
 
     ocl::Kernel kernel("boxFilter3x3_8UC1_cols16_rows2", cv::ocl::imgproc::boxFilter3x3_oclsrc, build_opts);
     if (kernel.empty())
@@ -196,7 +196,7 @@ static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth,
         globalsize[0] = roundUp(globalsize[0], wgRound);
 
         char build_options[1024], cvt[2][40];
-        sprintf(build_options, "-D cn=%d "
+        snprintf(build_options, sizeof(build_options), "-D cn=%d "
                 "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
                 "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d "
                 "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s "
diff --git a/modules/imgproc/src/deriv.cpp b/modules/imgproc/src/deriv.cpp
index 2d8b59926d..fb7e7786f5 100644
--- a/modules/imgproc/src/deriv.cpp
+++ b/modules/imgproc/src/deriv.cpp
@@ -384,7 +384,7 @@ static bool ocl_sepFilter3x3_8UC1(InputArray _src, OutputArray _dst, int ddepth,
 
     const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" };
     char build_opts[1024];
-    sprintf(build_opts, "-D %s %s%s", borderMap[borderType],
+    snprintf(build_opts, sizeof(build_opts), "-D %s %s%s", borderMap[borderType],
             ocl::kernelToStr(kernelX, CV_32F, "KERNEL_MATRIX_X").c_str(),
             ocl::kernelToStr(kernelY, CV_32F, "KERNEL_MATRIX_Y").c_str());
 
@@ -684,7 +684,7 @@ static bool ocl_Laplacian3_8UC1(InputArray _src, OutputArray _dst, int ddepth,
 
     const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" };
     char build_opts[1024];
-    sprintf(build_opts, "-D %s %s", borderMap[borderType],
+    snprintf(build_opts, sizeof(build_opts), "-D %s %s", borderMap[borderType],
             ocl::kernelToStr(kernel, CV_32F, "KERNEL_MATRIX").c_str());
 
     ocl::Kernel k("laplacian3_8UC1_cols16_rows2", cv::ocl::imgproc::laplacian3_oclsrc, build_opts);
diff --git a/modules/imgproc/src/filter.dispatch.cpp b/modules/imgproc/src/filter.dispatch.cpp
index c9d1bb457c..097dd32055 100644
--- a/modules/imgproc/src/filter.dispatch.cpp
+++ b/modules/imgproc/src/filter.dispatch.cpp
@@ -637,7 +637,7 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
         globalsize[0] = ROUNDUP(globalsize[0], wgRound);
 
         char build_options[1024];
-        sprintf(build_options, "-D cn=%d "
+        snprintf(build_options, sizeof(build_options), "-D cn=%d "
                 "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
                 "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d "
                 "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s "
diff --git a/modules/imgproc/src/smooth.dispatch.cpp b/modules/imgproc/src/smooth.dispatch.cpp
index b001a37f89..f593f7bc2d 100644
--- a/modules/imgproc/src/smooth.dispatch.cpp
+++ b/modules/imgproc/src/smooth.dispatch.cpp
@@ -354,7 +354,7 @@ static bool ocl_GaussianBlur_8UC1(InputArray _src, OutputArray _dst, Size ksize,
 
     const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" };
     char build_opts[1024];
-    sprintf(build_opts, "-D %s %s%s", borderMap[borderType & ~BORDER_ISOLATED],
+    snprintf(build_opts, sizeof(build_opts), "-D %s %s%s", borderMap[borderType & ~BORDER_ISOLATED],
             ocl::kernelToStr(kernelX, CV_32F, "KERNEL_MATRIX_X").c_str(),
             ocl::kernelToStr(kernelY, CV_32F, "KERNEL_MATRIX_Y").c_str());
 
diff --git a/modules/imgproc/test/test_imgwarp_strict.cpp b/modules/imgproc/test/test_imgwarp_strict.cpp
index 5ca3d09ec4..eea5e447e9 100644
--- a/modules/imgproc/test/test_imgwarp_strict.cpp
+++ b/modules/imgproc/test/test_imgwarp_strict.cpp
@@ -48,7 +48,7 @@ void __wrap_printf_func(const char* fmt, ...)
     va_list args;
     va_start(args, fmt);
     char buffer[256];
-    vsprintf (buffer, fmt, args);
+    vsnprintf (buffer, sizeof(buffer), fmt, args);
     cvtest::TS::ptr()->printf(cvtest::TS::SUMMARY, buffer);
     va_end(args);
 }
diff --git a/modules/python/src2/pycompat.hpp b/modules/python/src2/pycompat.hpp
index 18336d4295..ceaee06f49 100644
--- a/modules/python/src2/pycompat.hpp
+++ b/modules/python/src2/pycompat.hpp
@@ -234,7 +234,7 @@ PyObject* pyopencv_from(const TYPE& src)
     static PyObject* pyopencv_##CLASS_ID##_repr(PyObject* self) \
     { \
         char str[1000]; \
-        sprintf(str, "< " MODULESTR SCOPE"."#EXPORT_NAME" %p>", self); \
+        snprintf(str, sizeof(str), "< " MODULESTR SCOPE"."#EXPORT_NAME" %p>", self); \
         return PyString_FromString(str); \
     }
 
@@ -294,7 +294,7 @@ PyObject* pyopencv_from(const TYPE& src)
     static PyObject* pyopencv_##CLASS_ID##_repr(PyObject* self) \
     { \
         char str[1000]; \
-        sprintf(str, "< " MODULESTR SCOPE"."#EXPORT_NAME" %p>", self); \
+        snprintf(str, sizeof(str), "< " MODULESTR SCOPE"."#EXPORT_NAME" %p>", self); \
         return PyString_FromString(str); \
     } \
     static PyType_Slot pyopencv_##CLASS_ID##_Slots[] =  \
diff --git a/modules/ts/src/ts.cpp b/modules/ts/src/ts.cpp
index e629c307fb..3539f17646 100644
--- a/modules/ts/src/ts.cpp
+++ b/modules/ts/src/ts.cpp
@@ -312,7 +312,7 @@ void BaseTest::safe_run( int start_from )
             char buf[1 << 16];
 
             const char* delim = exc.err.find('\n') == cv::String::npos ? "" : "\n";
-            sprintf( buf, "OpenCV Error:\n\t%s (%s%s) in %s, file %s, line %d",
+            snprintf( buf, sizeof(buf), "OpenCV Error:\n\t%s (%s%s) in %s, file %s, line %d",
                     errorStr, delim, exc.err.c_str(), exc.func.size() > 0 ?
                     exc.func.c_str() : "unknown function", exc.file.c_str(), exc.line );
             ts->printf(TS::LOG, "%s\n", buf);
@@ -603,7 +603,7 @@ void TS::set_gtest_status()
         return SUCCEED();
 
     char seedstr[32];
-    sprintf(seedstr, "%08x%08x", (unsigned)(current_test_info.rng_seed>>32),
+    snprintf(seedstr, sizeof(seedstr), "%08x%08x", (unsigned)(current_test_info.rng_seed>>32),
                                 (unsigned)(current_test_info.rng_seed));
 
     string logs = "";
diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp
index 525150894e..d64f52927b 100644
--- a/modules/ts/src/ts_func.cpp
+++ b/modules/ts/src/ts_func.cpp
@@ -29,7 +29,7 @@ string vec2str( const string& sep, const int* v, size_t nelems )
     string result = "";
     for( size_t i = 0; i < nelems; i++ )
     {
-        sprintf(buf, "%d", v[i]);
+        snprintf(buf, sizeof(buf), "%d", v[i]);
         result += string(buf);
         if( i < nelems - 1 )
             result += sep;
@@ -2154,15 +2154,15 @@ int cmpEps2( TS* ts, const Mat& a, const Mat& b, double success_err_level,
     switch( code )
     {
     case CMP_EPS_BIG_DIFF:
-        sprintf( msg, "%s: Too big difference (=%g > %g)", desc, diff, success_err_level );
+        snprintf( msg, sizeof(msg), "%s: Too big difference (=%g > %g)", desc, diff, success_err_level );
         code = TS::FAIL_BAD_ACCURACY;
         break;
     case CMP_EPS_INVALID_TEST_DATA:
-        sprintf( msg, "%s: Invalid output", desc );
+        snprintf( msg, sizeof(msg), "%s: Invalid output", desc );
         code = TS::FAIL_INVALID_OUTPUT;
         break;
     case CMP_EPS_INVALID_REF_DATA:
-        sprintf( msg, "%s: Invalid reference output", desc );
+        snprintf( msg, sizeof(msg), "%s: Invalid reference output", desc );
         code = TS::FAIL_INVALID_OUTPUT;
         break;
     default:
diff --git a/modules/video/test/test_optflowpyrlk.cpp b/modules/video/test/test_optflowpyrlk.cpp
index a79a0ff4e4..6e0ee3756e 100644
--- a/modules/video/test/test_optflowpyrlk.cpp
+++ b/modules/video/test/test_optflowpyrlk.cpp
@@ -78,7 +78,7 @@ void CV_OptFlowPyrLKTest::run( int )
 
     for(;;)
     {
-    sprintf( filename, "%soptflow/%s", ts->get_data_path().c_str(), "lk_prev.dat" );
+    snprintf( filename, sizeof(filename), "%soptflow/%s", ts->get_data_path().c_str(), "lk_prev.dat" );
 
     {
         FileStorage fs(filename, FileStorage::READ);
@@ -91,7 +91,7 @@ void CV_OptFlowPyrLKTest::run( int )
         }
     }
 
-    sprintf( filename, "%soptflow/%s", ts->get_data_path().c_str(), "lk_next.dat" );
+    snprintf( filename, sizeof(filename), "%soptflow/%s", ts->get_data_path().c_str(), "lk_next.dat" );
 
     {
         FileStorage fs(filename, FileStorage::READ);
@@ -115,7 +115,7 @@ void CV_OptFlowPyrLKTest::run( int )
     }
 
     /* read first image */
-    sprintf( filename, "%soptflow/%s", ts->get_data_path().c_str(), "rock_1.bmp" );
+    snprintf( filename, sizeof(filename), "%soptflow/%s", ts->get_data_path().c_str(), "rock_1.bmp" );
     imgI = cv::imread( filename, cv::IMREAD_UNCHANGED );
 
     if( imgI.empty() )
@@ -126,7 +126,7 @@ void CV_OptFlowPyrLKTest::run( int )
     }
 
     /* read second image */
-    sprintf( filename, "%soptflow/%s", ts->get_data_path().c_str(), "rock_2.bmp" );
+    snprintf( filename, sizeof(filename), "%soptflow/%s", ts->get_data_path().c_str(), "rock_2.bmp" );
     imgJ = cv::imread( filename, cv::IMREAD_UNCHANGED );
 
     if( imgJ.empty() )
diff --git a/modules/videoio/src/cap_dshow.cpp b/modules/videoio/src/cap_dshow.cpp
index 2f2ff51413..e4de20bcac 100644
--- a/modules/videoio/src/cap_dshow.cpp
+++ b/modules/videoio/src/cap_dshow.cpp
@@ -88,11 +88,6 @@ Thanks to:
 */
 /////////////////////////////////////////////////////////
 
-#if defined _MSC_VER && _MSC_VER >= 100
-//'sprintf': name was marked as #pragma deprecated
-#pragma warning(disable: 4995)
-#endif
-
 #if defined(__clang__)  // clang or MSVC clang
 #pragma clang diagnostic ignored "-Wnon-virtual-dtor"
 #elif defined(__GNUC__) // MinGW
@@ -2304,29 +2299,29 @@ void videoInput::processPixels(unsigned char * src, unsigned char * dst, int wid
 void videoInput::getMediaSubtypeAsString(GUID type, char * typeAsString){
 
     char tmpStr[8];
-    if( type == MEDIASUBTYPE_RGB24)     sprintf(tmpStr, "RGB24");
-    else if(type == MEDIASUBTYPE_RGB32) sprintf(tmpStr, "RGB32");
-    else if(type == MEDIASUBTYPE_RGB555)sprintf(tmpStr, "RGB555");
-    else if(type == MEDIASUBTYPE_RGB565)sprintf(tmpStr, "RGB565");
-    else if(type == MEDIASUBTYPE_YUY2)  sprintf(tmpStr, "YUY2");
-    else if(type == MEDIASUBTYPE_YVYU)  sprintf(tmpStr, "YVYU");
-    else if(type == MEDIASUBTYPE_YUYV)  sprintf(tmpStr, "YUYV");
-    else if(type == MEDIASUBTYPE_IYUV)  sprintf(tmpStr, "IYUV");
-    else if(type == MEDIASUBTYPE_UYVY)  sprintf(tmpStr, "UYVY");
-    else if(type == MEDIASUBTYPE_YV12)  sprintf(tmpStr, "YV12");
-    else if(type == MEDIASUBTYPE_YVU9)  sprintf(tmpStr, "YVU9");
-    else if(type == MEDIASUBTYPE_Y411)  sprintf(tmpStr, "Y411");
-    else if(type == MEDIASUBTYPE_Y41P)  sprintf(tmpStr, "Y41P");
-    else if(type == MEDIASUBTYPE_Y211)  sprintf(tmpStr, "Y211");
-    else if(type == MEDIASUBTYPE_AYUV)  sprintf(tmpStr, "AYUV");
-    else if(type == MEDIASUBTYPE_MJPG)  sprintf(tmpStr, "MJPG");
-    else if(type == MEDIASUBTYPE_Y800)  sprintf(tmpStr, "Y800");
-    else if(type == MEDIASUBTYPE_Y8)    sprintf(tmpStr, "Y8");
-    else if(type == MEDIASUBTYPE_GREY)  sprintf(tmpStr, "GREY");
-    else if(type == MEDIASUBTYPE_I420)  sprintf(tmpStr, "I420");
-    else if (type == MEDIASUBTYPE_BY8)  sprintf(tmpStr, "BY8");
-    else if (type == MEDIASUBTYPE_Y16)  sprintf(tmpStr, "Y16");
-    else sprintf(tmpStr, "OTHER");
+    if( type == MEDIASUBTYPE_RGB24)     snprintf(tmpStr, sizeof(tmpStr), "RGB24");
+    else if(type == MEDIASUBTYPE_RGB32) snprintf(tmpStr, sizeof(tmpStr), "RGB32");
+    else if(type == MEDIASUBTYPE_RGB555)snprintf(tmpStr, sizeof(tmpStr), "RGB555");
+    else if(type == MEDIASUBTYPE_RGB565)snprintf(tmpStr, sizeof(tmpStr), "RGB565");
+    else if(type == MEDIASUBTYPE_YUY2)  snprintf(tmpStr, sizeof(tmpStr), "YUY2");
+    else if(type == MEDIASUBTYPE_YVYU)  snprintf(tmpStr, sizeof(tmpStr), "YVYU");
+    else if(type == MEDIASUBTYPE_YUYV)  snprintf(tmpStr, sizeof(tmpStr), "YUYV");
+    else if(type == MEDIASUBTYPE_IYUV)  snprintf(tmpStr, sizeof(tmpStr), "IYUV");
+    else if(type == MEDIASUBTYPE_UYVY)  snprintf(tmpStr, sizeof(tmpStr), "UYVY");
+    else if(type == MEDIASUBTYPE_YV12)  snprintf(tmpStr, sizeof(tmpStr), "YV12");
+    else if(type == MEDIASUBTYPE_YVU9)  snprintf(tmpStr, sizeof(tmpStr), "YVU9");
+    else if(type == MEDIASUBTYPE_Y411)  snprintf(tmpStr, sizeof(tmpStr), "Y411");
+    else if(type == MEDIASUBTYPE_Y41P)  snprintf(tmpStr, sizeof(tmpStr), "Y41P");
+    else if(type == MEDIASUBTYPE_Y211)  snprintf(tmpStr, sizeof(tmpStr), "Y211");
+    else if(type == MEDIASUBTYPE_AYUV)  snprintf(tmpStr, sizeof(tmpStr), "AYUV");
+    else if(type == MEDIASUBTYPE_MJPG)  snprintf(tmpStr, sizeof(tmpStr), "MJPG");
+    else if(type == MEDIASUBTYPE_Y800)  snprintf(tmpStr, sizeof(tmpStr), "Y800");
+    else if(type == MEDIASUBTYPE_Y8)    snprintf(tmpStr, sizeof(tmpStr), "Y8");
+    else if(type == MEDIASUBTYPE_GREY)  snprintf(tmpStr, sizeof(tmpStr), "GREY");
+    else if(type == MEDIASUBTYPE_I420)  snprintf(tmpStr, sizeof(tmpStr), "I420");
+    else if (type == MEDIASUBTYPE_BY8)  snprintf(tmpStr, sizeof(tmpStr), "BY8");
+    else if (type == MEDIASUBTYPE_Y16)  snprintf(tmpStr, sizeof(tmpStr), "Y16");
+    else snprintf(tmpStr, sizeof(tmpStr), "OTHER");
 
     memcpy(typeAsString, tmpStr, sizeof(char)*8);
 }
@@ -2352,15 +2347,15 @@ void videoInput::getVideoPropertyAsString(int prop, char * propertyAsString){
 
     char tmpStr[16];
 
-    if ( prop==VideoProcAmp_Brightness) sprintf(tmpStr, "Brightness");
-    else if ( prop==VideoProcAmp_Contrast) sprintf(tmpStr, "Contrast");
-    else if ( prop==VideoProcAmp_Saturation) sprintf(tmpStr, "Saturation");
-    else if ( prop==VideoProcAmp_Hue) sprintf(tmpStr, "Hue");
-    else if ( prop==VideoProcAmp_Gain) sprintf(tmpStr, "Gain");
-    else if ( prop==VideoProcAmp_Gamma) sprintf(tmpStr, "Gamma");
-    else if ( prop==VideoProcAmp_ColorEnable) sprintf(tmpStr, "ColorEnable");
-    else if ( prop==VideoProcAmp_Sharpness) sprintf(tmpStr, "Sharpness");
-    else sprintf(tmpStr, "%u",prop);
+    if ( prop==VideoProcAmp_Brightness) snprintf(tmpStr, sizeof(tmpStr), "Brightness");
+    else if ( prop==VideoProcAmp_Contrast) snprintf(tmpStr, sizeof(tmpStr), "Contrast");
+    else if ( prop==VideoProcAmp_Saturation) snprintf(tmpStr, sizeof(tmpStr), "Saturation");
+    else if ( prop==VideoProcAmp_Hue) snprintf(tmpStr, sizeof(tmpStr), "Hue");
+    else if ( prop==VideoProcAmp_Gain) snprintf(tmpStr, sizeof(tmpStr), "Gain");
+    else if ( prop==VideoProcAmp_Gamma) snprintf(tmpStr, sizeof(tmpStr), "Gamma");
+    else if ( prop==VideoProcAmp_ColorEnable) snprintf(tmpStr, sizeof(tmpStr), "ColorEnable");
+    else if ( prop==VideoProcAmp_Sharpness) snprintf(tmpStr, sizeof(tmpStr), "Sharpness");
+    else snprintf(tmpStr, sizeof(tmpStr), "%u",prop);
 
     memcpy(propertyAsString, tmpStr, sizeof(char)*16);
 }
@@ -2455,14 +2450,14 @@ void videoInput::getCameraPropertyAsString(int prop, char * propertyAsString){
 
     char tmpStr[16];
 
-    if ( prop==CameraControl_Pan) sprintf(tmpStr, "Pan");
-    else if ( prop==CameraControl_Tilt) sprintf(tmpStr, "Tilt");
-    else if ( prop==CameraControl_Roll) sprintf(tmpStr, "Roll");
-    else if ( prop==CameraControl_Zoom) sprintf(tmpStr, "Zoom");
-    else if ( prop==CameraControl_Exposure) sprintf(tmpStr, "Exposure");
-    else if ( prop==CameraControl_Iris) sprintf(tmpStr, "Iris");
-    else if ( prop==CameraControl_Focus) sprintf(tmpStr, "Focus");
-    else sprintf(tmpStr, "%u",prop);
+    if ( prop==CameraControl_Pan) snprintf(tmpStr, sizeof(tmpStr), "Pan");
+    else if ( prop==CameraControl_Tilt) snprintf(tmpStr, sizeof(tmpStr), "Tilt");
+    else if ( prop==CameraControl_Roll) snprintf(tmpStr, sizeof(tmpStr), "Roll");
+    else if ( prop==CameraControl_Zoom) snprintf(tmpStr, sizeof(tmpStr), "Zoom");
+    else if ( prop==CameraControl_Exposure) snprintf(tmpStr, sizeof(tmpStr), "Exposure");
+    else if ( prop==CameraControl_Iris) snprintf(tmpStr, sizeof(tmpStr), "Iris");
+    else if ( prop==CameraControl_Focus) snprintf(tmpStr, sizeof(tmpStr), "Focus");
+    else snprintf(tmpStr, sizeof(tmpStr), "%u",prop);
 
     memcpy(propertyAsString, tmpStr, sizeof(char)*16);
 }
diff --git a/modules/videoio/src/cap_gphoto2.cpp b/modules/videoio/src/cap_gphoto2.cpp
index 40fcdc4403..8b3898b2ad 100644
--- a/modules/videoio/src/cap_gphoto2.cpp
+++ b/modules/videoio/src/cap_gphoto2.cpp
@@ -667,7 +667,7 @@ double DigitalCameraCapture::getProperty(int propertyId) const
     catch (const GPhoto2Exception & e)
     {
         char buf[128] = "";
-        sprintf(buf, "cannot get property: %d", propertyId);
+        snprintf(buf, sizeof(buf), "cannot get property: %d", propertyId);
         message(WARNING, (const char *) buf, e);
         return 0;
     }
@@ -810,7 +810,7 @@ bool DigitalCameraCapture::setProperty(int propertyId, double value)
     catch (const GPhoto2Exception & e)
     {
         char buf[128] = "";
-        sprintf(buf, "cannot set property: %d to %f", propertyId, value);
+        snprintf(buf, sizeof(buf), "cannot set property: %d to %f", propertyId, value);
         message(WARNING, (const char *) buf, e);
         return false;
     }
diff --git a/modules/videoio/src/cap_ximea.cpp b/modules/videoio/src/cap_ximea.cpp
index 7d572a2372..4573953939 100644
--- a/modules/videoio/src/cap_ximea.cpp
+++ b/modules/videoio/src/cap_ximea.cpp
@@ -1784,7 +1784,7 @@ void CvCaptureCAM_XIMEA::errMsg(const char* msg, int errNum) const
 
     #if defined _WIN32
     char buf[512]="";
-    sprintf( buf, "%s : %d, %s\n", msg, errNum, error_message.c_str());
+    snprintf( buf, sizeof(buf), "%s : %d, %s\n", msg, errNum, error_message.c_str());
     OutputDebugString(buf);
     #else
     fprintf(stderr, "%s : %d, %s\n", msg, errNum, error_message.c_str());
diff --git a/modules/videoio/src/cap_xine.cpp b/modules/videoio/src/cap_xine.cpp
index 67de7b9319..08062fe4cf 100644
--- a/modules/videoio/src/cap_xine.cpp
+++ b/modules/videoio/src/cap_xine.cpp
@@ -111,7 +111,7 @@ class XINECapture : public IVideoCapture
         char configfile[2048] = {0};
 
         xine = xine_new();
-        sprintf(configfile, "%s%s", xine_get_homedir(), "/.xine/config");
+        snprintf(configfile, sizeof(configfile), "%s%s", xine_get_homedir(), "/.xine/config");
         xine_config_load(xine, configfile);
         xine_init(xine);
         xine_engine_set_param(xine, 0, 0);
diff --git a/samples/cpp/calibration.cpp b/samples/cpp/calibration.cpp
index f1ea78b304..4f2bb8b8b5 100644
--- a/samples/cpp/calibration.cpp
+++ b/samples/cpp/calibration.cpp
@@ -225,7 +225,7 @@ static void saveCameraParams( const string& filename,
 
     if( flags != 0 )
     {
-        sprintf( buf, "flags: %s%s%s%s",
+        snprintf( buf, sizeof(buf), "flags: %s%s%s%s",
             flags & CALIB_USE_INTRINSIC_GUESS ? "+use_intrinsic_guess" : "",
             flags & CALIB_FIX_ASPECT_RATIO ? "+fix_aspectRatio" : "",
             flags & CALIB_FIX_PRINCIPAL_POINT ? "+fix_principal_point" : "",
diff --git a/samples/cpp/select3dobj.cpp b/samples/cpp/select3dobj.cpp
index f946565cf6..0f19eb2332 100644
--- a/samples/cpp/select3dobj.cpp
+++ b/samples/cpp/select3dobj.cpp
@@ -469,7 +469,7 @@ int main(int argc, char** argv)
         outbarename = strrchr(outprefix.c_str(), '/');
         const char* tmp = strrchr(outprefix.c_str(), '\\');
         char cmd[1000];
-        sprintf(cmd, "mkdir %s", outprefix.c_str());
+        snprintf(cmd, sizeof(cmd), "mkdir %s", outprefix.c_str());
         if( tmp && tmp > outbarename )
             outbarename = tmp;
         if( outbarename )
@@ -568,7 +568,7 @@ int main(int argc, char** argv)
                     char path[1000];
                     for(;frameIdx < maxFrameIdx;frameIdx++)
                     {
-                        sprintf(path, "%s%04d.jpg", outprefix.c_str(), frameIdx);
+                        snprintf(path, sizeof(path), "%s%04d.jpg", outprefix.c_str(), frameIdx);
                         FILE* f = fopen(path, "rb");
                         if( !f )
                             break;
diff --git a/samples/cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp b/samples/cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp
index eeeba0002d..82bbb3743a 100644
--- a/samples/cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp
+++ b/samples/cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp
@@ -60,7 +60,7 @@ int main( void )
 
    //![create_trackbar]
    char TrackbarName[50];
-   sprintf( TrackbarName, "Alpha x %d", alpha_slider_max );
+   snprintf( TrackbarName, sizeof(TrackbarName), "Alpha x %d", alpha_slider_max );
    createTrackbar( TrackbarName, "Linear Blend", &alpha_slider, alpha_slider_max, on_trackbar );
    //![create_trackbar]
 
diff --git a/samples/cpp/tutorial_code/ImgTrans/HoughLines_Demo.cpp b/samples/cpp/tutorial_code/ImgTrans/HoughLines_Demo.cpp
index 12fa051ba3..5d1736a77a 100644
--- a/samples/cpp/tutorial_code/ImgTrans/HoughLines_Demo.cpp
+++ b/samples/cpp/tutorial_code/ImgTrans/HoughLines_Demo.cpp
@@ -58,7 +58,7 @@ int main( int argc, char** argv )
 
    /// Create Trackbars for Thresholds
    char thresh_label[50];
-   sprintf( thresh_label, "Thres: %d + input", min_threshold );
+   snprintf( thresh_label, sizeof(thresh_label), "Thres: %d + input", min_threshold );
 
    namedWindow( standard_name, WINDOW_AUTOSIZE );
    createTrackbar( thresh_label, standard_name, &s_trackbar, max_trackbar, Standard_Hough);
diff --git a/samples/cpp/videocapture_starter.cpp b/samples/cpp/videocapture_starter.cpp
index 58f1145ed5..8e53104331 100644
--- a/samples/cpp/videocapture_starter.cpp
+++ b/samples/cpp/videocapture_starter.cpp
@@ -57,7 +57,7 @@ namespace {
             case 27: //escape key
                 return 0;
             case ' ': //Save an image
-                sprintf(filename,"filename%.3d.jpg",n++);
+                snprintf(filename,sizeof(filename),"filename%.3d.jpg",n++);
                 imwrite(filename,frame);
                 cout << "Saved " << filename << endl;
                 break;
diff --git a/samples/directx/d3d9_interop.cpp b/samples/directx/d3d9_interop.cpp
index 4806a4078d..c46cf8e9e8 100644
--- a/samples/directx/d3d9_interop.cpp
+++ b/samples/directx/d3d9_interop.cpp
@@ -258,22 +258,22 @@ public:
             int  y = 0;
 
             buf[0] = 0;
-            sprintf(buf, "mode: %s", m_modeStr[mode].c_str());
+            snprintf(buf, sizeof(buf), "mode: %s", m_modeStr[mode].c_str());
             ::TextOut(hDC, 0, y, buf, (int)strlen(buf));
 
             y += tm.tmHeight;
             buf[0] = 0;
-            sprintf(buf, m_demo_processing ? "blur frame" : "copy frame");
+            snprintf(buf, sizeof(buf), m_demo_processing ? "blur frame" : "copy frame");
             ::TextOut(hDC, 0, y, buf, (int)strlen(buf));
 
             y += tm.tmHeight;
             buf[0] = 0;
-            sprintf(buf, "time: %4.1f msec", time);
+            snprintf(buf, sizeof(buf), "time: %4.1f msec", time);
             ::TextOut(hDC, 0, y, buf, (int)strlen(buf));
 
             y += tm.tmHeight;
             buf[0] = 0;
-            sprintf(buf, "OpenCL device: %s", oclDevName.c_str());
+            snprintf(buf, sizeof(buf), "OpenCL device: %s", oclDevName.c_str());
             ::TextOut(hDC, 0, y, buf, (int)strlen(buf));
 
             ::SelectObject(hDC, hOldFont);
diff --git a/samples/directx/d3d9ex_interop.cpp b/samples/directx/d3d9ex_interop.cpp
index 24258e3702..68b864f0eb 100644
--- a/samples/directx/d3d9ex_interop.cpp
+++ b/samples/directx/d3d9ex_interop.cpp
@@ -259,22 +259,22 @@ public:
             int  y = 0;
 
             buf[0] = 0;
-            sprintf(buf, "mode: %s", m_modeStr[mode].c_str());
+            snprintf(buf, sizeof(buf), "mode: %s", m_modeStr[mode].c_str());
             ::TextOut(hDC, 0, y, buf, (int)strlen(buf));
 
             y += tm.tmHeight;
             buf[0] = 0;
-            sprintf(buf, m_demo_processing ? "blur frame" : "copy frame");
+            snprintf(buf, sizeof(buf), m_demo_processing ? "blur frame" : "copy frame");
             ::TextOut(hDC, 0, y, buf, (int)strlen(buf));
 
             y += tm.tmHeight;
             buf[0] = 0;
-            sprintf(buf, "time: %4.1f msec", time);
+            snprintf(buf, sizeof(buf), "time: %4.1f msec", time);
             ::TextOut(hDC, 0, y, buf, (int)strlen(buf));
 
             y += tm.tmHeight;
             buf[0] = 0;
-            sprintf(buf, "OpenCL device: %s", oclDevName.c_str());
+            snprintf(buf, sizeof(buf), "OpenCL device: %s", oclDevName.c_str());
             ::TextOut(hDC, 0, y, buf, (int)strlen(buf));
 
             ::SelectObject(hDC, hOldFont);

From 2366f2cb2ec8266e660a397968db832b839981dd Mon Sep 17 00:00:00 2001
From: dan <doprea@gmail.com>
Date: Sat, 25 Jun 2022 15:12:59 +0200
Subject: [PATCH 092/178] issues-22126

---
 modules/videoio/src/cap_ffmpeg_impl.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/videoio/src/cap_ffmpeg_impl.hpp b/modules/videoio/src/cap_ffmpeg_impl.hpp
index a724d7f724..442eb7e4b8 100644
--- a/modules/videoio/src/cap_ffmpeg_impl.hpp
+++ b/modules/videoio/src/cap_ffmpeg_impl.hpp
@@ -1408,7 +1408,7 @@ bool CvCapture_FFMPEG::retrieveFrame(int, unsigned char** data, int* step, int*
             img_convert_ctx,
             picture->data,
             picture->linesize,
-            0, video_st->codec->coded_height,
+            0, picture->height,
             rgb_picture.data,
             rgb_picture.linesize
             );

From 90f2e1f8b5a9870fa5da37ea6358b7106fbcfd6e Mon Sep 17 00:00:00 2001
From: Philipp Hutterer <dev.philipphutterer@gmail.com>
Date: Sun, 26 Jun 2022 17:30:31 +0200
Subject: [PATCH 093/178] Frame type property for ffmpeg video capture

---
 modules/videoio/include/opencv2/videoio.hpp | 1 +
 modules/videoio/src/cap_ffmpeg_impl.hpp     | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/modules/videoio/include/opencv2/videoio.hpp b/modules/videoio/include/opencv2/videoio.hpp
index 35491a0b5e..173c73c454 100644
--- a/modules/videoio/include/opencv2/videoio.hpp
+++ b/modules/videoio/include/opencv2/videoio.hpp
@@ -202,6 +202,7 @@ enum VideoCaptureProperties {
        CAP_PROP_AUDIO_SYNCHRONIZE = 66, //!< (open, read) Enables audio synchronization.
        CAP_PROP_LRF_HAS_KEY_FRAME = 67, //!< FFmpeg back-end only - Indicates whether the Last Raw Frame (LRF), output from VideoCapture::read() when VideoCapture is initialized with VideoCapture::open(CAP_FFMPEG, {CAP_PROP_FORMAT, -1}) or VideoCapture::set(CAP_PROP_FORMAT,-1) is called before the first call to VideoCapture::read(), contains encoded data for a key frame.
        CAP_PROP_CODEC_EXTRADATA_INDEX = 68, //!< Positive index indicates that returning extra data is supported by the video back end.  This can be retrieved as cap.retrieve(data, <returned index>).  E.g. When reading from a h264 encoded RTSP stream, the FFmpeg backend could return the SPS and/or PPS if available (if sent in reply to a DESCRIBE request), from calls to cap.retrieve(data, <returned index>).
+       CAP_PROP_FRAME_TYPE = 69, //!< (read-only) FFmpeg back-end only - Frame type ascii code (73 = 'I', 80 = 'P', 66 = 'B' or 63 = '?' if unknown) of the most recently read frame.
 #ifndef CV_DOXYGEN
        CV__CAP_PROP_LATEST
 #endif
diff --git a/modules/videoio/src/cap_ffmpeg_impl.hpp b/modules/videoio/src/cap_ffmpeg_impl.hpp
index 98913ed839..b5de6c6d14 100644
--- a/modules/videoio/src/cap_ffmpeg_impl.hpp
+++ b/modules/videoio/src/cap_ffmpeg_impl.hpp
@@ -1695,6 +1695,8 @@ double CvCapture_FFMPEG::getProperty( int property_id ) const
         return (double)((rotation_auto && ((rotation_angle%180) != 0)) ? frame.height : frame.width);
     case CAP_PROP_FRAME_HEIGHT:
         return (double)((rotation_auto && ((rotation_angle%180) != 0)) ? frame.width : frame.height);
+    case CAP_PROP_FRAME_TYPE:
+        return (double)av_get_picture_type_char(picture->pict_type);
     case CAP_PROP_FPS:
         return get_fps();
     case CAP_PROP_FOURCC:

From 0769bf416f17c874afd9f6d03aa9111f132e9120 Mon Sep 17 00:00:00 2001
From: Joel Winarske <joel.winarske@gmail.com>
Date: Tue, 14 Jun 2022 15:16:51 -0700
Subject: [PATCH 094/178] highgui Wayland xdg_shell

-enable using -DWITH_WAYLAND=ON
-adapted from https://github.com/pfpacket/opencv-wayland
-using xdg_shell stable protocol
-overrides HAVE_QT if HAVE_WAYLAND and WITH_WAYLAND are set

Signed-off-by: Joel Winarske <joel.winarske@gmail.com>

Co-authored-by: Ryo Munakata <afpacket@gmail.com>
---
 CMakeLists.txt                             |   21 +
 modules/core/CMakeLists.txt                |    1 +
 modules/highgui/CMakeLists.txt             |   26 +-
 modules/highgui/cmake/detect_wayland.cmake |   35 +
 modules/highgui/cmake/init.cmake           |    1 +
 modules/highgui/src/precomp.hpp            |    1 +
 modules/highgui/src/window.cpp             |    3 +
 modules/highgui/src/window_wayland.cpp     | 2476 ++++++++++++++++++++
 modules/highgui/test/test_gui.cpp          |    3 +
 9 files changed, 2566 insertions(+), 1 deletion(-)
 create mode 100644 modules/highgui/cmake/detect_wayland.cmake
 create mode 100644 modules/highgui/src/window_wayland.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fa409f516c..5a06a51ad9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -281,6 +281,9 @@ OCV_OPTION(WITH_GTK "Include GTK support" ON
 OCV_OPTION(WITH_GTK_2_X "Use GTK version 2" OFF
   VISIBLE_IF UNIX AND NOT APPLE AND NOT ANDROID
   VERIFY HAVE_GTK AND NOT HAVE_GTK3)
+OCV_OPTION(WITH_WAYLAND "Include Wayland support" OFF
+        VISIBLE_IF UNIX AND NOT APPLE AND NOT ANDROID
+        VERIFY HAVE_WAYLAND)
 OCV_OPTION(WITH_IPP "Include Intel IPP support" (NOT MINGW AND NOT CV_DISABLE_OPTIMIZATION)
   VISIBLE_IF (X86_64 OR X86) AND NOT WINRT AND NOT IOS
   VERIFY HAVE_IPP)
@@ -1260,6 +1263,24 @@ endif(WIN32)
 status("")
 status("  GUI: " "${OPENCV_HIGHGUI_BUILTIN_BACKEND}")
 
+if(WITH_WAYLAND OR HAVE_WAYLAND)
+  if(HAVE_WAYLAND_CLIENT)
+    status("    Wayland Client:" "YES (ver ${WAYLAND_CLIENT_VERSION})")
+  endif()
+  if(HAVE_WAYLAND_CURSOR)
+    status("    Wayland Cursor:" "YES (ver ${WAYLAND_CURSOR_VERSION})")
+  endif()
+  if(HAVE_WAYLAND_PROTOCOL)
+    status("    Wayland Protocol:" "YES (ver ${WAYLAND_PROTOCOL_VERSION})")
+  endif()
+  if(HAVE_WAYLAND_EGL)
+    status("    Wayland EGL:" "YES (ver ${WAYLAND_EGL_VERSION})")
+  endif()
+  if(HAVE_XKBCOMMON)
+    status("    Xkbcommon:" "YES (ver ${XKBCOMMON_VERSION})")
+  endif()
+endif()
+
 if(WITH_QT OR HAVE_QT)
   if(HAVE_QT)
     status("    QT:" "YES (ver ${QT_VERSION_MAJOR}.${QT_VERSION_MINOR}.${QT_VERSION_PATCH} ${QT_EDITION})")
diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index b78bb98162..0ed2ad59b1 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -146,6 +146,7 @@ ocv_create_module(${extra_libs})
 ocv_target_link_libraries(${the_module} PRIVATE
     "${ZLIB_LIBRARIES}" "${OPENCL_LIBRARIES}" "${VA_LIBRARIES}"
     "${OPENGL_LIBRARIES}"
+    "${GLX_LIBRARIES}"
     "${LAPACK_LIBRARIES}" "${CPUFEATURES_LIBRARIES}" "${HALIDE_LIBRARIES}"
     "${ITT_LIBRARIES}"
     "${OPENCV_HAL_LINKER_LIBS}"
diff --git a/modules/highgui/CMakeLists.txt b/modules/highgui/CMakeLists.txt
index 65d24e0ab0..201f2e7cf1 100644
--- a/modules/highgui/CMakeLists.txt
+++ b/modules/highgui/CMakeLists.txt
@@ -49,7 +49,31 @@ list(REMOVE_ITEM highgui_ext_hdrs "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${n
 
 set(OPENCV_HIGHGUI_BUILTIN_BACKEND "")
 
-if(HAVE_QT)
+if(WITH_WAYLAND AND HAVE_WAYLAND)
+  set(OPENCV_HIGHGUI_BUILTIN_BACKEND "Wayland")
+  add_definitions(-DHAVE_WAYLAND)
+
+  set(CMAKE_INCLUDE_CURRENT_DIR ON)
+
+  if (HAVE_WAYLAND_PROTOCOLS)
+      ocv_wayland_generate(
+            ${WAYLAND_PROTOCOLS_BASE}/stable/xdg-shell/xdg-shell.xml
+            xdg-shell-client-protocol)
+  endif()
+
+  list(APPEND highgui_srcs
+    ${CMAKE_CURRENT_LIST_DIR}/src/window_wayland.cpp
+    ${WAYLAND_PROTOCOL_SOURCES}
+  )
+  list(APPEND HIGHGUI_LIBRARIES "${WAYLAND_CLIENT_LINK_LIBRARIES};${WAYLAND_CURSOR_LINK_LIBRARIES};${XKBCOMMON_LINK_LIBRARIES}")
+
+  if(HAVE_WAYLAND_EGL)
+    if(WAYLAND_EGL_LIBRARIES)
+      list(APPEND HIGHGUI_LIBRARIES "${WAYLAND_EGL_LIBRARIES}")
+    endif()
+  endif()
+
+elseif(HAVE_QT)
   set(OPENCV_HIGHGUI_BUILTIN_BACKEND "QT${QT_VERSION_MAJOR}")
   add_definitions(-DHAVE_QT)
 
diff --git a/modules/highgui/cmake/detect_wayland.cmake b/modules/highgui/cmake/detect_wayland.cmake
new file mode 100644
index 0000000000..24e70d1b85
--- /dev/null
+++ b/modules/highgui/cmake/detect_wayland.cmake
@@ -0,0 +1,35 @@
+# --- Wayland ---
+macro(ocv_wayland_generate protocol_file output_file)
+    add_custom_command(OUTPUT ${output_file}.h
+            COMMAND ${WAYLAND_SCANNER_EXECUTABLE} client-header < ${protocol_file} > ${output_file}.h
+            DEPENDS ${protocol_file})
+    add_custom_command(OUTPUT ${output_file}.c
+            COMMAND ${WAYLAND_SCANNER_EXECUTABLE} private-code < ${protocol_file} > ${output_file}.c
+            DEPENDS ${protocol_file})
+    list(APPEND WAYLAND_PROTOCOL_SOURCES ${output_file}.h ${output_file}.c)
+endmacro()
+
+ocv_clear_vars(HAVE_WAYLAND_CLIENT HAVE_WAYLAND_CURSOR HAVE_XKBCOMMON HAVE_WAYLAND_PROTOCOLS)
+if(WITH_WAYLAND)
+    ocv_check_modules(WAYLAND_CLIENT wayland-client)
+    if(WAYLAND_CLIENT_FOUND)
+        set(HAVE_WAYLAND_CLIENT ON)
+    endif()
+    ocv_check_modules(WAYLAND_CURSOR wayland-cursor)
+    if(WAYLAND_CURSOR_FOUND)
+        set(HAVE_WAYLAND_CURSOR ON)
+    endif()
+    ocv_check_modules(XKBCOMMON xkbcommon)
+    if(XKBCOMMON_FOUND)
+        set(HAVE_XKBCOMMON ON)
+    endif()
+    ocv_check_modules(WAYLAND_PROTOCOLS wayland-protocols>=1.13)
+    if(HAVE_WAYLAND_PROTOCOLS)
+        pkg_get_variable(WAYLAND_PROTOCOLS_BASE wayland-protocols pkgdatadir)
+        find_host_program(WAYLAND_SCANNER_EXECUTABLE NAMES wayland-scanner REQUIRED)
+    endif()
+
+    if(HAVE_WAYLAND_CLIENT AND HAVE_WAYLAND_CURSOR AND HAVE_XKBCOMMON AND HAVE_WAYLAND_PROTOCOLS)
+        set(HAVE_WAYLAND TRUE)
+    endif()
+endif()
diff --git a/modules/highgui/cmake/init.cmake b/modules/highgui/cmake/init.cmake
index 0b4178c868..49d4799b30 100644
--- a/modules/highgui/cmake/init.cmake
+++ b/modules/highgui/cmake/init.cmake
@@ -38,6 +38,7 @@ endmacro()
 
 add_backend("gtk" WITH_GTK)
 add_backend("win32ui" WITH_WIN32UI)
+add_backend("wayland" WITH_WAYLAND)
 # TODO cocoa
 # TODO qt
 # TODO opengl
diff --git a/modules/highgui/src/precomp.hpp b/modules/highgui/src/precomp.hpp
index 0d26b957ad..ce9efe4153 100644
--- a/modules/highgui/src/precomp.hpp
+++ b/modules/highgui/src/precomp.hpp
@@ -130,6 +130,7 @@ void setWindowTitle_W32(const cv::String& name, const cv::String& title);
 void setWindowTitle_GTK(const cv::String& name, const cv::String& title);
 void setWindowTitle_QT(const cv::String& name, const cv::String& title);
 void setWindowTitle_COCOA(const cv::String& name, const cv::String& title);
+void setWindowTitle_WAYLAND(const cv::String& name, const cv::String& title);
 
 int pollKey_W32();
 
diff --git a/modules/highgui/src/window.cpp b/modules/highgui/src/window.cpp
index 81d205a69a..dd70fd2456 100644
--- a/modules/highgui/src/window.cpp
+++ b/modules/highgui/src/window.cpp
@@ -613,6 +613,8 @@ void cv::setWindowTitle(const String& winname, const String& title)
     return setWindowTitle_QT(winname, title);
 #elif defined (HAVE_COCOA)
     return setWindowTitle_COCOA(winname, title);
+#elif defined (HAVE_WAYLAND)
+    return setWindowTitle_WAYLAND(winname, title);
 #else
     CV_Error(Error::StsNotImplemented, "The function is not implemented. "
         "Rebuild the library with Windows, GTK+ 2.x or Cocoa support. "
@@ -1226,6 +1228,7 @@ int cv::createButton(const String&, ButtonCallback, void*, int , bool )
 #elif defined (HAVE_GTK)      // see window_gtk.cpp
 #elif defined (HAVE_COCOA)    // see window_cocoa.mm
 #elif defined (HAVE_QT)       // see window_QT.cpp
+#elif defined (HAVE_WAYLAND)  // see window_wayland.cpp
 #elif defined (WINRT) && !defined (WINRT_8_0) // see window_winrt.cpp
 
 #else
diff --git a/modules/highgui/src/window_wayland.cpp b/modules/highgui/src/window_wayland.cpp
new file mode 100644
index 0000000000..69231c0072
--- /dev/null
+++ b/modules/highgui/src/window_wayland.cpp
@@ -0,0 +1,2476 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "precomp.hpp"
+
+#ifndef _WIN32
+#if defined (HAVE_WAYLAND)
+
+#include <iostream>
+#include <string>
+#include <utility>
+#include <vector>
+#include <map>
+#include <queue>
+#include <algorithm>
+#include <functional>
+#include <memory>
+#include <chrono>
+#include <unordered_map>
+#include <thread>
+
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <sys/epoll.h>
+#include <linux/input.h>
+
+#include <xkbcommon/xkbcommon.h>
+#include <wayland-client-protocol.h>
+#include <wayland-cursor.h>
+#include <wayland-util.h>
+#include "xdg-shell-client-protocol.h"
+
+#include <opencv2/core.hpp>
+#include <opencv2/highgui.hpp>
+#include <opencv2/imgproc.hpp>
+#include "opencv2/core/utils/logger.hpp"
+
+/*                              */
+/*  OpenCV highgui internals    */
+/*                              */
+class cv_wl_display;
+
+class cv_wl_mouse;
+
+class cv_wl_keyboard;
+
+class cv_wl_input;
+
+class cv_wl_buffer;
+
+struct cv_wl_cursor;
+
+class cv_wl_cursor_theme;
+
+class cv_wl_widget;
+
+class cv_wl_titlebar;
+
+class cv_wl_viewer;
+
+class cv_wl_trackbar;
+
+class cv_wl_window;
+
+class cv_wl_core;
+
+using std::weak_ptr;
+using std::shared_ptr;
+using namespace cv::Error;
+namespace ch = std::chrono;
+
+#define throw_system_error(errmsg, errnum) \
+    CV_Error_(StsInternal, ("%s: %s", errmsg, strerror(errnum)));
+
+// workaround for Wayland macro not compiling in C++
+#define WL_ARRAY_FOR_EACH(pos, array, type)                             \
+  for ((pos) = (type)(array)->data;                                       \
+       (const char*)(pos) < ((const char*)(array)->data + (array)->size); \
+       (pos)++)
+
+static int xkb_keysym_to_ascii(xkb_keysym_t keysym) {
+    return static_cast<int>(keysym & 0xff);
+}
+
+
+static void draw_xrgb8888(void *d, uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
+    *((uint32_t *) d) = ((a << 24) | (r << 16) | (g << 8) | b);
+}
+
+static void write_mat_to_xrgb8888(cv::Mat const &img, void *data) {
+    CV_Assert(data != nullptr);
+    CV_Assert(img.isContinuous());
+
+    for (int y = 0; y < img.rows; y++) {
+        for (int x = 0; x < img.cols; x++) {
+            auto p = img.at<cv::Vec3b>(y, x);
+            draw_xrgb8888((char *) data + (y * img.cols + x) * 4, 0x00, p[2], p[1], p[0]);
+        }
+    }
+}
+
+class epoller {
+public:
+    epoller() : epoll_fd_(epoll_create1(EPOLL_CLOEXEC)) {
+        if (epoll_fd_ < 0)
+            throw_system_error("Failed to create epoll fd", errno)
+    }
+
+    ~epoller() {
+        close(epoll_fd_);
+    }
+
+    void add(int fd, int events = EPOLLIN) const {
+        this->ctl(EPOLL_CTL_ADD, fd, events);
+    }
+
+    void modify(int fd, int events) const {
+        this->ctl(EPOLL_CTL_MOD, fd, events);
+    }
+
+    void remove(int fd) const {
+        this->ctl(EPOLL_CTL_DEL, fd, 0);
+    }
+
+    void ctl(int op, int fd, int events) const {
+        struct epoll_event event{0, {nullptr}};
+        event.events = events;
+        event.data.fd = fd;
+        int ret = epoll_ctl(epoll_fd_, op, fd, &event);
+        if (ret < 0)
+            throw_system_error("epoll_ctl", errno)
+    }
+
+    std::vector<struct epoll_event> wait(int timeout = -1, int max_events = 16) const {
+        std::vector<struct epoll_event> events(max_events);
+        int event_num = epoll_wait(epoll_fd_,
+                                   static_cast<epoll_event *>(events.data()), static_cast<int>(events.size()), timeout);
+        if (event_num < 0)
+            throw_system_error("epoll_wait", errno)
+        events.erase(events.begin() + event_num, events.end());
+        return events;
+    }
+
+    epoller(epoller const &) = delete;
+
+    epoller &operator=(epoller const &) = delete;
+
+    epoller(epoller &&) = delete;
+
+    epoller &operator=(epoller &&) = delete;
+
+private:
+    int epoll_fd_;
+};
+
+class cv_wl_display {
+public:
+    cv_wl_display();
+
+    explicit cv_wl_display(std::string const &disp);
+
+    ~cv_wl_display();
+
+    int dispatch();
+
+    int dispatch_pending();
+
+    int flush();
+
+    int run_once();
+
+    struct wl_shm *shm();
+
+    weak_ptr<cv_wl_input> input();
+
+    uint32_t formats() const;
+
+    struct wl_surface *get_surface();
+
+    struct xdg_surface *get_shell_surface(struct wl_surface *surface);
+
+private:
+    epoller poller_;
+    struct wl_display *display_{};
+    struct wl_registry *registry_{};
+    struct wl_registry_listener reg_listener_{
+            &handle_reg_global, &handle_reg_remove
+    };
+    struct wl_compositor *compositor_ = nullptr;
+    struct wl_shm *shm_ = nullptr;
+    struct wl_shm_listener shm_listener_{&handle_shm_format};
+    struct xdg_wm_base *xdg_wm_base_ = nullptr;
+    struct xdg_wm_base_listener xdg_wm_base_listener_{&handle_shell_ping};
+    shared_ptr<cv_wl_input> input_;
+    uint32_t formats_ = 0;
+    bool buffer_scale_enable_{};
+
+    void init(const char *display);
+
+    static void
+    handle_reg_global(void *data, struct wl_registry *reg, uint32_t name, const char *iface, uint32_t version);
+
+    static void handle_reg_remove(void *data, struct wl_registry *wl_registry, uint32_t name);
+
+    static void handle_shm_format(void *data, struct wl_shm *wl_shm, uint32_t format);
+
+    static void handle_shell_ping(void *data, struct xdg_wm_base *shell, uint32_t serial);
+};
+
+class cv_wl_mouse {
+public:
+    enum button {
+        NONE = 0,
+        LBUTTON = BTN_LEFT,
+        RBUTTON = BTN_RIGHT,
+        MBUTTON = BTN_MIDDLE,
+    };
+
+    explicit cv_wl_mouse(struct wl_pointer *pointer);
+
+    ~cv_wl_mouse();
+
+    void set_cursor(uint32_t serial, struct wl_surface *surface, int32_t hotspot_x, int32_t hotspot_y);
+
+private:
+    struct wl_pointer *pointer_;
+    struct wl_pointer_listener pointer_listener_{
+            &handle_pointer_enter, &handle_pointer_leave,
+            &handle_pointer_motion, &handle_pointer_button,
+            &handle_pointer_axis, &handle_pointer_frame,
+            &handle_pointer_axis_source, &handle_pointer_axis_stop,
+            &handle_pointer_axis_discrete
+    };
+    cv_wl_window *focus_window_{};
+
+    static void
+    handle_pointer_enter(void *data, struct wl_pointer *pointer, uint32_t serial, struct wl_surface *surface,
+                         wl_fixed_t sx, wl_fixed_t sy);
+
+    static void
+    handle_pointer_leave(void *data, struct wl_pointer *pointer, uint32_t serial, struct wl_surface *surface);
+
+    static void
+    handle_pointer_motion(void *data, struct wl_pointer *pointer, uint32_t time, wl_fixed_t sx, wl_fixed_t sy);
+
+    static void
+    handle_pointer_button(void *data, struct wl_pointer *wl_pointer, uint32_t serial, uint32_t time, uint32_t button,
+                          uint32_t state);
+
+    static void
+    handle_pointer_axis(void *data, struct wl_pointer *wl_pointer, uint32_t time, uint32_t axis, wl_fixed_t value);
+
+    static void handle_pointer_frame(void *data, struct wl_pointer *wl_pointer) {
+        CV_UNUSED(data);
+        CV_UNUSED(wl_pointer);
+    }
+
+    static void handle_pointer_axis_source(void *data, struct wl_pointer *wl_pointer, uint32_t axis_source) {
+        CV_UNUSED(data);
+        CV_UNUSED(wl_pointer);
+        CV_UNUSED(axis_source);
+    }
+
+    static void handle_pointer_axis_stop(void *data, struct wl_pointer *wl_pointer, uint32_t time, uint32_t axis) {
+        CV_UNUSED(data);
+        CV_UNUSED(wl_pointer);
+        CV_UNUSED(time);
+        CV_UNUSED(axis);
+    }
+
+    static void
+    handle_pointer_axis_discrete(void *data, struct wl_pointer *wl_pointer, uint32_t axis, int32_t discrete) {
+        CV_UNUSED(data);
+        CV_UNUSED(wl_pointer);
+        CV_UNUSED(axis);
+        CV_UNUSED(discrete);
+    }
+};
+
+class cv_wl_keyboard {
+public:
+    enum {
+        MOD_SHIFT_MASK = 0x01,
+        MOD_ALT_MASK = 0x02,
+        MOD_CONTROL_MASK = 0x04
+    };
+
+    explicit cv_wl_keyboard(struct wl_keyboard *keyboard);
+
+    ~cv_wl_keyboard();
+
+    uint32_t get_modifiers() const;
+
+    std::queue<int> get_key_queue();
+
+private:
+    struct {
+        struct xkb_context *ctx;
+        struct xkb_keymap *keymap;
+        struct xkb_state *state;
+        xkb_mod_mask_t control_mask;
+        xkb_mod_mask_t alt_mask;
+        xkb_mod_mask_t shift_mask;
+    } xkb_{nullptr, nullptr, nullptr, 0, 0, 0};
+    struct wl_keyboard *keyboard_ = nullptr;
+    struct wl_keyboard_listener keyboard_listener_{
+            &handle_kb_keymap, &handle_kb_enter, &handle_kb_leave,
+            &handle_kb_key, &handle_kb_modifiers, &handle_kb_repeat
+    };
+    uint32_t modifiers_{};
+    std::queue<int> key_queue_;
+
+    static void handle_kb_keymap(void *data, struct wl_keyboard *kb, uint32_t format, int fd, uint32_t size);
+
+    static void handle_kb_enter(void *data, struct wl_keyboard *keyboard, uint32_t serial, struct wl_surface *surface,
+                                struct wl_array *keys);
+
+    static void handle_kb_leave(void *data, struct wl_keyboard *keyboard, uint32_t serial, struct wl_surface *surface);
+
+    static void handle_kb_key(void *data, struct wl_keyboard *keyboard, uint32_t serial, uint32_t time, uint32_t key,
+                              uint32_t state);
+
+    static void handle_kb_modifiers(void *data, struct wl_keyboard *keyboard,
+                                    uint32_t serial, uint32_t mods_depressed, uint32_t mods_latched,
+                                    uint32_t mods_locked, uint32_t group);
+
+    static void handle_kb_repeat(void *data, struct wl_keyboard *wl_keyboard, int32_t rate, int32_t delay);
+};
+
+class cv_wl_input {
+public:
+    explicit cv_wl_input(struct wl_seat *seat);
+
+    ~cv_wl_input();
+
+    struct wl_seat *seat() const { return seat_; }
+
+    weak_ptr<cv_wl_mouse> mouse();
+
+    weak_ptr<cv_wl_keyboard> keyboard();
+
+private:
+    struct wl_seat *seat_;
+    struct wl_seat_listener seat_listener_{
+            &handle_seat_capabilities, &handle_seat_name
+    };
+    shared_ptr<cv_wl_mouse> mouse_;
+    shared_ptr<cv_wl_keyboard> keyboard_;
+
+    static void handle_seat_capabilities(void *data, struct wl_seat *wl_seat, uint32_t caps);
+
+    static void handle_seat_name(void *data, struct wl_seat *wl_seat, const char *name);
+};
+
+class cv_wl_buffer {
+public:
+    cv_wl_buffer();
+
+    ~cv_wl_buffer();
+
+    void destroy();
+
+    void busy(bool busy = true);
+
+    bool is_busy() const;
+
+    cv::Size size() const;
+
+    bool is_allocated() const;
+
+    char *data();
+
+    void create_shm(struct wl_shm *shm, cv::Size size, uint32_t format);
+
+    void attach_to_surface(struct wl_surface *surface, int32_t x, int32_t y);
+
+private:
+    int fd_ = -1;
+    bool busy_ = false;
+    cv::Size size_{0, 0};
+    struct wl_buffer *buffer_ = nullptr;
+    struct wl_buffer_listener buffer_listener_{
+            &handle_buffer_release
+    };
+    void *shm_data_ = nullptr;
+
+    static int create_tmpfile(std::string const &tmpname);
+
+    static int create_anonymous_file(off_t size);
+
+    static void handle_buffer_release(void *data, struct wl_buffer *buffer);
+};
+
+struct cv_wl_cursor {
+public:
+    friend cv_wl_cursor_theme;
+
+    ~cv_wl_cursor();
+
+    std::string const &name() const;
+
+    void set_to_mouse(cv_wl_mouse &mouse, uint32_t serial);
+
+    void commit(int image_index = 0);
+
+private:
+    std::string name_;
+    struct wl_cursor *cursor_;
+    struct wl_surface *surface_;
+    struct wl_callback *frame_callback_ = nullptr;
+    struct wl_callback_listener frame_listener_{
+            &handle_cursor_frame
+    };
+
+    cv_wl_cursor(weak_ptr<cv_wl_display> const &, struct wl_cursor *, std::string);
+
+    static void handle_cursor_frame(void *data, struct wl_callback *cb, uint32_t time);
+};
+
+class cv_wl_cursor_theme {
+public:
+    cv_wl_cursor_theme(weak_ptr<cv_wl_display> const &display, std::string const &theme, int size = 32);
+
+    ~cv_wl_cursor_theme();
+
+    int size() const;
+
+    std::string const &name() const;
+
+    weak_ptr<cv_wl_cursor> get_cursor(std::string const &name);
+
+private:
+    int size_;
+    std::string name_;
+    weak_ptr<cv_wl_display> display_;
+    struct wl_cursor_theme *cursor_theme_ = nullptr;
+
+    std::unordered_map<std::string, shared_ptr<cv_wl_cursor>> cursors_;
+};
+
+/*
+ * height_for_width widget management
+ */
+class cv_wl_widget {
+public:
+    explicit cv_wl_widget(cv_wl_window *window)
+            : window_(window) {
+    }
+
+    virtual ~cv_wl_widget() = default;
+
+    /* Return the size the last time when we did drawing */
+    /* draw method must update the last_size_ */
+    virtual cv::Size get_last_size() const {
+        return last_size_;
+    }
+
+    virtual void get_preferred_width(int &minimum, int &natural) const = 0;
+
+    virtual void get_preferred_height_for_width(int width, int &minimum, int &natural) const = 0;
+
+    virtual void on_mouse(int event, cv::Point const &p, int flag) {
+        CV_UNUSED(event);
+        CV_UNUSED(p);
+        CV_UNUSED(flag);
+    }
+
+    /* Return: The area widget rendered, if not rendered at all, set as width=height=0 */
+    virtual cv::Rect draw(void *data, cv::Size const &, bool force) = 0;
+
+protected:
+    cv::Size last_size_{0, 0};
+    cv_wl_window *window_;
+};
+
+class cv_wl_titlebar : public cv_wl_widget {
+public:
+    enum {
+        btn_width = 24,
+        btn_margin = 5,
+        btn_max_x = 8,
+        btn_max_y = 8,
+        titlebar_min_width = btn_width * 3 + btn_margin,
+        titlebar_min_height = 24
+    };
+
+    explicit cv_wl_titlebar(cv_wl_window *window);
+
+    void get_preferred_width(int &minimum, int &natural) const override;
+
+    void get_preferred_height_for_width(int width, int &minimum, int &natural) const override;
+
+    void on_mouse(int event, cv::Point const &p, int flag) override;
+
+    void calc_button_geometry(cv::Size const &size);
+
+    cv::Rect draw(void *data, cv::Size const &size, bool force) override;
+
+private:
+    cv::Mat buf_;
+    cv::Rect btn_close_, btn_max_, btn_min_;
+    cv::Scalar const line_color_ = CV_RGB(0xff, 0xff, 0xff);
+    cv::Scalar const bg_color_ = CV_RGB(0x2d, 0x2d, 0x2d);
+    cv::Scalar const border_color_ = CV_RGB(0x53, 0x63, 0x53);
+
+    std::string last_title_;
+
+    struct {
+        int face = cv::FONT_HERSHEY_TRIPLEX;
+        double scale = 0.4;
+        int thickness = 1;
+        int baseline = 0;
+    } title_;
+};
+
+class cv_wl_viewer : public cv_wl_widget {
+public:
+    enum {
+        MOUSE_CALLBACK_MIN_INTERVAL_MILLISEC = 15
+    };
+
+    cv_wl_viewer(cv_wl_window *, int flags);
+
+    int get_flags() const { return flags_; }
+
+    void set_image(cv::Mat const &image);
+
+    void set_mouse_callback(CvMouseCallback callback, void *param);
+
+    void get_preferred_width(int &minimum, int &natural) const override;
+
+    void get_preferred_height_for_width(int width, int &minimum, int &natural) const override;
+
+    void on_mouse(int event, cv::Point const &p, int flag) override;
+
+    cv::Rect draw(void *data, cv::Size const &, bool force) override;
+
+private:
+    int flags_;
+    cv::Mat image_;
+    cv::Rect last_img_area_;
+    bool image_changed_ = false;
+
+    void *param_ = nullptr;
+    CvMouseCallback callback_ = nullptr;
+};
+
+class cv_wl_trackbar : public cv_wl_widget {
+public:
+    cv_wl_trackbar(cv_wl_window *window, std::string name,
+                   int *value, int count, CvTrackbarCallback2 on_change, void *data);
+
+    std::string const &name() const;
+
+    int get_pos() const;
+
+    void set_pos(int value);
+
+    void set_max(int maxval);
+
+    void get_preferred_width(int &minimum, int &natural) const override;
+
+    void get_preferred_height_for_width(int width, int &minimum, int &natural) const override;
+
+    void on_mouse(int event, cv::Point const &p, int flag) override;
+
+    cv::Rect draw(void *data, cv::Size const &size, bool force) override;
+
+private:
+    std::string name_;
+    int count_;
+    cv::Size size_;
+
+    struct {
+        int *value;
+        void *data;
+        CvTrackbarCallback2 callback;
+
+        void update(int v) const { if (value) *value = v; }
+
+        void call(int v) const { if (callback) callback(v, data); }
+    } on_change_{};
+
+    struct {
+        cv::Scalar bg = CV_RGB(0xa4, 0xa4, 0xa4);
+        cv::Scalar fg = CV_RGB(0xf0, 0xf0, 0xf0);
+    } color_;
+
+    struct {
+        int fontface = cv::FONT_HERSHEY_COMPLEX_SMALL;
+        double fontscale = 0.6;
+        int font_thickness = 1;
+        cv::Size text_size;
+        cv::Point text_orig;
+
+        int margin = 10, thickness = 5;
+        cv::Point right, left;
+
+        int length() const { return right.x - left.x; }
+    } bar_;
+
+    struct {
+        int value = 0;
+        int radius = 7;
+        cv::Point pos;
+        bool drag = false;
+    } slider_;
+
+    bool slider_moved_ = true;
+    cv::Mat data_;
+
+    void prepare_to_draw();
+};
+
+struct cv_wl_mouse_callback {
+    bool drag = false;
+    cv::Point last{0, 0};
+    cv_wl_mouse::button button = cv_wl_mouse::button::NONE;
+
+    void reset() {
+        drag = false;
+        last = cv::Point(0, 0);
+        button = cv_wl_mouse::button::NONE;
+    }
+};
+
+struct cv_wl_window_state {
+    cv_wl_window_state() {
+        reset();
+    }
+
+    void reset() {
+        maximized = fullscreen = resizing = focused = false;
+    }
+
+    cv::Size prev_size_{0, 0};
+    bool maximized{}, fullscreen{}, resizing{}, focused{};
+};
+
+class cv_wl_window {
+public:
+    enum {
+        DEFAULT_CURSOR_SIZE = 32
+    };
+
+    cv_wl_window(shared_ptr<cv_wl_display> const &display, std::string title, int flags);
+
+    ~cv_wl_window();
+
+    cv::Size get_size() const;
+
+    std::string const &get_title() const;
+
+    void set_title(std::string const &title);
+
+    cv_wl_window_state const &state() const;
+
+    void show_image(cv::Mat const &image);
+
+    void create_trackbar(std::string const &name, int *value, int count, CvTrackbarCallback2 on_change, void *userdata);
+
+    weak_ptr<cv_wl_trackbar> get_trackbar(std::string const &) const;
+
+    void mouse_enter(cv::Point const &p, uint32_t serial);
+
+    void mouse_leave();
+
+    void mouse_motion(uint32_t time, cv::Point const &p);
+
+    void mouse_button(uint32_t time, uint32_t button, wl_pointer_button_state state, uint32_t serial);
+
+    void update_cursor(cv::Point const &p, bool grab = false);
+
+    void interactive_move();
+
+    void set_mouse_callback(CvMouseCallback on_mouse, void *param);
+
+    void set_minimized();
+
+    void set_maximized(bool maximize = true);
+
+    void show(cv::Size const &new_size = cv::Size(0, 0));
+
+private:
+    cv::Size size_{640, 480};
+    std::string title_;
+
+    shared_ptr<cv_wl_display> display_;
+    struct wl_surface *surface_;
+    struct xdg_surface *xdg_surface_;
+    struct xdg_surface_listener xdgsurf_listener_{
+            &handle_surface_configure,
+    };
+    struct xdg_toplevel *xdg_toplevel_;
+    struct xdg_toplevel_listener xdgtop_listener_{
+            &handle_toplevel_configure, &handle_toplevel_close
+    };
+    bool wait_for_configure_ = true;
+
+    /* double buffered */
+    std::array<cv_wl_buffer, 2> buffers_;
+
+    bool next_frame_ready_ = true;          /* we can now commit a new buffer */
+    struct wl_callback *frame_callback_ = nullptr;
+    struct wl_callback_listener frame_listener_{
+            &handle_frame_callback
+    };
+
+    cv_wl_window_state state_;
+    struct {
+        bool repaint_request = false;  /* we need to redraw as soon as possible (some states are changed) */
+        bool resize_request = false;
+        cv::Size size{0, 0};
+    } pending_;
+
+    shared_ptr<cv_wl_viewer> viewer_;
+    std::vector<shared_ptr<cv_wl_widget>> widgets_;
+    std::vector<cv::Rect> widget_geometries_;
+
+    cv_wl_mouse_callback on_mouse_;
+
+    uint32_t mouse_enter_serial_{};
+    uint32_t mouse_button_serial_{};
+    struct {
+        std::string current_name;
+        cv_wl_cursor_theme theme;
+    } cursor_;
+
+    cv_wl_buffer *next_buffer();
+
+    void commit_buffer(cv_wl_buffer *buffer, cv::Rect const &);
+
+    void deliver_mouse_event(int event, cv::Point const &p, int flag);
+
+    std::tuple<cv::Size, std::vector<cv::Rect>> manage_widget_geometry(cv::Size const &new_size);
+
+    static void handle_surface_configure(void *data, struct xdg_surface *surface, uint32_t serial);
+
+    static void handle_toplevel_configure(void *, struct xdg_toplevel *, int32_t, int32_t, struct wl_array *);
+
+    static void handle_toplevel_close(void *data, struct xdg_toplevel *surface);
+
+    static void handle_frame_callback(void *data, struct wl_callback *cb, uint32_t time);
+};
+
+class cv_wl_core {
+public:
+    cv_wl_core();
+
+    ~cv_wl_core();
+
+    void init();
+
+    cv_wl_display &display();
+
+    std::vector<std::string> get_window_names() const;
+
+    shared_ptr<cv_wl_window> get_window(std::string const &name);
+
+    void *get_window_handle(std::string const &name);
+
+    std::string const &get_window_name(void *handle);
+
+    bool create_window(std::string const &name, int flags);
+
+    bool destroy_window(std::string const &name);
+
+    void destroy_all_windows();
+
+private:
+    shared_ptr<cv_wl_display> display_;
+    std::map<std::string, shared_ptr<cv_wl_window>> windows_;
+    std::map<void *, std::string> handles_;
+};
+
+
+/*
+ * cv_wl_display implementation
+ */
+cv_wl_display::cv_wl_display() {
+    init(nullptr);
+}
+
+cv_wl_display::cv_wl_display(std::string const &display) {
+    init(display.empty() ? nullptr : display.c_str());
+}
+
+cv_wl_display::~cv_wl_display() {
+    wl_shm_destroy(shm_);
+    xdg_wm_base_destroy(xdg_wm_base_);
+    wl_compositor_destroy(compositor_);
+    wl_registry_destroy(registry_);
+    wl_display_flush(display_);
+    input_.reset();
+    wl_display_disconnect(display_);
+}
+
+void cv_wl_display::init(const char *display) {
+    display_ = wl_display_connect(display);
+    if (!display_)
+        throw_system_error("Could not connect to display", errno)
+
+    registry_ = wl_display_get_registry(display_);
+    wl_registry_add_listener(registry_, &reg_listener_, this);
+    wl_display_roundtrip(display_);
+    if (!compositor_ || !shm_ || !xdg_wm_base_ || !input_)
+        CV_Error(StsInternal, "Compositor doesn't have required interfaces");
+
+    wl_display_roundtrip(display_);
+    if (!(formats_ & (1 << WL_SHM_FORMAT_XRGB8888)))
+        CV_Error(StsInternal, "WL_SHM_FORMAT_XRGB32 not available");
+
+    poller_.add(
+            wl_display_get_fd(display_),
+            EPOLLIN | EPOLLOUT | EPOLLERR | EPOLLHUP
+    );
+}
+
+int cv_wl_display::dispatch() {
+    return wl_display_dispatch(display_);
+}
+
+int cv_wl_display::dispatch_pending() {
+    return wl_display_dispatch_pending(display_);
+}
+
+int cv_wl_display::flush() {
+    return wl_display_flush(display_);
+}
+
+int cv_wl_display::run_once() {
+
+    auto d = this->display_;
+
+    while (wl_display_prepare_read(d) != 0) {
+        wl_display_dispatch_pending(d);
+    }
+    wl_display_flush(d);
+
+    wl_display_read_events(d);
+    return  wl_display_dispatch_pending(d);
+}
+
+struct wl_shm *cv_wl_display::shm() {
+    return shm_;
+}
+
+weak_ptr<cv_wl_input> cv_wl_display::input() {
+    return input_;
+}
+
+uint32_t cv_wl_display::formats() const {
+    return formats_;
+}
+
+struct wl_surface *cv_wl_display::get_surface() {
+    return wl_compositor_create_surface(compositor_);
+}
+
+struct xdg_surface *cv_wl_display::get_shell_surface(struct wl_surface *surface) {
+    return xdg_wm_base_get_xdg_surface(xdg_wm_base_, surface);
+}
+
+void cv_wl_display::handle_reg_global(void *data, struct wl_registry *reg, uint32_t name, const char *iface,
+                                      uint32_t version) {
+    std::string const interface = iface;
+    auto *display = reinterpret_cast<cv_wl_display *>(data);
+
+    if (interface == wl_compositor_interface.name) {
+        if (version >= 3) {
+            display->compositor_ = static_cast<struct wl_compositor *>(
+                    wl_registry_bind(reg, name,
+                                     &wl_compositor_interface,
+                                     std::min(static_cast<uint32_t>(3), version)));
+            display->buffer_scale_enable_ = true;
+        } else {
+            display->compositor_ = static_cast<struct wl_compositor *>(
+                    wl_registry_bind(reg, name,
+                                     &wl_compositor_interface,
+                                     std::min(static_cast<uint32_t>(2), version)));
+        }
+
+    } else if (interface == wl_shm_interface.name) {
+        display->shm_ = static_cast<struct wl_shm *>(
+                wl_registry_bind(reg, name,
+                                 &wl_shm_interface,
+                                 std::min(static_cast<uint32_t>(1), version)));
+        wl_shm_add_listener(display->shm_, &display->shm_listener_, display);
+    } else if (interface == xdg_wm_base_interface.name) {
+        display->xdg_wm_base_ = static_cast<struct xdg_wm_base *>(
+                wl_registry_bind(reg, name,
+                                 &xdg_wm_base_interface,
+                                 std::min(static_cast<uint32_t>(3), version)));
+        xdg_wm_base_add_listener(display->xdg_wm_base_, &display->xdg_wm_base_listener_, display);
+    } else if (interface == wl_seat_interface.name) {
+        auto *seat = static_cast<struct wl_seat *>(
+                wl_registry_bind(reg, name,
+                                 &wl_seat_interface,
+                                 std::min(static_cast<uint32_t>(4), version)));
+        display->input_ = std::make_shared<cv_wl_input>(seat);
+    }
+}
+
+void cv_wl_display::handle_reg_remove(void *data, struct wl_registry *wl_registry, uint32_t name) {
+    CV_UNUSED(data);
+    CV_UNUSED(wl_registry);
+    CV_UNUSED(name);
+}
+
+void cv_wl_display::handle_shm_format(void *data, struct wl_shm *wl_shm, uint32_t format) {
+    CV_UNUSED(wl_shm);
+    auto *display = reinterpret_cast<cv_wl_display *>(data);
+    display->formats_ |= (1 << format);
+}
+
+void cv_wl_display::handle_shell_ping(void *data, struct xdg_wm_base *shell, uint32_t serial) {
+    CV_UNUSED(data);
+    xdg_wm_base_pong(shell, serial);
+}
+
+
+/*
+ * cv_wl_mouse implementation
+ */
+cv_wl_mouse::cv_wl_mouse(struct wl_pointer *pointer)
+        : pointer_(pointer) {
+    wl_pointer_add_listener(pointer_, &pointer_listener_, this);
+}
+
+cv_wl_mouse::~cv_wl_mouse() {
+    wl_pointer_destroy(pointer_);
+}
+
+void cv_wl_mouse::set_cursor(uint32_t serial, struct wl_surface *surface, int32_t hotspot_x, int32_t hotspot_y) {
+    wl_pointer_set_cursor(pointer_, serial, surface, hotspot_x, hotspot_y);
+}
+
+void cv_wl_mouse::handle_pointer_enter(void *data, struct wl_pointer *pointer,
+                                       uint32_t serial, struct wl_surface *surface, wl_fixed_t sx, wl_fixed_t sy) {
+    CV_UNUSED(pointer);
+    int x = wl_fixed_to_int(sx);
+    int y = wl_fixed_to_int(sy);
+    auto *mouse = reinterpret_cast<cv_wl_mouse *>(data);
+    auto *window = reinterpret_cast<cv_wl_window *>(wl_surface_get_user_data(surface));
+
+    mouse->focus_window_ = window;
+    mouse->focus_window_->mouse_enter(cv::Point(x, y), serial);
+}
+
+void cv_wl_mouse::handle_pointer_leave(void *data,
+                                       struct wl_pointer *pointer, uint32_t serial, struct wl_surface *surface) {
+    CV_UNUSED(pointer);
+    CV_UNUSED(serial);
+    CV_UNUSED(surface);
+    auto *mouse = reinterpret_cast<cv_wl_mouse *>(data);
+
+    mouse->focus_window_->mouse_leave();
+    mouse->focus_window_ = nullptr;
+}
+
+void cv_wl_mouse::handle_pointer_motion(void *data,
+                                        struct wl_pointer *pointer, uint32_t time, wl_fixed_t sx, wl_fixed_t sy) {
+    CV_UNUSED(pointer);
+    CV_UNUSED(time);
+    int x = wl_fixed_to_int(sx);
+    int y = wl_fixed_to_int(sy);
+    auto *mouse = reinterpret_cast<cv_wl_mouse *>(data);
+
+    mouse->focus_window_->mouse_motion(time, cv::Point(x, y));
+}
+
+void cv_wl_mouse::handle_pointer_button(void *data, struct wl_pointer *wl_pointer,
+                                        uint32_t serial, uint32_t time, uint32_t button, uint32_t state) {
+    CV_UNUSED(data);
+    CV_UNUSED(wl_pointer);
+    auto *mouse = reinterpret_cast<cv_wl_mouse *>(data);
+
+    mouse->focus_window_->mouse_button(
+            time, button,
+            static_cast<wl_pointer_button_state>(state),
+            serial
+    );
+}
+
+void cv_wl_mouse::handle_pointer_axis(void *data, struct wl_pointer *wl_pointer,
+                                      uint32_t time, uint32_t axis, wl_fixed_t value) {
+    CV_UNUSED(data);
+    CV_UNUSED(wl_pointer);
+    CV_UNUSED(time);
+    CV_UNUSED(axis);
+    CV_UNUSED(value);
+    /* TODO: Support scroll events */
+}
+
+
+/*
+ * cv_wl_keyboard implementation
+ */
+cv_wl_keyboard::cv_wl_keyboard(struct wl_keyboard *keyboard)
+        : keyboard_(keyboard) {
+    xkb_.ctx = xkb_context_new(XKB_CONTEXT_NO_FLAGS);
+    if (!xkb_.ctx)
+        CV_Error(StsNoMem, "Failed to create xkb context");
+    wl_keyboard_add_listener(keyboard_, &keyboard_listener_, this);
+}
+
+cv_wl_keyboard::~cv_wl_keyboard() {
+    if (xkb_.state)
+        xkb_state_unref(xkb_.state);
+    if (xkb_.keymap)
+        xkb_keymap_unref(xkb_.keymap);
+    if (xkb_.ctx)
+        xkb_context_unref(xkb_.ctx);
+    wl_keyboard_destroy(keyboard_);
+}
+
+uint32_t cv_wl_keyboard::get_modifiers() const {
+    return modifiers_;
+}
+
+std::queue<int> cv_wl_keyboard::get_key_queue() {
+    return std::move(key_queue_);
+}
+
+void cv_wl_keyboard::handle_kb_keymap(void *data, struct wl_keyboard *kb, uint32_t format, int fd, uint32_t size) {
+    CV_UNUSED(kb);
+    auto *keyboard = reinterpret_cast<cv_wl_keyboard *>(data);
+
+    try {
+        if (format != WL_KEYBOARD_KEYMAP_FORMAT_XKB_V1)
+            CV_Error(StsInternal, "XKB_V1 keymap format unavailable");
+
+        char *map_str = (char *) mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0);
+        if (map_str == MAP_FAILED)
+            CV_Error(StsInternal, "Failed to mmap keymap");
+
+        keyboard->xkb_.keymap = xkb_keymap_new_from_string(
+                keyboard->xkb_.ctx, map_str, XKB_KEYMAP_FORMAT_TEXT_V1, XKB_KEYMAP_COMPILE_NO_FLAGS);
+        munmap(map_str, size);
+        if (!keyboard->xkb_.keymap)
+            CV_Error(StsInternal, "Failed to compile keymap");
+
+        keyboard->xkb_.state = xkb_state_new(keyboard->xkb_.keymap);
+        if (!keyboard->xkb_.state)
+            CV_Error(StsNoMem, "Failed to create XKB state");
+
+        keyboard->xkb_.control_mask =
+                1 << xkb_keymap_mod_get_index(keyboard->xkb_.keymap, "Control");
+        keyboard->xkb_.alt_mask =
+                1 << xkb_keymap_mod_get_index(keyboard->xkb_.keymap, "Mod1");
+        keyboard->xkb_.shift_mask =
+                1 << xkb_keymap_mod_get_index(keyboard->xkb_.keymap, "Shift");
+    } catch (std::exception &e) {
+        if (keyboard->xkb_.keymap)
+            xkb_keymap_unref(keyboard->xkb_.keymap);
+        std::cerr << "OpenCV Error: " << e.what() << std::endl;
+    }
+
+    close(fd);
+}
+
+void
+cv_wl_keyboard::handle_kb_enter(void *data, struct wl_keyboard *keyboard, uint32_t serial, struct wl_surface *surface,
+                                struct wl_array *keys) {
+    CV_UNUSED(data);
+    CV_UNUSED(keyboard);
+    CV_UNUSED(serial);
+    CV_UNUSED(surface);
+    CV_UNUSED(keys);
+}
+
+void
+cv_wl_keyboard::handle_kb_leave(void *data, struct wl_keyboard *keyboard, uint32_t serial, struct wl_surface *surface) {
+    CV_UNUSED(data);
+    CV_UNUSED(keyboard);
+    CV_UNUSED(serial);
+    CV_UNUSED(surface);
+}
+
+void
+cv_wl_keyboard::handle_kb_key(void *data, struct wl_keyboard *keyboard, uint32_t serial, uint32_t time, uint32_t key,
+                              uint32_t state) {
+    CV_UNUSED(keyboard);
+    CV_UNUSED(serial);
+    CV_UNUSED(time);
+    auto *kb = reinterpret_cast<cv_wl_keyboard *>(data);
+    xkb_keycode_t keycode = key + 8;
+
+    if (state == WL_KEYBOARD_KEY_STATE_RELEASED) {
+        xkb_keysym_t keysym = xkb_state_key_get_one_sym(kb->xkb_.state, keycode);
+        kb->key_queue_.push(xkb_keysym_to_ascii(keysym));
+    }
+}
+
+void cv_wl_keyboard::handle_kb_modifiers(void *data, struct wl_keyboard *keyboard,
+                                         uint32_t serial, uint32_t mods_depressed,
+                                         uint32_t mods_latched, uint32_t mods_locked,
+                                         uint32_t group) {
+    CV_UNUSED(keyboard);
+    CV_UNUSED(serial);
+    auto *kb = reinterpret_cast<cv_wl_keyboard *>(data);
+
+    if (!kb->xkb_.keymap)
+        return;
+
+    xkb_state_update_mask(
+            kb->xkb_.state, mods_depressed,
+            mods_latched, mods_locked, 0, 0, group
+    );
+
+    xkb_mod_mask_t mask = xkb_state_serialize_mods(
+            kb->xkb_.state,
+            static_cast<xkb_state_component>(
+                    XKB_STATE_MODS_DEPRESSED | XKB_STATE_MODS_LATCHED)
+    );
+
+    kb->modifiers_ = 0;
+    if (mask & kb->xkb_.control_mask)
+        kb->modifiers_ |= cv_wl_keyboard::MOD_CONTROL_MASK;
+    if (mask & kb->xkb_.alt_mask)
+        kb->modifiers_ |= cv_wl_keyboard::MOD_ALT_MASK;
+    if (mask & kb->xkb_.shift_mask)
+        kb->modifiers_ |= cv_wl_keyboard::MOD_SHIFT_MASK;
+}
+
+void cv_wl_keyboard::handle_kb_repeat(void *data, struct wl_keyboard *wl_keyboard, int32_t rate, int32_t delay) {
+    CV_UNUSED(data);
+    CV_UNUSED(wl_keyboard);
+    CV_UNUSED(rate);
+    CV_UNUSED(delay);
+}
+
+
+/*
+ * cv_wl_input implementation
+ */
+cv_wl_input::cv_wl_input(struct wl_seat *seat)
+        : seat_(seat) {
+    wl_seat_add_listener(seat_, &seat_listener_, this);
+}
+
+cv_wl_input::~cv_wl_input() {
+    mouse_.reset();
+    keyboard_.reset();
+    wl_seat_destroy(seat_);
+}
+
+weak_ptr<cv_wl_mouse> cv_wl_input::mouse() {
+    return mouse_;
+}
+
+weak_ptr<cv_wl_keyboard> cv_wl_input::keyboard() {
+    return keyboard_;
+}
+
+void cv_wl_input::handle_seat_capabilities(void *data, struct wl_seat *wl_seat, uint32_t caps) {
+    CV_UNUSED(wl_seat);
+    auto *input = reinterpret_cast<cv_wl_input *>(data);
+
+    if (caps & WL_SEAT_CAPABILITY_POINTER) {
+        struct wl_pointer *pointer = wl_seat_get_pointer(input->seat_);
+        input->mouse_ = std::make_shared<cv_wl_mouse>(pointer);
+    }
+
+    if (caps & WL_SEAT_CAPABILITY_KEYBOARD) {
+        struct wl_keyboard *keyboard = wl_seat_get_keyboard(input->seat_);
+        input->keyboard_ = std::make_shared<cv_wl_keyboard>(keyboard);
+    }
+}
+
+void cv_wl_input::handle_seat_name(void *data, struct wl_seat *wl_seat, const char *name) {
+    CV_UNUSED(data);
+    CV_UNUSED(wl_seat);
+    CV_UNUSED(name);
+}
+
+
+/*
+ * cv_wl_buffer implementation
+ */
+cv_wl_buffer::cv_wl_buffer()
+= default;
+
+cv_wl_buffer::~cv_wl_buffer() {
+    this->destroy();
+}
+
+void cv_wl_buffer::destroy() {
+    if (buffer_) {
+        wl_buffer_destroy(buffer_);
+        buffer_ = nullptr;
+    }
+
+    if (fd_ >= 0) {
+        close(fd_);
+        fd_ = -1;
+    }
+
+    if (shm_data_ && shm_data_ != MAP_FAILED) {
+        munmap(shm_data_, size_.area() * 4);
+        shm_data_ = nullptr;
+    }
+
+    size_.width = size_.height = 0;
+}
+
+void cv_wl_buffer::busy(bool busy) {
+    busy_ = busy;
+}
+
+bool cv_wl_buffer::is_busy() const {
+    return busy_;
+}
+
+cv::Size cv_wl_buffer::size() const {
+    return size_;
+}
+
+bool cv_wl_buffer::is_allocated() const {
+    return buffer_ && shm_data_;
+}
+
+char *cv_wl_buffer::data() {
+    return (char *) shm_data_;
+}
+
+void cv_wl_buffer::create_shm(struct wl_shm *shm, cv::Size size, uint32_t format) {
+    this->destroy();
+
+    size_ = size;
+    int stride = size_.width * 4;
+    int buffer_size = stride * size_.height;
+
+    fd_ = cv_wl_buffer::create_anonymous_file(buffer_size);
+
+    shm_data_ = mmap(nullptr, buffer_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, 0);
+    if (shm_data_ == MAP_FAILED) {
+        int errno_ = errno;
+        this->destroy();
+        throw_system_error("failed to map shm", errno_)
+    }
+
+    struct wl_shm_pool *pool = wl_shm_create_pool(shm, fd_, buffer_size);
+    buffer_ = wl_shm_pool_create_buffer(pool, 0, size_.width, size_.height, stride, format);
+    wl_buffer_add_listener(buffer_, &buffer_listener_, this);
+    wl_shm_pool_destroy(pool);
+}
+
+void cv_wl_buffer::attach_to_surface(struct wl_surface *surface, int32_t x, int32_t y) {
+    wl_surface_attach(surface, buffer_, x, y);
+    this->busy();
+}
+
+int cv_wl_buffer::create_tmpfile(std::string const &tmpname) {
+    std::vector<char> filename(tmpname.begin(), tmpname.end());
+    filename.push_back('\0');
+
+    int fd = mkostemp(filename.data(), O_CLOEXEC);
+    if (fd >= 0)
+        unlink(filename.data());
+    else
+        CV_Error_(StsInternal,
+                  ("Failed to create a tmp file: %s: %s",
+                          tmpname.c_str(), strerror(errno)));
+    return fd;
+}
+
+int cv_wl_buffer::create_anonymous_file(off_t size) {
+    auto path = getenv("XDG_RUNTIME_DIR") + std::string("/opencv-shared-XXXXXX");
+    int fd = create_tmpfile(path);
+
+    int ret = posix_fallocate(fd, 0, size);
+    if (ret != 0) {
+        close(fd);
+        throw_system_error("Failed to fallocate shm", errno)
+    }
+
+    return fd;
+}
+
+void cv_wl_buffer::handle_buffer_release(void *data, struct wl_buffer *buffer) {
+    CV_UNUSED(buffer);
+    auto *cvbuf = reinterpret_cast<cv_wl_buffer *>(data);
+
+    cvbuf->busy(false);
+}
+
+
+/*
+ * cv_wl_cursor implementation
+ */
+cv_wl_cursor::cv_wl_cursor(weak_ptr<cv_wl_display> const &display, struct wl_cursor *cursor, std::string name)
+        : name_(std::move(name)), cursor_(cursor) {
+    surface_ = display.lock()->get_surface();
+}
+
+cv_wl_cursor::~cv_wl_cursor() {
+    if (frame_callback_)
+        wl_callback_destroy(frame_callback_);
+    wl_surface_destroy(surface_);
+}
+
+std::string const &cv_wl_cursor::name() const {
+    return name_;
+}
+
+void cv_wl_cursor::set_to_mouse(cv_wl_mouse &mouse, uint32_t serial) {
+    auto *cursor_img = cursor_->images[0];
+    mouse.set_cursor(serial, surface_, static_cast<int32_t>(cursor_img->hotspot_x),
+                     static_cast<int32_t>(cursor_img->hotspot_y));
+}
+
+void cv_wl_cursor::commit(int image_index) {
+    auto *cursor_img = cursor_->images[image_index];
+    auto *cursor_buffer = wl_cursor_image_get_buffer(cursor_img);
+
+    if (cursor_->image_count > 1) {
+        if (frame_callback_)
+            wl_callback_destroy(frame_callback_);
+
+        frame_callback_ = wl_surface_frame(surface_);
+        wl_callback_add_listener(frame_callback_, &frame_listener_, this);
+    }
+
+    wl_surface_attach(surface_, cursor_buffer, 0, 0);
+    wl_surface_damage(surface_, 0, 0, static_cast<int32_t>(cursor_img->width),
+                      static_cast<int32_t>(cursor_img->height));
+    wl_surface_commit(surface_);
+}
+
+void cv_wl_cursor::handle_cursor_frame(void *data, struct wl_callback *cb, uint32_t time) {
+    CV_UNUSED(cb);
+    auto *cursor = (struct cv_wl_cursor *) data;
+    int image_index = wl_cursor_frame(cursor->cursor_, time);
+
+    cursor->commit(image_index);
+}
+
+
+/*
+ * cv_wl_cursor_theme implementation
+ */
+cv_wl_cursor_theme::cv_wl_cursor_theme(weak_ptr<cv_wl_display> const &display, std::string const &theme, int size)
+        : size_(size), name_(theme), display_(display), cursors_() {
+    cursor_theme_ = wl_cursor_theme_load(theme.c_str(), size, display.lock()->shm());
+    if (!cursor_theme_)
+        CV_Error_(StsInternal, ("Couldn't load cursor theme: %s", theme.c_str()));
+}
+
+cv_wl_cursor_theme::~cv_wl_cursor_theme() {
+    if (cursor_theme_)
+        wl_cursor_theme_destroy(cursor_theme_);
+}
+
+int cv_wl_cursor_theme::size() const {
+    return size_;
+}
+
+std::string const &cv_wl_cursor_theme::name() const {
+    return name_;
+}
+
+weak_ptr<cv_wl_cursor> cv_wl_cursor_theme::get_cursor(std::string const &name) {
+    if (cursors_.count(name) == 1)
+        return cursors_[name];
+
+    auto *wlcursor = wl_cursor_theme_get_cursor(cursor_theme_, name.c_str());
+    if (!wlcursor)
+        CV_Error_(StsInternal, ("Couldn't load cursor: %s", name.c_str()));
+
+    auto cursor =
+            shared_ptr<cv_wl_cursor>(new cv_wl_cursor(display_, wlcursor, name));
+    if (!cursor)
+        CV_Error_(StsInternal, ("Couldn't allocate memory for cursor: %s", name.c_str()));
+
+    cursors_[name] = cursor;
+
+    return cursor;
+}
+
+
+/*
+ * cv_wl_titlebar implementation
+ */
+cv_wl_titlebar::cv_wl_titlebar(cv_wl_window *window) : cv_wl_widget(window) {
+    CV_UNUSED(window);
+}
+
+void cv_wl_titlebar::get_preferred_width(int &minimum, int &natural) const {
+    minimum = natural = titlebar_min_width;
+}
+
+void cv_wl_titlebar::get_preferred_height_for_width(int width, int &minimum, int &natural) const {
+    CV_UNUSED(width);
+    minimum = natural = titlebar_min_height;
+}
+
+void cv_wl_titlebar::on_mouse(int event, cv::Point const &p, int flag) {
+    CV_UNUSED(flag);
+    if (event == cv::EVENT_LBUTTONDOWN) {
+        if (btn_close_.contains(p)) {
+            exit(EXIT_SUCCESS);
+        } else if (btn_max_.contains(p)) {
+            window_->set_maximized(!window_->state().maximized);
+        } else if (btn_min_.contains(p)) {
+            window_->set_minimized();
+        } else {
+            window_->update_cursor(p, true);
+            window_->interactive_move();
+        }
+    }
+}
+
+void cv_wl_titlebar::calc_button_geometry(cv::Size const &size) {
+    /* Basic button geoemetries */
+    cv::Size btn_size = cv::Size(btn_width, size.height);
+    btn_close_ = cv::Rect(cv::Point(size.width - 5 - btn_size.width, 0), btn_size);
+    btn_max_ = cv::Rect(cv::Point(btn_close_.x - btn_size.width, 0), btn_size);
+    btn_min_ = cv::Rect(cv::Point(btn_max_.x - btn_size.width, 0), btn_size);
+}
+
+cv::Rect cv_wl_titlebar::draw(void *data, cv::Size const &size, bool force) {
+    auto damage = cv::Rect(0, 0, 0, 0);
+
+    if (force || last_size_ != size || last_title_ != window_->get_title()) {
+        buf_ = cv::Mat(size, CV_8UC3, bg_color_);
+        this->calc_button_geometry(size);
+
+        auto const margin = cv::Point(btn_max_x, btn_max_y);
+        auto const btn_cls = cv::Rect(btn_close_.tl() + margin, btn_close_.br() - margin);
+        auto const btn_max = cv::Rect(btn_max_.tl() + margin, btn_max_.br() - margin);
+        auto title_area = cv::Rect(0, 0, size.width - titlebar_min_width, size.height);
+
+        auto text = cv::getTextSize(window_->get_title(), title_.face, title_.scale, title_.thickness,
+                                    &title_.baseline);
+        if (text.area() <= title_area.area()) {
+            auto origin = cv::Point(0, (size.height + text.height) / 2);
+            origin.x = ((title_area.width >= (size.width + text.width) / 2) ?
+                        (size.width - text.width) / 2 : (title_area.width - text.width) / 2);
+            cv::putText(
+                    buf_, window_->get_title(),
+                    origin, title_.face, title_.scale,
+                    CV_RGB(0xff, 0xff, 0xff), title_.thickness, CV_AA
+            );
+        }
+
+        buf_(cv::Rect(btn_min_.tl(), cv::Size(titlebar_min_width, size.height))) = bg_color_;
+        cv::line(buf_, btn_cls.tl(), btn_cls.br(), line_color_, 1, CV_AA);
+        cv::line(buf_, btn_cls.tl() + cv::Point(btn_cls.width, 0), btn_cls.br() - cv::Point(btn_cls.width, 0),
+                 line_color_, 1, CV_AA);
+        cv::rectangle(buf_, btn_max.tl(), btn_max.br(), line_color_, 1, CV_AA);
+        cv::line(buf_, cv::Point(btn_min_.x + 8, btn_min_.height / 2),
+                 cv::Point(btn_min_.x + btn_min_.width - 8, btn_min_.height / 2), line_color_, 1, CV_AA);
+        cv::line(buf_, cv::Point(0, 0), cv::Point(buf_.size().width, 0), border_color_, 1, CV_AA);
+
+        write_mat_to_xrgb8888(buf_, data);
+        last_size_ = size;
+        last_title_ = window_->get_title();
+        damage = cv::Rect(cv::Point(0, 0), size);
+    }
+
+    return damage;
+}
+
+
+/*
+ * cv_wl_viewer implementation
+ */
+cv_wl_viewer::cv_wl_viewer(cv_wl_window *window, int flags)
+        : cv_wl_widget(window), flags_(flags) {
+}
+
+void cv_wl_viewer::set_image(cv::Mat const &image) {
+    if (image.type() == CV_8UC1) {
+        cv::Mat bgr;
+        cv::cvtColor(image, bgr, CV_GRAY2BGR);
+        image_ = bgr.clone();
+    } else {
+        image_ = image.clone();
+    }
+    image_changed_ = true;
+}
+
+void cv_wl_viewer::set_mouse_callback(CvMouseCallback callback, void *param) {
+    param_ = param;
+    callback_ = callback;
+}
+
+void cv_wl_viewer::get_preferred_width(int &minimum, int &natural) const {
+    if (image_.size().area() == 0) {
+        minimum = natural = 0;
+    } else {
+        natural = image_.size().width;
+        minimum = (flags_ == cv::WINDOW_AUTOSIZE ? natural : 0);
+    }
+}
+
+static double aspect_ratio(cv::Size const &size) {
+    return (double) size.height / (double) size.width;
+}
+
+void cv_wl_viewer::get_preferred_height_for_width(int width, int &minimum, int &natural) const {
+    if (image_.size().area() == 0) {
+        minimum = natural = 0;
+    } else if (flags_ == cv::WINDOW_AUTOSIZE) {
+        CV_Assert(width == image_.size().width);
+        minimum = natural = image_.size().height;
+    } else {
+        natural = static_cast<int>(width * aspect_ratio(image_.size()));
+        minimum = (flags_ & CV_WINDOW_FREERATIO ? 0 : natural);
+    }
+}
+
+void cv_wl_viewer::on_mouse(int event, cv::Point const &p, int flag) {
+    // Make sure the first mouse event is delivered to clients
+    static int last_event = ~event, last_flag = ~flag;
+    static auto last_event_time = ch::steady_clock::now();
+
+    if (callback_) {
+        auto now = ch::steady_clock::now();
+        auto elapsed = ch::duration_cast<ch::milliseconds>(now - last_event_time);
+
+        /* Inhibit the too frequent mouse callback due to the heavy load */
+        if (event != last_event || flag != last_flag ||
+            elapsed.count() >= MOUSE_CALLBACK_MIN_INTERVAL_MILLISEC) {
+            last_event = event;
+            last_flag = flag;
+            last_event_time = now;
+
+            /* Scale the coordinate to match the client's image coordinate */
+            int x = static_cast<int>((p.x - last_img_area_.x) * ((double) image_.size().width / last_img_area_.width));
+            int y = static_cast<int>((p.y - last_img_area_.y) *
+                                     ((double) image_.size().height / last_img_area_.height));
+            callback_(event, x, y, flag, param_);
+        }
+    }
+}
+
+cv::Rect cv_wl_viewer::draw(void *data, cv::Size const &size, bool force) {
+    if ((!force && !image_changed_ && last_size_ == size) || image_.size().area() == 0 || size.area() == 0)
+        return {0, 0, 0, 0};
+
+    last_img_area_ = cv::Rect(cv::Point(0, 0), size);
+
+    if (flags_ == cv::WINDOW_AUTOSIZE || image_.size() == size) {
+        CV_Assert(image_.size() == size);
+        write_mat_to_xrgb8888(image_, data);
+    } else {
+        if (flags_ & CV_WINDOW_FREERATIO) {
+            cv::Mat resized;
+            cv::resize(image_, resized, size);
+            write_mat_to_xrgb8888(resized, data);
+        } else /* CV_WINDOW_KEEPRATIO */ {
+            auto rect = cv::Rect(cv::Point(0, 0), size);
+            if (aspect_ratio(size) >= aspect_ratio(image_.size())) {
+                rect.height = static_cast<int>(image_.size().height * ((double) rect.width / image_.size().width));
+            } else {
+                rect.height = size.height;
+                rect.width = static_cast<int>(image_.size().width * ((double) rect.height / image_.size().height));
+            }
+            rect.x = (size.width - rect.width) / 2;
+            rect.y = (size.height - rect.height) / 2;
+
+            auto buf = cv::Mat(size, image_.type(), CV_RGB(0xa4, 0xa4, 0xa4));
+            auto resized = buf(rect);
+            cv::resize(image_, resized, rect.size());
+            write_mat_to_xrgb8888(buf, data);
+
+            last_img_area_ = rect;
+        }
+    }
+
+    last_size_ = size;
+    image_changed_ = false;
+
+    return {{0, 0}, size};
+}
+
+
+/*
+ * cv_wl_trackbar implementation
+ */
+cv_wl_trackbar::cv_wl_trackbar(cv_wl_window *window, std::string name,
+                               int *value, int count, CvTrackbarCallback2 on_change, void *data)
+        : cv_wl_widget(window), name_(std::move(name)), count_(count) {
+    on_change_.value = value;
+    on_change_.data = data;
+    on_change_.callback = on_change;
+}
+
+std::string const &cv_wl_trackbar::name() const {
+    return name_;
+}
+
+int cv_wl_trackbar::get_pos() const {
+    return slider_.value;
+}
+
+void cv_wl_trackbar::set_pos(int value) {
+    if (0 <= value && value <= count_) {
+        slider_.value = value;
+        slider_moved_ = true;
+        window_->show();
+    }
+}
+
+void cv_wl_trackbar::set_max(int maxval) {
+    count_ = maxval;
+    if (!(0 <= slider_.value && slider_.value <= count_)) {
+        slider_.value = maxval;
+        slider_moved_ = true;
+        window_->show();
+    }
+}
+
+void cv_wl_trackbar::get_preferred_width(int &minimum, int &natural) const {
+    minimum = natural = 320;
+}
+
+void cv_wl_trackbar::get_preferred_height_for_width(int width, int &minimum, int &natural) const {
+    CV_UNUSED(width);
+    minimum = natural = 40;
+}
+
+void cv_wl_trackbar::prepare_to_draw() {
+    bar_.text_size = cv::getTextSize(
+            name_ + ": " + std::to_string(count_), bar_.fontface,
+            bar_.fontscale, bar_.font_thickness, nullptr);
+    bar_.text_orig = cv::Point(2, (size_.height + bar_.text_size.height) / 2);
+    bar_.left = cv::Point(bar_.text_size.width + 10, size_.height / 2);
+    bar_.right = cv::Point(size_.width - bar_.margin - 1, size_.height / 2);
+
+    int slider_pos_x = static_cast<int>(((double) bar_.length() / count_ * slider_.value));
+    slider_.pos = cv::Point(bar_.left.x + slider_pos_x, bar_.left.y);
+}
+
+cv::Rect cv_wl_trackbar::draw(void *data, cv::Size const &size, bool force) {
+    auto damage = cv::Rect(0, 0, 0, 0);
+
+    if (slider_moved_) {
+        on_change_.update(slider_.value);
+        on_change_.call(slider_.value);
+    }
+
+    if (slider_moved_ || force) {
+        size_ = last_size_ = size;
+
+        if (size_ == data_.size())
+            data_ = CV_RGB(0xde, 0xde, 0xde);
+        else
+            data_ = cv::Mat(size_, CV_8UC3, CV_RGB(0xde, 0xde, 0xde));
+
+        this->prepare_to_draw();
+        cv::putText(
+                data_,
+                (name_ + ": " + std::to_string(slider_.value)),
+                bar_.text_orig, bar_.fontface, bar_.fontscale,
+                CV_RGB(0x00, 0x00, 0x00), bar_.font_thickness, CV_AA);
+
+        cv::line(data_, bar_.left, bar_.right, color_.bg, bar_.thickness + 3, CV_AA);
+        cv::line(data_, bar_.left, bar_.right, color_.fg, bar_.thickness, CV_AA);
+        cv::circle(data_, slider_.pos, slider_.radius, color_.fg, -1, CV_AA);
+        cv::circle(data_, slider_.pos, slider_.radius, color_.bg, 1, CV_AA);
+
+        write_mat_to_xrgb8888(data_, data);
+        damage = cv::Rect(cv::Point(0, 0), size);
+        slider_moved_ = false;
+    }
+
+    return damage;
+}
+
+void cv_wl_trackbar::on_mouse(int event, cv::Point const &p, int flag) {
+    switch (event) {
+        case cv::EVENT_LBUTTONDOWN:
+            slider_.drag = true;
+            window_->update_cursor(p, true);
+            break;
+        case cv::EVENT_MOUSEMOVE:
+            if (!(flag & cv::EVENT_FLAG_LBUTTON))
+                break;
+            break;
+        case cv::EVENT_LBUTTONUP:
+            if (slider_.drag && bar_.left.x <= p.x && p.x <= bar_.right.x) {
+                slider_.value = static_cast<int>((double) (p.x - bar_.left.x) / bar_.length() * count_);
+                slider_moved_ = true;
+                window_->show();
+                slider_.drag = (event != cv::EVENT_LBUTTONUP);
+            }
+            break;
+        default:
+            break;
+    }
+}
+
+
+/*
+ * cv_wl_window implementation
+ */
+cv_wl_window::cv_wl_window(shared_ptr<cv_wl_display> const &display, std::string title, int flags)
+        : title_(std::move(title)), display_(display),
+          surface_(display->get_surface()),
+          cursor_{{},
+                  {display, "default", DEFAULT_CURSOR_SIZE}} {
+    xdg_surface_ = display->get_shell_surface(surface_);
+    if (!xdg_surface_)
+        CV_Error(StsInternal, "Failed to get xdg_surface");
+    xdg_surface_add_listener(xdg_surface_, &xdgsurf_listener_, this);
+
+    xdg_toplevel_ = xdg_surface_get_toplevel(xdg_surface_);
+    if (!xdg_toplevel_)
+        CV_Error(StsInternal, "Failed to get xdg_toplevel");
+    xdg_toplevel_add_listener(xdg_toplevel_, &xdgtop_listener_, this);
+    xdg_toplevel_set_title(xdg_toplevel_, title_.c_str());
+
+    wl_surface_set_user_data(surface_, this);
+
+    widgets_.push_back(std::make_shared<cv_wl_titlebar>(this));
+    widget_geometries_.emplace_back(0, 0, 0, 0);
+
+    viewer_ = std::make_shared<cv_wl_viewer>(this, flags);
+    widget_geometries_.emplace_back(0, 0, 0, 0);
+
+    wl_surface_commit(surface_);
+}
+
+cv_wl_window::~cv_wl_window() {
+    if (frame_callback_)
+        wl_callback_destroy(frame_callback_);
+    xdg_toplevel_destroy(xdg_toplevel_);
+    xdg_surface_destroy(xdg_surface_);
+    wl_surface_destroy(surface_);
+}
+
+cv::Size cv_wl_window::get_size() const {
+    return size_;
+}
+
+std::string const &cv_wl_window::get_title() const {
+    return title_;
+}
+
+void cv_wl_window::set_title(std::string const &title) {
+    title_ = title;
+    xdg_toplevel_set_title(xdg_toplevel_, title_.c_str());
+}
+
+cv_wl_window_state const &cv_wl_window::state() const {
+    return state_;
+}
+
+cv_wl_buffer *cv_wl_window::next_buffer() {
+    cv_wl_buffer *buffer = nullptr;
+
+    if (!buffers_.at(0).is_busy())
+        buffer = &buffers_[0];
+    else if (!buffers_.at(1).is_busy())
+        buffer = &buffers_[1];
+
+    return buffer;
+}
+
+void cv_wl_window::set_mouse_callback(CvMouseCallback on_mouse, void *param) {
+    viewer_->set_mouse_callback(on_mouse, param);
+}
+
+void cv_wl_window::set_minimized() {
+    xdg_toplevel_set_minimized(xdg_toplevel_);
+}
+
+void cv_wl_window::set_maximized(bool maximize) {
+    if (!maximize)
+        xdg_toplevel_unset_maximized(xdg_toplevel_);
+    else if (viewer_->get_flags() != cv::WINDOW_AUTOSIZE)
+        xdg_toplevel_set_maximized(xdg_toplevel_);
+}
+
+void cv_wl_window::show_image(cv::Mat const &image) {
+    viewer_->set_image(image);
+    this->show();
+}
+
+void cv_wl_window::create_trackbar(std::string const &name, int *value, int count, CvTrackbarCallback2 on_change,
+                                   void *userdata) {
+    auto exists = this->get_trackbar(name).lock();
+    if (!exists) {
+        auto trackbar =
+                std::make_shared<cv_wl_trackbar>(
+                        this, name, value, count, on_change, userdata
+                );
+        widgets_.emplace_back(trackbar);
+        widget_geometries_.emplace_back(0, 0, 0, 0);
+    }
+}
+
+weak_ptr<cv_wl_trackbar> cv_wl_window::get_trackbar(std::string const &trackbar_name) const {
+    auto it = std::find_if(widgets_.begin(), widgets_.end(),
+                           [&trackbar_name](shared_ptr<cv_wl_widget> const &widget) {
+                               if (auto trackbar = std::dynamic_pointer_cast<cv_wl_trackbar>(widget))
+                                   return trackbar->name() == trackbar_name;
+                               return false;
+                           });
+    return it == widgets_.end() ? shared_ptr<cv_wl_trackbar>()
+                                : std::static_pointer_cast<cv_wl_trackbar>(*it);
+}
+
+static void calculate_damage(cv::Rect &surface_damage,
+                             cv::Rect const &widget_geometry, cv::Rect const &w_damage) {
+    if (w_damage.area() == 0)
+        return;
+
+    auto widget_damage = w_damage;
+    widget_damage.x += widget_geometry.x;
+    widget_damage.y += widget_geometry.y;
+
+    if (surface_damage.area() == 0) {
+        surface_damage = widget_damage;
+    } else {
+        auto damage = cv::Rect(0, 0, 0, 0);
+        damage.x = std::min(surface_damage.x, widget_damage.x);
+        damage.y = std::min(surface_damage.y, widget_damage.y);
+        damage.width =
+                std::max(surface_damage.x + surface_damage.width, widget_damage.x + widget_damage.width) - damage.x;
+        damage.height =
+                std::max(surface_damage.y + surface_damage.height, widget_damage.y + widget_damage.height) - damage.y;
+
+        surface_damage = damage;
+    }
+}
+
+std::tuple<cv::Size, std::vector<cv::Rect>>
+cv_wl_window::manage_widget_geometry(cv::Size const &new_size) {
+    std::vector<cv::Rect> geometries;
+
+    std::vector<int> min_widths, nat_widths;
+    int min_width, nat_width, min_height, nat_height;
+
+    auto store_preferred_width = [&](shared_ptr<cv_wl_widget> const &widget) {
+        widget->get_preferred_width(min_width, nat_width);
+        min_widths.push_back(min_width);
+        nat_widths.push_back(nat_width);
+    };
+
+    store_preferred_width(viewer_);
+    for (auto &widget: widgets_)
+        store_preferred_width(widget);
+
+    int final_width = 0;
+    int total_height = 0;
+    std::function<void(shared_ptr<cv_wl_widget> const &, int, bool)> calc_geometries;
+
+    auto calc_autosize_geo = [&](shared_ptr<cv_wl_widget> const &widget, int width, bool) {
+        widget->get_preferred_height_for_width(width, min_height, nat_height);
+        geometries.emplace_back(0, total_height, width, nat_height);
+        total_height += nat_height;
+    };
+    auto calc_normal_geo = [&](shared_ptr<cv_wl_widget> const &widget, int width, bool viewer) {
+        widget->get_preferred_height_for_width(width, min_height, nat_height);
+        int height = viewer ? (new_size.height - total_height) : nat_height;
+        geometries.emplace_back(0, total_height, width, height);
+        total_height += height;
+    };
+
+    if (viewer_->get_flags() == cv::WINDOW_AUTOSIZE) {
+        final_width = nat_widths[0];
+        calc_geometries = calc_autosize_geo;
+    } else {
+        int total_min_height = 0;
+        int max_min_width = *std::max_element(min_widths.begin(), min_widths.end());
+        auto calc_total_min_height = [&](shared_ptr<cv_wl_widget> const &widget) {
+            widget->get_preferred_height_for_width(max_min_width, min_height, nat_height);
+            total_min_height += min_height;
+        };
+
+        calc_total_min_height(viewer_);
+        for (auto &widget: widgets_)
+            calc_total_min_height(widget);
+
+        auto min_size = cv::Size(max_min_width, total_min_height);
+        if (new_size.width < min_size.width || new_size.height < min_size.height) {
+            /* The new_size is smaller than the minimum size */
+            return std::make_tuple(cv::Size(0, 0), geometries);
+        } else {
+            final_width = new_size.width;
+            calc_geometries = calc_normal_geo;
+        }
+    }
+
+    for (auto &widget: widgets_)
+        calc_geometries(widget, final_width, false);
+    calc_geometries(viewer_, final_width, true);
+
+    return std::make_tuple(cv::Size(final_width, total_height), geometries);
+}
+
+void cv_wl_window::show(cv::Size const &size) {
+    if (wait_for_configure_) {
+        pending_.repaint_request = true;
+        return;
+    }
+
+    auto *buffer = this->next_buffer();
+    if (!next_frame_ready_ || !buffer) {
+        if (size.area() == 0) {
+            pending_.repaint_request = true;
+        } else {
+            pending_.size = size;
+            pending_.resize_request = true;
+        }
+        return;
+    }
+
+    auto placement =
+            this->manage_widget_geometry(size.area() == 0 ? size_ : size);
+    auto new_size = std::get<0>(placement);
+    auto const &geometries = std::get<1>(placement);
+    if (new_size.area() == 0 || geometries.size() != (widgets_.size() + 1))
+        return;
+
+    bool buffer_size_changed = (buffer->size() != new_size);
+    if (!buffer->is_allocated() || buffer_size_changed)
+        buffer->create_shm(display_->shm(), new_size, WL_SHM_FORMAT_XRGB8888);
+
+    auto surface_damage = cv::Rect(0, 0, 0, 0);
+    auto draw_widget = [&](shared_ptr<cv_wl_widget> const &widget, cv::Rect const &rect) {
+        auto widget_damage = widget->draw(
+                buffer->data() + ((new_size.width * rect.y + rect.x) * 4),
+                rect.size(),
+                buffer_size_changed
+        );
+        calculate_damage(surface_damage, rect, widget_damage);
+    };
+
+    for (size_t i = 0; i < widgets_.size(); ++i)
+        draw_widget(widgets_[i], geometries[i]);
+    draw_widget(viewer_, geometries.back());
+
+    this->commit_buffer(buffer, surface_damage);
+
+    widget_geometries_ = geometries;
+    size_ = new_size;
+}
+
+void cv_wl_window::commit_buffer(cv_wl_buffer *buffer, cv::Rect const &damage) {
+    if (!buffer)
+        return;
+
+    buffer->attach_to_surface(surface_, 0, 0);
+    wl_surface_damage(surface_, damage.x, damage.y, damage.width, damage.height);
+
+    if (frame_callback_)
+        wl_callback_destroy(frame_callback_);
+    frame_callback_ = wl_surface_frame(surface_);
+    wl_callback_add_listener(frame_callback_, &frame_listener_, this);
+
+    next_frame_ready_ = false;
+    wl_surface_commit(surface_);
+}
+
+void cv_wl_window::handle_frame_callback(void *data, struct wl_callback *cb, uint32_t time) {
+    CV_UNUSED(cb);
+    CV_UNUSED(time);
+    auto *window = reinterpret_cast<cv_wl_window *>(data);
+
+    window->next_frame_ready_ = true;
+
+    if (window->pending_.resize_request) {
+        window->pending_.resize_request = false;
+        window->pending_.repaint_request = false;
+        window->show(window->pending_.size);
+    } else if (window->pending_.repaint_request) {
+        window->pending_.repaint_request = false;
+        window->show();
+    }
+}
+
+#define EDGE_AREA_MARGIN 7
+
+static std::string get_cursor_name(int x, int y, cv::Size const &size, bool grab) {
+    std::string cursor;
+
+    if (grab) {
+        cursor = "grabbing";
+    } else if (0 <= y && y <= EDGE_AREA_MARGIN) {
+        cursor = "top_";
+        if (0 <= x && x <= EDGE_AREA_MARGIN)
+            cursor += "left_corner";
+        else if (size.width - EDGE_AREA_MARGIN <= x && x <= size.width)
+            cursor += "right_corner";
+        else
+            cursor += "side";
+    } else if (size.height - EDGE_AREA_MARGIN <= y && y <= size.height) {
+        cursor = "bottom_";
+        if (0 <= x && x <= EDGE_AREA_MARGIN)
+            cursor += "left_corner";
+        else if (size.width - EDGE_AREA_MARGIN <= x && x <= size.width)
+            cursor += "right_corner";
+        else
+            cursor += "side";
+    } else if (0 <= x && x <= EDGE_AREA_MARGIN) {
+        cursor = "left_side";
+    } else if (size.width - EDGE_AREA_MARGIN <= x && x <= size.width) {
+        cursor = "right_side";
+    } else {
+        cursor = "left_ptr";
+    }
+
+    return cursor;
+}
+
+static xdg_toplevel_resize_edge cursor_name_to_enum(std::string const &cursor) {
+
+    if (cursor == "top_left_corner") return XDG_TOPLEVEL_RESIZE_EDGE_TOP_LEFT;
+    else if (cursor == "top_right_corner") return XDG_TOPLEVEL_RESIZE_EDGE_TOP_RIGHT;
+    else if (cursor == "top_side") return XDG_TOPLEVEL_RESIZE_EDGE_TOP;
+    else if (cursor == "bottom_left_corner") return XDG_TOPLEVEL_RESIZE_EDGE_BOTTOM_LEFT;
+    else if (cursor == "bottom_right_corner") return XDG_TOPLEVEL_RESIZE_EDGE_BOTTOM_RIGHT;
+    else if (cursor == "bottom_side") return XDG_TOPLEVEL_RESIZE_EDGE_BOTTOM;
+    else if (cursor == "left_side") return XDG_TOPLEVEL_RESIZE_EDGE_LEFT;
+    else if (cursor == "right_side") return XDG_TOPLEVEL_RESIZE_EDGE_RIGHT;
+    else return XDG_TOPLEVEL_RESIZE_EDGE_NONE;
+}
+
+void cv_wl_window::update_cursor(cv::Point const &p, bool grab) {
+    auto cursor_name = get_cursor_name(p.x, p.y, size_, grab);
+    if (cursor_.current_name == cursor_name)
+        return;
+
+    cursor_.current_name = cursor_name;
+    auto cursor = cursor_.theme.get_cursor(cursor_name);
+    cursor.lock()->set_to_mouse(
+            *display_->input().lock()->mouse().lock(),
+            mouse_enter_serial_
+    );
+    cursor.lock()->commit();
+}
+
+void cv_wl_window::interactive_move() {
+    xdg_toplevel_move(
+            xdg_toplevel_,
+            display_->input().lock()->seat(),
+            mouse_button_serial_
+    );
+}
+
+static int get_kb_modifiers_flag(const weak_ptr<cv_wl_keyboard> &kb) {
+    int flag = 0;
+    auto modifiers = kb.lock()->get_modifiers();
+
+    if (modifiers & cv_wl_keyboard::MOD_CONTROL_MASK)
+        flag |= cv::EVENT_FLAG_CTRLKEY;
+    if (modifiers & cv_wl_keyboard::MOD_ALT_MASK)
+        flag |= cv::EVENT_FLAG_ALTKEY;
+    if (modifiers & cv_wl_keyboard::MOD_SHIFT_MASK)
+        flag |= cv::EVENT_FLAG_SHIFTKEY;
+
+    return flag;
+}
+
+void cv_wl_window::deliver_mouse_event(int event, cv::Point const &p, int flag) {
+    flag |= get_kb_modifiers_flag(display_->input().lock()->keyboard());
+
+    for (size_t i = 0; i < widgets_.size(); ++i) {
+        auto const &rect = widget_geometries_[i];
+        if (rect.contains(p))
+            widgets_[i]->on_mouse(event, p - rect.tl(), flag);
+    }
+
+    auto const &rect = widget_geometries_.back();
+    if (viewer_ && rect.contains(p))
+        viewer_->on_mouse(event, p - rect.tl(), flag);
+}
+
+void cv_wl_window::mouse_enter(cv::Point const &p, uint32_t serial) {
+    on_mouse_.last = p;
+    mouse_enter_serial_ = serial;
+
+    this->update_cursor(p);
+    this->deliver_mouse_event(cv::EVENT_MOUSEMOVE, p, 0);
+}
+
+void cv_wl_window::mouse_leave() {
+    on_mouse_.reset();
+    cursor_.current_name.clear();
+}
+
+void cv_wl_window::mouse_motion(uint32_t time, cv::Point const &p) {
+    CV_UNUSED(time);
+    int flag = 0;
+    on_mouse_.last = p;
+
+    if (on_mouse_.drag) {
+        switch (on_mouse_.button) {
+            case cv_wl_mouse::LBUTTON:
+                flag = cv::EVENT_FLAG_LBUTTON;
+                break;
+            case cv_wl_mouse::RBUTTON:
+                flag = cv::EVENT_FLAG_RBUTTON;
+                break;
+            case cv_wl_mouse::MBUTTON:
+                flag = cv::EVENT_FLAG_MBUTTON;
+                break;
+            default:
+                break;
+        }
+    }
+
+    bool grabbing =
+            (cursor_.current_name == "grabbing" && (flag & cv::EVENT_FLAG_LBUTTON));
+    this->update_cursor(p, grabbing);
+    this->deliver_mouse_event(cv::EVENT_MOUSEMOVE, p, flag);
+}
+
+void cv_wl_window::mouse_button(uint32_t time, uint32_t button, wl_pointer_button_state state, uint32_t serial) {
+    (void) time;
+    int event = 0, flag = 0;
+
+    mouse_button_serial_ = serial;
+
+    /* Start a user-driven, interactive resize of the surface */
+    if (!on_mouse_.drag &&
+        button == cv_wl_mouse::LBUTTON && cursor_.current_name != "left_ptr" &&
+        viewer_->get_flags() != cv::WINDOW_AUTOSIZE) {
+        xdg_toplevel_resize(
+                xdg_toplevel_,
+                display_->input().lock()->seat(),
+                serial,
+                cursor_name_to_enum(cursor_.current_name)
+        );
+        return;
+    }
+
+    on_mouse_.button = static_cast<cv_wl_mouse::button>(button);
+    on_mouse_.drag = (state == WL_POINTER_BUTTON_STATE_PRESSED);
+
+    switch (button) {
+        case cv_wl_mouse::LBUTTON:
+            event = on_mouse_.drag ? cv::EVENT_LBUTTONDOWN : cv::EVENT_LBUTTONUP;
+            flag = cv::EVENT_FLAG_LBUTTON;
+            break;
+        case cv_wl_mouse::RBUTTON:
+            event = on_mouse_.drag ? cv::EVENT_RBUTTONDOWN : cv::EVENT_RBUTTONUP;
+            flag = cv::EVENT_FLAG_RBUTTON;
+            break;
+        case cv_wl_mouse::MBUTTON:
+            event = on_mouse_.drag ? cv::EVENT_MBUTTONDOWN : cv::EVENT_MBUTTONUP;
+            flag = cv::EVENT_FLAG_MBUTTON;
+            break;
+        default:
+            break;
+    }
+
+    this->update_cursor(on_mouse_.last);
+    this->deliver_mouse_event(event, on_mouse_.last, flag);
+}
+
+void cv_wl_window::handle_surface_configure(void *data, struct xdg_surface *surface, uint32_t serial) {
+    auto *window = reinterpret_cast<cv_wl_window *>(data);
+
+    xdg_surface_ack_configure(surface, serial);
+
+    if (window->wait_for_configure_) {
+        window->wait_for_configure_ = false;
+        if (window->pending_.repaint_request)
+            window->show();
+    }
+}
+
+void cv_wl_window::handle_toplevel_configure(
+        void *data, struct xdg_toplevel *toplevel,
+        int32_t width, int32_t height, struct wl_array *states) {
+    CV_UNUSED(toplevel);
+    cv::Size size = cv::Size(width, height);
+    auto *window = reinterpret_cast<cv_wl_window *>(data);
+
+    auto old_state = window->state_;
+    window->state_.reset();
+
+    const uint32_t *state;
+    WL_ARRAY_FOR_EACH(state, states, const uint32_t*) {
+        switch (*state) {
+            case XDG_TOPLEVEL_STATE_MAXIMIZED:
+                window->state_.maximized = true;
+                if (!old_state.maximized) {
+                    window->state_.prev_size_ = window->size_;
+                    window->show(size);
+                }
+                break;
+            case XDG_TOPLEVEL_STATE_FULLSCREEN:
+                window->state_.fullscreen = true;
+                break;
+            case XDG_TOPLEVEL_STATE_RESIZING:
+                window->state_.resizing = true;
+                if (!size.empty())
+                    window->show(size);
+                break;
+            case XDG_TOPLEVEL_STATE_ACTIVATED:
+                window->state_.focused = true;
+                break;
+            default:
+                /* Unknown state */
+                break;
+        }
+    }
+
+    /* When unmaximized, resize to the previous size */
+    if (old_state.maximized && !window->state_.maximized)
+        window->show(old_state.prev_size_);
+
+#ifndef NDEBUG
+    std::cerr << "[*] DEBUG: " << __func__
+              << ": maximized=" << window->state_.maximized
+              << " fullscreen=" << window->state_.fullscreen
+              << " resizing=" << window->state_.resizing
+              << " focused=" << window->state_.focused
+              << " size=" << size << std::endl;
+#endif
+}
+
+void cv_wl_window::handle_toplevel_close(void *data, struct xdg_toplevel *surface) {
+    CV_UNUSED(data);
+    CV_UNUSED(surface);
+    //auto *window = reinterpret_cast<cv_wl_window *>(data);
+}
+
+
+/*
+ * cv_wl_core implementation
+ */
+cv_wl_core::cv_wl_core()
+= default;
+
+cv_wl_core::~cv_wl_core() {
+    this->destroy_all_windows();
+    display_.reset();
+}
+
+void cv_wl_core::init() {
+    display_ = std::make_shared<cv_wl_display>();
+    if (!display_)
+        CV_Error(StsNoMem, "Could not create display");
+}
+
+cv_wl_display &cv_wl_core::display() {
+    CV_Assert(display_);
+    return *display_;
+}
+
+std::vector<std::string> cv_wl_core::get_window_names() const {
+    std::vector<std::string> names;
+    for (auto &&e: windows_)
+        names.emplace_back(e.first);
+    return names;
+}
+
+shared_ptr<cv_wl_window> cv_wl_core::get_window(std::string const &name) {
+    return windows_.count(name) >= 1 ?
+           windows_.at(name) : std::shared_ptr<cv_wl_window>();
+}
+
+void *cv_wl_core::get_window_handle(std::string const &name) {
+    auto window = get_window(name);
+    return window ? get_window(name).get() : nullptr;
+}
+
+std::string const &cv_wl_core::get_window_name(void *handle) {
+    return handles_[handle];
+}
+
+bool cv_wl_core::create_window(std::string const &name, int flags) {
+    auto window = std::make_shared<cv_wl_window>(display_, name, flags);
+    auto result = windows_.insert(std::make_pair(name, window));
+    handles_[window.get()] = window->get_title();
+    return result.second;
+}
+
+bool cv_wl_core::destroy_window(std::string const &name) {
+    return windows_.erase(name);
+}
+
+void cv_wl_core::destroy_all_windows() {
+    return windows_.clear();
+}
+
+
+/*                              */
+/*  OpenCV highgui interfaces   */
+/*                              */
+
+/* Global wayland core object */
+class CvWlCore {
+public:
+    CvWlCore(CvWlCore &other) = delete;
+
+    void operator=(const CvWlCore &) = delete;
+
+    static cv_wl_core &getInstance() {
+        if (!sInstance) {
+            sInstance = std::make_shared<cv_wl_core>();
+            sInstance->init();
+        }
+        return *sInstance;
+    }
+
+protected:
+    static std::shared_ptr<cv_wl_core> sInstance;
+};
+
+std::shared_ptr<cv_wl_core> CvWlCore::sInstance = nullptr;
+
+CV_IMPL int cvStartWindowThread() {
+    return 0;
+}
+
+CV_IMPL int cvNamedWindow(const char *name, int flags) {
+    return CvWlCore::getInstance().create_window(name, flags);
+}
+
+CV_IMPL void cvDestroyWindow(const char *name) {
+    CvWlCore::getInstance().destroy_window(name);
+}
+
+CV_IMPL void cvDestroyAllWindows() {
+    CvWlCore::getInstance().destroy_all_windows();
+}
+
+CV_IMPL void *cvGetWindowHandle(const char *name) {
+    return CvWlCore::getInstance().get_window_handle(name);
+}
+
+CV_IMPL const char *cvGetWindowName(void *window_handle) {
+    return CvWlCore::getInstance().get_window_name(window_handle).c_str();
+}
+
+CV_IMPL void cvMoveWindow(const char *name, int x, int y) {
+    CV_UNUSED(name);
+    CV_UNUSED(x);
+    CV_UNUSED(y);
+    CV_LOG_ONCE_WARNING(nullptr, "Function not implemented: User cannot move window surfaces in Wayland");
+}
+
+CV_IMPL void cvResizeWindow(const char *name, int width, int height) {
+    if (auto window = CvWlCore::getInstance().get_window(name))
+        window->show(cv::Size(width, height));
+    else
+        throw_system_error("Could not get window name", errno)
+}
+
+CV_IMPL int cvCreateTrackbar(const char *name_bar, const char *window_name, int *value, int count,
+                             CvTrackbarCallback on_change) {
+    CV_UNUSED(name_bar);
+    CV_UNUSED(window_name);
+    CV_UNUSED(value);
+    CV_UNUSED(count);
+    CV_UNUSED(on_change);
+    CV_LOG_ONCE_WARNING(nullptr, "Not implemented, use cvCreateTrackbar2");
+
+    return 0;
+}
+
+CV_IMPL int cvCreateTrackbar2(const char *trackbar_name, const char *window_name, int *val, int count,
+                              CvTrackbarCallback2 on_notify, void *userdata) {
+    if (auto window = CvWlCore::getInstance().get_window(window_name))
+        window->create_trackbar(trackbar_name, val, count, on_notify, userdata);
+
+    return 0;
+}
+
+CV_IMPL int cvGetTrackbarPos(const char *trackbar_name, const char *window_name) {
+    if (auto window = CvWlCore::getInstance().get_window(window_name)) {
+        auto trackbar_ptr = window->get_trackbar(trackbar_name);
+        if (auto trackbar = trackbar_ptr.lock())
+            return trackbar->get_pos();
+    }
+
+    return -1;
+}
+
+CV_IMPL void cvSetTrackbarPos(const char *trackbar_name, const char *window_name, int pos) {
+    if (auto window = CvWlCore::getInstance().get_window(window_name)) {
+        auto trackbar_ptr = window->get_trackbar(trackbar_name);
+        if (auto trackbar = trackbar_ptr.lock())
+            trackbar->set_pos(pos);
+    }
+}
+
+CV_IMPL void cvSetTrackbarMax(const char *trackbar_name, const char *window_name, int maxval) {
+    if (auto window = CvWlCore::getInstance().get_window(window_name)) {
+        auto trackbar_ptr = window->get_trackbar(trackbar_name);
+        if (auto trackbar = trackbar_ptr.lock())
+            trackbar->set_max(maxval);
+    }
+}
+
+CV_IMPL void cvSetTrackbarMin(const char *trackbar_name, const char *window_name, int minval) {
+    CV_UNUSED(trackbar_name);
+    CV_UNUSED(window_name);
+    CV_UNUSED(minval);
+}
+
+CV_IMPL void cvSetMouseCallback(const char *window_name, CvMouseCallback on_mouse, void *param) {
+    if (auto window = CvWlCore::getInstance().get_window(window_name))
+        window->set_mouse_callback(on_mouse, param);
+}
+
+CV_IMPL void cvShowImage(const char *name, const CvArr *arr) {
+    auto cv_core = CvWlCore::getInstance();
+    auto window = cv_core.get_window(name);
+    if (!window) {
+        cv_core.create_window(name, cv::WINDOW_AUTOSIZE);
+        if (!(window = cv_core.get_window(name)))
+            CV_Error_(StsNoMem, ("Failed to create window: %s", name));
+    }
+
+    cv::Mat mat = cv::cvarrToMat(arr, true);
+    window->show_image(mat);
+}
+
+void setWindowTitle_WAYLAND(const cv::String &winname, const cv::String &title) {
+    if (auto window = CvWlCore::getInstance().get_window(winname))
+        window->set_title(title);
+}
+
+CV_IMPL int cvWaitKey(int delay) {
+    int key = -1;
+    auto limit = ch::duration_cast<ch::nanoseconds>(ch::milliseconds(delay));
+    auto start_time = ch::duration_cast<ch::nanoseconds>(
+            ch::steady_clock::now().time_since_epoch())
+            .count();
+
+    while (true) {
+
+        auto res = CvWlCore::getInstance().display().run_once();
+        if (res > 0) {
+            auto &&key_queue =
+                    CvWlCore::getInstance().display().input().lock()
+                            ->keyboard().lock()->get_key_queue();
+            if (!key_queue.empty()) {
+                key = key_queue.back();
+                break;
+            }
+        }
+
+        auto end_time = ch::duration_cast<ch::nanoseconds>(
+                ch::steady_clock::now().time_since_epoch())
+                .count();
+
+        auto elapsed = end_time - start_time;
+        if (limit.count() > 0 && elapsed >= limit.count()) {
+            break;
+        }
+
+        auto sleep_time = 64000 - elapsed;
+        if (sleep_time > 0) {
+            std::this_thread::sleep_for(ch::nanoseconds(sleep_time));
+        }
+    }
+
+    return key;
+}
+
+#ifdef HAVE_OPENGL
+CV_IMPL void cvSetOpenGlDrawCallback(const char *, CvOpenGlDrawCallback, void *) {
+}
+
+CV_IMPL void cvSetOpenGlContext(const char *) {
+}
+
+CV_IMPL void cvUpdateWindow(const char *) {
+}
+#endif // HAVE_OPENGL
+
+#endif // HAVE_WAYLAND
+#endif // _WIN32
diff --git a/modules/highgui/test/test_gui.cpp b/modules/highgui/test/test_gui.cpp
index 6bf634b500..de40e80ede 100644
--- a/modules/highgui/test/test_gui.cpp
+++ b/modules/highgui/test/test_gui.cpp
@@ -58,6 +58,7 @@ inline void verify_size(const std::string &nm, const cv::Mat &img)
         && !defined HAVE_QT \
         && !defined HAVE_WIN32UI \
         && !defined HAVE_COCOA \
+        && !defined HAVE_WAYLAND \
     )
 TEST(Highgui_GUI, DISABLED_regression)
 #else
@@ -135,6 +136,7 @@ static void Foo(int, void* counter)
         && !defined HAVE_GTK \
         && !defined HAVE_QT \
         && !defined HAVE_WIN32UI \
+        && !defined HAVE_WAYLAND \
     ) \
     || defined(__APPLE__)  // test fails on Mac (cocoa)
 TEST(Highgui_GUI, DISABLED_trackbar_unsafe)
@@ -174,6 +176,7 @@ void testTrackbarCallback(int pos, void* param)
         && !defined HAVE_GTK \
         && !defined HAVE_QT \
         && !defined HAVE_WIN32UI \
+        && !defined HAVE_WAYLAND \
     ) \
     || defined(__APPLE__)  // test fails on Mac (cocoa)
 TEST(Highgui_GUI, DISABLED_trackbar)

From 300b57dd70030217560ad616caae412ed766cac2 Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <andrey.senyaev@gmail.com>
Date: Mon, 27 Jun 2022 21:34:15 +0300
Subject: [PATCH 095/178] Workflow for labeled iOS PRs in 3.4 branch

---
 .github/workflows/PR-3.4.yaml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/PR-3.4.yaml b/.github/workflows/PR-3.4.yaml
index 1e3282310e..524c835b00 100644
--- a/.github/workflows/PR-3.4.yaml
+++ b/.github/workflows/PR-3.4.yaml
@@ -19,4 +19,7 @@ jobs:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-3.4-macOS-ARM64.yaml@main
 
   macOS-X64:
-    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-3.4-macOS-x86_64.yaml@main
\ No newline at end of file
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-3.4-macOS-x86_64.yaml@main
+
+  iOS:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-3.4-iOS.yaml@main

From 2336b0706d10b54866abdc4470557ef1c966ec2e Mon Sep 17 00:00:00 2001
From: Tomoaki Teshima <tomoaki.teshima@gmail.com>
Date: Wed, 29 Jun 2022 20:35:27 +0900
Subject: [PATCH 096/178] add support for Orin GPU

---
 cmake/OpenCVDetectCUDA.cmake | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cmake/OpenCVDetectCUDA.cmake b/cmake/OpenCVDetectCUDA.cmake
index ac29e600d3..4163c51a22 100644
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@@ -243,12 +243,13 @@ if(CUDA_FOUND)
       endif()
       if(NOT _nvcc_res EQUAL 0)
         message(STATUS "Automatic detection of CUDA generation failed. Going to build for all known architectures.")
-        # TX1 (5.3) TX2 (6.2) Xavier (7.2) V100 (7.0)
+        # TX1 (5.3) TX2 (6.2) Xavier (7.2) V100 (7.0) Orin (8.7)
         ocv_filter_available_architecture(__cuda_arch_bin
             5.3
             6.2
             7.2
             7.0
+            8.7
         )
       else()
         set(__cuda_arch_bin "${_nvcc_out}")

From b152b8cbcdc048b290d08c2a1a201fc0f3ae1e6c Mon Sep 17 00:00:00 2001
From: Lucas Yang <yangchenshin77@gmail.com>
Date: Thu, 30 Jun 2022 04:07:43 +0800
Subject: [PATCH 097/178] Fix missing CharVector for JavaScript bindings

---
 modules/js/src/core_bindings.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/js/src/core_bindings.cpp b/modules/js/src/core_bindings.cpp
index a43fb726de..9df700c5fb 100644
--- a/modules/js/src/core_bindings.cpp
+++ b/modules/js/src/core_bindings.cpp
@@ -419,6 +419,7 @@ namespace binding_utils
 EMSCRIPTEN_BINDINGS(binding_utils)
 {
     register_vector<int>("IntVector");
+    register_vector<char>("CharVector");
     register_vector<float>("FloatVector");
     register_vector<double>("DoubleVector");
     register_vector<cv::Point>("PointVector");

From a311d1bdc078d21289c898fc44527c6f9991a07e Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <andrey.senyaev@gmail.com>
Date: Thu, 30 Jun 2022 11:22:34 +0300
Subject: [PATCH 098/178] Workflow for labeled iOS PRs in 4.x branch

---
 .github/workflows/PR-4.x.yaml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/PR-4.x.yaml b/.github/workflows/PR-4.x.yaml
index dc44e501a6..5ea1dfdb9e 100644
--- a/.github/workflows/PR-4.x.yaml
+++ b/.github/workflows/PR-4.x.yaml
@@ -21,5 +21,8 @@ jobs:
   macOS-X64:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-macOS-x86_64.yaml@main
 
+  iOS:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-iOS.yaml@main
+
   TIM-VX:
-    uses: opencv/ci-gha-workflow/.github/workflows/OCV-timvx-backend-tests-4.x.yml@main
\ No newline at end of file
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-timvx-backend-tests-4.x.yml@main

From b91f173680af92498a8bd2517fc7c524c2eae640 Mon Sep 17 00:00:00 2001
From: rogday <s.e.a.98@yandex.ru>
Date: Thu, 30 Jun 2022 23:31:52 +0300
Subject: [PATCH 099/178] Merge pull request #22147 from rogday:zoom_factor

Add zoom factor to interactive calibration tool

* add zoom factor

* address review comments
---
 apps/interactive-calibration/calibCommon.hpp        |  1 +
 apps/interactive-calibration/calibPipeline.cpp      | 13 +++++++++++--
 apps/interactive-calibration/frameProcessor.cpp     | 12 +++++++++++-
 apps/interactive-calibration/frameProcessor.hpp     |  1 +
 apps/interactive-calibration/main.cpp               |  1 +
 .../parametersController.cpp                        |  1 +
 6 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/apps/interactive-calibration/calibCommon.hpp b/apps/interactive-calibration/calibCommon.hpp
index 617c17dbcf..f3fffa1729 100644
--- a/apps/interactive-calibration/calibCommon.hpp
+++ b/apps/interactive-calibration/calibCommon.hpp
@@ -94,6 +94,7 @@ namespace calib
         int maxFramesNum;
         int minFramesNum;
         bool saveFrames;
+        float zoom;
 
         captureParameters()
         {
diff --git a/apps/interactive-calibration/calibPipeline.cpp b/apps/interactive-calibration/calibPipeline.cpp
index a92dbffc2f..3b0eedffec 100644
--- a/apps/interactive-calibration/calibPipeline.cpp
+++ b/apps/interactive-calibration/calibPipeline.cpp
@@ -5,6 +5,7 @@
 #include "calibPipeline.hpp"
 
 #include <opencv2/highgui.hpp>
+#include <opencv2/imgproc.hpp>
 
 #include <stdexcept>
 
@@ -58,7 +59,7 @@ PipelineExitStatus CalibPipeline::start(std::vector<cv::Ptr<FrameProcessor> > pr
     if(!mCapture.isOpened())
         throw std::runtime_error("Unable to open video source");
 
-    cv::Mat frame, processedFrame;
+    cv::Mat frame, processedFrame, resizedFrame;
     while(mCapture.grab()) {
         mCapture.retrieve(frame);
         if(mCaptureParams.flipVertical)
@@ -67,7 +68,15 @@ PipelineExitStatus CalibPipeline::start(std::vector<cv::Ptr<FrameProcessor> > pr
         frame.copyTo(processedFrame);
         for (std::vector<cv::Ptr<FrameProcessor> >::iterator it = processors.begin(); it != processors.end(); ++it)
             processedFrame = (*it)->processFrame(processedFrame);
-        cv::imshow(mainWindowName, processedFrame);
+        if (std::fabs(mCaptureParams.zoom - 1.) > 0.001f)
+        {
+            cv::resize(processedFrame, resizedFrame, cv::Size(), mCaptureParams.zoom, mCaptureParams.zoom);
+        }
+        else
+        {
+            resizedFrame = std::move(processedFrame);
+        }
+        cv::imshow(mainWindowName, resizedFrame);
         char key = (char)cv::waitKey(CAP_DELAY);
 
         if(key == 27) // esc
diff --git a/apps/interactive-calibration/frameProcessor.cpp b/apps/interactive-calibration/frameProcessor.cpp
index 2c2a1cd5aa..aa164f2a43 100644
--- a/apps/interactive-calibration/frameProcessor.cpp
+++ b/apps/interactive-calibration/frameProcessor.cpp
@@ -201,7 +201,16 @@ void CalibProcessor::showCaptureMessage(const cv::Mat& frame, const std::string
     double textSize = VIDEO_TEXT_SIZE * frame.cols / (double) IMAGE_MAX_WIDTH;
     cv::bitwise_not(frame, frame);
     cv::putText(frame, message, textOrigin, 1, textSize, cv::Scalar(0,0,255), 2, cv::LINE_AA);
-    cv::imshow(mainWindowName, frame);
+    cv::Mat resized;
+    if (std::fabs(mZoom - 1.) > 0.001f)
+    {
+        cv::resize(frame, resized, cv::Size(), mZoom, mZoom);
+    }
+    else
+    {
+        resized = frame;
+    }
+    cv::imshow(mainWindowName, resized);
     cv::waitKey(300);
 }
 
@@ -267,6 +276,7 @@ CalibProcessor::CalibProcessor(cv::Ptr<calibrationData> data, captureParameters
     mSquareSize = capParams.squareSize;
     mTemplDist = capParams.templDst;
     mSaveFrames = capParams.saveFrames;
+    mZoom = capParams.zoom;
 
     switch(mBoardType)
     {
diff --git a/apps/interactive-calibration/frameProcessor.hpp b/apps/interactive-calibration/frameProcessor.hpp
index 88e87f7b98..e048f84320 100644
--- a/apps/interactive-calibration/frameProcessor.hpp
+++ b/apps/interactive-calibration/frameProcessor.hpp
@@ -51,6 +51,7 @@ protected:
     float mSquareSize;
     float mTemplDist;
     bool mSaveFrames;
+    float mZoom;
 
     bool detectAndParseChessboard(const cv::Mat& frame);
     bool detectAndParseChAruco(const cv::Mat& frame);
diff --git a/apps/interactive-calibration/main.cpp b/apps/interactive-calibration/main.cpp
index b5c3642bb6..c2a6aa7298 100644
--- a/apps/interactive-calibration/main.cpp
+++ b/apps/interactive-calibration/main.cpp
@@ -41,6 +41,7 @@ const std::string keys  =
         "{d        | 0.8     | Min delay between captures}"
         "{pf       | defaultConfig.xml| Advanced application parameters}"
         "{save_frames | false   | Save frames that contribute to final calibration}"
+        "{zoom     | 1       | Zoom factor applied to the image}"
         "{help     |         | Print help}";
 
 bool calib::showOverlayMessage(const std::string& message)
diff --git a/apps/interactive-calibration/parametersController.cpp b/apps/interactive-calibration/parametersController.cpp
index 5659b0e469..b5a01b1ea8 100644
--- a/apps/interactive-calibration/parametersController.cpp
+++ b/apps/interactive-calibration/parametersController.cpp
@@ -90,6 +90,7 @@ bool calib::parametersController::loadFromParser(cv::CommandLineParser &parser)
     mCapParams.squareSize = parser.get<float>("sz");
     mCapParams.templDst = parser.get<float>("dst");
     mCapParams.saveFrames = parser.has("save_frames");
+    mCapParams.zoom = parser.get<float>("zoom");
 
     if(!checkAssertion(mCapParams.squareSize > 0, "Distance between corners or circles must be positive"))
         return false;

From 0f067fd0a683959a267db6b8cc297deb54fdee00 Mon Sep 17 00:00:00 2001
From: Suleyman TURKMEN <sturkmen@hotmail.com>
Date: Sun, 26 Jun 2022 11:48:43 +0300
Subject: [PATCH 100/178] make imgcodecs optional for highgui

---
 modules/highgui/CMakeLists.txt              | 4 ++--
 modules/highgui/src/plugin_wrapper.impl.hpp | 5 ++++-
 modules/highgui/src/precomp.hpp             | 2 --
 modules/highgui/src/window_w32.cpp          | 6 ++++++
 4 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/modules/highgui/CMakeLists.txt b/modules/highgui/CMakeLists.txt
index 65d24e0ab0..869bcca492 100644
--- a/modules/highgui/CMakeLists.txt
+++ b/modules/highgui/CMakeLists.txt
@@ -1,9 +1,9 @@
 set(the_description "High-level GUI")
 
 if(ANDROID)
-  ocv_add_module(highgui opencv_imgproc opencv_imgcodecs OPTIONAL opencv_videoio WRAP python)
+  ocv_add_module(highgui opencv_imgproc OPTIONAL opencv_imgcodecs opencv_videoio WRAP python)
 else()
-  ocv_add_module(highgui opencv_imgproc opencv_imgcodecs OPTIONAL opencv_videoio WRAP python java)
+  ocv_add_module(highgui opencv_imgproc OPTIONAL opencv_imgcodecs opencv_videoio WRAP python java)
 endif()
 
 include(${CMAKE_CURRENT_LIST_DIR}/cmake/plugin.cmake)
diff --git a/modules/highgui/src/plugin_wrapper.impl.hpp b/modules/highgui/src/plugin_wrapper.impl.hpp
index 23f0ecf6bf..042cd0f1fa 100644
--- a/modules/highgui/src/plugin_wrapper.impl.hpp
+++ b/modules/highgui/src/plugin_wrapper.impl.hpp
@@ -235,12 +235,15 @@ std::vector<FileSystemPath_t> getPluginCandidates(const std::string& baseName)
     return results;
 }
 
-// NB: require loading of imgcodecs module
+#ifdef HAVE_OPENCV_IMGCODECS // NB: require loading of imgcodecs module
 static void* g_imwrite = (void*)imwrite;
+#endif
 
 void PluginUIBackendFactory::loadPlugin()
 {
+#ifdef HAVE_OPENCV_IMGCODECS
     CV_Assert(g_imwrite);
+#endif
     for (const FileSystemPath_t& plugin : getPluginCandidates(baseName_))
     {
         auto lib = std::make_shared<cv::plugin::impl::DynamicLib>(plugin);
diff --git a/modules/highgui/src/precomp.hpp b/modules/highgui/src/precomp.hpp
index 0d26b957ad..c43033cbb2 100644
--- a/modules/highgui/src/precomp.hpp
+++ b/modules/highgui/src/precomp.hpp
@@ -60,8 +60,6 @@
 #include "opencv2/imgproc/imgproc_c.h"
 #include "opencv2/highgui/highgui_c.h"
 
-#include "opencv2/imgcodecs.hpp"
-
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
diff --git a/modules/highgui/src/window_w32.cpp b/modules/highgui/src/window_w32.cpp
index 76f320f19f..42b0892c1b 100644
--- a/modules/highgui/src/window_w32.cpp
+++ b/modules/highgui/src/window_w32.cpp
@@ -2132,6 +2132,7 @@ cvDestroyAllWindows(void)
 
 static void showSaveDialog(CvWindow& window)
 {
+#ifdef HAVE_OPENCV_IMGCODECS
     if (!window.image)
         return;
 
@@ -2193,6 +2194,11 @@ static void showSaveDialog(CvWindow& window)
         cv::flip(cv::Mat(sz.cy, sz.cx, CV_8UC(channels), data, (sz.cx * channels + 3) & -4), tmp, 0);
         cv::imwrite(szFileName, tmp);
     }
+#else
+    CV_UNUSED(window);
+    CV_LOG_WARNING("Save dialog requires enabled 'imgcodecs' module.");
+    return;
+#endif
 }
 
 /*

From 59b870a87ad22d89027e28eee3b8be1f3db1c0bf Mon Sep 17 00:00:00 2001
From: Zihao Mu <zihaomu@outlook.com>
Date: Fri, 1 Jul 2022 18:03:15 +0800
Subject: [PATCH 101/178] Merge pull request #21910 from zihaomu:fast_conv_ARM

DNN: Accelerating convolution

* Fast Conv of ARM, X86 and universal intrinsics.

* improve code style.

* error fixed.

* improve the License

* optimize memory allocated and Adjust the threshold.

* change FasterRCNN_vgg16 to 2GB memory.
---
 modules/dnn/src/layers/convolution_layer.cpp  |   54 +-
 .../depthwise_convolution.cpp                 |  385 +++++
 .../fast_convolution.avx2.cpp                 |  361 +++++
 .../fast_convolution/fast_convolution.cpp     |  694 +++++++++
 .../fast_convolution/fast_convolution.hpp     |   89 ++
 .../fast_convolution.simd.hpp                 |  342 +++++
 .../fast_convolution/winograd_3x3s1_f63.cpp   | 1351 +++++++++++++++++
 modules/dnn/test/test_backends.cpp            |    2 +-
 modules/dnn/test/test_caffe_importer.cpp      |    4 +-
 modules/dnn/test/test_int8_layers.cpp         |   14 +-
 modules/dnn/test/test_tf_importer.cpp         |    2 +-
 11 files changed, 3286 insertions(+), 12 deletions(-)
 create mode 100644 modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp
 create mode 100644 modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp
 create mode 100644 modules/dnn/src/layers/fast_convolution/fast_convolution.cpp
 create mode 100644 modules/dnn/src/layers/fast_convolution/fast_convolution.hpp
 create mode 100644 modules/dnn/src/layers/fast_convolution/fast_convolution.simd.hpp
 create mode 100644 modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp

diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp
index 0bf39f93b3..124443399b 100644
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -71,6 +71,8 @@ using namespace cv::dnn::ocl4dnn;
 using namespace cv::dnn::cuda4dnn;
 #endif
 
+#include "fast_convolution/fast_convolution.hpp"
+
 namespace cv
 {
 namespace dnn
@@ -253,11 +255,14 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
 {
 public:
     enum { VEC_ALIGN = 8, DFT_TYPE = CV_32F };
-    Mat weightsMat;
+    Mat weightsMat;  // Used to store weight params. It will be used for layer fusion and memory alignment.
     std::vector<float> biasvec;
     std::vector<float> reluslope;
     Ptr<ActivationLayer> activ;
 
+    Mat fastWeights; // Used to store weight params. It will be used for layer fusion and without memory alignment.
+    Ptr<FastConv2d> fastConv2dImpl;
+
 #ifdef HAVE_OPENCL
     Ptr<OCL4DNNConvSpatial<float> > convolutionOp;
     std::vector<UMat> umat_blobs;
@@ -433,6 +438,7 @@ public:
                 wm.copyTo(wm_aligned);
                 wm = wm_aligned;
             }
+            fastWeights = blobs[0].reshape(1, numOutput);
             weightsMat = wm;
         }
         else
@@ -628,14 +634,26 @@ public:
             if (weightsMat.data == blobs[0].data)
                 weightsMat = weightsMat.clone();
 
+            // If fastWeights is the same as weightsMat, we don't need to allocate more space for fastWeights.
+            bool sameFastWeights = false;
+            if (fastWeights.step1() == weightsMat.step1()) // If weightsMat is realigned, it is not the same as fastWeights.
+                sameFastWeights = true;
+
+            if (!sameFastWeights && fastWeights.data == blobs[0].data)
+                fastWeights = fastWeights.clone();
+
             Mat originWeights = blobs[0].reshape(1, outCn);
             for (int i = 0; i < outCn; ++i)
             {
                 double wi = w.at<float>(i);
                 weightsMultipliers[i] *= wi;
                 cv::multiply(originWeights.row(i), weightsMultipliers[i], weightsMat.row(i));
+                if (!sameFastWeights)
+                    cv::multiply(originWeights.row(i), weightsMultipliers[i], fastWeights.row(i));
                 biasvec[i] *= wi;
             }
+            if (sameFastWeights)
+                fastWeights = weightsMat;
         }
 
         if (!b.empty())
@@ -1948,8 +1966,13 @@ public:
 
         int outCn = blobs.empty() ? inputs[1].size[0] : blobs[0].size[0];
         // Need to align non-const blobs
+        bool variableWeight = false;
         if (blobs.empty())
         {
+            variableWeight = true;
+            if (fastWeights.data != inputs[1].data)
+                fastWeights = inputs[1].clone();
+
             Mat wm = inputs[1].reshape(1, outCn);
             if (wm.data != weightsMat.data)
             {
@@ -2066,8 +2089,37 @@ public:
         {
             int nstripes = std::max(getNumThreads(), 1);
 
+            // Initialization of FastCovn2d
+            if ((!fastConv2dImpl || variableWeight) && inputs[0].dims == 4)
+            {
+                int K = outputs[0].size[1];
+                int C = inputs[0].size[1];
+                int Hk = kernel_size[kernel_size.size() - 2];
+                int Wk = kernel_size.back();
+
+                CV_Assert(outputs[0].size[1] % ngroups == 0);
+                int stride_h = strides[strides.size() - 2];
+                int stride_w = strides.back();
+
+                int dilation_h = dilations[dilations.size() - 2];
+                int dilation_w = dilations.back();
+                float* weightsPtr = fastWeights.ptr<float>();
+                CV_Assert(weightsPtr);
+
+                fastConv2dImpl = initFastConv2d(ngroups, K, C, Hk, Wk, stride_w, stride_h,
+                                              dilation_w, dilation_h, pads_begin, pads_end, weightsPtr, &biasvec[0]);
+            }
+
+            if (fastConv2dImpl)
+            {
+                runFastConv2d(inputs[0], outputs[0], fastConv2dImpl, nstripes, activ);
+                return;
+            }
+
+            // Use only for Conv1D and Conv3D.
             ParallelConv::run(inputs[0], outputs[0], weightsMat, biasvec, reluslope,
                             kernel_size, strides, pads_begin, pads_end, dilations, activ.get(), ngroups, nstripes);
+
         }
     }
 
diff --git a/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp b/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp
new file mode 100644
index 0000000000..c98c3d6549
--- /dev/null
+++ b/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp
@@ -0,0 +1,385 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpConv.fx).
+// Here is the original license:
+/*
+    This file is a part of ficus language project.
+    See ficus/LICENSE for the licensing terms
+*/
+
+#include "../../precomp.hpp"
+#include "fast_convolution.hpp"
+
+namespace cv { namespace dnn {
+
+static void depthWiseBlock(const float *inptr, float *outptr, const float *weights, float biasval, int *ofstab, int *yxtab,
+                           float minval, float maxval, int Hi, int Wi, int H0, int W0, int ksize, int pad_top, int pad_left,
+                           int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop,
+                           int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3)
+{
+#ifdef CV_SIMD128
+    v_float32x4 vminval = v_setall_f32(minval), vmaxval = v_setall_f32(maxval);
+
+    v_float32x4 w0 = v_setall_f32(
+            0.f), w1 = w0, w2 = w0, w3 = w0, w4 = w0, w5 = w0, w6 = w0, w7 = w0, w8 = w0, vbias = w0;
+    if (useSIMD)
+    {
+        vbias = v_setall_f32(biasval);
+        if (is3x3)
+        {
+            w0 = v_setall_f32(weights[0]);
+            w1 = v_setall_f32(weights[1]);
+            w2 = v_setall_f32(weights[2]);
+            w3 = v_setall_f32(weights[3]);
+            w4 = v_setall_f32(weights[4]);
+            w5 = v_setall_f32(weights[5]);
+            w6 = v_setall_f32(weights[6]);
+            w7 = v_setall_f32(weights[7]);
+            w8 = v_setall_f32(weights[8]);
+        }
+    }
+#endif
+    int dy0 = 1;
+    for (int y0 = 0; y0 < H0; y0 += dy0, outptr += W0 * dy0)
+    {
+#ifdef CV_SIMD128
+        dy0 = inner_ytop <= y0 && y0 + 3 < inner_ybottom && is3x3 && stride_y == 1 && dilation_y == 1
+              ? 3 : 1;
+#endif
+        int x0 = 0, x1 = y0 >= inner_ytop && y0 < inner_ybottom ? inner_xleft : W0;
+        int yi_ = y0 * stride_y - pad_top;
+
+        for (;;)
+        {
+            float s_0, s_1, s_2;
+            if (dy0 == 3)
+            {
+                for (; x0 < x1; x0++)
+                {
+                    int xi_ = x0 * stride_x - pad_left;
+                    s_0 = s_1 = s_2 = biasval;
+                    for (int k = 0; k < ksize; k++)
+                    {
+                        int dy = yxtab[k * 2];
+                        int yi = yi_ + dy;
+                        int xi = xi_ + yxtab[k * 2 + 1];
+                        float w = weights[k];
+
+                        if ((unsigned) xi < (unsigned) Wi)
+                        {
+                            s_0 += inptr[yi * Wi + xi] * w;
+                            s_1 += inptr[(yi + 1) * Wi + xi] * w;
+                            s_2 += inptr[(yi + 2) * Wi + xi] * w;
+                        }
+                    }
+                    s_0 = std::min(std::max(s_0, minval), maxval);
+                    s_1 = std::min(std::max(s_1, minval), maxval);
+                    s_2 = std::min(std::max(s_2, minval), maxval);
+                    outptr[x0] = s_0;
+                    outptr[x0 + W0] = s_1;
+                    outptr[x0 + W0 * 2] = s_2;
+                }
+            }
+            else
+            {
+                for (; x0 < x1; x0++)
+                {
+                    int xi_ = x0 * stride_x - pad_left;
+                    s_0 = biasval;
+                    for (int k = 0; k < ksize; k++) {
+                        int dy = yxtab[k * 2];
+                        int yi = yi_ + dy;
+                        int xi = xi_ + yxtab[k * 2 + 1];
+                        float w = weights[k];
+                        if (((unsigned) yi < (unsigned) Hi) & ((unsigned) xi < (unsigned) Wi))
+                            s_0 += inptr[yi * Wi + xi] * w;
+                    }
+                    s_0 = std::min(std::max(s_0, minval), maxval);
+                    outptr[x0] = s_0;
+                }
+            }
+            if (x0 == W0)
+                break;
+            x1 = inner_xright;
+#ifdef CV_SIMD128
+            if (useSIMD)
+            {
+                if (is3x3)
+                {
+                    if (dy0 == 3)
+                    {
+                        for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+                        {
+                            int xi_ = x0 * stride_x - pad_left;
+                            const float *inptr_xi = inptr + Wi * yi_ + xi_;
+
+                            v_float32x4 s0, s1, s2;
+                            v_float32x4 x00 = v_load(inptr_xi);
+                            v_float32x4 x01 = v_load(inptr_xi + 1);
+                            v_float32x4 x02 = v_load(inptr_xi + 2);
+
+                            v_float32x4 x10 = v_load(inptr_xi + Wi);
+                            v_float32x4 x11 = v_load(inptr_xi + Wi + 1);
+                            v_float32x4 x12 = v_load(inptr_xi + Wi + 2);
+
+                            v_float32x4 x20 = v_load(inptr_xi + Wi * 2);
+                            v_float32x4 x21 = v_load(inptr_xi + Wi * 2 + 1);
+                            v_float32x4 x22 = v_load(inptr_xi + Wi * 2 + 2);
+
+                            v_float32x4 x30 = v_load(inptr_xi + Wi * 3);
+                            v_float32x4 x31 = v_load(inptr_xi + Wi * 3 + 1);
+                            v_float32x4 x32 = v_load(inptr_xi + Wi * 3 + 2);
+
+                            v_float32x4 x40 = v_load(inptr_xi + Wi * 4);
+                            v_float32x4 x41 = v_load(inptr_xi + Wi * 4 + 1);
+                            v_float32x4 x42 = v_load(inptr_xi + Wi * 4 + 2);
+
+                            s0 = v_fma(x00, w0, vbias);
+                            s1 = v_fma(x10, w0, vbias);
+                            s2 = v_fma(x20, w0, vbias);
+
+                            s0 = v_fma(x01, w1, s0);
+                            s1 = v_fma(x11, w1, s1);
+                            s2 = v_fma(x21, w1, s2);
+
+                            s0 = v_fma(x02, w2, s0);
+                            s1 = v_fma(x12, w2, s1);
+                            s2 = v_fma(x22, w2, s2);
+
+                            s0 = v_fma(x10, w3, s0);
+                            s1 = v_fma(x20, w3, s1);
+                            s2 = v_fma(x30, w3, s2);
+
+                            s0 = v_fma(x11, w4, s0);
+                            s1 = v_fma(x21, w4, s1);
+                            s2 = v_fma(x31, w4, s2);
+
+                            s0 = v_fma(x12, w5, s0);
+                            s1 = v_fma(x22, w5, s1);
+                            s2 = v_fma(x32, w5, s2);
+
+                            s0 = v_fma(x20, w6, s0);
+                            s1 = v_fma(x30, w6, s1);
+                            s2 = v_fma(x40, w6, s2);
+
+                            s0 = v_fma(x21, w7, s0);
+                            s1 = v_fma(x31, w7, s1);
+                            s2 = v_fma(x41, w7, s2);
+
+                            s0 = v_fma(x22, w8, s0);
+                            s1 = v_fma(x32, w8, s1);
+                            s2 = v_fma(x42, w8, s2);
+
+                            if (ifMinMaxAct)
+                            {
+                                s0 = v_min(v_max(s0, vminval), vmaxval);
+                                s1 = v_min(v_max(s1, vminval), vmaxval);
+                                s2 = v_min(v_max(s2, vminval), vmaxval);
+                            }
+
+                            v_store(outptr + x0, s0);
+                            v_store(outptr + W0 + x0, s1);
+                            v_store(outptr + W0 * 2 + x0, s2);
+                        }
+                    }
+                    else
+                    {
+                        for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+                        {
+                            int xi_ = x0 * stride_x - pad_left;
+                            const float *inptr_xi = inptr + Wi * yi_ + xi_;
+                            v_float32x4 s0 = v_fma(v_load(inptr_xi + ofstab[0]), w0, vbias);
+                            v_float32x4 s1 = v_load(inptr_xi + ofstab[1]) * w1;
+                            v_float32x4 s2 = v_load(inptr_xi + ofstab[2]) * w2;
+
+                            s0 = v_fma(v_load(inptr_xi + ofstab[3]), w3, s0);
+                            s1 = v_fma(v_load(inptr_xi + ofstab[4]), w4, s1);
+                            s2 = v_fma(v_load(inptr_xi + ofstab[5]), w5, s2);
+
+                            s0 = v_fma(v_load(inptr_xi + ofstab[6]), w6, s0);
+                            s1 = v_fma(v_load(inptr_xi + ofstab[7]), w7, s1);
+                            s2 = v_fma(v_load(inptr_xi + ofstab[8]), w8, s2);
+
+                            s0 = s0 + s1 + s2;
+                            if (ifMinMaxAct)
+                                s0 = v_min(v_max(s0, vminval), vmaxval);
+                            v_store(outptr + x0, s0);
+                        }
+                    }
+                }
+                else
+                {
+                    for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+                    {
+                        int xi_ = x0 * stride_x - pad_left, k = 0;
+                        const float *inptr_xi = inptr + Wi * yi_ + xi_;
+                        v_float32x4 s0 = vbias;
+                        for (; k <= ksize - 4; k += 4)
+                        {
+                            v_float32x4 v0 = v_load(inptr_xi + ofstab[k]);
+                            v_float32x4 v1 = v_load(inptr_xi + ofstab[k + 1]);
+                            v_float32x4 v2 = v_load(inptr_xi + ofstab[k + 2]);
+                            v_float32x4 v3 = v_load(inptr_xi + ofstab[k + 3]);
+
+                            v_float32x4 ww0 = v_setall_f32(weights[k]);
+                            v_float32x4 ww1 = v_setall_f32(weights[k+1]);
+                            v_float32x4 ww2 = v_setall_f32(weights[k+2]);
+                            v_float32x4 ww3 = v_setall_f32(weights[k+3]);
+
+                            s0 = v_fma(v0, ww0, s0);
+                            s0 = v_fma(v1, ww1, s0);
+                            s0 = v_fma(v2, ww2, s0);
+                            s0 = v_fma(v3, ww3, s0);
+                        }
+                        for (; k < ksize; k++)
+                            s0 = v_fma(v_load(inptr_xi + ofstab[k]),
+                                       v_setall_f32(weights[k]), s0);
+                        if (ifMinMaxAct)
+                            s0 = v_min(v_max(s0, vminval), vmaxval);
+                        v_store(outptr + x0, s0);
+                    }
+                }
+            }
+#endif
+            if (dy0 == 3)
+            {
+                for (; x0 < x1; x0++)
+                {
+                    int xi_ = x0 * stride_x - pad_left;
+                    const float *inptr_xi = inptr + W0 * yi_ + xi_;
+                    s_0 = s_1 = s_2 = biasval;
+                    for (int k = 0; k < ksize; k++)
+                    {
+                        int inp_ofs = ofstab[k];
+                        float w = weights[k];
+                        s_0 += inptr_xi[inp_ofs] * w;
+                        s_1 += inptr_xi[inp_ofs + Wi] * w;
+                        s_2 += inptr_xi[inp_ofs + Wi * 2] * w;
+                    }
+                    if (ifMinMaxAct)
+                    {
+                        s_0 = std::min(std::max(s_0, minval), maxval);
+                        s_1 = std::min(std::max(s_1, minval), maxval);
+                        s_2 = std::min(std::max(s_2, minval), maxval);
+                    }
+
+                    outptr[x0] = s_0;
+                    outptr[x0 + W0] = s_1;
+                    outptr[x0 + W0 * 2] = s_2;
+                }
+            }
+            else
+            {
+                for (; x0 < x1; x0++)
+                {
+                    int xi_ = x0 * stride_x - pad_left;
+                    const float *inptr_xi = inptr + Wi * yi_ + xi_;
+                    s_0 = biasval;
+                    for (int k = 0; k < ksize; k++)
+                    {
+                        s_0 += inptr_xi[ofstab[k]] * weights[k];
+                    }
+
+                    if (ifMinMaxAct)
+                        s_0 = std::min(std::max(s_0, minval), maxval);
+                    outptr[x0] = s_0;
+                }
+            }
+            x1 = W0;
+        }
+    }
+}
+
+void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct) {
+    Mat input = _input.getMat();
+    Mat output = _output.getMat();
+    MatShape inputShape = shape(input);
+    MatShape outputShape = shape(output);
+    CV_Assert(inputShape.size() == 4 && outputShape.size() == 4);
+
+    int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3];  // [N, C, H, W]
+    int K = conv->K, Hk = conv->Hk, Wk = conv->Wk;
+    int H0 = outputShape[2], W0 = outputShape[3], ngroups = conv->ngroups;
+
+    const size_t inp_planesize = (size_t) Hi * Wi;
+    const size_t out_planesize = (size_t) H0 * W0;
+
+    CV_Assert(ngroups > 1 && ngroups == K && ngroups == C);
+
+    int stride_y = conv->stride_y, stride_x = conv->stride_x;
+    int dilation_y = conv->dilation_y, dilation_x = conv->dilation_x;
+
+    int pad_top = conv->pad_top, pad_bottom = conv->pad_bottom;
+    int pad_left = conv->pad_left, pad_right = conv->pad_right;
+
+    int ksize = Hk * Wk, padded_ksize = ((ksize + FAST_VEC_NLANES - 1) / FAST_VEC_NLANES) * FAST_VEC_NLANES;
+
+    const float *inp = input.ptr<float>();
+    float *out = output.ptr<float>();
+
+    std::vector<int> ofstab_(3 * padded_ksize, 0);
+    int *ofstab = ofstab_.data();
+    int *yxtab = ofstab + padded_ksize;
+
+    for (int k = 0; k < padded_ksize; k++)
+    {
+        int y = k < ksize ? k / Wk : 0;
+        int x = k < ksize ? k % Wk : 0;
+        int dy = y * dilation_y, dx = x * dilation_x;
+        yxtab[k * 2] = dy;
+        yxtab[k * 2 + 1] = dx;
+        ofstab[k] = dy * Wi + dx;
+    }
+
+    const float *weights0 = conv->weightsBuf.data(), *bias = conv->biasBuf.data();
+    int inner_ytop = (pad_bottom + stride_y - 1) / stride_y, inner_ybottom = 3;
+    int inner_xleft = (pad_left + stride_x - 1) / stride_x, inner_xright = 4;
+
+    CV_Assert(ksize > 1 || (pad_left == 0 && pad_right == 0 && pad_top == 0 && pad_bottom == 0));
+
+    inner_xright = (Wi - (Wk - 1) * dilation_x + pad_left) / stride_x;
+    inner_xright += inner_xright * stride_x - pad_left + (Wk - 1) * dilation_x < Wi;
+    inner_ybottom = (Hi - (Hk - 1) * dilation_y + pad_top) / stride_y;
+    inner_ybottom += inner_ybottom * stride_y - pad_top + (Hk - 1) * dilation_y < Hi;
+
+    if (inner_xleft >= inner_xright || inner_ytop >= inner_ybottom)
+    {
+        inner_xleft = W0;
+        inner_ytop = H0;
+    }
+
+    inner_ybottom = inner_ybottom < H0 ? inner_ybottom : H0;
+
+    bool useSIMD = stride_x == 1 && inner_xleft < W0;
+    bool is3x3 = Hk == 3 && Wk == 3;
+
+    parallel_for_(Range(0, N * C), [&](const Range &r0) {
+        for (int nc = r0.start; nc < r0.end; nc++)
+        {
+            int c = nc % C;
+            const float *inptr = inp + inp_planesize * nc;
+            float *outptr0 = out + out_planesize * nc;
+
+            float biasval = bias[c];
+            const float *weights = weights0 + c * padded_ksize;
+
+#if CV_TRY_AVX2
+            if (conv->useAVX2)
+                opt_AVX2::depthWiseBlock_AVX2(inptr, outptr0, weights, biasval, ofstab, yxtab, minval, maxval, Hi, Wi, H0, W0, ksize,
+                                         pad_top, pad_left, dilation_y, stride_x, stride_y, inner_xleft, inner_xright, inner_ytop,
+                                         inner_ybottom, ifMinMaxAct, useSIMD, is3x3);
+            else
+#endif
+            depthWiseBlock(inptr, outptr0, weights, biasval, ofstab, yxtab, minval, maxval, Hi, Wi, H0, W0, ksize,
+                           pad_top, pad_left, dilation_y, stride_x, stride_y, inner_xleft, inner_xright, inner_ytop,
+                           inner_ybottom, ifMinMaxAct, useSIMD, is3x3);
+
+            if (activ)
+                activ->forwardSlice(outptr0, outptr0, (int) out_planesize, out_planesize, c, c+1);
+        }
+    });
+}
+
+}} // namespace cv::dnn
\ No newline at end of file
diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp
new file mode 100644
index 0000000000..22580c580c
--- /dev/null
+++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp
@@ -0,0 +1,361 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../../precomp.hpp"
+#include "fast_convolution.hpp"
+
+namespace cv {
+namespace opt_AVX2
+{
+#if CV_TRY_AVX2
+void convBlock_AVX2(int k, const float *a, const float *b,
+                float *c, int ldc, const float *bias,
+                float minval, float maxval, bool ifActiv)
+{
+#if FAST_CONV_MR == 4 && FAST_CONV_NR == 24
+    __m256 vminval = _mm256_set1_ps(minval), vmaxval = _mm256_set1_ps(maxval);
+    __m256 c0 = _mm256_set1_ps(bias[0]), c1 = c0, c2 = c0;
+    __m256 c3 = _mm256_set1_ps(bias[1]), c4 = c3, c5 = c3;
+    __m256 c6 = _mm256_set1_ps(bias[2]), c7 = c6, c8 = c6;
+    __m256 c9 = _mm256_set1_ps(bias[3]), c10 = c9, c11 = c9;
+
+    __m256 a0 = _mm256_setzero_ps(), a1 = _mm256_setzero_ps();
+    __m256 b0 = _mm256_setzero_ps(), b1 = _mm256_setzero_ps(), b2 = _mm256_setzero_ps();
+
+    for (int p = 0; p < k; p++, a += FAST_CONV_MR, b += FAST_CONV_NR)
+    {
+        a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]);
+        b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8), b2 = _mm256_load_ps(b + 16);
+
+        c0 = _mm256_fmadd_ps(b0, a0, c0);
+        c1 = _mm256_fmadd_ps(b1, a0, c1);
+        c2 = _mm256_fmadd_ps(b2, a0, c2);
+
+        c3 = _mm256_fmadd_ps(b0, a1, c3);
+        a0 = _mm256_set1_ps(a[2]);
+        c4 = _mm256_fmadd_ps(b1, a1, c4);
+        c5 = _mm256_fmadd_ps(b2, a1, c5);
+
+        c6 = _mm256_fmadd_ps(b0, a0, c6);
+        a1 = _mm256_set1_ps(a[3]);
+        c7 = _mm256_fmadd_ps(b1, a0, c7);
+        c8 = _mm256_fmadd_ps(b2, a0, c8);
+
+        c9 = _mm256_fmadd_ps(b0, a1, c9);
+        c10 = _mm256_fmadd_ps(b1, a1, c10);
+        c11 = _mm256_fmadd_ps(b2, a1, c11);
+    }
+
+    if (ifActiv)
+    {
+        c0 = _mm256_min_ps(_mm256_max_ps(c0, vminval), vmaxval);
+        c1 = _mm256_min_ps(_mm256_max_ps(c1, vminval), vmaxval);
+        c2 = _mm256_min_ps(_mm256_max_ps(c2, vminval), vmaxval);
+        c3 = _mm256_min_ps(_mm256_max_ps(c3, vminval), vmaxval);
+        c4 = _mm256_min_ps(_mm256_max_ps(c4, vminval), vmaxval);
+        c5 = _mm256_min_ps(_mm256_max_ps(c5, vminval), vmaxval);
+        c6 = _mm256_min_ps(_mm256_max_ps(c6, vminval), vmaxval);
+        c7 = _mm256_min_ps(_mm256_max_ps(c7, vminval), vmaxval);
+        c8 = _mm256_min_ps(_mm256_max_ps(c8, vminval), vmaxval);
+        c9 = _mm256_min_ps(_mm256_max_ps(c9, vminval), vmaxval);
+        c10 = _mm256_min_ps(_mm256_max_ps(c10, vminval), vmaxval);
+        c11 = _mm256_min_ps(_mm256_max_ps(c11, vminval), vmaxval);
+    }
+
+    _mm256_storeu_ps(c, c0); _mm256_storeu_ps(c+8, c1); _mm256_storeu_ps(c+16, c2);
+    _mm256_storeu_ps(c + ldc, c3); _mm256_storeu_ps(c + ldc + 8, c4); _mm256_storeu_ps(c + ldc + 16, c5);
+    _mm256_storeu_ps(c + ldc*2, c6); _mm256_storeu_ps(c + ldc*2 + 8, c7); _mm256_storeu_ps(c + ldc*2 + 16, c8);
+    _mm256_storeu_ps(c + ldc*3, c9); _mm256_storeu_ps(c + ldc*3 + 8, c10); _mm256_storeu_ps(c + ldc*3 + 16, c11);
+    _mm256_zeroupper();
+#else
+#error "unsupported FAST_CONV_MR and/or FAST_CONV_NR in convBlock_AVX2."
+#endif
+}
+
+void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights, float biasval, int *ofstab, int *yxtab,
+                    float minval, float maxval, int Hi, int Wi, int H0, int W0, int ksize, int pad_top, int pad_left,
+                    int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop,
+                    int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3)
+{
+    const int VECSZ = 8;
+    __m256 vminval = _mm256_set1_ps(minval);
+    __m256 vmaxval = _mm256_set1_ps(maxval);
+
+    __m256 w0 = _mm256_setzero_ps(),
+        w1 = w0, w2 = w0, w3 = w0, w4 = w0, w5 = w0, w6 = w0, w7 = w0, w8 = w0, vbias = w0;
+
+    if (useSIMD)
+    {
+        vbias = _mm256_set1_ps(biasval);
+        if (is3x3)
+        {
+            w0 = _mm256_set1_ps(weights[0]);
+            w1 = _mm256_set1_ps(weights[1]);
+            w2 = _mm256_set1_ps(weights[2]);
+            w3 = _mm256_set1_ps(weights[3]);
+            w4 = _mm256_set1_ps(weights[4]);
+            w5 = _mm256_set1_ps(weights[5]);
+            w6 = _mm256_set1_ps(weights[6]);
+            w7 = _mm256_set1_ps(weights[7]);
+            w8 = _mm256_set1_ps(weights[8]);
+        }
+    }
+
+    int dy0 = 1;
+    for (int y0 = 0; y0 < H0; y0 += dy0, outptr += W0 * dy0)
+    {
+        dy0 = inner_ytop <= y0 && y0 + 3 < inner_ybottom && is3x3 && stride_y == 1 && dilation_y == 1
+              ? 3 : 1;
+
+        int x0 = 0, x1 = y0 >= inner_ytop && y0 < inner_ybottom ? inner_xleft : W0;
+        int yi_ = y0 * stride_y - pad_top;
+
+        for (;;)
+        {
+            float s_0, s_1, s_2;
+            if (dy0 == 3)
+            {
+                for (; x0 < x1; x0++)
+                {
+                    int xi_ = x0 * stride_x - pad_left;
+                    s_0 = s_1 = s_2 = biasval;
+                    for (int k = 0; k < ksize; k++)
+                    {
+                        int dy = yxtab[k * 2];
+                        int yi = yi_ + dy;
+                        int xi = xi_ + yxtab[k * 2 + 1];
+                        float w = weights[k];
+
+                        if ((unsigned) xi < (unsigned) Wi)
+                        {
+                            s_0 += inptr[yi * Wi + xi] * w;
+                            s_1 += inptr[(yi + 1) * Wi + xi] * w;
+                            s_2 += inptr[(yi + 2) * Wi + xi] * w;
+                        }
+                    }
+                    if (ifMinMaxAct)
+                    {
+                        s_0 = std::min(std::max(s_0, minval), maxval);
+                        s_1 = std::min(std::max(s_1, minval), maxval);
+                        s_2 = std::min(std::max(s_2, minval), maxval);
+                    }
+
+                    outptr[x0] = s_0;
+                    outptr[x0 + W0] = s_1;
+                    outptr[x0 + W0 * 2] = s_2;
+                }
+            }
+            else
+            {
+                for (; x0 < x1; x0++)
+                {
+                    int xi_ = x0 * stride_x - pad_left;
+                    s_0 = biasval;
+                    for (int k = 0; k < ksize; k++) {
+                        int dy = yxtab[k * 2];
+                        int yi = yi_ + dy;
+                        int xi = xi_ + yxtab[k * 2 + 1];
+                        float w = weights[k];
+                        if (((unsigned) yi < (unsigned) Hi) & ((unsigned) xi < (unsigned) Wi))
+                            s_0 += inptr[yi * Wi + xi] * w;
+                    }
+                    if (ifMinMaxAct)
+                        s_0 = std::min(std::max(s_0, minval), maxval);
+                    outptr[x0] = s_0;
+                }
+            }
+            if (x0 == W0)
+                break;
+            x1 = inner_xright;
+
+            if (useSIMD)
+            {
+                if (is3x3)
+                {
+                    if (dy0 == 3)
+                    {
+                        for (; x0 <= x1 - VECSZ; x0 += VECSZ)
+                        {
+                            int xi_ = x0 * stride_x - pad_left;
+                            const float *inptr_xi = inptr + Wi * yi_ + xi_;
+
+                            __m256 s0, s1, s2;
+                            __m256 x00 = _mm256_loadu_ps(inptr_xi);
+                            __m256 x01 = _mm256_loadu_ps(inptr_xi + 1);
+                            __m256 x02 = _mm256_loadu_ps(inptr_xi + 2);
+
+                            __m256 x10 = _mm256_loadu_ps(inptr_xi + Wi);
+                            __m256 x11 = _mm256_loadu_ps(inptr_xi + Wi + 1);
+                            __m256 x12 = _mm256_loadu_ps(inptr_xi + Wi + 2);
+
+                            __m256 x20 = _mm256_loadu_ps(inptr_xi + Wi * 2);
+                            __m256 x21 = _mm256_loadu_ps(inptr_xi + Wi * 2 + 1);
+                            __m256 x22 = _mm256_loadu_ps(inptr_xi + Wi * 2 + 2);
+
+                            __m256 x30 = _mm256_loadu_ps(inptr_xi + Wi * 3);
+                            __m256 x31 = _mm256_loadu_ps(inptr_xi + Wi * 3 + 1);
+                            __m256 x32 = _mm256_loadu_ps(inptr_xi + Wi * 3 + 2);
+
+                            __m256 x40 = _mm256_loadu_ps(inptr_xi + Wi * 4);
+                            __m256 x41 = _mm256_loadu_ps(inptr_xi + Wi * 4 + 1);
+                            __m256 x42 = _mm256_loadu_ps(inptr_xi + Wi * 4 + 2);
+
+                            s0 = _mm256_fmadd_ps(x00, w0, vbias);
+                            s1 = _mm256_fmadd_ps(x10, w0, vbias);
+                            s2 = _mm256_fmadd_ps(x20, w0, vbias);
+
+                            s0 = _mm256_fmadd_ps(x01, w1, s0);
+                            s1 = _mm256_fmadd_ps(x11, w1, s1);
+                            s2 = _mm256_fmadd_ps(x21, w1, s2);
+
+                            s0 = _mm256_fmadd_ps(x02, w2, s0);
+                            s1 = _mm256_fmadd_ps(x12, w2, s1);
+                            s2 = _mm256_fmadd_ps(x22, w2, s2);
+
+                            s0 = _mm256_fmadd_ps(x10, w3, s0);
+                            s1 = _mm256_fmadd_ps(x20, w3, s1);
+                            s2 = _mm256_fmadd_ps(x30, w3, s2);
+
+                            s0 = _mm256_fmadd_ps(x11, w4, s0);
+                            s1 = _mm256_fmadd_ps(x21, w4, s1);
+                            s2 = _mm256_fmadd_ps(x31, w4, s2);
+
+                            s0 = _mm256_fmadd_ps(x12, w5, s0);
+                            s1 = _mm256_fmadd_ps(x22, w5, s1);
+                            s2 = _mm256_fmadd_ps(x32, w5, s2);
+
+                            s0 = _mm256_fmadd_ps(x20, w6, s0);
+                            s1 = _mm256_fmadd_ps(x30, w6, s1);
+                            s2 = _mm256_fmadd_ps(x40, w6, s2);
+
+                            s0 = _mm256_fmadd_ps(x21, w7, s0);
+                            s1 = _mm256_fmadd_ps(x31, w7, s1);
+                            s2 = _mm256_fmadd_ps(x41, w7, s2);
+
+                            s0 = _mm256_fmadd_ps(x22, w8, s0);
+                            s1 = _mm256_fmadd_ps(x32, w8, s1);
+                            s2 = _mm256_fmadd_ps(x42, w8, s2);
+
+                            if (ifMinMaxAct)
+                            {
+                                s0 = _mm256_min_ps(_mm256_max_ps(s0, vminval), vmaxval);
+                                s1 = _mm256_min_ps(_mm256_max_ps(s1, vminval), vmaxval);
+                                s2 = _mm256_min_ps(_mm256_max_ps(s2, vminval), vmaxval);
+                            }
+
+                            _mm256_storeu_ps(outptr + x0, s0);
+                            _mm256_storeu_ps(outptr + W0 + x0, s1);
+                            _mm256_storeu_ps(outptr + W0 * 2 + x0, s2);
+                        }
+                    }
+                    else
+                    {
+                        for (; x0 <= x1 - VECSZ; x0 += VECSZ)
+                        {
+                            int xi_ = x0 * stride_x - pad_left;
+                            const float *inptr_xi = inptr + Wi * yi_ + xi_;
+                            __m256 s0 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[0]), w0, vbias);
+                            __m256 s1 = _mm256_mul_ps(_mm256_loadu_ps(inptr_xi + ofstab[1]), w1);
+                            __m256 s2 = _mm256_mul_ps(_mm256_loadu_ps(inptr_xi + ofstab[2]), w2);
+
+                            s0 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[3]), w3, s0);
+                            s1 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[4]), w4, s1);
+                            s2 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[5]), w5, s2);
+
+                            s0 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[6]), w6, s0);
+                            s1 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[7]), w7, s1);
+                            s2 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[8]), w8, s2);
+
+                            s0 = _mm256_add_ps(_mm256_add_ps(s0, s1), s2);
+
+                            if (ifMinMaxAct)
+                                s0 = _mm256_min_ps(_mm256_max_ps(s0, vminval), vmaxval);
+                            _mm256_storeu_ps(outptr + x0, s0);
+                        }
+                    }
+                }
+                else
+                {
+                    for (; x0 <= x1 - VECSZ; x0 += VECSZ)
+                    {
+                        int xi_ = x0 * stride_x - pad_left, k = 0;
+                        const float *inptr_xi = inptr + Wi * yi_ + xi_;
+                        __m256 s0 = vbias;
+                        for (; k <= ksize - 4; k += 4)
+                        {
+                            __m256 v0 = _mm256_loadu_ps(inptr_xi + ofstab[k]);
+                            __m256 v1 = _mm256_loadu_ps(inptr_xi + ofstab[k + 1]);
+                            __m256 v2 = _mm256_loadu_ps(inptr_xi + ofstab[k + 2]);
+                            __m256 v3 = _mm256_loadu_ps(inptr_xi + ofstab[k + 3]);
+
+                            __m256 ww0 = _mm256_set1_ps(weights[k]);
+                            __m256 ww1 = _mm256_set1_ps(weights[k+1]);
+                            __m256 ww2 = _mm256_set1_ps(weights[k+2]);
+                            __m256 ww3 = _mm256_set1_ps(weights[k+3]);
+
+                            s0 = _mm256_fmadd_ps(v0, ww0, s0);
+                            s0 = _mm256_fmadd_ps(v1, ww1, s0);
+                            s0 = _mm256_fmadd_ps(v2, ww2, s0);
+                            s0 = _mm256_fmadd_ps(v3, ww3, s0);
+                        }
+                        for (; k < ksize; k++)
+                            s0 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[k]),
+                                                 _mm256_set1_ps(weights[k]), s0);
+
+                        if (ifMinMaxAct)
+                            s0 = _mm256_min_ps(_mm256_max_ps(s0, vminval), vmaxval);
+                        _mm256_storeu_ps(outptr + x0, s0);
+                    }
+                }
+            }
+
+            if (dy0 == 3)
+            {
+                for (; x0 < x1; x0++)
+                {
+                    int xi_ = x0 * stride_x - pad_left;
+                    const float *inptr_xi = inptr + W0 * yi_ + xi_;
+                    s_0 = s_1 = s_2 = biasval;
+                    for (int k = 0; k < ksize; k++) {
+                        int inp_ofs = ofstab[k];
+                        float w = weights[k];
+                        s_0 += inptr_xi[inp_ofs] * w;
+                        s_1 += inptr_xi[inp_ofs + Wi] * w;
+                        s_2 += inptr_xi[inp_ofs + Wi * 2] * w;
+                    }
+                    if (ifMinMaxAct)
+                    {
+                        s_0 = std::min(std::max(s_0, minval), maxval);
+                        s_1 = std::min(std::max(s_1, minval), maxval);
+                        s_2 = std::min(std::max(s_2, minval), maxval);
+                    }
+
+                    outptr[x0] = s_0;
+                    outptr[x0 + W0] = s_1;
+                    outptr[x0 + W0 * 2] = s_2;
+                }
+            }
+            else
+            {
+                for (; x0 < x1; x0++)
+                {
+                    int xi_ = x0 * stride_x - pad_left;
+                    const float *inptr_xi = inptr + Wi * yi_ + xi_;
+                    s_0 = biasval;
+                    for (int k = 0; k < ksize; k++)
+                    {
+                        s_0 += inptr_xi[ofstab[k]] * weights[k];
+                    }
+                    if (ifMinMaxAct)
+                        s_0 = std::min(std::max(s_0, minval), maxval);
+                    outptr[x0] = s_0;
+                }
+            }
+            x1 = W0;
+        }
+    }
+}
+#endif
+} // namespace opt_AVX2
+} // namespace cv
\ No newline at end of file
diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp
new file mode 100644
index 0000000000..139ea7f6fc
--- /dev/null
+++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp
@@ -0,0 +1,694 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpConv.fx).
+// Here is the original license:
+/*
+    This file is a part of ficus language project.
+    See ficus/LICENSE for the licensing terms
+*/
+
+#include "../../precomp.hpp"
+#include "fast_convolution.hpp"
+#include "fast_convolution.simd.hpp"
+
+namespace cv { namespace dnn {
+
+Ptr<FastConv2d> initFastConv2d(
+        int ngroups,
+        int K, int C, int Hk, int Wk,
+        int stride_x, int stride_y,
+        int dilation_x, int dilation_y,
+        const std::vector<size_t>& pads_begin,
+        const std::vector<size_t>& pads_end,
+        float* srcWeights,
+        float* srcBias)
+{
+    Ptr<FastConv2d> conv = makePtr<FastConv2d>();
+
+    CV_Assert(ngroups > 0 && K > 0 && C > 0 && K % ngroups == 0);
+    CV_Assert(Hk > 0 && Wk > 0);
+    CV_Assert(stride_y > 0 && stride_x > 0);
+    CV_Assert(dilation_y > 0 && dilation_x > 0);
+
+    conv->K = K; conv->C = C; conv->Hk = Hk; conv->Wk = Wk;  // [K, iC, kH, kW]
+    conv->stride_y = stride_y;
+    conv->stride_x = stride_x;
+    conv->dilation_y = dilation_y;
+    conv->dilation_x = dilation_x;
+
+    conv->ngroups = ngroups;
+    conv->pad_top = pads_begin[0];
+    conv->pad_bottom = pads_end[0];
+    conv->pad_left = pads_begin[1];
+    conv->pad_right = pads_end[1];
+
+    // store bias; append some zero's to make sure that
+    // we can always read FAST_CONV_MR elements starting from any valid index
+    {
+        int k = 0, nbias = K + FAST_CONV_MR-1;
+        conv->biasBuf.reserve(nbias);
+        float* biasBufPtr = conv->biasBuf.data();
+        for(; k < K; k++)
+            biasBufPtr[k] = srcBias ? srcBias[k] : 0.f;
+        for(; k < nbias; k++)
+            biasBufPtr[k] = 0.f;
+    }
+
+#if CV_NEON // For now, winograd is ARM platform only.
+    if (ngroups == 1 && Hk ==3 && Wk == 3 && stride_x == 1 && stride_y == 1 && dilation_x == 1 && dilation_y ==1
+        && K >= 16 && C >= 16 )
+        conv->ifWinograd63 = true;
+#else
+    conv->ifWinograd63 = false;
+#endif
+
+    if (ngroups > 1 && ngroups == K && ngroups == C)
+    {
+        // for depth-wise convolutions on NCHW data we just preserve the weights in KCHW layout,
+        // but add some padding to make the weights array layout more SIMD-friendly
+        int ksize = Hk*Wk;
+        int padded_ksize = ((ksize + FAST_VEC_NLANES-1)/FAST_VEC_NLANES)*FAST_VEC_NLANES;  // this code aims to let memory fit with vector size.
+        int nweights = C*padded_ksize;
+        conv->weightsBuf.reserve(nweights);
+        float* weightsBufPtr = conv->weightsBuf.data();
+        memset(weightsBufPtr, 0, nweights*sizeof(weightsBufPtr[0]));
+        for(int c = 0; c < C; c++)
+        {
+            for (int k = 0; k < ksize; k++)
+                weightsBufPtr[c*padded_ksize + k] = srcWeights[c*ksize + k];
+        }
+    }
+    else
+    {
+        // The weights are packed as
+        // ngroups x (ceil((K/ngroups)/FAST_CONV_MR)*FAST_CONV_MR) x (Cg*Hk*Wk) x FAST_CONV_MR tensor
+        int Kg = K/ngroups, Cg = max(C/ngroups, 1);
+        int Kg_aligned = ((Kg + FAST_CONV_MR - 1)/FAST_CONV_MR)*FAST_CONV_MR;
+        size_t nweights = ngroups*Kg_aligned*Cg*Hk*Wk;
+        conv->weightsBuf.reserve(nweights);
+        float* weightsBufPtr = conv->weightsBuf.data();
+        memset(weightsBufPtr, 0, nweights*sizeof(weightsBufPtr[0]));
+        float* packed_wptr = weightsBufPtr;
+
+        // pack the weight.
+        for(int g = 0; g < ngroups; g++)
+        {
+            for(int k0 = 0; k0 < Kg_aligned; k0 += FAST_CONV_MR)
+            {
+                int dk = Kg - k0 < FAST_CONV_MR ? Kg - k0 : FAST_CONV_MR;
+                for(int c = 0; c < Cg; c++)
+                {
+                    for(int yx = 0; yx < Hk*Wk; yx++, packed_wptr += FAST_CONV_MR)
+                    {
+                        const float* wptr = srcWeights + ((g*Kg + k0)*Cg + c)*Hk*Wk + yx;
+                        int k = 0;
+                        for(; k < dk; k++, wptr += Cg*Hk*Wk)
+                            packed_wptr[k] = *wptr;
+                        for(; k < FAST_CONV_MR; k++)
+                            packed_wptr[k] = 0.f;
+                    }
+                }
+            }
+        }
+
+        // Prepare Weight for Winograd F(6x6, 3x3)
+        if (conv->ifWinograd63)
+        {
+            initWinograd63(conv, srcWeights, K, C);
+        }
+    }
+    return conv;
+}
+
+static void packInput(float* inpbuf, const float* inptr, int* yxtab, int ksize, int Cg, int Hi, int Wi, int W0,
+                         int pad_top, int pad_left, int stride_x, int stride_y, int yx0, int slice_len,
+                         bool fast_1x1, bool partial0, bool s1d1p0, bool s1d1)
+{
+    const size_t inp_planesize = (size_t)Hi*Wi;
+
+    if (fast_1x1)
+    {
+        /*
+           super-fast branch for 1x1 convolutions with sy=sx=1.
+           in this case each feature plane can be safely treated
+           as 1D array and we just extract next portion
+           of FAST_CONV_NR elements from each feature plane and
+           put it together.
+        */
+        inptr += yx0;
+        if (!partial0)
+        {
+            // Make special branch where memcpy() is called with a constant buffer size.
+            // Compilers will likely unroll this loop properly.
+            for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += FAST_CONV_NR)
+                memcpy(inpbuf, inptr, FAST_CONV_NR * sizeof(inpbuf[0]));
+        }
+        else
+        {
+            for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += FAST_CONV_NR)
+            {
+                memcpy(inpbuf, inptr, slice_len * sizeof(inpbuf[0]));
+                memset(inpbuf + slice_len, 0, (FAST_CONV_NR - slice_len) * sizeof(inpbuf[0]));
+            }
+        }
+    }
+    else if (s1d1p0)
+    {
+        /*
+         slower, but still fast branch for sy=sx=1, dy=dx=1 and without padding,
+         in this case we copy data from input tensors by chunks.
+         */
+        for (int c = 0; c < Cg; c++)
+        {
+            float *inpbuf_c = inpbuf + c * (FAST_CONV_NR * ksize);
+            const float *inptr_c = inptr + c * inp_planesize;
+
+            for (int k = 0; k < ksize; k++)
+            {
+                int y0 = yx0 / W0, x0 = yx0 % W0;
+                int yi = y0 + yxtab[k * 2], xi = x0 + yxtab[k * 2 + 1];
+                float *inpbuf_k = inpbuf_c + k * FAST_CONV_NR;
+                int xi_0 = yxtab[k * 2 + 1];
+
+                int i = 0;
+                for (; i < slice_len;)
+                {
+                    const float *inptr_k = inptr_c + yi * Wi + xi;
+                    int copy_len = std::min(slice_len - i, W0 - x0);
+                    int di_z = (slice_len == i + copy_len) ? FAST_CONV_NR - slice_len : 0;
+
+                    memcpy(inpbuf_k + i,
+                           inptr_k,
+                           copy_len * sizeof(inpbuf_k[0]));
+
+                    memset(inpbuf_k + i + copy_len,
+                           0, di_z * sizeof(inpbuf_k[0]));
+
+                    i += copy_len;
+                    x0 = 0;
+                    xi = xi_0;
+                    yi++;
+                }
+            }
+        }
+    }
+    else if (s1d1)
+    {
+        /*
+         slower, but still fast branch for sy=sx=1, dy=dx=1.
+         in this case we copy data from input tensors by chunks and
+         interleave the data in inpbuf with 0's
+         (that correspond to the padding elements) when necessary
+         */
+        int y0 = yx0 / W0, x0 = yx0 % W0;
+        for (int c = 0; c < Cg; c++)
+        {
+            float *inpbuf_c = inpbuf + c * (FAST_CONV_NR * ksize);
+            const float *inptr_c = inptr + c * inp_planesize;
+
+            for (int k = 0; k < ksize; k++)
+            {
+                int x0_tmp = x0;
+
+                int xi_0 = yxtab[k * 2 + 1] - pad_left;
+
+                int yi = y0 + yxtab[k * 2] - pad_top, xi = x0_tmp + xi_0;
+                float *inpbuf_k = inpbuf_c + k * FAST_CONV_NR;
+
+                int i = 0;
+                for (; i < slice_len;) {
+                    int copyLen = std::min(slice_len - i, W0 - x0_tmp);
+
+                    int di_z = (i + copyLen == slice_len) ? FAST_CONV_NR - slice_len
+                                                          : 0; // The final padding.
+                    // pad_top or pad bottom
+                    if (yi < 0 || yi > Hi - 1)
+                    {
+                        memset(inpbuf_k + i,
+                               0, (copyLen + di_z) * sizeof(inpbuf_k[0]));
+                        i += copyLen + di_z;
+                    }
+                    else
+                    {
+                        int x_pad_left = 0, x_pad_right = 0;
+
+                        // pad_left
+                        if (xi < 0)
+                        {
+                            x_pad_left = std::min(-xi, copyLen);
+                            xi = 0;
+                            copyLen -= x_pad_left;
+                        }
+
+                        memset(inpbuf_k + i,
+                               0, x_pad_left * sizeof(inpbuf_k[0]));
+                        i += x_pad_left;
+
+                        // pad right
+                        if (xi + copyLen > Wi)
+                        {
+                            if (xi > Wi)
+                            {
+                                x_pad_right = copyLen;
+                                copyLen = 0;
+                            }
+                            else
+                            {
+                                x_pad_right = std::min(xi + copyLen - Wi, copyLen);
+                                copyLen -= x_pad_right;
+                            }
+                        }
+
+                        CV_Assert(copyLen >= 0);
+
+                        const float *inptr_k = inptr_c + yi * Wi + xi;
+                        memcpy(inpbuf_k + i,
+                               inptr_k,
+                               copyLen * sizeof(inpbuf_k[0]));
+
+                        i += copyLen;
+
+                        // pad_right and the final padding.
+                        memset(inpbuf_k + i,
+                               0, (di_z + x_pad_right) * sizeof(inpbuf_k[0]));
+                        i += x_pad_right + di_z;
+                    }
+
+                    x0_tmp = 0;
+                    xi = xi_0;
+                    yi++;
+                }
+            }
+        }
+    }
+    else
+    {
+        int y0_ = yx0 / W0, x0_ = yx0 - y0_ * W0;
+        for (int k = 0; k < ksize; k++)
+        {
+            int dy = yxtab[k * 2], dx = yxtab[k * 2 + 1];
+            int i = 0, y0 = y0_, x0 = x0_;
+            for (; i < FAST_CONV_NR;)
+            {
+                float *inpbuf_ki = inpbuf + k * FAST_CONV_NR + i;
+                int yi = y0 * stride_y + dy - pad_top;
+                int xi = x0 * stride_x + dx - pad_left;
+
+                if ((unsigned) yi < (unsigned) Hi &&
+                    (unsigned) xi < (unsigned) Wi)
+                {
+                    const float *inptr_ki = inptr + yi * Wi + xi;
+                    if (i + 4 <= FAST_CONV_NR && x0 + 4 <= W0 && xi + stride_x * 4 <= Wi)
+                    {
+                        if (stride_x == 2) {
+                            for (int c = 0; c < Cg; c++, inpbuf_ki += FAST_CONV_NR *
+                                                                      ksize, inptr_ki += inp_planesize)
+                            {
+                                float t0 = inptr_ki[0], t1 = inptr_ki[2];
+                                float t2 = inptr_ki[4], t3 = inptr_ki[6];
+                                inpbuf_ki[0] = t0;
+                                inpbuf_ki[1] = t1;
+                                inpbuf_ki[2] = t2;
+                                inpbuf_ki[3] = t3;
+                            }
+                        }
+                        else
+                        {
+                            for (int c = 0; c < Cg; c++, inpbuf_ki += FAST_CONV_NR *
+                                                                      ksize, inptr_ki += inp_planesize)
+                            {
+                                float t0 = inptr_ki[0], t1 = inptr_ki[stride_x];
+                                float t2 = inptr_ki[stride_x * 2], t3 = inptr_ki[stride_x * 3];
+                                inpbuf_ki[0] = t0;
+                                inpbuf_ki[1] = t1;
+                                inpbuf_ki[2] = t2;
+                                inpbuf_ki[3] = t3;
+                            }
+                        }
+                        i += 4;
+                        x0 += 4;
+                    }
+                    else
+                    {
+                        for (int c = 0; c < Cg; c++, inpbuf_ki += FAST_CONV_NR *
+                                                                  ksize, inptr_ki += inp_planesize)
+                            *inpbuf_ki = *inptr_ki;
+                        i++;
+                        x0++;
+                    }
+                }
+                else
+                {
+                    for (int c = 0; c < Cg; c++, inpbuf_ki += FAST_CONV_NR * ksize)
+                        inpbuf_ki[0] = 0.f;
+                    i++;
+                    x0++;
+                }
+                int mask = x0 >= W0;
+                y0 += mask;
+                x0 &= mask - 1;
+            }
+        }
+    }
+}
+
+static void matMulCompute(float* outptr0, float* inpbuf_task, float* cbuf, const Ptr<FastConv2d>& conv, int HkWkCg,
+                          int k0, int k1, int yx0, int yx1, size_t out_planesize, int g, int Kg, int Kg_aligned,
+                          bool partial0, ActivationLayer*& activ, float minval, float maxval, bool ifMinMaxAct)
+{
+    int outstep0 = out_planesize;
+
+    for (int k = k0; k < k1; k += FAST_CONV_MR, outptr0 += outstep0 * FAST_CONV_MR)
+    {
+        int dk = Kg - k < FAST_CONV_MR ? Kg - k : FAST_CONV_MR;
+        bool partial = partial0 || dk < FAST_CONV_MR;
+        float *outptr = outptr0;
+
+        int outstep = outstep0;
+        if (partial)
+        {
+            outptr = cbuf;
+            outstep = FAST_CONV_NR;
+        }
+
+
+#if CV_TRY_AVX2
+        if (conv->useAVX2)
+            opt_AVX2::convBlock_AVX2( HkWkCg, conv->weightsBuf.data() + (g * Kg_aligned + k) * HkWkCg,
+                                  inpbuf_task, outptr, outstep, conv->biasBuf.data() + Kg * g + k,
+                                  minval, maxval, ifMinMaxAct);
+        else
+#endif
+#if CV_TRY_NEON
+        if (conv->useNEON)
+            opt_NEON::convBlock_NEON(HkWkCg, conv->weightsBuf.data() + (g * Kg_aligned + k) * HkWkCg,
+                                 inpbuf_task, outptr, outstep, conv->biasBuf.data() + Kg * g + k,
+                                 minval, maxval, ifMinMaxAct);
+        else
+#endif
+            convBlock(HkWkCg, conv->weightsBuf.data() + (g * Kg_aligned + k) * HkWkCg,
+                            inpbuf_task, outptr, outstep, conv->biasBuf.data() + Kg * g + k,
+                            minval, maxval, ifMinMaxAct);
+
+        // activation
+        if (activ)
+            activ->forwardSlice(outptr, outptr, yx1 - yx0, outstep, Kg * g + k,
+                                Kg * g + k + dk);
+
+        if (partial)
+        {
+            for (int i = 0; i < dk; i++)
+                memcpy(outptr0 + i * outstep0, cbuf + i * FAST_CONV_NR,
+                       (yx1 - yx0) * sizeof(cbuf[0]));
+        }
+    }
+}
+
+void runFastConv2d(InputArray _input, OutputArray _output,
+                   const Ptr<FastConv2d>& conv, int ntasks, const Ptr<ActivationLayer>& actLayer)
+{
+    Mat input = _input.getMat();
+    Mat output = _output.getMat();
+    MatShape inputShape = shape(input);
+    MatShape outputShape = shape(output);
+    CV_Assert(inputShape.size() == 4 && outputShape.size() == 4);
+
+    ActivationLayer* activ = 0;
+    float minval = -FLT_MAX, maxval = FLT_MAX;
+    bool ifMinMaxAct = false;
+    if (actLayer)
+    {
+        Ptr<ReLULayer> activ_relu = actLayer.dynamicCast<ReLULayer>();
+        Ptr<ReLU6Layer> activ_relu6 = actLayer.dynamicCast<ReLU6Layer>();
+
+        if (!activ_relu.empty())
+        {
+            if (activ_relu->negativeSlope == 0.0f)
+            {
+                minval = 0.0f;
+                ifMinMaxAct = true;
+                activ = nullptr;
+            }
+            else // Leaky ReLU
+            {
+                activ = actLayer.get();
+            }
+        }
+        else if (!activ_relu6.empty())
+        {
+            minval = activ_relu6->minValue;
+            maxval = activ_relu6->maxValue;
+
+            ifMinMaxAct = true;
+            activ = nullptr;
+        }
+        else
+            activ = actLayer.get();
+    }
+    else
+        activ = nullptr;
+
+    if (conv->ngroups  > 1 && conv->ngroups == conv->K && conv->ngroups == conv->C)
+    {
+        return runDepthwise(input, output, conv, minval, maxval, activ, ifMinMaxAct);
+    }
+
+#if CV_NEON
+    if ( conv->ifWinograd63
+         && inputShape[2] > 12 && inputShape[3] > 12
+         && inputShape[2] < 120 && inputShape[3] < 120 )
+    {
+        // In general, for winograd branch, more cores will give better performance.
+        int maxNumThread = std::max(getNumThreads(), 1);
+        if (runWinograd63(input, output, conv, maxNumThread, minval, maxval, activ, ifMinMaxAct))
+            return;
+    }
+#endif
+
+    float* inp = input.ptr<float>();
+    float* out = output.ptr<float>();
+
+    int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3];  // [N, C, H, W]
+    int K = conv->K, Hk = conv->Hk, Wk = conv->Wk;
+    int H0 = outputShape[2], W0 = outputShape[3], ngroups = conv->ngroups;         // ngroups
+    int Cg = C/ngroups, Kg = K/ngroups;
+    int Kg_nblocks = (Kg + FAST_CONV_MR-1)/FAST_CONV_MR, Kg_aligned = Kg_nblocks*FAST_CONV_MR; // align to MR
+
+    const size_t inp_planesize = (size_t)Hi*Wi;
+    const size_t out_planesize = (size_t)H0*W0;
+
+    int pad_top = conv->pad_top, pad_bottom = conv->pad_bottom;
+    int pad_left = conv->pad_left;
+    int pad_right = conv->pad_right;
+
+    int stride_y = conv->stride_y, stride_x = conv->stride_x;
+    int dilation_y = conv->dilation_y, dilation_x = conv->dilation_x;
+
+    int ksize = Hk * Wk;
+    bool s1d1 = stride_x == 1 && stride_y == 1 && dilation_x == 1 && dilation_y == 1;
+    bool s1d1p0 = s1d1 && pad_top == 0 && pad_left ==0 && pad_bottom == 0 && pad_right == 0;
+    bool fast_1x1 = stride_x == 1 && stride_y == 1 && ksize == 1;
+    int HkWkCg = Hk*Wk*Cg;
+
+    enum { VEC_ALIGN = 8, DFT_TYPE = CV_32F };
+    size_t taskbufsize = FAST_CONV_NR*HkWkCg; // input buffer
+    size_t taskbufsizeOutput = FAST_CONV_NR * FAST_CONV_MR;
+    size_t inputbufsize = 0;
+    size_t outbufsize = ntasks * taskbufsizeOutput;
+
+    int stripes_per_sample = (out_planesize + FAST_CONV_NR - 1)/FAST_CONV_NR; // align to NR
+    size_t hw_task = stripes_per_sample;
+    size_t hw_aligned = stripes_per_sample * FAST_CONV_NR;
+
+    bool separatedLoop = false;
+
+    if (stripes_per_sample < 4 * ntasks)
+    {
+        // If stripes_per_sample is small, we parallelize on K (output channel).
+        stripes_per_sample = 1;
+
+        // Separated Parallelloop could save much time in packing input data. But it may cost more memory, we use it when batch size is 1.
+        if (N == 1)
+        {
+            separatedLoop = true;
+            inputbufsize = ngroups * hw_aligned * HkWkCg;
+        }
+
+        if (!separatedLoop)
+        {
+            inputbufsize = taskbufsize * ntasks;
+        }
+    }
+    else
+    {
+        // If stripes_per_sample is big, we parallelize on H0*W0.
+        Kg_nblocks = 1;
+        inputbufsize = taskbufsize * ntasks;
+    }
+
+    int Kstripes = Kg_nblocks*stripes_per_sample;
+    int nsubtasks = N*ngroups*Kstripes;
+
+    AutoBuffer<float> inpbuf_all_, outputbuf_;
+    inputbufsize = alignSize(inputbufsize, VEC_ALIGN);
+    inpbuf_all_.allocate(inputbufsize + VEC_ALIGN);
+    float* inpbuf_all = alignPtr(inpbuf_all_.data(), (int)(VEC_ALIGN*sizeof(float)));
+
+    outbufsize = alignSize(outbufsize, VEC_ALIGN);
+    outputbuf_.allocate(outbufsize + VEC_ALIGN);
+    float* output_buf = alignPtr(outputbuf_.data(), (int)(VEC_ALIGN*sizeof(float)));
+
+    std::vector<int> ofstab_(Hk*Wk*3, 0);
+    int* ofstab = ofstab_.data();
+    int* yxtab = ofstab + Hk*Wk;
+
+    for (int y = 0; y < Hk; y++)
+        for( int x = 0; x < Wk; x++)
+        {
+            int k = y*Wk + x;
+            int dy = y*dilation_y, dx = x*dilation_x;
+            yxtab[k*2] = dy;
+            yxtab[k*2+1] = dx;
+            ofstab[k] = dy*Wi + dx;
+        }
+
+    if (ksize == 1)
+    {
+        CV_Assert(pad_left == 0 && pad_right == 0 && pad_top == 0 && pad_bottom == 0);
+        CV_Assert(stride_x != 1 || stride_y != 1 || (H0 == Hi && W0 == Wi));
+    }
+
+    if (separatedLoop)
+    {
+        // For now this branch only handles batch size = 1. Maybe we could support batch size < 10 in the future.
+        // Pack Input data
+        parallel_for_(Range(0, ngroups * hw_task), [&](const Range& r0)
+        {
+            for (int nhwi = r0.start; nhwi < r0.end; nhwi++)
+            {
+                int g = nhwi/hw_task;
+                int hw_i = nhwi % hw_task;
+                int hw0 = hw_i * FAST_CONV_NR;
+                float* inpbuf = inpbuf_all + g * hw_aligned * HkWkCg + hw0 * HkWkCg;
+                const float* inptr = inp + g * Cg * inp_planesize;
+                bool partial0 = hw0 + FAST_CONV_NR > out_planesize? true: false;
+                int slice_len = FAST_CONV_NR;
+
+                if (partial0)
+                    slice_len = out_planesize - hw0;
+
+                packInput(inpbuf, inptr, yxtab, ksize, Cg, Hi, Wi, W0, pad_top, pad_left, stride_x, stride_y,
+                          hw0, slice_len, fast_1x1, partial0, s1d1p0, s1d1);
+            }
+        });
+
+        // Compute
+        parallel_for_(Range(0, ntasks), [&](const Range& r0)
+        {
+            for (int task_id = r0.start; task_id < r0.end; task_id++)
+            {
+                float *cbuf = output_buf + task_id * taskbufsizeOutput;
+                int ngs0 = (int) ((size_t) nsubtasks * task_id / ntasks);
+                int ngs1 = (int) ((size_t) nsubtasks * (task_id + 1) / ntasks);
+                for (int subtask = ngs0; subtask < ngs1;)
+                {
+                    int ng = subtask / Kstripes;
+                    int kyx0 = subtask - ng * Kstripes;
+                    int kyx1 = kyx0 + (ngs1 - subtask);
+                    int n = ng / ngroups, g = ng - n * ngroups;
+
+                    CV_Assert(n <= 1);
+
+                    kyx1 = kyx1 <= Kstripes ? kyx1 : Kstripes; // Guarantee that maximum kyx1 is Kstripes.
+                    subtask += kyx1 - kyx0;
+
+                    int k0 = kyx0 * FAST_CONV_MR;
+                    int k1 = kyx1 * FAST_CONV_MR;
+                    k1 = k1 <= Kg ? k1 : Kg;
+
+
+                    for (int yx0 = 0; yx0 < out_planesize; yx0 += FAST_CONV_NR)
+                    {
+                        float* inpbuf_task = inpbuf_all + g * hw_aligned * HkWkCg + yx0 * HkWkCg;
+                        int yx1 = yx0 + FAST_CONV_NR;
+                        yx1 = yx1 <= out_planesize ? yx1 : out_planesize;
+                        int slice_len = yx1 - yx0;
+                        bool partial0 = slice_len < FAST_CONV_NR;
+
+                        int outstep0 = out_planesize;
+                        size_t outofs = ((n * ngroups + g) * Kg + k0) * outstep0 + yx0;
+                        float *outptr0 = out + outofs;
+
+                        matMulCompute(outptr0, inpbuf_task, cbuf, conv, HkWkCg, k0, k1, yx0, yx1, out_planesize, g,
+                                      Kg, Kg_aligned, partial0, activ, minval, maxval, ifMinMaxAct);
+                    }
+                }
+            }
+        });
+    }
+    else
+    {
+        parallel_for_(Range(0, ntasks), [&](const Range &r0) {
+            for (int task_id = r0.start; task_id < r0.end; task_id++) {
+                float *inpbuf_task = &inpbuf_all[taskbufsize * task_id];
+                float *cbuf = output_buf + task_id * taskbufsizeOutput;
+                int ngs0 = (int) ((size_t) nsubtasks * task_id / ntasks);
+                int ngs1 = (int) ((size_t) nsubtasks * (task_id + 1) / ntasks);
+
+                for (int subtask = ngs0; subtask < ngs1;)
+                {
+                    int ng = subtask / Kstripes;
+                    int kyx0 = subtask - ng * Kstripes;
+                    int kyx1 = kyx0 + (ngs1 - subtask);
+                    int n = ng / ngroups, g = ng - n * ngroups;
+                    size_t inp_plane_ofs = (size_t) (n * ngroups + g) * Cg * inp_planesize;
+                    kyx1 = kyx1 <= Kstripes ? kyx1 : Kstripes; // Guarantee that maximum kyx1 is Kstripes.
+                    subtask += kyx1 - kyx0;
+                    int k0, k1;
+                    int yx0, yx_limit;
+
+                    if (stripes_per_sample == 1)
+                    {
+                        k0 = kyx0 * FAST_CONV_MR;
+                        k1 = kyx1 * FAST_CONV_MR;
+                        k1 = k1 <= Kg ? k1 : Kg;
+                        yx0 = 0;
+                        yx_limit = out_planesize;
+                    }
+                    else
+                    {
+                        k0 = 0;
+                        k1 = Kg;
+                        yx0 = kyx0 * FAST_CONV_NR;
+                        yx_limit = kyx1 * FAST_CONV_NR;
+                        yx_limit = yx_limit < out_planesize ? yx_limit : out_planesize;
+                    }
+
+                    for (; yx0 < yx_limit; yx0 += FAST_CONV_NR)
+                    {
+                        float *inpbuf = inpbuf_task;
+                        const float *inptr = inp + inp_plane_ofs;
+                        int yx1 = yx0 + FAST_CONV_NR;
+                        yx1 = yx1 <= yx_limit ? yx1 : yx_limit;
+                        int slice_len = yx1 - yx0;
+                        bool partial0 = slice_len < FAST_CONV_NR;
+                        packInput(inpbuf, inptr, yxtab, ksize, Cg, Hi, Wi, W0, pad_top, pad_left, stride_x, stride_y,
+                                     yx0, slice_len, fast_1x1, partial0, s1d1p0, s1d1);
+
+                        // 2. do convolution, compute Kg x (yx1 - yx0) part of the output tensor
+                        int outstep0 = out_planesize;
+                        size_t outofs = ((n * ngroups + g) * Kg + k0) * outstep0 + yx0;
+                        float *outptr0 = out + outofs;
+
+                        matMulCompute(outptr0, inpbuf_task, cbuf, conv, HkWkCg, k0, k1, yx0, yx1, out_planesize, g,
+                                      Kg, Kg_aligned, partial0, activ, minval, maxval, ifMinMaxAct);
+                    }
+                }
+            }
+        });
+    }
+}
+
+}} // namespace cv::dnn
\ No newline at end of file
diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp
new file mode 100644
index 0000000000..30c5ea2009
--- /dev/null
+++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp
@@ -0,0 +1,89 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_FAST_CONVOLUTION_HPP
+#define OPENCV_FAST_CONVOLUTION_HPP
+
+#include "opencv2/core/hal/intrin.hpp"
+
+#ifndef FAST_CONV_PRAM
+#define FAST_CONV_PRAM
+#if CV_NEON && __aarch64__  // 32 registers.
+#define FAST_CONV_MR 4
+#define FAST_CONV_NR 28
+enum { FAST_VEC_NLANES=4 };
+#elif CV_NEON              // 16 registers.
+#define FAST_CONV_MR 4
+#define FAST_CONV_NR 12
+enum { FAST_VEC_NLANES=4 };
+#else // SIMD 128, AVX or AVX2
+#define FAST_CONV_MR 4
+#define FAST_CONV_NR 24
+enum { FAST_VEC_NLANES=4 };
+#endif
+#endif
+
+namespace cv {
+namespace dnn {
+
+struct FastConv2d
+{
+    int ngroups;
+    int K, C, Hk, Wk;
+    int stride_y, stride_x;
+    int dilation_y, dilation_x;
+    int pad_top, pad_bottom, pad_left, pad_right;
+
+    std::vector<float> weightsBuf;        // For generic Conv 2D
+    std::vector<float> weightsWino63Buf;  // For Winograd F(6x6, 3x3).
+
+    std::vector<float> biasBuf;
+    bool ifWinograd63 = false;
+    bool useAVX2 = checkHardwareSupport(CPU_AVX2);
+    bool useNEON = checkHardwareSupport(CPU_NEON);
+};
+
+// return a FastConv2d instance.
+Ptr<FastConv2d> initFastConv2d(
+        int ngroups,
+        int K, int C, int Hk, int Wk,
+        int stride_x, int stride_y,
+        int dilation_x, int dilation_y,
+        const std::vector<size_t>& pads_begin,
+        const std::vector<size_t>& pads_end,
+        float* srcWeights,
+        float* srcBias);
+
+// It contains different computing branches, like winograd, 1x1 conv.
+void runFastConv2d(InputArray _input, OutputArray _output,
+                 const Ptr<FastConv2d>& conv, int ntasks, const Ptr<ActivationLayer>& actLayer);
+
+void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv, float minval, float maxval,
+        ActivationLayer* activ, bool ifMinMaxAct);
+
+// winograd init
+void initWinograd63(Ptr<FastConv2d>& conv, float* src_weight, int K, int C);
+
+int runWinograd63(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv, int ntasks,
+                  float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct);
+
+} // namespace dnn
+
+namespace opt_AVX2
+{
+#if CV_TRY_AVX2
+void convBlock_AVX2(int k, const float *a, const float *b,
+        float *c, int ldc, const float *bias,
+        float minval, float maxval, bool ifActiv);
+
+void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights, float biasval, int *ofstab, int *yxtab,
+                float minval, float maxval, int Hi, int Wi, int H0, int W0, int ksize, int pad_top, int pad_left,
+                int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop,
+                int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3);
+#endif
+} // namespace opt_AVX2
+
+} // namespace cv
+
+#endif //OPENCV_FAST_CONVOLUTION_HPP
diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.simd.hpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.simd.hpp
new file mode 100644
index 0000000000..f1542901e7
--- /dev/null
+++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.simd.hpp
@@ -0,0 +1,342 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_FAST_CONVOLUTION_SIMD_HPP
+#define OPENCV_FAST_CONVOLUTION_SIMD_HPP
+
+#include "opencv2/core/hal/intrin.hpp"
+#include <opencv2/core/utils/logger.hpp>
+
+namespace cv {
+namespace dnn {
+
+void convBlock(int k, const float *a, const float *b,
+        float *c, int ldc, const float *bias,
+        float minval, float maxval, bool ifActiv)
+{
+#if CV_SIMD128
+#if FAST_CONV_MR == 4 && FAST_CONV_NR == 24
+    {
+        v_float32x4 c0 = v_setall_f32(bias[0]), c1 = c0, c2 = c0, c3 = c0, c4 = c0, c5 = c0;
+        v_float32x4 c6 = v_setall_f32(bias[1]), c7 = c6, c8 = c6, c9 = c6, c10 = c6, c11 = c6;
+        v_float32x4 c12 = v_setall_f32(bias[2]), c13 = c12, c14 = c12, c15 = c12, c16 = c12, c17 = c12;
+        v_float32x4 c18 = v_setall_f32(bias[3]), c19 = c18, c20 = c18, c21 = c18, c22 = c18, c23 = c18;
+
+        for (int p = 0; p < k; p++, a += FAST_CONV_MR, b += FAST_CONV_NR)
+        {
+            v_float32x4 a0 = v_setall_f32(a[0]);
+            v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8);
+            v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20);
+
+            c0 = v_fma(b0, a0, c0);
+            c1 = v_fma(b1, a0, c1);
+            c2 = v_fma(b2, a0, c2);
+            c3 = v_fma(b3, a0, c3);
+            c4 = v_fma(b4, a0, c4);
+            c5 = v_fma(b5, a0, c5);
+
+            a0  = v_setall_f32(a[1]);
+            c6  = v_fma(b0, a0, c6);
+            c7  = v_fma(b1, a0, c7);
+            c8  = v_fma(b2, a0, c8);
+            c9  = v_fma(b3, a0, c9);
+            c10 = v_fma(b4, a0, c10);
+            c11 = v_fma(b5, a0, c11);
+
+            a0 = v_setall_f32(a[2]);
+            c12 = v_fma(b0, a0, c12);
+            c13 = v_fma(b1, a0, c13);
+            c14 = v_fma(b2, a0, c14);
+            c15 = v_fma(b3, a0, c15);
+            c16 = v_fma(b4, a0, c16);
+            c17 = v_fma(b5, a0, c17);
+
+            a0 = v_setall_f32(a[3]);
+            c18 = v_fma(b0, a0, c18);
+            c19 = v_fma(b1, a0, c19);
+            c20 = v_fma(b2, a0, c20);
+            c21 = v_fma(b3, a0, c21);
+            c22 = v_fma(b4, a0, c22);
+            c23 = v_fma(b5, a0, c23);
+        }
+
+        if (ifActiv) {
+            v_float32x4 vmin = v_setall_f32(minval), vmax = v_setall_f32(maxval);
+            c0 = v_min(v_max(c0, vmin), vmax);
+            c1 = v_min(v_max(c1, vmin), vmax);
+            c2 = v_min(v_max(c2, vmin), vmax);
+            c3 = v_min(v_max(c3, vmin), vmax);
+            c4 = v_min(v_max(c4, vmin), vmax);
+            c5 = v_min(v_max(c5, vmin), vmax);
+            c6 = v_min(v_max(c6, vmin), vmax);
+            c7 = v_min(v_max(c7, vmin), vmax);
+            c8 = v_min(v_max(c8, vmin), vmax);
+            c9 = v_min(v_max(c9, vmin), vmax);
+            c10 = v_min(v_max(c10, vmin), vmax);
+            c11 = v_min(v_max(c11, vmin), vmax);
+            c12 = v_min(v_max(c12, vmin), vmax);
+            c13 = v_min(v_max(c13, vmin), vmax);
+            c14 = v_min(v_max(c14, vmin), vmax);
+            c15 = v_min(v_max(c15, vmin), vmax);
+            c16 = v_min(v_max(c16, vmin), vmax);
+            c17 = v_min(v_max(c17, vmin), vmax);
+            c18 = v_min(v_max(c18, vmin), vmax);
+            c19 = v_min(v_max(c19, vmin), vmax);
+            c20 = v_min(v_max(c20, vmin), vmax);
+            c21 = v_min(v_max(c21, vmin), vmax);
+            c22 = v_min(v_max(c22, vmin), vmax);
+            c23 = v_min(v_max(c23, vmin), vmax);
+        }
+        v_store(c, c0);
+        v_store(c + 4, c1);
+        v_store(c + 8, c2);
+        v_store(c + 12, c3);
+        v_store(c + 16, c4);
+        v_store(c + 20, c5);
+
+        v_store(c + ldc, c6);
+        v_store(c + ldc + 4, c7);
+        v_store(c + ldc + 8, c8);
+        v_store(c + ldc + 12, c9);
+        v_store(c + ldc + 16, c10);
+        v_store(c + ldc + 20, c11);
+
+        v_store(c + ldc * 2, c12);
+        v_store(c + ldc * 2 + 4, c13);
+        v_store(c + ldc * 2 + 8, c14);
+        v_store(c + ldc * 2 + 12, c15);
+        v_store(c + ldc * 2 + 16, c16);
+        v_store(c + ldc * 2 + 20, c17);
+
+        v_store(c + ldc * 3, c18);
+        v_store(c + ldc * 3 + 4, c19);
+        v_store(c + ldc * 3 + 8, c20);
+        v_store(c + ldc * 3 + 12, c21);
+        v_store(c + ldc * 3 + 16, c22);
+        v_store(c + ldc * 3 + 20, c23);
+    }
+#endif
+#else
+    for (int i = 0; i < FAST_CONV_MR; i++)
+    {
+        float beta = bias[i];
+        for (int j = 0; j < FAST_CONV_NR; j++)
+            c[i*ldc + j] = beta;
+    }
+    for (int p = 0; p < k; p++)
+    {
+        for (int i = 0; i < FAST_CONV_MR; i++)
+        {
+            float alpha = a[FAST_CONV_MR*p + i];
+            for (int j = 0; j < FAST_CONV_NR; j++)
+            {
+                c[i*ldc+j] += b[FAST_CONV_NR*p + j]*alpha;
+            }
+        }
+    }
+    if (ifActiv)
+    {
+        for (int i = 0; i < FAST_CONV_MR; i++)
+        {
+            for (int j = 0; j < FAST_CONV_NR; j++)
+            {
+                float v = c[i*ldc + j];
+                v = std::min(std::max(v, minval), maxval);
+                c[i*ldc + j] = v;
+            }
+        }
+    }
+#endif
+}
+} // namespace dnn
+
+namespace opt_NEON
+{
+#if CV_TRY_NEON
+void convBlock_NEON(int k, const float *a, const float *b,
+                float *c, int ldc, const float *bias,
+                float minval, float maxval, bool ifActiv)
+{
+#if FAST_CONV_MR == 4 && FAST_CONV_NR == 12
+    {
+        float32x4_t c0 = vdupq_n_f32(bias[0]), c1 = c0, c2 = c0;
+        float32x4_t c3 = vdupq_n_f32(bias[1]), c4 = c3, c5 = c3;
+        float32x4_t c6 = vdupq_n_f32(bias[2]), c7 = c6, c8 = c6;
+        float32x4_t c9 = vdupq_n_f32(bias[3]), c10 = c9, c11 = c9;
+
+        float32x4_t a0 = vdupq_n_f32(0.0f);
+        float32x4_t b0 = vdupq_n_f32(0.0f), b1 = vdupq_n_f32(0.0f), b2 = vdupq_n_f32(0.0f);
+
+        for (int p = 0; p < k; p++, a += FAST_CONV_MR, b += FAST_CONV_NR)
+        {
+            a0 = vld1q_f32(a);
+            b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);
+
+            c0 = vfmaq_laneq_f32(c0, b0, a0, 0);
+            c1 = vfmaq_laneq_f32(c1, b1, a0, 0);
+            c2 = vfmaq_laneq_f32(c2, b2, a0, 0);
+            c3 = vfmaq_laneq_f32(c3, b0, a0, 1);
+            c4 = vfmaq_laneq_f32(c4, b1, a0, 1);
+            c5 = vfmaq_laneq_f32(c5, b2, a0, 1);
+
+            c6 = vfmaq_laneq_f32(c6, b0, a0, 2);
+            c7 = vfmaq_laneq_f32(c7, b1, a0, 2);
+            c8 = vfmaq_laneq_f32(c8, b2, a0, 2);
+
+            c9 = vfmaq_laneq_f32(c9, b0, a0, 3);
+            c10 = vfmaq_laneq_f32(c10, b1, a0, 3);
+            c11 = vfmaq_laneq_f32(c11, b2, a0, 3);
+        }
+
+        if (ifActiv)
+        {
+            b0 = vdupq_n_f32(minval), b1 = vdupq_n_f32(maxval);
+            c0 = vminq_f32(vmaxq_f32(c0, b0), b1);
+            c1 = vminq_f32(vmaxq_f32(c1, b0), b1);
+            c2 = vminq_f32(vmaxq_f32(c2, b0), b1);
+            c3 = vminq_f32(vmaxq_f32(c3, b0), b1);
+            c4 = vminq_f32(vmaxq_f32(c4, b0), b1);
+            c5 = vminq_f32(vmaxq_f32(c5, b0), b1);
+            c6 = vminq_f32(vmaxq_f32(c6, b0), b1);
+            c7 = vminq_f32(vmaxq_f32(c7, b0), b1);
+            c8 = vminq_f32(vmaxq_f32(c8, b0), b1);
+            c9 = vminq_f32(vmaxq_f32(c9, b0), b1);
+            c10 = vminq_f32(vmaxq_f32(c10, b0), b1);
+            c11 = vminq_f32(vmaxq_f32(c11, b0), b1);
+        }
+        vst1q_f32(c, c0); vst1q_f32(c+4, c1); vst1q_f32(c+8, c2);
+        vst1q_f32(c + ldc, c3); vst1q_f32(c + ldc + 4, c4); vst1q_f32(c + ldc + 8, c5);
+        vst1q_f32(c + ldc*2, c6); vst1q_f32(c + ldc*2 + 4, c7); vst1q_f32(c + ldc*2 + 8, c8);
+        vst1q_f32(c + ldc*3, c9); vst1q_f32(c + ldc*3 + 4, c10); vst1q_f32(c + ldc*3 + 8, c11);
+    }
+#elif FAST_CONV_MR == 4 && FAST_CONV_NR == 28
+    {
+        float32x4_t c0 = vdupq_n_f32(bias[0]), c1 = c0, c2 = c0, c3 = c0, c4 = c0, c5 = c0, c24 = c0;
+        float32x4_t c6 = vdupq_n_f32(bias[1]), c7 = c6, c8 = c6, c9 = c6, c10 = c6, c11 = c6, c25 = c6;
+        float32x4_t c12 = vdupq_n_f32(bias[2]), c13 = c12, c14 = c12, c15 = c12, c16 = c12, c17 = c12, c26 = c12;
+        float32x4_t c18 = vdupq_n_f32(bias[3]), c19 = c18, c20 = c18, c21 = c18, c22 = c18, c23 = c18, c27 = c18;
+
+        float32x4_t a0 = vdupq_n_f32(0.0f);
+        float32x4_t b0 = vdupq_n_f32(0.0f), b1 = vdupq_n_f32(0.0f), b2 = vdupq_n_f32(0.0f);
+
+        for (int p = 0; p < k; p++, a += FAST_CONV_MR) {
+            a0 = vld1q_f32(a);
+            b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);
+            b += 12;
+
+            c0 = vfmaq_laneq_f32(c0, b0, a0, 0);
+            c1 = vfmaq_laneq_f32(c1, b1, a0, 0);
+            c2 = vfmaq_laneq_f32(c2, b2, a0, 0);
+            c6 = vfmaq_laneq_f32(c6, b0, a0, 1);
+            c7 = vfmaq_laneq_f32(c7, b1, a0, 1);
+            c8 = vfmaq_laneq_f32(c8, b2, a0, 1);
+            c12 = vfmaq_laneq_f32(c12, b0, a0, 2);
+            c13 = vfmaq_laneq_f32(c13, b1, a0, 2);
+            c14 = vfmaq_laneq_f32(c14, b2, a0, 2);
+            c18 = vfmaq_laneq_f32(c18, b0, a0, 3);
+            c19 = vfmaq_laneq_f32(c19, b1, a0, 3);
+            c20 = vfmaq_laneq_f32(c20, b2, a0, 3);
+
+            b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);
+            b += 12;
+
+            c3 = vfmaq_laneq_f32(c3, b0, a0, 0);
+            c4 = vfmaq_laneq_f32(c4, b1, a0, 0);
+            c5 = vfmaq_laneq_f32(c5, b2, a0, 0);
+
+            c9 = vfmaq_laneq_f32(c9, b0, a0, 1);
+            c10 = vfmaq_laneq_f32(c10, b1, a0, 1);
+            c11 = vfmaq_laneq_f32(c11, b2, a0, 1);
+
+            c15 = vfmaq_laneq_f32(c15, b0, a0, 2);
+            c16 = vfmaq_laneq_f32(c16, b1, a0, 2);
+            c17 = vfmaq_laneq_f32(c17, b2, a0, 2);
+
+            c21 = vfmaq_laneq_f32(c21, b0, a0, 3);
+
+            b0 = vld1q_f32(b);
+            b += 4;
+
+            c22 = vfmaq_laneq_f32(c22, b1, a0, 3);
+            c23 = vfmaq_laneq_f32(c23, b2, a0, 3);
+
+            c24 = vfmaq_laneq_f32(c24, b0, a0, 0);
+            c25 = vfmaq_laneq_f32(c25, b0, a0, 1);
+            c26 = vfmaq_laneq_f32(c26, b0, a0, 2);
+            c27 = vfmaq_laneq_f32(c27, b0, a0, 3);
+        }
+
+        if (ifActiv) {
+            b0 = vdupq_n_f32(minval), b1 = vdupq_n_f32(maxval);
+            c0 = vminq_f32(vmaxq_f32(c0, b0), b1);
+            c1 = vminq_f32(vmaxq_f32(c1, b0), b1);
+            c2 = vminq_f32(vmaxq_f32(c2, b0), b1);
+            c3 = vminq_f32(vmaxq_f32(c3, b0), b1);
+            c4 = vminq_f32(vmaxq_f32(c4, b0), b1);
+            c5 = vminq_f32(vmaxq_f32(c5, b0), b1);
+            c6 = vminq_f32(vmaxq_f32(c6, b0), b1);
+            c7 = vminq_f32(vmaxq_f32(c7, b0), b1);
+            c8 = vminq_f32(vmaxq_f32(c8, b0), b1);
+            c9 = vminq_f32(vmaxq_f32(c9, b0), b1);
+            c10 = vminq_f32(vmaxq_f32(c10, b0), b1);
+            c11 = vminq_f32(vmaxq_f32(c11, b0), b1);
+            c12 = vminq_f32(vmaxq_f32(c12, b0), b1);
+            c13 = vminq_f32(vmaxq_f32(c13, b0), b1);
+            c14 = vminq_f32(vmaxq_f32(c14, b0), b1);
+            c15 = vminq_f32(vmaxq_f32(c15, b0), b1);
+            c16 = vminq_f32(vmaxq_f32(c16, b0), b1);
+            c17 = vminq_f32(vmaxq_f32(c17, b0), b1);
+            c18 = vminq_f32(vmaxq_f32(c18, b0), b1);
+            c19 = vminq_f32(vmaxq_f32(c19, b0), b1);
+            c20 = vminq_f32(vmaxq_f32(c20, b0), b1);
+            c21 = vminq_f32(vmaxq_f32(c21, b0), b1);
+            c22 = vminq_f32(vmaxq_f32(c22, b0), b1);
+            c23 = vminq_f32(vmaxq_f32(c23, b0), b1);
+            c24 = vminq_f32(vmaxq_f32(c24, b0), b1);
+            c25 = vminq_f32(vmaxq_f32(c25, b0), b1);
+            c26 = vminq_f32(vmaxq_f32(c26, b0), b1);
+            c27 = vminq_f32(vmaxq_f32(c27, b0), b1);
+        }
+        vst1q_f32(c, c0);
+        vst1q_f32(c + 4, c1);
+        vst1q_f32(c + 8, c2);
+        vst1q_f32(c + 12, c3);
+        vst1q_f32(c + 16, c4);
+        vst1q_f32(c + 20, c5);
+        vst1q_f32(c + 24, c24);
+
+        vst1q_f32(c + ldc, c6);
+        vst1q_f32(c + ldc + 4, c7);
+        vst1q_f32(c + ldc + 8, c8);
+        vst1q_f32(c + ldc + 12, c9);
+        vst1q_f32(c + ldc + 16, c10);
+        vst1q_f32(c + ldc + 20, c11);
+        vst1q_f32(c + ldc + 24, c25);
+
+        vst1q_f32(c + ldc * 2, c12);
+        vst1q_f32(c + ldc * 2 + 4, c13);
+        vst1q_f32(c + ldc * 2 + 8, c14);
+        vst1q_f32(c + ldc * 2 + 12, c15);
+        vst1q_f32(c + ldc * 2 + 16, c16);
+        vst1q_f32(c + ldc * 2 + 20, c17);
+        vst1q_f32(c + ldc * 2 + 24, c26);
+
+        vst1q_f32(c + ldc * 3, c18);
+        vst1q_f32(c + ldc * 3 + 4, c19);
+        vst1q_f32(c + ldc * 3 + 8, c20);
+        vst1q_f32(c + ldc * 3 + 12, c21);
+        vst1q_f32(c + ldc * 3 + 16, c22);
+        vst1q_f32(c + ldc * 3 + 20, c23);
+        vst1q_f32(c + ldc * 3 + 24, c27);
+    }
+#else
+#error "unsupported FAST_CONV_MR and/or FAST_CONV_NR in convBlock_NEON."
+#endif
+}
+
+#endif
+} // namespace opt_NEON
+
+} // namespace cv
+#endif //OPENCV_FAST_CONVOLUTION_SIMD_HPP
diff --git a/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp b/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp
new file mode 100644
index 0000000000..7a0720f55e
--- /dev/null
+++ b/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp
@@ -0,0 +1,1351 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+/*
+Winograd-based convolution F(6x6, 3x3).
+The code has been borrowed from ncnn inference engine (https://github.com/Tencent/ncnn)
+and adapted for OpenCV by Zihao Mu.
+
+Below is the original copyright
+*/
+
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "../../precomp.hpp"
+#include "fast_convolution.hpp"
+
+namespace cv { namespace dnn {
+enum
+{
+    WINO_STEP=6,
+    WINO_KSIZE=3,
+    WINO_SIZE= WINO_STEP + WINO_KSIZE - 1,
+    WINO_AREA= WINO_SIZE * WINO_SIZE
+};
+
+#if CV_NEON
+static void winograd_trans_input_F63(float* src, float* dst, int Channle_div4, const int tiles, const int big_step, const int line_step, const int* ofstab0)
+{
+    // const float itm[8][8] = {
+    //     {1.0f,  0.0f, -5.25f,  0.00f,  5.25f,  0.00f, -1.0f, 0.0f},
+    //
+    //     {0.0f,  1.0f,  1.00f, -4.25f, -4.25f,  1.00f,  1.0f, 0.0f},
+    //     {0.0f, -1.0f,  1.00f,  4.25f, -4.25f, -1.00f,  1.0f, 0.0f},
+    //
+    //     {0.0f,  0.5f,  0.25f, -2.50f, -1.25f,  2.00f,  1.0f, 0.0f},
+    //     {0.0f, -0.5f,  0.25f,  2.50f, -1.25f, -2.00f,  1.0f, 0.0f},
+    //
+    //     {0.0f,  2.0f,  4.00f, -2.50f, -5.00f,  0.50f,  1.0f, 0.0f},
+    //     {0.0f, -2.0f,  4.00f,  2.50f, -5.00f, -0.50f,  1.0f, 0.0f},
+    //
+    //     {0.0f, -1.0f,  0.00f,  5.25f,  0.00f, -5.25f,  0.0f, 1.0f}
+    // };
+
+    // 0 = r00 - r06 + (r04 - r02) * 5.25
+    // 7 = r07 - r01 + (r03 - r05) * 5.25
+
+    // 1 = (r02 + r06 - r04 * 4.25) + (r01 - r03 * 4.25 + r05)
+    // 2 = (r02 + r06 - r04 * 4.25) - (r01 - r03 * 4.25 + r05)
+
+    // 3 = (r06 + r02 * 0.25 - r04 * 1.25) + (r01 * 0.5 - r03 * 2.5 + r05 * 2)
+    // 4 = (r06 + r02 * 0.25 - r04 * 1.25) - (r01 * 0.5 - r03 * 2.5 + r05 * 2)
+
+    // reuse r04 * 1.25
+    // reuse r03 * 2.5
+    // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5)
+    // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5)
+
+    float tmp[8][8][FAST_VEC_NLANES];
+    AutoBuffer<float> input_buf0_;
+    input_buf0_.allocate(64 * tiles * FAST_VEC_NLANES);
+
+    float* input_buf0 = input_buf0_.data();
+    memset(input_buf0, 0, 64 * tiles * FAST_VEC_NLANES * sizeof(float ));
+
+    for (int ti = 0; ti < tiles; ti++)
+    {
+        float* input0 = src + ti *  64 * 4;
+        float* input = input0;
+        for (int m = 0; m < 8; m++)
+        {
+            float32x4_t _r00 = vld1q_f32(input);
+            float32x4_t _r01 = vld1q_f32(input + 4);
+            float32x4_t _r02 = vld1q_f32(input + 8);
+            float32x4_t _r03 = vld1q_f32(input + 12);
+            float32x4_t _r04 = vld1q_f32(input + 16);
+            float32x4_t _r05 = vld1q_f32(input + 20);
+            float32x4_t _r06 = vld1q_f32(input + 24);
+            float32x4_t _r07 = vld1q_f32(input + 28);
+
+            float32x4_t _tmp0m = vmlaq_n_f32(vsubq_f32(_r00, _r06), vsubq_f32(_r04, _r02), 5.25f);
+            float32x4_t _tmp7m = vmlaq_n_f32(vsubq_f32(_r07, _r01), vsubq_f32(_r03, _r05), 5.25f);
+            vst1q_f32(tmp[0][m], _tmp0m);
+            vst1q_f32(tmp[7][m], _tmp7m);
+
+            float32x4_t _tmp12a = vmlsq_n_f32(vaddq_f32(_r02, _r06), _r04, 4.25f);
+            float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_r01, _r05), _r03, 4.25f);
+
+            float32x4_t _tmp1m = vaddq_f32(_tmp12a, _tmp12b);
+            float32x4_t _tmp2m = vsubq_f32(_tmp12a, _tmp12b);
+            vst1q_f32(tmp[1][m], _tmp1m);
+            vst1q_f32(tmp[2][m], _tmp2m);
+
+            float32x4_t _tmp34a = vmlsq_n_f32(vmlaq_n_f32(_r06, _r02, 0.25f), _r04, 1.25f);
+            float32x4_t _tmp34b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_r01, 0.5f), _r03, 2.5f), _r05, 2.f);
+
+            float32x4_t _tmp3m = vaddq_f32(_tmp34a, _tmp34b);
+            float32x4_t _tmp4m = vsubq_f32(_tmp34a, _tmp34b);
+            vst1q_f32(tmp[3][m], _tmp3m);
+            vst1q_f32(tmp[4][m], _tmp4m);
+
+            float32x4_t _tmp56a = vmlaq_n_f32(_r06, vmlsq_n_f32(_r02, _r04, 1.25f), 4.f);
+            float32x4_t _tmp56b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_r01, 2.f), _r03, 2.5f), _r05, 0.5f);
+
+            float32x4_t _tmp5m = vaddq_f32(_tmp56a, _tmp56b);
+            float32x4_t _tmp6m = vsubq_f32(_tmp56a, _tmp56b);
+            vst1q_f32(tmp[5][m], _tmp5m);
+            vst1q_f32(tmp[6][m], _tmp6m);
+
+            input += 8 * FAST_VEC_NLANES;
+        }
+
+        float* input_buf00 = input_buf0 + ti * 4;
+        float* input_buf01 = input_buf00 + tiles * 4;
+        float* input_buf02 = input_buf00 + tiles * 8;
+        float* input_buf03 = input_buf00 + tiles * 12;
+        float* input_buf04 = input_buf00 + tiles * 16;
+        float* input_buf05 = input_buf00 + tiles * 20;
+        float* input_buf06 = input_buf00 + tiles * 24;
+        float* input_buf07 = input_buf00 + tiles * 28;
+
+        for (int m = 0; m < 8; m++)
+        {
+            float32x4_t _tmp00 = vld1q_f32(tmp[m][0]);
+            float32x4_t _tmp01 = vld1q_f32(tmp[m][1]);
+            float32x4_t _tmp02 = vld1q_f32(tmp[m][2]);
+            float32x4_t _tmp03 = vld1q_f32(tmp[m][3]);
+            float32x4_t _tmp04 = vld1q_f32(tmp[m][4]);
+            float32x4_t _tmp05 = vld1q_f32(tmp[m][5]);
+            float32x4_t _tmp06 = vld1q_f32(tmp[m][6]);
+            float32x4_t _tmp07 = vld1q_f32(tmp[m][7]);
+
+            float32x4_t _r0tm0 = vmlaq_n_f32(vsubq_f32(_tmp00, _tmp06), vsubq_f32(_tmp04, _tmp02), 5.25f);
+            float32x4_t _r0tm7 = vmlaq_n_f32(vsubq_f32(_tmp07, _tmp01), vsubq_f32(_tmp03, _tmp05), 5.25f);
+
+            float32x4_t _tmp12a = vmlsq_n_f32(vaddq_f32(_tmp02, _tmp06), _tmp04, 4.25f);
+            float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_tmp01, _tmp05), _tmp03, 4.25f);
+
+            float32x4_t _r0tm1 = vaddq_f32(_tmp12a, _tmp12b);
+            float32x4_t _r0tm2 = vsubq_f32(_tmp12a, _tmp12b);
+
+            float32x4_t _tmp34a = vmlsq_n_f32(vmlaq_n_f32(_tmp06, _tmp02, 0.25f), _tmp04, 1.25f);
+            float32x4_t _tmp34b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f);
+
+            float32x4_t _r0tm3 = vaddq_f32(_tmp34a, _tmp34b);
+            float32x4_t _r0tm4 = vsubq_f32(_tmp34a, _tmp34b);
+
+            float32x4_t _tmp56a = vmlaq_n_f32(_tmp06, vmlsq_n_f32(_tmp02, _tmp04, 1.25f), 4.f);
+            float32x4_t _tmp56b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5f);
+
+            float32x4_t _r0tm5 = vaddq_f32(_tmp56a, _tmp56b);
+            float32x4_t _r0tm6 = vsubq_f32(_tmp56a, _tmp56b);
+
+            vst1q_f32(input_buf00,  _r0tm0);
+            vst1q_f32(input_buf01,  _r0tm1);
+            vst1q_f32(input_buf02,  _r0tm2);
+            vst1q_f32(input_buf03, _r0tm3);
+            vst1q_f32(input_buf04, _r0tm4);
+            vst1q_f32(input_buf05, _r0tm5);
+            vst1q_f32(input_buf06, _r0tm6);
+            vst1q_f32(input_buf07, _r0tm7);
+
+            input_buf00 += tiles * 32;
+            input_buf01 += tiles * 32;
+            input_buf02 += tiles * 32;
+            input_buf03 += tiles * 32;
+            input_buf04 += tiles * 32;
+            input_buf05 += tiles * 32;
+            input_buf06 += tiles * 32;
+            input_buf07 += tiles * 32;
+        }
+    }
+
+    // [line Number, input pack]
+    // if InpPack == 8;
+    for (int r = 0; r < 64; r++)
+    {
+        int ti = 0;
+        float* out0 = dst + r * big_step;
+        float* input0 = input_buf0 + 4 * tiles * r;
+
+        // TODO! support tiles > 12
+//#if (ARMV8)
+//        for (; ti + 11 < tiles; ti += 12)
+//        {
+//            float* out1 = out0 + line_step * ofstab0[ti * 2] + Channle_div4 * ofstab0[ti * 2 + 1] * 4;
+////            std::cout<<"ofstab0[ti * 2] = "<<ofstab0[ti * 2]<<", ofstab0[ti * 2 + 1] = "<<ofstab0[ti * 2 + 1]<<std::endl;
+//            float* input1 = input0 + ti * 4;
+//            memcpy(out1, input1, 12 * 4 * sizeof(float ));
+//        }
+//#endif
+        for (; ti + 7 < tiles; ti += 8)
+        {
+            float* out1 = out0 + line_step * ofstab0[ti * 2] + Channle_div4 * ofstab0[ti * 2 + 1] * 4;
+            float* input1 = input0 + ti * 4;
+            memcpy(out1, input1, 8 * 4 * sizeof(float ));
+        }
+
+        for (; ti + 3 < tiles; ti += 4)
+        {
+            float* out1 = out0 + line_step * ofstab0[ti * 2] + Channle_div4 * ofstab0[ti * 2 + 1] * 4;
+            float* input1 = input0 + ti * 4;
+            memcpy(out1, input1, 4 * 4 * sizeof(float ));
+        }
+
+        for (; ti + 1 < tiles; ti += 2)
+        {
+            float* out1 = out0 + line_step * ofstab0[ti * 2] + Channle_div4 * ofstab0[ti * 2 + 1] * 4;
+            float* input1 = input0 + ti * 4;
+            memcpy(out1, input1, 2 * 4 * sizeof(float ));
+        }
+
+        for (; ti < tiles; ti++)
+        {
+            float* out1 = out0 + line_step * ofstab0[ti * 2] + Channle_div4 * ofstab0[ti * 2 + 1] * 4;
+            float* input1 = input0 + ti * 4;
+            memcpy(out1, input1, 1 * 4 * sizeof(float ));
+        }
+    }
+}
+
+static void winograd_trans_output_F63(float* src_, float* bias_, float minval, float maxval, bool ifMinMaxAct)
+{
+    // const float otm[6][8] = {
+    //     {1.0f,  1.0f,   1.0f,   1.0f,   1.0f,  32.0f, 32.0f, 0.0f},
+    //     {0.0f,  1.0f,  -1.0f,   2.0f,  -2.0f,  16.0f,-16.0f, 0.0f},
+    //     {0.0f,  1.0f,   1.0f,   4.0f,   4.0f,   8.0f,  8.0f, 0.0f},
+    //     {0.0f,  1.0f,  -1.0f,   8.0f,  -8.0f,   4.0f, -4.0f, 0.0f},
+    //     {0.0f,  1.0f,   1.0f,  16.0f,  16.0f,   2.0f,  2.0f, 0.0f},
+    //     {0.0f,  1.0f,  -1.0f,  32.0f, -32.0f,   1.0f, -1.0f, 1.0f}
+    // };
+
+    // 0 = r0 + (r1 + r2) + (r3 + r4)     + (r5 + r6) * 32
+    // 1 =      (r1 - r2) + (r3 - r4) * 2 + (r5 - r6) * 16
+    // 2 =      (r1 + r2) + (r3 + r4) * 4 + (r5 + r6) * 8
+    // 3 =      (r1 - r2) + (r3 - r4) * 8 + (r5 - r6) * 4
+    // 4 =      (r1 + r2) + (r3 + r4) * 16+ (r5 + r6) * 2
+    // 5 = r7 + (r1 - r2) + (r3 - r4) * 32+ (r5 - r6)
+
+    float32x4_t bias0 = bias_ ? vld1q_f32(bias_) : vdupq_n_f32(0.f);
+    float tmp[6][8][4];
+
+    for (int m = 0; m < 8; m++)
+    {
+        float* output0 = src_ + 8 * m * FAST_VEC_NLANES;
+
+        float32x4_t _out0tm0 = vld1q_f32(output0);
+        float32x4_t _out0tm1 = vld1q_f32(output0 + FAST_VEC_NLANES * 1);
+        float32x4_t _out0tm2 = vld1q_f32(output0 + FAST_VEC_NLANES * 2);
+        float32x4_t _out0tm3 = vld1q_f32(output0 + FAST_VEC_NLANES * 3);
+        float32x4_t _out0tm4 = vld1q_f32(output0 + FAST_VEC_NLANES * 4);
+        float32x4_t _out0tm5 = vld1q_f32(output0 + FAST_VEC_NLANES * 5);
+        float32x4_t _out0tm6 = vld1q_f32(output0 + FAST_VEC_NLANES * 6);
+        float32x4_t _out0tm7 = vld1q_f32(output0 + FAST_VEC_NLANES * 7);
+
+        float32x4_t _tmp024a = vaddq_f32(_out0tm1, _out0tm2);
+        float32x4_t _tmp135a = vsubq_f32(_out0tm1, _out0tm2);
+
+        float32x4_t _tmp024b = vaddq_f32(_out0tm3, _out0tm4);
+        float32x4_t _tmp135b = vsubq_f32(_out0tm3, _out0tm4);
+
+        float32x4_t _tmp024c = vaddq_f32(_out0tm5, _out0tm6);
+        float32x4_t _tmp135c = vsubq_f32(_out0tm5, _out0tm6);
+
+        float32x4_t _tmp0m = vaddq_f32(vaddq_f32(_out0tm0, _tmp024a), vmlaq_n_f32(_tmp024b, _tmp024c, 32.f));
+        float32x4_t _tmp2m = vmlaq_n_f32(vmlaq_n_f32(_tmp024a, _tmp024b, 4.f), _tmp024c, 8.f);
+        float32x4_t _tmp4m = vmlaq_n_f32(vmlaq_n_f32(_tmp024a, _tmp024b, 16.f), _tmp024c, 2.f);
+        vst1q_f32(tmp[0][m], _tmp0m);
+        vst1q_f32(tmp[2][m], _tmp2m);
+        vst1q_f32(tmp[4][m], _tmp4m);
+
+        float32x4_t _tmp1m = vmlaq_n_f32(vmlaq_n_f32(_tmp135a, _tmp135b, 2.f), _tmp135c, 16.f);
+        float32x4_t _tmp3m = vmlaq_n_f32(vmlaq_n_f32(_tmp135a, _tmp135b, 8.f), _tmp135c, 4.f);
+        float32x4_t _tmp5m = vaddq_f32(vaddq_f32(_out0tm7, _tmp135a), vmlaq_n_f32(_tmp135c, _tmp135b, 32.f));
+        vst1q_f32(tmp[1][m], _tmp1m);
+        vst1q_f32(tmp[3][m], _tmp3m);
+        vst1q_f32(tmp[5][m], _tmp5m);
+    }
+
+    for (int m = 0; m < 6; m++)
+    {
+        float* output0 = src_ + 6 * m * FAST_VEC_NLANES;
+
+        float32x4_t _tmp00 = vld1q_f32(tmp[m][0]);
+        float32x4_t _tmp01 = vld1q_f32(tmp[m][1]);
+        float32x4_t _tmp02 = vld1q_f32(tmp[m][2]);
+        float32x4_t _tmp03 = vld1q_f32(tmp[m][3]);
+        float32x4_t _tmp04 = vld1q_f32(tmp[m][4]);
+        float32x4_t _tmp05 = vld1q_f32(tmp[m][5]);
+        float32x4_t _tmp06 = vld1q_f32(tmp[m][6]);
+        float32x4_t _tmp07 = vld1q_f32(tmp[m][7]);
+
+        float32x4_t _tmp024a = vaddq_f32(_tmp01, _tmp02);
+        float32x4_t _tmp135a = vsubq_f32(_tmp01, _tmp02);
+
+        float32x4_t _tmp024b = vaddq_f32(_tmp03, _tmp04);
+        float32x4_t _tmp135b = vsubq_f32(_tmp03, _tmp04);
+
+        float32x4_t _tmp024c = vaddq_f32(_tmp05, _tmp06);
+        float32x4_t _tmp135c = vsubq_f32(_tmp05, _tmp06);
+
+        float32x4_t _out00 = vaddq_f32(bias0, vaddq_f32(vaddq_f32(_tmp00, _tmp024a), vmlaq_n_f32(_tmp024b, _tmp024c, 32.f)));
+        float32x4_t _out02 = vaddq_f32(bias0, vmlaq_n_f32(vmlaq_n_f32(_tmp024a, _tmp024b, 4.f), _tmp024c, 8.f));
+        float32x4_t _out04 = vaddq_f32(bias0, vmlaq_n_f32(vmlaq_n_f32(_tmp024a, _tmp024b, 16.f), _tmp024c, 2.f));
+
+        float32x4_t _out01 = vaddq_f32(bias0, vmlaq_n_f32(vmlaq_n_f32(_tmp135a, _tmp135b, 2.f), _tmp135c, 16.f));
+        float32x4_t _out03 = vaddq_f32(bias0, vmlaq_n_f32(vmlaq_n_f32(_tmp135a, _tmp135b, 8.f), _tmp135c, 4.f));
+        float32x4_t _out05 = vaddq_f32(bias0, vaddq_f32(vaddq_f32(_tmp07, _tmp135a), vmlaq_n_f32(_tmp135c, _tmp135b, 32.f)));
+
+        if (ifMinMaxAct)
+        {
+            float32x4_t vmin = vdupq_n_f32(minval), vmax = vdupq_n_f32(maxval);
+            _out00 = vminq_f32(vmaxq_f32(_out00, vmin), vmax);
+            _out01 = vminq_f32(vmaxq_f32(_out01, vmin), vmax);
+            _out02 = vminq_f32(vmaxq_f32(_out02, vmin), vmax);
+            _out03 = vminq_f32(vmaxq_f32(_out03, vmin), vmax);
+            _out04 = vminq_f32(vmaxq_f32(_out04, vmin), vmax);
+            _out05 = vminq_f32(vmaxq_f32(_out05, vmin), vmax);
+        }
+
+        vst1q_f32(output0,                     _out00);
+        vst1q_f32(output0 +     FAST_VEC_NLANES, _out01);
+        vst1q_f32(output0 + 2 * FAST_VEC_NLANES, _out02);
+        vst1q_f32(output0 + 3 * FAST_VEC_NLANES, _out03);
+        vst1q_f32(output0 + 4 * FAST_VEC_NLANES, _out04);
+        vst1q_f32(output0 + 5 * FAST_VEC_NLANES, _out05);
+    }
+}
+
+void initWinograd63(Ptr<FastConv2d>& conv, float* srcWeight, int K, int C)
+{
+    static const float ktm[8][3] = {
+            {1.0f,      0.0f,      0.0f},
+            {-2.0f / 9, -2.0f / 9, -2.0f / 9},
+            {-2.0f / 9, 2.0f / 9, -2.0f / 9},
+            {1.0f / 90, 1.0f / 45, 2.0f / 45},
+            {1.0f / 90, -1.0f / 45, 2.0f / 45},
+            {1.0f / 45, 1.0f / 90, 1.0f / 180},
+            {1.0f / 45, -1.0f / 90, 1.0f / 180},
+            {0.0f, 0.0f, 1.0f}
+    };
+
+    int K_aligned = ((K + FAST_VEC_NLANES - 1)/FAST_VEC_NLANES) * FAST_VEC_NLANES;
+    int C_aligned = ((C + FAST_VEC_NLANES - 1)/FAST_VEC_NLANES) * FAST_VEC_NLANES;
+    const int winoSize = C * WINO_AREA;
+    const int kArea = WINO_KSIZE * WINO_KSIZE;
+    const int kSize = C * kArea;
+
+    // Allocate memory for winograd.
+    int nweights = K_aligned * C_aligned * WINO_AREA;
+
+    conv->weightsWino63Buf.reserve(nweights);
+    float* weightsWino63Ptr = conv->weightsWino63Buf.data();
+    memset(weightsWino63Ptr, 0, nweights*sizeof(weightsWino63Ptr[0]));
+    float* wptrWino = weightsWino63Ptr;
+
+    AutoBuffer<float> kernelTm0_;
+    kernelTm0_.allocate(WINO_AREA * K * C);
+    float *kernelTm = kernelTm0_.data();
+    memset(kernelTm, 0, WINO_AREA * K * C*sizeof(kernelTm[0]));
+
+    // Step1 Transform : size [K, C, 8, 8]
+    parallel_for_(Range(0, K), [&](const Range& r0)
+    {
+        for (int outc = r0.start; outc < r0.end; outc++)
+        {
+            for (int inc = 0; inc < C; inc++)
+            {
+                float *kernel_tm0 = kernelTm + outc * winoSize + inc * WINO_AREA;
+                const float *kernel0 = srcWeight + outc * kSize + inc * kArea;
+
+                // transform kernel, transposed
+                const float *k0 = kernel0;
+                const float *k1 = kernel0 + 3;
+                const float *k2 = kernel0 + 6;
+
+                // h
+                float tmp[8][3];
+                for (int i = 0; i < 8; i++)
+                {
+                    tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
+                    tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
+                    tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
+                }
+
+                // v
+                for (int j = 0; j < 8; j++)
+                {
+                    float *tmpp = &tmp[j][0];
+
+                    for (int i = 0; i < 8; i++)
+                    {
+                        kernel_tm0[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
+                    }
+                }
+            }
+        }
+    });
+
+    // Step2 Pack 4:
+    // If the number of vector registers >= 32 and outch >= 8,
+    // the size = [8*8, K/4/2, C * 2, 4], otherwise [8*8, K/4, C, 4]
+    for (int r = 0; r < 64; r++)
+    {
+        int outc = 0;
+        float* out0 = wptrWino + r * K_aligned * C_aligned;
+        float* tmp0 = kernelTm + r;
+
+#if CV_NEON_AARCH64
+        // Pack 8
+        for (;outc + 7 < K_aligned; outc += 8)
+        {
+            for (int i = 0; i < 8; i++)
+            {
+                int outc_i = outc + i;
+                int offset8 = outc_i % 8;
+                int outc8 = outc_i / 8;
+                float* out1 = out0 + outc8 * 8 * C_aligned + offset8;
+
+                if (outc_i >= K)
+                {
+                    continue;
+                }
+                else
+                {
+                    float* tmp1 = tmp0 + outc_i * 64 * C;
+
+                    for (int inc = 0; inc < C_aligned; inc++)
+                    {
+                        if (inc >= C)
+                            continue;
+
+                        out1[inc * 8] = tmp1[inc * 64];
+                    }
+                }
+            }
+        }
+#endif
+
+        // Pack 4
+        for (;outc < K_aligned; outc++)
+        {
+            int offset4 = outc % FAST_VEC_NLANES;
+            int outc4 = outc / FAST_VEC_NLANES;
+            float* out1 = out0 + outc4 * 4 * C_aligned + offset4;
+
+            if (outc >= K)
+            {
+                continue;
+            }
+            else
+            {
+                float* tmp1 = tmp0 + outc * 64 * C;
+
+                for (int inc = 0; inc < C_aligned; inc++)
+                {
+                    if (inc >= C)
+                        continue;
+
+                    out1[inc * 4] = tmp1[inc * 64];
+                }
+            }
+
+        }
+    }
+}
+
+int runWinograd63(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv, int ntasks, float minval,
+        float maxval, ActivationLayer* activ, bool ifMinMaxAct)
+{
+    Mat input = _input.getMat();
+    Mat output = _output.getMat();
+
+    MatShape inputShape = shape(input);
+    MatShape outputShape = shape(output);
+    CV_Assert(inputShape.size() == 4 && outputShape.size() == 4);
+
+    int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3];  // [N, C, H, W]
+    int K = conv->K;
+    int H0 = outputShape[2], W0 = outputShape[3];
+
+    // Allocate the right memory size for output.
+    // H and W is integer of 6. the output HxW is integer of 6x6
+    int H_tiles = ((H0 + 5) / 6);
+    int W_tiles = ((W0 + 5) / 6);
+    int tiles = H_tiles * W_tiles;
+
+    int H0_align = H_tiles * 6;
+    int W0_align = W_tiles * 6;
+
+    int Hi_align = H0_align + 2;
+    int Wi_align = W0_align + 2;
+
+    int pad_top = conv->pad_top, pad_bottom = Hi_align - pad_top - Hi;
+    int pad_left = conv->pad_left, pad_right = Wi_align - pad_left - Wi;
+
+    int in_top = pad_top, in_bottom = Hi_align - pad_bottom;
+    int in_left = pad_left, in_right = Wi_align - pad_right;
+
+    CV_Assert(in_bottom >= in_top && in_right >= in_left);
+
+    int C_aligned = ((C + FAST_VEC_NLANES - 1)/FAST_VEC_NLANES) * FAST_VEC_NLANES;
+    int K_aligned = ((K + FAST_VEC_NLANES - 1)/FAST_VEC_NLANES) * FAST_VEC_NLANES;
+
+    int inpPack = 0;
+    int lineNum =0;
+
+    // TODO! tiles > 12
+//#if CV_NEON_AARCH64
+//    if (tiles >= 12)
+//    {
+//        inpPack = 12;
+//        lineNum = tiles / 12 + (tiles % 12) / 8 + (tiles % 12 % 8) / 4 + (tiles % 12 % 4) / 2 + tiles % 12 % 2;
+//    }
+//    else
+//#endif
+    if (tiles >= 8)
+    {
+        inpPack = 8;
+        lineNum = tiles / 8 + (tiles % 8) / 4 + (tiles % 4) / 2 + tiles % 2;
+    }
+    else
+    if (tiles >= 4)
+    {
+        inpPack = 4;
+        lineNum = tiles / 4 + (tiles % 4) / 2 + tiles % 2;
+    }
+    else if (tiles >= 2)
+    {
+        inpPack = 2;
+        lineNum = tiles / 2 + tiles % 2;
+    }
+    else // tiles >= 1
+    {
+        inpPack = 1;
+        lineNum = tiles;
+    }
+    CV_Assert(lineNum > 0 && inpPack > 0);
+    std::vector<int> ofstab0_(tiles * 2, 0);
+    int* ofstab0 = ofstab0_.data(); // [line Number, input pack]
+
+    int tiles_tmp = tiles;
+    int line_0 = 0;
+
+    int* ofstab_tmp = ofstab0;
+    int big_step = inpPack * C_aligned * lineNum;
+    int line_step = inpPack * C_aligned;
+
+    std::vector<int> linePackList = {12, 8, 4, 2, 1};
+    auto iter = std::find(linePackList.begin(), linePackList.end(), inpPack);
+    CV_Assert(iter != linePackList.end());
+    int ptr = iter - linePackList.begin();
+
+    while (ptr < linePackList.size() && tiles_tmp != 0)
+    {
+        if (tiles_tmp >= linePackList[ptr])
+        {
+            int num = tiles_tmp / linePackList[ptr];
+            for (int i = 0; i < num; i ++)
+            {
+                for (int j = 0; j < linePackList[ptr]; j++)
+                {
+                    ofstab_tmp[0] = line_0;
+                    ofstab_tmp[1] = linePackList[ptr];
+                    ofstab_tmp += 2;
+                }
+                line_0++;
+            }
+            tiles_tmp -= num * linePackList[ptr];
+        }
+        else
+        {
+            ptr++;
+        }
+    }
+
+    const size_t out_planesize = (size_t)H0*W0;
+
+    size_t inputbuf_size = inpPack * C_aligned * lineNum * 64;
+    size_t inputbufCn_size = ntasks * tiles * 4 * 8 * 8;
+
+    size_t outputbuf_size = tiles * K_aligned * 8 * 8;
+    size_t outputCnbuf_size = ntasks * 8 * 8 * 4;
+
+    AutoBuffer<float> inputbuf0_, inputCnbuf0_, outputbuf0_, outputCnbuf0_;
+
+    inputbuf0_.allocate(inputbuf_size);
+    float* inputbuf0 = alignPtr(inputbuf0_.data(), (int)(sizeof(float)));
+    memset(inputbuf0, 0, inputbuf_size * sizeof(float ));
+
+    inputCnbuf0_.allocate(inputbufCn_size);
+    float* inputCnbuf0 = inputCnbuf0_.data();
+
+    outputbuf0_.allocate(outputbuf_size);
+    float* outputbuf0 = outputbuf0_.data();
+
+    outputCnbuf0_.allocate(outputCnbuf_size);
+    float* outputCnbuf0 = outputCnbuf0_.data();
+
+    // Input Parallel For
+    float* weight_ptr0 = conv->weightsWino63Buf.data();
+    for (int bn = 0; bn < N; bn++)
+    {
+        float* input_ptr0 = input.ptr<float>() + bn * Hi * Wi * C;
+        float* output_ptr0 = output.ptr<float>() + bn * out_planesize * K;
+
+        // Transform Input
+        //int taskItemLen = C_aligned/4/ntasks;
+        int C_aligned_div4 = C_aligned/4;
+
+        parallel_for_(Range(0, ntasks), [&](const Range& range)
+        {
+            for (int task_i = range.start; task_i < range.end; task_i++)
+            {
+                float *inpCnbuf = inputCnbuf0 + tiles * 256 * task_i;
+                for (int inc4 = task_i; inc4 < C_aligned_div4; inc4 += ntasks)
+                {
+                    for (int cn = 0; cn < 4; cn++)
+                    {
+                        if (cn + inc4 * 4 >= C)
+                        {
+                            // set value to zero
+                            for (int ti = 0; ti < tiles; ti++)
+                            {
+                                float *inpCnbuf_i = inpCnbuf + ti * 4 * 64 + cn;
+
+                                for (int i = 0; i < 8; i++)
+                                {
+                                    inpCnbuf_i[0] = 0.0f;
+                                    inpCnbuf_i[4] = 0.0f;
+                                    inpCnbuf_i[8] = 0.0f;
+                                    inpCnbuf_i[12] = 0.0f;
+
+                                    inpCnbuf_i[16] = 0.0f;
+                                    inpCnbuf_i[20] = 0.0f;
+                                    inpCnbuf_i[24] = 0.0f;
+                                    inpCnbuf_i[28] = 0.0f;
+
+                                    inpCnbuf_i += 4 * 8;
+                                }
+                            }
+                        }
+                        else
+                        {
+                            float *input_ptr = input_ptr0 + (inc4 * 4 + cn) * Hi * Wi;
+
+                            for (int ti = 0; ti < tiles; ti++)
+                            {
+                                float *input_buf0_i = inpCnbuf + ti * 256 + cn;
+
+                                int hi = ti / W_tiles;
+                                int wi = ti % W_tiles;
+
+                                int h_top = hi * 6, h_bottom = hi * 6 + 8;
+                                int w_left = wi * 6, w_right = wi * 6 + 8;
+
+                                for (int h = h_top; h < h_bottom; h++)
+                                {
+                                    if (h >= in_bottom || h < in_top)
+                                    {
+                                        input_buf0_i[0] = 0.0f;
+                                        input_buf0_i[4] = 0.0f;
+                                        input_buf0_i[8] = 0.0f;
+                                        input_buf0_i[12] = 0.0f;
+
+                                        input_buf0_i[16] = 0.0f;
+                                        input_buf0_i[20] = 0.0f;
+                                        input_buf0_i[24] = 0.0f;
+                                        input_buf0_i[28] = 0.0f;
+
+                                        input_buf0_i += 32;
+                                        continue;
+                                    }
+
+                                    for (int w = w_left; w < w_right; w++)
+                                    {
+                                        if (w >= in_right || w < in_left)
+                                        {
+                                            input_buf0_i[0] = 0.0f;
+                                            input_buf0_i += 4;
+                                            continue;
+                                        }
+                                        input_buf0_i[0] = input_ptr[(h - pad_top) * Wi + w - pad_left];
+                                        input_buf0_i += 4;
+                                    }
+                                }
+                            }
+                        }
+                    }
+
+                    // Transfor Compute BdB^T
+                    winograd_trans_input_F63(inpCnbuf, inputbuf0, inc4, tiles, big_step, line_step, ofstab0);
+                }
+            }
+        });
+
+        // Matrix multiplication 8 channel
+        int K_div8 = 0;
+
+#if CV_NEON_AARCH64
+        K_div8 = K_aligned/8;
+
+        parallel_for_(Range(0, K_div8), [&](const Range &range){
+        for (int outcn = range.start; outcn < range.end; outcn ++)
+        {
+            float* output_tmp = outputbuf0 + tiles * outcn * 8;
+            float* kernel_tmp = weight_ptr0 + outcn * 8 * C_aligned;
+            for (int r = 0; r < 64; r++)
+            {
+                float* input_tm = inputbuf0 + r * big_step;
+                float* output0_tm = output_tmp + tiles * K_aligned * r;
+                float* output1_tm = output0_tm + tiles * 4;
+                float* kernel_tm_i = kernel_tmp + r * C_aligned * K_aligned;
+
+                int ti = 0;
+                for (; ti + 7 < tiles; ti += 8)
+                {
+                    const float* r0 = input_tm + ofstab0[ti * 2] * line_step;
+                    const float* k01 = kernel_tm_i;
+
+                    int nn = C_aligned/4;
+
+                    // init 32 registers. FMA/load ratio = 64/16
+                    float32x4_t r00 = vdupq_n_f32(0.0f), r01 = r00, r02 = r00, r03 = r00;
+                    float32x4_t r04 = r00, r05 = r00, r06 = r00, r07 = r00;
+                    float32x4_t r08 = r00, r09 = r00, r10 = r00, r11 = r00;
+                    float32x4_t r12 = r00, r13 = r00, r14 = r00, r15 = r00;
+                    float32x4_t r16 = r00, r17 = r00, r18 = r00, r19 = r00;
+                    float32x4_t r20 = r00, r21 = r00, r22 = r00, r23 = r00;
+                    float32x4_t r24 = r00, r25 = r00, r26 = r00, r27 = r00;
+                    float32x4_t r28 = r00, r29 = r00, r30 = r00, r31 = r00;
+
+                    for(;nn > 0; nn--)
+                    {
+                        r00 = vld1q_f32(r0), r01 = vld1q_f32(r0+4), r02 = vld1q_f32(r0+8), r03 = vld1q_f32(r0+12);
+                        r08 = vld1q_f32(k01), r09 = vld1q_f32(k01+4), r10 = vld1q_f32(k01+8), r11 = vld1q_f32(k01+12);
+                        r0 += 16, k01 += 16;
+
+                        r16 = vfmaq_laneq_f32(r16, r08, r00, 0);
+                        r17 = vfmaq_laneq_f32(r17, r08, r01, 0);
+                        r18 = vfmaq_laneq_f32(r18, r08, r02, 0);
+                        r19 = vfmaq_laneq_f32(r19, r08, r03, 0);
+
+                        r04 = vld1q_f32(r0), r05 = vld1q_f32(r0+4), r06 = vld1q_f32(r0+8), r07 = vld1q_f32(r0+12);
+                        r0 += 16;
+
+                        r20 = vfmaq_laneq_f32(r20, r08, r04, 0);
+                        r21 = vfmaq_laneq_f32(r21, r08, r05, 0);
+                        r22 = vfmaq_laneq_f32(r22, r08, r06, 0);
+                        r23 = vfmaq_laneq_f32(r23, r08, r07, 0);
+
+                        r24 = vfmaq_laneq_f32(r24, r09, r00, 0);
+                        r25 = vfmaq_laneq_f32(r25, r09, r01, 0);
+                        r26 = vfmaq_laneq_f32(r26, r09, r02, 0);
+                        r27 = vfmaq_laneq_f32(r27, r09, r03, 0);
+                        r28 = vfmaq_laneq_f32(r28, r09, r04, 0);
+                        r29 = vfmaq_laneq_f32(r29, r09, r05, 0);
+                        r30 = vfmaq_laneq_f32(r30, r09, r06, 0);
+                        r31 = vfmaq_laneq_f32(r31, r09, r07, 0);
+
+                        r12 = vld1q_f32(k01), r13 = vld1q_f32(k01+4), r14 = vld1q_f32(k01+8), r15 = vld1q_f32(k01+12);
+                        k01 += 16;
+
+                        r16 = vfmaq_laneq_f32(r16, r10, r00, 1);
+                        r17 = vfmaq_laneq_f32(r17, r10, r01, 1);
+                        r18 = vfmaq_laneq_f32(r18, r10, r02, 1);
+                        r19 = vfmaq_laneq_f32(r19, r10, r03, 1);
+                        r20 = vfmaq_laneq_f32(r20, r10, r04, 1);
+                        r21 = vfmaq_laneq_f32(r21, r10, r05, 1);
+                        r22 = vfmaq_laneq_f32(r22, r10, r06, 1);
+                        r23 = vfmaq_laneq_f32(r23, r10, r07, 1);
+
+                        r24 = vfmaq_laneq_f32(r24, r11, r00, 1);
+                        r25 = vfmaq_laneq_f32(r25, r11, r01, 1);
+                        r26 = vfmaq_laneq_f32(r26, r11, r02, 1);
+                        r27 = vfmaq_laneq_f32(r27, r11, r03, 1);
+                        r28 = vfmaq_laneq_f32(r28, r11, r04, 1);
+                        r29 = vfmaq_laneq_f32(r29, r11, r05, 1);
+                        r30 = vfmaq_laneq_f32(r30, r11, r06, 1);
+                        r31 = vfmaq_laneq_f32(r31, r11, r07, 1);
+
+                        r16 = vfmaq_laneq_f32(r16, r12, r00, 2);
+                        r17 = vfmaq_laneq_f32(r17, r12, r01, 2);
+                        r18 = vfmaq_laneq_f32(r18, r12, r02, 2);
+                        r19 = vfmaq_laneq_f32(r19, r12, r03, 2);
+                        r20 = vfmaq_laneq_f32(r20, r12, r04, 2);
+                        r21 = vfmaq_laneq_f32(r21, r12, r05, 2);
+                        r22 = vfmaq_laneq_f32(r22, r12, r06, 2);
+                        r23 = vfmaq_laneq_f32(r23, r12, r07, 2);
+
+                        r24 = vfmaq_laneq_f32(r24, r13, r00, 2);
+                        r25 = vfmaq_laneq_f32(r25, r13, r01, 2);
+                        r26 = vfmaq_laneq_f32(r26, r13, r02, 2);
+                        r27 = vfmaq_laneq_f32(r27, r13, r03, 2);
+                        r28 = vfmaq_laneq_f32(r28, r13, r04, 2);
+                        r29 = vfmaq_laneq_f32(r29, r13, r05, 2);
+                        r30 = vfmaq_laneq_f32(r30, r13, r06, 2);
+                        r31 = vfmaq_laneq_f32(r31, r13, r07, 2);
+
+                        r16 = vfmaq_laneq_f32(r16, r14, r00, 3);
+                        r17 = vfmaq_laneq_f32(r17, r14, r01, 3);
+                        r18 = vfmaq_laneq_f32(r18, r14, r02, 3);
+                        r19 = vfmaq_laneq_f32(r19, r14, r03, 3);
+                        r20 = vfmaq_laneq_f32(r20, r14, r04, 3);
+                        r21 = vfmaq_laneq_f32(r21, r14, r05, 3);
+                        r22 = vfmaq_laneq_f32(r22, r14, r06, 3);
+                        r23 = vfmaq_laneq_f32(r23, r14, r07, 3);
+
+                        r24 = vfmaq_laneq_f32(r24, r15, r00, 3);
+                        r25 = vfmaq_laneq_f32(r25, r15, r01, 3);
+                        r26 = vfmaq_laneq_f32(r26, r15, r02, 3);
+                        r27 = vfmaq_laneq_f32(r27, r15, r03, 3);
+                        r28 = vfmaq_laneq_f32(r28, r15, r04, 3);
+                        r29 = vfmaq_laneq_f32(r29, r15, r05, 3);
+                        r30 = vfmaq_laneq_f32(r30, r15, r06, 3);
+                        r31 = vfmaq_laneq_f32(r31, r15, r07, 3);
+                    }
+
+                    vst1q_f32(output0_tm, r16), vst1q_f32(output0_tm + 4, r17), vst1q_f32(output0_tm + 8, r18), vst1q_f32(output0_tm + 12, r19);
+                    output0_tm += 16;
+                    vst1q_f32(output1_tm, r24), vst1q_f32(output1_tm + 4, r25), vst1q_f32(output1_tm + 8, r26), vst1q_f32(output1_tm + 12, r27);
+                    output1_tm += 16;
+
+                    vst1q_f32(output0_tm, r20), vst1q_f32(output0_tm + 4, r21), vst1q_f32(output0_tm + 8, r22), vst1q_f32(output0_tm + 12, r23);
+                    output0_tm += 16;
+                    vst1q_f32(output1_tm, r28), vst1q_f32(output1_tm + 4, r29), vst1q_f32(output1_tm + 8, r30), vst1q_f32(output1_tm + 12, r31);
+                    output1_tm += 16;
+                }
+
+                for (; ti + 3 < tiles; ti += 4)
+                {
+                    const float* r0 = input_tm + ofstab0[ti * 2] * line_step;
+                    const float* k01 = kernel_tm_i;
+
+                    int nn = C_aligned/4;
+
+                    // init 20 registers. FMA/load ratio = 32/12
+                    float32x4_t r00 = vdupq_n_f32(0.0f), r01 = r00, r02 = r00, r03 = r00;
+                    float32x4_t r08 = r00, r09 = r00, r10 = r00, r11 = r00;
+                    float32x4_t r12 = r00, r13 = r00, r14 = r00, r15 = r00;
+                    float32x4_t r24 = r00, r25 = r00, r26 = r00, r27 = r00;
+                    float32x4_t r28 = r00, r29 = r00, r30 = r00, r31 = r00;
+
+                    for(; nn > 0; nn--)
+                    {
+                        r00 = vld1q_f32(r0), r01 = vld1q_f32(r0+4), r02 = vld1q_f32(r0+8), r03 = vld1q_f32(r0+12);
+                        r08 = vld1q_f32(k01), r09 = vld1q_f32(k01+4), r10 = vld1q_f32(k01+8), r11 = vld1q_f32(k01+12);
+                        r0 += 16, k01 += 16;
+
+                        r24 = vfmaq_laneq_f32(r24, r08, r00, 0);
+                        r25 = vfmaq_laneq_f32(r25, r08, r01, 0);
+                        r26 = vfmaq_laneq_f32(r26, r08, r02, 0);
+                        r27 = vfmaq_laneq_f32(r27, r08, r03, 0);
+
+                        r28 = vfmaq_laneq_f32(r28, r09, r00, 0);
+                        r29 = vfmaq_laneq_f32(r29, r09, r01, 0);
+                        r30 = vfmaq_laneq_f32(r30, r09, r02, 0);
+                        r31 = vfmaq_laneq_f32(r31, r09, r03, 0);
+
+                        r12 = vld1q_f32(k01), r13 = vld1q_f32(k01+4), r14 = vld1q_f32(k01+8), r15 = vld1q_f32(k01+12);
+                        k01 += 16;
+
+                        r24 = vfmaq_laneq_f32(r24, r10, r00, 1);
+                        r25 = vfmaq_laneq_f32(r25, r10, r01, 1);
+                        r26 = vfmaq_laneq_f32(r26, r10, r02, 1);
+                        r27 = vfmaq_laneq_f32(r27, r10, r03, 1);
+
+                        r28 = vfmaq_laneq_f32(r28, r11, r00, 1);
+                        r29 = vfmaq_laneq_f32(r29, r11, r01, 1);
+                        r30 = vfmaq_laneq_f32(r30, r11, r02, 1);
+                        r31 = vfmaq_laneq_f32(r31, r11, r03, 1);
+
+                        r24 = vfmaq_laneq_f32(r24, r12, r00, 2);
+                        r25 = vfmaq_laneq_f32(r25, r12, r01, 2);
+                        r26 = vfmaq_laneq_f32(r26, r12, r02, 2);
+                        r27 = vfmaq_laneq_f32(r27, r12, r03, 2);
+
+                        r28 = vfmaq_laneq_f32(r28, r13, r00, 2);
+                        r29 = vfmaq_laneq_f32(r29, r13, r01, 2);
+                        r30 = vfmaq_laneq_f32(r30, r13, r02, 2);
+                        r31 = vfmaq_laneq_f32(r31, r13, r03, 2);
+
+                        r24 = vfmaq_laneq_f32(r24, r14, r00, 3);
+                        r25 = vfmaq_laneq_f32(r25, r14, r01, 3);
+                        r26 = vfmaq_laneq_f32(r26, r14, r02, 3);
+                        r27 = vfmaq_laneq_f32(r27, r14, r03, 3);
+
+                        r28 = vfmaq_laneq_f32(r28, r15, r00, 3);
+                        r29 = vfmaq_laneq_f32(r29, r15, r01, 3);
+                        r30 = vfmaq_laneq_f32(r30, r15, r02, 3);
+                        r31 = vfmaq_laneq_f32(r31, r15, r03, 3);
+                    }
+
+                    vst1q_f32(output0_tm, r24), vst1q_f32(output0_tm + 4, r25), vst1q_f32(output0_tm + 8, r26), vst1q_f32(output0_tm + 12, r27);
+                    output0_tm += 16;
+                    vst1q_f32(output1_tm, r28), vst1q_f32(output1_tm + 4, r29), vst1q_f32(output1_tm + 8, r30), vst1q_f32(output1_tm + 12, r31);
+                    output1_tm += 16;
+                }
+
+                for (; ti + 1 < tiles; ti += 2)
+                {
+                    const float* r0 = input_tm + ofstab0[ti * 2] * line_step;
+                    const float* k01 = kernel_tm_i;
+
+                    int nn = C_aligned/4;
+
+                    // init 14 registers. FMA/load ratio = 15/10
+                    float32x4_t r00 = vdupq_n_f32(0.0f), r01 = r00;
+                    float32x4_t r08 = r00, r09 = r00, r10 = r00, r11 = r00;
+                    float32x4_t r12 = r00, r13 = r00, r14 = r00, r15 = r00;
+                    float32x4_t r24 = r00, r25 = r00;
+                    float32x4_t r28 = r00, r29 = r00;
+
+                    for (; nn > 0; nn--)
+                    {
+                        r00 = vld1q_f32(r0), r01 = vld1q_f32(r0+4);
+                        r08 = vld1q_f32(k01), r09 = vld1q_f32(k01+4), r10 = vld1q_f32(k01+8), r11 = vld1q_f32(k01+12);
+                        r0 += 8, k01 += 16;
+
+                        r24 = vfmaq_laneq_f32(r24, r08, r00, 0);
+                        r25 = vfmaq_laneq_f32(r25, r08, r01, 0);
+
+                        r28 = vfmaq_laneq_f32(r28, r09, r00, 0);
+                        r29 = vfmaq_laneq_f32(r29, r09, r01, 0);
+
+                        r12 = vld1q_f32(k01), r13 = vld1q_f32(k01+4), r14 = vld1q_f32(k01+8), r15 = vld1q_f32(k01+12);
+                        k01 += 16;
+
+                        r24 = vfmaq_laneq_f32(r24, r10, r00, 1);
+                        r25 = vfmaq_laneq_f32(r25, r10, r01, 1);
+
+                        r28 = vfmaq_laneq_f32(r28, r11, r00, 1);
+                        r29 = vfmaq_laneq_f32(r29, r11, r01, 1);
+
+                        r24 = vfmaq_laneq_f32(r24, r12, r00, 2);
+                        r25 = vfmaq_laneq_f32(r25, r12, r01, 2);
+
+                        r28 = vfmaq_laneq_f32(r28, r13, r00, 2);
+                        r29 = vfmaq_laneq_f32(r29, r13, r01, 2);
+
+                        r24 = vfmaq_laneq_f32(r24, r14, r00, 3);
+                        r25 = vfmaq_laneq_f32(r25, r14, r01, 3);
+
+                        r28 = vfmaq_laneq_f32(r28, r15, r00, 3);
+                        r29 = vfmaq_laneq_f32(r29, r15, r01, 3);
+                    }
+
+                    vst1q_f32(output0_tm, r24), vst1q_f32(output0_tm + 4, r25);
+                    output0_tm += 8;
+                    vst1q_f32(output1_tm, r28), vst1q_f32(output1_tm + 4, r29);
+                    output1_tm += 8;
+                }
+
+                for (; ti < tiles; ti ++)
+                {
+                    const float* r0 = input_tm + ofstab0[ti * 2] * line_step;
+                    const float* k01 = kernel_tm_i;
+
+                    int nn = C_aligned/4;
+
+                    float32x4_t r00 = vdupq_n_f32(0.0f);
+                    float32x4_t r08 = r00, r09 = r00, r10 = r00, r11 = r00;
+                    float32x4_t r12 = r00, r13 = r00, r14 = r00, r15 = r00;
+                    float32x4_t r24 = r00;
+                    float32x4_t r28 = r00;
+
+                    for(;nn > 0; nn--)
+                    {
+                        r00 = vld1q_f32(r0);
+                        r08 = vld1q_f32(k01), r09 = vld1q_f32(k01+4), r10 = vld1q_f32(k01+8), r11 = vld1q_f32(k01+12);
+                        r0 += 4, k01 += 16;
+
+                        r24 = vfmaq_laneq_f32(r24, r08, r00, 0);
+                        r28 = vfmaq_laneq_f32(r28, r09, r00, 0);
+
+                        r12 = vld1q_f32(k01), r13 = vld1q_f32(k01+4), r14 = vld1q_f32(k01+8), r15 = vld1q_f32(k01+12);
+                        k01 += 16;
+
+                        r24 = vfmaq_laneq_f32(r24, r10, r00, 1);
+                        r28 = vfmaq_laneq_f32(r28, r11, r00, 1);
+
+                        r24 = vfmaq_laneq_f32(r24, r12, r00, 2);
+                        r28 = vfmaq_laneq_f32(r28, r13, r00, 2);
+
+                        r24 = vfmaq_laneq_f32(r24, r14, r00, 3);
+                        r28 = vfmaq_laneq_f32(r28, r15, r00, 3);
+                    }
+
+                    vst1q_f32(output0_tm, r24);
+                    output0_tm += 4;
+                    vst1q_f32(output1_tm, r28);
+                    output1_tm += 4;
+                }
+            }
+        }
+        });
+#endif
+
+        // Matrix multiplication, 4 output channel.
+        int Ock_div4 = (K_aligned - K_div8 * 8) / 4;
+        parallel_for_(Range(0, Ock_div4), [&](const Range &range){
+            for (int outcn = range.start; outcn < range.end; outcn++)
+            {
+                float* output_tmp = outputbuf0 + tiles * (outcn + K_div8 * 2)* 4;
+                float* kernel_tmp = weight_ptr0 + (outcn + K_div8 * 2) * 4 * C_aligned;
+
+                for (int r = 0; r < 64; r++)
+                {
+                    float *input_tm = inputbuf0 + r * big_step;
+                    float *output0_tm = output_tmp + tiles * K_aligned * r;
+                    float *kernel_tm_i = kernel_tmp + r * C_aligned * K_aligned;
+
+                    int ti = 0;
+                    for (; ti + 7 < tiles; ti += 8)
+                    {
+                        int nn = C_aligned/4;
+                        const float* r0 = input_tm + ofstab0[ti * 2] * line_step;
+                        const float* k0 = kernel_tm_i;
+
+#if CV_NEON_AARCH64
+                        // init 24 registers. FMA/load ratio = 32/12
+                        float32x4_t r00 = vdupq_n_f32(0.0f), r01 = r00, r02 = r00, r03 = r00;
+                        float32x4_t r04 = r00, r05 = r00, r06 = r00, r07 = r00;
+                        float32x4_t r08 = r00, r09 = r00, r10 = r00, r11 = r00;
+                        float32x4_t r16 = r00, r17 = r00, r18 = r00, r19 = r00;
+                        float32x4_t r20 = r00, r21 = r00, r22 = r00, r23 = r00;
+
+                        for(; nn > 0; nn--)
+                        {
+                            r00 = vld1q_f32(r0), r01 = vld1q_f32(r0+4), r02 = vld1q_f32(r0+8), r03 = vld1q_f32(r0+12);
+                            r08 = vld1q_f32(k0), r09 = vld1q_f32(k0+4), r10 = vld1q_f32(k0+8), r11 = vld1q_f32(k0+12);
+                            r0 += 16, k0 += 16;
+
+                            r16 = vfmaq_laneq_f32(r16, r08, r00, 0);
+                            r17 = vfmaq_laneq_f32(r17, r08, r01, 0);
+                            r18 = vfmaq_laneq_f32(r18, r08, r02, 0);
+                            r19 = vfmaq_laneq_f32(r19, r08, r03, 0);
+
+                            r04 = vld1q_f32(r0), r05 = vld1q_f32(r0+4), r06 = vld1q_f32(r0+8), r07 = vld1q_f32(r0+12);
+                            r0 += 16;
+
+                            r20 = vfmaq_laneq_f32(r20, r08, r04, 0);
+                            r21 = vfmaq_laneq_f32(r21, r08, r05, 0);
+                            r22 = vfmaq_laneq_f32(r22, r08, r06, 0);
+                            r23 = vfmaq_laneq_f32(r23, r08, r07, 0);
+
+                            r16 = vfmaq_laneq_f32(r16, r09, r00, 1);
+                            r17 = vfmaq_laneq_f32(r17, r09, r01, 1);
+                            r18 = vfmaq_laneq_f32(r18, r09, r02, 1);
+                            r19 = vfmaq_laneq_f32(r19, r09, r03, 1);
+                            r20 = vfmaq_laneq_f32(r20, r09, r04, 1);
+                            r21 = vfmaq_laneq_f32(r21, r09, r05, 1);
+                            r22 = vfmaq_laneq_f32(r22, r09, r06, 1);
+                            r23 = vfmaq_laneq_f32(r23, r09, r07, 1);
+
+                            r16 = vfmaq_laneq_f32(r16, r10, r00, 2);
+                            r17 = vfmaq_laneq_f32(r17, r10, r01, 2);
+                            r18 = vfmaq_laneq_f32(r18, r10, r02, 2);
+                            r19 = vfmaq_laneq_f32(r19, r10, r03, 2);
+                            r20 = vfmaq_laneq_f32(r20, r10, r04, 2);
+                            r21 = vfmaq_laneq_f32(r21, r10, r05, 2);
+                            r22 = vfmaq_laneq_f32(r22, r10, r06, 2);
+                            r23 = vfmaq_laneq_f32(r23, r10, r07, 2);
+
+                            r16 = vfmaq_laneq_f32(r16, r11, r00, 3);
+                            r17 = vfmaq_laneq_f32(r17, r11, r01, 3);
+                            r18 = vfmaq_laneq_f32(r18, r11, r02, 3);
+                            r19 = vfmaq_laneq_f32(r19, r11, r03, 3);
+                            r20 = vfmaq_laneq_f32(r20, r11, r04, 3);
+                            r21 = vfmaq_laneq_f32(r21, r11, r05, 3);
+                            r22 = vfmaq_laneq_f32(r22, r11, r06, 3);
+                            r23 = vfmaq_laneq_f32(r23, r11, r07, 3);
+                        }
+
+                        vst1q_f32(output0_tm, r16), vst1q_f32(output0_tm + 4, r17), vst1q_f32(output0_tm + 8, r18), vst1q_f32(output0_tm + 12, r19);
+                        output0_tm += 16;
+
+                        vst1q_f32(output0_tm, r20), vst1q_f32(output0_tm + 4, r21), vst1q_f32(output0_tm + 8, r22), vst1q_f32(output0_tm + 12, r23);
+                        output0_tm += 16;
+
+#else // ARMv7 16 registers.
+
+                        // init 16 registers. FMA/load ratio = 32/12
+                        float32x4_t r00 = vdupq_n_f32(0.0f), r01 = r00, r02 = r00, r03 = r00;
+                        float32x4_t r04 = r00, r05 = r00, r06 = r00, r07 = r00;
+                        float32x4_t r08 = r00, r09 = r00, r10 = r00, r11 = r00;
+                        float32x4_t r12 = r00, r13 = r00, r14 = r00, r15 = r00;
+
+                        for (; nn > 0; nn--)
+                        {
+                            r00 = vld1q_f32(r0), r01 = vld1q_f32(r0+4), r02 = vld1q_f32(r0+8), r03 = vld1q_f32(r0+12);
+                            r04 = vld1q_f32(k0), r05 = vld1q_f32(k0+4), r06 = vld1q_f32(k0+8), r07 = vld1q_f32(k0+12);
+                            r0 += 16, k0 += 16;
+
+                            r08 = vfmaq_laneq_f32(r08, r04, r00, 0);
+                            r09 = vfmaq_laneq_f32(r09, r04, r01, 0);
+                            r10 = vfmaq_laneq_f32(r10, r04, r02, 0);
+                            r11 = vfmaq_laneq_f32(r11, r04, r03, 0);
+
+                            r08 = vfmaq_laneq_f32(r08, r05, r00, 1);
+                            r09 = vfmaq_laneq_f32(r09, r05, r01, 1);
+                            r10 = vfmaq_laneq_f32(r10, r05, r02, 1);
+                            r11 = vfmaq_laneq_f32(r11, r05, r03, 1);
+
+                            r08 = vfmaq_laneq_f32(r08, r06, r00, 2);
+                            r09 = vfmaq_laneq_f32(r09, r06, r01, 2);
+                            r10 = vfmaq_laneq_f32(r10, r06, r02, 2);
+                            r11 = vfmaq_laneq_f32(r11, r06, r03, 2);
+
+                            r08 = vfmaq_laneq_f32(r08, r07, r00, 3);
+                            r09 = vfmaq_laneq_f32(r09, r07, r01, 3);
+                            r10 = vfmaq_laneq_f32(r10, r07, r02, 3);
+                            r11 = vfmaq_laneq_f32(r11, r07, r03, 3);
+
+                            r00 = vld1q_f32(r0), r01 = vld1q_f32(r0+4), r02 = vld1q_f32(r0+8), r03 = vld1q_f32(r0+12);
+                            r0 += 16;
+
+                            r12 = vfmaq_laneq_f32(r12, r04, r00, 0);
+                            r13 = vfmaq_laneq_f32(r13, r04, r01, 0);
+                            r14 = vfmaq_laneq_f32(r14, r04, r02, 0);
+                            r15 = vfmaq_laneq_f32(r15, r04, r03, 0);
+
+                            r12 = vfmaq_laneq_f32(r12, r05, r00, 1);
+                            r13 = vfmaq_laneq_f32(r13, r05, r01, 1);
+                            r14 = vfmaq_laneq_f32(r14, r05, r02, 1);
+                            r15 = vfmaq_laneq_f32(r15, r05, r03, 1);
+
+                            r12 = vfmaq_laneq_f32(r12, r06, r00, 2);
+                            r13 = vfmaq_laneq_f32(r13, r06, r01, 2);
+                            r14 = vfmaq_laneq_f32(r14, r06, r02, 2);
+                            r15 = vfmaq_laneq_f32(r15, r06, r03, 2);
+
+                            r12 = vfmaq_laneq_f32(r12, r07, r00, 3);
+                            r13 = vfmaq_laneq_f32(r13, r07, r01, 3);
+                            r14 = vfmaq_laneq_f32(r14, r07, r02, 3);
+                            r15 = vfmaq_laneq_f32(r15, r07, r03, 3);
+                        }
+
+                        vst1q_f32(output0_tm, r08), vst1q_f32(output0_tm + 4, r09), vst1q_f32(output0_tm + 8, r10), vst1q_f32(output0_tm + 12, r11);
+                        output0_tm += 16;
+
+                        vst1q_f32(output0_tm, r12), vst1q_f32(output0_tm + 4, r13), vst1q_f32(output0_tm + 8, r14), vst1q_f32(output0_tm + 12, r15);
+                        output0_tm += 16;
+#endif
+                    }
+
+                    for (; ti + 3 < tiles; ti += 4)
+                    {
+                        int nn = C_aligned/4;
+                        const float* r0 = input_tm + ofstab0[ti * 2] * line_step;
+                        const float* k0 = kernel_tm_i;
+
+
+                        // init 12 registers. FMA/load ratio = 12/8
+                        float32x4_t r00 = vdupq_n_f32(0.0f), r01 = r00, r02 = r00, r03 = r00;
+                        float32x4_t r08 = r00, r09 = r00, r10 = r00, r11 = r00;
+                        float32x4_t r16 = r00, r17 = r00, r18 = r00, r19 = r00;
+
+                        for(; nn > 0; nn--)
+                        {
+                            r00 = vld1q_f32(r0), r01 = vld1q_f32(r0+4), r02 = vld1q_f32(r0+8), r03 = vld1q_f32(r0+12);
+                            r08 = vld1q_f32(k0), r09 = vld1q_f32(k0+4), r10 = vld1q_f32(k0+8), r11 = vld1q_f32(k0+12);
+                            r0 += 16, k0 += 16;
+
+                            r16 = vfmaq_laneq_f32(r16, r08, r00, 0);
+                            r17 = vfmaq_laneq_f32(r17, r08, r01, 0);
+                            r18 = vfmaq_laneq_f32(r18, r08, r02, 0);
+                            r19 = vfmaq_laneq_f32(r19, r08, r03, 0);
+
+                            r16 = vfmaq_laneq_f32(r16, r09, r00, 1);
+                            r17 = vfmaq_laneq_f32(r17, r09, r01, 1);
+                            r18 = vfmaq_laneq_f32(r18, r09, r02, 1);
+                            r19 = vfmaq_laneq_f32(r19, r09, r03, 1);
+
+                            r16 = vfmaq_laneq_f32(r16, r10, r00, 2);
+                            r17 = vfmaq_laneq_f32(r17, r10, r01, 2);
+                            r18 = vfmaq_laneq_f32(r18, r10, r02, 2);
+                            r19 = vfmaq_laneq_f32(r19, r10, r03, 2);
+
+                            r16 = vfmaq_laneq_f32(r16, r11, r00, 3);
+                            r17 = vfmaq_laneq_f32(r17, r11, r01, 3);
+                            r18 = vfmaq_laneq_f32(r18, r11, r02, 3);
+                            r19 = vfmaq_laneq_f32(r19, r11, r03, 3);
+                        }
+
+                        vst1q_f32(output0_tm, r16), vst1q_f32(output0_tm + 4, r17), vst1q_f32(output0_tm + 8, r18), vst1q_f32(output0_tm + 12, r19);
+                        output0_tm += 16;
+                    }
+
+                    for (; ti + 1 < tiles; ti += 2)
+                    {
+                        int nn = C_aligned/4;
+                        const float* r0 = input_tm + ofstab0[ti * 2] * line_step;
+                        const float* k0 = kernel_tm_i;
+
+                        // init 8 registers. FMA/load ratio = 8/6
+                        float32x4_t r00 = vdupq_n_f32(0.0f), r01 = r00;
+                        float32x4_t r08 = r00, r09 = r00, r10 = r00, r11 = r00;
+                        float32x4_t r16 = r00, r17 = r00;
+
+                        for(; nn > 0; nn--)
+                        {
+                            r00 = vld1q_f32(r0), r01 = vld1q_f32(r0+4);
+                            r08 = vld1q_f32(k0), r09 = vld1q_f32(k0+4), r10 = vld1q_f32(k0+8), r11 = vld1q_f32(k0+12);
+                            r0 += 8, k0 += 16;
+
+                            r16 = vfmaq_laneq_f32(r16, r08, r00, 0);
+                            r17 = vfmaq_laneq_f32(r17, r08, r01, 0);
+
+                            r16 = vfmaq_laneq_f32(r16, r09, r00, 1);
+                            r17 = vfmaq_laneq_f32(r17, r09, r01, 1);
+
+                            r16 = vfmaq_laneq_f32(r16, r10, r00, 2);
+                            r17 = vfmaq_laneq_f32(r17, r10, r01, 2);
+
+                            r16 = vfmaq_laneq_f32(r16, r11, r00, 3);
+                            r17 = vfmaq_laneq_f32(r17, r11, r01, 3);
+                        }
+
+                        vst1q_f32(output0_tm, r16), vst1q_f32(output0_tm + 4, r17);
+                        output0_tm += 8;
+                    }
+
+                    for (; ti < tiles; ti ++)
+                    {
+                        int nn = C_aligned/4;
+                        const float* r0 = input_tm + ofstab0[ti * 2] * line_step;
+                        const float* k0 = kernel_tm_i;
+
+                        // init 8 registers. FMA/load ratio = 8/6
+                        float32x4_t r00 = vdupq_n_f32(0.0f);
+                        float32x4_t r08 = r00, r09 = r00, r10 = r00, r11 = r00;
+                        float32x4_t r16 = r00;
+
+                        for(; nn > 0; nn--)
+                        {
+                            r00 = vld1q_f32(r0);
+                            r08 = vld1q_f32(k0), r09 = vld1q_f32(k0+4), r10 = vld1q_f32(k0+8), r11 = vld1q_f32(k0+12);
+                            r0 += 4, k0 += 16;
+
+                            r16 = vfmaq_laneq_f32(r16, r08, r00, 0);
+                            r16 = vfmaq_laneq_f32(r16, r09, r00, 1);
+                            r16 = vfmaq_laneq_f32(r16, r10, r00, 2);
+                            r16 = vfmaq_laneq_f32(r16, r11, r00, 3);
+                        }
+
+                        vst1q_f32(output0_tm, r16);
+                        output0_tm += 4;
+                    }
+                }
+            }
+        });
+
+        int bigStepOut = tiles * K_aligned;
+
+        // Transfor Ouput
+        parallel_for_(Range(0, ntasks), [&](const Range& range)
+        {
+            for (int task_i = range.start; task_i < range.end; task_i++)
+            {
+                float* outputCnbuf = outputCnbuf0 + task_i * 8 * 8 * 4;
+                for (int outCn4 = task_i; outCn4 < K_aligned / 4; outCn4 += ntasks)
+                {
+
+                    int outCn = outCn4 * 4;
+                    float* output_buf = outputbuf0 + outCn * tiles;
+                    float* output_ptr = output_ptr0 + outCn * W0 * H0;
+
+                    for (int ti = 0; ti < tiles; ti++)
+                    {
+                        float* output_buf_i = output_buf + ti * 4;
+                        float* outputCnbuf_i = outputCnbuf;
+                        int hi = ti / W_tiles;
+                        int wi = ti % W_tiles;
+
+                        // construct the output tile.
+                        for (int r = 0; r < 64; r++)
+                        {
+                            memcpy(outputCnbuf_i, output_buf_i, FAST_VEC_NLANES * sizeof(float ));
+                            output_buf_i += bigStepOut;
+                            outputCnbuf_i += FAST_VEC_NLANES;
+                        }
+
+                        winograd_trans_output_F63(outputCnbuf, conv->biasBuf.data() + outCn,
+                                                  minval, maxval, ifMinMaxAct);
+
+                        int wEnd = (wi + 1) * 6 > W0 ? W0 - (wi * 6) : 6;
+                        int hEnd = (hi + 1) * 6 > H0 ? H0 - (hi * 6) : 6;
+
+                        float* output_ptr_i = output_ptr + (hi * W0 + wi) * 6;
+
+                        // write back the output data.
+                        for (int outCni = 0; outCni < FAST_VEC_NLANES; outCni++)
+                        {
+                            float* output_ptr_i_cn = output_ptr_i + outCni * out_planesize;
+                            outputCnbuf_i = outputCnbuf + outCni;
+
+                            if (outCni + outCn < K)
+                            {
+                                for (int i = 0; i < hEnd; i++)
+                                {
+                                    for (int j = 0; j < wEnd; j++)
+                                    {
+                                        output_ptr_i_cn[i * W0 + j] = outputCnbuf_i[(i * 6 + j) * FAST_VEC_NLANES ];
+                                    }
+                                }
+                            }
+                        }
+                    }
+
+                    if (activ)
+                    {
+                        int outCnEnd = std::min(outCn + FAST_VEC_NLANES, K);
+                        activ->forwardSlice(output_ptr, output_ptr, out_planesize,
+                                                  out_planesize, outCn, outCnEnd);
+                    }
+                }
+            }
+        });
+    }
+
+    return 1;
+}
+
+#else
+
+void initWinograd63(Ptr<FastConv2d>& conv, float* src_weight, int K, int C)
+{
+    conv->ifWinograd63 = false;
+}
+
+int runWinograd63(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv, int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct)
+{
+    return 0;
+}
+
+#endif
+
+}} // namespace cv::dnn
diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp
index f2c6f1e5a0..dd2ff1db38 100644
--- a/modules/dnn/test/test_backends.cpp
+++ b/modules/dnn/test/test_backends.cpp
@@ -545,7 +545,7 @@ TEST_P(DNNTestNetwork, FastNeuralStyle_eccv16)
     Mat img = imread(findDataFile("dnn/googlenet_1.png"));
     Mat inp = blobFromImage(img, 1.0, Size(320, 240), Scalar(103.939, 116.779, 123.68), false, false);
     // Output image has values in range [-143.526, 148.539].
-    float l1 = 4e-5, lInf = 2e-3;
+    float l1 = 1e-4, lInf = 2e-3;
     if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
     {
         l1 = 0.4;
diff --git a/modules/dnn/test/test_caffe_importer.cpp b/modules/dnn/test/test_caffe_importer.cpp
index b747b47660..ddaa51dcbf 100644
--- a/modules/dnn/test/test_caffe_importer.cpp
+++ b/modules/dnn/test/test_caffe_importer.cpp
@@ -653,7 +653,7 @@ TEST_P(opencv_face_detector, Accuracy)
                                     0, 1, 0.98977017, 0.23901358, 0.09084064, 0.29902688, 0.1769477,
                                     0, 1, 0.97203469, 0.67965847, 0.06876482, 0.73999709, 0.1513494,
                                     0, 1, 0.95097077, 0.51901293, 0.45863652, 0.5777427, 0.5347801);
-    normAssertDetections(ref, out, "", 0.5, 1e-5, 2e-4);
+    normAssertDetections(ref, out, "", 0.5, 1e-4, 2e-4);
 }
 
 // False positives bug for large faces: https://github.com/opencv/opencv/issues/15106
@@ -695,7 +695,7 @@ TEST_P(Test_Caffe_nets, FasterRCNN_vgg16)
 #if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL)
         CV_TEST_TAG_MEMORY_2GB,  // utilizes ~1Gb, but huge blobs may not be allocated on 32-bit systems due memory fragmentation
 #else
-        (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB),
+        CV_TEST_TAG_MEMORY_2GB,
 #endif
         CV_TEST_TAG_LONG,
         CV_TEST_TAG_DEBUG_VERYLONG
diff --git a/modules/dnn/test/test_int8_layers.cpp b/modules/dnn/test/test_int8_layers.cpp
index 844601269b..562014a3b1 100644
--- a/modules/dnn/test/test_int8_layers.cpp
+++ b/modules/dnn/test/test_int8_layers.cpp
@@ -696,7 +696,7 @@ TEST_P(Test_Int8_nets, GoogLeNet)
     Mat blob = blobFromImages(inpMats, 1.0, Size(224, 224), Scalar(), false);
     Mat ref = blobFromNPY(_tf("googlenet_prob.npy"));
 
-    float l1 = 2e-4, lInf = 0.06;
+    float l1 = 2e-4, lInf = 0.07;
     testClassificationNet(net, blob, ref, l1, lInf);
 }
 
@@ -718,7 +718,7 @@ TEST_P(Test_Int8_nets, ResNet50)
     Mat blob = blobFromImage(inp, 1.0, Size(224, 224), Scalar(), false);
     Mat ref = blobFromNPY(_tf("resnet50_prob.npy"));
 
-    float l1 = 3e-4, lInf = 0.04;
+    float l1 = 3e-4, lInf = 0.05;
     testClassificationNet(net, blob, ref, l1, lInf);
 }
 
@@ -952,7 +952,7 @@ TEST_P(Test_Int8_nets, EfficientDet)
                                     0, 17, 0.8245924, 0.16657517850399017, 0.3996818959712982, 0.4111558794975281, 0.9306337833404541,
                                     0, 7, 0.8039304, 0.6118435263633728, 0.13175517320632935, 0.9065558314323425, 0.2943994700908661);
 
-    float confThreshold = 0.65, scoreDiff = 0.17, iouDiff = 0.18;
+    float confThreshold = 0.65, scoreDiff = 0.3, iouDiff = 0.18;
     testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff);
 }
 
@@ -1016,7 +1016,7 @@ TEST_P(Test_Int8_nets, FasterRCNN_vgg16)
 #if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL)
         CV_TEST_TAG_MEMORY_2GB,
 #else
-        (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB),
+        CV_TEST_TAG_MEMORY_2GB,
 #endif
         CV_TEST_TAG_LONG,
         CV_TEST_TAG_DEBUG_VERYLONG
@@ -1034,7 +1034,7 @@ TEST_P(Test_Int8_nets, FasterRCNN_vgg16)
                                     0, 7, 0.997022, 481.841, 92.3218, 722.685, 175.953,
                                     0, 12, 0.993028, 133.221, 189.377, 350.994, 563.166);
 
-    float confThreshold = 0.8, scoreDiff = 0.024, iouDiff = 0.35;
+    float confThreshold = 0.8, scoreDiff = 0.048, iouDiff = 0.35;
     testFaster(net, ref, confThreshold, scoreDiff, iouDiff);
 }
 
@@ -1084,7 +1084,7 @@ TEST_P(Test_Int8_nets, RFCN)
     Mat ref = (Mat_<float>(2, 7) << 0, 7, 0.991359, 491.822, 81.1668, 702.573, 178.234,
                                     0, 12, 0.94786, 132.093, 223.903, 338.077, 566.16);
 
-    float confThreshold = 0.8, scoreDiff = 0.017, iouDiff = 0.11;
+    float confThreshold = 0.8, scoreDiff = 0.15, iouDiff = 0.11;
     testFaster(net, ref, confThreshold, scoreDiff, iouDiff);
 }
 
@@ -1114,7 +1114,7 @@ TEST_P(Test_Int8_nets, YoloVoc)
     std::string config_file = "yolo-voc.cfg";
     std::string weights_file = "yolo-voc.weights";
 
-    double scoreDiff = 0.1, iouDiff = 0.3;
+    double scoreDiff = 0.12, iouDiff = 0.3;
     {
     SCOPED_TRACE("batch size 1");
     testDarknetModel(config_file, weights_file, ref.rowRange(0, 3), scoreDiff, iouDiff);
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index 582d8b0c38..72a8989f6a 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -1336,7 +1336,7 @@ TEST_P(Test_TensorFlow_nets, EAST_text_detection)
     }
     else
     {
-        l1_geometry = 1e-4, lInf_geometry = 3e-3;
+        l1_geometry = 1e-4, lInf_geometry = 4.3e-3;
     }
     normAssert(scores, blobFromNPY(refScoresPath), "scores", l1_scores, lInf_scores);
     normAssert(geometry, blobFromNPY(refGeometryPath), "geometry", l1_geometry, lInf_geometry);

From 32bb4fa9508a8287cf8a0d006a524d7e14d8a79d Mon Sep 17 00:00:00 2001
From: Dan <doprea@gmail.com>
Date: Fri, 1 Jul 2022 13:33:05 +0200
Subject: [PATCH 102/178] Update doc

---
 modules/stitching/include/opencv2/stitching.hpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/modules/stitching/include/opencv2/stitching.hpp b/modules/stitching/include/opencv2/stitching.hpp
index 3d1dbbd044..8a81223124 100644
--- a/modules/stitching/include/opencv2/stitching.hpp
+++ b/modules/stitching/include/opencv2/stitching.hpp
@@ -302,6 +302,14 @@ public:
     std::vector<int> component() const { return indices_; }
     std::vector<detail::CameraParams> cameras() const { return cameras_; }
     CV_WRAP double workScale() const { return work_scale_; }
+
+    /** @brief Return the mask of the panorama.
+
+    The mask is a 8U UMat with the values: 0xFF (white) for pixels filled by the input images,
+    0 (black) for unused pixels. It can be used as the mask for inpaint.
+
+    @return The mask.
+     */
     UMat resultMask() const { return result_mask_; }
 
 private:

From 0f3de805f4068d20816aea16edd8ee2c49ec9b10 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Sat, 2 Jul 2022 00:56:10 +0300
Subject: [PATCH 103/178] build: fix warnings

---
 samples/cpp/imgcodecs_jpeg.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/cpp/imgcodecs_jpeg.cpp b/samples/cpp/imgcodecs_jpeg.cpp
index b3abc49286..4c22a483d7 100644
--- a/samples/cpp/imgcodecs_jpeg.cpp
+++ b/samples/cpp/imgcodecs_jpeg.cpp
@@ -17,7 +17,7 @@ int main(int /*argc*/, const char** /* argv */ )
     {
         const Point center( img.rows / 2 , img.cols /2 );
 
-        for( int radius = 5; radius < img.rows ; radius += 3.5 )
+        for( int radius = 5; radius < img.rows ; radius += 3 )
         {
             cv::circle( img, center, radius, Scalar(255,0,255) );
         }

From 6360c3bf461d666af192237668eb26b99508513a Mon Sep 17 00:00:00 2001
From: heavyrain-lzy <1528794076@qq.com>
Date: Mon, 4 Jul 2022 04:06:43 +0800
Subject: [PATCH 104/178] Merge pull request #22184 from
 heavyrain-lzy:fixbug_pyrUp

* fix the bug in pyrUp

* add test case for pyrUp

* coding style
---
 modules/imgproc/src/pyramids.cpp      |  2 +-
 modules/imgproc/test/test_pyramid.cpp | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)
 create mode 100644 modules/imgproc/test/test_pyramid.cpp

diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp
index 4f7732925c..0261dd4de3 100644
--- a/modules/imgproc/src/pyramids.cpp
+++ b/modules/imgproc/src/pyramids.cpp
@@ -963,7 +963,7 @@ pyrUp_( const Mat& _src, Mat& _dst, int)
 
                 if (dsize.width > ssize.width*2)
                 {
-                    row[(_dst.cols-1) + x] = row[dx + cn];
+                    row[(_dst.cols-1) * cn + x] = row[dx + cn];
                 }
             }
 
diff --git a/modules/imgproc/test/test_pyramid.cpp b/modules/imgproc/test/test_pyramid.cpp
new file mode 100644
index 0000000000..343d7a2321
--- /dev/null
+++ b/modules/imgproc/test/test_pyramid.cpp
@@ -0,0 +1,19 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "test_precomp.hpp"
+
+namespace opencv_test { namespace {
+
+TEST(Imgproc_PyrUp, pyrUp_regression_22184)
+{
+    Mat src(100, 100, CV_16UC3, Scalar::all(255));
+    Mat dst(100 * 2 + 1, 100 * 2 + 1, CV_16UC3, Scalar::all(0));
+    pyrUp(src, dst, Size(dst.cols, dst.rows));
+    double min_val = 0;
+    minMaxLoc(dst, &min_val);
+    ASSERT_GT(cvRound(min_val), 0);
+}
+
+}}  // namespace

From 6a4c3b61e6fe8f40e52f510048d712497ce3bb3d Mon Sep 17 00:00:00 2001
From: Povilas Kanapickas <povilas@radix.lt>
Date: Mon, 4 Jul 2022 12:09:33 +0300
Subject: [PATCH 105/178] Improve doc of calcHist overload that accepts vectors

It's not clear how ranges argument should be used in the overload of
calcHist that accepts std::vector. The main overload uses array of
arrays there, while std::vector overload uses a plain array. The code
interprets the vector as a flattened array and rebuilds array of arrays
from it. This is not obvious interpretation, so documentation has been
added to explain the expected usage.
---
 modules/imgproc/include/opencv2/imgproc.hpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index 4cb077f375..fd1a5d9721 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -3270,7 +3270,14 @@ CV_EXPORTS void calcHist( const Mat* images, int nimages,
                           const int* histSize, const float** ranges,
                           bool uniform = true, bool accumulate = false );
 
-/** @overload */
+/** @overload
+
+this variant supports only uniform histograms.
+
+ranges argument is either empty vector or a flattened vector of histSize.size()*2 elements
+(histSize.size() element pairs). The first and second elements of each pair specify the lower and
+upper boundaries.
+*/
 CV_EXPORTS_W void calcHist( InputArrayOfArrays images,
                             const std::vector<int>& channels,
                             InputArray mask, OutputArray hist,

From 2920a8e0ec3389b39666854d1eaa407fe541fb52 Mon Sep 17 00:00:00 2001
From: Yuantao Feng <yuantao.feng@opencv.org.cn>
Date: Tue, 5 Jul 2022 06:45:14 +0800
Subject: [PATCH 106/178] Merge pull request #22185 from
 fengyuentau:ci_job_rename_3.4

Rename jobs for better understanding for branch 3.4

* rename jobs

* remove dots from job names

* correct ubuntu version for linux arm64
---
 .github/workflows/PR-3.4.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/PR-3.4.yaml b/.github/workflows/PR-3.4.yaml
index 524c835b00..fcfa924e96 100644
--- a/.github/workflows/PR-3.4.yaml
+++ b/.github/workflows/PR-3.4.yaml
@@ -6,19 +6,19 @@ on:
       - 3.4
 
 jobs:
-  ARM64:
+  Ubuntu1804-ARM64:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-3.4-ARM64.yaml@main
 
-  U20:
+  Ubuntu2004-x64:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-3.4-U20.yaml@main
 
-  W10:
+  Windows10-x64:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-3.4-W10.yaml@main
 
   macOS-ARM64:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-3.4-macOS-ARM64.yaml@main
 
-  macOS-X64:
+  macOS-x64:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-3.4-macOS-x86_64.yaml@main
 
   iOS:

From fb9463c55f0b9828f8429bf9af72f1b8d9987cb9 Mon Sep 17 00:00:00 2001
From: Yuantao Feng <yuantao.feng@opencv.org.cn>
Date: Tue, 5 Jul 2022 06:46:01 +0800
Subject: [PATCH 107/178] Merge pull request #22186 from
 fengyuentau:ci_job_rename_4.x

Rename jobs for better understanding for branch 4.x

* rename jobs

* remove dots from job names

* correct ubuntu version for linux arm64
---
 .github/workflows/PR-4.x.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/PR-4.x.yaml b/.github/workflows/PR-4.x.yaml
index 5ea1dfdb9e..44970006f9 100644
--- a/.github/workflows/PR-4.x.yaml
+++ b/.github/workflows/PR-4.x.yaml
@@ -6,19 +6,19 @@ on:
       - 4.x
 
 jobs:
-  ARM64:
+  Ubuntu1804-ARM64:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-ARM64.yaml@main
 
-  U20:
+  Ubuntu2004-x64:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-U20.yaml@main
 
-  W10:
+  Windows10-x64:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-W10.yaml@main
 
   macOS-ARM64:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-macOS-ARM64.yaml@main
 
-  macOS-X64:
+  macOS-x64:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-macOS-x86_64.yaml@main
 
   iOS:

From a80fcacd90dec4b2a25faf8363811bc6566b80cd Mon Sep 17 00:00:00 2001
From: Zihao Mu <zihaomu@outlook.com>
Date: Wed, 6 Jul 2022 00:14:42 +0800
Subject: [PATCH 108/178] Merge pull request #21372 from
 zihaomu:dnn_quantize_per_tensor

Add per_tensor_quantize to int8 quantize

* add per_tensor_quantize to dnn int8 module.

* change api flag from perTensor to perChannel, and recognize quantize type and onnx importer.

* change the default to hpp
---
 .../dnn/include/opencv2/dnn/all_layers.hpp    |  8 ++
 modules/dnn/include/opencv2/dnn/dnn.hpp       |  4 +-
 .../dnn/src/int8layers/convolution_layer.cpp  |  1 +
 .../src/int8layers/fully_connected_layer.cpp  |  2 +
 modules/dnn/src/layers/convolution_layer.cpp  | 34 +++++---
 .../dnn/src/layers/fully_connected_layer.cpp  | 36 +++++---
 modules/dnn/src/layers/layers_common.cpp      | 11 +++
 modules/dnn/src/layers/layers_common.hpp      |  5 +-
 modules/dnn/src/net.cpp                       |  4 +-
 modules/dnn/src/net_impl.hpp                  |  2 +-
 modules/dnn/src/net_quantization.cpp          |  6 +-
 modules/dnn/src/onnx/onnx_importer.cpp        |  4 +
 modules/dnn/test/test_int8_layers.cpp         | 87 ++++++++++++++++---
 13 files changed, 161 insertions(+), 43 deletions(-)

diff --git a/modules/dnn/include/opencv2/dnn/all_layers.hpp b/modules/dnn/include/opencv2/dnn/all_layers.hpp
index dae8701970..5c86da2be4 100644
--- a/modules/dnn/include/opencv2/dnn/all_layers.hpp
+++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp
@@ -263,6 +263,10 @@ CV__DNN_INLINE_NS_BEGIN
     public:
         int input_zp, output_zp;
         float input_sc, output_sc;
+
+        // quantization type flag. The perChannel default is true, that means it contains the parameters
+        // of per-Channel quantization. Otherwise, that means this layer contains per-Tensor quantized parameters.
+        bool per_channel;
         static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
     };
 
@@ -368,6 +372,10 @@ CV__DNN_INLINE_NS_BEGIN
     public:
         int input_zp, output_zp;
         float input_sc, output_sc;
+
+        // quantization type flag. The perChannel default is true, that means it contains the parameters
+        // of per-Channel quantization. Otherwise, that means this layer contains per-Tensor quantized parameters.
+        bool per_channel;
         static Ptr<InnerProductLayerInt8> create(const LayerParams& params);
     };
 
diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp
index 8bca6c538b..6f03a8c32e 100644
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@@ -621,8 +621,10 @@ CV__DNN_INLINE_NS_BEGIN
          *  @param calibData Calibration data to compute the quantization parameters.
          *  @param inputsDtype Datatype of quantized net's inputs. Can be CV_32F or CV_8S.
          *  @param outputsDtype Datatype of quantized net's outputs. Can be CV_32F or CV_8S.
+         *  @param perChannel Quantization granularity of quantized Net. The default is true, that means quantize model
+         *  in per-channel way (channel-wise). Set it false to quantize model in per-tensor way (or tensor-wise).
          */
-        CV_WRAP Net quantize(InputArrayOfArrays calibData, int inputsDtype, int outputsDtype);
+        CV_WRAP Net quantize(InputArrayOfArrays calibData, int inputsDtype, int outputsDtype, bool perChannel=true);
 
         /** @brief Returns input scale and zeropoint for a quantized Net.
          *  @param scales output parameter for returning input scales.
diff --git a/modules/dnn/src/int8layers/convolution_layer.cpp b/modules/dnn/src/int8layers/convolution_layer.cpp
index 45aaa3bc19..dfa58b09fe 100644
--- a/modules/dnn/src/int8layers/convolution_layer.cpp
+++ b/modules/dnn/src/int8layers/convolution_layer.cpp
@@ -51,6 +51,7 @@ public:
         input_zp = params.get<int>("input_zeropoint");
         output_zp = params.get<int>("zeropoints");
         output_sc = params.get<float>("scales");
+        per_channel = params.get<bool>("per_channel", true);
 
         if (kernel_size.size() == 2) {
             kernel = Size(kernel_size[1], kernel_size[0]);
diff --git a/modules/dnn/src/int8layers/fully_connected_layer.cpp b/modules/dnn/src/int8layers/fully_connected_layer.cpp
index 0887388b0b..dc759ebdbc 100644
--- a/modules/dnn/src/int8layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/int8layers/fully_connected_layer.cpp
@@ -26,6 +26,8 @@ public:
         output_zp = params.get<int>("zeropoints");
         output_sc = params.get<float>("scales");
         axis = params.get<int>("axis", 1);
+        per_channel = params.get<bool>("per_channel", true);
+
         if (blobs.size() == 3)
         {
             // blobs[0] - Weights
diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp
index 124443399b..c2960d5aeb 100644
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -2226,26 +2226,36 @@ public:
         Mat weightsQuantized(weightsMat.rows, weightsMat.cols, CV_8S);
         Mat biasQuantized(1, numOutput, CV_32S);
         Mat outputMultiplier(1, numOutput, CV_32F);
-        double realMin, realMax, weightsScale;
+        bool perChannel = params.get<bool>("per_channel", true);
 
-        for( int i = 0; i < numOutput; i++ )
+        if (perChannel) // per-Channel quantization.
         {
-            // Quantize weights
-            cv::minMaxIdx(weightsMat.row(i), &realMin, &realMax);
-            realMin = std::min(realMin, 0.0);
-            realMax = std::max(realMax, 0.0);
-            weightsScale = (realMax == realMin) ? 1.0 : std::max(-realMin, realMax)/127;
-            weightsMat.row(i).convertTo(weightsQuantized.row(i), CV_8S, 1.f/weightsScale);
+            for (int i = 0; i < numOutput; i++)
+            {
+                double weightsScale = getWeightScale(weightsMat.row(i));
 
-            // Quantize biases
+                weightsMat.row(i).convertTo(weightsQuantized.row(i), CV_8S, 1.f/weightsScale);
+                float biasScale = inputScale * weightsScale;
+                biasQuantized.at<int>(i) = cvRound(biasvec[i]/biasScale) - inputZp*(cv::sum(weightsQuantized.row(i))[0]);
+                outputMultiplier.at<float>(i) = biasScale / outputScale;
+            }
+        }
+        else // per-Tensor quantization.
+        {
+            double weightsScale = getWeightScale(weightsMat);
+
+            weightsMat.convertTo(weightsQuantized, CV_8S, 1.f/weightsScale);
             float biasScale = inputScale * weightsScale;
-            biasQuantized.at<int>(i) = (int)std::round(biasvec[i]/biasScale) - inputZp*(cv::sum(weightsQuantized.row(i))[0]);
 
-            // Store multiplier
-            outputMultiplier.at<float>(i) = biasScale / outputScale;
+            for (int i = 0; i < numOutput; i++)
+            {
+                biasQuantized.at<int>(i) = cvRound(biasvec[i]/biasScale) - inputZp*(cv::sum(weightsQuantized.row(i))[0]);
+                outputMultiplier.at<float>(i) = biasScale / outputScale;
+            }
         }
 
         params.blobs.clear();
+        params.set("per_channel", perChannel);
         params.blobs.push_back(weightsQuantized.reshape(1, shape(blobs[0])));
         params.blobs.push_back(biasQuantized);
         params.blobs.push_back(outputMultiplier);
diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp
index e9632e20be..509f6cc177 100644
--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@@ -619,26 +619,36 @@ public:
         Mat weightsQuantized(weightsMat.rows, weightsMat.cols, CV_8S);
         Mat biasQuantized(1, numOutput, CV_32S);
         Mat outputMultiplier(1, numOutput, CV_32F);
+        bool perChannel = params.get<bool>("per_channel", true);
 
-        double realMin, realMax, weightsScale;
-        for( int i = 0; i < numOutput; i++ )
+        if (perChannel) // per-Channel quantization.
         {
-            // Quantize weights
-            cv::minMaxIdx(weightsMat.row(i), &realMin, &realMax);
-            realMin = std::min(realMin, 0.0);
-            realMax = std::max(realMax, 0.0);
-            weightsScale = (realMax == realMin) ? 1.0 : std::max(-realMin, realMax)/127;
-            weightsMat.row(i).convertTo(weightsQuantized.row(i), CV_8S, 1.f/weightsScale);
-
-            // Quantize biases
+            for (int i = 0; i < numOutput; i++)
+            {
+                double weightsScale = getWeightScale(weightsMat.row(i));
+
+                weightsMat.row(i).convertTo(weightsQuantized.row(i), CV_8S, 1.f/weightsScale);
+                float biasScale = inputScale * weightsScale;
+                biasQuantized.at<int>(i) = cvRound(biasMat.at<float>(i)/biasScale) - inputZp*(cv::sum(weightsQuantized.row(i))[0]);
+                outputMultiplier.at<float>(i) = biasScale / outputScale;
+            }
+        }
+        else // per-Tensor quantization.
+        {
+            double weightsScale = getWeightScale(weightsMat);
+
+            weightsMat.convertTo(weightsQuantized, CV_8S, 1.f/weightsScale);
             float biasScale = inputScale * weightsScale;
-            biasQuantized.at<int>(i) = (int)std::round(biasMat.at<float>(i)/biasScale) - inputZp*(cv::sum(weightsQuantized.row(i))[0]);
 
-            // Store multiplier
-            outputMultiplier.at<float>(i) = biasScale / outputScale;
+            for (int i = 0; i < numOutput; i++)
+            {
+                biasQuantized.at<int>(i) = cvRound(biasMat.at<float>(i)/biasScale) - inputZp*(cv::sum(weightsQuantized.row(i))[0]);
+                outputMultiplier.at<float>(i) = biasScale / outputScale;
+            }
         }
 
         params.blobs.clear();
+        params.set("per_channel", perChannel);
         params.blobs.push_back(weightsQuantized.reshape(1, shape(blobs[0])));
         params.blobs.push_back(biasQuantized);
         params.blobs.push_back(outputMultiplier);
diff --git a/modules/dnn/src/layers/layers_common.cpp b/modules/dnn/src/layers/layers_common.cpp
index 78f91a69d6..445a89ff98 100644
--- a/modules/dnn/src/layers/layers_common.cpp
+++ b/modules/dnn/src/layers/layers_common.cpp
@@ -250,5 +250,16 @@ void getConvPoolPaddings(const std::vector<int>& inp, const std::vector<size_t>&
     }
 }
 
+double getWeightScale(const Mat& weightsMat)
+{
+    double realMin, realMax;
+
+    cv::minMaxIdx(weightsMat, &realMin, &realMax);
+    realMin = std::min(realMin, 0.0);
+    realMax = std::max(realMax, 0.0);
+
+    return (realMax == realMin) ? 1.0 : std::max(-realMin, realMax)/127;
+}
+
 }
 }
diff --git a/modules/dnn/src/layers/layers_common.hpp b/modules/dnn/src/layers/layers_common.hpp
index 81e7bdd11c..85f442c78e 100644
--- a/modules/dnn/src/layers/layers_common.hpp
+++ b/modules/dnn/src/layers/layers_common.hpp
@@ -70,9 +70,12 @@ void getConvPoolOutParams(const std::vector<int>& inp, const std::vector<size_t>
                           const std::vector<size_t>& stride, const String &padMode,
                           const std::vector<size_t>& dilation, std::vector<int>& out);
 
- void getConvPoolPaddings(const std::vector<int>& inp, const std::vector<size_t>& kernel,
+void getConvPoolPaddings(const std::vector<int>& inp, const std::vector<size_t>& kernel,
                           const std::vector<size_t>& strides, const String &padMode,
                           std::vector<size_t>& pads_begin, std::vector<size_t>& pads_end);
+
+// Used in quantized model. It will return the (Max_element - Min_element)/127.
+double getWeightScale(const Mat& weightsMat);
 }
 }
 
diff --git a/modules/dnn/src/net.cpp b/modules/dnn/src/net.cpp
index 901101b1e0..33f22744b8 100644
--- a/modules/dnn/src/net.cpp
+++ b/modules/dnn/src/net.cpp
@@ -115,12 +115,12 @@ void Net::forward(std::vector<std::vector<Mat>>& outputBlobs,
 }
 
 // FIXIT drop from inference API
-Net Net::quantize(InputArrayOfArrays calibData, int inputsDtype, int outputsDtype)
+Net Net::quantize(InputArrayOfArrays calibData, int inputsDtype, int outputsDtype, bool perChannel)
 {
     CV_TRACE_FUNCTION();
     CV_Assert(impl);
     CV_Assert(!empty());
-    return impl->quantize(calibData, inputsDtype, outputsDtype);
+    return impl->quantize(calibData, inputsDtype, outputsDtype, perChannel);
 }
 
 // FIXIT drop from inference API
diff --git a/modules/dnn/src/net_impl.hpp b/modules/dnn/src/net_impl.hpp
index 9dc96fe82d..5f0563d3c3 100644
--- a/modules/dnn/src/net_impl.hpp
+++ b/modules/dnn/src/net_impl.hpp
@@ -258,7 +258,7 @@ struct Net::Impl : public detail::NetImplBase
     void dumpNetworkToFile() const;
 
     // FIXIT drop from inference API
-    Net quantize(InputArrayOfArrays calibData, int inputsDtype, int outputsDtype) /*const*/;
+    Net quantize(InputArrayOfArrays calibData, int inputsDtype, int outputsDtype, bool perChannel) /*const*/;
     void getInputDetails(std::vector<float>& scales, std::vector<int>& zeropoints) /*const*/;
     void getOutputDetails(std::vector<float>& scales, std::vector<int>& zeropoints) /*const*/;
 
diff --git a/modules/dnn/src/net_quantization.cpp b/modules/dnn/src/net_quantization.cpp
index ef1857a8e2..8316687412 100644
--- a/modules/dnn/src/net_quantization.cpp
+++ b/modules/dnn/src/net_quantization.cpp
@@ -33,7 +33,7 @@ void getQuantizationParams(const Mat& src, std::vector<float>& scales, std::vect
 }
 
 // FIXIT drop from inference API
-Net Net::Impl::quantize(InputArrayOfArrays calibData, int inputsDtype, int outputsDtype)
+Net Net::Impl::quantize(InputArrayOfArrays calibData, int inputsDtype, int outputsDtype, bool perChannel)
 {
     // Net can be quantized only once.
     if (netWasQuantized)
@@ -192,6 +192,10 @@ Net Net::Impl::quantize(InputArrayOfArrays calibData, int inputsDtype, int outpu
         inp_out_sc[1] = scales[ld.id];
         inp_out_zp[1] = zeropoints[ld.id];
 
+        // Set the quantization type, per-tensor quantize or per-channel quantize.
+        // Especially for Convolution layer and Fully connection layer.
+        ld.params.set("per_channel", perChannel);
+
         // Quantize layer
         Ptr<Layer> layer = ld.layerInstance;
         if (layer->tryQuantize(inp_out_sc, inp_out_zp, ld.params))
diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index 15e6919f19..ebbda98d51 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -3401,6 +3401,7 @@ void ONNXImporter::parseQConv(LayerParams& layerParams, const opencv_onnx::NodeP
     int outCn = weights.size[0];
     Mat w_scale = getBlob(node_proto, 4);
     CV_Assert(w_scale.total() == 1 || w_scale.total() == outCn);
+    bool per_channel = w_scale.total() == outCn ? true : false;
     Mat wt_sc = (w_scale.total() == outCn) ? w_scale : Mat(1, outCn, CV_32F, Scalar(w_scale.at<float>(0)));
 
     Mat out_sc = getBlob(node_proto, 6);
@@ -3419,6 +3420,7 @@ void ONNXImporter::parseQConv(LayerParams& layerParams, const opencv_onnx::NodeP
     layerParams.set("num_output", outCn);
     layerParams.set("input_zeropoint", inp_zp.at<int8_t>(0));
     layerParams.set("input_scale",inp_sc.at<float>(0));
+    layerParams.set("per_channel", per_channel);
     layerParams.blobs.push_back(weights);
     layerParams.blobs.push_back(biasFused);
     layerParams.blobs.push_back(outputMultiplier);
@@ -3444,6 +3446,7 @@ void ONNXImporter::parseQMatMul(LayerParams& layerParams, const opencv_onnx::Nod
 
     Mat w_scale = getBlob(node_proto, 4);
     CV_Assert(w_scale.total() == 1 || w_scale.total() == outCn);
+    bool per_channel = w_scale.total() == outCn ? true : false;
     Mat wt_sc = (w_scale.total() == outCn) ? w_scale : Mat(1, outCn, CV_32F, Scalar(w_scale.at<float>(0)));
     Mat out_sc = getBlob(node_proto, 6);
 
@@ -3460,6 +3463,7 @@ void ONNXImporter::parseQMatMul(LayerParams& layerParams, const opencv_onnx::Nod
     layerParams.set("axis", firstInpDims - secondInpDims + 1);
     layerParams.set("input_scale", inp_sc.at<float>(0));
     layerParams.set("input_zeropoint", inp_zp.at<int8_t>(0));
+    layerParams.set("per_channel", per_channel);
 
     layerParams.blobs.push_back(weights);
     layerParams.blobs.push_back(bias);
diff --git a/modules/dnn/test/test_int8_layers.cpp b/modules/dnn/test/test_int8_layers.cpp
index 562014a3b1..ab00bfba76 100644
--- a/modules/dnn/test/test_int8_layers.cpp
+++ b/modules/dnn/test/test_int8_layers.cpp
@@ -29,7 +29,7 @@ class Test_Int8_layers : public DNNTestLayer
 public:
     void testLayer(const String& basename, const String& importer, double l1, double lInf,
                    int numInps = 1, int numOuts = 1, bool useCaffeModel = false,
-                   bool useCommonInputBlob = true, bool hasText = false)
+                   bool useCommonInputBlob = true, bool hasText = false, bool perChannel = true)
     {
         CV_Assert_N(numInps >= 1, numInps <= 10, numOuts >= 1, numOuts <= 10);
         std::vector<Mat> inps(numInps), inps_int8(numInps);
@@ -75,7 +75,7 @@ public:
         for (int i = 0; i < numOuts; i++)
             refs[i] = blobFromNPY(outPath + ((numOuts > 1) ? cv::format("_%d.npy", i) : ".npy"));
 
-        qnet = net.quantize(inps, CV_8S, CV_8S);
+        qnet = net.quantize(inps, CV_8S, CV_8S, perChannel);
         qnet.getInputDetails(inputScale, inputZp);
         qnet.getOutputDetails(outputScale, outputZp);
 
@@ -103,6 +103,12 @@ TEST_P(Test_Int8_layers, Convolution1D)
 {
     testLayer("conv1d", "ONNX", 0.00302, 0.00909);
     testLayer("conv1d_bias", "ONNX", 0.00306, 0.00948);
+
+    {
+        SCOPED_TRACE("Per-tensor quantize");
+        testLayer("conv1d", "ONNX", 0.00302, 0.00909, 1, 1, false, true, false, false);
+        testLayer("conv1d_bias", "ONNX", 0.00319, 0.00948, 1, 1, false, true, false, false);
+    }
 }
 
 TEST_P(Test_Int8_layers, Convolution2D)
@@ -130,6 +136,18 @@ TEST_P(Test_Int8_layers, Convolution2D)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_TIMVX);
     testLayer("layer_convolution", "Caffe", 0.0174, 0.0758, 1, 1, true);
     testLayer("depthwise_conv2d", "TensorFlow", 0.0388, 0.169);
+
+    {
+        SCOPED_TRACE("Per-tensor quantize");
+        testLayer("single_conv", "TensorFlow", 0.00413, 0.02301, 1, 1, false, true, false, false);
+        testLayer("atrous_conv2d_valid", "TensorFlow", 0.027967, 0.07808, 1, 1, false, true, false, false);
+        testLayer("atrous_conv2d_same", "TensorFlow", 0.01945, 0.1322, 1, 1, false, true, false, false);
+        testLayer("keras_atrous_conv2d_same", "TensorFlow", 0.005677, 0.03327, 1, 1, false, true, false, false);
+        testLayer("convolution", "ONNX", 0.00538, 0.01517, 1, 1, false, true, false, false);
+        testLayer("two_convolution", "ONNX", 0.00295, 0.00926, 1, 1, false, true, false, false);
+        testLayer("layer_convolution", "Caffe", 0.0175, 0.0759, 1, 1, true, true, false, false);
+        testLayer("depthwise_conv2d", "TensorFlow", 0.041847, 0.18744, 1, 1, false, true, false, false);
+    }
 }
 
 TEST_P(Test_Int8_layers, Convolution3D)
@@ -144,6 +162,13 @@ TEST_P(Test_Int8_layers, Flatten)
     testLayer("flatten", "TensorFlow", 0.0036, 0.0069, 1, 1, false, true, true);
     testLayer("unfused_flatten", "TensorFlow", 0.0014, 0.0028);
     testLayer("unfused_flatten_unknown_batch", "TensorFlow", 0.0043, 0.0051);
+
+    {
+        SCOPED_TRACE("Per-tensor quantize");
+        testLayer("conv3d", "TensorFlow", 0.00734, 0.02434, 1, 1, false, true, false, false);
+        testLayer("conv3d", "ONNX", 0.00377, 0.01362, 1, 1, false, true, false, false);
+        testLayer("conv3d_bias", "ONNX", 0.00201, 0.0039, 1, 1, false, true, false, false);
+    }
 }
 
 TEST_P(Test_Int8_layers, Padding)
@@ -349,6 +374,20 @@ TEST_P(Test_Int8_layers, InnerProduct)
         testLayer("constant", "ONNX", 0.00021, 0.0006);
 
     testLayer("lin_with_constant", "ONNX", 0.0011, 0.0016);
+
+    {
+        SCOPED_TRACE("Per-tensor quantize");
+        testLayer("layer_inner_product", "Caffe", 0.0055, 0.02, 1, 1, true, true, false, false);
+        testLayer("matmul", "TensorFlow", 0.0075, 0.019, 1, 1, false, true, false, false);
+        testLayer("nhwc_transpose_reshape_matmul", "TensorFlow", 0.0009, 0.0091, 1, 1, false, true, false, false);
+        testLayer("nhwc_reshape_matmul", "TensorFlow", 0.037, 0.071, 1, 1, false, true, false, false);
+        testLayer("matmul_layout", "TensorFlow", 0.035, 0.095, 1, 1, false, true, false, false);
+        testLayer("tf2_dense", "TensorFlow", 0, 0, 1, 1, false, true, false, false);
+        testLayer("matmul_add", "ONNX", 0.041, 0.082, 1, 1, false, true, false, false);
+        testLayer("linear", "ONNX", 0.0022, 0.004, 1, 1, false, true, false, false);
+        testLayer("constant", "ONNX", 0.00038, 0.0012, 1, 1, false, true, false, false);
+        testLayer("lin_with_constant", "ONNX", 0.0011, 0.0016, 1, 1, false, true, false, false);
+    }
 }
 
 TEST_P(Test_Int8_layers, Reshape)
@@ -465,9 +504,9 @@ INSTANTIATE_TEST_CASE_P(/**/, Test_Int8_layers, dnnBackendsAndTargetsInt8());
 class Test_Int8_nets : public DNNTestLayer
 {
 public:
-    void testClassificationNet(Net baseNet, const Mat& blob, const Mat& ref, double l1, double lInf)
+    void testClassificationNet(Net baseNet, const Mat& blob, const Mat& ref, double l1, double lInf, bool perChannel = true)
     {
-        Net qnet = baseNet.quantize(blob, CV_32F, CV_32F);
+        Net qnet = baseNet.quantize(blob, CV_32F, CV_32F, perChannel);
         qnet.setPreferableBackend(backend);
         qnet.setPreferableTarget(target);
 
@@ -477,9 +516,9 @@ public:
     }
 
     void testDetectionNet(Net baseNet, const Mat& blob, const Mat& ref,
-                          double confThreshold, double scoreDiff, double iouDiff)
+                          double confThreshold, double scoreDiff, double iouDiff, bool perChannel = true)
     {
-        Net qnet = baseNet.quantize(blob, CV_32F, CV_32F);
+        Net qnet = baseNet.quantize(blob, CV_32F, CV_32F, perChannel);
         qnet.setPreferableBackend(backend);
         qnet.setPreferableTarget(target);
 
@@ -488,14 +527,14 @@ public:
         normAssertDetections(ref, out, "", confThreshold, scoreDiff, iouDiff);
     }
 
-    void testFaster(Net baseNet, const Mat& ref, double confThreshold, double scoreDiff, double iouDiff)
+    void testFaster(Net baseNet, const Mat& ref, double confThreshold, double scoreDiff, double iouDiff, bool perChannel = true)
     {
         Mat inp = imread(_tf("dog416.png"));
         resize(inp, inp, Size(800, 600));
         Mat blob = blobFromImage(inp, 1.0, Size(), Scalar(102.9801, 115.9465, 122.7717), false, false);
         Mat imInfo = (Mat_<float>(1, 3) << inp.rows, inp.cols, 1.6f);
 
-        Net qnet = baseNet.quantize(std::vector<Mat>{blob, imInfo}, CV_32F, CV_32F);
+        Net qnet = baseNet.quantize(std::vector<Mat>{blob, imInfo}, CV_32F, CV_32F, perChannel);
         qnet.setPreferableBackend(backend);
         qnet.setPreferableTarget(target);
 
@@ -505,7 +544,7 @@ public:
         normAssertDetections(ref, out, "", confThreshold, scoreDiff, iouDiff);
     }
 
-    void testONNXNet(const String& basename, double l1, double lInf, bool useSoftmax = false)
+    void testONNXNet(const String& basename, double l1, double lInf, bool useSoftmax = false, bool perChannel = true)
     {
         String onnxmodel = findDataFile("dnn/onnx/models/" + basename + ".onnx", false);
 
@@ -515,7 +554,7 @@ public:
         baseNet.setPreferableBackend(backend);
         baseNet.setPreferableTarget(target);
 
-        Net qnet = baseNet.quantize(blob, CV_32F, CV_32F);
+        Net qnet = baseNet.quantize(blob, CV_32F, CV_32F, perChannel);
         qnet.setInput(blob);
         Mat out = qnet.forward();
 
@@ -538,7 +577,7 @@ public:
 
     void testDarknetModel(const std::string& cfg, const std::string& weights,
                           const cv::Mat& ref, double scoreDiff, double iouDiff,
-                          float confThreshold = 0.24, float nmsThreshold = 0.4)
+                          float confThreshold = 0.24, float nmsThreshold = 0.4, bool perChannel = true)
     {
         CV_Assert(ref.cols == 7);
         std::vector<std::vector<int> > refClassIds;
@@ -578,7 +617,7 @@ public:
         Mat inp = blobFromImages(samples, 1.0/255, Size(416, 416), Scalar(), true, false);
 
         Net baseNet = readNetFromDarknet(findDataFile("dnn/" + cfg), findDataFile("dnn/" + weights, false));
-        Net qnet = baseNet.quantize(inp, CV_32F, CV_32F);
+        Net qnet = baseNet.quantize(inp, CV_32F, CV_32F, perChannel);
         qnet.setPreferableBackend(backend);
         qnet.setPreferableTarget(target);
         qnet.setInput(inp);
@@ -720,6 +759,11 @@ TEST_P(Test_Int8_nets, ResNet50)
 
     float l1 = 3e-4, lInf = 0.05;
     testClassificationNet(net, blob, ref, l1, lInf);
+
+    {
+        SCOPED_TRACE("Per-tensor quantize");
+        testClassificationNet(net, blob, ref, l1, lInf, false);
+    }
 }
 
 TEST_P(Test_Int8_nets, DenseNet121)
@@ -954,6 +998,11 @@ TEST_P(Test_Int8_nets, EfficientDet)
 
     float confThreshold = 0.65, scoreDiff = 0.3, iouDiff = 0.18;
     testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff);
+
+    {
+        SCOPED_TRACE("Per-tensor quantize");
+        testDetectionNet(net, blob, ref, 0.85, scoreDiff, iouDiff, false);
+    }
 }
 
 TEST_P(Test_Int8_nets, FasterRCNN_resnet50)
@@ -1147,11 +1196,20 @@ TEST_P(Test_Int8_nets, TinyYoloVoc)
     {
     SCOPED_TRACE("batch size 1");
     testDarknetModel(config_file, weights_file, ref.rowRange(0, 2), scoreDiff, iouDiff);
+        {
+            SCOPED_TRACE("Per-tensor quantize");
+            testDarknetModel(config_file, weights_file, ref.rowRange(0, 2), 0.1, 0.2, 0.24, 0.6, false);
+        }
     }
 
     {
     SCOPED_TRACE("batch size 2");
     testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff);
+
+        {
+            SCOPED_TRACE("Per-tensor quantize");
+            testDarknetModel(config_file, weights_file, ref, 0.1, 0.2, 0.24, 0.6, false);
+        }
     }
 }
 
@@ -1269,6 +1327,11 @@ TEST_P(Test_Int8_nets, YOLOv4_tiny)
     {
         SCOPED_TRACE("batch size 1");
         testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, iouDiff, confThreshold);
+
+        {
+            SCOPED_TRACE("Per-tensor quantize");
+            testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, 0.16, 0.7, 0.4, false);
+        }
     }
 
     throw SkipTestException("batch2: bad accuracy on second image");

From a630ad73cb5dde7f8c69e86094bd4a1070ff0249 Mon Sep 17 00:00:00 2001
From: Tomoaki Teshima <tomoaki.teshima@gmail.com>
Date: Wed, 6 Jul 2022 23:31:31 +0900
Subject: [PATCH 109/178] suppress warning on GCC 7 and later

---
 3rdparty/openexr/CMakeLists.txt   | 4 ++++
 3rdparty/protobuf/CMakeLists.txt  | 4 ++++
 modules/calib3d/CMakeLists.txt    | 4 ++++
 modules/core/CMakeLists.txt       | 4 ++++
 modules/dnn/CMakeLists.txt        | 4 ++++
 modules/features2d/CMakeLists.txt | 4 ++++
 modules/flann/CMakeLists.txt      | 4 ++++
 modules/imgproc/CMakeLists.txt    | 4 ++++
 modules/ml/CMakeLists.txt         | 4 ++++
 modules/objdetect/CMakeLists.txt  | 4 ++++
 modules/shape/CMakeLists.txt      | 4 ++++
 modules/stitching/CMakeLists.txt  | 4 ++++
 modules/ts/CMakeLists.txt         | 4 ++++
 samples/cpp/CMakeLists.txt        | 4 ++++
 14 files changed, 56 insertions(+)

diff --git a/3rdparty/openexr/CMakeLists.txt b/3rdparty/openexr/CMakeLists.txt
index 88f60b23c0..1fbfa4f857 100644
--- a/3rdparty/openexr/CMakeLists.txt
+++ b/3rdparty/openexr/CMakeLists.txt
@@ -110,6 +110,10 @@ ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow -Wunused -Wsign-compare -Wundef -W
                                      -Wreorder
                                      -Wunused-result
 )
+if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+  # suppress warnings from GCC only on 7.1 and later
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
+endif()
 if(CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8.0)
   ocv_warnings_disable(CMAKE_CXX_FLAGS -Wclass-memaccess)
 endif()
diff --git a/3rdparty/protobuf/CMakeLists.txt b/3rdparty/protobuf/CMakeLists.txt
index f249d2dcc3..f8372dc1ad 100644
--- a/3rdparty/protobuf/CMakeLists.txt
+++ b/3rdparty/protobuf/CMakeLists.txt
@@ -25,6 +25,10 @@ else()
                                        -Warray-bounds  # GCC 9+
   )
 endif()
+if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+  # suppress warnings from GCC only on 7.1 and later
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
+endif()
 if(CV_ICC)
   ocv_warnings_disable(CMAKE_CXX_FLAGS
       -wd265 -wd858 -wd873 -wd2196
diff --git a/modules/calib3d/CMakeLists.txt b/modules/calib3d/CMakeLists.txt
index a38fdf18d8..1c31ad1903 100644
--- a/modules/calib3d/CMakeLists.txt
+++ b/modules/calib3d/CMakeLists.txt
@@ -3,6 +3,10 @@ set(debug_modules "")
 if(DEBUG_opencv_calib3d)
   list(APPEND debug_modules opencv_highgui)
 endif()
+if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+  # suppress warnings from GCC only on 7.1 and later
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
+endif()
 ocv_define_module(calib3d opencv_imgproc opencv_features2d ${debug_modules}
     WRAP java python js
 )
diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index beba9f804e..568a8afef1 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -38,6 +38,10 @@ endif()
 if(HAVE_CUDA)
   ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wenum-compare -Wunused-function -Wshadow)
 endif()
+if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+  # suppress warnings from GCC only on 7.1 and later
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
+endif()
 
 if(CV_TRACE AND HAVE_ITT)
   add_definitions(-DOPENCV_WITH_ITT=1)
diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt
index 4019356392..680be360ba 100644
--- a/modules/dnn/CMakeLists.txt
+++ b/modules/dnn/CMakeLists.txt
@@ -35,6 +35,10 @@ else()
                                        -Wunused-parameter -Wsign-compare
   )
 endif()
+if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+  # suppress warnings from GCC only on 7.1 and later
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
+endif()
 if(HAVE_CUDA)
   ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
 endif()
diff --git a/modules/features2d/CMakeLists.txt b/modules/features2d/CMakeLists.txt
index 1d29320a14..28af58565b 100644
--- a/modules/features2d/CMakeLists.txt
+++ b/modules/features2d/CMakeLists.txt
@@ -1,6 +1,10 @@
 set(the_description "2D Features Framework")
 
 ocv_add_dispatched_file(sift SSE4_1 AVX2 AVX512_SKX)
+if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+  # suppress warnings from GCC only on 7.1 and later
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
+endif()
 
 set(debug_modules "")
 if(DEBUG_opencv_features2d)
diff --git a/modules/flann/CMakeLists.txt b/modules/flann/CMakeLists.txt
index 78e041129b..5935f679d4 100644
--- a/modules/flann/CMakeLists.txt
+++ b/modules/flann/CMakeLists.txt
@@ -1,2 +1,6 @@
 set(the_description "Clustering and Search in Multi-Dimensional Spaces")
+if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+  # suppress warnings from GCC only on 7.1 and later
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
+endif()
 ocv_define_module(flann opencv_core WRAP python)
diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt
index 3b45482481..5787ec502c 100644
--- a/modules/imgproc/CMakeLists.txt
+++ b/modules/imgproc/CMakeLists.txt
@@ -12,6 +12,10 @@ ocv_add_dispatched_file(smooth SSE2 SSE4_1 AVX2)
 ocv_add_dispatched_file(sumpixels SSE2 AVX2 AVX512_SKX)
 ocv_add_dispatched_file(undistort SSE2 AVX2)
 ocv_define_module(imgproc opencv_core WRAP java python js)
+if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+  # suppress warnings from GCC only on 7.1 and later
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
+endif()
 
 ocv_check_environment_variables(OPENCV_IPP_GAUSSIAN_BLUR)
 option(OPENCV_IPP_GAUSSIAN_BLUR "Enable IPP optimizations for GaussianBlur (+8Mb in binary size)" OFF)
diff --git a/modules/ml/CMakeLists.txt b/modules/ml/CMakeLists.txt
index 1b64cc4f17..8b76e0c8ad 100644
--- a/modules/ml/CMakeLists.txt
+++ b/modules/ml/CMakeLists.txt
@@ -1,2 +1,6 @@
 set(the_description "Machine Learning")
+if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+  # suppress warnings from GCC only on 7.1 and later
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
+endif()
 ocv_define_module(ml opencv_core WRAP java python)
diff --git a/modules/objdetect/CMakeLists.txt b/modules/objdetect/CMakeLists.txt
index 414e578099..34b77eab6b 100644
--- a/modules/objdetect/CMakeLists.txt
+++ b/modules/objdetect/CMakeLists.txt
@@ -1,4 +1,8 @@
 set(the_description "Object Detection")
+if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+  # suppress warnings from GCC only on 7.1 and later
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
+endif()
 ocv_define_module(objdetect opencv_core opencv_imgproc opencv_calib3d WRAP java python js)
 
 if(HAVE_QUIRC)
diff --git a/modules/shape/CMakeLists.txt b/modules/shape/CMakeLists.txt
index 527a0c2f71..209eb11f34 100644
--- a/modules/shape/CMakeLists.txt
+++ b/modules/shape/CMakeLists.txt
@@ -1,2 +1,6 @@
 set(the_description "Shape descriptors and matchers")
+if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+  # suppress warnings from GCC only on 7.1 and later
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
+endif()
 ocv_define_module(shape opencv_core opencv_imgproc opencv_video WRAP python)
diff --git a/modules/stitching/CMakeLists.txt b/modules/stitching/CMakeLists.txt
index 44f35eb59b..7ea34a1608 100644
--- a/modules/stitching/CMakeLists.txt
+++ b/modules/stitching/CMakeLists.txt
@@ -3,6 +3,10 @@ set(the_description "Images stitching")
 if(HAVE_CUDA)
   ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations -Wshadow -Wstrict-aliasing)
 endif()
+if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+  # suppress warnings from GCC only on 7.1 and later
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
+endif()
 
 set(STITCHING_CONTRIB_DEPS "opencv_xfeatures2d")
 if(BUILD_SHARED_LIBS AND BUILD_opencv_world AND OPENCV_WORLD_EXCLUDE_EXTRA_MODULES)
diff --git a/modules/ts/CMakeLists.txt b/modules/ts/CMakeLists.txt
index f95bed0793..fd6e008333 100644
--- a/modules/ts/CMakeLists.txt
+++ b/modules/ts/CMakeLists.txt
@@ -15,6 +15,10 @@ if(WINRT)
 endif()
 
 ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
+if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+  # suppress warnings from GCC only on 7.1 and later
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
+endif()
 
 ocv_add_module(ts INTERNAL opencv_core opencv_imgproc opencv_imgcodecs opencv_videoio opencv_highgui)
 
diff --git a/samples/cpp/CMakeLists.txt b/samples/cpp/CMakeLists.txt
index 6ae9586fd4..8cb11c0222 100644
--- a/samples/cpp/CMakeLists.txt
+++ b/samples/cpp/CMakeLists.txt
@@ -41,6 +41,10 @@ foreach(sample_filename ${cpp_samples})
       ocv_warnings_disable(CMAKE_CXX_FLAGS -Winconsistent-missing-override -Wsuggest-override)
     endif()
   endif()
+  if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+    # suppress warnings from GCC only on 7.1 and later
+    ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
+  endif()
   set(package "cpp")
   if(sample_filename MATCHES "tutorial_code")
     set(package "tutorial")

From 139c44377032f58849bbab2ae454fcf14e89d762 Mon Sep 17 00:00:00 2001
From: Zihao Mu <zihaomu@outlook.com>
Date: Thu, 7 Jul 2022 18:23:08 +0800
Subject: [PATCH 110/178] Merge pull request #22183 from
 zihaomu:fastConv_ARMv7_compatible

DNN: ARMv7 compatible fastConv

* support armv7 on fastConv

* remove whitespace.
---
 .../fast_convolution/fast_convolution.hpp     |   2 +-
 .../fast_convolution.simd.hpp                 | 111 ++++++------
 .../fast_convolution/winograd_3x3s1_f63.cpp   | 171 +++++++++++++-----
 3 files changed, 183 insertions(+), 101 deletions(-)

diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp
index 30c5ea2009..c993781f5f 100644
--- a/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp
+++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp
@@ -9,7 +9,7 @@
 
 #ifndef FAST_CONV_PRAM
 #define FAST_CONV_PRAM
-#if CV_NEON && __aarch64__  // 32 registers.
+#if CV_NEON && CV_NEON_AARCH64  // 32 registers.
 #define FAST_CONV_MR 4
 #define FAST_CONV_NR 28
 enum { FAST_VEC_NLANES=4 };
diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.simd.hpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.simd.hpp
index f1542901e7..94b08141e9 100644
--- a/modules/dnn/src/layers/fast_convolution/fast_convolution.simd.hpp
+++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.simd.hpp
@@ -158,59 +158,7 @@ void convBlock_NEON(int k, const float *a, const float *b,
                 float *c, int ldc, const float *bias,
                 float minval, float maxval, bool ifActiv)
 {
-#if FAST_CONV_MR == 4 && FAST_CONV_NR == 12
-    {
-        float32x4_t c0 = vdupq_n_f32(bias[0]), c1 = c0, c2 = c0;
-        float32x4_t c3 = vdupq_n_f32(bias[1]), c4 = c3, c5 = c3;
-        float32x4_t c6 = vdupq_n_f32(bias[2]), c7 = c6, c8 = c6;
-        float32x4_t c9 = vdupq_n_f32(bias[3]), c10 = c9, c11 = c9;
-
-        float32x4_t a0 = vdupq_n_f32(0.0f);
-        float32x4_t b0 = vdupq_n_f32(0.0f), b1 = vdupq_n_f32(0.0f), b2 = vdupq_n_f32(0.0f);
-
-        for (int p = 0; p < k; p++, a += FAST_CONV_MR, b += FAST_CONV_NR)
-        {
-            a0 = vld1q_f32(a);
-            b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);
-
-            c0 = vfmaq_laneq_f32(c0, b0, a0, 0);
-            c1 = vfmaq_laneq_f32(c1, b1, a0, 0);
-            c2 = vfmaq_laneq_f32(c2, b2, a0, 0);
-            c3 = vfmaq_laneq_f32(c3, b0, a0, 1);
-            c4 = vfmaq_laneq_f32(c4, b1, a0, 1);
-            c5 = vfmaq_laneq_f32(c5, b2, a0, 1);
-
-            c6 = vfmaq_laneq_f32(c6, b0, a0, 2);
-            c7 = vfmaq_laneq_f32(c7, b1, a0, 2);
-            c8 = vfmaq_laneq_f32(c8, b2, a0, 2);
-
-            c9 = vfmaq_laneq_f32(c9, b0, a0, 3);
-            c10 = vfmaq_laneq_f32(c10, b1, a0, 3);
-            c11 = vfmaq_laneq_f32(c11, b2, a0, 3);
-        }
-
-        if (ifActiv)
-        {
-            b0 = vdupq_n_f32(minval), b1 = vdupq_n_f32(maxval);
-            c0 = vminq_f32(vmaxq_f32(c0, b0), b1);
-            c1 = vminq_f32(vmaxq_f32(c1, b0), b1);
-            c2 = vminq_f32(vmaxq_f32(c2, b0), b1);
-            c3 = vminq_f32(vmaxq_f32(c3, b0), b1);
-            c4 = vminq_f32(vmaxq_f32(c4, b0), b1);
-            c5 = vminq_f32(vmaxq_f32(c5, b0), b1);
-            c6 = vminq_f32(vmaxq_f32(c6, b0), b1);
-            c7 = vminq_f32(vmaxq_f32(c7, b0), b1);
-            c8 = vminq_f32(vmaxq_f32(c8, b0), b1);
-            c9 = vminq_f32(vmaxq_f32(c9, b0), b1);
-            c10 = vminq_f32(vmaxq_f32(c10, b0), b1);
-            c11 = vminq_f32(vmaxq_f32(c11, b0), b1);
-        }
-        vst1q_f32(c, c0); vst1q_f32(c+4, c1); vst1q_f32(c+8, c2);
-        vst1q_f32(c + ldc, c3); vst1q_f32(c + ldc + 4, c4); vst1q_f32(c + ldc + 8, c5);
-        vst1q_f32(c + ldc*2, c6); vst1q_f32(c + ldc*2 + 4, c7); vst1q_f32(c + ldc*2 + 8, c8);
-        vst1q_f32(c + ldc*3, c9); vst1q_f32(c + ldc*3 + 4, c10); vst1q_f32(c + ldc*3 + 8, c11);
-    }
-#elif FAST_CONV_MR == 4 && FAST_CONV_NR == 28
+#if CV_NEON_AARCH64 && FAST_CONV_MR == 4 && FAST_CONV_NR == 28  // AARCH64
     {
         float32x4_t c0 = vdupq_n_f32(bias[0]), c1 = c0, c2 = c0, c3 = c0, c4 = c0, c5 = c0, c24 = c0;
         float32x4_t c6 = vdupq_n_f32(bias[1]), c7 = c6, c8 = c6, c9 = c6, c10 = c6, c11 = c6, c25 = c6;
@@ -220,7 +168,8 @@ void convBlock_NEON(int k, const float *a, const float *b,
         float32x4_t a0 = vdupq_n_f32(0.0f);
         float32x4_t b0 = vdupq_n_f32(0.0f), b1 = vdupq_n_f32(0.0f), b2 = vdupq_n_f32(0.0f);
 
-        for (int p = 0; p < k; p++, a += FAST_CONV_MR) {
+        for (int p = 0; p < k; p++, a += FAST_CONV_MR)
+        {
             a0 = vld1q_f32(a);
             b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);
             b += 12;
@@ -330,11 +279,63 @@ void convBlock_NEON(int k, const float *a, const float *b,
         vst1q_f32(c + ldc * 3 + 20, c23);
         vst1q_f32(c + ldc * 3 + 24, c27);
     }
+#elif (!defined(CV_NEON_AARCH64) || !CV_NEON_AARCH64) && FAST_CONV_MR == 4 && FAST_CONV_NR == 12 // ARMv7
+    {
+        float32x4_t c0 = vdupq_n_f32(bias[0]), c1 = c0, c2 = c0;
+        float32x4_t c3 = vdupq_n_f32(bias[1]), c4 = c3, c5 = c3;
+        float32x4_t c6 = vdupq_n_f32(bias[2]), c7 = c6, c8 = c6;
+        float32x4_t c9 = vdupq_n_f32(bias[3]), c10 = c9, c11 = c9;
+
+        float32x2_t a0 = vdup_n_f32(0.0f), a1 = a0;
+        float32x4_t b0 = vdupq_n_f32(0.0f), b1 = vdupq_n_f32(0.0f), b2 = vdupq_n_f32(0.0f);
+
+        for (int p = 0; p < k; p++, a += FAST_CONV_MR, b += FAST_CONV_NR)
+        {
+            a0 = vld1_f32(a), a1 = vld1_f32(a+2);
+            b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);
+
+            c0 = vmlaq_lane_f32(c0, b0, a0, 0);
+            c1 = vmlaq_lane_f32(c1, b1, a0, 0);
+            c2 = vmlaq_lane_f32(c2, b2, a0, 0);
+
+            c3 = vmlaq_lane_f32(c3, b0, a0, 1);
+            c4 = vmlaq_lane_f32(c4, b1, a0, 1);
+            c5 = vmlaq_lane_f32(c5, b2, a0, 1);
+
+            c6 = vmlaq_lane_f32(c6, b0, a1, 0);
+            c7 = vmlaq_lane_f32(c7, b1, a1, 0);
+            c8 = vmlaq_lane_f32(c8, b2, a1, 0);
+
+            c9  = vmlaq_lane_f32(c9 , b0, a1, 1);
+            c10 = vmlaq_lane_f32(c10, b1, a1, 1);
+            c11 = vmlaq_lane_f32(c11, b2, a1, 1);
+        }
+
+        if (ifActiv)
+        {
+            b0 = vdupq_n_f32(minval), b1 = vdupq_n_f32(maxval);
+            c0 = vminq_f32(vmaxq_f32(c0, b0), b1);
+            c1 = vminq_f32(vmaxq_f32(c1, b0), b1);
+            c2 = vminq_f32(vmaxq_f32(c2, b0), b1);
+            c3 = vminq_f32(vmaxq_f32(c3, b0), b1);
+            c4 = vminq_f32(vmaxq_f32(c4, b0), b1);
+            c5 = vminq_f32(vmaxq_f32(c5, b0), b1);
+            c6 = vminq_f32(vmaxq_f32(c6, b0), b1);
+            c7 = vminq_f32(vmaxq_f32(c7, b0), b1);
+            c8 = vminq_f32(vmaxq_f32(c8, b0), b1);
+            c9 = vminq_f32(vmaxq_f32(c9, b0), b1);
+            c10 = vminq_f32(vmaxq_f32(c10, b0), b1);
+            c11 = vminq_f32(vmaxq_f32(c11, b0), b1);
+        }
+        vst1q_f32(c, c0); vst1q_f32(c+4, c1); vst1q_f32(c+8, c2);
+        vst1q_f32(c + ldc, c3); vst1q_f32(c + ldc + 4, c4); vst1q_f32(c + ldc + 8, c5);
+        vst1q_f32(c + ldc*2, c6); vst1q_f32(c + ldc*2 + 4, c7); vst1q_f32(c + ldc*2 + 8, c8);
+        vst1q_f32(c + ldc*3, c9); vst1q_f32(c + ldc*3 + 4, c10); vst1q_f32(c + ldc*3 + 8, c11);
+    }
 #else
 #error "unsupported FAST_CONV_MR and/or FAST_CONV_NR in convBlock_NEON."
 #endif
 }
-
 #endif
 } // namespace opt_NEON
 
diff --git a/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp b/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp
index 7a0720f55e..e841889007 100644
--- a/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp
+++ b/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp
@@ -192,7 +192,7 @@ static void winograd_trans_input_F63(float* src, float* dst, int Channle_div4, c
         float* input0 = input_buf0 + 4 * tiles * r;
 
         // TODO! support tiles > 12
-//#if (ARMV8)
+//#if CV_NEON_AARCH64
 //        for (; ti + 11 < tiles; ti += 12)
 //        {
 //            float* out1 = out0 + line_step * ofstab0[ti * 2] + Channle_div4 * ofstab0[ti * 2 + 1] * 4;
@@ -617,7 +617,6 @@ int runWinograd63(InputArray _input, OutputArray _output, const Ptr<FastConv2d>&
         float* output_ptr0 = output.ptr<float>() + bn * out_planesize * K;
 
         // Transform Input
-        //int taskItemLen = C_aligned/4/ntasks;
         int C_aligned_div4 = C_aligned/4;
 
         parallel_for_(Range(0, ntasks), [&](const Range& range)
@@ -1093,59 +1092,63 @@ int runWinograd63(InputArray _input, OutputArray _output, const Ptr<FastConv2d>&
 #else // ARMv7 16 registers.
 
                         // init 16 registers. FMA/load ratio = 32/12
-                        float32x4_t r00 = vdupq_n_f32(0.0f), r01 = r00, r02 = r00, r03 = r00;
-                        float32x4_t r04 = r00, r05 = r00, r06 = r00, r07 = r00;
-                        float32x4_t r08 = r00, r09 = r00, r10 = r00, r11 = r00;
-                        float32x4_t r12 = r00, r13 = r00, r14 = r00, r15 = r00;
+                        float32x2_t q00 = vdup_n_f32(0.0f), q01 = q00, q02 = q00, q03 = q00,
+                                    q04 = q00, q05 = q00, q06 = q00, q07 = q00;
+
+                        float32x4_t r04 = vdupq_n_f32(0.0f), r05 = r04, r06 = r04, r07 = r04;
+                        float32x4_t r08 = r04, r09 = r04, r10 = r04, r11 = r04;
+                        float32x4_t r12 = r04, r13 = r04, r14 = r04, r15 = r04;
 
                         for (; nn > 0; nn--)
                         {
-                            r00 = vld1q_f32(r0), r01 = vld1q_f32(r0+4), r02 = vld1q_f32(r0+8), r03 = vld1q_f32(r0+12);
+                            q00 = vld1_f32(r0), q01 = vld1_f32(r0+2), q02 = vld1_f32(r0+4), q03 = vld1_f32(r0+6);
+                            q04 = vld1_f32(r0+8), q05 = vld1_f32(r0+10), q06 = vld1_f32(r0+12), q07 = vld1_f32(r0+14);
                             r04 = vld1q_f32(k0), r05 = vld1q_f32(k0+4), r06 = vld1q_f32(k0+8), r07 = vld1q_f32(k0+12);
                             r0 += 16, k0 += 16;
 
-                            r08 = vfmaq_laneq_f32(r08, r04, r00, 0);
-                            r09 = vfmaq_laneq_f32(r09, r04, r01, 0);
-                            r10 = vfmaq_laneq_f32(r10, r04, r02, 0);
-                            r11 = vfmaq_laneq_f32(r11, r04, r03, 0);
+                            r08 = vmlaq_lane_f32(r08, r04, q00, 0);
+                            r09 = vmlaq_lane_f32(r09, r04, q02, 0);
+                            r10 = vmlaq_lane_f32(r10, r04, q04, 0);
+                            r11 = vmlaq_lane_f32(r11, r04, q06, 0);
 
-                            r08 = vfmaq_laneq_f32(r08, r05, r00, 1);
-                            r09 = vfmaq_laneq_f32(r09, r05, r01, 1);
-                            r10 = vfmaq_laneq_f32(r10, r05, r02, 1);
-                            r11 = vfmaq_laneq_f32(r11, r05, r03, 1);
+                            r08 = vmlaq_lane_f32(r08, r05, q00, 1);
+                            r09 = vmlaq_lane_f32(r09, r05, q02, 1);
+                            r10 = vmlaq_lane_f32(r10, r05, q04, 1);
+                            r11 = vmlaq_lane_f32(r11, r05, q06, 1);
 
-                            r08 = vfmaq_laneq_f32(r08, r06, r00, 2);
-                            r09 = vfmaq_laneq_f32(r09, r06, r01, 2);
-                            r10 = vfmaq_laneq_f32(r10, r06, r02, 2);
-                            r11 = vfmaq_laneq_f32(r11, r06, r03, 2);
+                            r08 = vmlaq_lane_f32(r08, r06, q01, 0);
+                            r09 = vmlaq_lane_f32(r09, r06, q03, 0);
+                            r10 = vmlaq_lane_f32(r10, r06, q05, 0);
+                            r11 = vmlaq_lane_f32(r11, r06, q07, 0);
 
-                            r08 = vfmaq_laneq_f32(r08, r07, r00, 3);
-                            r09 = vfmaq_laneq_f32(r09, r07, r01, 3);
-                            r10 = vfmaq_laneq_f32(r10, r07, r02, 3);
-                            r11 = vfmaq_laneq_f32(r11, r07, r03, 3);
+                            r08 = vmlaq_lane_f32(r08, r07, q01, 1);
+                            r09 = vmlaq_lane_f32(r09, r07, q03, 1);
+                            r10 = vmlaq_lane_f32(r10, r07, q05, 1);
+                            r11 = vmlaq_lane_f32(r11, r07, q07, 1);
 
-                            r00 = vld1q_f32(r0), r01 = vld1q_f32(r0+4), r02 = vld1q_f32(r0+8), r03 = vld1q_f32(r0+12);
+                            q00 = vld1_f32(r0), q01 = vld1_f32(r0+2), q02 = vld1_f32(r0+4), q03 = vld1_f32(r0+6);
+                            q04 = vld1_f32(r0+8), q05 = vld1_f32(r0+10), q06 = vld1_f32(r0+12), q07 = vld1_f32(r0+14);
                             r0 += 16;
 
-                            r12 = vfmaq_laneq_f32(r12, r04, r00, 0);
-                            r13 = vfmaq_laneq_f32(r13, r04, r01, 0);
-                            r14 = vfmaq_laneq_f32(r14, r04, r02, 0);
-                            r15 = vfmaq_laneq_f32(r15, r04, r03, 0);
-
-                            r12 = vfmaq_laneq_f32(r12, r05, r00, 1);
-                            r13 = vfmaq_laneq_f32(r13, r05, r01, 1);
-                            r14 = vfmaq_laneq_f32(r14, r05, r02, 1);
-                            r15 = vfmaq_laneq_f32(r15, r05, r03, 1);
-
-                            r12 = vfmaq_laneq_f32(r12, r06, r00, 2);
-                            r13 = vfmaq_laneq_f32(r13, r06, r01, 2);
-                            r14 = vfmaq_laneq_f32(r14, r06, r02, 2);
-                            r15 = vfmaq_laneq_f32(r15, r06, r03, 2);
-
-                            r12 = vfmaq_laneq_f32(r12, r07, r00, 3);
-                            r13 = vfmaq_laneq_f32(r13, r07, r01, 3);
-                            r14 = vfmaq_laneq_f32(r14, r07, r02, 3);
-                            r15 = vfmaq_laneq_f32(r15, r07, r03, 3);
+                            r12 = vmlaq_lane_f32(r12, r04, q00, 0);
+                            r13 = vmlaq_lane_f32(r13, r04, q02, 0);
+                            r14 = vmlaq_lane_f32(r14, r04, q04, 0);
+                            r15 = vmlaq_lane_f32(r15, r04, q06, 0);
+
+                            r12 = vmlaq_lane_f32(r12, r05, q00, 1);
+                            r13 = vmlaq_lane_f32(r13, r05, q02, 1);
+                            r14 = vmlaq_lane_f32(r14, r05, q04, 1);
+                            r15 = vmlaq_lane_f32(r15, r05, q06, 1);
+
+                            r12 = vmlaq_lane_f32(r12, r06, q01, 0);
+                            r13 = vmlaq_lane_f32(r13, r06, q03, 0);
+                            r14 = vmlaq_lane_f32(r14, r06, q05, 0);
+                            r15 = vmlaq_lane_f32(r15, r06, q07, 0);
+
+                            r12 = vmlaq_lane_f32(r12, r07, q01, 1);
+                            r13 = vmlaq_lane_f32(r13, r07, q03, 1);
+                            r14 = vmlaq_lane_f32(r14, r07, q05, 1);
+                            r15 = vmlaq_lane_f32(r15, r07, q07, 1);
                         }
 
                         vst1q_f32(output0_tm, r08), vst1q_f32(output0_tm + 4, r09), vst1q_f32(output0_tm + 8, r10), vst1q_f32(output0_tm + 12, r11);
@@ -1162,7 +1165,7 @@ int runWinograd63(InputArray _input, OutputArray _output, const Ptr<FastConv2d>&
                         const float* r0 = input_tm + ofstab0[ti * 2] * line_step;
                         const float* k0 = kernel_tm_i;
 
-
+#if CV_NEON_AARCH64
                         // init 12 registers. FMA/load ratio = 12/8
                         float32x4_t r00 = vdupq_n_f32(0.0f), r01 = r00, r02 = r00, r03 = r00;
                         float32x4_t r08 = r00, r09 = r00, r10 = r00, r11 = r00;
@@ -1194,7 +1197,42 @@ int runWinograd63(InputArray _input, OutputArray _output, const Ptr<FastConv2d>&
                             r18 = vfmaq_laneq_f32(r18, r11, r02, 3);
                             r19 = vfmaq_laneq_f32(r19, r11, r03, 3);
                         }
+#else
+                        // init 12 registers. FMA/load ratio = 12/8
+                        float32x2_t q00 = vdup_n_f32(0.0f), q01 = q00, q02 = q00, q03 = q00,
+                                q04 = q00, q05 = q00, q06 = q00, q07 = q00;
+                        float32x4_t r08 = vdupq_n_f32(0.0f), r09 = r08, r10 = r08, r11 = r08;
+                        float32x4_t r16 = r08, r17 = r08, r18 = r08, r19 = r08;
 
+                        for(; nn > 0; nn--)
+                        {
+                            q00 = vld1_f32(r0), q01 = vld1_f32(r0+2), q02 = vld1_f32(r0+4), q03 = vld1_f32(r0+6);
+                            q04 = vld1_f32(r0+8), q05 = vld1_f32(r0+10), q06 = vld1_f32(r0+12), q07 = vld1_f32(r0+14);
+                            r08 = vld1q_f32(k0), r09 = vld1q_f32(k0+4), r10 = vld1q_f32(k0+8), r11 = vld1q_f32(k0+12);
+                            r0 += 16, k0 += 16;
+
+                            r16 = vmlaq_lane_f32(r16, r08, q00, 0);
+                            r17 = vmlaq_lane_f32(r17, r08, q02, 0);
+                            r18 = vmlaq_lane_f32(r18, r08, q04, 0);
+                            r19 = vmlaq_lane_f32(r19, r08, q06, 0);
+
+                            r16 = vmlaq_lane_f32(r16, r09, q00, 1);
+                            r17 = vmlaq_lane_f32(r17, r09, q02, 1);
+                            r18 = vmlaq_lane_f32(r18, r09, q04, 1);
+                            r19 = vmlaq_lane_f32(r19, r09, q06, 1);
+
+                            r16 = vmlaq_lane_f32(r16, r10, q01, 0);
+                            r17 = vmlaq_lane_f32(r17, r10, q03, 0);
+                            r18 = vmlaq_lane_f32(r18, r10, q05, 0);
+                            r19 = vmlaq_lane_f32(r19, r10, q07, 0);
+
+                            r16 = vmlaq_lane_f32(r16, r11, q01, 1);
+                            r17 = vmlaq_lane_f32(r17, r11, q03, 1);
+                            r18 = vmlaq_lane_f32(r18, r11, q05, 1);
+                            r19 = vmlaq_lane_f32(r19, r11, q07, 1);
+
+                        }
+#endif
                         vst1q_f32(output0_tm, r16), vst1q_f32(output0_tm + 4, r17), vst1q_f32(output0_tm + 8, r18), vst1q_f32(output0_tm + 12, r19);
                         output0_tm += 16;
                     }
@@ -1205,6 +1243,7 @@ int runWinograd63(InputArray _input, OutputArray _output, const Ptr<FastConv2d>&
                         const float* r0 = input_tm + ofstab0[ti * 2] * line_step;
                         const float* k0 = kernel_tm_i;
 
+#if CV_NEON_AARCH64
                         // init 8 registers. FMA/load ratio = 8/6
                         float32x4_t r00 = vdupq_n_f32(0.0f), r01 = r00;
                         float32x4_t r08 = r00, r09 = r00, r10 = r00, r11 = r00;
@@ -1228,7 +1267,31 @@ int runWinograd63(InputArray _input, OutputArray _output, const Ptr<FastConv2d>&
                             r16 = vfmaq_laneq_f32(r16, r11, r00, 3);
                             r17 = vfmaq_laneq_f32(r17, r11, r01, 3);
                         }
+#else
+                        // init 8 registers. FMA/load ratio = 8/6
+                        float32x2_t q00 = vdup_n_f32(0.0f), q01 = q00, q02 = q00, q03 = q00;
+                        float32x4_t r08 = vdupq_n_f32(0.0f), r09 = r08, r10 = r08, r11 = r08;
+                        float32x4_t r16 = r08, r17 = r08;
 
+                        for(; nn > 0; nn--)
+                        {
+                            q00 = vld1_f32(r0), q01 = vld1_f32(r0+2), q02 = vld1_f32(r0+4), q03 = vld1_f32(r0+6);
+                            r08 = vld1q_f32(k0), r09 = vld1q_f32(k0+4), r10 = vld1q_f32(k0+8), r11 = vld1q_f32(k0+12);
+                            r0 += 8, k0 += 16;
+
+                            r16 = vmlaq_lane_f32(r16, r08, q00, 0);
+                            r17 = vmlaq_lane_f32(r17, r08, q02, 0);
+
+                            r16 = vmlaq_lane_f32(r16, r09, q00, 1);
+                            r17 = vmlaq_lane_f32(r17, r09, q02, 1);
+
+                            r16 = vmlaq_lane_f32(r16, r10, q01, 0);
+                            r17 = vmlaq_lane_f32(r17, r10, q03, 0);
+
+                            r16 = vmlaq_lane_f32(r16, r11, q01, 1);
+                            r17 = vmlaq_lane_f32(r17, r11, q03, 1);
+                        }
+#endif
                         vst1q_f32(output0_tm, r16), vst1q_f32(output0_tm + 4, r17);
                         output0_tm += 8;
                     }
@@ -1239,7 +1302,8 @@ int runWinograd63(InputArray _input, OutputArray _output, const Ptr<FastConv2d>&
                         const float* r0 = input_tm + ofstab0[ti * 2] * line_step;
                         const float* k0 = kernel_tm_i;
 
-                        // init 8 registers. FMA/load ratio = 8/6
+#if CV_NEON_AARCH64
+                        // init 6 registers. FMA/load ratio = 6/5
                         float32x4_t r00 = vdupq_n_f32(0.0f);
                         float32x4_t r08 = r00, r09 = r00, r10 = r00, r11 = r00;
                         float32x4_t r16 = r00;
@@ -1255,7 +1319,24 @@ int runWinograd63(InputArray _input, OutputArray _output, const Ptr<FastConv2d>&
                             r16 = vfmaq_laneq_f32(r16, r10, r00, 2);
                             r16 = vfmaq_laneq_f32(r16, r11, r00, 3);
                         }
+#else
+                        // init 6 registers. FMA/load ratio = 6/5
+                        float32x2_t q00 = vdup_n_f32(0.0f), q01 = q00;
+                        float32x4_t r08 = vdupq_n_f32(0.0f), r09 = r08, r10 = r08, r11 = r08;
+                        float32x4_t r16 = r08;
 
+                        for(; nn > 0; nn--)
+                        {
+                            q00 = vld1_f32(r0), q01 = vld1_f32(r0+2);
+                            r08 = vld1q_f32(k0), r09 = vld1q_f32(k0+4), r10 = vld1q_f32(k0+8), r11 = vld1q_f32(k0+12);
+                            r0 += 4, k0 += 16;
+
+                            r16 = vmlaq_lane_f32(r16, r08, q00, 0);
+                            r16 = vmlaq_lane_f32(r16, r09, q00, 1);
+                            r16 = vmlaq_lane_f32(r16, r10, q01, 0);
+                            r16 = vmlaq_lane_f32(r16, r11, q01, 1);
+                        }
+#endif
                         vst1q_f32(output0_tm, r16);
                         output0_tm += 4;
                     }

From 0a88f84847b8d22582d11ed054cdbe33512b6a25 Mon Sep 17 00:00:00 2001
From: Vincent Rabaud <vrabaud@users.noreply.github.com>
Date: Fri, 8 Jul 2022 00:24:01 +0200
Subject: [PATCH 111/178] Merge pull request #22208 from vrabaud:34_calibinit

Add missing code.

* Add missing code.

This was removed in c7fc563dc0ac2d21ad2bd618e741227c0a4b4f76
The resize is also a side-effect.

* Use shorteer cv::Point2f *=
---
 modules/calib3d/src/calibinit.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/calib3d/src/calibinit.cpp b/modules/calib3d/src/calibinit.cpp
index 0a8cdd5c6f..1dff8de6e6 100644
--- a/modules/calib3d/src/calibinit.cpp
+++ b/modules/calib3d/src/calibinit.cpp
@@ -1232,7 +1232,7 @@ int ChessBoardDetector::cleanFoundConnectedQuads(std::vector<ChessBoardQuad*>& q
         centers[i] = ci;
         center += ci;
     }
-    center.x *= (1.0f / quad_count);
+    center *= (1.0f / quad_count);
 
     // If we still have more quadrangles than we should,
     // we try to eliminate bad ones based on minimizing the bounding box.
@@ -1298,6 +1298,7 @@ int ChessBoardDetector::cleanFoundConnectedQuads(std::vector<ChessBoardQuad*>& q
         quad_group[min_box_area_index] = quad_group[quad_count];
         centers[min_box_area_index] = centers[quad_count];
     }
+    quad_group.resize(quad_count);
 
     return quad_count;
 }

From f5622646741f312f1793cc2066fe51c0cdb14457 Mon Sep 17 00:00:00 2001
From: SergeyIvanov87 <sergius140608@rambler.ru>
Date: Fri, 8 Jul 2022 11:40:25 +0300
Subject: [PATCH 112/178] Add VAAPI into tests & VPL sample

---
 modules/gapi/CMakeLists.txt                   |   4 +
 .../gapi/streaming/onevpl/accel_types.hpp     |   4 +
 .../gapi/samples/onevpl_infer_single_roi.cpp  | 277 +++++++++++++++---
 modules/gapi/src/backends/ie/giebackend.cpp   |  77 +++--
 .../onevpl/accelerators/accel_policy_cpu.cpp  |   2 +-
 .../accelerators/accel_policy_va_api.cpp      |  33 +--
 .../accelerators/accel_policy_va_api.hpp      |   1 -
 .../accelerators/dx11_alloc_resource.cpp      |  21 +-
 .../surface/cpu_frame_adapter.cpp             |   2 +-
 .../onevpl/cfg_param_device_selector.cpp      | 130 +++++++-
 .../onevpl/cfg_param_device_selector.hpp      |   2 +
 .../onevpl/device_selector_interface.cpp      |  13 +
 .../gapi/src/streaming/onevpl/source_priv.cpp |   7 +-
 .../gapi_streaming_vpl_device_selector.cpp    | 135 +++++++++
 14 files changed, 588 insertions(+), 120 deletions(-)

diff --git a/modules/gapi/CMakeLists.txt b/modules/gapi/CMakeLists.txt
index 9b97a1b92a..4e45c4432f 100644
--- a/modules/gapi/CMakeLists.txt
+++ b/modules/gapi/CMakeLists.txt
@@ -334,6 +334,7 @@ if(HAVE_GAPI_ONEVPL)
       message(FATAL_ERROR "PkgConfig not found: building HAVE_GAPI_ONEVPL without libVA support is impossible on UNIX systems")
     endif()
     ocv_target_link_libraries(${the_module} PRIVATE ${PKG_LIBVA_LIBRARIES} ${PKG_THREAD_LIBRARIES})
+    ocv_target_link_libraries(opencv_test_gapi PRIVATE ${PKG_LIBVA_LIBRARIES} ${PKG_THREAD_LIBRARIES})
   endif()
 endif()
 
@@ -373,6 +374,9 @@ if(TARGET example_gapi_onevpl_infer_single_roi)
   if(HAVE_D3D11 AND HAVE_OPENCL)
     ocv_target_include_directories(example_gapi_onevpl_infer_single_roi SYSTEM PRIVATE ${OPENCL_INCLUDE_DIRS})
   endif()
+  if(PKG_LIBVA_FOUND)
+    ocv_target_link_libraries(example_gapi_onevpl_infer_single_roi PRIVATE ${PKG_LIBVA_LIBRARIES} ${PKG_THREAD_LIBRARIES})
+  endif()
 endif()
 
 if(TARGET example_gapi_pipeline_modeling_tool)
diff --git a/modules/gapi/include/opencv2/gapi/streaming/onevpl/accel_types.hpp b/modules/gapi/include/opencv2/gapi/streaming/onevpl/accel_types.hpp
index c53b1b31db..119188d96a 100644
--- a/modules/gapi/include/opencv2/gapi/streaming/onevpl/accel_types.hpp
+++ b/modules/gapi/include/opencv2/gapi/streaming/onevpl/accel_types.hpp
@@ -65,6 +65,10 @@ GAPI_EXPORTS Device create_dx11_device(Device::Ptr device_ptr,
                                        const std::string& device_name);
 GAPI_EXPORTS Context create_dx11_context(Context::Ptr ctx_ptr);
 
+GAPI_EXPORTS Device create_vaapi_device(Device::Ptr device_ptr,
+                                       const std::string& device_name,
+                                       int file_description = -1);
+GAPI_EXPORTS Context create_vaapi_context(Context::Ptr ctx_ptr);
 } // namespace onevpl
 } // namespace wip
 } // namespace gapi
diff --git a/modules/gapi/samples/onevpl_infer_single_roi.cpp b/modules/gapi/samples/onevpl_infer_single_roi.cpp
index a89cd5071b..ed8c74ea8c 100644
--- a/modules/gapi/samples/onevpl_infer_single_roi.cpp
+++ b/modules/gapi/samples/onevpl_infer_single_roi.cpp
@@ -34,6 +34,17 @@
 #endif // HAVE_DIRECTX
 #endif // HAVE_INF_ENGINE
 
+#ifdef __linux__
+#if defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+#include "va/va.h"
+#include "va/va_drm.h"
+
+#include <fcntl.h>
+#include <unistd.h>
+#endif // defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+#endif // __linux__
+
+
 const std::string about =
     "This is an OpenCV-based version of oneVPLSource decoder example";
 const std::string keys =
@@ -41,14 +52,21 @@ const std::string keys =
     "{ input                        |                                           | Path to the input demultiplexed video file }"
     "{ output                       |                                           | Path to the output RAW video file. Use .avi extension }"
     "{ facem                        | face-detection-adas-0001.xml              | Path to OpenVINO IE face detection model (.xml) }"
-    "{ faced                        | AUTO                                      | Target device for face detection model (e.g. AUTO, GPU, VPU, ...) }"
-    "{ cfg_params                   | <prop name>:<value>;<prop name>:<value>   | Semicolon separated list of oneVPL mfxVariants which is used for configuring source (see `MFXSetConfigFilterProperty` by https://spec.oneapi.io/versions/latest/elements/oneVPL/source/index.html) }"
+    "{ faced                        | GPU                                       | Target device for face detection model (e.g. AUTO, GPU, VPU, ...) }"
+    "{ cfg_params                   |                                           | Semicolon separated list of oneVPL mfxVariants which is used for configuring source (see `MFXSetConfigFilterProperty` by https://spec.oneapi.io/versions/latest/elements/oneVPL/source/index.html) }"
     "{ streaming_queue_capacity     | 1                                         | Streaming executor queue capacity. Calculated automatically if 0 }"
     "{ frames_pool_size             | 0                                         | OneVPL source applies this parameter as preallocated frames pool size}"
     "{ vpp_frames_pool_size         | 0                                         | OneVPL source applies this parameter as preallocated frames pool size for VPP preprocessing results}"
-    "{ roi                          | -1,-1,-1,-1                               | Region of interest (ROI) to use for inference. Identified automatically when not set }";
+    "{ roi                          | -1,-1,-1,-1                               | Region of interest (ROI) to use for inference. Identified automatically when not set }"
+    "{ source_device                | CPU                                       | choose device for decoding }"
+    "{ preproc_device               | CPU                                       | choose device for preprocessing }";
+
 
 namespace {
+bool is_gpu(const std::string &device_name) {
+    return device_name.find("GPU") != std::string::npos;
+}
+
 std::string get_weights_path(const std::string &model_path) {
     const auto EXT_LEN = 4u;
     const auto sz = model_path.size();
@@ -260,6 +278,75 @@ GAPI_OCV_KERNEL(OCVParseSSD, ParseSSD) {
 
 namespace cfg {
 typename cv::gapi::wip::onevpl::CfgParam create_from_string(const std::string &line);
+
+struct flow {
+    flow(bool preproc, bool rctx) :
+        vpl_preproc_enable(preproc),
+        ie_remote_ctx_enable(rctx) {
+    }
+    bool vpl_preproc_enable = false;
+    bool ie_remote_ctx_enable = false;
+};
+
+using support_matrix =
+        std::map <std::string/*source_dev_id*/,
+                  std::map<std::string/*preproc_device_id*/,
+                           std::map <std::string/*rctx device_id*/, std::shared_ptr<flow>>>>;
+support_matrix resolved_conf{{
+                            {"GPU", {{
+                                        {"",    {{ "CPU", std::make_shared<flow>(false, false)},
+                                                 { "GPU", {/* unsupported:
+                                                           * ie GPU preproc isn't available */}}
+                                                }},
+
+                                        {"CPU", {{ "CPU", {/* unsupported: preproc mix */}},
+                                                 { "GPU", {/* unsupported: preproc mix */}}
+                                                }},
+
+                                        {"GPU", {{ "CPU", std::make_shared<flow>(true, false)},
+                                                 { "GPU", std::make_shared<flow>(true, true)}}}
+                                    }}
+                            },
+                            {"CPU", {{
+                                        {"",    {{ "CPU", std::make_shared<flow>(false, false)},
+                                                 { "GPU", std::make_shared<flow>(false, false)}
+                                                }},
+
+                                        {"CPU", {{ "CPU", std::make_shared<flow>(true, false)},
+                                                 { "GPU", std::make_shared<flow>(true, false)}
+                                                }},
+
+                                        {"GPU", {{ "CPU", {/* unsupported: preproc mix */}},
+                                                 { "GPU", {/* unsupported: preproc mix */}}}}
+                                    }}
+                            }
+                        }};
+
+static void print_available_cfg(std::ostream &out,
+                                const std::string &source_device,
+                                const std::string &preproc_device,
+                                const std::string &ie_device_id) {
+    const std::string source_device_cfg_name("--source_device=");
+    const std::string preproc_device_cfg_name("--preproc_device=");
+    const std::string ie_cfg_name("--faced=");
+    out << "unsupported acceleration param combinations:\n"
+                     << source_device_cfg_name << source_device << " "
+                     << preproc_device_cfg_name << preproc_device << " "
+                     << ie_cfg_name << ie_device_id <<
+                     "\n\nSupported matrix:\n\n" << std::endl;
+    for (const auto &s_d : cfg::resolved_conf) {
+        std::string prefix = source_device_cfg_name + s_d.first;
+        for (const auto &p_d : s_d.second) {
+            std::string mid_prefix = prefix + +"\t" + preproc_device_cfg_name +
+                                    (p_d.first.empty() ? "" : p_d.first);
+            for (const auto &i_d : p_d.second) {
+                if (i_d.second) {
+                    std::cerr << mid_prefix << "\t" << ie_cfg_name <<i_d.first << std::endl;
+                }
+            }
+        }
+    }
+}
 }
 
 int main(int argc, char *argv[]) {
@@ -280,6 +367,15 @@ int main(int argc, char *argv[]) {
     const auto source_decode_queue_capacity = cmd.get<uint32_t>("frames_pool_size");
     const auto source_vpp_queue_capacity = cmd.get<uint32_t>("vpp_frames_pool_size");
     const auto device_id = cmd.get<std::string>("faced");
+    const auto source_device = cmd.get<std::string>("source_device");
+    const auto preproc_device = cmd.get<std::string>("preproc_device");
+
+    // validate support matrix
+    std::shared_ptr<cfg::flow> flow_settings = cfg::resolved_conf[source_device][preproc_device][device_id];
+    if (!flow_settings) {
+        cfg::print_available_cfg(std::cerr, source_device, preproc_device, device_id);
+        return -1;
+    }
 
     // check output file extension
     if (!output.empty()) {
@@ -303,6 +399,7 @@ int main(int argc, char *argv[]) {
         return -1;
     }
 
+    // apply VPL source optimization params
     if (source_decode_queue_capacity != 0) {
         source_cfgs.push_back(cv::gapi::wip::onevpl::CfgParam::create_frames_pool_size(source_decode_queue_capacity));
     }
@@ -316,22 +413,57 @@ int main(int argc, char *argv[]) {
         device_id
     };
 
-    // Create device_ptr & context_ptr using graphic API
-    // InferenceEngine requires such device & context to create its own
-    // remote shared context through InferenceEngine::ParamMap in
-    // GAPI InferenceEngine backend to provide interoperability with onevpl::GSource
-    // So GAPI InferenceEngine backend and onevpl::GSource MUST share the same
-    // device and context
-    cv::util::optional<cv::gapi::wip::onevpl::Device> accel_device;
-    cv::util::optional<cv::gapi::wip::onevpl::Context> accel_ctx;
-
-#ifdef HAVE_INF_ENGINE
+    // It is allowed (and highly recommended) to reuse predefined device_ptr & context_ptr objects
+    // received from user application. Current sample demonstrate how to deal with this situation.
+    //
+    // But if you do not need this fine-grained acceleration devices configuration then
+    // just use default constructors for onevpl::GSource, IE and preprocessing module.
+    // But please pay attention that default pipeline construction in this case will be
+    // very inefficient and carries out multiple CPU-GPU memory copies
+    //
+    // If you want to reach max performance and seize copy-free approach for specific
+    // device & context selection then follow the steps below.
+    // The situation is complicated a little bit in comparison with default configuration, thus
+    // let's focusing this:
+    //
+    // - all component-participants (Source, Preprocessing, Inference)
+    // must share the same device & context instances
+    //
+    // - you must wrapping your available device & context instancs into thin
+    // `cv::gapi::wip::Device` & `cv::gapi::wip::Context`.
+    // !!! Please pay attention that both objects are weak wrapper so you must ensure
+    // that device & context would be alived before full pipeline created !!!
+    //
+    // - you should pass such wrappers as constructor arguments for each component in pipeline:
+    //      a) use extended constructor for `onevpl::GSource` for activating predefined device & context
+    //      b) use `cfgContextParams` method of `cv::gapi::ie::Params` to enable `PreprocesingEngine`
+    // for predefined device & context
+    //      c) use `InferenceEngine::ParamMap` to activate remote ctx in Inference Engine for given
+    // device & context
+    //
+    //
+    //// P.S. the current sample supports heterogenous pipeline construction also.
+    //// It is possible to make up mixed device approach.
+    //// Please feel free to explore different configurations!
+
+    cv::util::optional<cv::gapi::wip::onevpl::Device> gpu_accel_device;
+    cv::util::optional<cv::gapi::wip::onevpl::Context> gpu_accel_ctx;
+    cv::gapi::wip::onevpl::Device cpu_accel_device = cv::gapi::wip::onevpl::create_host_device();
+    cv::gapi::wip::onevpl::Context cpu_accel_ctx = cv::gapi::wip::onevpl::create_host_context();
+    // create GPU device if requested
+    if (is_gpu(device_id)
+        || is_gpu(source_device)
+        || is_gpu(preproc_device)) {
 #ifdef HAVE_DIRECTX
 #ifdef HAVE_D3D11
-    auto dx11_dev = createCOMPtrGuard<ID3D11Device>();
-    auto dx11_ctx = createCOMPtrGuard<ID3D11DeviceContext>();
+        // create DX11 device & context owning handles.
+        // wip::Device & wip::Context provide non-owning semantic of resources and act
+        // as weak references API wrappers in order to carry type-erased resources type
+        // into appropriate modules: onevpl::GSource, PreprocEngine and InferenceEngine
+        // Until modules are not created owner handles must stay alive
+        auto dx11_dev = createCOMPtrGuard<ID3D11Device>();
+        auto dx11_ctx = createCOMPtrGuard<ID3D11DeviceContext>();
 
-    if (device_id.find("GPU") != std::string::npos) {
         auto adapter_factory = createCOMPtrGuard<IDXGIFactory>();
         {
             IDXGIFactory* out_factory = nullptr;
@@ -365,40 +497,102 @@ int main(int argc, char *argv[]) {
         }
 
         std::tie(dx11_dev, dx11_ctx) = create_device_with_ctx(intel_adapter.get());
-        accel_device = cv::util::make_optional(
+        gpu_accel_device = cv::util::make_optional(
                             cv::gapi::wip::onevpl::create_dx11_device(
                                                         reinterpret_cast<void*>(dx11_dev.get()),
-                                                        device_id));
-        accel_ctx = cv::util::make_optional(
+                                                        "GPU"));
+        gpu_accel_ctx = cv::util::make_optional(
                             cv::gapi::wip::onevpl::create_dx11_context(
                                                         reinterpret_cast<void*>(dx11_ctx.get())));
+#endif // HAVE_D3D11
+#endif // HAVE_DIRECTX
+#ifdef __linux__
+#if defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+        static const char *predefined_vaapi_devices_list[] {"/dev/dri/renderD128",
+                                                            "/dev/dri/renderD129",
+                                                            "/dev/dri/card0",
+                                                            "/dev/dri/card1",
+                                                            nullptr};
+        std::stringstream ss;
+        int device_fd = -1;
+        VADisplay va_handle = nullptr;
+        for (const char **device_path = predefined_vaapi_devices_list;
+            *device_path != nullptr; device_path++) {
+            device_fd = open(*device_path, O_RDWR);
+            if (device_fd < 0) {
+                std::string info("Cannot open GPU file: \"");
+                info = info + *device_path + "\", error: " + strerror(errno);
+                ss << info << std::endl;
+                continue;
+            }
+            va_handle = vaGetDisplayDRM(device_fd);
+            if (!va_handle) {
+                close(device_fd);
+                std::string info("VAAPI device vaGetDisplayDRM failed, error: ");
+                info += strerror(errno);
+                ss << info << std::endl;
+                continue;
+            }
+            int major_version = 0, minor_version = 0;
+            VAStatus status {};
+            status = vaInitialize(va_handle, &major_version, &minor_version);
+            if (VA_STATUS_SUCCESS != status) {
+                close(device_fd);
+                va_handle = nullptr;
+
+                std::string info("Cannot initialize VAAPI device, error: ");
+                info += vaErrorStr(status);
+                ss << info << std::endl;
+                continue;
+            }
+            std::cout << "VAAPI created for device: " << *device_path << ", version: "
+                      << major_version << "." << minor_version << std::endl;
+            break;
+        }
 
-        // put accel type description for VPL source
-        source_cfgs.push_back(cfg::create_from_string(
-                                        "mfxImplDescription.AccelerationMode"
-                                        ":"
-                                        "MFX_ACCEL_MODE_VIA_D3D11"));
+        // check device creation
+        if (!va_handle) {
+            std::cerr << "Cannot create VAAPI device. Log:\n" << ss.str() << std::endl;
+            return -1;
+        }
+        gpu_accel_device = cv::util::make_optional(
+                            cv::gapi::wip::onevpl::create_vaapi_device(reinterpret_cast<void*>(va_handle),
+                                                                       "GPU", device_fd));
+        gpu_accel_ctx = cv::util::make_optional(
+                            cv::gapi::wip::onevpl::create_vaapi_context(nullptr));
+#endif // defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+#endif // #ifdef __linux__
     }
 
-#endif // HAVE_D3D11
-#endif // HAVE_DIRECTX
-    // set ctx_config for GPU device only - no need in case of CPU device type
-    if (accel_device.has_value() &&
-        accel_device.value().get_name().find("GPU") != std::string::npos) {
+#ifdef HAVE_INF_ENGINE
+    // activate remote ctx in Inference Engine for GPU device
+    // when other pipeline component use the GPU device too
+    if (flow_settings->ie_remote_ctx_enable) {
         InferenceEngine::ParamMap ctx_config({{"CONTEXT_TYPE", "VA_SHARED"},
-                                              {"VA_DEVICE", accel_device.value().get_ptr()} });
+                                              {"VA_DEVICE", gpu_accel_device.value().get_ptr()} });
         face_net.cfgContextParams(ctx_config);
+        std::cout << "enforce InferenceEngine remote context on device: " << device_id << std::endl;
 
         // NB: consider NV12 surface because it's one of native GPU image format
         face_net.pluginConfig({{"GPU_NV12_TWO_INPUTS", "YES" }});
+        std::cout << "enforce InferenceEngine NV12 blob" << std::endl;
     }
 #endif // HAVE_INF_ENGINE
 
-    // turn on preproc
-    if (accel_device.has_value() && accel_ctx.has_value()) {
-        face_net.cfgPreprocessingParams(accel_device.value(),
-                                        accel_ctx.value());
-        std::cout << "enforce VPP preprocessing on " << device_id << std::endl;
+    // turn on VPP PreprocesingEngine if available & requested
+    if (flow_settings->vpl_preproc_enable) {
+        if (is_gpu(preproc_device)) {
+            // activate VPP PreprocesingEngine on GPU
+            face_net.cfgPreprocessingParams(gpu_accel_device.value(),
+                                            gpu_accel_ctx.value());
+        } else {
+            // activate VPP PreprocesingEngine on CPU
+            face_net.cfgPreprocessingParams(cpu_accel_device,
+                                            cpu_accel_ctx);
+        }
+        std::cout << "enforce VPP preprocessing on device: " << preproc_device << std::endl;
+    } else {
+        std::cout << "use InferenceEngine default preprocessing" << std::endl;
     }
 
     auto kernels = cv::gapi::kernels
@@ -414,10 +608,17 @@ int main(int argc, char *argv[]) {
     // Create source
     cv::gapi::wip::IStreamSource::Ptr cap;
     try {
-        if (accel_device.has_value() && accel_ctx.has_value()) {
+        if (is_gpu(source_device)) {
+            std::cout << "enforce VPL Source deconding on device: " << source_device << std::endl;
+            // use special 'Device' constructor for `onevpl::GSource`
+            // put accel type description for VPL source
+            source_cfgs.push_back(cfg::create_from_string(
+                                        "mfxImplDescription.AccelerationMode"
+                                        ":"
+                                        "MFX_ACCEL_MODE_VIA_D3D11"));
             cap = cv::gapi::wip::make_onevpl_src(file_path, source_cfgs,
-                                                 accel_device.value(),
-                                                 accel_ctx.value());
+                                                 gpu_accel_device.value(),
+                                                 gpu_accel_ctx.value());
         } else {
             cap = cv::gapi::wip::make_onevpl_src(file_path, source_cfgs);
         }
diff --git a/modules/gapi/src/backends/ie/giebackend.cpp b/modules/gapi/src/backends/ie/giebackend.cpp
index 6647e484b1..eca07ce9df 100644
--- a/modules/gapi/src/backends/ie/giebackend.cpp
+++ b/modules/gapi/src/backends/ie/giebackend.cpp
@@ -300,6 +300,7 @@ struct IEUnit {
                             cv::util::any_cast<InferenceEngine::ParamMap>(&params.context_config);
         if (ctx_params != nullptr) {
             auto ie_core = cv::gimpl::ie::wrap::getCore();
+            GAPI_LOG_DEBUG(nullptr, "create IE remote ctx for device id: " << params.device_id);
             rctx = ie_core.CreateContext(params.device_id, *ctx_params);
         }
 
@@ -703,45 +704,6 @@ cv::MediaFrame preprocess_frame_impl(cv::MediaFrame &&in_frame, const std::strin
     return std::move(in_frame);
 }
 
-inline IE::Blob::Ptr extractRemoteBlob(IECallContext& ctx, std::size_t i,
-                                       const std::string &layer_name,
-                                       const cv::util::optional<cv::Rect> &opt_roi,
-                                       cv::MediaFrame* out_keep_alive_frame,
-                                       bool* out_is_preprocessed) {
-    GAPI_Assert(ctx.inShape(i) == cv::GShape::GFRAME &&
-                "Remote blob is supported for MediaFrame only");
-    cv::MediaFrame frame = ctx.inFrame(i);
-    if (ctx.uu.preproc_engine_impl) {
-        GAPI_LOG_DEBUG(nullptr, "Try to use preprocessing for decoded remote frame in remote ctx");
-        frame = preprocess_frame_impl(std::move(frame), layer_name, ctx, opt_roi,
-                                      out_keep_alive_frame, out_is_preprocessed);
-    }
-
-    // Request params for result frame whatever it got preprocessed or not
-    cv::util::any any_blob_params = frame.blobParams();
-
-    using ParamType = std::pair<InferenceEngine::TensorDesc, InferenceEngine::ParamMap>;
-    using NV12ParamType = std::pair<ParamType, ParamType>;
-
-    NV12ParamType* blob_params = cv::util::any_cast<NV12ParamType>(&any_blob_params);
-    if (blob_params == nullptr) {
-        GAPI_Assert(false && "Incorrect type of blobParams:"
-                             "expected std::pair<ParamType, ParamType>,"
-                             "with ParamType std::pair<InferenceEngine::TensorDesc,"
-                                                      "InferenceEngine::ParamMap >>");
-    }
-
-    //The parameters are TensorDesc and ParamMap for both y and uv blobs
-    auto y_blob = ctx.uu.rctx->CreateBlob(blob_params->first.first, blob_params->first.second);
-    auto uv_blob = ctx.uu.rctx->CreateBlob(blob_params->second.first, blob_params->second.second);
-
-#if INF_ENGINE_RELEASE >= 2021010000
-    return IE::make_shared_blob<IE::NV12Blob>(y_blob, uv_blob);
-#else
-    return IE::make_shared_blob<InferenceEngine::NV12Blob>(y_blob, uv_blob);
-#endif
-}
-
 inline IE::Blob::Ptr extractBlob(IECallContext& ctx,
                                  std::size_t i,
                                  cv::gapi::ie::TraitAs hint,
@@ -749,11 +711,6 @@ inline IE::Blob::Ptr extractBlob(IECallContext& ctx,
                                  const cv::util::optional<cv::Rect> &opt_roi,
                                  cv::MediaFrame* out_keep_alive_frame = nullptr,
                                  bool* out_is_preprocessed = nullptr) {
-    if (ctx.uu.rctx != nullptr) {
-        return extractRemoteBlob(ctx, i, layer_name, opt_roi,
-                                 out_keep_alive_frame, out_is_preprocessed);
-    }
-
     switch (ctx.inShape(i)) {
         case cv::GShape::GFRAME: {
             auto frame = ctx.inFrame(i);
@@ -762,6 +719,37 @@ inline IE::Blob::Ptr extractBlob(IECallContext& ctx,
                 frame = preprocess_frame_impl(std::move(frame), layer_name, ctx, opt_roi,
                                               out_keep_alive_frame, out_is_preprocessed);
             }
+
+            // NB: check OV remote device context availability.
+            // if it exist and MediaFrame shares the same device context
+            // then we create a remote blob without memory copy
+            if (ctx.uu.rctx != nullptr) {
+                // Request params for result frame whatever it got preprocessed or not
+                cv::util::any any_blob_params = frame.blobParams();
+                using ParamType = std::pair<InferenceEngine::TensorDesc, InferenceEngine::ParamMap>;
+                using NV12ParamType = std::pair<ParamType, ParamType>;
+
+                NV12ParamType* blob_params = cv::util::any_cast<NV12ParamType>(&any_blob_params);
+                if (blob_params == nullptr) {
+                    GAPI_Assert(false && "Incorrect type of blobParams:"
+                                         "expected std::pair<ParamType, ParamType>,"
+                                         "with ParamType std::pair<InferenceEngine::TensorDesc,"
+                                         "InferenceEngine::ParamMap >>");
+                }
+
+                //The parameters are TensorDesc and ParamMap for both y and uv blobs
+                auto y_blob = ctx.uu.rctx->CreateBlob(blob_params->first.first, blob_params->first.second);
+                auto uv_blob = ctx.uu.rctx->CreateBlob(blob_params->second.first, blob_params->second.second);
+
+#if INF_ENGINE_RELEASE >= 2021010000
+                return IE::make_shared_blob<IE::NV12Blob>(y_blob, uv_blob);
+#else
+                return IE::make_shared_blob<InferenceEngine::NV12Blob>(y_blob, uv_blob);
+#endif
+            }
+
+            // NB: If no OV remote context created then use default MediaFrame accessor approach:
+            // it invokes memory copying operation If GPU MediaFrame come
             ctx.views.emplace_back(new cv::MediaFrame::View(frame.access(cv::MediaFrame::Access::R)));
             return wrapIE(*(ctx.views.back()), frame.desc());
         }
@@ -1158,6 +1146,7 @@ static void PostOutputs(InferenceEngine::InferRequest &request,
         ctx->out.post(std::move(output), ctx->eptr);
     }
 
+    ctx->views.clear();
     ctx->releaseKeepAliveFrame(&request);
 }
 
diff --git a/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_cpu.cpp b/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_cpu.cpp
index 67ffdf9377..d81c66b901 100644
--- a/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_cpu.cpp
+++ b/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_cpu.cpp
@@ -178,8 +178,8 @@ VPLCPUAccelerationPolicy::create_surface_pool(size_t pool_size, size_t surface_s
         GAPI_LOG_INFO(nullptr, "Released workspace memory: " << ptr);
         ptr = nullptr;
 #else
+        GAPI_LOG_INFO(nullptr, "Workspace memory to release: " << ptr);
         free(ptr);
-        GAPI_LOG_INFO(nullptr, "Released workspace memory: " << ptr);
         ptr = nullptr;
 #endif
 
diff --git a/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_va_api.cpp b/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_va_api.cpp
index 8fa0be9914..82bada7b70 100644
--- a/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_va_api.cpp
+++ b/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_va_api.cpp
@@ -27,30 +27,16 @@ namespace onevpl {
 VPLVAAPIAccelerationPolicy::VPLVAAPIAccelerationPolicy(device_selector_ptr_t selector) :
     VPLAccelerationPolicy(selector),
     cpu_dispatcher(new VPLCPUAccelerationPolicy(selector)),
-    va_handle(),
-    device_fd(-1) {
+    va_handle() {
 #if defined(HAVE_VA) || defined(HAVE_VA_INTEL)
-    // TODO Move it out in device selector
-    device_fd = open("/dev/dri/renderD128", O_RDWR);
-    if (device_fd < 0) {
-        GAPI_LOG_WARNING(nullptr, "VAAPI device descriptor \"/dev/dri/renderD128\" has not found");
-        throw std::runtime_error("cannot open VAAPI device");
-    }
-    va_handle = vaGetDisplayDRM(device_fd);
-    if (!va_handle) {
-        GAPI_LOG_WARNING(nullptr, "VAAPI device vaGetDisplayDRM failed, error: " << strerror(errno));
-        close(device_fd);
-        throw std::runtime_error("vaGetDisplayDRM failed");
-    }
-    int major_version = 0, minor_version = 0;
-    VAStatus status {};
-    status = vaInitialize(va_handle, &major_version, &minor_version);
-    if (VA_STATUS_SUCCESS != status) {
-        GAPI_LOG_WARNING(nullptr, "Cannot initialize VAAPI device, error: " << vaErrorStr(status));
-        close(device_fd);
-        throw std::runtime_error("vaInitialize failed");
-    }
-    GAPI_LOG_INFO(nullptr, "created");
+    // setup VAAPI device
+    IDeviceSelector::DeviceScoreTable devices = get_device_selector()->select_devices();
+    GAPI_Assert(devices.size() == 1 && "Multiple(or zero) acceleration  VAAPI devices are not unsupported");
+    AccelType accel_type = devices.begin()->second.get_type();
+    GAPI_Assert(accel_type == AccelType::VAAPI &&
+                "Unexpected device AccelType while is waiting AccelType::VAAPI");
+
+    va_handle = reinterpret_cast<VADisplay>(devices.begin()->second.get_ptr());
 #else  // defined(HAVE_VA) || defined(HAVE_VA_INTEL)
     GAPI_Assert(false && "VPLVAAPIAccelerationPolicy unavailable in current configuration");
 #endif // defined(HAVE_VA) || defined(HAVE_VA_INTEL)
@@ -58,7 +44,6 @@ VPLVAAPIAccelerationPolicy::VPLVAAPIAccelerationPolicy(device_selector_ptr_t sel
 
 VPLVAAPIAccelerationPolicy::~VPLVAAPIAccelerationPolicy() {
     vaTerminate(va_handle);
-    close(device_fd);
     GAPI_LOG_INFO(nullptr, "destroyed");
 }
 
diff --git a/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_va_api.hpp b/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_va_api.hpp
index ee7453f982..37a59a0070 100644
--- a/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_va_api.hpp
+++ b/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_va_api.hpp
@@ -50,7 +50,6 @@ private:
     std::unique_ptr<VPLAccelerationPolicy> cpu_dispatcher;
 #ifdef __linux__
     VADisplay va_handle;
-    int device_fd;  // TODO Move it out in device selector
 #endif // __linux__
 };
 } // namespace onevpl
diff --git a/modules/gapi/src/streaming/onevpl/accelerators/dx11_alloc_resource.cpp b/modules/gapi/src/streaming/onevpl/accelerators/dx11_alloc_resource.cpp
index 574860e03d..77cfbb18b1 100644
--- a/modules/gapi/src/streaming/onevpl/accelerators/dx11_alloc_resource.cpp
+++ b/modules/gapi/src/streaming/onevpl/accelerators/dx11_alloc_resource.cpp
@@ -96,6 +96,8 @@ void LockAdapter::unlock_write(mfxMemId mid, mfxFrameData &data) {
 
 SharedLock* LockAdapter::set_adaptee(SharedLock* new_impl) {
     SharedLock* old_impl = impl;
+    GAPI_LOG_DEBUG(nullptr, "this: " << this <<
+                            ", old: " << old_impl << ", new: " << new_impl);
     GAPI_DbgAssert(old_impl == nullptr || new_impl == nullptr && "Must not be previous impl");
     impl = new_impl;
     return old_impl;
@@ -184,6 +186,8 @@ void DX11AllocationItem::on_first_in_impl(mfxFrameData *ptr) {
     D3D11_MAP mapType = D3D11_MAP_READ;
     UINT mapFlags = D3D11_MAP_FLAG_DO_NOT_WAIT;
 
+    GAPI_LOG_DEBUG(nullptr, "texture: " << get_texture_ptr() <<
+                            ", subresorce: " << get_subresource());
     shared_device_context->CopySubresourceRegion(get_staging_texture_ptr(), 0,
                                                  0, 0, 0,
                                                  get_texture_ptr(),
@@ -245,8 +249,8 @@ mfxStatus DX11AllocationItem::release_access(mfxFrameData *ptr) {
 }
 
 mfxStatus DX11AllocationItem::shared_access_acquire_unsafe(mfxFrameData *ptr) {
-    GAPI_LOG_DEBUG(nullptr, "acquire READ lock: " << this);
-    GAPI_LOG_DEBUG(nullptr, "texture: " << get_texture_ptr() <<
+    GAPI_LOG_DEBUG(nullptr, "acquire READ lock: " << this <<
+                            ", texture: " << get_texture_ptr() <<
                             ", sub id: " << get_subresource());
     // shared access requires elastic barrier
     // first-in visited thread uses resource mapping on host memory
@@ -257,6 +261,7 @@ mfxStatus DX11AllocationItem::shared_access_acquire_unsafe(mfxFrameData *ptr) {
 
     if (!(ptr->Y && (ptr->UV || (ptr->U && ptr->V)))) {
         GAPI_LOG_WARNING(nullptr, "No any data obtained: " << this);
+        GAPI_DbgAssert(false && "shared access must provide data");
         return MFX_ERR_LOCK_MEMORY;
     }
     GAPI_LOG_DEBUG(nullptr, "READ access granted: " << this);
@@ -264,8 +269,8 @@ mfxStatus DX11AllocationItem::shared_access_acquire_unsafe(mfxFrameData *ptr) {
 }
 
 mfxStatus DX11AllocationItem::shared_access_release_unsafe(mfxFrameData *ptr) {
-    GAPI_LOG_DEBUG(nullptr, "releasing READ lock: " << this);
-    GAPI_LOG_DEBUG(nullptr, "texture: " << get_texture_ptr() <<
+    GAPI_LOG_DEBUG(nullptr, "releasing READ lock: " << this <<
+                            ", texture: " << get_texture_ptr() <<
                             ", sub id: " << get_subresource());
     // releasing shared access requires elastic barrier
     // last-out thread must make memory unmapping then and only then no more
@@ -278,8 +283,8 @@ mfxStatus DX11AllocationItem::shared_access_release_unsafe(mfxFrameData *ptr) {
 }
 
 mfxStatus DX11AllocationItem::exclusive_access_acquire_unsafe(mfxFrameData *ptr) {
-    GAPI_LOG_DEBUG(nullptr, "acquire WRITE lock: " << this);
-    GAPI_LOG_DEBUG(nullptr, "texture: " << get_texture_ptr() <<
+    GAPI_LOG_DEBUG(nullptr, "acquire WRITE lock: " << this <<
+                            ", texture: " << get_texture_ptr() <<
                             ", sub id: " << get_subresource());
     D3D11_MAP mapType = D3D11_MAP_WRITE;
     UINT mapFlags = D3D11_MAP_FLAG_DO_NOT_WAIT;
@@ -321,8 +326,8 @@ mfxStatus DX11AllocationItem::exclusive_access_acquire_unsafe(mfxFrameData *ptr)
 }
 
 mfxStatus DX11AllocationItem::exclusive_access_release_unsafe(mfxFrameData *ptr) {
-    GAPI_LOG_DEBUG(nullptr, "releasing WRITE lock: " << this);
-    GAPI_LOG_DEBUG(nullptr, "texture: " << get_texture_ptr() <<
+    GAPI_LOG_DEBUG(nullptr, "releasing WRITE lock: " << this <<
+                            ", texture: " << get_texture_ptr() <<
                             ", sub id: " << get_subresource());
 
     get_device_ctx_ptr()->Unmap(get_staging_texture_ptr(), 0);
diff --git a/modules/gapi/src/streaming/onevpl/accelerators/surface/cpu_frame_adapter.cpp b/modules/gapi/src/streaming/onevpl/accelerators/surface/cpu_frame_adapter.cpp
index 751ed7abbd..24a5b9fb7f 100644
--- a/modules/gapi/src/streaming/onevpl/accelerators/surface/cpu_frame_adapter.cpp
+++ b/modules/gapi/src/streaming/onevpl/accelerators/surface/cpu_frame_adapter.cpp
@@ -64,7 +64,7 @@ MediaFrame::View VPLMediaFrameCPUAdapter::access(MediaFrame::Access) {
 }
 
 cv::util::any VPLMediaFrameCPUAdapter::blobParams() const {
-    GAPI_Assert("VPLMediaFrameCPUAdapter::blobParams() is not implemented");
+    throw std::runtime_error("VPLMediaFrameCPUAdapter::blobParams() is not implemented");
     return {};
 }
 
diff --git a/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.cpp b/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.cpp
index 90bf3e8849..28f01c5718 100644
--- a/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.cpp
+++ b/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.cpp
@@ -30,6 +30,16 @@
 #endif // HAVE_D3D11
 #endif // HAVE_DIRECTX
 
+#ifdef __linux__
+#if defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+#include "va/va.h"
+#include "va/va_drm.h"
+
+#include <fcntl.h>
+#include <unistd.h>
+#endif // defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+#endif // __linux__
+
 #include <codecvt>
 #include "opencv2/core/directx.hpp"
 
@@ -37,6 +47,23 @@ namespace cv {
 namespace gapi {
 namespace wip {
 namespace onevpl {
+#ifdef __linux__
+struct Aux {
+    ~Aux() {
+        for (int fd : fds) {
+            close(fd);
+        }
+    }
+
+    void remember_fd(int fd) {
+        fds.insert(fd);
+    }
+private:
+    std::set<int> fds;
+};
+#else
+struct Aux {};
+#endif
 
 static std::vector<CfgParam> insertCfgparam(std::vector<CfgParam> &&param_array, AccelType type) {
     switch (type) {
@@ -153,7 +180,78 @@ CfgParamDeviceSelector::CfgParamDeviceSelector(const CfgParams& cfg_params) :
             break;
         }
         case MFX_IMPL_VIA_VAAPI : {
-            GAPI_LOG_WARNING(nullptr, "TODO MFX_IMPL_VIA_VAAPI falls back to CPU case")
+#ifdef __linux__
+#if defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+            static const char *predefined_vaapi_devices_list[] {"/dev/dri/renderD128",
+                                                                "/dev/dri/renderD129",
+                                                                "/dev/dri/card0",
+                                                                "/dev/dri/card1",
+                                                                nullptr};
+            std::stringstream ss;
+            int device_fd = -1;
+            VADisplay va_handle;va_handle = nullptr;
+            for (const char **device_path = predefined_vaapi_devices_list;
+                *device_path != nullptr; device_path++) {
+                device_fd = open(*device_path, O_RDWR);
+                if (device_fd < 0) {
+                    std::string info("Cannot open GPU file: \"");
+                    info = info + *device_path + "\", error: " + strerror(errno);
+                    GAPI_LOG_DEBUG(nullptr, info);
+                    ss << info << std::endl;
+                    continue;
+                }
+                va_handle = vaGetDisplayDRM(device_fd);
+                if (!va_handle) {
+                    close(device_fd);
+
+                    std::string info("VAAPI device vaGetDisplayDRM failed, error: ");
+                    info += strerror(errno);
+                    GAPI_LOG_DEBUG(nullptr, info);
+                    ss << info << std::endl;
+                    continue;
+                }
+                int major_version = 0, minor_version = 0;
+                VAStatus status {};
+                status = vaInitialize(va_handle, &major_version, &minor_version);
+                if (VA_STATUS_SUCCESS != status) {
+                    close(device_fd);
+                    va_handle = nullptr;
+
+                    std::string info("Cannot initialize VAAPI device, error: ");
+                    info += vaErrorStr(status);
+                    GAPI_LOG_DEBUG(nullptr, info);
+                    ss << info << std::endl;
+                    continue;
+                }
+                GAPI_LOG_INFO(nullptr, "VAAPI created for device: " << *device_path);
+                break;
+            }
+
+            // check device creation
+            if (!va_handle) {
+                GAPI_LOG_WARNING(nullptr, "Cannot create VAAPI device. Log:\n" << ss.str());
+                throw std::logic_error(std::string("Cannot create device for \"") +
+                                   CfgParam::acceleration_mode_name() +
+                                   ": MFX_IMPL_VIA_VAAPI\"");
+            }
+
+            // Unfortunately VAAPI doesn't provide API for extracting initial FD value from VADisplay, which
+            // value is stored as VADisplay fields, by the way. But, because we here are only one creator
+            // of VAAPI device then we will need make cleanup for all allocated resources by ourselfs
+            //and FD is definitely must be utilized. So, let's use complementary struct `Aux` which
+            // represent some kind of 'platform specific data' and which will store opened FD for
+            // future utilization
+            platform_specific_data.reset (new Aux);
+            platform_specific_data->remember_fd(device_fd);
+
+            suggested_device = IDeviceSelector::create<Device>(va_handle, "GPU", AccelType::VAAPI);
+            suggested_context = IDeviceSelector::create<Context>(nullptr, AccelType::VAAPI);
+#else  // defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+            GAPI_Assert(false && "VPLVAAPIAccelerationPolicy unavailable in current linux configuration");
+#endif // defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+#else // #ifdef __linux__
+            GAPI_Assert(false && "MFX_IMPL_VIA_VAAPI is supported on linux only")
+#endif // #ifdef __linux__
             break;
         }
         case MFX_ACCEL_MODE_NA: {
@@ -234,6 +332,19 @@ CfgParamDeviceSelector::CfgParamDeviceSelector(Device::Ptr device_ptr,
 #endif // #if defined(HAVE_DIRECTX) && defined(HAVE_D3D11)
             break;
         }
+        case MFX_IMPL_VIA_VAAPI : {
+#ifdef __linux__
+#if defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+            suggested_device = IDeviceSelector::create<Device>(device_ptr, device_id, AccelType::VAAPI);
+            suggested_context = IDeviceSelector::create<Context>(nullptr, AccelType::VAAPI);
+#else  // defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+            GAPI_Assert(false && "VPLVAAPIAccelerationPolicy unavailable in current linux configuration");
+#endif // defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+#else // #ifdef __linux__
+            GAPI_Assert(false && "MFX_IMPL_VIA_VAAPI is supported on linux only")
+#endif // #ifdef __linux__
+            break;
+        }
         case MFX_ACCEL_MODE_NA: {
             GAPI_LOG_WARNING(nullptr, "Incompatible \"" <<  CfgParam::acceleration_mode_name() <<
                                       ": MFX_ACCEL_MODE_NA\" with "
@@ -284,7 +395,13 @@ CfgParamDeviceSelector::CfgParamDeviceSelector(const Device &device,
 #endif // defined(HAVE_DIRECTX) && defined(HAVE_D3D11)
         }
         case AccelType::VAAPI:
-            GAPI_LOG_WARNING(nullptr, "TODO MFX_IMPL_VIA_VAAPI falls back to CPU case")
+#ifdef __linux__
+#if !defined(HAVE_VA) || !defined(HAVE_VA_INTEL)
+            GAPI_Assert(false && "VPLVAAPIAccelerationPolicy unavailable in current linux configuration");
+#endif // defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+#else // #ifdef __linux__
+            GAPI_Assert(false && "MFX_IMPL_VIA_VAAPI is supported on linux only")
+#endif // #ifdef __linux__
             break;
         case AccelType::HOST:
             break;
@@ -332,6 +449,15 @@ CfgParamDeviceSelector::~CfgParamDeviceSelector() {
 #endif // defined(HAVE_DIRECTX) && defined(HAVE_D3D11)
             break;
         }
+        case AccelType::VAAPI: {
+#ifdef __linux__
+#if defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+            VADisplay va_handle = reinterpret_cast<VADisplay>(suggested_device.get_ptr());
+            vaTerminate(va_handle);
+            platform_specific_data.reset();
+#endif // defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+#endif // #ifdef __linux__
+        }
         default:
             break;
     }
diff --git a/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.hpp b/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.hpp
index 5dae1c508d..c09218c41e 100644
--- a/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.hpp
+++ b/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.hpp
@@ -20,6 +20,7 @@ namespace gapi {
 namespace wip {
 namespace onevpl {
 
+class Aux;
 struct GAPI_EXPORTS CfgParamDeviceSelector final: public IDeviceSelector {
     CfgParamDeviceSelector(const CfgParams& params = {});
     CfgParamDeviceSelector(Device::Ptr device_ptr,
@@ -37,6 +38,7 @@ struct GAPI_EXPORTS CfgParamDeviceSelector final: public IDeviceSelector {
 private:
     Device suggested_device;
     Context suggested_context;
+    std::unique_ptr<Aux> platform_specific_data;
 };
 } // namespace onevpl
 } // namespace wip
diff --git a/modules/gapi/src/streaming/onevpl/device_selector_interface.cpp b/modules/gapi/src/streaming/onevpl/device_selector_interface.cpp
index d58d1d3d3c..0b374177da 100644
--- a/modules/gapi/src/streaming/onevpl/device_selector_interface.cpp
+++ b/modules/gapi/src/streaming/onevpl/device_selector_interface.cpp
@@ -119,6 +119,19 @@ Context create_dx11_context(Context::Ptr ctx_ptr) {
                                                                 AccelType::DX11);
 }
 
+Device create_vaapi_device(Device::Ptr device_ptr,
+                           const std::string& device_name,
+                           int file_description) {
+    return detail::DeviceContextCreator::create_entity<Device>(device_ptr,
+                                                               device_name,
+                                                               AccelType::VAAPI);
+}
+
+Context create_vaapi_context(Context::Ptr ctx_ptr) {
+    return detail::DeviceContextCreator::create_entity<Context>(ctx_ptr,
+                                                                AccelType::VAAPI);
+}
+
 } // namespace onevpl
 } // namespace wip
 } // namespace gapi
diff --git a/modules/gapi/src/streaming/onevpl/source_priv.cpp b/modules/gapi/src/streaming/onevpl/source_priv.cpp
index 765bdd3b64..f460a2a6ed 100644
--- a/modules/gapi/src/streaming/onevpl/source_priv.cpp
+++ b/modules/gapi/src/streaming/onevpl/source_priv.cpp
@@ -321,11 +321,16 @@ std::unique_ptr<VPLAccelerationPolicy> GSource::Priv::initializeHWAccel(std::sha
 
 const std::vector<CfgParam>& GSource::Priv::getDefaultCfgParams()
 {
+#ifdef __WIN32__
     static const std::vector<CfgParam> def_params =
         get_params_from_string<CfgParam>(
                     "mfxImplDescription.Impl: MFX_IMPL_TYPE_HARDWARE\n"
                     "mfxImplDescription.AccelerationMode: MFX_ACCEL_MODE_VIA_D3D11\n");
-
+#else
+    static const std::vector<CfgParam> def_params =
+        get_params_from_string<CfgParam>(
+                    "mfxImplDescription.Impl: MFX_IMPL_TYPE_HARDWARE\n");
+#endif
     return def_params;
 }
 
diff --git a/modules/gapi/test/streaming/gapi_streaming_vpl_device_selector.cpp b/modules/gapi/test/streaming/gapi_streaming_vpl_device_selector.cpp
index ee1be9f433..47c548368b 100644
--- a/modules/gapi/test/streaming/gapi_streaming_vpl_device_selector.cpp
+++ b/modules/gapi/test/streaming/gapi_streaming_vpl_device_selector.cpp
@@ -29,6 +29,16 @@
 #endif // HAVE_D3D11
 #endif // HAVE_DIRECTX
 
+#ifdef __linux__
+#if defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+#include "va/va.h"
+#include "va/va_drm.h"
+
+#include <fcntl.h>
+#include <unistd.h>
+#endif // defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+#endif // __linux__
+
 #ifdef HAVE_ONEVPL
 #include "streaming/onevpl/onevpl_export.hpp"
 #include "streaming/onevpl/cfg_param_device_selector.hpp"
@@ -208,6 +218,131 @@ TEST(OneVPL_Source_Device_Selector_CfgParam, DX11DeviceFromCfgParamWithDX11Disab
 #endif // HAVE_D3D11
 #endif // HAVE_DIRECTX
 
+#ifdef __linux__
+#if defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+TEST(OneVPL_Source_Device_Selector_CfgParam, DefaultDeviceWithEmptyCfgParam_VAAPI_ENABLED)
+{
+    using namespace cv::gapi::wip::onevpl;
+    std::vector<CfgParam> empty_params;
+    CfgParamDeviceSelector selector(empty_params);
+    IDeviceSelector::DeviceScoreTable devs = selector.select_devices();
+    EXPECT_TRUE(devs.size() == 1);
+    test_host_dev_eq(*devs.begin(), IDeviceSelector::Score::MaxActivePriority);
+
+    IDeviceSelector::DeviceContexts ctxs = selector.select_context();
+    EXPECT_TRUE(ctxs.size() == 1);
+    test_host_ctx_eq(*ctxs.begin());
+}
+
+TEST(OneVPL_Source_Device_Selector_CfgParam, DefaultDeviceWithVAAPIAccelCfgParam_VAAPI_ENABLED)
+{
+    using namespace cv::gapi::wip::onevpl;
+    std::vector<CfgParam> cfg_params_w_vaapi;
+    cfg_params_w_vaapi.push_back(CfgParam::create_acceleration_mode(MFX_ACCEL_MODE_VIA_VAAPI));
+    std::unique_ptr<CfgParamDeviceSelector> selector_ptr;
+    EXPECT_NO_THROW(selector_ptr.reset(new CfgParamDeviceSelector(cfg_params_w_vaapi)));
+    IDeviceSelector::DeviceScoreTable devs = selector_ptr->select_devices();
+
+    EXPECT_TRUE(devs.size() == 1);
+    test_dev_eq(*devs.begin(), IDeviceSelector::Score::MaxActivePriority,
+                AccelType::VAAPI,
+                std::get<1>(*devs.begin()).get_ptr() /* compare just type */);
+
+    IDeviceSelector::DeviceContexts ctxs = selector_ptr->select_context();
+    EXPECT_TRUE(ctxs.size() == 1);
+    EXPECT_FALSE(ctxs.begin()->get_ptr());
+}
+
+TEST(OneVPL_Source_Device_Selector_CfgParam, NULLDeviceWithVAAPIAccelCfgParam_VAAPI_ENABLED)
+{
+    using namespace cv::gapi::wip::onevpl;
+    std::vector<CfgParam> cfg_params_w_vaapi;
+    cfg_params_w_vaapi.push_back(CfgParam::create_acceleration_mode(MFX_ACCEL_MODE_VIA_VAAPI));
+    Device::Ptr empty_device_ptr = nullptr;
+    Context::Ptr empty_ctx_ptr = nullptr;
+    EXPECT_THROW(CfgParamDeviceSelector sel(empty_device_ptr, "GPU",
+                                            empty_ctx_ptr,
+                                            cfg_params_w_vaapi),
+                 std::logic_error); // empty_device_ptr must be invalid
+}
+
+
+TEST(OneVPL_Source_Device_Selector_CfgParam, ExternalDeviceWithVAAPIAccelCfgParam_VAAPI_ENABLED)
+{
+    using namespace cv::gapi::wip::onevpl;
+    VADisplay va_handle = nullptr;
+    struct FileDescriptorRAII {
+        FileDescriptorRAII() :fd (-1) {}
+        ~FileDescriptorRAII() { reset(-1); }
+        void reset(int d) {
+            if (fd != -1) {
+                close(fd);
+            }
+            fd = d;
+        }
+        operator int() { return fd; }
+    private:
+        FileDescriptorRAII(FileDescriptorRAII& src) = delete;
+        FileDescriptorRAII& operator=(FileDescriptorRAII& src) = delete;
+        FileDescriptorRAII(FileDescriptorRAII&& src) = delete;
+        FileDescriptorRAII& operator=(FileDescriptorRAII&& src) = delete;
+        int fd = -1;
+    };
+    static const char *predefined_vaapi_devices_list[] {"/dev/dri/renderD128",
+                                                        "/dev/dri/renderD129",
+                                                        "/dev/dri/card0",
+                                                        "/dev/dri/card1",
+                                                        nullptr};
+
+    FileDescriptorRAII device_fd;
+    for (const char **device_path = predefined_vaapi_devices_list;
+        *device_path != nullptr; device_path++) {
+        device_fd.reset(open(*device_path, O_RDWR));
+        if (device_fd < 0) {
+            continue;
+        }
+        va_handle = vaGetDisplayDRM(device_fd);
+        if (!va_handle) {
+            continue;
+        }
+        int major_version = 0, minor_version = 0;
+        VAStatus status {};
+        status = vaInitialize(va_handle, &major_version, &minor_version);
+        if (VA_STATUS_SUCCESS != status) {
+            close(device_fd);
+            va_handle = nullptr;
+            continue;
+        }
+        break;
+    }
+    EXPECT_TRUE(device_fd != -1);
+    EXPECT_TRUE(va_handle);
+    auto device = cv::util::make_optional(
+                            cv::gapi::wip::onevpl::create_vaapi_device(reinterpret_cast<void*>(va_handle),
+                                                                       "GPU", device_fd));
+    auto device_context = cv::util::make_optional(
+                            cv::gapi::wip::onevpl::create_vaapi_context(nullptr));
+
+    std::unique_ptr<CfgParamDeviceSelector> selector_ptr;
+    std::vector<CfgParam> cfg_params_w_vaapi;
+    cfg_params_w_vaapi.push_back(CfgParam::create_acceleration_mode(MFX_ACCEL_MODE_VIA_VAAPI));
+    EXPECT_NO_THROW(selector_ptr.reset(new CfgParamDeviceSelector(device.value(),
+                                                                  device_context.value(),
+                                                                  cfg_params_w_vaapi)));
+    IDeviceSelector::DeviceScoreTable devs = selector_ptr->select_devices();
+
+    EXPECT_TRUE(devs.size() == 1);
+    test_dev_eq(*devs.begin(), IDeviceSelector::Score::MaxActivePriority,
+                AccelType::VAAPI, device.value().get_ptr());
+
+    IDeviceSelector::DeviceContexts ctxs = selector_ptr->select_context();
+    EXPECT_TRUE(ctxs.size() == 1);
+    EXPECT_EQ(reinterpret_cast<void*>(ctxs.begin()->get_ptr()),
+              device_context.value().get_ptr());
+}
+#endif // defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+#endif // #ifdef __linux__
+
 TEST(OneVPL_Source_Device_Selector_CfgParam, UnknownPtrDeviceFromCfgParam)
 {
     using namespace cv::gapi::wip::onevpl;

From b3b235ebc0be024c251b1689f9f184ea2b35ec36 Mon Sep 17 00:00:00 2001
From: SergeyIvanov87 <sergius140608@rambler.ru>
Date: Fri, 8 Jul 2022 17:15:31 +0300
Subject: [PATCH 113/178] Apply WITHA_VA option in cmake

---
 modules/gapi/CMakeLists.txt                   | 30 +++++++++----------
 .../gapi/samples/onevpl_infer_single_roi.cpp  |  2 --
 .../onevpl/device_selector_interface.cpp      |  2 +-
 3 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/modules/gapi/CMakeLists.txt b/modules/gapi/CMakeLists.txt
index 4e45c4432f..63373ad9c4 100644
--- a/modules/gapi/CMakeLists.txt
+++ b/modules/gapi/CMakeLists.txt
@@ -320,21 +320,17 @@ if(HAVE_GAPI_ONEVPL)
   endif()
 
   if(UNIX)
-    find_package(PkgConfig)
-    if(PkgConfig_FOUND)
-      pkg_check_modules(PKG_LIBVA libva>=1.2 libva-drm>=1.2)
-      if(PKG_LIBVA_FOUND)
-        set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
-        set(THREADS_PREFER_PTHREAD_FLAG TRUE)
-        find_package(Threads REQUIRED)
-      else()
-        message(FATAL_ERROR "libva not found: building HAVE_GAPI_ONEVPL without libVA support is impossible on UNIX systems")
+    if(WITH_VA)
+      include("${OpenCV_SOURCE_DIR}/cmake/OpenCVFindVA.cmake")
+      if(VA_INCLUDE_DIR)
+        ocv_target_include_directories(${the_module} SYSTEM PRIVATE ${VA_INCLUDE_DIR})
+        ocv_target_include_directories(opencv_test_gapi SYSTEM PRIVATE ${VA_INCLUDE_DIR})
+        ocv_target_link_libraries(${the_module} PRIVATE ${VA_LIBRARIES})
+        ocv_target_link_libraries(opencv_test_gapi PRIVATE ${VA_LIBRARIES})
       endif()
-    else()
-      message(FATAL_ERROR "PkgConfig not found: building HAVE_GAPI_ONEVPL without libVA support is impossible on UNIX systems")
-    endif()
-    ocv_target_link_libraries(${the_module} PRIVATE ${PKG_LIBVA_LIBRARIES} ${PKG_THREAD_LIBRARIES})
-    ocv_target_link_libraries(opencv_test_gapi PRIVATE ${PKG_LIBVA_LIBRARIES} ${PKG_THREAD_LIBRARIES})
+    else(WITH_VA)
+      message(FATAL_ERROR "libva not found: building HAVE_GAPI_ONEVPL without libVA support is impossible on UNIX systems")
+    endif(WITH_VA)
   endif()
 endif()
 
@@ -374,8 +370,10 @@ if(TARGET example_gapi_onevpl_infer_single_roi)
   if(HAVE_D3D11 AND HAVE_OPENCL)
     ocv_target_include_directories(example_gapi_onevpl_infer_single_roi SYSTEM PRIVATE ${OPENCL_INCLUDE_DIRS})
   endif()
-  if(PKG_LIBVA_FOUND)
-    ocv_target_link_libraries(example_gapi_onevpl_infer_single_roi PRIVATE ${PKG_LIBVA_LIBRARIES} ${PKG_THREAD_LIBRARIES})
+  if(WITH_VA AND UNIX)
+    message ("GAPI VPL samples with VAAPI")
+    ocv_target_include_directories(example_gapi_onevpl_infer_single_roi SYSTEM PRIVATE ${VA_INCLUDE_DIR})
+    ocv_target_link_libraries(example_gapi_onevpl_infer_single_roi PRIVATE ${VA_LIBRARIES})
   endif()
 endif()
 
diff --git a/modules/gapi/samples/onevpl_infer_single_roi.cpp b/modules/gapi/samples/onevpl_infer_single_roi.cpp
index ed8c74ea8c..15f3813787 100644
--- a/modules/gapi/samples/onevpl_infer_single_roi.cpp
+++ b/modules/gapi/samples/onevpl_infer_single_roi.cpp
@@ -102,7 +102,6 @@ cv::util::optional<cv::Rect> parse_roi(const std::string &rc) {
     return cv::util::make_optional(std::move(rv));
 }
 
-#ifdef HAVE_INF_ENGINE
 #ifdef HAVE_DIRECTX
 #ifdef HAVE_D3D11
 
@@ -152,7 +151,6 @@ AccelParamsType create_device_with_ctx(IDXGIAdapter* adapter) {
 }
 #endif // HAVE_D3D11
 #endif // HAVE_DIRECTX
-#endif // HAVE_INF_ENGINE
 } // anonymous namespace
 
 namespace custom {
diff --git a/modules/gapi/src/streaming/onevpl/device_selector_interface.cpp b/modules/gapi/src/streaming/onevpl/device_selector_interface.cpp
index 0b374177da..ecc02cd830 100644
--- a/modules/gapi/src/streaming/onevpl/device_selector_interface.cpp
+++ b/modules/gapi/src/streaming/onevpl/device_selector_interface.cpp
@@ -121,7 +121,7 @@ Context create_dx11_context(Context::Ptr ctx_ptr) {
 
 Device create_vaapi_device(Device::Ptr device_ptr,
                            const std::string& device_name,
-                           int file_description) {
+                           int /*file_description*/) {
     return detail::DeviceContextCreator::create_entity<Device>(device_ptr,
                                                                device_name,
                                                                AccelType::VAAPI);

From db706769333a4cd5f4a9d5176dcf790256aefc23 Mon Sep 17 00:00:00 2001
From: Rong Mantle Bao <baorong2005@126.com>
Date: Sat, 9 Jul 2022 07:20:44 +0800
Subject: [PATCH 114/178] Use cross-platform std::chrono in getTickCount()

Add conditional compilation directives to enable uses of std::chrono on supported compilers. Use std::chrono::steady_clock as a source to retrieve current tick count and clock frequency.

Fixes opencv/opencv#6902.
---
 modules/core/src/system.cpp | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index adb957908d..dda62fb89e 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -302,6 +302,16 @@ DECLARE_CV_CPUID_X86
 #endif
 
 
+#if ((__cplusplus >= 201103L) || (defined(_MSC_VER) && _MSC_VER >= 1800))
+  #define HAVE_CXX11 1
+#else
+  #define HAVE_CXX11 0
+#endif
+
+#if HAVE_CXX11
+  #include <chrono>
+#endif
+
 namespace cv
 {
 
@@ -840,7 +850,10 @@ bool useOptimized(void)
 
 int64 getTickCount(void)
 {
-#if defined _WIN32 || defined WINCE
+#if HAVE_CXX11
+    std::chrono::steady_clock::time_point now = std::chrono::steady_clock::now();
+    return (int64)now.time_since_epoch().count();
+#elif defined _WIN32 || defined WINCE
     LARGE_INTEGER counter;
     QueryPerformanceCounter( &counter );
     return (int64)counter.QuadPart;
@@ -859,7 +872,11 @@ int64 getTickCount(void)
 
 double getTickFrequency(void)
 {
-#if defined _WIN32 || defined WINCE
+#if HAVE_CXX11
+    using clock_period_t = std::chrono::steady_clock::duration::period;
+    double clock_freq = clock_period_t::den / clock_period_t::num;
+    return clock_freq;
+#elif defined _WIN32 || defined WINCE
     LARGE_INTEGER freq;
     QueryPerformanceFrequency(&freq);
     return (double)freq.QuadPart;

From 7d3dbcb0273ae1c89b585506031152aa0cf011be Mon Sep 17 00:00:00 2001
From: Michael Klatis <michael.klatis@autodesk.com>
Date: Sat, 9 Jul 2022 21:45:58 -0700
Subject: [PATCH 115/178] dynamically linked libs on mobile

---
 platforms/android/build_sdk.py   | 5 +++++
 platforms/ios/build_framework.py | 7 +++++++
 2 files changed, 12 insertions(+)

diff --git a/platforms/android/build_sdk.py b/platforms/android/build_sdk.py
index 88cb5ff87f..f445be10f6 100755
--- a/platforms/android/build_sdk.py
+++ b/platforms/android/build_sdk.py
@@ -160,6 +160,7 @@ class Builder:
         self.no_samples_build = True if config.no_samples_build else False
         self.opencl = True if config.opencl else False
         self.no_kotlin = True if config.no_kotlin else False
+        self.shared = True if config.shared else False
 
     def get_cmake(self):
         if not self.config.use_android_buildtools and check_executable(['cmake', '--version']):
@@ -245,6 +246,9 @@ class Builder:
         if self.no_kotlin:
             cmake_vars['BUILD_KOTLIN_EXTENSIONS'] = "OFF"
 
+        if self.shared:
+            cmake_vars['BUILD_SHARED_LIBS'] = "ON"
+
         if self.config.modules_list is not None:
             cmd.append("-DBUILD_LIST='%s'" % self.config.modules_list)
 
@@ -365,6 +369,7 @@ if __name__ == "__main__":
     parser.add_argument('--no_samples_build', action="store_true", help="Do not build samples (speeds up build)")
     parser.add_argument('--opencl', action="store_true", help="Enable OpenCL support")
     parser.add_argument('--no_kotlin', action="store_true", help="Disable Kotlin extensions")
+    parser.add_argument('--shared', action="store_true", help="Build shared libraries")
     args = parser.parse_args()
 
     log.basicConfig(format='%(message)s', level=log.DEBUG)
diff --git a/platforms/ios/build_framework.py b/platforms/ios/build_framework.py
index 08d9fd63dc..12e7fddd98 100755
--- a/platforms/ios/build_framework.py
+++ b/platforms/ios/build_framework.py
@@ -379,6 +379,13 @@ class Builder:
                 "-framework", "CoreImage", "-framework", "CoreMedia", "-framework", "QuartzCore",
                 "-framework", "Accelerate", "-framework", "OpenCL",
             ]
+        elif target_platform == "iphoneos" or target_platform == "iphonesimulator":
+            framework_options = [
+                "-iframework", "%s/System/iOSSupport/System/Library/Frameworks" % sdk_dir,
+                "-framework", "AVFoundation", "-framework", "CoreGraphics",
+                "-framework", "CoreImage", "-framework", "CoreMedia", "-framework", "QuartzCore",
+                "-framework", "Accelerate", "-framework", "UIKit", "-framework", "CoreVideo",
+            ]
         execute([
             "clang++",
             "-Xlinker", "-rpath",

From fa613e393f96ef7de246a418761af72f36a0c390 Mon Sep 17 00:00:00 2001
From: Rong Mantle Bao <baorong2005@126.com>
Date: Sun, 10 Jul 2022 17:34:07 +0800
Subject: [PATCH 116/178] Read CV_CXX11 for C++11 detection

---
 modules/core/src/system.cpp | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index dda62fb89e..3b6877105d 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -301,14 +301,7 @@ DECLARE_CV_CPUID_X86
   #endif
 #endif
 
-
-#if ((__cplusplus >= 201103L) || (defined(_MSC_VER) && _MSC_VER >= 1800))
-  #define HAVE_CXX11 1
-#else
-  #define HAVE_CXX11 0
-#endif
-
-#if HAVE_CXX11
+#if defined CV_CXX11
   #include <chrono>
 #endif
 
@@ -850,7 +843,7 @@ bool useOptimized(void)
 
 int64 getTickCount(void)
 {
-#if HAVE_CXX11
+#if defined CV_CXX11
     std::chrono::steady_clock::time_point now = std::chrono::steady_clock::now();
     return (int64)now.time_since_epoch().count();
 #elif defined _WIN32 || defined WINCE
@@ -872,7 +865,7 @@ int64 getTickCount(void)
 
 double getTickFrequency(void)
 {
-#if HAVE_CXX11
+#if defined CV_CXX11
     using clock_period_t = std::chrono::steady_clock::duration::period;
     double clock_freq = clock_period_t::den / clock_period_t::num;
     return clock_freq;

From 3635b3dee70d5da125e19e277bd949d7ea455ff7 Mon Sep 17 00:00:00 2001
From: SergeyIvanov87 <sergius140608@rambler.ru>
Date: Tue, 12 Jul 2022 10:54:12 +0300
Subject: [PATCH 117/178] Improve UX: autofill cfg param with proper accel
 type, printout cfg params

---
 .../opencv2/gapi/streaming/onevpl/cfg_params.hpp  |  2 ++
 modules/gapi/samples/onevpl_infer_single_roi.cpp  |  7 +------
 .../onevpl/accelerators/accel_policy_va_api.cpp   |  1 +
 .../onevpl/cfg_param_device_selector.cpp          |  2 +-
 .../onevpl/cfg_param_device_selector.hpp          |  2 ++
 modules/gapi/src/streaming/onevpl/cfg_params.cpp  | 15 +++++++++++++++
 .../onevpl/engine/decode/decode_engine_legacy.cpp |  3 +++
 modules/gapi/src/streaming/onevpl/source.cpp      |  2 +-
 modules/gapi/src/streaming/onevpl/source_priv.cpp | 14 ++++++++++----
 9 files changed, 36 insertions(+), 12 deletions(-)

diff --git a/modules/gapi/include/opencv2/gapi/streaming/onevpl/cfg_params.hpp b/modules/gapi/include/opencv2/gapi/streaming/onevpl/cfg_params.hpp
index 25573afd4c..0db9a86e58 100644
--- a/modules/gapi/include/opencv2/gapi/streaming/onevpl/cfg_params.hpp
+++ b/modules/gapi/include/opencv2/gapi/streaming/onevpl/cfg_params.hpp
@@ -185,6 +185,8 @@ struct GAPI_EXPORTS CfgParam {
     const name_t& get_name() const;
     const value_t& get_value() const;
     bool is_major() const;
+    std::string to_string() const;
+
     bool operator==(const CfgParam& rhs) const;
     bool operator< (const CfgParam& rhs) const;
     bool operator!=(const CfgParam& rhs) const;
diff --git a/modules/gapi/samples/onevpl_infer_single_roi.cpp b/modules/gapi/samples/onevpl_infer_single_roi.cpp
index 15f3813787..8759295f6a 100644
--- a/modules/gapi/samples/onevpl_infer_single_roi.cpp
+++ b/modules/gapi/samples/onevpl_infer_single_roi.cpp
@@ -59,7 +59,7 @@ const std::string keys =
     "{ vpp_frames_pool_size         | 0                                         | OneVPL source applies this parameter as preallocated frames pool size for VPP preprocessing results}"
     "{ roi                          | -1,-1,-1,-1                               | Region of interest (ROI) to use for inference. Identified automatically when not set }"
     "{ source_device                | CPU                                       | choose device for decoding }"
-    "{ preproc_device               | CPU                                       | choose device for preprocessing }";
+    "{ preproc_device               |                                           | choose device for preprocessing }";
 
 
 namespace {
@@ -609,11 +609,6 @@ int main(int argc, char *argv[]) {
         if (is_gpu(source_device)) {
             std::cout << "enforce VPL Source deconding on device: " << source_device << std::endl;
             // use special 'Device' constructor for `onevpl::GSource`
-            // put accel type description for VPL source
-            source_cfgs.push_back(cfg::create_from_string(
-                                        "mfxImplDescription.AccelerationMode"
-                                        ":"
-                                        "MFX_ACCEL_MODE_VIA_D3D11"));
             cap = cv::gapi::wip::make_onevpl_src(file_path, source_cfgs,
                                                  gpu_accel_device.value(),
                                                  gpu_accel_ctx.value());
diff --git a/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_va_api.cpp b/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_va_api.cpp
index 82bada7b70..ca5f1de94c 100644
--- a/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_va_api.cpp
+++ b/modules/gapi/src/streaming/onevpl/accelerators/accel_policy_va_api.cpp
@@ -28,6 +28,7 @@ VPLVAAPIAccelerationPolicy::VPLVAAPIAccelerationPolicy(device_selector_ptr_t sel
     VPLAccelerationPolicy(selector),
     cpu_dispatcher(new VPLCPUAccelerationPolicy(selector)),
     va_handle() {
+    GAPI_LOG_INFO(nullptr, "created - TODO dispatchered on CPU acceleration");
 #if defined(HAVE_VA) || defined(HAVE_VA_INTEL)
     // setup VAAPI device
     IDeviceSelector::DeviceScoreTable devices = get_device_selector()->select_devices();
diff --git a/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.cpp b/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.cpp
index 28f01c5718..050c744447 100644
--- a/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.cpp
+++ b/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.cpp
@@ -65,7 +65,7 @@ private:
 struct Aux {};
 #endif
 
-static std::vector<CfgParam> insertCfgparam(std::vector<CfgParam> &&param_array, AccelType type) {
+std::vector<CfgParam> update_param_with_accel_type(std::vector<CfgParam> &&param_array, AccelType type) {
     switch (type) {
         case AccelType::HOST:
             break;
diff --git a/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.hpp b/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.hpp
index c09218c41e..14c3998f13 100644
--- a/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.hpp
+++ b/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.hpp
@@ -21,6 +21,8 @@ namespace wip {
 namespace onevpl {
 
 class Aux;
+std::vector<CfgParam> update_param_with_accel_type(std::vector<CfgParam> &&param_array, AccelType type);
+
 struct GAPI_EXPORTS CfgParamDeviceSelector final: public IDeviceSelector {
     CfgParamDeviceSelector(const CfgParams& params = {});
     CfgParamDeviceSelector(Device::Ptr device_ptr,
diff --git a/modules/gapi/src/streaming/onevpl/cfg_params.cpp b/modules/gapi/src/streaming/onevpl/cfg_params.cpp
index b13f9cadb1..155fef70e5 100644
--- a/modules/gapi/src/streaming/onevpl/cfg_params.cpp
+++ b/modules/gapi/src/streaming/onevpl/cfg_params.cpp
@@ -4,6 +4,7 @@
 //
 // Copyright (C) 2021 Intel Corporation
 
+#include <sstream>
 #include <opencv2/gapi/util/throw.hpp>
 
 #include <opencv2/gapi/streaming/onevpl/cfg_params.hpp>
@@ -25,6 +26,15 @@ struct variant_comparator : cv::util::static_visitor<bool, variant_comparator> {
 private:
     const CfgParam::value_t& rhs;
 };
+
+struct variant_stringifier : cv::util::static_visitor<std::string, variant_stringifier> {
+    template<typename ValueType>
+    std::string visit(const ValueType& lhs) const {
+        std::stringstream ss;
+        ss << lhs;
+        return ss.str();
+    }
+};
 } // namespace util
 
 struct CfgParam::Priv {
@@ -228,6 +238,11 @@ bool CfgParam::is_major() const {
     return m_priv->is_major_impl();
 }
 
+std::string CfgParam::to_string() const {
+    return get_name() + ":" + cv::util::visit(util::variant_stringifier{},
+                                              get_value());
+}
+
 bool CfgParam::operator< (const CfgParam& rhs) const {
     return *m_priv < *rhs.m_priv;
 }
diff --git a/modules/gapi/src/streaming/onevpl/engine/decode/decode_engine_legacy.cpp b/modules/gapi/src/streaming/onevpl/engine/decode/decode_engine_legacy.cpp
index 34db1bebfa..0ab8301799 100644
--- a/modules/gapi/src/streaming/onevpl/engine/decode/decode_engine_legacy.cpp
+++ b/modules/gapi/src/streaming/onevpl/engine/decode/decode_engine_legacy.cpp
@@ -186,6 +186,9 @@ VPLLegacyDecodeEngine::SessionParam VPLLegacyDecodeEngine::prepare_session_param
         mfxDecParams.IOPattern = MFX_IOPATTERN_OUT_VIDEO_MEMORY;
     } else if (accel_type == AccelType::HOST) {
         mfxDecParams.IOPattern = MFX_IOPATTERN_OUT_SYSTEM_MEMORY;
+    } else if (accel_type == AccelType::VAAPI) {
+        // TODO make proper direction
+        mfxDecParams.IOPattern = MFX_IOPATTERN_OUT_SYSTEM_MEMORY;
     } else {
         GAPI_Assert(false && "unsupported AccelType from device selector");
     }
diff --git a/modules/gapi/src/streaming/onevpl/source.cpp b/modules/gapi/src/streaming/onevpl/source.cpp
index 3bad463e41..efcc9bf850 100644
--- a/modules/gapi/src/streaming/onevpl/source.cpp
+++ b/modules/gapi/src/streaming/onevpl/source.cpp
@@ -36,7 +36,7 @@ GSource::GSource(const std::string& filePath,
 GSource::GSource(const std::string& filePath,
                  const CfgParams& cfg_params,
                  const Device &device, const Context &ctx) :
-    GSource(filePath, cfg_params,
+    GSource(filePath, update_param_with_accel_type(CfgParams{cfg_params}, device.get_type()),
             std::make_shared<CfgParamDeviceSelector>(device, ctx, cfg_params)) {
 }
 
diff --git a/modules/gapi/src/streaming/onevpl/source_priv.cpp b/modules/gapi/src/streaming/onevpl/source_priv.cpp
index f460a2a6ed..e8d26b41e2 100644
--- a/modules/gapi/src/streaming/onevpl/source_priv.cpp
+++ b/modules/gapi/src/streaming/onevpl/source_priv.cpp
@@ -94,12 +94,12 @@ GSource::Priv::Priv(std::shared_ptr<IDataProvider> provider,
          GAPI_Assert(cfg_inst && "MFXCreateConfig failed");
 
         if (!cfg_param_it->is_major()) {
-            GAPI_LOG_DEBUG(nullptr, "Skip not major param: " << cfg_param_it->get_name());
+            GAPI_LOG_DEBUG(nullptr, "Skip not major param: " << cfg_param_it->to_string());
             ++cfg_param_it;
             continue;
         }
 
-        GAPI_LOG_DEBUG(nullptr, "Apply major param: " << cfg_param_it->get_name());
+        GAPI_LOG_DEBUG(nullptr, "Apply major param: " << cfg_param_it->to_string());
         mfxVariant mfx_param = cfg_param_to_mfx_variant(*cfg_param_it);
         mfxStatus sts = MFXSetConfigFilterProperty(cfg_inst,
                                                    (mfxU8 *)cfg_param_it->get_name().c_str(),
@@ -189,8 +189,14 @@ GSource::Priv::Priv(std::shared_ptr<IDataProvider> provider,
 
     // Extract the most suitable VPL implementation by max score
     auto max_match_it = matches_count.rbegin();
-    GAPI_Assert(max_match_it != matches_count.rend() &&
-                "Cannot find matched MFX implementation for requested configuration");
+    if (max_match_it == matches_count.rend()) {
+        std::stringstream ss;
+        for (const auto &p : cfg_params) {
+            ss << p.to_string() << std::endl;
+        }
+        GAPI_LOG_WARNING(nullptr, "No one suitable MFX implementation is found, requested params:\n" << ss.str());
+        throw std::runtime_error("Cannot find any suitable MFX implementation for requested configuration");
+    }
 
     // TODO impl_number is global for now
     impl_number = max_match_it->second;

From 2b2ba534e2ba4ee7f9ef89f1cec9136061974b94 Mon Sep 17 00:00:00 2001
From: SergeyIvanov87 <sergius140608@rambler.ru>
Date: Tue, 12 Jul 2022 12:41:23 +0300
Subject: [PATCH 118/178] Forbid Rctx in IE for VAAPI temporary, Add preproc
 VAAPI

---
 .../gapi/samples/onevpl_infer_single_roi.cpp  |  6 ++-
 .../engine/preproc_engine_interface.cpp       | 51 ++++++++++++++-----
 2 files changed, 44 insertions(+), 13 deletions(-)

diff --git a/modules/gapi/samples/onevpl_infer_single_roi.cpp b/modules/gapi/samples/onevpl_infer_single_roi.cpp
index 8759295f6a..ff080aceec 100644
--- a/modules/gapi/samples/onevpl_infer_single_roi.cpp
+++ b/modules/gapi/samples/onevpl_infer_single_roi.cpp
@@ -300,9 +300,13 @@ support_matrix resolved_conf{{
                                         {"CPU", {{ "CPU", {/* unsupported: preproc mix */}},
                                                  { "GPU", {/* unsupported: preproc mix */}}
                                                 }},
-
+#if defined(HAVE_DIRECTX) && defined(HAVE_D3D11)
                                         {"GPU", {{ "CPU", std::make_shared<flow>(true, false)},
                                                  { "GPU", std::make_shared<flow>(true, true)}}}
+#else   // TODO VAAPI under linux doesn't support GPU IE remote context
+                                        {"GPU", {{ "CPU", std::make_shared<flow>(true, false)},
+                                                 { "GPU", std::make_shared<flow>(true, false)}}}
+#endif
                                     }}
                             },
                             {"CPU", {{
diff --git a/modules/gapi/src/streaming/onevpl/engine/preproc_engine_interface.cpp b/modules/gapi/src/streaming/onevpl/engine/preproc_engine_interface.cpp
index ff9f103b5a..b5787320d6 100644
--- a/modules/gapi/src/streaming/onevpl/engine/preproc_engine_interface.cpp
+++ b/modules/gapi/src/streaming/onevpl/engine/preproc_engine_interface.cpp
@@ -14,6 +14,7 @@
 
 #include "streaming/onevpl/accelerators/accel_policy_dx11.hpp"
 #include "streaming/onevpl/accelerators/accel_policy_cpu.hpp"
+#include "streaming/onevpl/accelerators/accel_policy_va_api.hpp"
 #include "streaming/onevpl/accelerators/surface/surface.hpp"
 #include "streaming/onevpl/cfg_param_device_selector.hpp"
 #include "streaming/onevpl/cfg_params_parser.hpp"
@@ -41,30 +42,56 @@ IPreprocEngine::create_preproc_engine_impl(const onevpl::Device &device,
     cv::util::suppress_unused_warning(context);
     std::unique_ptr<VPPPreprocDispatcher> dispatcher(new VPPPreprocDispatcher);
 #ifdef HAVE_ONEVPL
-    if (device.get_type() == onevpl::AccelType::DX11) {
-        bool gpu_pp_is_created = false;
+    bool pp_is_created = false;
+    switch (device.get_type()) {
+        case onevpl::AccelType::DX11: {
+            GAPI_LOG_INFO(nullptr, "Creating DX11 VPP preprocessing engine");
 #ifdef HAVE_DIRECTX
 #ifdef HAVE_D3D11
-        GAPI_LOG_INFO(nullptr, "Creating DX11 VPP preprocessing engine");
-        // create GPU VPP preproc engine
-        dispatcher->insert_worker<VPPPreprocEngine>(
+            // create GPU VPP preproc engine
+            dispatcher->insert_worker<VPPPreprocEngine>(
                                 std::unique_ptr<VPLAccelerationPolicy>{
                                         new VPLDX11AccelerationPolicy(
                                             std::make_shared<CfgParamDeviceSelector>(
                                                     device, context, CfgParams{}))
                                 });
-        GAPI_LOG_INFO(nullptr, "DX11 VPP preprocessing engine created");
-        gpu_pp_is_created = true;
+            GAPI_LOG_INFO(nullptr, "DX11 VPP preprocessing engine created");
+            pp_is_created = true;
 #endif
 #endif
-        GAPI_Assert(gpu_pp_is_created && "VPP preproc for GPU is requested, but it is avaiable only for DX11 at now");
-    } else {
-        GAPI_LOG_INFO(nullptr, "Creating CPU VPP preprocessing engine");
-        dispatcher->insert_worker<VPPPreprocEngine>(
+            break;
+        }
+        case onevpl::AccelType::VAAPI: {
+            GAPI_LOG_INFO(nullptr, "Creating VAAPI VPP preprocessing engine");
+#ifdef __linux__
+#if defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+            // create GPU VPP preproc engine
+            dispatcher->insert_worker<VPPPreprocEngine>(
+                                std::unique_ptr<VPLAccelerationPolicy>{
+                                        new VPLVAAPIAccelerationPolicy(
+                                            std::make_shared<CfgParamDeviceSelector>(
+                                                    device, context, CfgParams{}))
+                                });
+            GAPI_LOG_INFO(nullptr, "VAAPI VPP preprocessing engine created");
+            pp_is_created = true;
+#endif // defined(HAVE_VA) || defined(HAVE_VA_INTEL)
+#endif // #ifdef __linux__
+            break;
+        }
+        default: {
+            GAPI_LOG_INFO(nullptr, "Creating CPU VPP preprocessing engine");
+            dispatcher->insert_worker<VPPPreprocEngine>(
                         std::unique_ptr<VPLAccelerationPolicy>{
                                 new VPLCPUAccelerationPolicy(
                                     std::make_shared<CfgParamDeviceSelector>(CfgParams{}))});
-        GAPI_LOG_INFO(nullptr, "CPU VPP preprocessing engine created");
+            GAPI_LOG_INFO(nullptr, "CPU VPP preprocessing engine created");
+            pp_is_created = true;
+            break;
+        }
+    }
+    if (!pp_is_created) {
+        GAPI_LOG_WARNING(nullptr, "Cannot create VPP preprocessing engine: configuration unsupported");
+        GAPI_Assert(false && "VPP preproc unsupported");
     }
 #endif // HAVE_ONEVPL
     return dispatcher;

From ff31f90b7e6451ecdb69fd8e9aa332ada3286ba8 Mon Sep 17 00:00:00 2001
From: SergeyIvanov87 <sergius140608@rambler.ru>
Date: Tue, 12 Jul 2022 13:08:55 +0300
Subject: [PATCH 119/178] Fix sample WIN compilation

---
 modules/gapi/samples/onevpl_infer_single_roi.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/modules/gapi/samples/onevpl_infer_single_roi.cpp b/modules/gapi/samples/onevpl_infer_single_roi.cpp
index ff080aceec..0b330b383d 100644
--- a/modules/gapi/samples/onevpl_infer_single_roi.cpp
+++ b/modules/gapi/samples/onevpl_infer_single_roi.cpp
@@ -16,6 +16,7 @@
 
 #ifdef HAVE_INF_ENGINE
 #include <inference_engine.hpp> // ParamMap
+#endif // HAVE_INF_ENGINE
 
 #ifdef HAVE_DIRECTX
 #ifdef HAVE_D3D11
@@ -29,10 +30,8 @@
 #pragma comment(lib, "dxgi")
 #undef NOMINMAX
 #undef D3D11_NO_HELPERS
-
 #endif // HAVE_D3D11
 #endif // HAVE_DIRECTX
-#endif // HAVE_INF_ENGINE
 
 #ifdef __linux__
 #if defined(HAVE_VA) || defined(HAVE_VA_INTEL)

From 87ef6a9cc13749a3fba918350ffd97ce06753d80 Mon Sep 17 00:00:00 2001
From: Tomoaki Teshima <tomoaki.teshima@gmail.com>
Date: Tue, 12 Jul 2022 19:42:39 +0900
Subject: [PATCH 120/178] build: suppress the warning DRY bump the version

---
 3rdparty/carotene/CMakeLists.txt          | 2 +-
 3rdparty/carotene/hal/CMakeLists.txt      | 2 +-
 cmake/OpenCVDetectHalide.cmake            | 2 +-
 modules/python/CMakeLists.txt             | 2 +-
 platforms/android/android.toolchain.cmake | 2 +-
 platforms/linux/gnu.toolchain.cmake       | 2 +-
 samples/CMakeLists.txt                    | 2 +-
 samples/cpp/example_cmake/CMakeLists.txt  | 2 +-
 samples/hal/c_hal/CMakeLists.txt          | 2 +-
 samples/hal/slow_hal/CMakeLists.txt       | 2 +-
 samples/openvx/CMakeLists.txt             | 2 +-
 11 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/3rdparty/carotene/CMakeLists.txt b/3rdparty/carotene/CMakeLists.txt
index bd26a2d7ef..3d49a2def6 100644
--- a/3rdparty/carotene/CMakeLists.txt
+++ b/3rdparty/carotene/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8.11 FATAL_ERROR)
+cmake_minimum_required(VERSION ${MIN_VER_CMAKE} FATAL_ERROR)
 
 project(Carotene)
 
diff --git a/3rdparty/carotene/hal/CMakeLists.txt b/3rdparty/carotene/hal/CMakeLists.txt
index 0162aae101..6f209981a6 100644
--- a/3rdparty/carotene/hal/CMakeLists.txt
+++ b/3rdparty/carotene/hal/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8.8 FATAL_ERROR)
+cmake_minimum_required(VERSION ${MIN_VER_CMAKE} FATAL_ERROR)
 
 include(CheckCCompilerFlag)
 include(CheckCXXCompilerFlag)
diff --git a/cmake/OpenCVDetectHalide.cmake b/cmake/OpenCVDetectHalide.cmake
index 4828c299ae..dc484a7d3d 100644
--- a/cmake/OpenCVDetectHalide.cmake
+++ b/cmake/OpenCVDetectHalide.cmake
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION ${MIN_VER_CMAKE})
 
 if(" ${HALIDE_ROOT_DIR}" STREQUAL " ")
   unset(HALIDE_ROOT_DIR CACHE)
diff --git a/modules/python/CMakeLists.txt b/modules/python/CMakeLists.txt
index a51acf386e..990156074a 100644
--- a/modules/python/CMakeLists.txt
+++ b/modules/python/CMakeLists.txt
@@ -40,7 +40,7 @@ add_subdirectory(python3)
 
 else()  # standalone build
 
-cmake_minimum_required(VERSION 2.8.12)
+cmake_minimum_required(VERSION 2.8.12.2)
 project(OpenCVPython CXX C)
 include("./standalone.cmake")
 
diff --git a/platforms/android/android.toolchain.cmake b/platforms/android/android.toolchain.cmake
index 50b342c7a6..1dca060fdf 100644
--- a/platforms/android/android.toolchain.cmake
+++ b/platforms/android/android.toolchain.cmake
@@ -189,7 +189,7 @@
 #
 # ------------------------------------------------------------------------------
 
-cmake_minimum_required( VERSION 2.6.3 )
+cmake_minimum_required( VERSION 2.8.12.2 )
 
 if( DEFINED CMAKE_CROSSCOMPILING )
  # subsequent toolchain loading is not really needed
diff --git a/platforms/linux/gnu.toolchain.cmake b/platforms/linux/gnu.toolchain.cmake
index cba08e7fbb..d8f6c571ca 100644
--- a/platforms/linux/gnu.toolchain.cmake
+++ b/platforms/linux/gnu.toolchain.cmake
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 2.8.12.2)
 
 # load settings in case of "try compile"
 set(TOOLCHAIN_CONFIG_FILE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/toolchain.config.cmake")
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index 68afc487a2..bc8d5097ef 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -56,7 +56,7 @@ else()
 #  Standalone mode
 #
 #===================================================================================================
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 2.8.12.2)
 
 project(samples C CXX)
 option(BUILD_EXAMPLES "Build samples" ON)
diff --git a/samples/cpp/example_cmake/CMakeLists.txt b/samples/cpp/example_cmake/CMakeLists.txt
index c669660cef..fb8000ae24 100644
--- a/samples/cpp/example_cmake/CMakeLists.txt
+++ b/samples/cpp/example_cmake/CMakeLists.txt
@@ -1,5 +1,5 @@
 # cmake needs this line
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 2.8.12.2)
 
 # Define project name
 project(opencv_example_project)
diff --git a/samples/hal/c_hal/CMakeLists.txt b/samples/hal/c_hal/CMakeLists.txt
index 8502779e84..8cf78aa5ff 100644
--- a/samples/hal/c_hal/CMakeLists.txt
+++ b/samples/hal/c_hal/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8.8 FATAL_ERROR)
+cmake_minimum_required(VERSION 2.8.12.2 FATAL_ERROR)
 
 set(PROJECT_NAME "c_hal")
 set(HAL_LIB_NAME "c_hal")
diff --git a/samples/hal/slow_hal/CMakeLists.txt b/samples/hal/slow_hal/CMakeLists.txt
index d42fb0b6fd..1ffa4670b6 100644
--- a/samples/hal/slow_hal/CMakeLists.txt
+++ b/samples/hal/slow_hal/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8.8 FATAL_ERROR)
+cmake_minimum_required(VERSION 2.8.12.2 FATAL_ERROR)
 
 set(PROJECT_NAME "slow_hal")
 set(HAL_LIB_NAME "slow_hal")
diff --git a/samples/openvx/CMakeLists.txt b/samples/openvx/CMakeLists.txt
index 701184d369..fd04e6b9e2 100644
--- a/samples/openvx/CMakeLists.txt
+++ b/samples/openvx/CMakeLists.txt
@@ -1,6 +1,6 @@
 ocv_install_example_src(cpp *.cpp *.hpp CMakeLists.txt)
 
-cmake_minimum_required(VERSION 2.8.9)
+cmake_minimum_required(VERSION 2.8.12.2)
 
 set(OPENCV_OPENVX_SAMPLE_REQUIRED_DEPS
   opencv_core

From 3135063100bba50fe0fff3c83a732e77eeb62ba6 Mon Sep 17 00:00:00 2001
From: Rong Mantle Bao <baorong2005@126.com>
Date: Mon, 11 Jul 2022 18:36:36 +0800
Subject: [PATCH 121/178] Make objdetect/test C++11-compliant and reproducible

- Add conditional compilation directives to replace deprecated std::random_shuffle with new std::shuffle when C++11 is available.

- Set random seed to a fixed value before shuffling containers to ensure reproducibility.

Resolves opencv/opencv#22209.
---
 modules/objdetect/test/test_precomp.hpp       |  6 ++++++
 modules/objdetect/test/test_qrcode_encode.cpp | 21 +++++++++++++++++--
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/modules/objdetect/test/test_precomp.hpp b/modules/objdetect/test/test_precomp.hpp
index 4131b43995..14f8b0ede2 100644
--- a/modules/objdetect/test/test_precomp.hpp
+++ b/modules/objdetect/test/test_precomp.hpp
@@ -8,4 +8,10 @@
 #include "opencv2/objdetect.hpp"
 #include "opencv2/objdetect/objdetect_c.h"
 
+#if defined CV_CXX11
+  #include <random>
+#else
+  #include <cstdlib>
+#endif
+
 #endif
diff --git a/modules/objdetect/test/test_qrcode_encode.cpp b/modules/objdetect/test/test_qrcode_encode.cpp
index 351e6abc83..9212f338a0 100644
--- a/modules/objdetect/test/test_qrcode_encode.cpp
+++ b/modules/objdetect/test/test_qrcode_encode.cpp
@@ -5,6 +5,16 @@
 #include "test_precomp.hpp"
 namespace opencv_test { namespace {
 
+#if !defined CV_CXX11
+// Wrapper for generating seeded random number via std::rand.
+template<unsigned Seed>
+class SeededRandFunctor {
+public:
+    SeededRandFunctor() { std::srand(Seed); }
+    int operator()(int i) { return std::rand() % (i + 1); }
+};
+#endif
+
 std::string encode_qrcode_images_name[] = {
         "version1_mode1.png", "version1_mode2.png", "version1_mode4.png",
         "version2_mode1.png", "version2_mode2.png", "version2_mode4.png",
@@ -381,8 +391,15 @@ TEST(Objdetect_QRCode_Encode_Decode_Structured_Append, DISABLED_regression)
         std::string symbol_set = config["symbols_set"];
 
         std::string input_info = symbol_set;
-        std::random_shuffle(input_info.begin(), input_info.end());
-
+#if defined CV_CXX11
+        // std::random_shuffle is deprecated since C++11 and removed in C++17.
+        // Use manually constructed RNG with a fixed seed and std::shuffle instead.
+        std::mt19937 rand_gen {1};
+        std::shuffle(input_info.begin(), input_info.end(), rand_gen);
+#else
+        SeededRandFunctor<1> rand_gen;
+        std::random_shuffle(input_info.begin(), input_info.end(), rand_gen);
+#endif
         for (int j = min_stuctures_num; j < max_stuctures_num; j++)
         {
             QRCodeEncoder::Params params;

From ed4bf13960e68c48ced45f53f3cd3c1b41f67b9d Mon Sep 17 00:00:00 2001
From: Vincent Rabaud <vrabaud@google.com>
Date: Tue, 12 Jul 2022 16:24:44 +0200
Subject: [PATCH 122/178] Fix cleanFoundConnectedQuads.

---
 modules/calib3d/src/calibinit.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/calib3d/src/calibinit.cpp b/modules/calib3d/src/calibinit.cpp
index 1dff8de6e6..309f7bbe58 100644
--- a/modules/calib3d/src/calibinit.cpp
+++ b/modules/calib3d/src/calibinit.cpp
@@ -1256,7 +1256,7 @@ int ChessBoardDetector::cleanFoundConnectedQuads(std::vector<ChessBoardQuad*>& q
             Mat points(1, quad_count, CV_32FC2, &centers[0]);
             cv::convexHull(points, hull, true);
             centers[skip] = temp;
-            double hull_area = contourArea(hull, true);
+            double hull_area = contourArea(hull, false);
 
             // remember smallest box area
             if (hull_area < min_box_area)

From 45fbb67abac39631f4447e4f3958998cd200c366 Mon Sep 17 00:00:00 2001
From: Zihao Mu <zihaomu@outlook.com>
Date: Wed, 13 Jul 2022 11:25:27 +0800
Subject: [PATCH 123/178] fix scale layer can not handle 1x1 weight correctly.

---
 modules/dnn/src/layers/scale_layer.cpp  | 10 +++++++
 modules/dnn/src/onnx/onnx_importer.cpp  | 37 +++++++++++++++----------
 modules/dnn/test/test_onnx_importer.cpp |  2 ++
 3 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/modules/dnn/src/layers/scale_layer.cpp b/modules/dnn/src/layers/scale_layer.cpp
index 594b0bb624..406d27b46a 100644
--- a/modules/dnn/src/layers/scale_layer.cpp
+++ b/modules/dnn/src/layers/scale_layer.cpp
@@ -91,6 +91,16 @@ public:
         if (hasWeights && hasBias)
             CV_CheckEQ(weights.total(), bias.total(), "Incompatible weights/bias blobs");
 
+        if (weights.total() == 1)
+        {
+            // The total() of bias should be same as weights.
+            if (hasBias)
+                inpBlob.convertTo(outBlob, CV_32F, weights.at<float>(0), bias.at<float>(0));
+            else
+                inpBlob.convertTo(outBlob, CV_32F, weights.at<float>(0));
+            return;
+        }
+
         int endAxis;
         for (endAxis = axis + 1; endAxis <= inpBlob.dims; ++endAxis)
         {
diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index 7a0532fcf4..8221855e33 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -1818,6 +1818,8 @@ void ONNXImporter::parseMatMul(LayerParams& layerParams, const opencv_onnx::Node
 
 void findBroadAxis(const MatShape& broadShape, const MatShape& outShape, size_t& axis, int& broadAxis)
 {
+    // Currently, this function can only complete 1-dimensional expansion of broadShape.
+    // If there are two dimensions in broadShape that need to be expended, it will fail.
     const size_t diff = outShape.size() - broadShape.size();
 
     // find the first non-one element of the broadcasting shape
@@ -1982,25 +1984,30 @@ void ONNXImporter::parseMul(LayerParams& layerParams, const opencv_onnx::NodePro
         const MatShape& outShape = outShapes[node_proto.input(0)];
 
         size_t axis = 0;
-        int broadAxis = -1;
-        findBroadAxis(broadShape, outShape, axis, broadAxis);
-
-        // if there is a one dimension in the middle that should be broadcasted, broadcast it
-        if (broadAxis != -1)
+        if (total(broadShape) != 1)
         {
-            opencv_onnx::NodeProto concat_node_proto = node_proto;
-            const std::string& input1 = concat_node_proto.input(1);
+            // If broadShape is a scalar, we set axis as 0.
+            // Other-wise, we check broadcast is available.
+            int broadAxis = -1;
+            findBroadAxis(broadShape, outShape, axis, broadAxis);
+
+            // if there is a one dimension in the middle that should be broadcasted, broadcast it
+            if (broadAxis != -1)
+            {
+                opencv_onnx::NodeProto concat_node_proto = node_proto;
+                const std::string& input1 = concat_node_proto.input(1);
 
-            expandMid(layerParams.name, concat_node_proto, input1, outShape[broadAxis]);
+                expandMid(layerParams.name, concat_node_proto, input1, outShape[broadAxis]);
 
-            LayerParams concatLP;
-            concatLP.name = layerParams.name + "/concat";
-            concatLP.set("axis", broadAxis);
-            concatLP.type = "Concat";
-            concat_node_proto.set_output(0, concatLP.name);
+                LayerParams concatLP;
+                concatLP.name = layerParams.name + "/concat";
+                concatLP.set("axis", broadAxis);
+                concatLP.type = "Concat";
+                concat_node_proto.set_output(0, concatLP.name);
 
-            addLayer(concatLP, concat_node_proto);
-            node_proto.set_input(1, concatLP.name);
+                addLayer(concatLP, concat_node_proto);
+                node_proto.set_input(1, concatLP.name);
+            }
         }
 
         CV_Assert(axis != outShape.size());
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index 56203cba56..3d1dd3858a 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -725,6 +725,8 @@ TEST_P(Test_ONNX_layers, Div)
 
     normAssert(ref, out, "", default_l1,  default_lInf);
     expectNoFallbacksFromIE(net);
+
+    testONNXModels("div_test_1x1",npy, 0, 0, false, true, 2);
 }
 
 TEST_P(Test_ONNX_layers, DynamicReshape)

From 1b8fba8e260ef6251c0168c23052773b8e9d1466 Mon Sep 17 00:00:00 2001
From: Zihao Mu <zihaomu@outlook.com>
Date: Wed, 13 Jul 2022 13:46:16 +0800
Subject: [PATCH 124/178] support ReduceSum with two input and dynamic shape
 batch size in ReduceLayer.

---
 modules/dnn/src/onnx/onnx_importer.cpp  | 54 +++++++++++++++++--------
 modules/dnn/test/test_onnx_importer.cpp |  2 +
 2 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index ebbda98d51..7390d0307a 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -1180,32 +1180,43 @@ void ONNXImporter::parseReduce(LayerParams& layerParams, const opencv_onnx::Node
     layerParams.set("reduce", reduceType);
     bool keepdims = layerParams.get<int>("keepdims", 1) == 1;
 
-    if (layer_type == "ReduceSum" && node_proto.input_size() == 2)
-    {
-        // TODO support the opset 13 of ReduceSum.
-        //  in opset 13, the ReduceSum has two input, it takes axes as input instead of attribute
-        //  details:https://github.com/onnx/onnx/issues/3420#issuecomment-844295687
-        CV_Error(Error::StsNotImplemented, "Unsupported " + layer_type + " operation of opset 13, please try to "
-                                                                         "re-export the onnx model with opset 11.");
-    }
-
     MatShape inpShape = outShapes[node_proto.input(0)];
     std::vector<bool> shouldDelete(inpShape.size(), false);
 
-    if (layerParams.has("axes"))
+    if (layer_type == "ReduceSum" && node_proto.input_size() == 2)
     {
-        DictValue axes = layerParams.get("axes");
-        for (int i = 0; i < axes.size(); i++)
+        if (constBlobs.find(node_proto.input(1)) != constBlobs.end())
         {
-            int axis = normalize_axis(axes.get<int>(i), inpShape.size());
-            shouldDelete[axis] = true;
+            Mat axesMat = getBlob(node_proto, 1);
+            int axesNum = axesMat.total();
+            for (int i = 0; i < axesNum; i++)
+            {
+                int axis = normalize_axis(static_cast<int>(axesMat.at<float>(i)), inpShape.size());
+                shouldDelete[axis] = true;
+            }
         }
+        else
+            //  in opset 13, the ReduceSum has two input, it takes axes as input instead of attribute
+            //  details:https://github.com/onnx/onnx/issues/3420#issuecomment-844295687
+            CV_Error(Error::StsNotImplemented, "Non-constant axis values in ReduceSum are not supported.");
     }
     else
     {
-        for (int i = 0; i < inpShape.size(); i++)
+        if (layerParams.has("axes"))
         {
-            shouldDelete[i] = true;
+            DictValue axes = layerParams.get("axes");
+            for (int i = 0; i < axes.size(); i++)
+            {
+                int axis = normalize_axis(axes.get<int>(i), inpShape.size());
+                shouldDelete[axis] = true;
+            }
+        }
+        else
+        {
+            for (int i = 0; i < inpShape.size(); i++)
+            {
+                shouldDelete[i] = true;
+            }
         }
     }
 
@@ -1291,6 +1302,17 @@ void ONNXImporter::parseReduce(LayerParams& layerParams, const opencv_onnx::Node
     layerParams.type = (depth == CV_8S) ? "ReshapeInt8" : "Reshape";
     layerParams.set("dim", DictValue::arrayInt(&targetShape[0], targetShape.size()));
 
+    // Set batchsize dim as dynamic to be compatible with batch size >= 2.
+    if (targetShape[0] == 1 && targetShape.size() > 1)
+    {
+        std::vector<int> dynamicAxes = {0};  // The index of batchsize dim is 0.
+        std::vector<int> inputIndices = {0};
+
+        layerParams.set("has_dynamic_shapes", true);
+        layerParams.set("dynamic_axes", DictValue::arrayInt(dynamicAxes.data(), dynamicAxes.size()));
+        layerParams.set("input_indices", DictValue::arrayInt(inputIndices.data(), inputIndices.size()));
+    }
+
     node_proto.set_input(0, node_proto.output(0));
     node_proto.set_output(0, output_name);
 
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index 578e0442b2..5f94f9884d 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -411,6 +411,8 @@ TEST_P(Test_ONNX_layers, ReduceMean)
 TEST_P(Test_ONNX_layers, ReduceSum)
 {
     testONNXModels("reduce_sum");
+    testONNXModels("reduce_sum_axis");
+    testONNXModels("reduce_sum_axis_dynamic_batch");
 }
 
 TEST_P(Test_ONNX_layers, ReduceMax)

From 1dee848d3ef5045e1e7fc919e31fcb0efc5b0159 Mon Sep 17 00:00:00 2001
From: SergeyIvanov87 <sergius140608@rambler.ru>
Date: Wed, 13 Jul 2022 18:07:41 +0300
Subject: [PATCH 125/178] Remove clnn header from sample

---
 modules/gapi/samples/onevpl_infer_single_roi.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modules/gapi/samples/onevpl_infer_single_roi.cpp b/modules/gapi/samples/onevpl_infer_single_roi.cpp
index 0b330b383d..80b1499f30 100644
--- a/modules/gapi/samples/onevpl_infer_single_roi.cpp
+++ b/modules/gapi/samples/onevpl_infer_single_roi.cpp
@@ -25,7 +25,6 @@
 // get rid of generate macro max/min/etc from DX side
 #define D3D11_NO_HELPERS
 #define NOMINMAX
-#include <cldnn/cldnn_config.hpp>
 #include <d3d11.h>
 #pragma comment(lib, "dxgi")
 #undef NOMINMAX

From 9ffb67478fd42319b3dd42c43026daaf6587a0e2 Mon Sep 17 00:00:00 2001
From: SergeyIvanov87 <sergius140608@rambler.ru>
Date: Wed, 13 Jul 2022 18:49:48 +0300
Subject: [PATCH 126/178] Address several comments

---
 .../opencv2/gapi/streaming/onevpl/accel_types.hpp  |  3 +--
 modules/gapi/samples/onevpl_infer_single_roi.cpp   |  2 +-
 .../streaming/onevpl/cfg_param_device_selector.cpp | 14 +++++++-------
 .../streaming/onevpl/cfg_param_device_selector.hpp |  4 ++--
 .../streaming/onevpl/device_selector_interface.cpp |  3 +--
 5 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/modules/gapi/include/opencv2/gapi/streaming/onevpl/accel_types.hpp b/modules/gapi/include/opencv2/gapi/streaming/onevpl/accel_types.hpp
index 119188d96a..b670aebd1d 100644
--- a/modules/gapi/include/opencv2/gapi/streaming/onevpl/accel_types.hpp
+++ b/modules/gapi/include/opencv2/gapi/streaming/onevpl/accel_types.hpp
@@ -66,8 +66,7 @@ GAPI_EXPORTS Device create_dx11_device(Device::Ptr device_ptr,
 GAPI_EXPORTS Context create_dx11_context(Context::Ptr ctx_ptr);
 
 GAPI_EXPORTS Device create_vaapi_device(Device::Ptr device_ptr,
-                                       const std::string& device_name,
-                                       int file_description = -1);
+                                        const std::string& device_name);
 GAPI_EXPORTS Context create_vaapi_context(Context::Ptr ctx_ptr);
 } // namespace onevpl
 } // namespace wip
diff --git a/modules/gapi/samples/onevpl_infer_single_roi.cpp b/modules/gapi/samples/onevpl_infer_single_roi.cpp
index 80b1499f30..4734b12f3f 100644
--- a/modules/gapi/samples/onevpl_infer_single_roi.cpp
+++ b/modules/gapi/samples/onevpl_infer_single_roi.cpp
@@ -557,7 +557,7 @@ int main(int argc, char *argv[]) {
         }
         gpu_accel_device = cv::util::make_optional(
                             cv::gapi::wip::onevpl::create_vaapi_device(reinterpret_cast<void*>(va_handle),
-                                                                       "GPU", device_fd));
+                                                                       "GPU"));
         gpu_accel_ctx = cv::util::make_optional(
                             cv::gapi::wip::onevpl::create_vaapi_context(nullptr));
 #endif // defined(HAVE_VA) || defined(HAVE_VA_INTEL)
diff --git a/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.cpp b/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.cpp
index 050c744447..68d4fce1b9 100644
--- a/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.cpp
+++ b/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.cpp
@@ -48,21 +48,21 @@ namespace gapi {
 namespace wip {
 namespace onevpl {
 #ifdef __linux__
-struct Aux {
-    ~Aux() {
+struct PlatformSpecificParams {
+    ~PlatformSpecificParams() {
         for (int fd : fds) {
             close(fd);
         }
     }
 
-    void remember_fd(int fd) {
+    void track_fd(int fd) {
         fds.insert(fd);
     }
 private:
     std::set<int> fds;
 };
 #else
-struct Aux {};
+struct PlatformSpecificParams {};
 #endif
 
 std::vector<CfgParam> update_param_with_accel_type(std::vector<CfgParam> &&param_array, AccelType type) {
@@ -238,11 +238,11 @@ CfgParamDeviceSelector::CfgParamDeviceSelector(const CfgParams& cfg_params) :
             // Unfortunately VAAPI doesn't provide API for extracting initial FD value from VADisplay, which
             // value is stored as VADisplay fields, by the way. But, because we here are only one creator
             // of VAAPI device then we will need make cleanup for all allocated resources by ourselfs
-            //and FD is definitely must be utilized. So, let's use complementary struct `Aux` which
+            //and FD is definitely must be utilized. So, let's use complementary struct `PlatformSpecificParams` which
             // represent some kind of 'platform specific data' and which will store opened FD for
             // future utilization
-            platform_specific_data.reset (new Aux);
-            platform_specific_data->remember_fd(device_fd);
+            platform_specific_data.reset (new PlatformSpecificParams);
+            platform_specific_data->track_fd(device_fd);
 
             suggested_device = IDeviceSelector::create<Device>(va_handle, "GPU", AccelType::VAAPI);
             suggested_context = IDeviceSelector::create<Context>(nullptr, AccelType::VAAPI);
diff --git a/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.hpp b/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.hpp
index 14c3998f13..f7672ce924 100644
--- a/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.hpp
+++ b/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.hpp
@@ -20,7 +20,7 @@ namespace gapi {
 namespace wip {
 namespace onevpl {
 
-class Aux;
+class PlatformSpecificParams;
 std::vector<CfgParam> update_param_with_accel_type(std::vector<CfgParam> &&param_array, AccelType type);
 
 struct GAPI_EXPORTS CfgParamDeviceSelector final: public IDeviceSelector {
@@ -40,7 +40,7 @@ struct GAPI_EXPORTS CfgParamDeviceSelector final: public IDeviceSelector {
 private:
     Device suggested_device;
     Context suggested_context;
-    std::unique_ptr<Aux> platform_specific_data;
+    std::unique_ptr<PlatformSpecificParams> platform_specific_data;
 };
 } // namespace onevpl
 } // namespace wip
diff --git a/modules/gapi/src/streaming/onevpl/device_selector_interface.cpp b/modules/gapi/src/streaming/onevpl/device_selector_interface.cpp
index ecc02cd830..9e192cf0a4 100644
--- a/modules/gapi/src/streaming/onevpl/device_selector_interface.cpp
+++ b/modules/gapi/src/streaming/onevpl/device_selector_interface.cpp
@@ -120,8 +120,7 @@ Context create_dx11_context(Context::Ptr ctx_ptr) {
 }
 
 Device create_vaapi_device(Device::Ptr device_ptr,
-                           const std::string& device_name,
-                           int /*file_description*/) {
+                           const std::string& device_name) {
     return detail::DeviceContextCreator::create_entity<Device>(device_ptr,
                                                                device_name,
                                                                AccelType::VAAPI);

From d2b1e382070e4580bf4486317b8ca740ac5fda01 Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <andrey.senyaev@gmail.com>
Date: Thu, 14 Jul 2022 11:17:38 +0300
Subject: [PATCH 127/178] Pragma to ignore -Waggressive-loop-optimizations on
 Linux ARM64

---
 modules/features2d/src/sift.simd.hpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/modules/features2d/src/sift.simd.hpp b/modules/features2d/src/sift.simd.hpp
index b5033459b9..dfe113b9b3 100644
--- a/modules/features2d/src/sift.simd.hpp
+++ b/modules/features2d/src/sift.simd.hpp
@@ -829,10 +829,14 @@ else // CV_8U
         v_pack_store(dst + k, __pack01);
     }
 #endif
+// avoid warning "iteration 7 invokes undefined behavior" on Linux ARM64
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Waggressive-loop-optimizations"
     for( ; k < len; k++ )
     {
         dst[k] = saturate_cast<uchar>(rawDst[k]*nrm2);
     }
+#pragma GCC diagnostic pop
 }
 #else
     float* dst = dstMat.ptr<float>(row);

From 1c7b71bf9e32065299d966e4ee4d233e0a19c8de Mon Sep 17 00:00:00 2001
From: fengyuentau <yuantao.feng@opencv.org.cn>
Date: Thu, 14 Jul 2022 19:27:20 +0800
Subject: [PATCH 128/178] define data_layout as unknown for pack

---
 modules/dnn/src/tensorflow/tf_importer.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp
index cd413c1ad7..80a8b6dfc5 100644
--- a/modules/dnn/src/tensorflow/tf_importer.cpp
+++ b/modules/dnn/src/tensorflow/tf_importer.cpp
@@ -2541,6 +2541,8 @@ void TFImporter::parsePack(tensorflow::GraphDef& net, const tensorflow::NodeDef&
     if (dim != 0)
         CV_Error(Error::StsNotImplemented, "Unsupported mode of pack operation.");
 
+    data_layouts[name] = DATA_LAYOUT_UNKNOWN;
+
     CV_Assert(hasLayerAttr(layer, "N"));
     int num = (int)getLayerAttr(layer, "N").i();
     CV_CheckEQ(num_inputs, num, "");

From b4b69ae484db98ae7e8f6baf302eb215d43122c4 Mon Sep 17 00:00:00 2001
From: Berke <iamberkeyavas@gmail.com>
Date: Thu, 14 Jul 2022 16:45:47 +0300
Subject: [PATCH 129/178] release png,jpeg resources in destructor

---
 modules/imgcodecs/src/grfmt_jpeg.cpp | 4 ----
 modules/imgcodecs/src/grfmt_png.cpp  | 4 ----
 2 files changed, 8 deletions(-)

diff --git a/modules/imgcodecs/src/grfmt_jpeg.cpp b/modules/imgcodecs/src/grfmt_jpeg.cpp
index 060a4c4901..506cebdf49 100644
--- a/modules/imgcodecs/src/grfmt_jpeg.cpp
+++ b/modules/imgcodecs/src/grfmt_jpeg.cpp
@@ -260,9 +260,6 @@ bool  JpegDecoder::readHeader()
         }
     }
 
-    if( !result )
-        close();
-
     return result;
 }
 
@@ -512,7 +509,6 @@ bool  JpegDecoder::readData( Mat& img )
         }
     }
 
-    close();
     return result;
 }
 
diff --git a/modules/imgcodecs/src/grfmt_png.cpp b/modules/imgcodecs/src/grfmt_png.cpp
index 9e1a2d4c71..388a3fcaf4 100644
--- a/modules/imgcodecs/src/grfmt_png.cpp
+++ b/modules/imgcodecs/src/grfmt_png.cpp
@@ -214,9 +214,6 @@ bool  PngDecoder::readHeader()
         }
     }
 
-    if( !result )
-        close();
-
     return result;
 }
 
@@ -304,7 +301,6 @@ bool  PngDecoder::readData( Mat& img )
         }
     }
 
-    close();
     return result;
 }
 

From bd54ba911db662ab2ec32d0c378bc431c1473d5d Mon Sep 17 00:00:00 2001
From: scorpiozj <scorpiozj@gmail.com>
Date: Sun, 10 Jul 2022 20:39:04 +0800
Subject: [PATCH 130/178] Update imgproc.hpp

fix Rect value of integral image example;
---
 modules/imgproc/include/opencv2/imgproc.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index fd1a5d9721..9ab8999e30 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -2725,7 +2725,7 @@ It makes possible to do a fast blurring or fast block correlation with a variabl
 example. In case of multi-channel images, sums for each channel are accumulated independently.
 
 As a practical example, the next figure shows the calculation of the integral of a straight
-rectangle Rect(3,3,3,2) and of a tilted rectangle Rect(5,1,2,3) . The selected pixels in the
+rectangle Rect(4,4,3,2) and of a tilted rectangle Rect(5,1,2,3) . The selected pixels in the
 original image are shown, as well as the relative pixels in the integral images sum and tilted .
 
 ![integral calculation example](pics/integral.png)

From 199c2d2fd090a273ef963d9b13d28cd552657081 Mon Sep 17 00:00:00 2001
From: Long Nguyen <nguyen.long.908132@gmail.com>
Date: Fri, 15 Jul 2022 18:00:53 +0700
Subject: [PATCH 131/178] Define `STRSAFE_NO_DEPRECATE` for both clang and gcc
 mingw-w64

---
 modules/videoio/src/cap_dshow.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/modules/videoio/src/cap_dshow.cpp b/modules/videoio/src/cap_dshow.cpp
index e4de20bcac..db42601dab 100644
--- a/modules/videoio/src/cap_dshow.cpp
+++ b/modules/videoio/src/cap_dshow.cpp
@@ -90,8 +90,11 @@ Thanks to:
 
 #if defined(__clang__)  // clang or MSVC clang
 #pragma clang diagnostic ignored "-Wnon-virtual-dtor"
-#elif defined(__GNUC__) // MinGW
+#elif defined(__GNUC__) // gcc
 #pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
+#endif
+
+#ifdef __MINGW32__
 #define STRSAFE_NO_DEPRECATE
 #endif
 

From 60846b2b7aabcf1406d763c7e7f9155481f9783d Mon Sep 17 00:00:00 2001
From: David Geldreich <david.geldreich@free.fr>
Date: Fri, 15 Jul 2022 19:35:32 +0200
Subject: [PATCH 132/178] correct an issue of trackbar on macOS

Python binding is passing NULL as (int*)value.
sliderChanged crash when trying to dereference value
---
 modules/highgui/src/window_cocoa.mm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/highgui/src/window_cocoa.mm b/modules/highgui/src/window_cocoa.mm
index e9cf5a3c1b..ef3f828cf8 100644
--- a/modules/highgui/src/window_cocoa.mm
+++ b/modules/highgui/src/window_cocoa.mm
@@ -1201,7 +1201,7 @@ static NSSize constrainAspectRatio(NSSize base, NSSize constraint) {
     (void)notification;
     int pos = [slider intValue];
     NSString *temp = [self initialName];
-    NSString *text = [NSString stringWithFormat:@"%@ %d", temp, *value];
+    NSString *text = [NSString stringWithFormat:@"%@ %d", temp, pos];
     [name setStringValue: text];
     if(value)
         *value = pos;

From 33b1e76e4811c5ddd915ceac7698843f077251da Mon Sep 17 00:00:00 2001
From: rogday <s.e.a.98@yandex.ru>
Date: Mon, 18 Jul 2022 12:53:04 +0300
Subject: [PATCH 133/178] fix save_frames parameter

---
 apps/interactive-calibration/parametersController.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/interactive-calibration/parametersController.cpp b/apps/interactive-calibration/parametersController.cpp
index b5a01b1ea8..09f8d337c2 100644
--- a/apps/interactive-calibration/parametersController.cpp
+++ b/apps/interactive-calibration/parametersController.cpp
@@ -89,7 +89,7 @@ bool calib::parametersController::loadFromParser(cv::CommandLineParser &parser)
     mCapParams.captureDelay = parser.get<float>("d");
     mCapParams.squareSize = parser.get<float>("sz");
     mCapParams.templDst = parser.get<float>("dst");
-    mCapParams.saveFrames = parser.has("save_frames");
+    mCapParams.saveFrames = parser.get<bool>("save_frames");
     mCapParams.zoom = parser.get<float>("zoom");
 
     if(!checkAssertion(mCapParams.squareSize > 0, "Distance between corners or circles must be positive"))

From ed69bcae2d171d9426cd3688a8b0ee14b8a140cd Mon Sep 17 00:00:00 2001
From: rogday <s.e.a.98@yandex.ru>
Date: Tue, 19 Jul 2022 06:14:05 +0300
Subject: [PATCH 134/178] Merge pull request #21865 from
 rogday:nary_eltwise_layers

Reimplementation of Element-wise layers with broadcasting support

* init

* semi-working initial version

* add small_vector

* wip

* remove smallvec

* add nary function

* replace auto with Mat in lambda expr used in transform

* uncomment asserts

* autobuffer shape_buf & step_buf

* fix a missing bracket

* fixed a missing addLayer in parseElementWise

* solve one-dimensional broadcast

* remove pre_broadcast_transform for the case of two constants; fix missing constBlobsExtraInfo when addConstant is called

* one autobuffer for step & shape

* temporal fix for the missing original dimension information

* fix parseUnsqueeze when it gets a 1d tensor constant

* support sum/mean/min/max with only one input

* reuse old code to handle cases of two non-constant inputs

* add condition to handle div & mul of two non-constant inputs

* use || instead of or

* remove trainling spaces

* enlarge buf in binary_forward to contain other buffer

* use autobuffer in nary_forward

* generate data randomly and add more cases for perf

* add op and, or & xor

* update perf_dnn

* remove some comments

* remove legacy; add two ONNX conformance tests in filter

* move from cpu_denylist to all_denylist

* adjust parsing for inputs>=2

Co-authored-by: fengyuentau <yuantao.feng@opencv.org.cn>
---
 .../dnn/include/opencv2/dnn/all_layers.hpp    |   6 +
 modules/dnn/perf/perf_layer.cpp               | 150 ++++
 modules/dnn/src/init.cpp                      |   1 +
 .../dnn/src/layers/nary_eltwise_layers.cpp    | 664 ++++++++++++++++++
 modules/dnn/src/onnx/onnx_importer.cpp        | 468 ++++--------
 ...e_layer_filter_opencv_all_denylist.inl.hpp |   2 +
 ...e_layer_filter_opencv_cpu_denylist.inl.hpp |   1 +
 7 files changed, 952 insertions(+), 340 deletions(-)
 create mode 100644 modules/dnn/src/layers/nary_eltwise_layers.cpp

diff --git a/modules/dnn/include/opencv2/dnn/all_layers.hpp b/modules/dnn/include/opencv2/dnn/all_layers.hpp
index 5c86da2be4..6ecf85da7c 100644
--- a/modules/dnn/include/opencv2/dnn/all_layers.hpp
+++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp
@@ -849,6 +849,12 @@ CV__DNN_INLINE_NS_BEGIN
         static Ptr<EltwiseLayerInt8> create(const LayerParams &params);
     };
 
+    class CV_EXPORTS NaryEltwiseLayer : public Layer
+    {
+    public:
+        static Ptr<NaryEltwiseLayer> create(const LayerParams &params);
+    };
+
     class CV_EXPORTS BatchNormLayer : public ActivationLayer
     {
     public:
diff --git a/modules/dnn/perf/perf_layer.cpp b/modules/dnn/perf/perf_layer.cpp
index 06fa57f319..03ba8ab0e9 100644
--- a/modules/dnn/perf/perf_layer.cpp
+++ b/modules/dnn/perf/perf_layer.cpp
@@ -55,7 +55,156 @@ struct Layer_Slice : public TestBaseWithParam<tuple<Backend, Target> >
     }
 };
 
+struct Layer_NaryEltwise : public TestBaseWithParam<tuple<Backend, Target> >
+{
+    void test_layer(const std::vector<int>& a_shape, const std::vector<int>& b_shape, const String op, bool isRef = false)
+    {
+        int backendId = get<0>(GetParam());
+        int targetId = get<1>(GetParam());
+
+        Mat a(a_shape, CV_32FC1);
+        Mat b(b_shape, CV_32FC1);
+
+        Scalar mean = 0.f;
+        Scalar std = 1.f;
+        randn(a, mean, std);
+        randn(b, mean, std);
+
+
+        Net net;
+        LayerParams lp;
+        if (isRef)
+            lp.type = "Eltwise";
+        else
+            lp.type = "NaryEltwise";
+        lp.name = "testLayer";
+        lp.set("operation", op);
+        int id = net.addLayerToPrev(lp.name, lp.type, lp);
+        net.connect(0, 1, id, 1);
+
+        // warmup
+        {
+            std::vector<String> inpNames(2);
+            inpNames[0] = "a";
+            inpNames[1] = "b";
+            net.setInputsNames(inpNames);
+            net.setInput(a, inpNames[0]);
+            net.setInput(b, inpNames[1]);
+
+            net.setPreferableBackend(backendId);
+            net.setPreferableTarget(targetId);
+            Mat out = net.forward();
+        }
+
+        TEST_CYCLE()
+        {
+            Mat res = net.forward();
+        }
+
+        SANITY_CHECK_NOTHING();
+    }
+
+    int N = 8;
+    int C = 256;
+    int H = 128;
+    int W = 100;
+};
+
+
+PERF_TEST_P_(Layer_NaryEltwise, NCHW_NCHW_add)
+{
+    test_layer({N, C, H, W}, {N, C, H, W}, "add");
+}
+
+PERF_TEST_P_(Layer_NaryEltwise, NCHW_NCHW_div)
+{
+    test_layer({N, C, H, W}, {N, C, H, W}, "div");
+}
+
+PERF_TEST_P_(Layer_NaryEltwise, NCHW_NCHW_ref_div)
+{
+    test_layer({N, C, H, W}, {N, C, H, W}, "div", true);
+}
+
+PERF_TEST_P_(Layer_NaryEltwise, NCHW_NCHW_equal)
+{
+    test_layer({N, C, H, W}, {N, C, H, W}, "equal");
+}
+
+PERF_TEST_P_(Layer_NaryEltwise, NCHW_NCHW_greater)
+{
+    test_layer({N, C, H, W}, {N, C, H, W}, "greater");
+}
+
+PERF_TEST_P_(Layer_NaryEltwise, NCHW_NCHW_less)
+{
+    test_layer({N, C, H, W}, {N, C, H, W}, "less");
+}
+
+PERF_TEST_P_(Layer_NaryEltwise, NCHW_NCHW_max)
+{
+    test_layer({N, C, H, W}, {N, C, H, W}, "max");
+}
+
+PERF_TEST_P_(Layer_NaryEltwise, NCHW_NCHW_ref_max)
+{
+    test_layer({N, C, H, W}, {N, C, H, W}, "max", true);
+}
 
+PERF_TEST_P_(Layer_NaryEltwise, NCHW_NCHW_mean)
+{
+    test_layer({N, C, H, W}, {N, C, H, W}, "mean");
+}
+
+PERF_TEST_P_(Layer_NaryEltwise, NCHW_NCHW_min)
+{
+    test_layer({N, C, H, W}, {N, C, H, W}, "min");
+}
+
+PERF_TEST_P_(Layer_NaryEltwise, NCHW_NCHW_ref_min)
+{
+    test_layer({N, C, H, W}, {N, C, H, W}, "min", true);
+}
+
+PERF_TEST_P_(Layer_NaryEltwise, NCHW_NCHW_mul)
+{
+    test_layer({N, C, H, W}, {N, C, H, W}, "mul");
+}
+
+PERF_TEST_P_(Layer_NaryEltwise, NCHW_NCHW_ref_mul)
+{
+    test_layer({N, C, H, W}, {N, C, H, W}, "prod", true);
+}
+
+PERF_TEST_P_(Layer_NaryEltwise, NCHW_NCHW_pow)
+{
+    test_layer({N, C, H, W}, {N, C, H, W}, "pow");
+}
+
+PERF_TEST_P_(Layer_NaryEltwise, NCHW_NCHW_sub)
+{
+    test_layer({N, C, H, W}, {N, C, H, W}, "sub");
+}
+
+PERF_TEST_P_(Layer_NaryEltwise, NCHW_NCHW_sum)
+{
+    test_layer({N, C, H, W}, {N, C, H, W}, "sum");
+}
+
+PERF_TEST_P_(Layer_NaryEltwise, NCHW_NCHW_ref_sum)
+{
+    test_layer({N, C, H, W}, {N, C, H, W}, "sum", true);
+}
+
+PERF_TEST_P_(Layer_NaryEltwise, NCHW_C_sum)
+{
+    test_layer({N, C, H, W}, {C, 1, 1}, "sum");
+}
+
+PERF_TEST_P_(Layer_NaryEltwise, NHWC_C)
+{
+    test_layer({N, H, W, C}, {1, C}, "sum");
+}
 
 PERF_TEST_P_(Layer_Slice, YOLOv4_tiny_1)
 {
@@ -91,5 +240,6 @@ PERF_TEST_P_(Layer_Slice, FastNeuralStyle_eccv16)
 }
 
 INSTANTIATE_TEST_CASE_P(/**/, Layer_Slice, dnnBackendsAndTargets(false, false));
+INSTANTIATE_TEST_CASE_P(/**/, Layer_NaryEltwise, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
 
 } // namespace
diff --git a/modules/dnn/src/init.cpp b/modules/dnn/src/init.cpp
index 6979d1864d..e3ce6de40d 100644
--- a/modules/dnn/src/init.cpp
+++ b/modules/dnn/src/init.cpp
@@ -150,6 +150,7 @@ void initializeLayerFactory()
 
     CV_DNN_REGISTER_LAYER_CLASS(Crop,           CropLayer);
     CV_DNN_REGISTER_LAYER_CLASS(Eltwise,        EltwiseLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(NaryEltwise,    NaryEltwiseLayer);
     CV_DNN_REGISTER_LAYER_CLASS(Permute,        PermuteLayer);
     CV_DNN_REGISTER_LAYER_CLASS(ShuffleChannel, ShuffleChannelLayer);
     CV_DNN_REGISTER_LAYER_CLASS(PriorBox,       PriorBoxLayer);
diff --git a/modules/dnn/src/layers/nary_eltwise_layers.cpp b/modules/dnn/src/layers/nary_eltwise_layers.cpp
new file mode 100644
index 0000000000..db37f16060
--- /dev/null
+++ b/modules/dnn/src/layers/nary_eltwise_layers.cpp
@@ -0,0 +1,664 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../precomp.hpp"
+#include "layers_common.hpp"
+#include <opencv2/dnn/shape_utils.hpp>
+
+#include <algorithm>
+#include <iterator>
+#include <numeric>
+
+namespace cv
+{
+namespace dnn
+{
+
+class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer
+{
+public:
+    enum class OPERATION
+    {
+        AND = 0,
+        EQUAL,
+        GREATER,
+        GREATER_EQUAL,
+        LESS,
+        LESS_EQUAL,
+        OR,
+        POW,
+        XOR,
+        BITSHIFT,
+        MAX,
+        MEAN,
+        MIN,
+        MOD,
+        PROD,
+        SUB,
+        SUM,
+        ADD,
+        DIV,
+    } op;
+
+    NaryEltwiseLayerImpl(const LayerParams& params)
+    {
+        setParamsFrom(params);
+
+        String operation = toLowerCase(params.get<String>("operation", "sum"));
+
+        if (operation == "equal")
+            op = OPERATION::EQUAL;
+        else if (operation == "greater")
+            op = OPERATION::GREATER;
+        else if (operation == "greater_equal")
+            op = OPERATION::GREATER_EQUAL;
+        else if (operation == "less")
+            op = OPERATION::LESS;
+        else if (operation == "less_equal")
+            op = OPERATION::LESS_EQUAL;
+        else if (operation == "pow")
+            op = OPERATION::POW;
+        else if (operation == "bitshift")
+            op = OPERATION::BITSHIFT;
+        else if (operation == "max")
+            op = OPERATION::MAX;
+        else if (operation == "mean")
+            op = OPERATION::MEAN;
+        else if (operation == "min")
+            op = OPERATION::MIN;
+        else if (operation == "mod")
+            op = OPERATION::MOD;
+        else if (operation == "mul")
+            op = OPERATION::PROD;
+        else if (operation == "sub")
+            op = OPERATION::SUB;
+        else if (operation == "sum")
+            op = OPERATION::SUM;
+        else if (operation == "add")
+            op = OPERATION::ADD;
+        else if (operation == "div")
+            op = OPERATION::DIV;
+        else if (operation == "and")
+            op = OPERATION::AND;
+        else if (operation == "or")
+            op = OPERATION::OR;
+        else if (operation == "xor")
+            op = OPERATION::XOR;
+        else
+            CV_Error(cv::Error::StsBadArg, "Unknown operation type \"" + operation + "\"");
+    }
+
+    virtual bool supportBackend(int backendId) CV_OVERRIDE
+    {
+        return backendId == DNN_BACKEND_OPENCV;
+    }
+
+    static MatShape findCommonShape(std::vector<MatShape> shapes)
+    {
+        CV_Assert(!shapes.empty());
+        const size_t dim = std::max_element(shapes.begin(), shapes.end(),
+                                            [](const MatShape& a, const MatShape& b)
+                                            { return a.size() < b.size(); })->size();
+
+        for (auto& shape : shapes)
+        {
+            shape.insert(shape.begin(), dim - shape.size(), 1);
+        }
+
+        MatShape outShape(dim, 1);
+        for (size_t i = 0; i < dim; ++i)
+        {
+            for (const auto& shape : shapes)
+            {
+                if (shape[i] != outShape[i])
+                {
+                    CV_Assert(shape[i] == 1 || outShape[i] == 1);
+                    outShape[i] = std::max(outShape[i], shape[i]);
+                }
+            }
+        }
+
+        return outShape;
+    }
+
+    static bool prepare_for_broadcast_op(
+        int narrays, int max_ndims, const size_t* elemsize,
+        const int* ndims, const int** shape_, const size_t** step_,
+        int** shape, size_t** step)
+    {
+        int i, j, k;
+
+        // step 1.
+        // * make all inputs and the output max_ndims-dimensional.
+        // ** prepend dimension 1 to the mat of less dims
+        // * compute proper step's
+        for (i = max_ndims-1; i >= 0; i-- ) {
+            for (k = 0; k < narrays; k++) {
+                j = ndims[k] - (max_ndims - i);
+                int sz_i = j >= 0 ? shape_[k][j] : 1;
+                size_t st_i = j >= 0 && step_ && step_[k] && step_[k][j] > 0 ? step_[k][j] :
+                    i == max_ndims-1 ? elemsize[k] : step[k][i+1]*shape[k][i+1];
+                assert(st_i % elemsize[k] == 0);
+                shape[k][i] = sz_i;
+                step[k][i] = st_i;
+                if (shape[k][i] == 0)
+                    return false;
+            }
+        }
+
+        // step 3. Let's do the flattening first,
+        // since we'd need proper values of steps to check continuity.
+        // this loop is probably the most tricky part
+        // in the whole implementation of broadcasting.
+        j = max_ndims-1;
+        for (i = j - 1; i >= 0; i--) {
+            bool all_contiguous = true, all_scalars = true, all_consistent = true;
+            for(k = 0; k < narrays; k++) {
+                size_t st = step[k][j]*shape[k][j];
+                bool prev_scalar = shape[k][j] == 1;
+                bool scalar = shape[k][i] == 1;
+                all_contiguous = all_contiguous && (st == step[k][i]);
+                all_scalars = all_scalars && scalar;
+                all_consistent = all_consistent && (scalar == prev_scalar);
+            }
+            if (all_contiguous && (all_consistent || all_scalars)) {
+                for(k = 0; k < narrays; k++)
+                    shape[k][j] *= shape[k][i];
+            } else {
+                j--;
+                if (i < j) {
+                    for(k = 0; k < narrays; k++) {
+                        shape[k][j] = shape[k][i];
+                        step[k][j] = step[k][i];
+                    }
+                }
+            }
+        }
+
+        // step 2. Set some step's to 0's.
+        for (i = max_ndims-1; i >= j; i--) {
+            for (k = 0; k < narrays; k++)
+                step[k][i] = shape[k][i] == 1 ? 0 : step[k][i];
+        }
+        for (; i >= 0; i--) {
+            for (k = 0; k < narrays; k++) {
+                step[k][i] = 0;
+                shape[k][i] = 1;
+            }
+        }
+        return true;
+    }
+
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const CV_OVERRIDE
+    {
+        MatShape outShape = findCommonShape(inputs);
+        outputs.assign(1, outShape);
+        return false;
+    }
+
+    template <typename T, typename Functor>
+    void binary_forward_impl(
+            int ndims, const int* shape,
+            const char* data1, const size_t* step1,
+            const char* data2, const size_t* step2,
+            char* data, const size_t* step,
+            const Functor& op)
+    {
+        assert(ndims >= 2);
+        size_t dp1 = step1[ndims-1]/sizeof(T);
+        size_t dp2 = step2[ndims-1]/sizeof(T);
+        size_t dp = step[ndims-1]/sizeof(T);
+        int k, n1 = shape[ndims-1], n2 = shape[ndims-2];
+        size_t plane_idx, nplanes = 1;
+        for (k = 0; k < ndims-2; k++) nplanes *= shape[k];
+
+        for (plane_idx = 0; plane_idx < nplanes; plane_idx++) {
+            const char* ptr1_ = data1;
+            const char* ptr2_ = data2;
+            char* ptr_ = data;
+            size_t idx = plane_idx;
+            for (k = ndims-3; k >= 0; k--) {
+                size_t next_idx = idx/shape[k];
+                int i_k = (int)(idx - next_idx*shape[k]);
+                ptr1_ += i_k*step1[k];
+                ptr2_ += i_k*step2[k];
+                ptr_ += i_k*step[k];
+                idx = next_idx;
+            }
+            for (int i2 = 0; i2 < n2; i2++, ptr1_ += step1[ndims-2],
+                                            ptr2_ += step2[ndims-2],
+                                            ptr_ += step[ndims-2])
+            {
+                const T* ptr1 = (const T*)ptr1_;
+                const T* ptr2 = (const T*)ptr2_;
+                T* ptr = (T*)ptr_;
+                if (dp1 == 1 && dp2 == 1 && dp == 1) {
+                    for(int i1 = 0; i1 < n1; i1++)
+                        ptr[i1] = op(ptr1[i1], ptr2[i1]);
+                } else if (dp1 == 1 && dp2 == 0 && dp == 1){
+                    T x2 = *ptr2;
+                    for(int i1 = 0; i1 < n1; i1++)
+                        ptr[i1] = op(ptr1[i1], x2);
+                } else if (dp1 == 0 && dp2 == 1 && dp == 1){
+                    T x1 = *ptr1;
+                    for(int i1 = 0; i1 < n1; i1++)
+                        ptr[i1] = op(x1, ptr2[i1]);
+                } else {
+                    for(int i1 = 0; i1 < n1; i1++, ptr1 += dp1, ptr2 += dp2, ptr += dp)
+                        *ptr = op(*ptr1, *ptr2);
+                }
+            }
+        }
+    }
+
+    template <typename T, typename Functor>
+    void binary_forward(const Functor& f, const std::vector<Mat>& inputs, std::vector<Mat>& outputs)
+    {
+        const Mat& a = inputs[0];
+        const Mat& b = inputs[1];
+        Mat& out = outputs[0];
+
+        // collect info of inputs and output
+        const int* in_shape[] = {a.size.p, b.size.p};
+        const size_t* in_step[] = {a.step.p, b.step.p};
+        const int* out_shape = out.size.p;
+        const size_t* out_step = out.step.p;
+        const int in_ndims[] = {a.dims, b.dims};
+        int out_ndims = out.dims;
+
+        int max_ndims = std::max(a.dims, std::max(b.dims, out.dims));
+
+        // buf holds the folllowing for a, b & output:
+        //  * orig_shapes, shapes (result_shape), orig_steps, steps (result_step), 3*4 elements in total
+        //  * shape_buf & step_buf, 3*2*max_ndims elements in total
+        //  * all_ndims, 3*1 elements in total
+        //  * all_type_sizes, 3*1 elements in total
+        AutoBuffer<size_t> buf(3 * (2 * max_ndims + 6));
+
+        int** orig_shapes = (int**)(buf.data());
+        int** shapes = orig_shapes + 3;
+        size_t** orig_steps = (size_t**)(shapes + 3);
+        size_t** steps = orig_steps + 3;
+
+        int* shape_buf = (int*)(steps + 3);
+        size_t* step_buf = (size_t*)(shape_buf + 3 * max_ndims);
+
+        int* all_ndims = (int*)(step_buf + 3 * max_ndims);
+        size_t* all_type_sizes = (size_t*)(all_ndims + 3);
+
+        // assign orig_shapes, shapes, orig_steps, steps, all_ndims, all_type_sizes
+        for (int i = 0; i < 3; i++)
+        {
+            orig_shapes[i] = (int*)(i == 0 ? out_shape : in_shape[i-1]);
+            orig_steps[i] = (size_t*)(i == 0 ? out_step : in_step[i-1]);
+            shapes[i] = shape_buf + i * max_ndims;
+            steps[i] = step_buf + i * max_ndims;
+            all_ndims[i] = i == 0 ? out_ndims : in_ndims[i-1];
+            all_type_sizes[i] = sizeof(T);
+        }
+
+        if (!prepare_for_broadcast_op(3, max_ndims, all_type_sizes,
+                                      all_ndims, (const int**)orig_shapes,
+                                      (const size_t**)orig_steps,
+                                      shapes, steps))
+            return;
+
+        binary_forward_impl<T, Functor>(
+                max_ndims, shapes[0], a.ptr<char>(), steps[1],
+                b.ptr<char>(), steps[2], out.ptr<char>(), steps[0],
+                f);
+    }
+
+    template<typename T, typename Functor>
+    void nary_forward_impl(
+        const Functor& f, const T scale, int ninputs, int ndims, const int* shape,
+        const char** inp, char* out,
+        const size_t** steps, char** ptrs)
+    {
+        CV_Assert(ndims >= 2);
+        size_t dp = steps[0][ndims-1]/sizeof(T);
+        size_t dp1 = steps[1][ndims-1]/sizeof(T);
+        size_t dp2 = steps[2][ndims-1]/sizeof(T);
+
+        CV_Assert(dp == 1);
+        enum { BLOCK_SIZE = 1024 };
+        T blck[BLOCK_SIZE];
+
+        int k, i, di1=0, n1 = shape[ndims-1], n2 = shape[ndims-2];
+        int second = ninputs == 1 ? 1 : 2;
+        size_t plane_idx, nplanes = 1;
+        for (k = 0; k < ndims-2; k++) nplanes *= shape[k];
+
+        for (plane_idx = 0; plane_idx < nplanes; plane_idx++) {
+            ptrs[0] = out;
+            for (i = 0; i < ninputs; i++) ptrs[i+1] = (char*)inp[i];
+            size_t idx = plane_idx;
+            for (k = ndims-3; k >= 0; k--) {
+                size_t next_idx = idx/shape[k];
+                int i_k = (int)(idx - next_idx*shape[k]);
+                for (i = 0; i < ninputs; i++)
+                    ptrs[i] += i_k*steps[i][k];
+                idx = next_idx;
+            }
+            for (int i2 = 0; i2 < n2; i2++)
+            {
+                const T* ptr1 = (const T*)(ptrs[1] + steps[1][ndims-2]*i2);
+                const T* ptr2 = (const T*)(ptrs[second] + steps[second][ndims-2]*i2);
+                T* ptr = (T*)(ptrs[0] + steps[0][ndims-2]*i2);
+                if (ninputs <= 2) {
+                    if (dp1 == 1 && dp2 == 1) {
+                        for (int i1 = 0; i1 < n1; i1++)
+                            ptr[i1] = saturate_cast<T>(f(ptr1[i1], ptr2[i1])*scale);
+                    } else {
+                        for(int i1 = 0; i1 < n1; i1++, ptr1 += dp1, ptr2 += dp2, ptr += dp)
+                            *ptr = saturate_cast<T>(f(*ptr1, *ptr2)*scale);
+                    }
+                } else {
+                    for (int i1 = 0; i1 < n1; i1 += di1, ptr += di1) {
+                        di1 = BLOCK_SIZE < n1-i1 ? BLOCK_SIZE : n1-i1;
+                        if (dp1 == 1 && dp2 == 1) {
+                            for (int j = 0; j < di1; j++)
+                                blck[j] = f(ptr1[j], ptr2[j]);
+                            ptr1 += di1;
+                            ptr2 += di1;
+                        } else {
+                            for(int j = 0; j < di1; j++, ptr1 += dp1, ptr2 += dp2)
+                                blck[j] = f(*ptr1, *ptr2);
+                        }
+                        for(i = 2; i < ninputs; i++) {
+                            int dp_i = steps[i+1][ndims-1]/sizeof(T);
+                            const T* ptr_i = (const T*)(ptrs[i+1] +
+                                    steps[i+1][ndims-2]*i2) + i1*dp_i;
+                            if (dp_i == 1) {
+                                if (i < ninputs-1) {
+                                    for (int j = 0; j < di1; j++)
+                                        blck[j] = f(blck[j], ptr_i[j]);
+                                } else {
+                                    for (int j = 0; j < di1; j++)
+                                        ptr[j] = saturate_cast<T>(f(blck[j], ptr_i[j]) * scale);
+                                }
+                            } else {
+                                if (i < ninputs-1) {
+                                    for (int j = 0; j < di1; j++, ptr_i += dp_i)
+                                        blck[j] = f(blck[j], *ptr_i);
+                                } else {
+                                    for (int j = 0; j < di1; j++, ptr_i += dp_i)
+                                        ptr[j] = saturate_cast<T>(f(blck[j], *ptr_i) * scale);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    template <typename T, typename Functor>
+    void nary_forward(
+        const Functor& f, T scale,
+        const std::vector<Mat>& inputs, std::vector<Mat>& outputs
+        )
+    {
+        int ninputs = inputs.size();
+
+        // collect all input
+        std::vector<const char*> v_inp;
+        std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp), [] (const Mat& m) { return m.template ptr<const char>(); });
+        const char** inp = v_inp.data();
+
+        // collect ndims of all input
+        std::vector<int> v_inp_dims;
+        std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp_dims), [] (const Mat& m) { return m.dims; });
+        const int* inp_ndims = v_inp_dims.data();
+
+        // collect shapes of all input
+        std::vector<const int*> v_inp_shape;
+        std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp_shape), [] (const Mat& m) { return m.size.p; });
+        const int** inp_shape = v_inp_shape.data();
+
+        // collect steps of all input
+        std::vector<const size_t*> v_inp_step;
+        std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp_step), [] (const Mat& m) { return m.step.p; });
+        const size_t** inp_step = v_inp_step.data();
+
+        // collect info of output (ndims, shape, step)
+        char* out = outputs[0].ptr<char>();
+        int out_ndims = outputs[0].dims;
+        const int* out_shape = outputs[0].size.p;
+        const size_t* out_step = outputs[0].step.p;
+
+        // find max ndims for broadcasting
+        int i, max_ndims = out_ndims > 2 ? out_ndims : 2;
+        for(i = 0; i < ninputs; i++)
+            max_ndims = max_ndims > inp_ndims[i] ? max_ndims : inp_ndims[i];
+
+        // buf holds the following buffers for inputs & output:
+        //  * orig_shapes, shapes (result_shape), orig_steps, steps (result_step), (ninputs+1)*4 elements in total
+        //  * ptrs, (ninputs+1)*1 elements in total
+        //  * shape_buf & step_buf, (ninputs+1)*2*max_ndims elements in total
+        //  * all_ndims, (ninputs+1)*1 elements in total
+        //  * all_type_sizes, (ninputs+1)*1 elements in total
+        AutoBuffer<size_t> buf((ninputs + 1) * (2 * max_ndims + 7));
+
+        int** orig_shapes = (int**)buf.data();
+        int** shapes = orig_shapes + ninputs + 1;
+        size_t** orig_steps = (size_t**)(shapes + ninputs + 1);
+        size_t** steps = orig_steps + ninputs + 1;
+
+        char** ptrs = (char**)(steps + ninputs + 1);
+
+        size_t* step_buf = (size_t*)(ptrs + ninputs + 1);
+        int* shape_buf = (int*)(step_buf + (ninputs + 1)*max_ndims);
+
+        int* all_ndims = shape_buf + (ninputs + 1)*max_ndims;
+        size_t* all_type_sizes = (size_t*)(all_ndims + ninputs + 1);
+
+        for(i = 0; i <= ninputs; i++) {
+            all_ndims[i] = i == 0 ? out_ndims : inp_ndims[i-1];
+            all_type_sizes[i] = sizeof(T);
+            orig_shapes[i] = (int*)(i == 0 ? out_shape : inp_shape ? inp_shape[i-1] : 0);
+            orig_steps[i] = (size_t*)(i == 0 ? out_step : inp_step ? inp_step[i-1] : 0);
+            shapes[i] = shape_buf + max_ndims*i;
+            steps[i] = step_buf + max_ndims*i;
+        }
+
+        if (!prepare_for_broadcast_op(ninputs + 1, max_ndims, all_type_sizes,
+                                      all_ndims, (const int**)orig_shapes,
+                                      (const size_t**)orig_steps,
+                                      shapes, steps))
+            return;
+
+        nary_forward_impl<T>(
+                f, scale, ninputs, max_ndims, shapes[0], inp, out, (const size_t **) steps, ptrs);
+    }
+
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        if (inputs_arr.depth() == CV_16S)
+        {
+            forward_fallback(inputs_arr, outputs_arr, internals_arr);
+            return;
+        }
+
+        std::vector<Mat> inputs, outputs;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
+
+        // TODO: assert types
+        typeDispatch(outputs[0].type(), inputs.size(), inputs, outputs);
+    }
+
+    template<typename T, typename... Args>
+    inline void opDispatch(size_t ninputs, Args&&... args)
+    {
+        switch (op)
+        {
+            case OPERATION::EQUAL:
+            {
+                auto equal = [](const T &a, const T &b) { return a == b; };
+                binary_forward<T>(equal, std::forward<Args>(args)...);
+                break;
+            }
+            case OPERATION::GREATER:
+            {
+                auto greater = [](const T &a, const T &b) { return a > b; };
+                binary_forward<T>(greater, std::forward<Args>(args)...);
+                break;
+            }
+            case OPERATION::GREATER_EQUAL:
+            {
+                auto greater_equal = [](const T &a, const T &b) { return a >= b; };
+                binary_forward<T>(greater_equal, std::forward<Args>(args)...);
+                break;
+            }
+            case OPERATION::LESS:
+            {
+                auto less = [](const T &a, const T &b) { return a < b; };
+                binary_forward<T>(less, std::forward<Args>(args)...);
+                break;
+            }
+            case OPERATION::LESS_EQUAL:
+            {
+                auto less_equal = [](const T &a, const T &b) { return a <= b; };
+                binary_forward<T>(less_equal, std::forward<Args>(args)...);
+                break;
+            }
+            case OPERATION::POW:
+            {
+                auto pow = [] (const T& a, const T& b) { return std::pow(a, b); };
+                binary_forward<T>(pow, std::forward<Args>(args)...);
+                break;
+            }
+            case OPERATION::BITSHIFT:
+            {
+                auto bitshift = [] (const uint8_t &a, const uint8_t &b) { return a << b; };
+                binary_forward<T>(bitshift, std::forward<Args>(args)...);
+                break;
+            }
+            case OPERATION::MAX:
+            {
+                auto max = [](const T &a, const T &b) { return std::max(a, b); };
+                nary_forward<T>(max, T{1}, std::forward<Args>(args)...);
+                break;
+            }
+            case OPERATION::MEAN:
+            {
+                auto mean = [](const T &a, const T &b) { return (a + b) / T{2}; };
+                nary_forward<T>(mean, T{1} / ninputs, std::forward<Args>(args)...);
+                break;
+            }
+            case OPERATION::MIN:
+            {
+                auto min = [](const T &a, const T &b) { return std::min(a, b); };
+                nary_forward<T>(min, T{1}, std::forward<Args>(args)...);
+                break;
+            }
+            case OPERATION::MOD:
+            {
+                auto mod = [](const uint8_t &a, const uint8_t &b) { return a % b; };
+                binary_forward<T>(mod, std::forward<Args>(args)...);
+                break;
+            }
+            case OPERATION::PROD:
+            {
+                auto prod = [](const T &a, const T &b) { return a * b; };
+                binary_forward<T>(prod, std::forward<Args>(args)...);
+                break;
+            }
+            case OPERATION::SUB:
+            {
+                auto sub = [](const T &a, const T &b) { return a - b; };
+                binary_forward<T>(sub, std::forward<Args>(args)...);
+                break;
+            }
+            case OPERATION::SUM:
+            {
+                auto sum = [](const T &a, const T &b) { return a + b; };
+                nary_forward<T>(sum, T{1}, std::forward<Args>(args)...);
+                break;
+            }
+            case OPERATION::ADD:
+            {
+                auto add = [](const T &a, const T &b) { return a + b; };
+                binary_forward<T>(add, std::forward<Args>(args)...);
+                break;
+            }
+            case OPERATION::DIV:
+            {
+                auto div = [](const T &a, const T &b) { return a / b; };
+                binary_forward<T>(div, std::forward<Args>(args)...);
+                break;
+            }
+            case OPERATION::AND:
+            {
+                auto op_and = [](const uint8_t &a, const uint8_t &b) { return a & b; };
+                binary_forward<T>(op_and, std::forward<Args>(args)...);
+                break;
+            }
+            case OPERATION::OR:
+            {
+                auto op_or = [](const uint8_t &a, const uint8_t &b) { return a | b; };
+                binary_forward<T>(op_or, std::forward<Args>(args)...);
+                break;
+            }
+            case OPERATION::XOR:
+            {
+                auto op_xor = [](const uint8_t &a, const uint8_t &b) { return a ^ b; };
+                binary_forward<T>(op_xor, std::forward<Args>(args)...);
+                break;
+            }
+            default:
+                CV_Error(Error::StsBadArg, "Unsupported operation.");
+        };
+    }
+
+    template<typename... Args>
+    inline void typeDispatch(const int type, Args&&... args)
+    {
+        switch (type)
+        {
+            case CV_8U:
+                opDispatch<uint8_t>(std::forward<Args>(args)...);
+                break;
+            case CV_32S:
+                opDispatch<int32_t>(std::forward<Args>(args)...);
+                break;
+            case CV_32F:
+                CV_Assert(op != OPERATION::BITSHIFT && op != OPERATION::MOD &&
+                          op != OPERATION::AND && op != OPERATION::OR &&
+                          op != OPERATION::XOR);
+                opDispatch<float>(std::forward<Args>(args)...);
+                break;
+            default:
+                CV_Error(cv::Error::BadDepth, "Unsupported type.");
+        };
+    }
+
+    virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                             const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
+    {
+        return false;
+    }
+
+    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
+                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
+    {
+        CV_Assert(inputs.size());
+        return inputs.size() * total(outputs[0]);
+    }
+};
+
+Ptr<NaryEltwiseLayer> NaryEltwiseLayer::create(const LayerParams& params)
+{
+    return Ptr<NaryEltwiseLayer>(new NaryEltwiseLayerImpl(params));
+}
+
+}
+}
diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index ebbda98d51..e99fbb319e 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -63,10 +63,17 @@ class ONNXImporter
         LayerInfo(int _layerId = 0, int _outputId = 0) : layerId(_layerId), outputId(_outputId) {}
     };
 
+    struct TensorInfo {
+        int real_ndims;
+        TensorInfo(int _real_ndims = 0) : real_ndims(_real_ndims) {}
+    };
+
     std::map<std::string, Mat> getGraphTensors(
                                     const opencv_onnx::GraphProto& graph_proto);
     Mat getBlob(const opencv_onnx::NodeProto& node_proto, int index);
     Mat getBlob(const std::string& input_name);
+    TensorInfo getBlobExtraInfo(const opencv_onnx::NodeProto& node_proto, int index);
+    TensorInfo getBlobExtraInfo(const std::string& input_name);
 
     LayerParams getLayerParams(const opencv_onnx::NodeProto& node_proto);
 
@@ -101,6 +108,7 @@ protected:
     std::string framework_name;
 
     std::map<std::string, Mat> constBlobs;
+    std::map<std::string, TensorInfo> constBlobsExtraInfo;
 
     std::map<std::string, MatShape> outShapes;  // List of internal blobs shapes.
     bool hasDynamicShapes;  // Whether the model has inputs with dynamic shapes
@@ -134,9 +142,6 @@ private:
     void parseReduce               (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseSlice                (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseSplit                (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
-    void parseBias                 (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
-    void parsePow                  (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
-    void parseMinMax               (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseNeg                  (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseConstant             (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseLSTM                 (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
@@ -148,14 +153,12 @@ private:
     void parseElu                  (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseTanh                 (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseAbs                  (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
-    void parseCompare              (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parsePRelu                (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseLRN                  (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseInstanceNormalization(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseBatchNormalization   (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseGemm                 (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseMatMul               (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
-    void parseMul                  (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseConv                 (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseConvTranspose        (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseTranspose            (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
@@ -175,6 +178,7 @@ private:
     void parseSoftMax              (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseDetectionOutput      (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseCumSum               (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseElementWise          (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseDepthToSpace         (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseSimpleLayers         (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
 
@@ -399,6 +403,7 @@ std::map<std::string, Mat> ONNXImporter::getGraphTensors(
             continue;
 
         layers_weights.insert(std::make_pair(tensor_proto.name(), mat));
+        constBlobsExtraInfo.insert(std::make_pair(tensor_proto.name(), TensorInfo(tensor_proto.dims_size())));
     }
     return layers_weights;
 }
@@ -506,6 +511,7 @@ LayerParams ONNXImporter::getLayerParams(const opencv_onnx::NodeProto& node_prot
                 opencv_onnx::TensorProto tensor = attribute_proto.t();
                 Mat blob = getMatFromTensor(tensor);
                 lp.blobs.push_back(blob);
+                lp.set("original_dims_of_mat", tensor.dims_size());
             }
             else if (attribute_proto.has_g())
             {
@@ -573,6 +579,23 @@ Mat ONNXImporter::getBlob(const std::string& input_name)
     return constBlob->second;
 }
 
+ONNXImporter::TensorInfo ONNXImporter::getBlobExtraInfo(const opencv_onnx::NodeProto &node_proto, int index)
+{
+    CV_Assert(index < node_proto.input_size());
+    const std::string& input_name = node_proto.input(index);
+    return getBlobExtraInfo(input_name);
+}
+
+ONNXImporter::TensorInfo ONNXImporter::getBlobExtraInfo(const std::string& input_name)
+{
+    std::map<std::string, TensorInfo>::const_iterator constBlobExtraInfo = constBlobsExtraInfo.find(input_name);
+    if (constBlobExtraInfo == constBlobsExtraInfo.end())
+    {
+        CV_Error(Error::StsBadArg, std::string("Blob ") + input_name + " not found in const blobs of extra info");
+    }
+    return constBlobExtraInfo->second;
+}
+
 void ONNXImporter::addLayer(LayerParams& layerParams,
                             const opencv_onnx::NodeProto& node_proto)
 {
@@ -1429,145 +1452,6 @@ void ONNXImporter::parseSplit(LayerParams& layerParams, const opencv_onnx::NodeP
     addLayer(layerParams, node_proto);
 }
 
-void ONNXImporter::parseBias(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
-{
-    opencv_onnx::NodeProto node_proto = node_proto_;
-    const std::string& layer_type = node_proto.op_type();
-    bool isSub = layer_type == "Sub";
-
-    if (layer_type == "Sum" && node_proto.input_size() == 1)
-    {
-        layerParams.type = "Identity";
-        addLayer(layerParams, node_proto);
-        return;
-    }
-
-    CV_Assert((node_proto.input_size() == 2) || (layer_type == "Sum" && node_proto.input_size() > 2));
-
-    if (layer_type == "Sum" && node_proto.input_size() > 2)
-    {
-        for (int i = 0; i < node_proto.input_size(); ++i)
-        {
-            if (layer_id.find(node_proto.input(i)) == layer_id.end())
-            {
-                CV_Error(Error::StsNotImplemented, "Sum of constants is not implemented for inputs > 2");
-            }
-        }
-    }
-
-    bool is_const_0 = layer_id.find(node_proto.input(0)) == layer_id.end();
-    bool is_const_1 = layer_id.find(node_proto.input(1)) == layer_id.end();
-    if (is_const_0 && is_const_1)
-    {
-        Mat blob_0 = getBlob(node_proto, 0);
-        Mat blob_1 = getBlob(node_proto, 1);
-        CV_Assert(blob_0.size == blob_1.size);
-        Mat output = isSub ? (blob_0 - blob_1) : (blob_0 + blob_1);
-        addConstant(node_proto.output(0), output);
-        return;
-    }
-    else if (is_const_0 || is_const_1)
-    {
-        int const_blob_id = is_const_0 ? 0 : 1;
-        int input_id = 1 - const_blob_id;
-        Mat blob = getBlob(node_proto, const_blob_id);
-        int blob_total = blob.total();
-
-        const float inputScale = isSub && is_const_0 ? -1.f : 1.f;
-        const float constScale = isSub && is_const_1 ? -1.f : 1.f;
-
-        if (blob_total == 1) {
-            layerParams.type = "Power";
-            layerParams.set("scale", inputScale);
-            layerParams.set("shift", constScale * blob.ptr<float>()[0]);
-        }
-        else {
-            MatShape inpShape = outShapes[node_proto.input(input_id)];
-            if (shape(blob) == inpShape)
-            {
-                LayerParams constParams;
-                constParams.name = layerParams.name + "/const";
-                constParams.type = "Const";
-                constParams.blobs.push_back(blob);
-                int id = dstNet.addLayer(constParams.name, constParams.type, constParams);
-                layer_id.insert(std::make_pair(constParams.name, LayerInfo(id, 0)));
-                outShapes[constParams.name] = shape(blob);
-
-                layerParams.type = "Eltwise";
-                float coeffs[] = {1., isSub ? -1.f : 1.f};
-                layerParams.set("coeff", DictValue::arrayReal<float*>(coeffs, 2));
-                node_proto.set_input(const_blob_id, constParams.name);
-            }
-            else
-            {
-                if (inputScale < 0.f)
-                {
-                    addNegation(layerParams, node_proto, input_id);
-                }
-
-                layerParams.type = "Scale";
-                layerParams.set("bias_term", true);
-                int axis = 1;
-                for (int i = 0; i < graph_proto.initializer_size(); i++)
-                {
-                    opencv_onnx::TensorProto tensor_proto = graph_proto.initializer(i);
-                    if (tensor_proto.name() == node_proto.input(const_blob_id))
-                    {
-                        axis = inpShape.size() - tensor_proto.dims_size();
-                        break;
-                    }
-                }
-                layerParams.set("axis", axis);
-                blob = blob.reshape(1, 1);
-                layerParams.blobs.push_back(constScale * blob);
-            }
-        }
-    }
-    else if (outShapes[node_proto.input(0)] == outShapes[node_proto.input(1)])
-    {
-        layerParams.type = "Eltwise";
-        if (isSub)
-        {
-            static float subCoeffs[] = {1.f, -1.f};
-            layerParams.set("coeff", DictValue::arrayReal<float*>(subCoeffs, 2));
-        }
-    }
-    else
-    {
-        if (isSub)
-        {
-            addNegation(layerParams, node_proto, 1);
-        }
-        layerParams.type = "Scale";
-        layerParams.set("bias_term", true);
-    }
-    addLayer(layerParams, node_proto);
-}
-
-void ONNXImporter::parsePow(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
-{
-    if (layer_id.find(node_proto.input(1)) != layer_id.end())
-        CV_Error(Error::StsNotImplemented, "Unsupported Pow op with variable power");
-
-    Mat blob = getBlob(node_proto, 1);
-    if (blob.total() != 1)
-        CV_Error(Error::StsNotImplemented, "Pow op supports only scalar power");
-
-    blob.convertTo(blob, CV_32F);
-    layerParams.type = "Power";
-    layerParams.set("power", blob.ptr<float>()[0]);
-    addLayer(layerParams, node_proto);
-}
-
-// "Min" "Max"
-void ONNXImporter::parseMinMax(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
-{
-    const std::string& layer_type = node_proto.op_type();
-    layerParams.type = "Eltwise";
-    layerParams.set("operation", layer_type == "Max" ? "max" : "min");
-    addLayer(layerParams, node_proto);
-}
-
 void ONNXImporter::parseNeg(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
 {
     layerParams.type = "Power";
@@ -1580,6 +1464,12 @@ void ONNXImporter::parseConstant(LayerParams& layerParams, const opencv_onnx::No
     CV_Assert(node_proto.input_size() == 0);
     CV_Assert(layerParams.blobs.size() == 1);
     addConstant(node_proto.output(0), layerParams.blobs[0]);
+    // add constant for constBlobsExtraInfo
+    if (layerParams.has("original_dims_of_mat"))
+    {
+        int original_dims_of_mat = layerParams.get<int>("original_dims_of_mat");
+        constBlobsExtraInfo.insert(std::make_pair(node_proto.output(0), TensorInfo(original_dims_of_mat)));
+    }
 }
 
 void transformBlobs(std::vector<Mat>& blobs)
@@ -1988,32 +1878,6 @@ void ONNXImporter::parseAbs(LayerParams& layerParams, const opencv_onnx::NodePro
     addLayer(layerParams, node_proto);
 }
 
-void ONNXImporter::parseCompare(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
-{
-    CV_Assert(node_proto.input_size() == 2);
-    const std::string& layer_type = node_proto.op_type();
-
-    bool is_const_0 = layer_id.find(node_proto.input(0)) == layer_id.end();
-    bool is_const_1 = layer_id.find(node_proto.input(1)) == layer_id.end();
-
-    if (is_const_0 || is_const_1)
-    {
-        Mat blob = getBlob(node_proto, static_cast<int>(is_const_1));
-        blob = blob.reshape(1, 1);
-        layerParams.blobs.push_back(blob);
-    }
-
-    layerParams.type = "Compare";
-
-    if (layer_type == "Equal")
-        layerParams.set("mode", "equal");
-    else if (layer_type == "Greater")
-        layerParams.set("mode", "greater");
-    else
-        layerParams.set("mode", "less");
-    addLayer(layerParams, node_proto);
-}
-
 void ONNXImporter::parsePRelu(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
 {
     layerParams.type = "PReLU";
@@ -2189,169 +2053,6 @@ void findBroadAxis(const MatShape& broadShape, const MatShape& outShape, size_t&
     axis += diff;
 }
 
-// "Mul" "Div"
-void ONNXImporter::parseMul(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
-{
-    opencv_onnx::NodeProto node_proto = node_proto_;
-    const std::string& layer_type = node_proto.op_type();
-    const std::string output_name = node_proto.output(0);
-    CV_Assert(node_proto.input_size() == 2);
-
-    bool isDiv = layer_type == "Div";
-    int constId = -1;
-    bool haveVariables = false;
-    for (int i = 0; i < 2; ++i)
-    {
-        if (constBlobs.find(node_proto.input(i)) != constBlobs.end())
-            constId = i;
-        else
-            haveVariables = true;
-    }
-    if (constId != -1 && haveVariables)
-    {
-        Mat blob = getBlob(node_proto, constId);
-        blob = blob.reshape(1, 1);
-        if (blob.total() == 1) {
-            float blob_value = blob.ptr<float>()[0];
-            float coeff = blob_value;
-            if (isDiv)
-            {
-                coeff = 1.f / blob_value;
-                if (constId == 0)
-                {
-                    // Power layer calculates (x*scale + shift)^power, so const/x -> (x * (1/const) + 0)^(-1)
-                    layerParams.set("power", -1.f);
-                }
-            }
-            layerParams.set("scale", coeff);
-            layerParams.type = "Power";
-        }
-        else {
-            if (isDiv)
-                divide(1.0, blob, blob);
-            layerParams.blobs.push_back(blob);
-            layerParams.type = "Scale";
-        }
-    }
-    else if (!haveVariables)
-    {
-        Mat inp0 = getBlob(node_proto, 0);
-        Mat inp1 = getBlob(node_proto, 1);
-
-        if (inp0.size != inp1.size && (inp0.total() != 1 || inp1.total() != 1))
-            CV_Error_(Error::StsNotImplemented, ("Different shapes case is not supported with constant inputs: %s", layer_type.c_str()));
-
-        if (inp0.total() == 1 && inp1.total() == 1 && inp0.dims != inp1.dims)
-        {
-            if (inp0.dims < inp1.dims)
-            {
-                inp0 = inp0.reshape(1, inp1.dims, inp1.size);
-                inp0.dims = inp1.dims;
-            }
-            else
-            {
-                inp1 = inp1.reshape(1, inp0.dims, inp0.size);
-                inp1.dims = inp0.dims;
-            }
-        }
-
-        Mat out;
-        if (inp0.total() != inp1.total())
-        {
-            if (inp0.total() == 1)
-            {
-                float inp0_value = inp0.ptr<float>()[0];
-                float coeff = isDiv ? 1.0 / inp0_value : inp0_value;
-                multiply(inp1, coeff, out);
-            }
-            else
-            {
-                float inp1_value = inp1.ptr<float>()[0];
-                float coeff = isDiv ? 1.0 / inp1_value : inp1_value;
-                multiply(inp0, coeff, out);
-            }
-
-        }
-        else
-        {
-            out = isDiv ? inp0 / inp1 : inp0.mul(inp1);
-        }
-
-        if (inp0.dims == 1 && inp1.dims == 1)
-            out.dims = 1;  // to workaround dims == 1
-        addConstant(output_name, out);
-        return;
-    }
-    else if (outShapes[node_proto.input(0)] == outShapes[node_proto.input(1)])
-    {
-        layerParams.type = "Eltwise";
-        layerParams.set("operation", isDiv ? "div" : "prod");
-    }
-    else
-    {
-        // Scale layer allocate output with the first input shape
-        if (total(outShapes[node_proto.input(0)]) < total(outShapes[node_proto.input(1)]))
-        {
-            opencv_onnx::NodeProto proto;
-            proto.add_input(node_proto.input(1));
-            proto.add_input(node_proto.input(0));
-            proto.add_output(output_name);
-            node_proto = proto;
-        }
-
-        if (isDiv)
-        {
-            LayerParams powerParams;
-            powerParams.name = layerParams.name + "/inv";
-            powerParams.type = "Power";
-            powerParams.set("power", -1);
-
-            //Create Power layer
-            int id = dstNet.addLayer(powerParams.name, powerParams.type, powerParams);
-            //Connect to input
-            IterLayerId_t layerId = layer_id.find(node_proto.input(1));
-            CV_Assert(layerId != layer_id.end());
-            dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, 0);
-            //Add shape
-            layer_id.insert(std::make_pair(powerParams.name, LayerInfo(id, 0)));
-            outShapes[powerParams.name] = outShapes[node_proto.input(1)];
-
-            //Replace input to Power
-            node_proto.set_input(1, powerParams.name);
-        }
-
-        const MatShape& broadShape = outShapes[node_proto.input(1)];
-        const MatShape& outShape = outShapes[node_proto.input(0)];
-
-        size_t axis = 0;
-        int broadAxis = -1;
-        findBroadAxis(broadShape, outShape, axis, broadAxis);
-
-        // if there is a one dimension in the middle that should be broadcasted, broadcast it
-        if (broadAxis != -1)
-        {
-            opencv_onnx::NodeProto concat_node_proto = node_proto;
-            const std::string& input1 = concat_node_proto.input(1);
-
-            expandMid(layerParams.name, concat_node_proto, input1, outShape[broadAxis]);
-
-            LayerParams concatLP;
-            concatLP.name = layerParams.name + "/concat";
-            concatLP.set("axis", broadAxis);
-            concatLP.type = "Concat";
-            concat_node_proto.set_output(0, concatLP.name);
-
-            addLayer(concatLP, concat_node_proto);
-            node_proto.set_input(1, concatLP.name);
-        }
-
-        CV_Assert(axis != outShape.size());
-        layerParams.set("axis", static_cast<int>(axis));
-        layerParams.type = "Scale";
-    }
-    addLayer(layerParams, node_proto);
-}
-
 void ONNXImporter::parseConv(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
 {
     opencv_onnx::NodeProto node_proto = node_proto_;
@@ -2542,6 +2243,10 @@ void ONNXImporter::parseFlatten(LayerParams& layerParams, const opencv_onnx::Nod
     if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
     {
         Mat input = getBlob(node_proto, 0);
+        if (constBlobsExtraInfo.find(node_proto.input(0)) != constBlobsExtraInfo.end())
+        {
+            constBlobsExtraInfo.insert(std::make_pair(node_proto.output(0), getBlobExtraInfo(node_proto, 0)));
+        }
         int axis = normalize_axis(axis_, input.dims);
 
         int out_size[2] = {1, 1};
@@ -2614,12 +2319,16 @@ void ONNXImporter::parseUnsqueeze(LayerParams& layerParams, const opencv_onnx::N
     {
         // Constant input.
         Mat input = getBlob(node_proto, 0);
+        int input_dims = input.dims;
+        if (constBlobsExtraInfo.find(node_proto.input(0)) != constBlobsExtraInfo.end())
+            if (getBlobExtraInfo(node_proto, 0).real_ndims == 1)
+                input_dims = 1;
 
         std::vector<int> dims;
-        for (int j = 0; j < input.dims; j++) {
+        for (int j = 0; j < input_dims; j++) {
             dims.push_back(input.size[j]);
         }
-        CV_Assert(axes.getIntValue(axes.size()-1) <= dims.size());
+//        CV_Assert(axes.getIntValue(axes.size()-1) <= dims.size());
         for (int j = 0; j < axes.size(); j++) {
             const int idx = axes.getIntValue(j);
             CV_Assert(idx <= dims.size());
@@ -2874,6 +2583,10 @@ void ONNXImporter::parseCast(LayerParams& layerParams, const opencv_onnx::NodePr
     if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
     {
         Mat blob = getBlob(node_proto, 0);
+        if (constBlobsExtraInfo.find(node_proto.input(0)) != constBlobsExtraInfo.end())
+        {
+            constBlobsExtraInfo.insert(std::make_pair(node_proto.output(0), getBlobExtraInfo(node_proto, 0)));
+        }
         int type;
         switch (layerParams.get<int>("to"))
         {
@@ -3011,6 +2724,10 @@ void ONNXImporter::parseConcat(LayerParams& layerParams, const opencv_onnx::Node
             break;
         }
     }
+    if (constBlobsExtraInfo.find(node_proto.input(0)) != constBlobsExtraInfo.end())
+    {
+        constBlobsExtraInfo.insert(std::make_pair(node_proto.output(0), getBlobExtraInfo(node_proto, 0)));
+    }
 
     if (!hasVariableInps)
     {
@@ -3223,6 +2940,78 @@ void ONNXImporter::parseCumSum(LayerParams& layerParams, const opencv_onnx::Node
     addLayer(layerParams, node_proto);
 }
 
+// "Equal" "Greater" "Less" "Pow" "Add" "Sub" "Mul" "Div" "Sum" "Min" "Max"
+void ONNXImporter::parseElementWise(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
+{
+    opencv_onnx::NodeProto node_proto = node_proto_;
+    String op_type = toLowerCase(node_proto.op_type());
+
+    layerParams.type = "NaryEltwise";
+    layerParams.set("operation", toLowerCase(node_proto.op_type()));
+
+    // element-wise layers that can have >=1 inputs but actually have one input
+    if (node_proto.input_size() == 1 && (op_type == "max" || op_type == "min" || op_type == "mean" || op_type == "sum"))
+    {
+        layerParams.type = "Identity";
+        addLayer(layerParams, node_proto);
+        return;
+    }
+
+    auto pre_broadcast_transform = [](Mat& t, int t_real_ndims) {
+        if (t.dims == 2 && t_real_ndims == 1 && t.size[1] == 1)
+            transpose(t, t);
+    };
+
+    size_t consts = 0;
+    for (size_t i = 0; i < node_proto.input_size(); ++i)
+    {
+        if (layer_id.find(node_proto.input(i)) == layer_id.end())
+        {
+            ++consts;
+        }
+    }
+
+    if (consts == node_proto.input_size())
+    {
+        std::vector<Mat> inputs, output;
+        for (size_t i = 0; i < node_proto.input_size(); ++i)
+        {
+            inputs.push_back(getBlob(node_proto, i));
+        }
+        runLayer(layerParams, inputs, output);
+        CV_Assert(output.size() == 1);
+        addConstant(node_proto.output(0), output[0]);
+        return;
+    }
+    else if (consts > 0)
+    {
+        for (size_t i = 0; i < node_proto.input_size(); ++i)
+        {
+            if (layer_id.find(node_proto.input(i)) == layer_id.end())
+            {
+                Mat inp = getBlob(node_proto, i);
+                // for cases like a tensor of shape (2,), it will be loaded as shape (2, 1) in OpenCV Mat,
+                // but for correct broadcast, we need to make it of shape (1, 2)
+                if (constBlobsExtraInfo.find(node_proto.input(i)) != constBlobsExtraInfo.end())
+                    pre_broadcast_transform(inp, getBlobExtraInfo(node_proto, i).real_ndims);
+
+                // carry the constant by adding a Const node
+                LayerParams constParams;
+                constParams.name = node_proto.input(i);
+                constParams.type = "Const";
+                constParams.blobs.push_back(inp);
+
+                opencv_onnx::NodeProto proto;
+                proto.add_output(constParams.name);
+                addLayer(constParams, proto);
+            }
+        }
+    }
+
+    // add element-wise layer
+    addLayer(layerParams, node_proto);
+}
+
 void ONNXImporter::parseDepthToSpace(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
 {
     // We parse "DepthToSpace" and "SpaceToDepth" in this function.
@@ -3794,9 +3583,6 @@ void ONNXImporter::buildDispatchMap_ONNX_AI(int opset_version)
             dispatch["ReduceL2"] = dispatch["ReduceLogSum"] = dispatch["ReduceLogSumExp"] = &ONNXImporter::parseReduce;
     dispatch["Slice"] = &ONNXImporter::parseSlice;
     dispatch["Split"] = &ONNXImporter::parseSplit;
-    dispatch["Add"] = dispatch["Sum"] = dispatch["Sub"] = &ONNXImporter::parseBias;
-    dispatch["Pow"] = &ONNXImporter::parsePow;
-    dispatch["Min"] = dispatch["Max"] = &ONNXImporter::parseMinMax;
     dispatch["Neg"] = &ONNXImporter::parseNeg;
     dispatch["Constant"] = &ONNXImporter::parseConstant;
     dispatch["LSTM"] = &ONNXImporter::parseLSTM;
@@ -3808,14 +3594,12 @@ void ONNXImporter::buildDispatchMap_ONNX_AI(int opset_version)
     dispatch["Elu"] = &ONNXImporter::parseElu;
     dispatch["Tanh"] = &ONNXImporter::parseTanh;
     dispatch["Abs"] = &ONNXImporter::parseAbs;
-    dispatch["Equal"] = dispatch["Greater"] = dispatch["Less"] = &ONNXImporter::parseCompare;
     dispatch["PRelu"] = &ONNXImporter::parsePRelu;
     dispatch["LRN"] = &ONNXImporter::parseLRN;
     dispatch["InstanceNormalization"] = &ONNXImporter::parseInstanceNormalization;
     dispatch["BatchNormalization"] = &ONNXImporter::parseBatchNormalization;
     dispatch["Gemm"] = &ONNXImporter::parseGemm;
     dispatch["MatMul"] = &ONNXImporter::parseMatMul;
-    dispatch["Mul"] = dispatch["Div"] = &ONNXImporter::parseMul;
     dispatch["Conv"] = &ONNXImporter::parseConv;
     dispatch["ConvTranspose"] = &ONNXImporter::parseConvTranspose;
     dispatch["Transpose"] = &ONNXImporter::parseTranspose;
@@ -3837,6 +3621,10 @@ void ONNXImporter::buildDispatchMap_ONNX_AI(int opset_version)
     dispatch["CumSum"] = &ONNXImporter::parseCumSum;
     dispatch["SpaceToDepth"] = dispatch["DepthToSpace"] = &ONNXImporter::parseDepthToSpace;
 
+    dispatch["Equal"] = dispatch["Greater"] = dispatch["Less"] = dispatch["Pow"] = dispatch["Add"] =
+            dispatch["Sub"] = dispatch["Mul"] = dispatch["Div"] = &ONNXImporter::parseElementWise;
+    dispatch["Sum"] = dispatch["Min"] = dispatch["Max"] = &ONNXImporter::parseElementWise;
+
     std::vector<std::string> simpleLayers{"Acos", "Acosh", "Asin", "Asinh", "Atan", "Atanh", "Ceil", "Celu", "Cos",
                                           "Cosh", "Dropout", "Erf", "Exp", "Floor", "HardSigmoid", "HardSwish",
                                           "Identity", "Log", "Round", "Reciprocal", "Selu", "Sign", "Sigmoid", "Sin", "Sinh", "Softmax",
diff --git a/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_all_denylist.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_all_denylist.inl.hpp
index dd0965904a..292cd2a066 100644
--- a/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_all_denylist.inl.hpp
+++ b/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_all_denylist.inl.hpp
@@ -55,3 +55,5 @@
 "test_sub_bcast",
 "test_sub_uint8",  // output type mismatch
 "test_upsample_nearest",
+"test_div_bcast", // remove when 1D Mat is supported
+"test_mul_bcast", // remove when 1D Mat is supported
diff --git a/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_cpu_denylist.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_cpu_denylist.inl.hpp
index e69de29bb2..8b13789179 100644
--- a/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_cpu_denylist.inl.hpp
+++ b/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_cpu_denylist.inl.hpp
@@ -0,0 +1 @@
+

From 697acf7f6a1e96fc7df4586f7900da9b863c3ff8 Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <andrey.senyaev@gmail.com>
Date: Tue, 19 Jul 2022 13:10:05 +0300
Subject: [PATCH 135/178] Linux ARM64 rename ubuntu version on 3.4

---
 .github/workflows/PR-3.4.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/PR-3.4.yaml b/.github/workflows/PR-3.4.yaml
index fcfa924e96..8d954a26cc 100644
--- a/.github/workflows/PR-3.4.yaml
+++ b/.github/workflows/PR-3.4.yaml
@@ -6,7 +6,7 @@ on:
       - 3.4
 
 jobs:
-  Ubuntu1804-ARM64:
+  Ubuntu2004-ARM64:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-3.4-ARM64.yaml@main
 
   Ubuntu2004-x64:

From 65c173f2a398a5e1e46e237646fb31bdbdc45431 Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <andrey.senyaev@gmail.com>
Date: Tue, 19 Jul 2022 13:12:50 +0300
Subject: [PATCH 136/178] Linux ARM64 rename ubuntu version on 4.x

---
 .github/workflows/PR-4.x.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/PR-4.x.yaml b/.github/workflows/PR-4.x.yaml
index 44970006f9..e5e624a35e 100644
--- a/.github/workflows/PR-4.x.yaml
+++ b/.github/workflows/PR-4.x.yaml
@@ -6,7 +6,7 @@ on:
       - 4.x
 
 jobs:
-  Ubuntu1804-ARM64:
+  Ubuntu2004-ARM64:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-ARM64.yaml@main
 
   Ubuntu2004-x64:

From 1feabf42755a3c6011674b40aed9d735048c97fd Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <andrey.senyaev@gmail.com>
Date: Tue, 19 Jul 2022 13:52:29 +0300
Subject: [PATCH 137/178] Fixed an issue with a recursion of cv2 in python

---
 modules/python/package/cv2/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/python/package/cv2/__init__.py b/modules/python/package/cv2/__init__.py
index f8e631382c..086fc48dec 100644
--- a/modules/python/package/cv2/__init__.py
+++ b/modules/python/package/cv2/__init__.py
@@ -40,7 +40,7 @@ def bootstrap():
     BINARIES_PATHS = []
 
     g_vars = globals()
-    l_vars = locals()
+    l_vars = locals().copy()
 
     if sys.version_info[:2] < (3, 0):
         from . load_config_py2 import exec_file_wrapper

From 14aa9eaadd299600229f71cb25862f737a38a8c3 Mon Sep 17 00:00:00 2001
From: Tomoaki Teshima <tomoaki.teshima@gmail.com>
Date: Tue, 19 Jul 2022 21:44:07 +0900
Subject: [PATCH 138/178] doc: fix layout

---
 .../introduction/config_reference/config_reference.markdown    | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/tutorials/introduction/config_reference/config_reference.markdown b/doc/tutorials/introduction/config_reference/config_reference.markdown
index 388b0cc7c1..81607b0086 100644
--- a/doc/tutorials/introduction/config_reference/config_reference.markdown
+++ b/doc/tutorials/introduction/config_reference/config_reference.markdown
@@ -205,11 +205,12 @@ cmake -DCV_DISABLE_OPTIMIZATION=ON ../opencv
 More details on CPU optimization options can be found in wiki: https://github.com/opencv/opencv/wiki/CPU-optimizations-build-options
 
 
-## Profiling, coverage, sanitize, hardening, size optimization
+## Profiling, coverage, sanitize, hardening, size optimization {#profiling_coverage_sanitize_hardening_size_optimization}
 
 Following options can be used to produce special builds with instrumentation or improved security. All options are disabled by default.
 
 | Option | Compiler | Description |
+| -------| -------- | ----------- |
 | `ENABLE_PROFILING` | GCC or Clang | Enable profiling compiler and linker options. |
 | `ENABLE_COVERAGE` | GCC or Clang | Enable code coverage support. |
 | `OPENCV_ENABLE_MEMORY_SANITIZER` | N/A | Enable several quirks in code to assist memory sanitizer. |

From 0ef803950bbd1c1c65ef724b2e3f1c84205bc836 Mon Sep 17 00:00:00 2001
From: HAN Liutong <liutong2020@iscas.ac.cn>
Date: Wed, 20 Jul 2022 01:02:00 +0800
Subject: [PATCH 139/178] Merge pull request #22179 from hanliutong:new-rvv

[GSoC] New universal intrinsic backend for RVV

* Add new rvv backend (partially implemented).

* Modify the framework of Universal Intrinsic.

* Add CV_SIMD macro guards to current UI code.

* Use vlanes() instead of nlanes.

* Modify the UI test.

* Enable the new RVV (scalable) backend.

* Remove whitespace.

* Rename and some others modify.

* Update intrin.hpp but still not work on AVX/SSE

* Update conditional compilation macros.

* Use static variable for vlanes.

* Use max_nlanes for array defining.
---
 modules/calib3d/src/stereosgbm.cpp            |  21 +-
 .../core/include/opencv2/core/hal/intrin.hpp  | 434 ++++++++++++-
 .../opencv2/core/hal/intrin_rvv_scalable.hpp  | 493 ++++++++++++++
 .../opencv2/core/hal/simd_utils.impl.hpp      |  42 +-
 modules/core/src/arithm.simd.hpp              |  20 +
 modules/core/test/test_intrin128.simd.hpp     |   2 +-
 modules/core/test/test_intrin_utils.hpp       | 610 ++++++++++++------
 .../fluid/gfluidcore_func.dispatch.cpp        |   5 +-
 .../src/backends/fluid/gfluidcore_func.hpp    |   2 +-
 modules/imgproc/src/color_lab.cpp             |   2 +
 modules/imgproc/src/color_yuv.simd.hpp        |   8 +
 modules/imgproc/src/median_blur.simd.hpp      | 203 ++----
 platforms/linux/riscv64-clang.toolchain.cmake |   4 +
 13 files changed, 1486 insertions(+), 360 deletions(-)
 create mode 100644 modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp

diff --git a/modules/calib3d/src/stereosgbm.cpp b/modules/calib3d/src/stereosgbm.cpp
index 70eb3c658a..e30973ec94 100644
--- a/modules/calib3d/src/stereosgbm.cpp
+++ b/modules/calib3d/src/stereosgbm.cpp
@@ -177,7 +177,7 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
 {
     int x, c, width = img1.cols, cn = img1.channels();
     int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
-    int D = (int)alignSize(maxD - minD, v_int16::nlanes), width1 = maxX1 - minX1;
+    int D = (int)alignSize(maxD - minD, VTraits<v_int16>::vlanes()), width1 = maxX1 - minX1;
     //This minX1 & maxX2 correction is defining which part of calculatable line must be calculated
     //That is needs of parallel algorithm
     xrange_min = (xrange_min < 0) ? 0: xrange_min;
@@ -502,8 +502,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
     int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
     const int D = params.numDisparities;
     int width1 = maxX1 - minX1;
-    int Da = (int)alignSize(D, v_int16::nlanes);
-    int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
+    int Da = (int)alignSize(D,VTraits<v_int16>::vlanes());
+    int Dlra = Da + VTraits<v_int16>::vlanes();//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
     int SW2 = params.calcSADWindowSize().width/2, SH2 = params.calcSADWindowSize().height/2;
     int npasses = params.isFullDP() ? 2 : 1;
@@ -977,11 +977,10 @@ struct CalcVerticalSums: public ParallelLoopBody
         width = img1.cols;
         int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
         D = maxD - minD;
-        Da = (int)alignSize(D, v_int16::nlanes);
-        Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
+        Da = (int)alignSize(D, VTraits<v_int16>::vlanes());
+        Dlra = Da + VTraits<v_int16>::vlanes();//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
         width1 = maxX1 - minX1;
         D = params.numDisparities;
-        Da = (int)alignSize(D, v_int16::nlanes);
     }
 
     void operator()(const Range& range) const CV_OVERRIDE
@@ -1235,8 +1234,8 @@ struct CalcHorizontalSums: public ParallelLoopBody
         INVALID_DISP = minD - 1;
         INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
         D = maxD - minD;
-        Da = (int)alignSize(D, v_int16::nlanes);
-        Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
+        Da = (int)alignSize(D, VTraits<v_int16>::vlanes());
+        Dlra = Da + VTraits<v_int16>::vlanes();//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
         width1 = maxX1 - minX1;
     }
 
@@ -1484,8 +1483,8 @@ static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2,
     int width = disp1.cols, height = disp1.rows;
     int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
     int width1 = maxX1 - minX1;
-    int Da = (int)alignSize(params.numDisparities, v_int16::nlanes);
-    int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
+    int Da = (int)alignSize(params.numDisparities, VTraits<v_int16>::vlanes());
+    int Dlra = Da + VTraits<v_int16>::vlanes();//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
     int INVALID_DISP = minD - 1;
     int INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
 
@@ -1630,7 +1629,7 @@ SGBM3WayMainLoop::SGBM3WayMainLoop(const Mat& _img1,
     width = img1->cols; height = img1->rows;
     minD = params.minDisparity; maxD = minD + params.numDisparities; D = maxD - minD;
     minX1 = std::max(maxD, 0); maxX1 = width + std::min(minD, 0); width1 = maxX1 - minX1;
-    Da = (int)alignSize(D, v_int16::nlanes);
+    Da = (int)alignSize(D, VTraits<v_int16>::vlanes());
 
     SW2 = SH2 = params.SADWindowSize > 0 ? params.SADWindowSize/2 : 1;
 
diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp
index ac331f2154..0041030dad 100644
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@@ -200,7 +200,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
 #   undef CV_RVV
 #endif
 
-#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071 || CV_RVV) && !defined(CV_FORCE_SIMD128_CPP)
+#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071) && !defined(CV_FORCE_SIMD128_CPP)
 #define CV__SIMD_FORWARD 128
 #include "opencv2/core/hal/intrin_forward.hpp"
 #endif
@@ -229,9 +229,10 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
 #elif CV_WASM_SIMD && !defined(CV_FORCE_SIMD128_CPP)
 #include "opencv2/core/hal/intrin_wasm.hpp"
 
-#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP)
+#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) && !defined(CV_RVV_SCALABLE)
 #include "opencv2/core/hal/intrin_rvv.hpp"
-
+#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) && CV_RVV_SCALABLE
+#include "opencv2/core/hal/intrin_rvv_scalable.hpp"
 #else
 
 #include "opencv2/core/hal/intrin_cpp.hpp"
@@ -314,6 +315,14 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
 #define CV_SIMD512_FP16 0
 #endif
 
+#ifndef CV_SIMD_SCALABLE
+#define CV_SIMD_SCALABLE 0
+#endif
+
+#ifndef CV_SIMD_SCALABLE_64F
+#define CV_SIMD_SCALABLE_64F 0
+#endif
+
 //==================================================================================================
 
 template<typename _Tp> struct V_RegTraits
@@ -375,6 +384,18 @@ template<typename _Tp> struct V_RegTraits
     CV_DEF_REG_TRAITS(v512, v_int64x8, int64, s64, v_uint64x8, void, void, v_int64x8, void);
     CV_DEF_REG_TRAITS(v512, v_float64x8, double, f64, v_float64x8, void, void, v_int64x8, v_int32x16);
 #endif
+#if CV_SIMD_SCALABLE
+    CV_DEF_REG_TRAITS(v, v_uint8, uchar, u8, v_uint8, v_uint16, v_uint32, v_int8, void);
+    CV_DEF_REG_TRAITS(v, v_int8, schar, s8, v_uint8, v_int16, v_int32, v_int8, void);
+    CV_DEF_REG_TRAITS(v, v_uint16, ushort, u16, v_uint16, v_uint32, v_uint64, v_int16, void);
+    CV_DEF_REG_TRAITS(v, v_int16, short, s16, v_uint16, v_int32, v_int64, v_int16, void);
+    CV_DEF_REG_TRAITS(v, v_uint32, unsigned, u32, v_uint32, v_uint64, void, v_int32, void);
+    CV_DEF_REG_TRAITS(v, v_int32, int, s32, v_uint32, v_int64, void, v_int32, void);
+    CV_DEF_REG_TRAITS(v, v_float32, float, f32, v_float32, v_float64, void, v_int32, v_int32);
+    CV_DEF_REG_TRAITS(v, v_uint64, uint64, u64, v_uint64, void, void, v_int64, void);
+    CV_DEF_REG_TRAITS(v, v_int64, int64, s64, v_uint64, void, void, v_int64, void);
+    CV_DEF_REG_TRAITS(v, v_float64, double, f64, v_float64, void, void, v_int64, v_int32);
+#endif
 //! @endcond
 
 #if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
@@ -488,6 +509,17 @@ namespace CV__SIMD_NAMESPACE {
     #define VXPREFIX(func) v##func
 } // namespace
 using namespace CV__SIMD_NAMESPACE;
+
+#elif CV_SIMD_SCALABLE
+#define CV__SIMD_NAMESPACE simd
+namespace CV__SIMD_NAMESPACE {
+    #define CV_SIMD 0
+    #define CV_SIMD_WIDTH 128  /* 1024/8 */
+
+    #define VXPREFIX(func) v##func
+} // namespace
+using namespace CV__SIMD_NAMESPACE;
+
 #endif
 
 namespace CV__SIMD_NAMESPACE {
@@ -663,6 +695,402 @@ namespace CV__SIMD_NAMESPACE {
     /** @brief SIMD processing state cleanup call */
     inline void vx_cleanup() { VXPREFIX(_cleanup)(); }
 
+#if CV_SIMD
+    // Compatibility layer
+    #define CV_SIMD_SCALABLE 0
+    #define CV_SIMD_SCALABLE_64F 0
+
+    template <class T>
+    struct VTraits;
+#if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
+    template <>
+    struct VTraits<v_uint8>
+    {
+        static inline int vlanes() { return v_uint8::nlanes; }
+        enum { nlanes = 64, max_nlanes = nlanes };
+        using lane_type = uchar;
+    };
+    template <>
+    struct VTraits<v_int8>
+    {
+        static inline int vlanes() { return v_int8::nlanes; }
+        enum { nlanes = 64, max_nlanes = nlanes };
+        using lane_type = schar;
+    };
+    template <>
+    struct VTraits<v_uint16>
+    {
+        static inline int vlanes() { return v_uint16::nlanes; }
+        enum { nlanes = 32, max_nlanes = nlanes };
+        using lane_type = ushort;
+    };
+    template <>
+    struct VTraits<v_int16>
+    {
+        static inline int vlanes() { return v_int16::nlanes;  }
+        enum { nlanes = 32, max_nlanes = nlanes };
+        using lane_type = short;
+    };
+    template <>
+    struct VTraits<v_uint32>
+    {
+        static inline int vlanes() { return v_uint32::nlanes; }
+        enum { nlanes = 16, max_nlanes = nlanes };
+        using lane_type = uint;
+    };
+    template <>
+    struct VTraits<v_int32>
+    {
+        static inline int vlanes() { return v_int32::nlanes; }
+        enum { nlanes = 16, max_nlanes = nlanes };
+        using lane_type = int;
+    };
+
+    template <>
+    struct VTraits<v_float32>
+    {
+        static inline int vlanes() { return v_float32::nlanes; }
+        enum { nlanes = 16, max_nlanes = nlanes };
+        using lane_type = float;
+    };
+    template <>
+    struct VTraits<v_uint64>
+    {
+        static inline int vlanes() { return v_uint64::nlanes; }
+        enum { nlanes = 8, max_nlanes = nlanes };
+        using lane_type = uint64;
+    };
+    template <>
+    struct VTraits<v_int64>
+    {
+        static inline int vlanes() { return v_int64::nlanes; }
+        enum { nlanes = 8, max_nlanes = nlanes };
+        using lane_type = int64;
+    };
+    #if CV_SIMD_64F
+    template <>
+    struct VTraits<v_float64>
+    {
+        static inline int vlanes() { return v_float64::nlanes; }
+        enum { nlanes = 8, max_nlanes = nlanes };
+        using lane_type = double;
+    };
+    #endif
+#elif CV_SIMD256 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 256)
+    template <>
+    struct VTraits<v_uint8>
+    {
+        static inline int vlanes() { return v_uint8::nlanes; }
+        enum { nlanes = 32, max_nlanes = nlanes };
+        using lane_type = uchar;
+    };
+    template <>
+    struct VTraits<v_int8>
+    {
+        static inline int vlanes() { return v_int8::nlanes; }
+        enum { nlanes = 32, max_nlanes = nlanes };
+        using lane_type = schar;
+    };
+    template <>
+    struct VTraits<v_uint16>
+    {
+        static inline int vlanes() { return v_uint16::nlanes; }
+        enum { nlanes = 16, max_nlanes = nlanes };
+        using lane_type = ushort;
+    };
+    template <>
+    struct VTraits<v_int16>
+    {
+        static inline int vlanes() { return v_int16::nlanes;  }
+        enum { nlanes = 16, max_nlanes = nlanes };
+        using lane_type = short;
+    };
+    template <>
+    struct VTraits<v_uint32>
+    {
+        static inline int vlanes() { return v_uint32::nlanes; }
+        enum { nlanes = 8, max_nlanes = nlanes };
+        using lane_type = uint;
+    };
+    template <>
+    struct VTraits<v_int32>
+    {
+        static inline int vlanes() { return v_int32::nlanes; }
+        enum { nlanes = 8, max_nlanes = nlanes };
+        using lane_type = int;
+    };
+
+    template <>
+    struct VTraits<v_float32>
+    {
+        static inline int vlanes() { return v_float32::nlanes; }
+        enum { nlanes = 8, max_nlanes = nlanes };
+        using lane_type = float;
+    };
+    template <>
+    struct VTraits<v_uint64>
+    {
+        static inline int vlanes() { return v_uint64::nlanes; }
+        enum { nlanes = 4, max_nlanes = nlanes };
+        using lane_type = uint64;
+    };
+    template <>
+    struct VTraits<v_int64>
+    {
+        static inline int vlanes() { return v_int64::nlanes; }
+        enum { nlanes = 4, max_nlanes = nlanes };
+        using lane_type = int64;
+    };
+    #if CV_SIMD_64F
+    template <>
+    struct VTraits<v_float64>
+    {
+        static inline int vlanes() { return v_float64::nlanes; }
+        enum { nlanes = 4, max_nlanes = nlanes };
+        using lane_type = double;
+    };
+    #endif
+#elif CV_SIMD128 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 128)
+    template <>
+    struct VTraits<v_uint8>
+    {
+        static inline int vlanes() { return v_uint8::nlanes; }
+        enum { nlanes = 16, max_nlanes = nlanes };
+        using lane_type = uchar;
+    };
+    template <>
+    struct VTraits<v_int8>
+    {
+        static inline int vlanes() { return v_int8::nlanes; }
+        enum { nlanes = 16, max_nlanes = nlanes };
+        using lane_type = schar;
+    };
+    template <>
+    struct VTraits<v_uint16>
+    {
+        static inline int vlanes() { return v_uint16::nlanes; }
+        enum { nlanes = 8, max_nlanes = nlanes };
+        using lane_type = ushort;
+    };
+    template <>
+    struct VTraits<v_int16>
+    {
+        static inline int vlanes() { return v_int16::nlanes;  }
+        enum { nlanes = 8, max_nlanes = nlanes };
+        using lane_type = short;
+    };
+    template <>
+    struct VTraits<v_uint32>
+    {
+        static inline int vlanes() { return v_uint32::nlanes; }
+        enum { nlanes = 4, max_nlanes = nlanes };
+        using lane_type = uint;
+    };
+    template <>
+    struct VTraits<v_int32>
+    {
+        static inline int vlanes() { return v_int32::nlanes; }
+        enum { nlanes = 4, max_nlanes = nlanes };
+        using lane_type = int;
+    };
+
+    template <>
+    struct VTraits<v_float32>
+    {
+        static inline int vlanes() { return v_float32::nlanes; }
+        enum { nlanes = 4, max_nlanes = nlanes };
+        using lane_type = float;
+    };
+    template <>
+    struct VTraits<v_uint64>
+    {
+        static inline int vlanes() { return v_uint64::nlanes; }
+        enum { nlanes = 2, max_nlanes = nlanes };
+        using lane_type = uint64;
+    };
+    template <>
+    struct VTraits<v_int64>
+    {
+        static inline int vlanes() { return v_int64::nlanes; }
+        enum { nlanes = 2, max_nlanes = nlanes };
+        using lane_type = int64;
+    };
+    #if CV_SIMD_64F
+    template <>
+    struct VTraits<v_float64>
+    {
+        static inline int vlanes() { return v_float64::nlanes; }
+        enum { nlanes = 2, max_nlanes = nlanes };
+        using lane_type = double;
+    };
+    #endif
+#endif
+
+    #define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \
+    inline _Tpvec v_add(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a + b; \
+    } \
+    inline _Tpvec v_sub(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a - b; \
+    } \
+    template<typename... Args> \
+    inline _Tpvec v_add(_Tpvec f1, _Tpvec f2, Args... vf) { \
+        return v_add(f1 + f2, vf...); \
+    }
+
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32)
+    #if CV_SIMD_64F
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
+    #endif
+
+    #define OPENCV_HAL_WRAP_BIN_OP_LOGIC(_Tpvec) \
+    inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a & b; \
+    } \
+    inline _Tpvec v_or(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a | b; \
+    } \
+    inline _Tpvec v_xor(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a ^ b; \
+    } \
+    inline _Tpvec v_not(const _Tpvec& a) \
+    { \
+        return ~a; \
+    }
+
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8)
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16)
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32)
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64)
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8)
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16)
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32)
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64)
+
+
+    #define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
+    inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a * b; \
+    } \
+    template<typename... Args> \
+    inline _Tpvec v_mul(_Tpvec f1, _Tpvec f2, Args... vf) { \
+        return v_mul(f1 * f2, vf...); \
+    }
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32)
+    #if CV_SIMD_64F
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
+    #endif
+
+
+    inline v_float32 v_div(const v_float32& a, const v_float32& b) \
+    { \
+        return a / b; \
+    }
+    #if CV_SIMD_64F
+    inline v_float64 v_div(const v_float64& a, const v_float64& b) \
+    { \
+        return a / b; \
+    }
+    #endif
+
+    #define OPENCV_HAL_WRAP_CMP_OP(_Tpvec, intrin, op) \
+    inline _Tpvec v_##intrin(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a op b; \
+    }
+
+    #define OPENCV_HAL_WRAP_CMP(_Tpvec) \
+    OPENCV_HAL_WRAP_CMP_OP(_Tpvec, eq, ==) \
+    OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ne, !=) \
+    OPENCV_HAL_WRAP_CMP_OP(_Tpvec, lt, <) \
+    OPENCV_HAL_WRAP_CMP_OP(_Tpvec, gt, >) \
+    OPENCV_HAL_WRAP_CMP_OP(_Tpvec, le, <=) \
+    OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ge, >=)
+
+    OPENCV_HAL_WRAP_CMP(v_uint8)
+    OPENCV_HAL_WRAP_CMP(v_uint16)
+    OPENCV_HAL_WRAP_CMP(v_uint32)
+    // OPENCV_HAL_WRAP_CMP(v_uint64)
+    OPENCV_HAL_WRAP_CMP(v_int8)
+    OPENCV_HAL_WRAP_CMP(v_int16)
+    OPENCV_HAL_WRAP_CMP(v_int32)
+    // OPENCV_HAL_WRAP_CMP(v_int64)
+    OPENCV_HAL_WRAP_CMP(v_float32)
+    #if CV_SIMD_64F
+    OPENCV_HAL_WRAP_CMP(v_float64)
+    #endif
+
+    //////////// get0 ////////////
+    #define OPENCV_HAL_WRAP_GRT0_INT(_Tpvec, _Tp) \
+    inline _Tp v_get0(v_##_Tpvec v) \
+    { \
+        return v.get0(); \
+    }
+
+    OPENCV_HAL_WRAP_GRT0_INT(uint8, uchar)
+    OPENCV_HAL_WRAP_GRT0_INT(int8, schar)
+    OPENCV_HAL_WRAP_GRT0_INT(uint16, ushort)
+    OPENCV_HAL_WRAP_GRT0_INT(int16, short)
+    OPENCV_HAL_WRAP_GRT0_INT(uint32, unsigned)
+    OPENCV_HAL_WRAP_GRT0_INT(int32, int)
+    OPENCV_HAL_WRAP_GRT0_INT(uint64, uint64)
+    OPENCV_HAL_WRAP_GRT0_INT(int64, int64)
+    OPENCV_HAL_WRAP_GRT0_INT(float32, float)
+    #if CV_SIMD_64F
+    OPENCV_HAL_WRAP_GRT0_INT(float64, double)
+    #endif
+
+    #define OPENCV_HAL_WRAP_EXTRACT(_Tpvec, _Tp, vl) \
+    inline _Tp v_extract_highest(_Tpvec v) \
+    { \
+        return v_extract_n<vl-1>(v); \
+    }
+
+    OPENCV_HAL_WRAP_EXTRACT(v_uint8, uchar, VTraits<v_uint8>::nlanes)
+    OPENCV_HAL_WRAP_EXTRACT(v_int8, schar, VTraits<v_int8>::nlanes)
+    OPENCV_HAL_WRAP_EXTRACT(v_uint16, ushort, VTraits<v_uint16>::nlanes)
+    OPENCV_HAL_WRAP_EXTRACT(v_int16, short, VTraits<v_int16>::nlanes)
+    OPENCV_HAL_WRAP_EXTRACT(v_uint32, unsigned int, VTraits<v_uint32>::nlanes)
+    OPENCV_HAL_WRAP_EXTRACT(v_int32, int, VTraits<v_int32>::nlanes)
+    OPENCV_HAL_WRAP_EXTRACT(v_uint64, uint64, VTraits<v_uint64>::nlanes)
+    OPENCV_HAL_WRAP_EXTRACT(v_int64, int64, VTraits<v_int64>::nlanes)
+    OPENCV_HAL_WRAP_EXTRACT(v_float32, float, VTraits<v_float32>::nlanes)
+    #if CV_SIMD_64F
+    OPENCV_HAL_WRAP_EXTRACT(v_float64, double, VTraits<v_float64>::nlanes)
+    #endif
+
+    #define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \
+    inline _Tpvec v_broadcast_highest(_Tpvec v) \
+    { \
+        return v_broadcast_element<VTraits<_Tpvec>::nlanes-1>(v); \
+    }
+
+    OPENCV_HAL_WRAP_BROADCAST(v_uint32)
+    OPENCV_HAL_WRAP_BROADCAST(v_int32)
+    OPENCV_HAL_WRAP_BROADCAST(v_float32)
+
+
+#endif //CV_SIMD
 
 //! @cond IGNORED
 
diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
new file mode 100644
index 0000000000..b984411436
--- /dev/null
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
@@ -0,0 +1,493 @@
+
+#ifndef OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP
+#define OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP
+
+#include <initializer_list>
+#include <assert.h>
+#include <vector>
+
+#ifndef CV_RVV_MAX_VLEN
+#define CV_RVV_MAX_VLEN 1024
+#endif
+
+namespace cv
+{
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+#define CV_SIMD_SCALABLE 1
+#define CV_SIMD_SCALABLE_64F 1
+
+using v_uint8 = vuint8m1_t;
+using v_int8 = vint8m1_t;
+using v_uint16 = vuint16m1_t;
+using v_int16 = vint16m1_t;
+using v_uint32 = vuint32m1_t;
+using v_int32 = vint32m1_t;
+using v_uint64 = vuint64m1_t;
+using v_int64 = vint64m1_t;
+
+using v_float32 = vfloat32m1_t;
+#if CV_SIMD_SCALABLE_64F
+using v_float64 = vfloat64m1_t;
+#endif
+
+using uchar = unsigned char;
+using schar = signed char;
+using ushort = unsigned short;
+using uint = unsigned int;
+using uint64 = unsigned long int;
+using int64 = long int;
+
+static const int __cv_rvv_e8_nlanes = vsetvlmax_e8m1();
+static const int __cv_rvv_e16_nlanes = vsetvlmax_e16m1();
+static const int __cv_rvv_e32_nlanes = vsetvlmax_e32m1();
+static const int __cv_rvv_e64_nlanes = vsetvlmax_e64m1();
+
+template <class T>
+struct VTraits;
+
+template <>
+struct VTraits<v_uint8>
+{
+    static inline int vlanes() { return __cv_rvv_e8_nlanes; }
+    using lane_type = uchar;
+    static const int max_nlanes = CV_RVV_MAX_VLEN/8;
+};
+
+template <>
+struct VTraits<v_int8>
+{
+    static inline int vlanes() { return __cv_rvv_e8_nlanes; }
+    using lane_type = schar;
+    static const int max_nlanes = CV_RVV_MAX_VLEN/8;
+};
+template <>
+struct VTraits<v_uint16>
+{
+    static inline int vlanes() { return __cv_rvv_e16_nlanes; }
+    using lane_type = ushort;
+    static const int max_nlanes = CV_RVV_MAX_VLEN/16;
+};
+template <>
+struct VTraits<v_int16>
+{
+    static inline int vlanes() { return __cv_rvv_e16_nlanes; }
+    using lane_type = short;
+    static const int max_nlanes = CV_RVV_MAX_VLEN/16;
+};
+template <>
+struct VTraits<v_uint32>
+{
+    static inline int vlanes() { return __cv_rvv_e32_nlanes; }
+    using lane_type = uint;
+    static const int max_nlanes = CV_RVV_MAX_VLEN/32;
+};
+template <>
+struct VTraits<v_int32>
+{
+    static inline int vlanes() { return __cv_rvv_e32_nlanes; }
+    using lane_type = int;
+    static const int max_nlanes = CV_RVV_MAX_VLEN/32;
+};
+
+template <>
+struct VTraits<v_float32>
+{
+    static inline int vlanes() { return __cv_rvv_e32_nlanes; }
+    using lane_type = float;
+    static const int max_nlanes = CV_RVV_MAX_VLEN/32;
+};
+template <>
+struct VTraits<v_uint64>
+{
+    static inline int vlanes() { return __cv_rvv_e64_nlanes; }
+    using lane_type = uint64;
+    static const int max_nlanes = CV_RVV_MAX_VLEN/64;
+};
+template <>
+struct VTraits<v_int64>
+{
+    static inline int vlanes() { return __cv_rvv_e64_nlanes; }
+    using lane_type = int64;
+    static const int max_nlanes = CV_RVV_MAX_VLEN/64;
+};
+#if CV_SIMD_SCALABLE_64F
+template <>
+struct VTraits<v_float64>
+{
+    static inline int vlanes() { return __cv_rvv_e64_nlanes; }
+    using lane_type = double;
+    static const int max_nlanes = CV_RVV_MAX_VLEN/64;
+};
+#endif
+
+//////////// get0 ////////////
+#define OPENCV_HAL_IMPL_RVV_GRT0_INT(_Tpvec, _Tp) \
+inline _Tp v_get0(v_##_Tpvec v) \
+{ \
+    return vmv_x(v); \
+}
+
+OPENCV_HAL_IMPL_RVV_GRT0_INT(uint8, uchar)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(int8, schar)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(uint16, ushort)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(int16, short)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(uint32, unsigned)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(int32, int)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(uint64, uint64)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(int64, int64)
+
+inline float v_get0(v_float32 v) \
+{ \
+    return vfmv_f(v); \
+}
+#if CV_SIMD_SCALABLE_64F
+inline double v_get0(v_float64 v) \
+{ \
+    return vfmv_f(v); \
+}
+#endif
+
+//////////// Initial ////////////
+
+#define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, suffix1, suffix2, vl) \
+inline v_##_Tpvec v_setzero_##suffix1() \
+{ \
+    return vmv_v_x_##suffix2##m1(0, vl); \
+} \
+inline v_##_Tpvec v_setall_##suffix1(_Tp v) \
+{ \
+    return vmv_v_x_##suffix2##m1(v, vl); \
+}
+
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8, uchar, u8, u8, VTraits<v_uint8>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int8, schar, s8, i8, VTraits<v_int8>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint16, ushort, u16, u16, VTraits<v_uint16>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int16, short, s16, i16, VTraits<v_int16>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint32, uint, u32, u32, VTraits<v_uint32>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int32, int, s32, i32, VTraits<v_int32>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint64, uint64, u64, u64, VTraits<v_uint64>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64, int64, s64, i64, VTraits<v_int64>::vlanes())
+
+#define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, suffix, vl) \
+inline v_##_Tpv v_setzero_##suffix() \
+{ \
+    return vfmv_v_f_##suffix##m1(0, vl); \
+} \
+inline v_##_Tpv v_setall_##suffix(_Tp v) \
+{ \
+    return vfmv_v_f_##suffix##m1(v, vl); \
+}
+
+OPENCV_HAL_IMPL_RVV_INIT_FP(float32, float, f32, VTraits<v_float32>::vlanes())
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_INIT_FP(float64, double, f64, VTraits<v_float64>::vlanes())
+#endif
+
+//////////// Reinterpret ////////////
+#define OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(_Tpvec1, suffix1) \
+inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec1& v) \
+{ \
+    return v;\
+}
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint8, u8)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint16, u16)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint32, u32)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint64, u64)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int8, s8)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int16, s16)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int32, s32)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int64, s64)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float32, f32)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float64, f64)
+#endif
+// TODO: can be simplified by using overloaded RV intrinsic
+#define OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2) \
+inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
+{ \
+    return v_##_Tpvec1(vreinterpret_v_##nsuffix2##m1_##nsuffix1##m1(v));\
+} \
+inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
+{ \
+    return v_##_Tpvec2(vreinterpret_v_##nsuffix1##m1_##nsuffix2##m1(v));\
+}
+
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, int8, u8, s8, u8, i8)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, int16, u16, s16, u16, i16)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, int32, u32, s32, u32, i32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, float32, u32, f32, u32, f32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32, float32, s32, f32, i32, f32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64, int64, u64, s64, u64, i64)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64, float64, u64, f64, u64, f64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int64, float64, s64, f64, i64, f64)
+#endif
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint16, u8, u16, u8, u16)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint32, u8, u32, u8, u32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint64, u8, u64, u8, u64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, uint32, u16, u32, u16, u32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, uint64, u16, u64, u16, u64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, uint64, u32, u64, u32, u64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int16, s8, s16, i8, i16)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int32, s8, s32, i8, i32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int64, s8, s64, i8, i64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16, int32, s16, s32, i16, i32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16, int64, s16, s64, i16, i64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32, int64, s32, s64, i32, i64)
+
+
+#define OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2, width1, width2) \
+inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
+{ \
+    return vreinterpret_v_##nsuffix1##width2##m1_##nsuffix1##width1##m1(vreinterpret_v_##nsuffix2##width2##m1_##nsuffix1##width2##m1(v));\
+} \
+inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
+{ \
+    return vreinterpret_v_##nsuffix1##width2##m1_##nsuffix2##width2##m1(vreinterpret_v_##nsuffix1##width1##m1_##nsuffix1##width2##m1(v));\
+}
+
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int16, u8, s16, u, i, 8, 16)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int32, u8, s32, u, i, 8, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int64, u8, s64, u, i, 8, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int8, u16, s8, u, i, 16, 8)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int32, u16, s32, u, i, 16, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int64, u16, s64, u, i, 16, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int8, u32, s8, u, i, 32, 8)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int16, u32, s16, u, i, 32, 16)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int64, u32, s64, u, i, 32, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int8, u64, s8, u, i, 64, 8)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int16, u64, s16, u, i, 64, 16)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int32, u64, s32, u, i, 64, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float32, u8, f32, u, f, 8, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, float32, u16, f32, u, f, 16, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, float32, u64, f32, u, f, 64, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float32, s8, f32, i, f, 8, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16, float32, s16, f32, i, f, 16, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int64, float32, s64, f32, i, f, 64, 32)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float64, u8, f64, u, f, 8, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, float64, u16, f64, u, f, 16, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, float64, u32, f64, u, f, 32, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float64, s8, f64, i, f, 8, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16, float64, s16, f64, i, f, 16, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32, float64, s32, f64, i, f, 32, 64)
+// Three times reinterpret
+inline v_float32 v_reinterpret_as_f32(const v_float64& v) \
+{ \
+    return vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v)));\
+}
+
+inline v_float64 v_reinterpret_as_f64(const v_float32& v) \
+{ \
+    return vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v)));\
+}
+#endif
+
+
+////////////// Load/Store //////////////
+#define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, vl, width, suffix, vmv) \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ \
+    return vle##width##_v_##suffix##m1(ptr, vl); \
+} \
+inline _Tpvec v_load_aligned(const _Tp* ptr) \
+{ \
+    return vle##width##_v_##suffix##m1(ptr, vl); \
+} \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
+{ \
+    vse##width##_v_##suffix##m1(ptr, a, vl); \
+} \
+inline _Tpvec v_load_low(const _Tp* ptr) \
+{ \
+    return vle##width##_v_##suffix##m1(ptr, hvl); \
+} \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+{ \
+    return vslideup(vle##width##_v_##suffix##m1(ptr0, hvl), vle##width##_v_##suffix##m1(ptr1, hvl), hvl, vl); \
+} \
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ \
+    vse##width(ptr, a, vl); \
+} \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
+{ \
+    vse##width(ptr, a, vl); \
+} \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+{ \
+    vse##width(ptr, a, vl); \
+} \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
+{ \
+    vse##width(ptr, a, hvl); \
+} \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ \
+    vse##width(ptr, vslidedown_vx_##suffix##m1(vmv(0, vl), a, hvl, vl), hvl); \
+} \
+inline _Tpvec v_load(std::initializer_list<_Tp> nScalars) \
+{ \
+    assert(nScalars.size() == vl); \
+    return vle##width##_v_##suffix##m1(nScalars.begin(), nScalars.size()); \
+} \
+template<typename... Targs> \
+_Tpvec v_load_##suffix(Targs... nScalars) \
+{ \
+    return v_load({nScalars...}); \
+}
+
+
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8, vuint8m1_t, uchar, VTraits<v_uint8>::vlanes() / 2, VTraits<v_uint8>::vlanes(), 8, u8, vmv_v_x_u8m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8, vint8m1_t, schar, VTraits<v_int8>::vlanes() / 2, VTraits<v_int8>::vlanes(), 8, i8, vmv_v_x_i8m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16, vuint16m1_t, ushort, VTraits<v_uint16>::vlanes() / 2, VTraits<v_uint16>::vlanes(), 16, u16, vmv_v_x_u16m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16, vint16m1_t, short, VTraits<v_int16>::vlanes() / 2, VTraits<v_int16>::vlanes(), 16, i16, vmv_v_x_i16m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32, vuint32m1_t, unsigned int, VTraits<v_uint32>::vlanes() / 2, VTraits<v_uint32>::vlanes(), 32, u32, vmv_v_x_u32m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32, vint32m1_t, int, VTraits<v_int32>::vlanes() / 2, VTraits<v_int32>::vlanes(), 32, i32, vmv_v_x_i32m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64, vuint64m1_t, uint64, VTraits<v_uint64>::vlanes() / 2, VTraits<v_uint64>::vlanes(), 64, u64, vmv_v_x_u64m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64, vint64m1_t, int64, VTraits<v_int64>::vlanes() / 2, VTraits<v_int64>::vlanes(), 64, i64, vmv_v_x_i64m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32, vfloat32m1_t, float, VTraits<v_float32>::vlanes() /2 , VTraits<v_float32>::vlanes(), 32, f32, vfmv_v_f_f32m1)
+
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64, vfloat64m1_t, double, VTraits<v_float64>::vlanes() / 2, VTraits<v_float64>::vlanes(), 64, f64, vfmv_v_f_f64m1)
+#endif
+
+////////////// Lookup table access ////////////////////
+#define OPENCV_HAL_IMPL_RVV_LUT(_Tpvec, _Tp, suffix) \
+inline _Tpvec v_lut(const _Tp* tab, const int* idx) \
+{ \
+    vuint32##suffix##_t vidx = vmul(vreinterpret_u32##suffix(vle32_v_i32##suffix(idx, VTraits<_Tpvec>::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
+    return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
+} \
+inline _Tpvec v_lut_pairs(const _Tp* tab, const int* idx) \
+{ \
+    std::vector<uint> idx_; \
+    for (size_t i = 0; i < VTraits<v_int16>::vlanes(); ++i) { \
+        idx_.push_back(idx[i]); \
+        idx_.push_back(idx[i]+1); \
+    } \
+    vuint32##suffix##_t vidx = vmul(vle32_v_u32##suffix(idx_.data(), VTraits<_Tpvec>::vlanes()), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
+    return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
+} \
+inline _Tpvec v_lut_quads(const _Tp* tab, const int* idx) \
+{ \
+    std::vector<uint> idx_; \
+    for (size_t i = 0; i < VTraits<v_int32>::vlanes(); ++i) { \
+        idx_.push_back(idx[i]); \
+        idx_.push_back(idx[i]+1); \
+        idx_.push_back(idx[i]+2); \
+        idx_.push_back(idx[i]+3); \
+    } \
+    vuint32##suffix##_t vidx = vmul(vle32_v_u32##suffix(idx_.data(), VTraits<_Tpvec>::vlanes()), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
+    return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
+}
+OPENCV_HAL_IMPL_RVV_LUT(v_int8, schar, m4)
+OPENCV_HAL_IMPL_RVV_LUT(v_int16, short, m2)
+OPENCV_HAL_IMPL_RVV_LUT(v_int32, int, m1)
+OPENCV_HAL_IMPL_RVV_LUT(v_int64, int64_t, mf2)
+OPENCV_HAL_IMPL_RVV_LUT(v_float32, float, m1)
+
+inline v_uint8 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
+inline v_uint8 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
+inline v_uint8 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
+inline v_uint16 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
+inline v_uint16 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
+inline v_uint16 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
+inline v_uint32 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
+inline v_uint32 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
+inline v_uint32 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
+inline v_uint64 v_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
+inline v_uint64 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
+inline v_uint64 v_lut_quads(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_quads((const int64_t*)tab, idx)); }
+
+
+////////////// Min/Max //////////////
+
+#define OPENCV_HAL_IMPL_RVV_BIN_FUNC(_Tpvec, func, intrin, vl) \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return intrin(a, b, vl); \
+}
+
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8, v_min, vminu, VTraits<v_uint8>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8, v_max, vmaxu, VTraits<v_uint8>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8, v_min, vmin, VTraits<v_int8>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8, v_max, vmax, VTraits<v_int8>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16, v_min, vminu, VTraits<v_uint16>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16, v_max, vmaxu, VTraits<v_uint16>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16, v_min, vmin, VTraits<v_int16>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16, v_max, vmax, VTraits<v_int16>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_min, vminu, VTraits<v_uint32>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_max, vmaxu, VTraits<v_uint32>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_min, vmin, VTraits<v_int32>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_max, vmax, VTraits<v_int32>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_min, vfmin, VTraits<v_float32>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_max, vfmax, VTraits<v_float32>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64, v_min, vminu, VTraits<v_uint64>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64, v_max, vmaxu, VTraits<v_uint64>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64, v_min, vmin, VTraits<v_int64>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64, v_max, vmax, VTraits<v_int64>::vlanes())
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_min, vfmin, VTraits<v_float64>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_max, vfmax, VTraits<v_float64>::vlanes())
+#endif
+
+
+//////////// Value reordering ////////////
+
+#define OPENCV_HAL_IMPL_RVV_EXPAND(_Tp, _Tpwvec, _Tpwvec_m2, _Tpvec, width, suffix, suffix2, cvt) \
+inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
+{ \
+    _Tpwvec_m2 temp = cvt(a, vsetvlmax_e##width##m1()); \
+    b0 = vget_##suffix##m1(temp, 0); \
+    b1 = vget_##suffix##m1(temp, 1); \
+} \
+inline _Tpwvec v_expand_low(const _Tpvec& a) \
+{ \
+    _Tpwvec_m2 temp = cvt(a, vsetvlmax_e##width##m1()); \
+    return vget_##suffix##m1(temp, 0); \
+} \
+inline _Tpwvec v_expand_high(const _Tpvec& a) \
+{ \
+    _Tpwvec_m2 temp = cvt(a, vsetvlmax_e##width##m1()); \
+    return vget_##suffix##m1(temp, 1); \
+} \
+inline _Tpwvec v_load_expand(const _Tp* ptr) \
+{ \
+    return cvt(vle##width##_v_##suffix2##mf2(ptr, vsetvlmax_e##width##m1()), vsetvlmax_e##width##m1()); \
+}
+
+OPENCV_HAL_IMPL_RVV_EXPAND(uchar, v_uint16, vuint16m2_t, v_uint8, 8, u16, u8, vwcvtu_x)
+OPENCV_HAL_IMPL_RVV_EXPAND(schar, v_int16, vint16m2_t, v_int8, 8, i16, i8, vwcvt_x)
+OPENCV_HAL_IMPL_RVV_EXPAND(ushort, v_uint32, vuint32m2_t, v_uint16, 16, u32, u16, vwcvtu_x)
+OPENCV_HAL_IMPL_RVV_EXPAND(short, v_int32, vint32m2_t, v_int16, 16, i32, i16, vwcvt_x)
+OPENCV_HAL_IMPL_RVV_EXPAND(uint, v_uint64, vuint64m2_t, v_uint32, 32, u64, u32, vwcvtu_x)
+OPENCV_HAL_IMPL_RVV_EXPAND(int, v_int64, vint64m2_t, v_int32, 32, i64, i32, vwcvt_x)
+
+inline v_uint32 v_load_expand_q(const uchar* ptr)
+{
+    return vwcvtu_x(vwcvtu_x(vle8_v_u8mf4(ptr, VTraits<v_uint32>::vlanes()), VTraits<v_uint32>::vlanes()), VTraits<v_uint32>::vlanes());
+}
+
+inline v_int32 v_load_expand_q(const schar* ptr)
+{
+    return vwcvt_x(vwcvt_x(vle8_v_i8mf4(ptr, VTraits<v_int32>::vlanes()), VTraits<v_int32>::vlanes()), VTraits<v_int32>::vlanes());
+}
+
+
+////// FP16 support ///////
+
+inline v_float32 v_load_expand(const float16_t* ptr)
+{
+    // TODO
+    return vundefined_f32m1();
+}
+
+inline void v_cleanup() {}
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+} //namespace cv
+
+#endif //OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP
\ No newline at end of file
diff --git a/modules/core/include/opencv2/core/hal/simd_utils.impl.hpp b/modules/core/include/opencv2/core/hal/simd_utils.impl.hpp
index fff8f942b8..0a1ab2c523 100644
--- a/modules/core/include/opencv2/core/hal/simd_utils.impl.hpp
+++ b/modules/core/include/opencv2/core/hal/simd_utils.impl.hpp
@@ -128,8 +128,48 @@ template<> inline Type2Vec512_Traits<double>::vec_type v512_setall<double>(const
 
 #endif  // SIMD512
 
+#if CV_SIMD_SCALABLE
+template<typename _T> struct Type2Vec_Traits;
+#define CV_INTRIN_DEF_TYPE2VEC_TRAITS(type_, vec_type_) \
+    template<> struct Type2Vec_Traits<type_> \
+    { \
+        typedef vec_type_ vec_type; \
+    }
+
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(uchar, v_uint8);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(schar, v_int8);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(ushort, v_uint16);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(short, v_int16);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(unsigned, v_uint32);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(int, v_int32);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(float, v_float32);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(uint64, v_uint64);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(int64, v_int64);
+#if CV_SIMD_SCALABLE_64F
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(double, v_float64);
+#endif
+template<typename _T> static inline
+typename Type2Vec_Traits<_T>::vec_type v_setall(const _T& a);
+
+template<> inline Type2Vec_Traits< uchar>::vec_type v_setall< uchar>(const  uchar& a) { return v_setall_u8(a); }
+template<> inline Type2Vec_Traits< schar>::vec_type v_setall< schar>(const  schar& a) { return v_setall_s8(a); }
+template<> inline Type2Vec_Traits<ushort>::vec_type v_setall<ushort>(const ushort& a) { return v_setall_u16(a); }
+template<> inline Type2Vec_Traits< short>::vec_type v_setall< short>(const  short& a) { return v_setall_s16(a); }
+template<> inline Type2Vec_Traits<  uint>::vec_type v_setall<  uint>(const   uint& a) { return v_setall_u32(a); }
+template<> inline Type2Vec_Traits<   int>::vec_type v_setall<   int>(const    int& a) { return v_setall_s32(a); }
+template<> inline Type2Vec_Traits<uint64>::vec_type v_setall<uint64>(const uint64& a) { return v_setall_u64(a); }
+template<> inline Type2Vec_Traits< int64>::vec_type v_setall< int64>(const  int64& a) { return v_setall_s64(a); }
+template<> inline Type2Vec_Traits< float>::vec_type v_setall< float>(const  float& a) { return v_setall_f32(a); }
+#if CV_SIMD_SCALABLE_64F
+template<> inline Type2Vec_Traits<double>::vec_type v_setall<double>(const double& a) { return v_setall_f64(a); }
+#endif
+#endif
 
-#if CV_SIMD_WIDTH == 16
+
+#if CV_SIMD_SCALABLE
+template<typename _T> static inline
+typename Type2Vec_Traits<_T>::vec_type vx_setall(const _T& a) { return v_setall(a); }
+#elif CV_SIMD_WIDTH == 16
 template<typename _T> static inline
 typename Type2Vec128_Traits<_T>::vec_type vx_setall(const _T& a) { return v_setall(a); }
 #elif CV_SIMD_WIDTH == 32
diff --git a/modules/core/src/arithm.simd.hpp b/modules/core/src/arithm.simd.hpp
index f88597aacc..06ebfb7678 100644
--- a/modules/core/src/arithm.simd.hpp
+++ b/modules/core/src/arithm.simd.hpp
@@ -266,24 +266,30 @@ struct op_absdiff
 template<>
 struct op_absdiff<schar, v_int8>
 {
+#if CV_SIMD
     static inline v_int8 r(const v_int8& a, const v_int8& b)
     { return v_absdiffs(a, b); }
+#endif
     static inline schar r(schar a, schar b)
     { return c_absdiff(a, b); }
 };
 template<>
 struct op_absdiff<short, v_int16>
 {
+#if CV_SIMD
     static inline v_int16 r(const v_int16& a, const v_int16& b)
     { return v_absdiffs(a, b); }
+#endif
     static inline short r(short a, short b)
     { return c_absdiff(a, b); }
 };
 template<>
 struct op_absdiff<int, v_int32>
 {
+#if CV_SIMD
     static inline v_int32 r(const v_int32& a, const v_int32& b)
     { return v_reinterpret_as_s32(v_absdiff(a, b)); }
+#endif
     static inline int r(int a, int b)
     { return c_absdiff(a, b); }
 };
@@ -1430,11 +1436,13 @@ struct op_mul
 template<typename T1, typename T2, typename Tvec>
 struct op_mul_scale
 {
+#if CV_SIMD
     static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
     {
         const v_float32 v_scalar = vx_setall_f32(*scalar);
         return v_scalar * a * b;
     }
+#endif
     static inline T1 r(T1 a, T1 b, const T2* scalar)
     { return c_mul(a, b, *scalar); }
     static inline Tvec pre(const Tvec&, const Tvec& res)
@@ -1569,6 +1577,7 @@ struct op_div_f
 template<typename T1, typename T2, typename Tvec>
 struct op_div_scale
 {
+#if CV_SIMD
     static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
     {
         const v_float32 v_scalar = vx_setall_f32(*scalar);
@@ -1579,6 +1588,7 @@ struct op_div_scale
         const Tvec v_zero = vx_setall<typename Tvec::lane_type>(0);
         return v_select(denom == v_zero, v_zero, res);
     }
+#endif
     static inline T1 r(T1 a, T1 denom, const T2* scalar)
     {
         CV_StaticAssert(std::numeric_limits<T1>::is_integer, "");
@@ -1589,11 +1599,13 @@ struct op_div_scale
 template<>
 struct op_div_scale<float, float, v_float32>
 {
+#if CV_SIMD
     static inline v_float32 r(const v_float32& a, const v_float32& b, const float* scalar)
     {
         const v_float32 v_scalar = vx_setall_f32(*scalar);
         return a * v_scalar / b;
     }
+#endif
     static inline float r(float a, float denom, const float* scalar)
     { return c_div(a, denom, *scalar); }
 };
@@ -1673,11 +1685,13 @@ DEFINE_SIMD_ALL(div, div_loop)
 template<typename T1, typename T2, typename Tvec>
 struct op_add_scale
 {
+#if CV_SIMD
     static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
     {
         const v_float32 v_alpha = vx_setall_f32(*scalar);
         return v_fma(a, v_alpha, b);
     }
+#endif
     static inline T1 r(T1 a, T1 b, const T2* scalar)
     { return c_add(a, b, *scalar); }
     static inline Tvec pre(const Tvec&, const Tvec& res)
@@ -1704,6 +1718,7 @@ struct op_add_scale<double, double, v_float64>
 template<typename T1, typename T2, typename Tvec>
 struct op_add_weighted
 {
+#if CV_SIMD
     static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalars)
     {
         const v_float32 v_alpha = vx_setall_f32(scalars[0]);
@@ -1711,6 +1726,7 @@ struct op_add_weighted
         const v_float32 v_gamma = vx_setall_f32(scalars[2]);
         return v_fma(a, v_alpha, v_fma(b, v_beta, v_gamma));
     }
+#endif
     static inline T1 r(T1 a, T1 b, const T2* scalars)
     { return c_add(a, b, scalars[0], scalars[1], scalars[2]); }
     static inline Tvec pre(const Tvec&, const Tvec& res)
@@ -1819,6 +1835,7 @@ DEFINE_SIMD_F64(addWeighted, add_weighted_loop_d)
 template<typename T1, typename T2, typename Tvec>
 struct op_recip
 {
+#if CV_SIMD
     static inline v_float32 r(const v_float32& a, const T2* scalar)
     {
         const v_float32 v_scalar = vx_setall_f32(*scalar);
@@ -1829,6 +1846,7 @@ struct op_recip
         const Tvec v_zero = vx_setall<typename Tvec::lane_type>(0);
         return v_select(denom == v_zero, v_zero, res);
     }
+#endif
     static inline T1 r(T1 denom, const T2* scalar)
     {
         CV_StaticAssert(std::numeric_limits<T1>::is_integer, "");
@@ -1839,11 +1857,13 @@ struct op_recip
 template<>
 struct op_recip<float, float, v_float32>
 {
+#if CV_SIMD
     static inline v_float32 r(const v_float32& a, const float* scalar)
     {
         const v_float32 v_scalar = vx_setall_f32(*scalar);
         return v_scalar / a;
     }
+#endif
     static inline float r(float denom, const float* scalar)
     { return c_div(*scalar, denom); }
 };
diff --git a/modules/core/test/test_intrin128.simd.hpp b/modules/core/test/test_intrin128.simd.hpp
index 1d9bee2d33..46e18020f7 100644
--- a/modules/core/test/test_intrin128.simd.hpp
+++ b/modules/core/test/test_intrin128.simd.hpp
@@ -7,7 +7,7 @@
 #include "opencv2/core/hal/intrin.hpp"
 #undef CV__SIMD_FORCE_WIDTH
 
-#if CV_SIMD_WIDTH != 16
+#if CV_SIMD && CV_SIMD_WIDTH != 16
 #error "Invalid build configuration"
 #endif
 
diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp
index 3f196f1342..5d4442f111 100644
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@@ -25,6 +25,7 @@ void test_hal_intrin_float16();
 template <typename R> struct Data;
 template <int N> struct initializer;
 
+#if CV_SIMD
 template <> struct initializer<64>
 {
     template <typename R> static R init(const Data<R> & d)
@@ -77,16 +78,85 @@ template <> struct initializer<2>
     }
 };
 
+#else
+
+template <> struct initializer<128>
+{
+    template <typename R> static R init(const Data<R> & d)
+    {
+        return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15],
+        d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31],
+        d[32], d[33], d[34], d[35], d[36], d[37], d[38], d[39], d[40], d[41], d[42], d[43], d[44], d[45], d[46], d[47],
+        d[48], d[49], d[50], d[51], d[52], d[53], d[54], d[55], d[56], d[57], d[58], d[59], d[60], d[61], d[62], d[63],
+        d[64], d[65], d[66], d[67], d[68], d[69], d[70], d[71], d[72], d[73], d[74], d[75], d[76], d[77], d[78], d[79],
+        d[80], d[81], d[82], d[83], d[84], d[85], d[86], d[87], d[88], d[89], d[90], d[91], d[92], d[93], d[94], d[95],
+        d[96], d[97], d[98], d[99], d[100], d[101], d[102], d[103], d[104], d[105], d[106], d[107], d[108], d[109], d[110], d[111],
+        d[112], d[113], d[114], d[115], d[116], d[117], d[118], d[119], d[120], d[121], d[122], d[123], d[124], d[125], d[126], d[127]});
+    }
+};
+
+template <> struct initializer<64>
+{
+    template <typename R> static R init(const Data<R> & d)
+    {
+        return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15],
+        d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31],
+        d[32], d[33], d[34], d[35], d[36], d[37], d[38], d[39], d[40], d[41], d[42], d[43], d[44], d[45], d[46], d[47],
+        d[48], d[49], d[50], d[51], d[52], d[53], d[54], d[55], d[56], d[57], d[58], d[59], d[60], d[61], d[62], d[63]});
+    }
+};
+
+template <> struct initializer<32>
+{
+    template <typename R> static R init(const Data<R> & d)
+    {
+        return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15],
+        d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31]});
+    }
+};
+
+template <> struct initializer<16>
+{
+    template <typename R> static R init(const Data<R> & d)
+    {
+        return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15]});
+    }
+};
+
+template <> struct initializer<8>
+{
+    template <typename R> static R init(const Data<R> & d)
+    {
+        return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]});
+    }
+};
+
+template <> struct initializer<4>
+{
+    template <typename R> static R init(const Data<R> & d)
+    {
+        return v_load({d[0], d[1], d[2], d[3]});
+    }
+};
+
+template <> struct initializer<2>
+{
+    template <typename R> static R init(const Data<R> & d)
+    {
+        return v_load({d[0], d[1]});
+    }
+};
+#endif
 //==================================================================================================
 
 template <typename R> struct Data
 {
-    typedef typename R::lane_type LaneType;
+    typedef typename VTraits<R>::lane_type LaneType;
     typedef typename V_TypeTraits<LaneType>::int_type int_type;
 
     Data()
     {
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
             d[i] = (LaneType)(i + 1);
     }
     Data(LaneType val)
@@ -99,7 +169,7 @@ template <typename R> struct Data
     }
     operator R ()
     {
-        return initializer<R::nlanes>().init(*this);
+        return initializer<VTraits<R>::max_nlanes>().init(*this);
     }
     Data<R> & operator=(const R & r)
     {
@@ -108,17 +178,17 @@ template <typename R> struct Data
     }
     template <typename T> Data<R> & operator*=(T m)
     {
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
             d[i] *= (LaneType)m;
         return *this;
     }
     template <typename T> Data<R> & operator+=(T m)
     {
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
             d[i] += (LaneType)m;
         return *this;
     }
-    void fill(LaneType val, int s, int c = R::nlanes)
+    void fill(LaneType val, int s, int c = VTraits<R>::vlanes())
     {
         for (int i = s; i < c; ++i)
             d[i] = val;
@@ -129,26 +199,26 @@ template <typename R> struct Data
     }
     void reverse()
     {
-        for (int i = 0; i < R::nlanes / 2; ++i)
-            std::swap(d[i], d[R::nlanes - i - 1]);
+        for (int i = 0; i < VTraits<R>::vlanes() / 2; ++i)
+            std::swap(d[i], d[VTraits<R>::vlanes() - i - 1]);
     }
     const LaneType & operator[](int i) const
     {
 #if 0   // TODO: strange bug - AVX2 tests are failed with this
-        CV_CheckGE(i, 0, ""); CV_CheckLT(i, (int)R::nlanes, "");
+        CV_CheckGE(i, 0, ""); CV_CheckLT(i, (int)VTraits<R>::vlanes(), "");
 #else
-        CV_Assert(i >= 0 && i < R::nlanes);
+        CV_Assert(i >= 0 && i < VTraits<R>::max_nlanes);
 #endif
         return d[i];
     }
     LaneType & operator[](int i)
     {
-        CV_CheckGE(i, 0, ""); CV_CheckLT(i, (int)R::nlanes, "");
+        CV_CheckGE(i, 0, ""); CV_CheckLT(i, (int)VTraits<R>::max_nlanes, "");
         return d[i];
     }
     int_type as_int(int i) const
     {
-        CV_CheckGE(i, 0, ""); CV_CheckLT(i, (int)R::nlanes, "");
+        CV_CheckGE(i, 0, ""); CV_CheckLT(i, (int)VTraits<R>::max_nlanes, "");
         union
         {
             LaneType l;
@@ -159,11 +229,11 @@ template <typename R> struct Data
     }
     const LaneType * mid() const
     {
-        return d + R::nlanes / 2;
+        return d + VTraits<R>::vlanes() / 2;
     }
     LaneType * mid()
     {
-        return d + R::nlanes / 2;
+        return d + VTraits<R>::vlanes() / 2;
     }
     LaneType sum(int s, int c)
     {
@@ -174,11 +244,11 @@ template <typename R> struct Data
     }
     LaneType sum()
     {
-        return sum(0, R::nlanes);
+        return sum(0, VTraits<R>::vlanes());
     }
     bool operator==(const Data<R> & other) const
     {
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
             if (d[i] != other.d[i])
                 return false;
         return true;
@@ -193,17 +263,17 @@ template <typename R> struct Data
     }
     bool isValue(uchar val) const
     {
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
             if (d[i] != val)
                 return false;
         return true;
     }
-    LaneType d[R::nlanes];
+    LaneType d[VTraits<R>::max_nlanes];
 };
 
 template<typename R> struct AlignedData
 {
-    Data<R> CV_DECL_ALIGNED(CV_SIMD_WIDTH) a; // aligned
+    Data<R> CV_DECL_ALIGNED(sizeof(typename VTraits<R>::lane_type)*VTraits<R>::max_nlanes) a; // aligned
     char dummy;
     Data<R> u; // unaligned
 };
@@ -211,11 +281,11 @@ template<typename R> struct AlignedData
 template <typename R> std::ostream & operator<<(std::ostream & out, const Data<R> & d)
 {
     out << "{ ";
-    for (int i = 0; i < R::nlanes; ++i)
+    for (int i = 0; i < VTraits<R>::vlanes(); ++i)
     {
-        // out << std::hex << +V_TypeTraits<typename R::lane_type>::reinterpret_int(d.d[i]);
+        // out << std::hex << +V_TypeTraits<typename VTraits<R>::lane_type>::reinterpret_int(d.d[i]);
         out << +d.d[i];
-        if (i + 1 < R::nlanes)
+        if (i + 1 < VTraits<R>::vlanes())
             out << ", ";
     }
     out << " }";
@@ -246,7 +316,7 @@ inline unsigned pack_saturate_cast<unsigned, uint64>(uint64 a) { return static_c
 
 template<typename R> struct TheTest
 {
-    typedef typename R::lane_type LaneType;
+    typedef typename VTraits<R>::lane_type LaneType;
 
     template <typename T1, typename T2>
     static inline void EXPECT_COMPARE_EQ(const T1 a, const T2 b)
@@ -260,37 +330,37 @@ template<typename R> struct TheTest
         AlignedData<R> out;
 
         // check if addresses are aligned and unaligned respectively
-        EXPECT_EQ((size_t)0, (size_t)&data.a.d % CV_SIMD_WIDTH);
-        EXPECT_NE((size_t)0, (size_t)&data.u.d % CV_SIMD_WIDTH);
-        EXPECT_EQ((size_t)0, (size_t)&out.a.d % CV_SIMD_WIDTH);
-        EXPECT_NE((size_t)0, (size_t)&out.u.d % CV_SIMD_WIDTH);
+        EXPECT_EQ((size_t)0, (size_t)&data.a.d % (sizeof(typename VTraits<R>::lane_type) * VTraits<R>::vlanes()));
+        EXPECT_NE((size_t)0, (size_t)&data.u.d % (sizeof(typename VTraits<R>::lane_type) * VTraits<R>::vlanes()));
+        EXPECT_EQ((size_t)0, (size_t)&out.a.d % (sizeof(typename VTraits<R>::lane_type) * VTraits<R>::vlanes()));
+        EXPECT_NE((size_t)0, (size_t)&out.u.d % (sizeof(typename VTraits<R>::lane_type) * VTraits<R>::vlanes()));
 
         // check some initialization methods
         R r1 = data.a;
         R r2 = vx_load(data.u.d);
         R r3 = vx_load_aligned(data.a.d);
         R r4(r2);
-        EXPECT_EQ(data.a[0], r1.get0());
-        EXPECT_EQ(data.u[0], r2.get0());
-        EXPECT_EQ(data.a[0], r3.get0());
-        EXPECT_EQ(data.u[0], r4.get0());
+        EXPECT_EQ(data.a[0], v_get0(r1));
+        EXPECT_EQ(data.u[0], v_get0(r2));
+        EXPECT_EQ(data.a[0], v_get0(r3));
+        EXPECT_EQ(data.u[0], v_get0(r4));
 
         R r_low = vx_load_low((LaneType*)data.u.d);
-        EXPECT_EQ(data.u[0], r_low.get0());
+        EXPECT_EQ(data.u[0], v_get0(r_low));
         v_store(out.u.d, r_low);
-        for (int i = 0; i < R::nlanes/2; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes()/2; ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ((LaneType)data.u[i], (LaneType)out.u[i]);
         }
 
-        R r_low_align8byte = vx_load_low((LaneType*)((char*)data.u.d + (CV_SIMD_WIDTH / 2)));
-        EXPECT_EQ(data.u[R::nlanes/2], r_low_align8byte.get0());
+        R r_low_align8byte = vx_load_low((LaneType*)((char*)data.u.d + (sizeof(typename VTraits<R>::lane_type) * VTraits<R>::vlanes() / 2)));
+        EXPECT_EQ(data.u[VTraits<R>::vlanes()/2], v_get0(r_low_align8byte));
         v_store(out.u.d, r_low_align8byte);
-        for (int i = 0; i < R::nlanes/2; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes()/2; ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
-            EXPECT_EQ((LaneType)data.u[i + R::nlanes/2], (LaneType)out.u[i]);
+            EXPECT_EQ((LaneType)data.u[i + VTraits<R>::vlanes()/2], (LaneType)out.u[i]);
         }
 
         // check some store methods
@@ -318,7 +388,7 @@ template<typename R> struct TheTest
         Data<R> resZ, resV;
         resZ.fill((LaneType)0);
         resV.fill((LaneType)8);
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ((LaneType)0, resZ[i]);
@@ -348,13 +418,16 @@ template<typename R> struct TheTest
 #elif CV_SIMD_WIDTH == 64
         R setall_res1 = v512_setall((LaneType)5);
         R setall_res2 = v512_setall<LaneType>(6);
+#elif CV_SIMD_SCALABLE
+        R setall_res1 = v_setall((LaneType)5);
+        R setall_res2 = v_setall<LaneType>(6);
 #else
 #error "Configuration error"
 #endif
 #if CV_SIMD_WIDTH > 0
         Data<R> setall_res1_; v_store(setall_res1_.d, setall_res1);
         Data<R> setall_res2_; v_store(setall_res2_.d, setall_res2);
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ((LaneType)5, setall_res1_[i]);
@@ -366,7 +439,7 @@ template<typename R> struct TheTest
         R vx_setall_res2 = vx_setall<LaneType>(12);
         Data<R> vx_setall_res1_; v_store(vx_setall_res1_.d, vx_setall_res1);
         Data<R> vx_setall_res2_; v_store(vx_setall_res2_.d, vx_setall_res2);
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ((LaneType)11, vx_setall_res1_[i]);
@@ -378,14 +451,14 @@ template<typename R> struct TheTest
             uint64 a = CV_BIG_INT(0x7fffffffffffffff);
             uint64 b = (uint64)CV_BIG_INT(0xcfffffffffffffff);
             v_uint64x2 uint64_vec(a, b);
-            EXPECT_EQ(a, uint64_vec.get0());
+            EXPECT_EQ(a, v_get0(uint64_vec));
             EXPECT_EQ(b, v_extract_n<1>(uint64_vec));
         }
         {
             int64 a = CV_BIG_INT(0x7fffffffffffffff);
             int64 b = CV_BIG_INT(-1);
             v_int64x2 int64_vec(a, b);
-            EXPECT_EQ(a, int64_vec.get0());
+            EXPECT_EQ(a, v_get0(int64_vec));
             EXPECT_EQ(b, v_extract_n<1>(int64_vec));
         }
 #endif
@@ -404,8 +477,8 @@ template<typename R> struct TheTest
         R a = data1, b = data2, c = data3;
         R d = data1, e = data2, f = data3, g = data4;
 
-        LaneType buf3[R::nlanes * 3];
-        LaneType buf4[R::nlanes * 4];
+        LaneType buf3[VTraits<R>::max_nlanes * 3];
+        LaneType buf4[VTraits<R>::max_nlanes * 4];
 
         v_store_interleave(buf3, a, b, c);
         v_store_interleave(buf4, d, e, f, g);
@@ -416,7 +489,7 @@ template<typename R> struct TheTest
         v_load_deinterleave(buf3, a, b, c);
         v_load_deinterleave(buf4, d, e, f, g);
 
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(data1, Data<R>(a));
@@ -440,7 +513,7 @@ template<typename R> struct TheTest
 
         R a = data1, b = data2;
 
-        LaneType buf2[R::nlanes * 2];
+        LaneType buf2[VTraits<R>::max_nlanes * 2];
 
         v_store_interleave(buf2, a, b);
 
@@ -449,7 +522,7 @@ template<typename R> struct TheTest
 
         v_load_deinterleave(buf2, a, b);
 
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(data1, Data<R>(a));
@@ -475,7 +548,7 @@ template<typename R> struct TheTest
         f = v_expand_high(a);
 
         Data<Rx2> resC = c, resD = d, resE = e, resF = f;
-        const int n = Rx2::nlanes;
+        const int n = VTraits<Rx2>::vlanes();
         for (int i = 0; i < n; ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
@@ -494,7 +567,7 @@ template<typename R> struct TheTest
         typedef typename V_RegTraits<R>::q_reg Rx4;
         Data<R> data;
         Data<Rx4> out = vx_load_expand_q(data.d);
-        const int n = Rx4::nlanes;
+        const int n = VTraits<Rx4>::vlanes();
         for (int i = 0; i < n; ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
@@ -510,8 +583,8 @@ template<typename R> struct TheTest
         dataB.reverse();
         R a = dataA, b = dataB;
 
-        Data<R> resC = a + b, resD = a - b;
-        for (int i = 0; i < R::nlanes; ++i)
+        Data<R> resC = v_add(a, b), resD = v_sub(a, b);
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(saturate_cast<LaneType>(dataA[i] + dataB[i]), resC[i]);
@@ -530,7 +603,7 @@ template<typename R> struct TheTest
         Data<R> resC = v_add_wrap(a, b),
                 resD = v_sub_wrap(a, b),
                 resE = v_mul_wrap(a, b);
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ((LaneType)(dataA[i] + dataB[i]), resC[i]);
@@ -547,8 +620,8 @@ template<typename R> struct TheTest
         dataB.reverse();
         R a = dataA, b = dataB;
 
-        Data<R> resC = a * b;
-        for (int i = 0; i < R::nlanes; ++i)
+        Data<R> resC = v_mul(a, b);
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(saturate_cast<LaneType>(dataA[i] * dataB[i]), resC[i]);
@@ -563,8 +636,8 @@ template<typename R> struct TheTest
         dataB.reverse();
         R a = dataA, b = dataB;
 
-        Data<R> resC = a / b;
-        for (int i = 0; i < R::nlanes; ++i)
+        Data<R> resC = v_div(a, b);
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(dataA[i] / dataB[i], resC[i]);
@@ -583,12 +656,12 @@ template<typename R> struct TheTest
         v_mul_expand(a, b, c, d);
 
         Data<Rx2> resC = c, resD = d;
-        const int n = R::nlanes / 2;
+        const int n = VTraits<R>::vlanes() / 2;
         for (int i = 0; i < n; ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
-            EXPECT_EQ((typename Rx2::lane_type)dataA[i] * dataB[i], resC[i]);
-            EXPECT_EQ((typename Rx2::lane_type)dataA[i + n] * dataB[i + n], resD[i]);
+            EXPECT_EQ((typename VTraits<Rx2>::lane_type)dataA[i] * dataB[i], resC[i]);
+            EXPECT_EQ((typename VTraits<Rx2>::lane_type)dataA[i + n] * dataB[i + n], resD[i]);
         }
 
         return *this;
@@ -603,11 +676,11 @@ template<typename R> struct TheTest
         R c = v_mul_hi(a, b);
 
         Data<R> resC = c;
-        const int n = R::nlanes / 2;
+        const int n = VTraits<R>::vlanes() / 2;
         for (int i = 0; i < n; ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
-            EXPECT_EQ((typename R::lane_type)((dataA[i] * dataB[i]) >> 16), resC[i]);
+            EXPECT_EQ((typename VTraits<R>::lane_type)((dataA[i] * dataB[i]) >> 16), resC[i]);
         }
 
         return *this;
@@ -616,17 +689,19 @@ template<typename R> struct TheTest
     TheTest & test_abs()
     {
         typedef typename V_RegTraits<R>::u_reg Ru;
-        typedef typename Ru::lane_type u_type;
+        typedef typename VTraits<Ru>::lane_type u_type;
+        typedef typename VTraits<R>::lane_type R_type;
         Data<R> dataA, dataB(10);
         R a = dataA, b = dataB;
-        a = a - b;
+        a = v_sub(a, b);
 
         Data<Ru> resC = v_abs(a);
 
-        for (int i = 0; i < Ru::nlanes; ++i)
+        for (int i = 0; i < VTraits<Ru>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
-            EXPECT_EQ((u_type)std::abs(dataA[i] - dataB[i]), resC[i]);
+            R_type ssub = dataA[i] - dataB[i] < std::numeric_limits<R_type>::min() ? std::numeric_limits<R_type>::min() : dataA[i] - dataB[i];
+            EXPECT_EQ((u_type)std::abs(ssub), resC[i]);
         }
 
         return *this;
@@ -640,9 +715,9 @@ template<typename R> struct TheTest
         dataA[0] = static_cast<LaneType>(std::numeric_limits<LaneType>::max());
         R a = dataA;
 
-        Data<R> resB = a << s, resC = v_shl<s>(a), resD = a >> s, resE = v_shr<s>(a);
+        Data<R> resB = v_shl<s>(a), resC = v_shl<s>(a), resD = v_shr<s>(a), resE = v_shr<s>(a);
 
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(static_cast<LaneType>(dataA[i] << s), resB[i]);
@@ -660,14 +735,14 @@ template<typename R> struct TheTest
         dataB += 1;
         R a = dataA, b = dataB;
 
-        Data<R> resC = (a == b);
-        Data<R> resD = (a != b);
-        Data<R> resE = (a > b);
-        Data<R> resF = (a >= b);
-        Data<R> resG = (a < b);
-        Data<R> resH = (a <= b);
+        Data<R> resC = (v_eq(a, b));
+        Data<R> resD = (v_ne(a, b));
+        Data<R> resE = (v_gt(a, b));
+        Data<R> resF = (v_ge(a, b));
+        Data<R> resG = (v_lt(a, b));
+        Data<R> resH = (v_le(a, b));
 
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(dataA[i] == dataB[i], resC[i] != 0);
@@ -683,23 +758,23 @@ template<typename R> struct TheTest
     TheTest & test_dotprod()
     {
         typedef typename V_RegTraits<R>::w_reg Rx2;
-        typedef typename Rx2::lane_type w_type;
+        typedef typename VTraits<Rx2>::lane_type w_type;
 
         Data<R> dataA, dataB;
-        dataA += std::numeric_limits<LaneType>::max() - R::nlanes;
-        dataB += std::numeric_limits<LaneType>::min() + R::nlanes;
+        dataA += std::numeric_limits<LaneType>::max() - VTraits<R>::vlanes();
+        dataB += std::numeric_limits<LaneType>::min() + VTraits<R>::vlanes();
         R a = dataA, b = dataB;
 
         Data<Rx2> dataC;
         dataC += std::numeric_limits<w_type>::is_signed ?
                     std::numeric_limits<w_type>::min() :
-                    std::numeric_limits<w_type>::max() - R::nlanes * (dataB[0] + 1);
+                    std::numeric_limits<w_type>::max() - VTraits<R>::vlanes() * (dataB[0] + 1);
         Rx2 c = dataC;
 
         Data<Rx2> resD = v_dotprod(a, b),
                   resE = v_dotprod(a, b, c);
 
-        const int n = R::nlanes / 2;
+        const int n = VTraits<R>::vlanes() / 2;
         w_type sumAB = 0, sumABC = 0, tmp_sum;
         for (int i = 0; i < n; ++i)
         {
@@ -725,11 +800,11 @@ template<typename R> struct TheTest
     TheTest & test_dotprod_expand()
     {
         typedef typename V_RegTraits<R>::q_reg Rx4;
-        typedef typename Rx4::lane_type l4_type;
+        typedef typename VTraits<Rx4>::lane_type l4_type;
 
         Data<R> dataA, dataB;
-        dataA += std::numeric_limits<LaneType>::max() - R::nlanes;
-        dataB += std::numeric_limits<LaneType>::min() + R::nlanes;
+        dataA += std::numeric_limits<LaneType>::max() - VTraits<R>::vlanes();
+        dataB += std::numeric_limits<LaneType>::min() + VTraits<R>::vlanes();
         R a = dataA, b = dataB;
 
         Data<Rx4> dataC;
@@ -739,7 +814,7 @@ template<typename R> struct TheTest
                   resE = v_dotprod_expand(a, b, c);
 
         l4_type sumAB = 0, sumABC = 0, tmp_sum;
-        for (int i = 0; i < Rx4::nlanes; ++i)
+        for (int i = 0; i < VTraits<Rx4>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             tmp_sum  = (l4_type)dataA[i*4]     * (l4_type)dataB[i*4]     +
@@ -766,7 +841,7 @@ template<typename R> struct TheTest
     {
     #if CV_SIMD_64F
         Data<R> dataA, dataB;
-        dataA += std::numeric_limits<LaneType>::max() - R::nlanes;
+        dataA += std::numeric_limits<LaneType>::max() - VTraits<R>::vlanes();
         dataB += std::numeric_limits<LaneType>::min();
         R a = dataA, b = dataB;
 
@@ -777,7 +852,7 @@ template<typename R> struct TheTest
                         resB = v_dotprod_expand(b, b),
                         resC = v_dotprod_expand(a, b, c);
 
-        const int n = R::nlanes / 2;
+        const int n = VTraits<R>::vlanes() / 2;
         for (int i = 0; i < n; ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
@@ -797,8 +872,8 @@ template<typename R> struct TheTest
         Data<R> dataA, dataB(2);
         R a = dataA, b = dataB;
 
-        Data<R> resC = a & b, resD = a | b, resE = a ^ b, resF = ~a;
-        for (int i = 0; i < R::nlanes; ++i)
+        Data<R> resC = v_and(a, b), resD = v_or(a, b), resE = v_xor(a, b), resF = v_not(a);
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(dataA[i] & dataB[i], resC[i]);
@@ -817,7 +892,7 @@ template<typename R> struct TheTest
         R a = dataA, d = dataD;
 
         Data<R> resB = v_sqrt(a), resC = v_invsqrt(a), resE = v_abs(d);
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_COMPARE_EQ((float)std::sqrt(dataA[i]), (float)resB[i]);
@@ -835,7 +910,7 @@ template<typename R> struct TheTest
         R a = dataA, b = dataB;
 
         Data<R> resC = v_min(a, b), resD = v_max(a, b);
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(std::min(dataA[i], dataB[i]), resC[i]);
@@ -863,7 +938,7 @@ template<typename R> struct TheTest
         R a = dataA;
 
         Data<Ru> resB = v_popcount(a);
-        for (int i = 0; i < Ru::nlanes; ++i)
+        for (int i = 0; i < VTraits<Ru>::vlanes(); ++i)
             EXPECT_EQ(popcountTable[i + 1], resB[i]);
 
         return *this;
@@ -872,7 +947,7 @@ template<typename R> struct TheTest
     TheTest & test_absdiff()
     {
         typedef typename V_RegTraits<R>::u_reg Ru;
-        typedef typename Ru::lane_type u_type;
+        typedef typename VTraits<Ru>::lane_type u_type;
         Data<R> dataA(std::numeric_limits<LaneType>::max()),
                 dataB(std::numeric_limits<LaneType>::min());
         dataA[0] = (LaneType)-1;
@@ -882,7 +957,7 @@ template<typename R> struct TheTest
         R a = dataA, b = dataB;
         Data<Ru> resC = v_absdiff(a, b);
         const u_type mask = std::numeric_limits<LaneType>::is_signed ? (u_type)(1 << (sizeof(u_type)*8 - 1)) : 0;
-        for (int i = 0; i < Ru::nlanes; ++i)
+        for (int i = 0; i < VTraits<Ru>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             u_type uA = dataA[i] ^ mask;
@@ -902,7 +977,7 @@ template<typename R> struct TheTest
         dataB[1] = -2;
         R a = dataA, b = dataB;
         Data<R> resC = v_absdiff(a, b);
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(dataA[i] > dataB[i] ? dataA[i] - dataB[i] : dataB[i] - dataA[i], resC[i]);
@@ -920,7 +995,7 @@ template<typename R> struct TheTest
         dataB[1] = (LaneType)-2;
         R a = dataA, b = dataB;
         Data<R> resC = v_absdiffs(a, b);
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             EXPECT_EQ(saturate_cast<LaneType>(std::abs(dataA[i] - dataB[i])), resC[i]);
         }
@@ -930,27 +1005,40 @@ template<typename R> struct TheTest
     TheTest & test_reduce()
     {
         Data<R> dataA;
+        LaneType min = VTraits<R>::vlanes(), max = 0;
         int sum = 0;
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
+            min = std::min<LaneType>(min, dataA[i]);
+            max = std::max<LaneType>(max, dataA[i]);
             sum += (int)(dataA[i]);   // To prevent a constant overflow with int8
         }
         R a = dataA;
-        EXPECT_EQ((LaneType)1, (LaneType)v_reduce_min(a));
-        EXPECT_EQ((LaneType)(R::nlanes), (LaneType)v_reduce_max(a));
+        EXPECT_EQ((LaneType)min, (LaneType)v_reduce_min(a));
+        EXPECT_EQ((LaneType)max, (LaneType)v_reduce_max(a));
         EXPECT_EQ((int)(sum), (int)v_reduce_sum(a));
-        dataA[0] += R::nlanes;
+        dataA[0] += VTraits<R>::vlanes();
         R an = dataA;
-        EXPECT_EQ((LaneType)2, (LaneType)v_reduce_min(an));
+        min = VTraits<R>::vlanes();
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
+        {
+            min = std::min<LaneType>(min, dataA[i]);
+        }
+        EXPECT_EQ((LaneType)min, (LaneType)v_reduce_min(an));
         return *this;
     }
 
     TheTest & test_reduce_sad()
     {
-        Data<R> dataA, dataB(R::nlanes/2);
+        Data<R> dataA, dataB(VTraits<R>::vlanes()/2);
         R a = dataA;
         R b = dataB;
-        EXPECT_EQ((unsigned)(R::nlanes*R::nlanes/4), v_reduce_sad(a, b));
+        uint sum = 0;
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
+        {
+            sum += std::abs(int(dataA[i] - dataB[i]));
+        }
+        EXPECT_EQ(sum, v_reduce_sad(a, b));
         return *this;
     }
 
@@ -958,8 +1046,8 @@ template<typename R> struct TheTest
     {
         typedef typename V_RegTraits<R>::int_reg int_reg;
         typedef typename V_RegTraits<int_reg>::u_reg uint_reg;
-        typedef typename int_reg::lane_type int_type;
-        typedef typename uint_reg::lane_type uint_type;
+        typedef typename VTraits<int_reg>::lane_type int_type;
+        typedef typename VTraits<uint_reg>::lane_type uint_type;
 
         Data<R> dataA, dataB(0), dataC, dataD(1), dataE(2);
         dataA[1] *= (LaneType)-1;
@@ -971,18 +1059,18 @@ template<typename R> struct TheTest
         all1s;
         all1s.ui = (uint_type)-1;
         LaneType mask_one = all1s.l;
-        dataB[R::nlanes - 1] = mask_one;
+        dataB[VTraits<R>::vlanes() - 1] = mask_one;
         R l = dataB;
         dataB[1] = mask_one;
-        dataB[R::nlanes / 2] = mask_one;
+        dataB[VTraits<R>::vlanes() / 2] = mask_one;
         dataC *= (LaneType)-1;
         R a = dataA, b = dataB, c = dataC, d = dataD, e = dataE;
-        dataC[R::nlanes - 1] = 0;
+        dataC[VTraits<R>::vlanes() - 1] = 0;
         R nl = dataC;
 
         EXPECT_EQ(2, v_signmask(a));
-#if CV_SIMD_WIDTH <= 32
-        EXPECT_EQ(2 | (1 << (R::nlanes / 2)) | (1 << (R::nlanes - 1)), v_signmask(b));
+#if (CV_SIMD_WIDTH <= 32) && (!CV_SIMD_SCALABLE)
+        EXPECT_EQ(2 | (1 << (VTraits<R>::vlanes() / 2)) | (1 << (VTraits<R>::vlanes() - 1)), v_signmask(b));
 #endif
 
         EXPECT_EQ(false, v_check_all(a));
@@ -996,7 +1084,7 @@ template<typename R> struct TheTest
         EXPECT_EQ(true, v_check_any(l));
         R f = v_select(b, d, e);
         Data<R> resF = f;
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             int_type m2 = dataB.as_int(i);
@@ -1011,7 +1099,7 @@ template<typename R> struct TheTest
     {
         SCOPED_TRACE(s);
         typedef typename V_RegTraits<R>::w_reg Rx2;
-        typedef typename Rx2::lane_type w_type;
+        typedef typename VTraits<Rx2>::lane_type w_type;
         Data<Rx2> dataA, dataB;
         dataA += std::numeric_limits<LaneType>::is_signed ? -10 : 10;
         dataB *= 10;
@@ -1027,7 +1115,7 @@ template<typename R> struct TheTest
         Data<R> resF(0);
         v_rshr_pack_store<s>(resF.d, b);
 
-        const int n = Rx2::nlanes;
+        const int n = VTraits<Rx2>::vlanes();
         const w_type add = (w_type)1 << (s - 1);
         for (int i = 0; i < n; ++i)
         {
@@ -1051,7 +1139,7 @@ template<typename R> struct TheTest
         //typedef typename V_RegTraits<LaneType>::w_type LaneType_w;
         typedef typename V_RegTraits<R>::w_reg R2;
         typedef typename V_RegTraits<R2>::int_reg Ri2;
-        typedef typename Ri2::lane_type w_type;
+        typedef typename VTraits<Ri2>::lane_type w_type;
 
         Data<Ri2> dataA, dataB;
         dataA += -10;
@@ -1068,7 +1156,7 @@ template<typename R> struct TheTest
         Data<R> resF(0);
         v_rshr_pack_u_store<s>(resF.d, b);
 
-        const int n = Ri2::nlanes;
+        const int n = VTraits<Ri2>::vlanes();
         const w_type add = (w_type)1 << (s - 1);
         for (int i = 0; i < n; ++i)
         {
@@ -1090,26 +1178,26 @@ template<typename R> struct TheTest
     {
         // 16-bit
         Data<R> dataA, dataB;
-        dataB.fill(0, R::nlanes / 2);
+        dataB.fill(0, VTraits<R>::vlanes() / 2);
 
         R a = dataA, b = dataB;
-        Data<R> maskA = a == b, maskB = a != b;
+        Data<R> maskA = v_eq(a, b), maskB = v_ne(a, b);
 
         a = maskA; b = maskB;
         Data<R> res  = v_pack_b(v_reinterpret_as_u16(a), v_reinterpret_as_u16(b));
-        for (int i = 0; i < v_uint16::nlanes; ++i)
+        for (int i = 0; i < VTraits<v_uint16>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(maskA[i * 2], res[i]);
-            EXPECT_EQ(maskB[i * 2], res[i + v_uint16::nlanes]);
+            EXPECT_EQ(maskB[i * 2], res[i + VTraits<v_uint16>::vlanes()]);
         }
 
         // 32-bit
         Data<R> dataC, dataD;
-        dataD.fill(0, R::nlanes / 2);
+        dataD.fill(0, VTraits<R>::vlanes() / 2);
 
         R c = dataC, d = dataD;
-        Data<R> maskC = c == d, maskD = c != d;
+        Data<R> maskC = v_eq(c, d), maskD = v_ne(c, d);
 
         c = maskC; d = maskD;
         res = v_pack_b
@@ -1118,21 +1206,21 @@ template<typename R> struct TheTest
             v_reinterpret_as_u32(c), v_reinterpret_as_u32(d)
         );
 
-        for (int i = 0; i < v_uint32::nlanes; ++i)
+        for (int i = 0; i < VTraits<v_uint32>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(maskA[i * 4], res[i]);
-            EXPECT_EQ(maskB[i * 4], res[i + v_uint32::nlanes]);
-            EXPECT_EQ(maskC[i * 4], res[i + v_uint32::nlanes * 2]);
-            EXPECT_EQ(maskD[i * 4], res[i + v_uint32::nlanes * 3]);
+            EXPECT_EQ(maskB[i * 4], res[i + VTraits<v_uint32>::vlanes()]);
+            EXPECT_EQ(maskC[i * 4], res[i + VTraits<v_uint32>::vlanes() * 2]);
+            EXPECT_EQ(maskD[i * 4], res[i + VTraits<v_uint32>::vlanes() * 3]);
         }
 
         // 64-bit
         Data<R> dataE, dataF, dataG(0), dataH(0xFF);
-        dataF.fill(0, R::nlanes / 2);
+        dataF.fill(0, VTraits<R>::vlanes() / 2);
 
         R e = dataE, f = dataF, g = dataG, h = dataH;
-        Data<R> maskE = e == f, maskF = e != f;
+        Data<R> maskE = v_eq(e, f), maskF = v_ne(e, f);
 
         e = maskE; f = maskF;
         res = v_pack_b
@@ -1143,18 +1231,18 @@ template<typename R> struct TheTest
             v_reinterpret_as_u64(g), v_reinterpret_as_u64(h)
         );
 
-        for (int i = 0; i < v_uint64::nlanes; ++i)
+        for (int i = 0; i < VTraits<v_uint64>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(maskA[i * 8], res[i]);
-            EXPECT_EQ(maskB[i * 8], res[i + v_uint64::nlanes]);
-            EXPECT_EQ(maskC[i * 8], res[i + v_uint64::nlanes * 2]);
-            EXPECT_EQ(maskD[i * 8], res[i + v_uint64::nlanes * 3]);
-
-            EXPECT_EQ(maskE[i * 8], res[i + v_uint64::nlanes * 4]);
-            EXPECT_EQ(maskF[i * 8], res[i + v_uint64::nlanes * 5]);
-            EXPECT_EQ(dataG[i * 8], res[i + v_uint64::nlanes * 6]);
-            EXPECT_EQ(dataH[i * 8], res[i + v_uint64::nlanes * 7]);
+            EXPECT_EQ(maskB[i * 8], res[i + VTraits<v_uint64>::vlanes()]);
+            EXPECT_EQ(maskC[i * 8], res[i + VTraits<v_uint64>::vlanes() * 2]);
+            EXPECT_EQ(maskD[i * 8], res[i + VTraits<v_uint64>::vlanes() * 3]);
+
+            EXPECT_EQ(maskE[i * 8], res[i + VTraits<v_uint64>::vlanes() * 4]);
+            EXPECT_EQ(maskF[i * 8], res[i + VTraits<v_uint64>::vlanes() * 5]);
+            EXPECT_EQ(dataG[i * 8], res[i + VTraits<v_uint64>::vlanes() * 6]);
+            EXPECT_EQ(dataH[i * 8], res[i + VTraits<v_uint64>::vlanes() * 7]);
         }
 
         return *this;
@@ -1174,7 +1262,7 @@ template<typename R> struct TheTest
 
         Data<R> resC = c, resD = d, resE = e, resF = f, resLo = lo, resHi = hi;
 
-        const int n = R::nlanes/2;
+        const int n = VTraits<R>::vlanes()/2;
         for (int i = 0; i < n; ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
@@ -1204,10 +1292,10 @@ template<typename R> struct TheTest
 
         Data<R> resB = v_reverse(a);
 
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
-            EXPECT_EQ(dataA[R::nlanes - i - 1], resB[i]);
+            EXPECT_EQ(dataA[VTraits<R>::vlanes() - i - 1], resB[i]);
         }
 
         return *this;
@@ -1223,11 +1311,11 @@ template<typename R> struct TheTest
 
         Data<R> resC = v_extract<s>(a, b);
 
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
-            if (i + s >= R::nlanes)
-                EXPECT_EQ(dataB[i - R::nlanes + s], resC[i]);
+            if (i + s >= VTraits<R>::vlanes())
+                EXPECT_EQ(dataB[i - VTraits<R>::vlanes() + s], resC[i]);
             else
                 EXPECT_EQ(dataA[i + s], resC[i]);
         }
@@ -1249,16 +1337,16 @@ template<typename R> struct TheTest
         Data<R> resE = v_rotate_left<s>(a);
         Data<R> resF = v_rotate_left<s>(a, b);
 
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
-            if (i + s >= R::nlanes)
+            if (i + s >= VTraits<R>::vlanes())
             {
                 EXPECT_EQ((LaneType)0, resC[i]);
-                EXPECT_EQ(dataB[i - R::nlanes + s], resD[i]);
+                EXPECT_EQ(dataB[i - VTraits<R>::vlanes() + s], resD[i]);
 
-                EXPECT_EQ((LaneType)0, resE[i - R::nlanes + s]);
-                EXPECT_EQ(dataB[i], resF[i - R::nlanes + s]);
+                EXPECT_EQ((LaneType)0, resE[i - VTraits<R>::vlanes() + s]);
+                EXPECT_EQ(dataB[i], resF[i - VTraits<R>::vlanes() + s]);
             }
             else
             {
@@ -1287,6 +1375,19 @@ template<typename R> struct TheTest
         return *this;
     }
 
+    TheTest & test_extract_highest()
+    {
+        Data<R> dataA;
+        LaneType test_value = (LaneType)(VTraits<R>::vlanes()-1 + 50);
+        dataA[VTraits<R>::vlanes()-1] = test_value;
+        R a = dataA;
+
+        LaneType res = v_extract_highest(a);
+        EXPECT_EQ(test_value, res);
+
+        return *this;
+    }
+
     template<int s>
     TheTest & test_broadcast_element()
     {
@@ -1298,7 +1399,24 @@ template<typename R> struct TheTest
 
         Data<R> res = v_broadcast_element<s>(a);
 
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
+        {
+            SCOPED_TRACE(i);
+            EXPECT_EQ(test_value, res[i]);
+        }
+        return *this;
+    }
+
+    TheTest & test_broadcast_highest()
+    {
+        Data<R> dataA;
+        LaneType test_value = (LaneType)(VTraits<R>::vlanes()-1 + 50);
+        dataA[VTraits<R>::vlanes()-1] = test_value;
+        R a = dataA;
+
+        Data<R> res = v_broadcast_highest(a);
+
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             SCOPED_TRACE(i);
             EXPECT_EQ(test_value, res[i]);
@@ -1323,11 +1441,11 @@ template<typename R> struct TheTest
                 resG = v_sqr_magnitude(a1, a2),
                 resH = v_muladd(a1, a2, a3);
 
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(cvRound(data1[i]), resB[i]);
-            EXPECT_EQ((typename Ri::lane_type)data1[i], resC[i]);
+            EXPECT_EQ((typename VTraits<Ri>::lane_type)data1[i], resC[i]);
             EXPECT_EQ(cvFloor(data1[i]), resD[i]);
             EXPECT_EQ(cvCeil(data1[i]), resE[i]);
 
@@ -1347,11 +1465,11 @@ template<typename R> struct TheTest
         R a = dataA;
         Rt b = v_cvt_f32(a);
         Data<Rt> resB = b;
-        int n = std::min<int>(Rt::nlanes, R::nlanes);
+        int n = std::min<int>(VTraits<Rt>::vlanes(), VTraits<R>::vlanes());
         for (int i = 0; i < n; ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
-            EXPECT_EQ((typename Rt::lane_type)dataA[i], resB[i]);
+            EXPECT_EQ((typename VTraits<Rt>::lane_type)dataA[i], resB[i]);
         }
         return *this;
     }
@@ -1367,16 +1485,16 @@ template<typename R> struct TheTest
         Rt c = v_cvt_f64_high(a);
         Data<Rt> resB = b;
         Data<Rt> resC = c;
-        int n = std::min<int>(Rt::nlanes, R::nlanes);
+        int n = std::min<int>(VTraits<Rt>::vlanes(), VTraits<R>::vlanes());
         for (int i = 0; i < n; ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
-            EXPECT_EQ((typename Rt::lane_type)dataA[i], resB[i]);
+            EXPECT_EQ((typename VTraits<Rt>::lane_type)dataA[i], resB[i]);
         }
         for (int i = 0; i < n; ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
-            EXPECT_EQ((typename Rt::lane_type)dataA[i+n], resC[i]);
+            EXPECT_EQ((typename VTraits<Rt>::lane_type)dataA[i+n], resC[i]);
         }
 #endif
         return *this;
@@ -1387,7 +1505,7 @@ template<typename R> struct TheTest
 #if CV_SIMD_64F
         Data<R> dataA(std::numeric_limits<LaneType>::max()),
                 dataB(std::numeric_limits<LaneType>::min());
-        dataB += R::nlanes;
+        dataB += VTraits<R>::vlanes();
 
         R a = dataA, b = dataB;
         v_float64 c = v_cvt_f64(a), d = v_cvt_f64(b);
@@ -1395,7 +1513,7 @@ template<typename R> struct TheTest
         Data<v_float64> resC = c;
         Data<v_float64> resD = d;
 
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ((double)dataA[i], resC[i]);
@@ -1414,8 +1532,9 @@ template<typename R> struct TheTest
         R v = dataV, a = dataA, b = dataB, c = dataC, d = dataD;
 
         Data<R> res = v_matmul(v, a, b, c, d);
-        for (int i = 0; i < R::nlanes; i += 4)
-        {
+        // for (int i = 0; i < VTraits<R>::vlanes(); i += 4)
+        // {
+            int i = 0;
             for (int j = i; j < i + 4; ++j)
             {
                 SCOPED_TRACE(cv::format("i=%d j=%d", i, j));
@@ -1425,11 +1544,12 @@ template<typename R> struct TheTest
                              + dataV[i + 3] * dataD[j];
                 EXPECT_COMPARE_EQ(val, res[j]);
             }
-        }
+        // }
 
         Data<R> resAdd = v_matmuladd(v, a, b, c, d);
-        for (int i = 0; i < R::nlanes; i += 4)
-        {
+        // for (int i = 0; i < VTraits<R>::vlanes(); i += 4)
+        // {
+            i = 0;
             for (int j = i; j < i + 4; ++j)
             {
                 SCOPED_TRACE(cv::format("i=%d j=%d", i, j));
@@ -1439,7 +1559,7 @@ template<typename R> struct TheTest
                              + dataD[j];
                 EXPECT_COMPARE_EQ(val, resAdd[j]);
             }
-        }
+        // }
         return *this;
     }
 
@@ -1455,8 +1575,9 @@ template<typename R> struct TheTest
                        e, f, g, h);
 
         Data<R> res[4] = {e, f, g, h};
-        for (int i = 0; i < R::nlanes; i += 4)
-        {
+        // for (int i = 0; i < VTraits<R>::vlanes(); i += 4)
+        // {
+            int i = 0;
             for (int j = 0; j < 4; ++j)
             {
                 SCOPED_TRACE(cv::format("i=%d j=%d", i, j));
@@ -1465,7 +1586,7 @@ template<typename R> struct TheTest
                 EXPECT_EQ(dataC[i + j], res[j][i + 2]);
                 EXPECT_EQ(dataD[i + j], res[j][i + 3]);
             }
-        }
+        // }
         return *this;
     }
 
@@ -1479,14 +1600,15 @@ template<typename R> struct TheTest
         R a = dataA, b = dataB, c = dataC, d = dataD;
         Data<R> res = v_reduce_sum4(a, b, c, d);
 
-        for (int i = 0; i < R::nlanes; i += 4)
-        {
+        // for (int i = 0; i < VTraits<R>::vlanes(); i += 4)
+        // {
+            int i = 0;
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_COMPARE_EQ(dataA.sum(i, 4), res[i]);
             EXPECT_COMPARE_EQ(dataB.sum(i, 4), res[i + 1]);
             EXPECT_COMPARE_EQ(dataC.sum(i, 4), res[i + 2]);
             EXPECT_COMPARE_EQ(dataD.sum(i, 4), res[i + 3]);
-        }
+        // }
         return *this;
     }
 
@@ -1495,19 +1617,19 @@ template<typename R> struct TheTest
         printf("test_loadstore_fp16_f32 ...\n");
         AlignedData<v_uint16> data; data.a.clear();
         data.a.d[0] = 0x3c00; // 1.0
-        data.a.d[R::nlanes - 1] = (unsigned short)0xc000; // -2.0
+        data.a.d[VTraits<R>::vlanes() - 1] = (unsigned short)0xc000; // -2.0
         AlignedData<v_float32> data_f32; data_f32.a.clear();
         AlignedData<v_uint16> out;
 
         R r1 = vx_load_expand((const cv::float16_t*)data.a.d);
         R r2(r1);
-        EXPECT_EQ(1.0f, r1.get0());
+        EXPECT_EQ(1.0f, v_get0(r1));
         v_store(data_f32.a.d, r2);
-        EXPECT_EQ(-2.0f, data_f32.a.d[R::nlanes - 1]);
+        EXPECT_EQ(-2.0f, data_f32.a.d[VTraits<R>::vlanes() - 1]);
 
         out.a.clear();
         v_pack_store((cv::float16_t*)out.a.d, r2);
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             EXPECT_EQ(data.a[i], out.a[i]) << "i=" << i;
         }
@@ -1523,18 +1645,18 @@ template<typename R> struct TheTest
         AlignedData<R> out;
 
         // check if addresses are aligned and unaligned respectively
-        EXPECT_EQ((size_t)0, (size_t)&data.a.d % CV_SIMD_WIDTH);
-        EXPECT_NE((size_t)0, (size_t)&data.u.d % CV_SIMD_WIDTH);
-        EXPECT_EQ((size_t)0, (size_t)&out.a.d % CV_SIMD_WIDTH);
-        EXPECT_NE((size_t)0, (size_t)&out.u.d % CV_SIMD_WIDTH);
+        EXPECT_EQ((size_t)0, (size_t)&data.a.d % VTraits<R>::max_nlanes);
+        EXPECT_NE((size_t)0, (size_t)&data.u.d % VTraits<R>::max_nlanes);
+        EXPECT_EQ((size_t)0, (size_t)&out.a.d % VTraits<R>::max_nlanes);
+        EXPECT_NE((size_t)0, (size_t)&out.u.d % VTraits<R>::max_nlanes);
 
         // check some initialization methods
         R r1 = data.u;
         R r2 = vx_load_expand((const float16_t*)data.a.d);
         R r3(r2);
-        EXPECT_EQ(data.u[0], r1.get0());
-        EXPECT_EQ(data.a[0], r2.get0());
-        EXPECT_EQ(data.a[0], r3.get0());
+        EXPECT_EQ(data.u[0], v_get0(r1));
+        EXPECT_EQ(data.a[0], v_get0(r2));
+        EXPECT_EQ(data.a[0], v_get0(r3));
 
         // check some store methods
         out.a.clear();
@@ -1552,8 +1674,8 @@ template<typename R> struct TheTest
         v_float32 r1 = vx_load(data.a.d);
         v_float16 r2 = v_cvt_f16(r1, vx_setzero_f32());
         v_float32 r3 = v_cvt_f32(r2);
-        EXPECT_EQ(0x3c00, r2.get0());
-        EXPECT_EQ(r3.get0(), r1.get0());
+        EXPECT_EQ(0x3c00, v_get0(r2));
+        EXPECT_EQ(v_get0(r3), v_get0(r1));
 
         return *this;
     }
@@ -1565,7 +1687,7 @@ template<typename R> struct TheTest
         Data<R> dataA, dataB;
         R a = dataA, b = dataB;
 
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             dataA[i] = dataB[i];
         }
@@ -1576,14 +1698,14 @@ template<typename R> struct TheTest
         Data<R> resC = (a == b);
         Data<R> resD = (a != b);
 
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(dataA[i] == dataB[i], resC[i] != 0);
             EXPECT_EQ(dataA[i] != dataB[i], resD[i] != 0);
         }
 
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             dataA[i] = dataB[i] = (LaneType)-1;
         }
@@ -1593,7 +1715,7 @@ template<typename R> struct TheTest
         resC = (a == b);
         resD = (a != b);
 
-        for (int i = 0; i < R::nlanes; ++i)
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(dataA[i] == dataB[i], resC[i] != 0);
@@ -1605,10 +1727,8 @@ template<typename R> struct TheTest
 };
 
 
-#if 1
+#if CV_SIMD
 #define DUMP_ENTRY(type) printf("SIMD%d: %s\n", 8*(int)sizeof(v_uint8), CV__TRACE_FUNCTION);
-#endif
-
 //=============  8-bit integer =====================================================================
 
 void test_hal_intrin_uint8()
@@ -1955,6 +2075,122 @@ void test_hal_intrin_float16()
     std::cout << "SKIP: CV_FP16 is not available" << std::endl;
 #endif
 }
+#elif CV_SIMD_SCALABLE //Temporary
+#define DUMP_ENTRY(type) printf("SIMD: %s\n", CV__TRACE_FUNCTION);
+
+
+//=============  8-bit integer =====================================================================
+
+void test_hal_intrin_uint8()
+{
+    DUMP_ENTRY(v_uint8);
+    // typedef v_uint8 R;
+    TheTest<v_uint8>()
+        .test_loadstore()
+        .test_min_max()
+        ;
+}
+
+void test_hal_intrin_int8()
+{
+    DUMP_ENTRY(v_int8);
+    // typedef v_int8 R;
+    TheTest<v_int8>()
+        .test_loadstore()
+        .test_min_max()
+        ;
+}
+
+//============= 16-bit integer =====================================================================
+
+void test_hal_intrin_uint16()
+{
+    DUMP_ENTRY(v_uint16);
+    // typedef v_uint16 R;
+    TheTest<v_uint16>()
+        .test_loadstore()
+        .test_min_max()
+        ;
+}
+
+void test_hal_intrin_int16()
+{
+    DUMP_ENTRY(v_int16);
+    // typedef v_int16 R;
+    TheTest<v_int16>()
+        .test_loadstore()
+        .test_min_max()
+        ;
+}
+
+//============= 32-bit integer =====================================================================
+
+void test_hal_intrin_uint32()
+{
+    DUMP_ENTRY(v_uint32);
+    // typedef v_uint32 R;
+    TheTest<v_uint32>()
+        .test_loadstore()
+        .test_min_max()
+        ;
+}
+
+void test_hal_intrin_int32()
+{
+    DUMP_ENTRY(v_int32);
+    // typedef v_int32 R;
+    TheTest<v_int32>()
+        .test_loadstore()
+        .test_min_max()
+        ;
+}
+
+//============= 64-bit integer =====================================================================
+
+void test_hal_intrin_uint64()
+{
+    DUMP_ENTRY(v_uint64);
+    // typedef v_uint64 R;
+    TheTest<v_uint64>()
+        .test_loadstore()
+        ;
+}
+
+void test_hal_intrin_int64()
+{
+    DUMP_ENTRY(v_int64);
+    // typedef v_int64 R;
+    TheTest<v_int64>()
+        .test_loadstore()
+        ;
+}
+
+//============= Floating point =====================================================================
+void test_hal_intrin_float32()
+{
+    DUMP_ENTRY(v_float32);
+    // typedef v_float32 R;
+    TheTest<v_float32>()
+        .test_loadstore()
+        .test_min_max()
+        ;
+}
+
+void test_hal_intrin_float64()
+{
+    DUMP_ENTRY(v_float64);
+#if CV_SIMD_64F
+    // typedef v_float64 R;
+    TheTest<v_float64>()
+        .test_loadstore()
+        .test_min_max()
+        ;
+
+#endif
+}
+
+#endif
+
 
 /*#if defined(CV_CPU_DISPATCH_MODE_FP16) && CV_CPU_DISPATCH_MODE == FP16
 void test_hal_intrin_float16()
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
index c9d329b2ff..6171bff802 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
@@ -6,6 +6,8 @@
 
 #if !defined(GAPI_STANDALONE)
 
+#include <opencv2/core/hal/intrin.hpp>
+#if CV_SIMD
 #include "gfluidcore_func.hpp"
 #include "gfluidcore_func.simd.hpp"
 
@@ -14,7 +16,6 @@
 #include "gfluidutils.hpp"
 
 #include <opencv2/core/cvdef.h>
-#include <opencv2/core/hal/intrin.hpp>
 
 #include <cmath>
 #include <cstdlib>
@@ -394,5 +395,5 @@ CONVERTTO_SCALED_SIMD(float, float)
 } // namespace fluid
 } // namespace gapi
 } // namespace cv
-
+#endif // CV_SIMD
 #endif // !defined(GAPI_STANDALONE)
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
index 81aa098b64..aec03c0b50 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
@@ -6,7 +6,7 @@
 
 #pragma once
 
-#if !defined(GAPI_STANDALONE)
+#if !defined(GAPI_STANDALONE) && CV_SIMD
 
 #include <opencv2/core.hpp>
 
diff --git a/modules/imgproc/src/color_lab.cpp b/modules/imgproc/src/color_lab.cpp
index c5ebe30fe1..3b18944a0c 100644
--- a/modules/imgproc/src/color_lab.cpp
+++ b/modules/imgproc/src/color_lab.cpp
@@ -3612,6 +3612,7 @@ struct Luv2RGBinteger
         }
     }
 
+#if CV_SIMD
     inline void processLuvToXYZ(const v_uint8& lv, const v_uint8& uv, const v_uint8& vv,
                                 v_int32 (&x)[4], v_int32 (&y)[4], v_int32 (&z)[4]) const
     {
@@ -3717,6 +3718,7 @@ struct Luv2RGBinteger
             z[k] = v_max(zero, v_min(base2, z[k]));
         }
     }
+#endif
 
     void operator()(const uchar* src, uchar* dst, int n) const
     {
diff --git a/modules/imgproc/src/color_yuv.simd.hpp b/modules/imgproc/src/color_yuv.simd.hpp
index 196a03b995..b5f73d873a 100644
--- a/modules/imgproc/src/color_yuv.simd.hpp
+++ b/modules/imgproc/src/color_yuv.simd.hpp
@@ -1038,6 +1038,7 @@ static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, i
     buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * uu;
 }
 
+#if CV_SIMD
 static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v,
                              v_int32 (&ruv)[4],
                              v_int32 (&guv)[4],
@@ -1067,6 +1068,7 @@ static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v,
         buv[k] = vshift + ub * uu[k];
     }
 }
+#endif
 
 static inline void yRGBuvToRGBA(const uchar vy, const int ruv, const int guv, const int buv,
                                 uchar& r, uchar& g, uchar& b, uchar& a)
@@ -1079,6 +1081,7 @@ static inline void yRGBuvToRGBA(const uchar vy, const int ruv, const int guv, co
     a = uchar(0xff);
 }
 
+#if CV_SIMD
 static inline void yRGBuvToRGBA(const v_uint8& vy,
                                 const v_int32 (&ruv)[4],
                                 const v_int32 (&guv)[4],
@@ -1117,6 +1120,7 @@ static inline void yRGBuvToRGBA(const v_uint8& vy,
     gg = v_pack_u(g0, g1);
     bb = v_pack_u(b0, b1);
 }
+#endif
 
 template<int bIdx, int dcn, bool is420>
 static inline void cvtYuv42xxp2RGB8(const uchar u, const uchar v,
@@ -1426,6 +1430,7 @@ static inline uchar rgbToY42x(uchar r, uchar g, uchar b)
     return saturate_cast<uchar>(yy >> ITUR_BT_601_SHIFT);
 }
 
+#if CV_SIMD
 static inline v_uint8 rgbToY42x(const v_uint8& r, const v_uint8& g, const v_uint8& b)
 {
     const int shifted16 = (16 << ITUR_BT_601_SHIFT);
@@ -1455,6 +1460,7 @@ static inline v_uint8 rgbToY42x(const v_uint8& r, const v_uint8& g, const v_uint
 
     return v_pack(y0, y1);
 }
+#endif
 
 static inline void rgbToUV42x(uchar r, uchar g, uchar b, uchar& u, uchar& v)
 {
@@ -1467,6 +1473,7 @@ static inline void rgbToUV42x(uchar r, uchar g, uchar b, uchar& u, uchar& v)
     v = saturate_cast<uchar>(vv >> ITUR_BT_601_SHIFT);
 }
 
+#if CV_SIMD
 static inline void rgbToUV42x(const v_uint8& r0, const v_uint8& r1, const v_uint8& g0, const v_uint8& g1,
                               const v_uint8& b0, const v_uint8& b1, v_uint8& u, v_uint8& v)
 {
@@ -1514,6 +1521,7 @@ static inline void rgbToUV42x(const v_uint8& r0, const v_uint8& r1, const v_uint
     u = v_pack_u(u0, u1);
     v = v_pack_u(v0, v1);
 }
+#endif
 
 
 struct RGB8toYUV420pInvoker: public ParallelLoopBody
diff --git a/modules/imgproc/src/median_blur.simd.hpp b/modules/imgproc/src/median_blur.simd.hpp
index 068b7d638f..90f0b20330 100644
--- a/modules/imgproc/src/median_blur.simd.hpp
+++ b/modules/imgproc/src/median_blur.simd.hpp
@@ -497,7 +497,6 @@ struct MinMax8u
 {
     typedef uchar value_type;
     typedef int arg_type;
-    enum { SIZE = 1 };
     arg_type load(const uchar* ptr) { return *ptr; }
     void store(uchar* ptr, arg_type val) { *ptr = (uchar)val; }
     void operator()(arg_type& a, arg_type& b) const
@@ -511,7 +510,6 @@ struct MinMax16u
 {
     typedef ushort value_type;
     typedef int arg_type;
-    enum { SIZE = 1 };
     arg_type load(const ushort* ptr) { return *ptr; }
     void store(ushort* ptr, arg_type val) { *ptr = (ushort)val; }
     void operator()(arg_type& a, arg_type& b) const
@@ -526,7 +524,6 @@ struct MinMax16s
 {
     typedef short value_type;
     typedef int arg_type;
-    enum { SIZE = 1 };
     arg_type load(const short* ptr) { return *ptr; }
     void store(short* ptr, arg_type val) { *ptr = (short)val; }
     void operator()(arg_type& a, arg_type& b) const
@@ -541,7 +538,6 @@ struct MinMax32f
 {
     typedef float value_type;
     typedef float arg_type;
-    enum { SIZE = 1 };
     arg_type load(const float* ptr) { return *ptr; }
     void store(float* ptr, arg_type val) { *ptr = val; }
     void operator()(arg_type& a, arg_type& b) const
@@ -552,14 +548,13 @@ struct MinMax32f
     }
 };
 
-#if CV_SIMD
+#if CV_SIMD || CV_SIMD_SCALABLE
 
 struct MinMaxVec8u
 {
     typedef uchar value_type;
-    typedef v_uint8x16 arg_type;
-    enum { SIZE = v_uint8x16::nlanes };
-    arg_type load(const uchar* ptr) { return v_load(ptr); }
+    typedef v_uint8 arg_type;
+    arg_type load(const uchar* ptr) { return vx_load(ptr); }
     void store(uchar* ptr, const arg_type &val) { v_store(ptr, val); }
     void operator()(arg_type& a, arg_type& b) const
     {
@@ -567,27 +562,14 @@ struct MinMaxVec8u
         a = v_min(a, b);
         b = v_max(b, t);
     }
-#if CV_SIMD_WIDTH > 16
-    typedef v_uint8 warg_type;
-    enum { WSIZE = v_uint8::nlanes };
-    warg_type wload(const uchar* ptr) { return vx_load(ptr); }
-    void store(uchar* ptr, const warg_type &val) { v_store(ptr, val); }
-    void operator()(warg_type& a, warg_type& b) const
-    {
-        warg_type t = a;
-        a = v_min(a, b);
-        b = v_max(b, t);
-    }
-#endif
 };
 
 
 struct MinMaxVec16u
 {
     typedef ushort value_type;
-    typedef v_uint16x8 arg_type;
-    enum { SIZE = v_uint16x8::nlanes };
-    arg_type load(const ushort* ptr) { return v_load(ptr); }
+    typedef v_uint16 arg_type;
+    arg_type load(const ushort* ptr) { return vx_load(ptr); }
     void store(ushort* ptr, const arg_type &val) { v_store(ptr, val); }
     void operator()(arg_type& a, arg_type& b) const
     {
@@ -595,27 +577,14 @@ struct MinMaxVec16u
         a = v_min(a, b);
         b = v_max(b, t);
     }
-#if CV_SIMD_WIDTH > 16
-    typedef v_uint16 warg_type;
-    enum { WSIZE = v_uint16::nlanes };
-    warg_type wload(const ushort* ptr) { return vx_load(ptr); }
-    void store(ushort* ptr, const warg_type &val) { v_store(ptr, val); }
-    void operator()(warg_type& a, warg_type& b) const
-    {
-        warg_type t = a;
-        a = v_min(a, b);
-        b = v_max(b, t);
-    }
-#endif
 };
 
 
 struct MinMaxVec16s
 {
     typedef short value_type;
-    typedef v_int16x8 arg_type;
-    enum { SIZE = v_int16x8::nlanes };
-    arg_type load(const short* ptr) { return v_load(ptr); }
+    typedef v_int16 arg_type;
+    arg_type load(const short* ptr) { return vx_load(ptr); }
     void store(short* ptr, const arg_type &val) { v_store(ptr, val); }
     void operator()(arg_type& a, arg_type& b) const
     {
@@ -623,27 +592,14 @@ struct MinMaxVec16s
         a = v_min(a, b);
         b = v_max(b, t);
     }
-#if CV_SIMD_WIDTH > 16
-    typedef v_int16 warg_type;
-    enum { WSIZE = v_int16::nlanes };
-    warg_type wload(const short* ptr) { return vx_load(ptr); }
-    void store(short* ptr, const warg_type &val) { v_store(ptr, val); }
-    void operator()(warg_type& a, warg_type& b) const
-    {
-        warg_type t = a;
-        a = v_min(a, b);
-        b = v_max(b, t);
-    }
-#endif
 };
 
 
 struct MinMaxVec32f
 {
     typedef float value_type;
-    typedef v_float32x4 arg_type;
-    enum { SIZE = v_float32x4::nlanes };
-    arg_type load(const float* ptr) { return v_load(ptr); }
+    typedef v_float32 arg_type;
+    arg_type load(const float* ptr) { return vx_load(ptr); }
     void store(float* ptr, const arg_type &val) { v_store(ptr, val); }
     void operator()(arg_type& a, arg_type& b) const
     {
@@ -651,18 +607,6 @@ struct MinMaxVec32f
         a = v_min(a, b);
         b = v_max(b, t);
     }
-#if CV_SIMD_WIDTH > 16
-    typedef v_float32 warg_type;
-    enum { WSIZE = v_float32::nlanes };
-    warg_type wload(const float* ptr) { return vx_load(ptr); }
-    void store(float* ptr, const warg_type &val) { v_store(ptr, val); }
-    void operator()(warg_type& a, warg_type& b) const
-    {
-        warg_type t = a;
-        a = v_min(a, b);
-        b = v_max(b, t);
-    }
-#endif
 };
 
 #else
@@ -683,9 +627,6 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
     typedef typename Op::value_type T;
     typedef typename Op::arg_type WT;
     typedef typename VecOp::arg_type VT;
-#if CV_SIMD_WIDTH > 16
-    typedef typename VecOp::warg_type WVT;
-#endif
 
     const T* src = _src.ptr<T>();
     T* dst = _dst.ptr<T>();
@@ -747,22 +688,12 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
                 if( limit == size.width )
                     break;
 
-#if CV_SIMD_WIDTH > 16
-                for( ; j <= size.width - VecOp::WSIZE - cn; j += VecOp::WSIZE )
-                {
-                    WVT p0 = vop.wload(row0+j-cn), p1 = vop.wload(row0+j), p2 = vop.wload(row0+j+cn);
-                    WVT p3 = vop.wload(row1+j-cn), p4 = vop.wload(row1+j), p5 = vop.wload(row1+j+cn);
-                    WVT p6 = vop.wload(row2+j-cn), p7 = vop.wload(row2+j), p8 = vop.wload(row2+j+cn);
-
-                    vop(p1, p2); vop(p4, p5); vop(p7, p8); vop(p0, p1);
-                    vop(p3, p4); vop(p6, p7); vop(p1, p2); vop(p4, p5);
-                    vop(p7, p8); vop(p0, p3); vop(p5, p8); vop(p4, p7);
-                    vop(p3, p6); vop(p1, p4); vop(p2, p5); vop(p4, p7);
-                    vop(p4, p2); vop(p6, p4); vop(p4, p2);
-                    vop.store(dst+j, p4);
-                }
+#if CV_SIMD || CV_SIMD_SCALABLE
+                int nlanes = VTraits<typename VecOp::arg_type>::vlanes();
+#else
+                int nlanes = 1;
 #endif
-                for( ; j <= size.width - VecOp::SIZE - cn; j += VecOp::SIZE )
+                for( ; j <= size.width - nlanes - cn; j += nlanes )
                 {
                     VT p0 = vop.load(row0+j-cn), p1 = vop.load(row0+j), p2 = vop.load(row0+j+cn);
                     VT p3 = vop.load(row1+j-cn), p4 = vop.load(row1+j), p5 = vop.load(row1+j+cn);
@@ -862,79 +793,43 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
                 if( limit == size.width )
                     break;
 
-#if CV_SIMD_WIDTH > 16
-                for( ; j <= size.width - VecOp::WSIZE - cn*2; j += VecOp::WSIZE )
-                {
-                    WVT p[25];
-                    for( k = 0; k < 5; k++ )
-                    {
-                        const T* rowk = row[k];
-                        p[k*5] = vop.wload(rowk+j-cn*2); p[k*5+1] = vop.wload(rowk+j-cn);
-                        p[k*5+2] = vop.wload(rowk+j); p[k*5+3] = vop.wload(rowk+j+cn);
-                        p[k*5+4] = vop.wload(rowk+j+cn*2);
-                    }
-
-                    vop(p[1], p[2]); vop(p[0], p[1]); vop(p[1], p[2]); vop(p[4], p[5]); vop(p[3], p[4]);
-                    vop(p[4], p[5]); vop(p[0], p[3]); vop(p[2], p[5]); vop(p[2], p[3]); vop(p[1], p[4]);
-                    vop(p[1], p[2]); vop(p[3], p[4]); vop(p[7], p[8]); vop(p[6], p[7]); vop(p[7], p[8]);
-                    vop(p[10], p[11]); vop(p[9], p[10]); vop(p[10], p[11]); vop(p[6], p[9]); vop(p[8], p[11]);
-                    vop(p[8], p[9]); vop(p[7], p[10]); vop(p[7], p[8]); vop(p[9], p[10]); vop(p[0], p[6]);
-                    vop(p[4], p[10]); vop(p[4], p[6]); vop(p[2], p[8]); vop(p[2], p[4]); vop(p[6], p[8]);
-                    vop(p[1], p[7]); vop(p[5], p[11]); vop(p[5], p[7]); vop(p[3], p[9]); vop(p[3], p[5]);
-                    vop(p[7], p[9]); vop(p[1], p[2]); vop(p[3], p[4]); vop(p[5], p[6]); vop(p[7], p[8]);
-                    vop(p[9], p[10]); vop(p[13], p[14]); vop(p[12], p[13]); vop(p[13], p[14]); vop(p[16], p[17]);
-                    vop(p[15], p[16]); vop(p[16], p[17]); vop(p[12], p[15]); vop(p[14], p[17]); vop(p[14], p[15]);
-                    vop(p[13], p[16]); vop(p[13], p[14]); vop(p[15], p[16]); vop(p[19], p[20]); vop(p[18], p[19]);
-                    vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[21], p[23]); vop(p[22], p[24]);
-                    vop(p[22], p[23]); vop(p[18], p[21]); vop(p[20], p[23]); vop(p[20], p[21]); vop(p[19], p[22]);
-                    vop(p[22], p[24]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[12], p[18]);
-                    vop(p[16], p[22]); vop(p[16], p[18]); vop(p[14], p[20]); vop(p[20], p[24]); vop(p[14], p[16]);
-                    vop(p[18], p[20]); vop(p[22], p[24]); vop(p[13], p[19]); vop(p[17], p[23]); vop(p[17], p[19]);
-                    vop(p[15], p[21]); vop(p[15], p[17]); vop(p[19], p[21]); vop(p[13], p[14]); vop(p[15], p[16]);
-                    vop(p[17], p[18]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[0], p[12]);
-                    vop(p[8], p[20]); vop(p[8], p[12]); vop(p[4], p[16]); vop(p[16], p[24]); vop(p[12], p[16]);
-                    vop(p[2], p[14]); vop(p[10], p[22]); vop(p[10], p[14]); vop(p[6], p[18]); vop(p[6], p[10]);
-                    vop(p[10], p[12]); vop(p[1], p[13]); vop(p[9], p[21]); vop(p[9], p[13]); vop(p[5], p[17]);
-                    vop(p[13], p[17]); vop(p[3], p[15]); vop(p[11], p[23]); vop(p[11], p[15]); vop(p[7], p[19]);
-                    vop(p[7], p[11]); vop(p[11], p[13]); vop(p[11], p[12]);
-                    vop.store(dst+j, p[12]);
-                }
+#if CV_SIMD || CV_SIMD_SCALABLE
+                int nlanes = VTraits<typename VecOp::arg_type>::vlanes();
+#else
+                int nlanes = 1;
 #endif
-                for( ; j <= size.width - VecOp::SIZE - cn*2; j += VecOp::SIZE )
+                for( ; j <= size.width - nlanes - cn*2; j += nlanes )
                 {
-                    VT p[25];
-                    for( k = 0; k < 5; k++ )
-                    {
-                        const T* rowk = row[k];
-                        p[k*5] = vop.load(rowk+j-cn*2); p[k*5+1] = vop.load(rowk+j-cn);
-                        p[k*5+2] = vop.load(rowk+j); p[k*5+3] = vop.load(rowk+j+cn);
-                        p[k*5+4] = vop.load(rowk+j+cn*2);
-                    }
-
-                    vop(p[1], p[2]); vop(p[0], p[1]); vop(p[1], p[2]); vop(p[4], p[5]); vop(p[3], p[4]);
-                    vop(p[4], p[5]); vop(p[0], p[3]); vop(p[2], p[5]); vop(p[2], p[3]); vop(p[1], p[4]);
-                    vop(p[1], p[2]); vop(p[3], p[4]); vop(p[7], p[8]); vop(p[6], p[7]); vop(p[7], p[8]);
-                    vop(p[10], p[11]); vop(p[9], p[10]); vop(p[10], p[11]); vop(p[6], p[9]); vop(p[8], p[11]);
-                    vop(p[8], p[9]); vop(p[7], p[10]); vop(p[7], p[8]); vop(p[9], p[10]); vop(p[0], p[6]);
-                    vop(p[4], p[10]); vop(p[4], p[6]); vop(p[2], p[8]); vop(p[2], p[4]); vop(p[6], p[8]);
-                    vop(p[1], p[7]); vop(p[5], p[11]); vop(p[5], p[7]); vop(p[3], p[9]); vop(p[3], p[5]);
-                    vop(p[7], p[9]); vop(p[1], p[2]); vop(p[3], p[4]); vop(p[5], p[6]); vop(p[7], p[8]);
-                    vop(p[9], p[10]); vop(p[13], p[14]); vop(p[12], p[13]); vop(p[13], p[14]); vop(p[16], p[17]);
-                    vop(p[15], p[16]); vop(p[16], p[17]); vop(p[12], p[15]); vop(p[14], p[17]); vop(p[14], p[15]);
-                    vop(p[13], p[16]); vop(p[13], p[14]); vop(p[15], p[16]); vop(p[19], p[20]); vop(p[18], p[19]);
-                    vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[21], p[23]); vop(p[22], p[24]);
-                    vop(p[22], p[23]); vop(p[18], p[21]); vop(p[20], p[23]); vop(p[20], p[21]); vop(p[19], p[22]);
-                    vop(p[22], p[24]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[12], p[18]);
-                    vop(p[16], p[22]); vop(p[16], p[18]); vop(p[14], p[20]); vop(p[20], p[24]); vop(p[14], p[16]);
-                    vop(p[18], p[20]); vop(p[22], p[24]); vop(p[13], p[19]); vop(p[17], p[23]); vop(p[17], p[19]);
-                    vop(p[15], p[21]); vop(p[15], p[17]); vop(p[19], p[21]); vop(p[13], p[14]); vop(p[15], p[16]);
-                    vop(p[17], p[18]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[0], p[12]);
-                    vop(p[8], p[20]); vop(p[8], p[12]); vop(p[4], p[16]); vop(p[16], p[24]); vop(p[12], p[16]);
-                    vop(p[2], p[14]); vop(p[10], p[22]); vop(p[10], p[14]); vop(p[6], p[18]); vop(p[6], p[10]);
-                    vop(p[10], p[12]); vop(p[1], p[13]); vop(p[9], p[21]); vop(p[9], p[13]); vop(p[5], p[17]);
-                    vop(p[13], p[17]); vop(p[3], p[15]); vop(p[11], p[23]); vop(p[11], p[15]); vop(p[7], p[19]);
-                    vop(p[7], p[11]); vop(p[11], p[13]); vop(p[11], p[12]);
-                    vop.store(dst+j, p[12]);
+                    VT p0 = vop.load(row[0]+j-cn*2), p5 = vop.load(row[1]+j-cn*2), p10 = vop.load(row[2]+j-cn*2), p15 = vop.load(row[3]+j-cn*2), p20 = vop.load(row[4]+j-cn*2);
+                    VT p1 = vop.load(row[0]+j-cn*1), p6 = vop.load(row[1]+j-cn*1), p11 = vop.load(row[2]+j-cn*1), p16 = vop.load(row[3]+j-cn*1), p21 = vop.load(row[4]+j-cn*1);
+                    VT p2 = vop.load(row[0]+j-cn*0), p7 = vop.load(row[1]+j-cn*0), p12 = vop.load(row[2]+j-cn*0), p17 = vop.load(row[3]+j-cn*0), p22 = vop.load(row[4]+j-cn*0);
+                    VT p3 = vop.load(row[0]+j+cn*1), p8 = vop.load(row[1]+j+cn*1), p13 = vop.load(row[2]+j+cn*1), p18 = vop.load(row[3]+j+cn*1), p23 = vop.load(row[4]+j+cn*1);
+                    VT p4 = vop.load(row[0]+j+cn*2), p9 = vop.load(row[1]+j+cn*2), p14 = vop.load(row[2]+j+cn*2), p19 = vop.load(row[3]+j+cn*2), p24 = vop.load(row[4]+j+cn*2);
+
+                    vop(p1, p2); vop(p0, p1); vop(p1, p2); vop(p4, p5); vop(p3, p4);
+                    vop(p4, p5); vop(p0, p3); vop(p2, p5); vop(p2, p3); vop(p1, p4);
+                    vop(p1, p2); vop(p3, p4); vop(p7, p8); vop(p6, p7); vop(p7, p8);
+                    vop(p10, p11); vop(p9, p10); vop(p10, p11); vop(p6, p9); vop(p8, p11);
+                    vop(p8, p9); vop(p7, p10); vop(p7, p8); vop(p9, p10); vop(p0, p6);
+                    vop(p4, p10); vop(p4, p6); vop(p2, p8); vop(p2, p4); vop(p6, p8);
+                    vop(p1, p7); vop(p5, p11); vop(p5, p7); vop(p3, p9); vop(p3, p5);
+                    vop(p7, p9); vop(p1, p2); vop(p3, p4); vop(p5, p6); vop(p7, p8);
+                    vop(p9, p10); vop(p13, p14); vop(p12, p13); vop(p13, p14); vop(p16, p17);
+                    vop(p15, p16); vop(p16, p17); vop(p12, p15); vop(p14, p17); vop(p14, p15);
+                    vop(p13, p16); vop(p13, p14); vop(p15, p16); vop(p19, p20); vop(p18, p19);
+                    vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p21, p23); vop(p22, p24);
+                    vop(p22, p23); vop(p18, p21); vop(p20, p23); vop(p20, p21); vop(p19, p22);
+                    vop(p22, p24); vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p12, p18);
+                    vop(p16, p22); vop(p16, p18); vop(p14, p20); vop(p20, p24); vop(p14, p16);
+                    vop(p18, p20); vop(p22, p24); vop(p13, p19); vop(p17, p23); vop(p17, p19);
+                    vop(p15, p21); vop(p15, p17); vop(p19, p21); vop(p13, p14); vop(p15, p16);
+                    vop(p17, p18); vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p0, p12);
+                    vop(p8, p20); vop(p8, p12); vop(p4, p16); vop(p16, p24); vop(p12, p16);
+                    vop(p2, p14); vop(p10, p22); vop(p10, p14); vop(p6, p18); vop(p6, p10);
+                    vop(p10, p12); vop(p1, p13); vop(p9, p21); vop(p9, p13); vop(p5, p17);
+                    vop(p13, p17); vop(p3, p15); vop(p11, p23); vop(p11, p15); vop(p7, p19);
+                    vop(p7, p11); vop(p11, p13); vop(p11, p12);
+                    vop.store(dst+j, p12);
                 }
 
                 limit = size.width;
diff --git a/platforms/linux/riscv64-clang.toolchain.cmake b/platforms/linux/riscv64-clang.toolchain.cmake
index 62d9e293d2..2efd67ad93 100644
--- a/platforms/linux/riscv64-clang.toolchain.cmake
+++ b/platforms/linux/riscv64-clang.toolchain.cmake
@@ -22,6 +22,10 @@ set(CMAKE_CXX_FLAGS "-march=rv64gcv --gcc-toolchain=${RISCV_GCC_INSTALL_ROOT} -w
 
 set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}  -O2")
 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}  -O2")
+OPTION(RISCV_RVV_SCALABLE "Use scalable RVV API on RISC-V" ON) # Enabled by default
+IF(RISCV_RVV_SCALABLE)
+    ADD_DEFINITIONS(-DCV_RVV_SCALABLE)
+ENDIF()
 
 set(CMAKE_FIND_ROOT_PATH ${CMAKE_SYSROOT})
 set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)

From b3269b08a19d1da49cf63754d92bdbd39e22c568 Mon Sep 17 00:00:00 2001
From: Tomoaki Teshima <tomoaki.teshima@gmail.com>
Date: Wed, 20 Jul 2022 19:25:39 +0900
Subject: [PATCH 140/178] neon: add dotprod dispatch implementation      * read
 vector at runtime      * add enum

---
 cmake/OpenCVCompilerOptimizations.cmake       |  7 +-
 cmake/checks/cpu_dotprod.cpp                  | 24 +++++++
 modules/core/CMakeLists.txt                   |  2 +-
 .../include/opencv2/core/cv_cpu_dispatch.h    |  4 ++
 .../core/include/opencv2/core/cv_cpu_helper.h | 21 ++++++
 modules/core/include/opencv2/core/cvdef.h     |  2 +
 .../include/opencv2/core/hal/intrin_neon.hpp  | 65 +++++++++----------
 modules/core/src/system.cpp                   | 19 ++++++
 8 files changed, 106 insertions(+), 38 deletions(-)
 create mode 100644 cmake/checks/cpu_dotprod.cpp

diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake
index 058443821a..1e0e812afc 100644
--- a/cmake/OpenCVCompilerOptimizations.cmake
+++ b/cmake/OpenCVCompilerOptimizations.cmake
@@ -46,7 +46,7 @@
 
 set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3;AVX_512F")
 list(APPEND CPU_ALL_OPTIMIZATIONS "AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SKX;AVX512_CNL;AVX512_CLX;AVX512_ICL")
-list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16)
+list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16 NEON_DOTPROD)
 list(APPEND CPU_ALL_OPTIMIZATIONS MSA)
 list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3)
 list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS)
@@ -326,6 +326,7 @@ if(X86 OR X86_64)
 elseif(ARM OR AARCH64)
   ocv_update(CPU_NEON_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon.cpp")
   ocv_update(CPU_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp")
+  ocv_update(CPU_NEON_DOTPROD_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_dotprod.cpp")
   if(NOT AARCH64)
     ocv_update(CPU_KNOWN_OPTIMIZATIONS "VFPV3;NEON;FP16")
     if(NOT MSVC)
@@ -337,9 +338,11 @@ elseif(ARM OR AARCH64)
     endif()
     ocv_update(CPU_FP16_IMPLIES "NEON")
   else()
-    ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16")
+    ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16;NEON_DOTPROD")
     ocv_update(CPU_NEON_FLAGS_ON "")
     ocv_update(CPU_FP16_IMPLIES "NEON")
+    ocv_update(CPU_NEON_DOTPROD_FLAGS_ON "-march=armv8.2-a+dotprod")
+    ocv_update(CPU_NEON_DOTPROD_IMPLIES "NEON")
     set(CPU_BASELINE "NEON;FP16" CACHE STRING "${HELP_CPU_BASELINE}")
   endif()
 elseif(MIPS)
diff --git a/cmake/checks/cpu_dotprod.cpp b/cmake/checks/cpu_dotprod.cpp
new file mode 100644
index 0000000000..4f39c50659
--- /dev/null
+++ b/cmake/checks/cpu_dotprod.cpp
@@ -0,0 +1,24 @@
+#include <stdio.h>
+
+#if defined __GNUC__ && (defined __arm__ || defined __aarch64__)
+#include "arm_neon.h"
+int test()
+{
+    const unsigned int src[] = { 0, 0, 0, 0 };
+    unsigned int dst[4];
+    uint32x4_t v_src = *(uint32x4_t*)src;
+    uint8x16_t v_m0 = *(uint8x16_t*)src;
+    uint8x16_t v_m1 = *(uint8x16_t*)src;
+    uint32x4_t v_dst = vdotq_u32(v_src, v_m0, v_m1);
+    *(uint32x4_t*)dst = v_dst;
+    return (int)dst[0];
+}
+#else
+#error "DOTPROD is not supported"
+#endif
+
+int main()
+{
+  printf("%d\n", test());
+  return 0;
+}
diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index 568a8afef1..0163cf570c 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -6,7 +6,7 @@ ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3)
 ocv_add_dispatched_file(convert SSE2 AVX2 VSX3)
 ocv_add_dispatched_file(convert_scale SSE2 AVX2)
 ocv_add_dispatched_file(count_non_zero SSE2 AVX2)
-ocv_add_dispatched_file(matmul SSE2 SSE4_1 AVX2 AVX512_SKX)
+ocv_add_dispatched_file(matmul SSE2 SSE4_1 AVX2 AVX512_SKX NEON_DOTPROD)
 ocv_add_dispatched_file(mean SSE2 AVX2)
 ocv_add_dispatched_file(merge SSE2 AVX2)
 ocv_add_dispatched_file(split SSE2 AVX2)
diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
index e92798d6c9..758f905a64 100644
--- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h
+++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
@@ -79,6 +79,10 @@
 #  endif
 #  define CV_FP16 1
 #endif
+#ifdef CV_CPU_COMPILE_NEON_DOTPROD
+#  include <arm_neon.h>
+#  define CV_NEON_DOT 1
+#endif
 #ifdef CV_CPU_COMPILE_AVX2
 #  include <immintrin.h>
 #  define CV_AVX2 1
diff --git a/modules/core/include/opencv2/core/cv_cpu_helper.h b/modules/core/include/opencv2/core/cv_cpu_helper.h
index aaa89ed415..25cf3477c3 100644
--- a/modules/core/include/opencv2/core/cv_cpu_helper.h
+++ b/modules/core/include/opencv2/core/cv_cpu_helper.h
@@ -420,6 +420,27 @@
 #endif
 #define __CV_CPU_DISPATCH_CHAIN_NEON(fn, args, mode, ...)  CV_CPU_CALL_NEON(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
 
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON_DOTPROD
+#  define CV_TRY_NEON_DOTPROD 1
+#  define CV_CPU_FORCE_NEON_DOTPROD 1
+#  define CV_CPU_HAS_SUPPORT_NEON_DOTPROD 1
+#  define CV_CPU_CALL_NEON_DOTPROD(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_NEON_DOTPROD_(fn, args) return (opt_NEON_DOTPROD::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON_DOTPROD
+#  define CV_TRY_NEON_DOTPROD 1
+#  define CV_CPU_FORCE_NEON_DOTPROD 0
+#  define CV_CPU_HAS_SUPPORT_NEON_DOTPROD (cv::checkHardwareSupport(CV_CPU_NEON_DOTPROD))
+#  define CV_CPU_CALL_NEON_DOTPROD(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_DOTPROD) return (opt_NEON_DOTPROD::fn args)
+#  define CV_CPU_CALL_NEON_DOTPROD_(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_DOTPROD) return (opt_NEON_DOTPROD::fn args)
+#else
+#  define CV_TRY_NEON_DOTPROD 0
+#  define CV_CPU_FORCE_NEON_DOTPROD 0
+#  define CV_CPU_HAS_SUPPORT_NEON_DOTPROD 0
+#  define CV_CPU_CALL_NEON_DOTPROD(fn, args)
+#  define CV_CPU_CALL_NEON_DOTPROD_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_NEON_DOTPROD(fn, args, mode, ...)  CV_CPU_CALL_NEON_DOTPROD(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_MSA
 #  define CV_TRY_MSA 1
 #  define CV_CPU_FORCE_MSA 1
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h
index c2cdcad075..9102316968 100644
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -282,6 +282,7 @@ namespace cv {
 #define CV_CPU_AVX_5124FMAPS    27
 
 #define CV_CPU_NEON             100
+#define CV_CPU_NEON_DOTPROD     101
 
 #define CV_CPU_MSA              150
 
@@ -334,6 +335,7 @@ enum CpuFeatures {
     CPU_AVX_5124FMAPS   = 27,
 
     CPU_NEON            = 100,
+    CPU_NEON_DOTPROD    = 101,
 
     CPU_MSA             = 150,
 
diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
index e17972a3fc..5792694a40 100644
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -78,8 +78,6 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
 #define CV_NEON_AARCH64 0
 #endif
 
-// TODO
-#define CV_NEON_DOT 0
 
 //////////// Utils ////////////
 
@@ -667,11 +665,22 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64
 }
 
 // 8 >> 32
+#ifdef CV_NEON_DOT
+#define OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(_Tpvec1, _Tpvec2, suffix) \
+inline _Tpvec1 v_dotprod_expand(const _Tpvec2& a, const _Tpvec2& b)   \
+{ \
+    return _Tpvec1(vdotq_##suffix(vdupq_n_##suffix(0), a.val, b.val));\
+} \
+inline _Tpvec1 v_dotprod_expand(const _Tpvec2& a, const _Tpvec2& b, const _Tpvec1& c) \
+{ \
+    return _Tpvec1(vdotq_##suffix(c.val, a.val, b.val)); \
+}
+
+OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(v_uint32x4, v_uint8x16, u32)
+OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(v_int32x4,  v_int8x16,  s32)
+#else
 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
 {
-#if CV_NEON_DOT
-    return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
-#else
     const uint8x16_t zero   = vreinterpretq_u8_u32(vdupq_n_u32(0));
     const uint8x16_t mask   = vreinterpretq_u8_u32(vdupq_n_u32(0x00FF00FF));
     const uint16x8_t zero32 = vreinterpretq_u16_u32(vdupq_n_u32(0));
@@ -687,23 +696,15 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
     uint32x4_t s1 = vaddq_u32(vshrq_n_u32(vreinterpretq_u32_u16(even), 16),
                               vshrq_n_u32(vreinterpretq_u32_u16(odd),  16));
     return v_uint32x4(vaddq_u32(s0, s1));
-#endif
 }
 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
                                    const v_uint32x4& c)
 {
-#if CV_NEON_DOT
-    return v_uint32x4(vdotq_u32(c.val, a.val, b.val));
-#else
     return v_dotprod_expand(a, b) + c;
-#endif
 }
 
 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
 {
-#if CV_NEON_DOT
-    return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
-#else
     int16x8_t p0  = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
     int16x8_t p1  = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
     int16x8_t uzp1, uzp2;
@@ -712,18 +713,13 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
     int16x4_t uzpl1, uzpl2;
     _v128_unzip(vget_low_s16(sum), vget_high_s16(sum), uzpl1, uzpl2);
     return v_int32x4(vaddl_s16(uzpl1, uzpl2));
-#endif
 }
 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
                                   const v_int32x4& c)
 {
-#if CV_NEON_DOT
-    return v_int32x4(vdotq_s32(c.val, a.val, b.val));
-#else
     return v_dotprod_expand(a, b) + c;
-#endif
 }
-
+#endif
 // 16 >> 64
 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
 {
@@ -832,45 +828,44 @@ inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_
 }
 
 // 8 >> 32
+#ifdef CV_NEON_DOT
+#define OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(_Tpvec1, _Tpvec2, suffix) \
+inline _Tpvec1 v_dotprod_expand_fast(const _Tpvec2& a, const _Tpvec2& b)   \
+{ \
+    return v_dotprod_expand(a, b); \
+} \
+inline _Tpvec1 v_dotprod_expand_fast(const _Tpvec2& a, const _Tpvec2& b, const _Tpvec1& c) \
+{ \
+    return v_dotprod_expand(a, b, c); \
+}
+
+OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(v_uint32x4, v_uint8x16, u32)
+OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(v_int32x4,  v_int8x16,  s32)
+#else
 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
 {
-#if CV_NEON_DOT
-    return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
-#else
     uint16x8_t p0 = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
     uint16x8_t p1 = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
     uint32x4_t s0 = vaddl_u16(vget_low_u16(p0), vget_low_u16(p1));
     uint32x4_t s1 = vaddl_u16(vget_high_u16(p0), vget_high_u16(p1));
     return v_uint32x4(vaddq_u32(s0, s1));
-#endif
 }
 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
 {
-#if CV_NEON_DOT
-    return v_uint32x4(vdotq_u32(c.val, a.val, b.val));
-#else
     return v_dotprod_expand_fast(a, b) + c;
-#endif
 }
 
 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
 {
-#if CV_NEON_DOT
-    return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
-#else
     int16x8_t prod = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
     prod = vmlal_s8(prod, vget_high_s8(a.val), vget_high_s8(b.val));
     return v_int32x4(vaddl_s16(vget_low_s16(prod), vget_high_s16(prod)));
-#endif
 }
 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
 {
-#if CV_NEON_DOT
-    return v_int32x4(vdotq_s32(c.val, a.val, b.val));
-#else
     return v_dotprod_expand_fast(a, b) + c;
-#endif
 }
+#endif
 
 // 16 >> 64
 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index adb957908d..3507009196 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -411,6 +411,7 @@ struct HWFeatures
         g_hwFeatureNames[CPU_AVX_5124FMAPS] = "AVX5124FMAPS";
 
         g_hwFeatureNames[CPU_NEON] = "NEON";
+        g_hwFeatureNames[CPU_NEON_DOTPROD] = "NEON_DOTPROD";
 
         g_hwFeatureNames[CPU_VSX] = "VSX";
         g_hwFeatureNames[CPU_VSX3] = "VSX3";
@@ -555,6 +556,24 @@ struct HWFeatures
     #ifdef __aarch64__
         have[CV_CPU_NEON] = true;
         have[CV_CPU_FP16] = true;
+        int cpufile = open("/proc/self/auxv", O_RDONLY);
+
+        if (cpufile >= 0)
+        {
+            Elf64_auxv_t auxv;
+            const size_t size_auxv_t = sizeof(auxv);
+
+            while ((size_t)read(cpufile, &auxv, size_auxv_t) == size_auxv_t)
+            {
+                if (auxv.a_type == AT_HWCAP)
+                {
+                    have[CV_CPU_NEON_DOTPROD] = (auxv.a_un.a_val & (1 << 20)) != 0;
+                    break;
+                }
+            }
+
+            close(cpufile);
+        }
     #elif defined __arm__ && defined __ANDROID__
       #if defined HAVE_CPUFEATURES
         CV_LOG_INFO(NULL, "calling android_getCpuFeatures() ...");

From 98c33c605dfb09bd4b974cc35db87c5077e677b3 Mon Sep 17 00:00:00 2001
From: Zihao Mu <zihaomu@outlook.com>
Date: Wed, 20 Jul 2022 19:02:16 +0800
Subject: [PATCH 141/178] batchsize dynamic is set to index 0.

---
 modules/dnn/src/onnx/onnx_importer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index 7390d0307a..e90581eeb5 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -1303,7 +1303,7 @@ void ONNXImporter::parseReduce(LayerParams& layerParams, const opencv_onnx::Node
     layerParams.set("dim", DictValue::arrayInt(&targetShape[0], targetShape.size()));
 
     // Set batchsize dim as dynamic to be compatible with batch size >= 2.
-    if (targetShape[0] == 1 && targetShape.size() > 1)
+    if (targetShape.size() > 1)
     {
         std::vector<int> dynamicAxes = {0};  // The index of batchsize dim is 0.
         std::vector<int> inputIndices = {0};

From e2bfe0ce76fdc69f0a0bfbd53d8a7f3a6981d1d3 Mon Sep 17 00:00:00 2001
From: HAN Liutong <liutong2020@iscas.ac.cn>
Date: Thu, 21 Jul 2022 03:23:57 +0000
Subject: [PATCH 142/178] Use "#if" instead of "#ifdef" for CV_SIMD128.

---
 .../src/layers/fast_convolution/depthwise_convolution.cpp   | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp b/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp
index c98c3d6549..4eb47c46b2 100644
--- a/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp
+++ b/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp
@@ -19,7 +19,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
                            int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop,
                            int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3)
 {
-#ifdef CV_SIMD128
+#if CV_SIMD128
     v_float32x4 vminval = v_setall_f32(minval), vmaxval = v_setall_f32(maxval);
 
     v_float32x4 w0 = v_setall_f32(
@@ -44,7 +44,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
     int dy0 = 1;
     for (int y0 = 0; y0 < H0; y0 += dy0, outptr += W0 * dy0)
     {
-#ifdef CV_SIMD128
+#if CV_SIMD128
         dy0 = inner_ytop <= y0 && y0 + 3 < inner_ybottom && is3x3 && stride_y == 1 && dilation_y == 1
               ? 3 : 1;
 #endif
@@ -103,7 +103,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
             if (x0 == W0)
                 break;
             x1 = inner_xright;
-#ifdef CV_SIMD128
+#if CV_SIMD128
             if (useSIMD)
             {
                 if (is3x3)

From 3e3b53f815df186ed06472d6e0164743f2b16389 Mon Sep 17 00:00:00 2001
From: HAN Liutong <liutong2020@iscas.ac.cn>
Date: Thu, 21 Jul 2022 08:14:32 +0000
Subject: [PATCH 143/178] Fix compile errors when all SIMD is disabled.

---
 .../core/include/opencv2/core/hal/intrin.hpp  | 232 +-------------
 .../include/opencv2/core/hal/intrin_cpp.hpp   |   1 +
 .../core/include/opencv2/core/saturate.hpp    |   1 +
 modules/core/test/test_intrin_utils.hpp       | 294 +++++++++---------
 4 files changed, 154 insertions(+), 374 deletions(-)

diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp
index 0041030dad..d79c995a27 100644
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@@ -695,237 +695,15 @@ namespace CV__SIMD_NAMESPACE {
     /** @brief SIMD processing state cleanup call */
     inline void vx_cleanup() { VXPREFIX(_cleanup)(); }
 
-#if CV_SIMD
+#if !CV_SIMD_SCALABLE
     // Compatibility layer
-    #define CV_SIMD_SCALABLE 0
-    #define CV_SIMD_SCALABLE_64F 0
 
-    template <class T>
-    struct VTraits;
-#if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
-    template <>
-    struct VTraits<v_uint8>
-    {
-        static inline int vlanes() { return v_uint8::nlanes; }
-        enum { nlanes = 64, max_nlanes = nlanes };
-        using lane_type = uchar;
-    };
-    template <>
-    struct VTraits<v_int8>
-    {
-        static inline int vlanes() { return v_int8::nlanes; }
-        enum { nlanes = 64, max_nlanes = nlanes };
-        using lane_type = schar;
-    };
-    template <>
-    struct VTraits<v_uint16>
-    {
-        static inline int vlanes() { return v_uint16::nlanes; }
-        enum { nlanes = 32, max_nlanes = nlanes };
-        using lane_type = ushort;
-    };
-    template <>
-    struct VTraits<v_int16>
-    {
-        static inline int vlanes() { return v_int16::nlanes;  }
-        enum { nlanes = 32, max_nlanes = nlanes };
-        using lane_type = short;
-    };
-    template <>
-    struct VTraits<v_uint32>
-    {
-        static inline int vlanes() { return v_uint32::nlanes; }
-        enum { nlanes = 16, max_nlanes = nlanes };
-        using lane_type = uint;
-    };
-    template <>
-    struct VTraits<v_int32>
-    {
-        static inline int vlanes() { return v_int32::nlanes; }
-        enum { nlanes = 16, max_nlanes = nlanes };
-        using lane_type = int;
+    template<typename T> struct VTraits {
+        static inline int vlanes() { return T::nlanes; }
+        enum { nlanes = T::nlanes, max_nlanes = T::nlanes };
+        using lane_type = typename T::lane_type;
     };
 
-    template <>
-    struct VTraits<v_float32>
-    {
-        static inline int vlanes() { return v_float32::nlanes; }
-        enum { nlanes = 16, max_nlanes = nlanes };
-        using lane_type = float;
-    };
-    template <>
-    struct VTraits<v_uint64>
-    {
-        static inline int vlanes() { return v_uint64::nlanes; }
-        enum { nlanes = 8, max_nlanes = nlanes };
-        using lane_type = uint64;
-    };
-    template <>
-    struct VTraits<v_int64>
-    {
-        static inline int vlanes() { return v_int64::nlanes; }
-        enum { nlanes = 8, max_nlanes = nlanes };
-        using lane_type = int64;
-    };
-    #if CV_SIMD_64F
-    template <>
-    struct VTraits<v_float64>
-    {
-        static inline int vlanes() { return v_float64::nlanes; }
-        enum { nlanes = 8, max_nlanes = nlanes };
-        using lane_type = double;
-    };
-    #endif
-#elif CV_SIMD256 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 256)
-    template <>
-    struct VTraits<v_uint8>
-    {
-        static inline int vlanes() { return v_uint8::nlanes; }
-        enum { nlanes = 32, max_nlanes = nlanes };
-        using lane_type = uchar;
-    };
-    template <>
-    struct VTraits<v_int8>
-    {
-        static inline int vlanes() { return v_int8::nlanes; }
-        enum { nlanes = 32, max_nlanes = nlanes };
-        using lane_type = schar;
-    };
-    template <>
-    struct VTraits<v_uint16>
-    {
-        static inline int vlanes() { return v_uint16::nlanes; }
-        enum { nlanes = 16, max_nlanes = nlanes };
-        using lane_type = ushort;
-    };
-    template <>
-    struct VTraits<v_int16>
-    {
-        static inline int vlanes() { return v_int16::nlanes;  }
-        enum { nlanes = 16, max_nlanes = nlanes };
-        using lane_type = short;
-    };
-    template <>
-    struct VTraits<v_uint32>
-    {
-        static inline int vlanes() { return v_uint32::nlanes; }
-        enum { nlanes = 8, max_nlanes = nlanes };
-        using lane_type = uint;
-    };
-    template <>
-    struct VTraits<v_int32>
-    {
-        static inline int vlanes() { return v_int32::nlanes; }
-        enum { nlanes = 8, max_nlanes = nlanes };
-        using lane_type = int;
-    };
-
-    template <>
-    struct VTraits<v_float32>
-    {
-        static inline int vlanes() { return v_float32::nlanes; }
-        enum { nlanes = 8, max_nlanes = nlanes };
-        using lane_type = float;
-    };
-    template <>
-    struct VTraits<v_uint64>
-    {
-        static inline int vlanes() { return v_uint64::nlanes; }
-        enum { nlanes = 4, max_nlanes = nlanes };
-        using lane_type = uint64;
-    };
-    template <>
-    struct VTraits<v_int64>
-    {
-        static inline int vlanes() { return v_int64::nlanes; }
-        enum { nlanes = 4, max_nlanes = nlanes };
-        using lane_type = int64;
-    };
-    #if CV_SIMD_64F
-    template <>
-    struct VTraits<v_float64>
-    {
-        static inline int vlanes() { return v_float64::nlanes; }
-        enum { nlanes = 4, max_nlanes = nlanes };
-        using lane_type = double;
-    };
-    #endif
-#elif CV_SIMD128 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 128)
-    template <>
-    struct VTraits<v_uint8>
-    {
-        static inline int vlanes() { return v_uint8::nlanes; }
-        enum { nlanes = 16, max_nlanes = nlanes };
-        using lane_type = uchar;
-    };
-    template <>
-    struct VTraits<v_int8>
-    {
-        static inline int vlanes() { return v_int8::nlanes; }
-        enum { nlanes = 16, max_nlanes = nlanes };
-        using lane_type = schar;
-    };
-    template <>
-    struct VTraits<v_uint16>
-    {
-        static inline int vlanes() { return v_uint16::nlanes; }
-        enum { nlanes = 8, max_nlanes = nlanes };
-        using lane_type = ushort;
-    };
-    template <>
-    struct VTraits<v_int16>
-    {
-        static inline int vlanes() { return v_int16::nlanes;  }
-        enum { nlanes = 8, max_nlanes = nlanes };
-        using lane_type = short;
-    };
-    template <>
-    struct VTraits<v_uint32>
-    {
-        static inline int vlanes() { return v_uint32::nlanes; }
-        enum { nlanes = 4, max_nlanes = nlanes };
-        using lane_type = uint;
-    };
-    template <>
-    struct VTraits<v_int32>
-    {
-        static inline int vlanes() { return v_int32::nlanes; }
-        enum { nlanes = 4, max_nlanes = nlanes };
-        using lane_type = int;
-    };
-
-    template <>
-    struct VTraits<v_float32>
-    {
-        static inline int vlanes() { return v_float32::nlanes; }
-        enum { nlanes = 4, max_nlanes = nlanes };
-        using lane_type = float;
-    };
-    template <>
-    struct VTraits<v_uint64>
-    {
-        static inline int vlanes() { return v_uint64::nlanes; }
-        enum { nlanes = 2, max_nlanes = nlanes };
-        using lane_type = uint64;
-    };
-    template <>
-    struct VTraits<v_int64>
-    {
-        static inline int vlanes() { return v_int64::nlanes; }
-        enum { nlanes = 2, max_nlanes = nlanes };
-        using lane_type = int64;
-    };
-    #if CV_SIMD_64F
-    template <>
-    struct VTraits<v_float64>
-    {
-        static inline int vlanes() { return v_float64::nlanes; }
-        enum { nlanes = 2, max_nlanes = nlanes };
-        using lane_type = double;
-    };
-    #endif
-#endif
-
     #define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \
     inline _Tpvec v_add(const _Tpvec& a, const _Tpvec& b) \
     { \
diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
index 46222140e6..633a58bca4 100644
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -48,6 +48,7 @@
 #include <limits>
 #include <cstring>
 #include <algorithm>
+#include "opencv2/core/utility.hpp"
 #include "opencv2/core/saturate.hpp"
 
 //! @cond IGNORED
diff --git a/modules/core/include/opencv2/core/saturate.hpp b/modules/core/include/opencv2/core/saturate.hpp
index 8127e3d9ef..e0cc965ab6 100644
--- a/modules/core/include/opencv2/core/saturate.hpp
+++ b/modules/core/include/opencv2/core/saturate.hpp
@@ -46,6 +46,7 @@
 #define OPENCV_CORE_SATURATE_HPP
 
 #include "opencv2/core/cvdef.h"
+#include <climits>
 #include "opencv2/core/fast_math.hpp"
 
 namespace cv
diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp
index 5d4442f111..7477a004ef 100644
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@@ -25,15 +25,30 @@ void test_hal_intrin_float16();
 template <typename R> struct Data;
 template <int N> struct initializer;
 
-#if CV_SIMD
+#if CV_SIMD_SCALABLE
+template <> struct initializer<128>
+{
+    template <typename R> static R init(const Data<R> & d)
+    {
+        return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15],
+        d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31],
+        d[32], d[33], d[34], d[35], d[36], d[37], d[38], d[39], d[40], d[41], d[42], d[43], d[44], d[45], d[46], d[47],
+        d[48], d[49], d[50], d[51], d[52], d[53], d[54], d[55], d[56], d[57], d[58], d[59], d[60], d[61], d[62], d[63],
+        d[64], d[65], d[66], d[67], d[68], d[69], d[70], d[71], d[72], d[73], d[74], d[75], d[76], d[77], d[78], d[79],
+        d[80], d[81], d[82], d[83], d[84], d[85], d[86], d[87], d[88], d[89], d[90], d[91], d[92], d[93], d[94], d[95],
+        d[96], d[97], d[98], d[99], d[100], d[101], d[102], d[103], d[104], d[105], d[106], d[107], d[108], d[109], d[110], d[111],
+        d[112], d[113], d[114], d[115], d[116], d[117], d[118], d[119], d[120], d[121], d[122], d[123], d[124], d[125], d[126], d[127]});
+    }
+};
+
 template <> struct initializer<64>
 {
     template <typename R> static R init(const Data<R> & d)
     {
-        return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15],
+        return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15],
         d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31],
         d[32], d[33], d[34], d[35], d[36], d[37], d[38], d[39], d[40], d[41], d[42], d[43], d[44], d[45], d[46], d[47],
-        d[48], d[49], d[50], d[51], d[52], d[53], d[54], d[55], d[56], d[57], d[58], d[59], d[60], d[61], d[62], d[63]);
+        d[48], d[49], d[50], d[51], d[52], d[53], d[54], d[55], d[56], d[57], d[58], d[59], d[60], d[61], d[62], d[63]});
     }
 };
 
@@ -41,8 +56,8 @@ template <> struct initializer<32>
 {
     template <typename R> static R init(const Data<R> & d)
     {
-        return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15],
-        d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31]);
+        return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15],
+        d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31]});
     }
 };
 
@@ -50,7 +65,7 @@ template <> struct initializer<16>
 {
     template <typename R> static R init(const Data<R> & d)
     {
-        return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15]);
+        return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15]});
     }
 };
 
@@ -58,7 +73,7 @@ template <> struct initializer<8>
 {
     template <typename R> static R init(const Data<R> & d)
     {
-        return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]);
+        return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]});
     }
 };
 
@@ -66,7 +81,7 @@ template <> struct initializer<4>
 {
     template <typename R> static R init(const Data<R> & d)
     {
-        return R(d[0], d[1], d[2], d[3]);
+        return v_load({d[0], d[1], d[2], d[3]});
     }
 };
 
@@ -74,35 +89,19 @@ template <> struct initializer<2>
 {
     template <typename R> static R init(const Data<R> & d)
     {
-        return R(d[0], d[1]);
+        return v_load({d[0], d[1]});
     }
 };
 
 #else
-
-template <> struct initializer<128>
-{
-    template <typename R> static R init(const Data<R> & d)
-    {
-        return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15],
-        d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31],
-        d[32], d[33], d[34], d[35], d[36], d[37], d[38], d[39], d[40], d[41], d[42], d[43], d[44], d[45], d[46], d[47],
-        d[48], d[49], d[50], d[51], d[52], d[53], d[54], d[55], d[56], d[57], d[58], d[59], d[60], d[61], d[62], d[63],
-        d[64], d[65], d[66], d[67], d[68], d[69], d[70], d[71], d[72], d[73], d[74], d[75], d[76], d[77], d[78], d[79],
-        d[80], d[81], d[82], d[83], d[84], d[85], d[86], d[87], d[88], d[89], d[90], d[91], d[92], d[93], d[94], d[95],
-        d[96], d[97], d[98], d[99], d[100], d[101], d[102], d[103], d[104], d[105], d[106], d[107], d[108], d[109], d[110], d[111],
-        d[112], d[113], d[114], d[115], d[116], d[117], d[118], d[119], d[120], d[121], d[122], d[123], d[124], d[125], d[126], d[127]});
-    }
-};
-
 template <> struct initializer<64>
 {
     template <typename R> static R init(const Data<R> & d)
     {
-        return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15],
+        return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15],
         d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31],
         d[32], d[33], d[34], d[35], d[36], d[37], d[38], d[39], d[40], d[41], d[42], d[43], d[44], d[45], d[46], d[47],
-        d[48], d[49], d[50], d[51], d[52], d[53], d[54], d[55], d[56], d[57], d[58], d[59], d[60], d[61], d[62], d[63]});
+        d[48], d[49], d[50], d[51], d[52], d[53], d[54], d[55], d[56], d[57], d[58], d[59], d[60], d[61], d[62], d[63]);
     }
 };
 
@@ -110,8 +109,8 @@ template <> struct initializer<32>
 {
     template <typename R> static R init(const Data<R> & d)
     {
-        return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15],
-        d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31]});
+        return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15],
+        d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31]);
     }
 };
 
@@ -119,7 +118,7 @@ template <> struct initializer<16>
 {
     template <typename R> static R init(const Data<R> & d)
     {
-        return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15]});
+        return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15]);
     }
 };
 
@@ -127,7 +126,7 @@ template <> struct initializer<8>
 {
     template <typename R> static R init(const Data<R> & d)
     {
-        return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]});
+        return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]);
     }
 };
 
@@ -135,7 +134,7 @@ template <> struct initializer<4>
 {
     template <typename R> static R init(const Data<R> & d)
     {
-        return v_load({d[0], d[1], d[2], d[3]});
+        return R(d[0], d[1], d[2], d[3]);
     }
 };
 
@@ -143,7 +142,7 @@ template <> struct initializer<2>
 {
     template <typename R> static R init(const Data<R> & d)
     {
-        return v_load({d[0], d[1]});
+        return R(d[0], d[1]);
     }
 };
 #endif
@@ -1726,8 +1725,122 @@ template<typename R> struct TheTest
 #endif
 };
 
+#if CV_SIMD_SCALABLE //Temporary
+#define DUMP_ENTRY(type) printf("SIMD: %s\n", CV__TRACE_FUNCTION);
+
+
+//=============  8-bit integer =====================================================================
+
+void test_hal_intrin_uint8()
+{
+    DUMP_ENTRY(v_uint8);
+    // typedef v_uint8 R;
+    TheTest<v_uint8>()
+        .test_loadstore()
+        .test_min_max()
+        ;
+}
+
+void test_hal_intrin_int8()
+{
+    DUMP_ENTRY(v_int8);
+    // typedef v_int8 R;
+    TheTest<v_int8>()
+        .test_loadstore()
+        .test_min_max()
+        ;
+}
+
+//============= 16-bit integer =====================================================================
+
+void test_hal_intrin_uint16()
+{
+    DUMP_ENTRY(v_uint16);
+    // typedef v_uint16 R;
+    TheTest<v_uint16>()
+        .test_loadstore()
+        .test_min_max()
+        ;
+}
+
+void test_hal_intrin_int16()
+{
+    DUMP_ENTRY(v_int16);
+    // typedef v_int16 R;
+    TheTest<v_int16>()
+        .test_loadstore()
+        .test_min_max()
+        ;
+}
+
+//============= 32-bit integer =====================================================================
+
+void test_hal_intrin_uint32()
+{
+    DUMP_ENTRY(v_uint32);
+    // typedef v_uint32 R;
+    TheTest<v_uint32>()
+        .test_loadstore()
+        .test_min_max()
+        ;
+}
+
+void test_hal_intrin_int32()
+{
+    DUMP_ENTRY(v_int32);
+    // typedef v_int32 R;
+    TheTest<v_int32>()
+        .test_loadstore()
+        .test_min_max()
+        ;
+}
+
+//============= 64-bit integer =====================================================================
+
+void test_hal_intrin_uint64()
+{
+    DUMP_ENTRY(v_uint64);
+    // typedef v_uint64 R;
+    TheTest<v_uint64>()
+        .test_loadstore()
+        ;
+}
+
+void test_hal_intrin_int64()
+{
+    DUMP_ENTRY(v_int64);
+    // typedef v_int64 R;
+    TheTest<v_int64>()
+        .test_loadstore()
+        ;
+}
+
+//============= Floating point =====================================================================
+void test_hal_intrin_float32()
+{
+    DUMP_ENTRY(v_float32);
+    // typedef v_float32 R;
+    TheTest<v_float32>()
+        .test_loadstore()
+        .test_min_max()
+        ;
+}
+
+void test_hal_intrin_float64()
+{
+    DUMP_ENTRY(v_float64);
+#if CV_SIMD_64F
+    // typedef v_float64 R;
+    TheTest<v_float64>()
+        .test_loadstore()
+        .test_min_max()
+        ;
+
+#endif
+}
+
+#else
 
-#if CV_SIMD
 #define DUMP_ENTRY(type) printf("SIMD%d: %s\n", 8*(int)sizeof(v_uint8), CV__TRACE_FUNCTION);
 //=============  8-bit integer =====================================================================
 
@@ -2075,119 +2188,6 @@ void test_hal_intrin_float16()
     std::cout << "SKIP: CV_FP16 is not available" << std::endl;
 #endif
 }
-#elif CV_SIMD_SCALABLE //Temporary
-#define DUMP_ENTRY(type) printf("SIMD: %s\n", CV__TRACE_FUNCTION);
-
-
-//=============  8-bit integer =====================================================================
-
-void test_hal_intrin_uint8()
-{
-    DUMP_ENTRY(v_uint8);
-    // typedef v_uint8 R;
-    TheTest<v_uint8>()
-        .test_loadstore()
-        .test_min_max()
-        ;
-}
-
-void test_hal_intrin_int8()
-{
-    DUMP_ENTRY(v_int8);
-    // typedef v_int8 R;
-    TheTest<v_int8>()
-        .test_loadstore()
-        .test_min_max()
-        ;
-}
-
-//============= 16-bit integer =====================================================================
-
-void test_hal_intrin_uint16()
-{
-    DUMP_ENTRY(v_uint16);
-    // typedef v_uint16 R;
-    TheTest<v_uint16>()
-        .test_loadstore()
-        .test_min_max()
-        ;
-}
-
-void test_hal_intrin_int16()
-{
-    DUMP_ENTRY(v_int16);
-    // typedef v_int16 R;
-    TheTest<v_int16>()
-        .test_loadstore()
-        .test_min_max()
-        ;
-}
-
-//============= 32-bit integer =====================================================================
-
-void test_hal_intrin_uint32()
-{
-    DUMP_ENTRY(v_uint32);
-    // typedef v_uint32 R;
-    TheTest<v_uint32>()
-        .test_loadstore()
-        .test_min_max()
-        ;
-}
-
-void test_hal_intrin_int32()
-{
-    DUMP_ENTRY(v_int32);
-    // typedef v_int32 R;
-    TheTest<v_int32>()
-        .test_loadstore()
-        .test_min_max()
-        ;
-}
-
-//============= 64-bit integer =====================================================================
-
-void test_hal_intrin_uint64()
-{
-    DUMP_ENTRY(v_uint64);
-    // typedef v_uint64 R;
-    TheTest<v_uint64>()
-        .test_loadstore()
-        ;
-}
-
-void test_hal_intrin_int64()
-{
-    DUMP_ENTRY(v_int64);
-    // typedef v_int64 R;
-    TheTest<v_int64>()
-        .test_loadstore()
-        ;
-}
-
-//============= Floating point =====================================================================
-void test_hal_intrin_float32()
-{
-    DUMP_ENTRY(v_float32);
-    // typedef v_float32 R;
-    TheTest<v_float32>()
-        .test_loadstore()
-        .test_min_max()
-        ;
-}
-
-void test_hal_intrin_float64()
-{
-    DUMP_ENTRY(v_float64);
-#if CV_SIMD_64F
-    // typedef v_float64 R;
-    TheTest<v_float64>()
-        .test_loadstore()
-        .test_min_max()
-        ;
-
-#endif
-}
 
 #endif
 

From a016f6022c2786f827a811731f2bde605ba54526 Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <andrey.senyaev@gmail.com>
Date: Thu, 21 Jul 2022 17:47:55 +0300
Subject: [PATCH 144/178] Android GHA workflow for 3.4 branch

---
 .github/workflows/PR-3.4.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/PR-3.4.yaml b/.github/workflows/PR-3.4.yaml
index fcfa924e96..d528f861e0 100644
--- a/.github/workflows/PR-3.4.yaml
+++ b/.github/workflows/PR-3.4.yaml
@@ -23,3 +23,6 @@ jobs:
 
   iOS:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-3.4-iOS.yaml@main
+
+  Android:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-3.4-Android.yaml@main

From 71e33265f508a9290946ff31053912c751bad772 Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <andrey.senyaev@gmail.com>
Date: Thu, 21 Jul 2022 17:59:16 +0300
Subject: [PATCH 145/178] Android GHA workflow for 4.x branch

---
 .github/workflows/PR-4.x.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/PR-4.x.yaml b/.github/workflows/PR-4.x.yaml
index 44970006f9..37eb10f4ce 100644
--- a/.github/workflows/PR-4.x.yaml
+++ b/.github/workflows/PR-4.x.yaml
@@ -24,5 +24,8 @@ jobs:
   iOS:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-iOS.yaml@main
 
+  Android:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-Android.yaml@main
+
   TIM-VX:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-timvx-backend-tests-4.x.yml@main

From f7292022727e091251f9c8f69a90d90020613b35 Mon Sep 17 00:00:00 2001
From: Maksim Shabunin <maksim.shabunin@gmail.com>
Date: Sun, 24 Jul 2022 11:30:08 +0300
Subject: [PATCH 146/178] core: remove unnecessary pointer cleanup in
 BufferArea

---
 modules/core/src/buffer_area.cpp | 3 +--
 modules/core/test/test_utils.cpp | 7 -------
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/modules/core/src/buffer_area.cpp b/modules/core/src/buffer_area.cpp
index 2fe9d782ae..91e6cb966f 100644
--- a/modules/core/src/buffer_area.cpp
+++ b/modules/core/src/buffer_area.cpp
@@ -29,8 +29,7 @@ public:
     }
     void cleanup() const
     {
-        CV_Assert(ptr && *ptr);
-        *ptr = 0;
+        CV_DbgAssert(ptr);
         if (raw_mem)
             fastFree(raw_mem);
     }
diff --git a/modules/core/test/test_utils.cpp b/modules/core/test/test_utils.cpp
index d8789ddfc2..611c4b2106 100644
--- a/modules/core/test/test_utils.cpp
+++ b/modules/core/test/test_utils.cpp
@@ -403,9 +403,6 @@ TEST_P(BufferArea, basic)
             EXPECT_EQ((double)0, dbl_ptr[i]);
         }
     }
-    EXPECT_TRUE(int_ptr == NULL);
-    EXPECT_TRUE(uchar_ptr == NULL);
-    EXPECT_TRUE(dbl_ptr == NULL);
 }
 
 TEST_P(BufferArea, align)
@@ -442,10 +439,6 @@ TEST_P(BufferArea, align)
             }
         }
     }
-    for (size_t i = 0; i < CNT; ++i)
-    {
-        EXPECT_TRUE(buffers[i] == NULL);
-    }
 }
 
 TEST_P(BufferArea, default_align)

From 2bd72af2ef06f73e8fb8c0d4915825aeccc5bf03 Mon Sep 17 00:00:00 2001
From: HAN Liutong <liutong2020@iscas.ac.cn>
Date: Sun, 24 Jul 2022 17:15:13 +0800
Subject: [PATCH 147/178] Merge pull request #22292 from hanliutong:fix

[GSoC] Fix compilation errors and warnings when using MSVC on Windows.

* Pass reference of the argument.

* Add some cast to suppress warnings.
---
 modules/core/include/opencv2/core/hal/intrin.hpp     | 12 ++++++------
 .../include/opencv2/core/hal/intrin_rvv_scalable.hpp |  6 +++---
 modules/core/test/test_intrin_utils.hpp              |  8 ++++----
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp
index d79c995a27..a04de3a12d 100644
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@@ -714,7 +714,7 @@ namespace CV__SIMD_NAMESPACE {
         return a - b; \
     } \
     template<typename... Args> \
-    inline _Tpvec v_add(_Tpvec f1, _Tpvec f2, Args... vf) { \
+    inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
         return v_add(f1 + f2, vf...); \
     }
 
@@ -765,7 +765,7 @@ namespace CV__SIMD_NAMESPACE {
         return a * b; \
     } \
     template<typename... Args> \
-    inline _Tpvec v_mul(_Tpvec f1, _Tpvec f2, Args... vf) { \
+    inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
         return v_mul(f1 * f2, vf...); \
     }
     OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8)
@@ -820,7 +820,7 @@ namespace CV__SIMD_NAMESPACE {
 
     //////////// get0 ////////////
     #define OPENCV_HAL_WRAP_GRT0_INT(_Tpvec, _Tp) \
-    inline _Tp v_get0(v_##_Tpvec v) \
+    inline _Tp v_get0(const v_##_Tpvec& v) \
     { \
         return v.get0(); \
     }
@@ -839,7 +839,7 @@ namespace CV__SIMD_NAMESPACE {
     #endif
 
     #define OPENCV_HAL_WRAP_EXTRACT(_Tpvec, _Tp, vl) \
-    inline _Tp v_extract_highest(_Tpvec v) \
+    inline _Tp v_extract_highest(const _Tpvec& v) \
     { \
         return v_extract_n<vl-1>(v); \
     }
@@ -858,7 +858,7 @@ namespace CV__SIMD_NAMESPACE {
     #endif
 
     #define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \
-    inline _Tpvec v_broadcast_highest(_Tpvec v) \
+    inline _Tpvec v_broadcast_highest(const _Tpvec& v) \
     { \
         return v_broadcast_element<VTraits<_Tpvec>::nlanes-1>(v); \
     }
@@ -868,7 +868,7 @@ namespace CV__SIMD_NAMESPACE {
     OPENCV_HAL_WRAP_BROADCAST(v_float32)
 
 
-#endif //CV_SIMD
+#endif //!CV_SIMD_SCALABLE
 
 //! @cond IGNORED
 
diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
index b984411436..30c7524699 100644
--- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
@@ -123,7 +123,7 @@ struct VTraits<v_float64>
 
 //////////// get0 ////////////
 #define OPENCV_HAL_IMPL_RVV_GRT0_INT(_Tpvec, _Tp) \
-inline _Tp v_get0(v_##_Tpvec v) \
+inline _Tp v_get0(const v_##_Tpvec& v) \
 { \
     return vmv_x(v); \
 }
@@ -137,12 +137,12 @@ OPENCV_HAL_IMPL_RVV_GRT0_INT(int32, int)
 OPENCV_HAL_IMPL_RVV_GRT0_INT(uint64, uint64)
 OPENCV_HAL_IMPL_RVV_GRT0_INT(int64, int64)
 
-inline float v_get0(v_float32 v) \
+inline float v_get0(const v_float32& v) \
 { \
     return vfmv_f(v); \
 }
 #if CV_SIMD_SCALABLE_64F
-inline double v_get0(v_float64 v) \
+inline double v_get0(const v_float64& v) \
 { \
     return vfmv_f(v); \
 }
diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp
index 7477a004ef..763702bf38 100644
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@@ -1004,7 +1004,7 @@ template<typename R> struct TheTest
     TheTest & test_reduce()
     {
         Data<R> dataA;
-        LaneType min = VTraits<R>::vlanes(), max = 0;
+        LaneType min = (LaneType)VTraits<R>::vlanes(), max = 0;
         int sum = 0;
         for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
@@ -1016,9 +1016,9 @@ template<typename R> struct TheTest
         EXPECT_EQ((LaneType)min, (LaneType)v_reduce_min(a));
         EXPECT_EQ((LaneType)max, (LaneType)v_reduce_max(a));
         EXPECT_EQ((int)(sum), (int)v_reduce_sum(a));
-        dataA[0] += VTraits<R>::vlanes();
+        dataA[0] += (LaneType)VTraits<R>::vlanes();
         R an = dataA;
-        min = VTraits<R>::vlanes();
+        min = (LaneType)VTraits<R>::vlanes();
         for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             min = std::min<LaneType>(min, dataA[i]);
@@ -1029,7 +1029,7 @@ template<typename R> struct TheTest
 
     TheTest & test_reduce_sad()
     {
-        Data<R> dataA, dataB(VTraits<R>::vlanes()/2);
+        Data<R> dataA, dataB((LaneType)VTraits<R>::vlanes() /2);
         R a = dataA;
         R b = dataB;
         uint sum = 0;

From 85b0b0cd7701f03f8053a59e17cf8c6f08dffdfc Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Sun, 24 Jul 2022 12:19:37 +0300
Subject: [PATCH 148/178] build: add missing GCC condition for pragma

---
 modules/features2d/src/sift.simd.hpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/modules/features2d/src/sift.simd.hpp b/modules/features2d/src/sift.simd.hpp
index dfe113b9b3..8a038612c4 100644
--- a/modules/features2d/src/sift.simd.hpp
+++ b/modules/features2d/src/sift.simd.hpp
@@ -829,14 +829,19 @@ else // CV_8U
         v_pack_store(dst + k, __pack01);
     }
 #endif
+
+#if defined(__GNUC__) && __GNUC__ >= 9
 // avoid warning "iteration 7 invokes undefined behavior" on Linux ARM64
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Waggressive-loop-optimizations"
+#endif
     for( ; k < len; k++ )
     {
         dst[k] = saturate_cast<uchar>(rawDst[k]*nrm2);
     }
+#if defined(__GNUC__) && __GNUC__ >= 9
 #pragma GCC diagnostic pop
+#endif
 }
 #else
     float* dst = dstMat.ptr<float>(row);

From f8597fc1506581a996fb4ef385a57816b87ab077 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Sun, 24 Jul 2022 12:39:25 +0300
Subject: [PATCH 149/178] imgproc: fixup leftovers of int->int64_t conversion

---
 modules/imgproc/src/drawing.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/imgproc/src/drawing.cpp b/modules/imgproc/src/drawing.cpp
index 81d56075ca..0e938a644d 100644
--- a/modules/imgproc/src/drawing.cpp
+++ b/modules/imgproc/src/drawing.cpp
@@ -1040,7 +1040,7 @@ EllipseEx( Mat& img, Point2l center, Size2l axes,
 *                                Polygons filling                                        *
 \****************************************************************************************/
 
-static inline void ICV_HLINE_X(uchar* ptr, int xl, int xr, const uchar* color, int pix_size)
+static inline void ICV_HLINE_X(uchar* ptr, int64_t xl, int64_t xr, const uchar* color, int pix_size)
 {
     uchar* hline_min_ptr = (uchar*)(ptr) + (xl)*(pix_size);
     uchar* hline_end_ptr = (uchar*)(ptr) + (xr+1)*(pix_size);
@@ -1065,7 +1065,7 @@ static inline void ICV_HLINE_X(uchar* ptr, int xl, int xr, const uchar* color, i
 }
 //end ICV_HLINE_X()
 
-static inline void ICV_HLINE(uchar* ptr, int xl, int xr, const void* color, int pix_size)
+static inline void ICV_HLINE(uchar* ptr, int64_t xl, int64_t xr, const void* color, int pix_size)
 {
   ICV_HLINE_X(ptr, xl, xr, reinterpret_cast<const uchar*>(color), pix_size);
 }

From b8106e4ba4c29ca01e2b875baa979218bb9084d0 Mon Sep 17 00:00:00 2001
From: Giles Payne <gilespayne@gmail.com>
Date: Mon, 25 Jul 2022 20:57:53 +0900
Subject: [PATCH 150/178] Fix bug in Objective-C/Swift [Mat initWithSize:**]
 functions

---
 modules/core/misc/objc/common/Mat.mm      | 4 ++--
 modules/core/misc/objc/test/MatTest.swift | 6 ++++++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/modules/core/misc/objc/common/Mat.mm b/modules/core/misc/objc/common/Mat.mm
index ab5e1de6f9..a160d98f67 100644
--- a/modules/core/misc/objc/common/Mat.mm
+++ b/modules/core/misc/objc/common/Mat.mm
@@ -98,7 +98,7 @@ static bool updateIdx(cv::Mat* mat, std::vector<int>& indices, size_t inc) {
 - (instancetype)initWithSize:(Size2i*)size type:(int)type {
     self = [super init];
     if (self) {
-        _nativePtr = new cv::Mat(size.width, size.height, type);
+        _nativePtr = new cv::Mat(size.height, size.width, type);
     }
     return self;
 }
@@ -128,7 +128,7 @@ static bool updateIdx(cv::Mat* mat, std::vector<int>& indices, size_t inc) {
     self = [super init];
     if (self) {
         cv::Scalar scalerTemp(scalar.val[0].doubleValue, scalar.val[1].doubleValue, scalar.val[2].doubleValue, scalar.val[3].doubleValue);
-        _nativePtr = new cv::Mat(size.width, size.height, type, scalerTemp);
+        _nativePtr = new cv::Mat(size.height, size.width, type, scalerTemp);
     }
     return self;
 }
diff --git a/modules/core/misc/objc/test/MatTest.swift b/modules/core/misc/objc/test/MatTest.swift
index 87f99fb84a..59504781c9 100644
--- a/modules/core/misc/objc/test/MatTest.swift
+++ b/modules/core/misc/objc/test/MatTest.swift
@@ -17,6 +17,12 @@ class MatTests: OpenCVTestCase {
         super.tearDown()
     }
 
+    func testInitWithSize() {
+        let size = Size(width: 7, height: 9)
+        let mat = Mat(size: size, type: CvType.CV_8U)
+        assertSizeEquals(size, mat.size())
+    }
+
     func testAdjustROI() throws {
         let roi = gray0.submat(rowStart: 3, rowEnd: 5, colStart: 7, colEnd: 10)
         let originalroi = roi.clone()

From fc3e393516a7637425f73a3ce40f7950329bf3e3 Mon Sep 17 00:00:00 2001
From: hzc <1591563365@qq.com>
Date: Tue, 26 Jul 2022 19:35:34 +0800
Subject: [PATCH 151/178] videoio: add support for obsensor (Orbbec RGB-D
 Camera ) (#22196)

* videoio: add support for obsensor (Orbbec RGB-D Camera )

* obsensor: code format issues fixed and some code optimized

* obsensor: fix typo and format issues

* obsensor: fix crosses initialization error
---
 CMakeLists.txt                                |   3 +
 modules/videoio/CMakeLists.txt                |  24 +
 modules/videoio/cmake/detect_obsensor.cmake   |  18 +
 modules/videoio/cmake/init.cmake              |   1 +
 modules/videoio/include/opencv2/videoio.hpp   |  30 ++
 modules/videoio/src/cap_interface.hpp         |   2 +
 .../obsensor_stream_channel_interface.hpp     | 103 ++++
 .../obsensor_stream_channel_msmf.cpp          | 505 ++++++++++++++++++
 .../obsensor_stream_channel_msmf.hpp          | 180 +++++++
 .../obsensor_stream_channel_v4l2.cpp          | 379 +++++++++++++
 .../obsensor_stream_channel_v4l2.hpp          |  90 ++++
 .../obsensor_uvc_stream_channel.cpp           | 265 +++++++++
 .../obsensor_uvc_stream_channel.hpp           |  96 ++++
 modules/videoio/src/cap_obsensor_capture.cpp  | 150 ++++++
 modules/videoio/src/cap_obsensor_capture.hpp  |  66 +++
 modules/videoio/src/videoio_registry.cpp      |   7 +-
 samples/cpp/videocapture_obsensor.cpp         |  74 +++
 17 files changed, 1992 insertions(+), 1 deletion(-)
 create mode 100644 modules/videoio/cmake/detect_obsensor.cmake
 create mode 100644 modules/videoio/src/cap_obsensor/obsensor_stream_channel_interface.hpp
 create mode 100644 modules/videoio/src/cap_obsensor/obsensor_stream_channel_msmf.cpp
 create mode 100644 modules/videoio/src/cap_obsensor/obsensor_stream_channel_msmf.hpp
 create mode 100644 modules/videoio/src/cap_obsensor/obsensor_stream_channel_v4l2.cpp
 create mode 100644 modules/videoio/src/cap_obsensor/obsensor_stream_channel_v4l2.hpp
 create mode 100644 modules/videoio/src/cap_obsensor/obsensor_uvc_stream_channel.cpp
 create mode 100644 modules/videoio/src/cap_obsensor/obsensor_uvc_stream_channel.hpp
 create mode 100644 modules/videoio/src/cap_obsensor_capture.cpp
 create mode 100644 modules/videoio/src/cap_obsensor_capture.hpp
 create mode 100644 samples/cpp/videocapture_obsensor.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5a06a51ad9..43b6023242 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -459,6 +459,9 @@ OCV_OPTION(WITH_ONNX "Include Microsoft ONNX Runtime support" OFF
 OCV_OPTION(WITH_TIMVX "Include Tim-VX support" OFF
   VISIBLE_IF TRUE
   VERIFY HAVE_TIMVX)
+OCV_OPTION(WITH_OBSENSOR "Include obsensor support (Orbbec RGB-D modules: Astra+/Femto)" ON
+  VISIBLE_IF (WIN32 AND NOT ARM AND NOT WINRT) OR ( UNIX AND NOT APPLE AND NOT ANDROID)
+  VERIFY HAVE_OBSENSOR)
 
 # OpenCV build components
 # ===================================================
diff --git a/modules/videoio/CMakeLists.txt b/modules/videoio/CMakeLists.txt
index 377029f45f..3261b9d084 100644
--- a/modules/videoio/CMakeLists.txt
+++ b/modules/videoio/CMakeLists.txt
@@ -235,6 +235,30 @@ if(TARGET ocv.3rdparty.android_native_camera)
   list(APPEND tgts ocv.3rdparty.android_native_camera)
 endif()
 
+if(TARGET ocv.3rdparty.obsensor)
+  list(APPEND videoio_srcs
+    ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor_capture.cpp)
+  list(APPEND videoio_hdrs
+    ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor_capture.hpp
+    ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_stream_channel_interface.hpp)
+  if(HAVE_OBSENSOR_MSMF)
+    list(APPEND videoio_srcs
+      ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_uvc_stream_channel.cpp
+      ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_stream_channel_msmf.cpp)
+    list(APPEND videoio_hdrs
+      ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_uvc_stream_channel.hpp
+      ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_stream_channel_msmf.hpp)
+  elseif(HAVE_OBSENSOR_V4L2)
+    list(APPEND videoio_srcs
+      ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_uvc_stream_channel.cpp
+      ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_stream_channel_v4l2.cpp)
+    list(APPEND videoio_hdrs
+      ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_uvc_stream_channel.hpp
+      ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_stream_channel_v4l2.hpp)
+  endif()
+  list(APPEND tgts ocv.3rdparty.obsensor)
+endif()
+
 if(tgts STREQUAL "PRIVATE")
   set(tgts "")
 endif()
diff --git a/modules/videoio/cmake/detect_obsensor.cmake b/modules/videoio/cmake/detect_obsensor.cmake
new file mode 100644
index 0000000000..73eb638b07
--- /dev/null
+++ b/modules/videoio/cmake/detect_obsensor.cmake
@@ -0,0 +1,18 @@
+# --- obsensor ---
+if(NOT HAVE_OBSENSOR)
+  if(WIN32)
+    check_include_file(mfapi.h HAVE_MFAPI)
+    if(HAVE_MFAPI)
+      set(HAVE_OBSENSOR TRUE)
+      set(HAVE_OBSENSOR_MSMF TRUE)
+      ocv_add_external_target(obsensor "" "" "HAVE_OBSENSOR;HAVE_OBSENSOR_MSMF")
+    endif()
+  elseif(UNIX)
+    check_include_file(linux/videodev2.h HAVE_CAMV4L2)
+    if(HAVE_CAMV4L2)
+      set(HAVE_OBSENSOR TRUE)
+      set(HAVE_OBSENSOR_V4L2 TRUE)
+      ocv_add_external_target(obsensor "" "" "HAVE_OBSENSOR;HAVE_OBSENSOR_V4L2")
+    endif()
+  endif()
+endif()
diff --git a/modules/videoio/cmake/init.cmake b/modules/videoio/cmake/init.cmake
index af664f94df..2237b97a09 100644
--- a/modules/videoio/cmake/init.cmake
+++ b/modules/videoio/cmake/init.cmake
@@ -22,6 +22,7 @@ add_backend("realsense" WITH_LIBREALSENSE)
 add_backend("ueye" WITH_UEYE)
 add_backend("ximea" WITH_XIMEA)
 add_backend("xine" WITH_XINE)
+add_backend("obsensor" WITH_OBSENSOR)
 
 add_backend("avfoundation" WITH_AVFOUNDATION)
 add_backend("ios" WITH_CAP_IOS)
diff --git a/modules/videoio/include/opencv2/videoio.hpp b/modules/videoio/include/opencv2/videoio.hpp
index 173c73c454..3a4b5995f1 100644
--- a/modules/videoio/include/opencv2/videoio.hpp
+++ b/modules/videoio/include/opencv2/videoio.hpp
@@ -123,6 +123,7 @@ enum VideoCaptureAPIs {
        CAP_INTEL_MFX    = 2300,         //!< Intel MediaSDK
        CAP_XINE         = 2400,         //!< XINE engine (Linux)
        CAP_UEYE         = 2500,         //!< uEye Camera API
+       CAP_OBSENSOR     = 2600,         //!< For Orbbec 3D-Sensor device/module (Astra+, Femto)
      };
 
 /** @brief cv::VideoCapture generic properties identifier.
@@ -654,6 +655,35 @@ enum { CAP_PROP_IMAGES_BASE = 18000,
 
 //! @} Images
 
+/** @name OBSENSOR (for Orbbec 3D-Sensor device/module )
+    @{
+*/
+//! OBSENSOR data given from image generator
+enum VideoCaptureOBSensorDataType{ 
+    CAP_OBSENSOR_DEPTH_MAP = 0, //!< Depth values in mm (CV_16UC1)
+    CAP_OBSENSOR_BGR_IMAGE = 1, //!< Data given from BGR stream generator
+    CAP_OBSENSOR_IR_IMAGE = 2   //!< Data given from IR stream generator(CV_16UC1)
+};
+
+//! OBSENSOR stream generator
+enum VideoCaptureOBSensorGenerators{ 
+    CAP_OBSENSOR_DEPTH_GENERATOR = 1 << 29,
+    CAP_OBSENSOR_IMAGE_GENERATOR = 1 << 28,
+    CAP_OBSENSOR_IR_GENERATOR    = 1 << 27,  
+    CAP_OBSENSOR_GENERATORS_MASK = CAP_OBSENSOR_DEPTH_GENERATOR + CAP_OBSENSOR_IMAGE_GENERATOR + CAP_OBSENSOR_IR_GENERATOR
+};
+
+//!OBSENSOR properties 
+enum VideoCaptureOBSensorProperties{
+    // INTRINSIC
+    CAP_PROP_OBSENSOR_INTRINSIC_FX=26001,
+    CAP_PROP_OBSENSOR_INTRINSIC_FY=26002,
+    CAP_PROP_OBSENSOR_INTRINSIC_CX=26003,
+    CAP_PROP_OBSENSOR_INTRINSIC_CY=26004,
+};
+
+//! @} OBSENSOR
+
 //! @} videoio_flags_others
 
 
diff --git a/modules/videoio/src/cap_interface.hpp b/modules/videoio/src/cap_interface.hpp
index eef515b2e3..ebc07d77b0 100644
--- a/modules/videoio/src/cap_interface.hpp
+++ b/modules/videoio/src/cap_interface.hpp
@@ -411,6 +411,8 @@ Ptr<IVideoCapture> createXINECapture(const std::string &filename);
 Ptr<IVideoCapture> createAndroidCapture_cam( int index );
 Ptr<IVideoCapture> createAndroidCapture_file(const std::string &filename);
 
+Ptr<IVideoCapture> create_obsensor_capture(int index);
+
 bool VideoCapture_V4L_waitAny(
         const std::vector<VideoCapture>& streams,
         CV_OUT std::vector<int>& ready,
diff --git a/modules/videoio/src/cap_obsensor/obsensor_stream_channel_interface.hpp b/modules/videoio/src/cap_obsensor/obsensor_stream_channel_interface.hpp
new file mode 100644
index 0000000000..58e182ac0a
--- /dev/null
+++ b/modules/videoio/src/cap_obsensor/obsensor_stream_channel_interface.hpp
@@ -0,0 +1,103 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+/*
+* Copyright(C) 2022 by ORBBEC Technology., Inc.
+* Authors:
+*   Huang Zhenchang <yufeng@orbbec.com>
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef OPENCV_VIDEOIO_OBSENSOR_STREAM_CHANNEL_INTERFACE_HPP
+#define OPENCV_VIDEOIO_OBSENSOR_STREAM_CHANNEL_INTERFACE_HPP
+
+#ifdef HAVE_OBSENSOR
+
+#include "../precomp.hpp" // #include "precomp.hpp : compile error on linux
+
+#include <functional>
+#include <vector>
+#include <memory>
+
+namespace cv {
+namespace obsensor {
+enum StreamType
+{
+    OBSENSOR_STREAM_IR = 1,
+    OBSENSOR_STREAM_COLOR = 2,
+    OBSENSOR_STREAM_DEPTH = 3,
+};
+
+enum FrameFormat
+{
+    FRAME_FORMAT_UNKNOWN = -1,
+    FRAME_FORMAT_YUYV = 0,
+    FRAME_FORMAT_MJPG = 5,
+    FRAME_FORMAT_Y16 = 8,
+};
+
+enum PropertyId
+{
+    DEPTH_TO_COLOR_ALIGN = 42,
+    CAMERA_PARAM = 1001,
+};
+
+struct Frame
+{
+    FrameFormat format;
+    uint32_t width;
+    uint32_t height;
+    uint32_t dataSize;
+    uint8_t* data;
+};
+
+struct StreamProfile
+{
+    uint32_t width;
+    uint32_t height;
+    uint32_t fps;
+    FrameFormat format;
+};
+
+struct CameraParam
+{
+    float    p0[4];
+    float    p1[4];
+    float    p2[9];
+    float    p3[3];
+    float    p4[5];
+    float    p5[5];
+    uint32_t p6[2];
+    uint32_t p7[2];
+};
+
+typedef std::function<void(Frame*)> FrameCallback;
+class IStreamChannel
+{
+public:
+    virtual ~IStreamChannel() noexcept {}
+    virtual void start(const StreamProfile& profile, FrameCallback frameCallback) = 0;
+    virtual void stop() = 0;
+    virtual bool setProperty(int propId, const uint8_t* data, uint32_t dataSize) = 0;
+    virtual bool getProperty(int propId, uint8_t* recvData, uint32_t* recvDataSize) = 0;
+
+    virtual StreamType streamType() const = 0;
+};
+
+// "StreamChannelGroup" mean a group of stream channels from same one physical device
+std::vector<Ptr<IStreamChannel>> getStreamChannelGroup(uint32_t groupIdx = 0);
+
+}} // namespace cv::obsensor::
+#endif // HAVE_OBSENSOR
+#endif // OPENCV_VIDEOIO_OBSENSOR_STREAM_CHANNEL_INTERFACE_HPP
diff --git a/modules/videoio/src/cap_obsensor/obsensor_stream_channel_msmf.cpp b/modules/videoio/src/cap_obsensor/obsensor_stream_channel_msmf.cpp
new file mode 100644
index 0000000000..b374424050
--- /dev/null
+++ b/modules/videoio/src/cap_obsensor/obsensor_stream_channel_msmf.cpp
@@ -0,0 +1,505 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+/*
+* Copyright(C) 2022 by ORBBEC Technology., Inc.
+* Authors:
+*   Huang Zhenchang <yufeng@orbbec.com>
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+
+#ifdef HAVE_OBSENSOR_MSMF
+
+#include "obsensor_stream_channel_msmf.hpp"
+
+#include <shlwapi.h> // QISearch
+
+#pragma warning(disable : 4503)
+#pragma comment(lib, "mfplat")
+#pragma comment(lib, "mf")
+#pragma comment(lib, "mfuuid")
+#pragma comment(lib, "Strmiids")
+#pragma comment(lib, "Mfreadwrite")
+#pragma comment(lib, "dxgi")
+
+namespace cv {
+namespace obsensor {
+std::string wideCharToUTF8(const WCHAR* s)
+{
+    auto len = WideCharToMultiByte(CP_UTF8, 0, s, -1, nullptr, 0, nullptr, nullptr);
+    if (len == 0)
+        return "";
+    std::string buffer(len - 1, ' ');
+    len = WideCharToMultiByte(CP_UTF8, 0, s, -1, &buffer[0], static_cast<int>(buffer.size()) + 1, nullptr, nullptr);
+    return buffer;
+}
+
+std::string hr_to_string(HRESULT hr)
+{
+    _com_error err(hr);
+    std::stringstream ss;
+    ss << "HResult 0x" << std::hex << hr << ": \"" << err.ErrorMessage() << "\"";
+    return ss.str();
+}
+
+#define HR_FAILED_RETURN(x)                                                      \
+    if (x < 0)                                                                   \
+    {                                                                            \
+        CV_LOG_INFO(NULL, "Media Foundation error return: " << hr_to_string(x)); \
+        return;                                                                  \
+    }
+
+#define HR_FAILED_LOG(x)                                                         \
+    if (x < 0)                                                                   \
+    {                                                                            \
+        CV_LOG_INFO(NULL, "Media Foundation error return: " << hr_to_string(x)); \
+    }
+
+#define HR_FAILED_EXEC(x, statement)                                             \
+    if (x < 0)                                                                   \
+    {                                                                            \
+        CV_LOG_INFO(NULL, "Media Foundation error return: " << hr_to_string(x)); \
+        statement;                                                               \
+    }
+
+std::vector<std::string> stringSplit(std::string string, char separator)
+{
+    std::vector<std::string> tokens;
+    std::string::size_type i1 = 0;
+    while (true)
+    {
+        auto i2 = string.find(separator, i1);
+        if (i2 == std::string::npos)
+        {
+            tokens.push_back(string.substr(i1));
+            return tokens;
+        }
+        tokens.push_back(string.substr(i1, i2 - i1));
+        i1 = i2 + 1;
+    }
+}
+
+bool parseUvcDeviceSymbolicLink(const std::string& symbolicLink, uint16_t& vid, uint16_t& pid, uint16_t& mi, std::string& unique_id,
+    std::string& device_guid)
+{
+    std::string lowerStr = symbolicLink;
+    for (size_t i = 0; i < lowerStr.length(); i++)
+    {
+        lowerStr[i] = (char)tolower(lowerStr[i]); 
+    }
+    auto tokens = stringSplit(lowerStr, '#');
+    if (tokens.size() < 1 || (tokens[0] != R"(\\?\usb)" && tokens[0] != R"(\\?\hid)"))
+        return false; // Not a USB device
+    if (tokens.size() < 3)
+    {
+        return false;
+    }
+
+    auto ids = stringSplit(tokens[1], '&');
+    if (ids[0].size() != 8 || ids[0].substr(0, 4) != "vid_" || !(std::istringstream(ids[0].substr(4, 4)) >> std::hex >> vid))
+    {
+        return false;
+    }
+    if (ids[1].size() != 8 || ids[1].substr(0, 4) != "pid_" || !(std::istringstream(ids[1].substr(4, 4)) >> std::hex >> pid))
+    {
+        return false;
+    }
+    if (ids.size() > 2 && (ids[2].size() != 5 || ids[2].substr(0, 3) != "mi_" || !(std::istringstream(ids[2].substr(3, 2)) >> mi)))
+    {
+        return false;
+    }
+    ids = stringSplit(tokens[2], '&');
+    if (ids.size() == 0)
+    {
+        return false;
+    }
+
+    if (ids.size() > 2)
+        unique_id = ids[1];
+    else
+        unique_id = "";
+
+    if (tokens.size() >= 3)
+        device_guid = tokens[3];
+
+    return true;
+}
+
+#pragma pack(push, 1)
+template <class T>
+class big_endian
+{
+    T be_value;
+
+public:
+    operator T() const
+    {
+        T le_value = 0;
+        for (unsigned int i = 0; i < sizeof(T); ++i)
+            reinterpret_cast<char*>(&le_value)[i] = reinterpret_cast<const char*>(&be_value)[sizeof(T) - i - 1];
+        return le_value;
+    }
+};
+#pragma pack(pop)
+
+MFContext::~MFContext(void)
+{
+    CoUninitialize();
+}
+
+MFContext& MFContext::getInstance()
+{
+    static MFContext instance;
+    return instance;
+}
+
+std::vector<UvcDeviceInfo> MFContext::queryUvcDeviceInfoList()
+{
+    std::vector<UvcDeviceInfo> uvcDevList;
+    ComPtr<IMFAttributes> pAttributes = nullptr;
+    MFCreateAttributes(&pAttributes, 1);
+    pAttributes->SetGUID(MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE, MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_VIDCAP_GUID);
+    // pAttributes->SetGUID(MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_VIDCAP_CATEGORY, KSCATEGORY_SENSOR_CAMERA);
+    IMFActivate** ppDevices;
+    uint32_t numDevices;
+    MFEnumDeviceSources(pAttributes.Get(), &ppDevices, &numDevices);
+    for (uint32_t i = 0; i < numDevices; ++i)
+    {
+        ComPtr<IMFActivate> pDevice;
+        pDevice = ppDevices[i];
+
+        WCHAR* wCharStr = nullptr;
+        uint32_t length;
+        pDevice->GetAllocatedString(MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_VIDCAP_SYMBOLIC_LINK, &wCharStr, &length);
+        auto symbolicLink = wideCharToUTF8(wCharStr);
+        CoTaskMemFree(wCharStr);
+
+        pDevice->GetAllocatedString(MF_DEVSOURCE_ATTRIBUTE_FRIENDLY_NAME, &wCharStr, &length);
+        auto name = wideCharToUTF8(wCharStr);
+        CoTaskMemFree(wCharStr);
+
+        uint16_t vid, pid, mi;
+        std::string uid, guid;
+        if (!parseUvcDeviceSymbolicLink(symbolicLink, vid, pid, mi, uid, guid))
+            continue;
+        uvcDevList.emplace_back(UvcDeviceInfo({ symbolicLink, name, uid, vid, pid, mi }));
+        CV_LOG_INFO(NULL, "UVC device found: name=" << name << ", vid=" << vid << ", pid=" << pid << ", mi=" << mi << ", uid=" << uid << ", guid=" << guid);
+    }
+    return uvcDevList;
+}
+
+Ptr<IStreamChannel> MFContext::createStreamChannel(const UvcDeviceInfo& devInfo)
+{
+    return makePtr<MSMFStreamChannel>(devInfo);
+}
+
+MFContext::MFContext()
+{
+    CoInitialize(0);
+    CV_Assert(SUCCEEDED(MFStartup(MF_VERSION)));
+}
+
+MSMFStreamChannel::MSMFStreamChannel(const UvcDeviceInfo& devInfo) :
+    IUvcStreamChannel(devInfo),
+    mfContext_(MFContext::getInstance()),
+    xuNodeId_(-1)
+{
+    HR_FAILED_RETURN(MFCreateAttributes(&deviceAttrs_, 2));
+    HR_FAILED_RETURN(deviceAttrs_->SetGUID(MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE, MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_VIDCAP_GUID));
+    WCHAR* buffer = new wchar_t[devInfo_.id.length() + 1];
+    MultiByteToWideChar(CP_UTF8, 0, devInfo_.id.c_str(), (int)devInfo_.id.length() + 1, buffer, (int)devInfo_.id.length() * sizeof(WCHAR));
+    HR_FAILED_EXEC(deviceAttrs_->SetString(MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_VIDCAP_SYMBOLIC_LINK, buffer), {
+        delete[] buffer;
+        return;
+        })
+        delete[] buffer;
+        HR_FAILED_RETURN(MFCreateDeviceSource(deviceAttrs_.Get(), &deviceSource_));
+        HR_FAILED_RETURN(deviceSource_->QueryInterface(__uuidof(IAMCameraControl), reinterpret_cast<void**>(&cameraControl_)));
+        HR_FAILED_RETURN(deviceSource_->QueryInterface(__uuidof(IAMVideoProcAmp), reinterpret_cast<void**>(&videoProcAmp_)));
+
+        HR_FAILED_RETURN(MFCreateAttributes(&readerAttrs_, 3));
+        HR_FAILED_RETURN(readerAttrs_->SetUINT32(MF_SOURCE_READER_DISCONNECT_MEDIASOURCE_ON_SHUTDOWN, false));
+        HR_FAILED_RETURN(readerAttrs_->SetUINT32(MF_READWRITE_ENABLE_HARDWARE_TRANSFORMS, true));
+        HR_FAILED_RETURN(readerAttrs_->SetUnknown(MF_SOURCE_READER_ASYNC_CALLBACK, static_cast<IUnknown*>(this)));
+        HR_FAILED_RETURN(MFCreateSourceReaderFromMediaSource(deviceSource_.Get(), readerAttrs_.Get(), &streamReader_));
+        HR_FAILED_RETURN(streamReader_->SetStreamSelection(static_cast<DWORD>(MF_SOURCE_READER_ALL_STREAMS), true));
+
+        HR_FAILED_RETURN(deviceSource_->QueryInterface(__uuidof(IKsTopologyInfo), reinterpret_cast<void**>(&xuKsTopologyInfo_)));
+        DWORD nNodes = 0;
+        HR_FAILED_RETURN(xuKsTopologyInfo_->get_NumNodes(&nNodes));
+        for (DWORD i = 0; i < nNodes; i++)
+        {
+            GUID nodeType;
+            HR_FAILED_EXEC(xuKsTopologyInfo_->get_NodeType(i, &nodeType), { continue; })
+                if (nodeType == KSNODETYPE_DEV_SPECIFIC)
+                {
+                    xuNodeId_ = i;
+                }
+        }
+        if (xuNodeId_ != -1)
+        {
+            HR_FAILED_RETURN(xuKsTopologyInfo_->CreateNodeInstance(xuNodeId_, IID_IUnknown, reinterpret_cast<LPVOID*>(&xuNodeInstance_)));
+            HR_FAILED_RETURN(xuNodeInstance_->QueryInterface(__uuidof(IKsControl), reinterpret_cast<void**>(&xuKsControl_)));
+        }
+
+        if (streamType_ == OBSENSOR_STREAM_DEPTH)
+        {
+            initDepthFrameProcessor();
+        }
+}
+
+MSMFStreamChannel::~MSMFStreamChannel()
+{
+    stop();
+    if (cameraControl_)
+    {
+        cameraControl_.Release();
+    }
+    if (videoProcAmp_)
+    {
+        videoProcAmp_.Release();
+    }
+    if (streamReader_)
+    {
+        streamReader_.Release();
+    }
+    if (readerAttrs_)
+    {
+        readerAttrs_.Release();
+    }
+    if (deviceAttrs_)
+    {
+        deviceAttrs_.Release();
+    }
+    if (deviceSource_)
+    {
+        deviceSource_.Release();
+    }
+    if (xuKsTopologyInfo_)
+    {
+        xuKsTopologyInfo_.Release();
+    }
+    if (xuNodeInstance_)
+    {
+        xuNodeInstance_.Release();
+    }
+    if (xuKsControl_)
+    {
+        xuKsControl_.Release();
+    }
+}
+
+void MSMFStreamChannel::start(const StreamProfile& profile, FrameCallback frameCallback)
+{
+    ComPtr<IMFMediaType> mediaType = nullptr;
+    unsigned int width, height, fps;
+    FrameRate frameRateMin, frameRateMax;
+    bool quit = false;
+
+    frameCallback_ = frameCallback;
+    currentProfile_ = profile;
+    currentStreamIndex_ = -1;
+
+    for (uint8_t index = 0; index <= 5; index++)
+    {
+        for (uint32_t k = 0;; k++)
+        {
+            HR_FAILED_EXEC(streamReader_->GetNativeMediaType(index, k, &mediaType), { continue; })
+                GUID subtype;
+            HR_FAILED_RETURN(mediaType->GetGUID(MF_MT_SUBTYPE, &subtype));
+            HR_FAILED_RETURN(MFGetAttributeSize(mediaType.Get(), MF_MT_FRAME_SIZE, &width, &height));
+            HR_FAILED_RETURN(MFGetAttributeRatio(mediaType.Get(), MF_MT_FRAME_RATE_RANGE_MIN, &frameRateMin.numerator, &frameRateMin.denominator));
+            HR_FAILED_RETURN(MFGetAttributeRatio(mediaType.Get(), MF_MT_FRAME_RATE_RANGE_MAX, &frameRateMax.numerator, &frameRateMax.denominator));
+
+            if (static_cast<float>(frameRateMax.numerator) / frameRateMax.denominator < static_cast<float>(frameRateMin.numerator) / frameRateMin.denominator)
+            {
+                std::swap(frameRateMax, frameRateMin);
+            }
+
+            fps = frameRateMax.numerator / frameRateMax.denominator;
+            uint32_t device_fourcc = reinterpret_cast<const big_endian<uint32_t> &>(subtype.Data1);
+            if (width == profile.width &&
+                height == profile.height &&
+                fps == profile.fps &&
+                frameFourccToFormat(device_fourcc) == profile.format)
+            {
+                HR_FAILED_RETURN(streamReader_->SetCurrentMediaType(index, nullptr, mediaType.Get()));
+                HR_FAILED_RETURN(streamReader_->SetStreamSelection(index, true));
+                streamReader_->ReadSample(index, 0, nullptr, nullptr, nullptr, nullptr);
+
+                streamState_ = STREAM_STARTING;
+                currentStreamIndex_ = index;
+                quit = true;
+
+                // wait for frame
+                std::unique_lock<std::mutex> lock(streamStateMutex_);
+                auto success = streamStateCv_.wait_for(lock, std::chrono::milliseconds(3000), [&]() {
+                    return streamState_ == STREAM_STARTED;
+                });
+                if (!success)
+                {
+                    stop();
+                }
+                break;
+            }
+            mediaType.Release();
+        }
+        if (quit)
+        {
+            break;
+        }
+    }
+    streamState_ = quit ? streamState_ : STREAM_STOPED;
+}
+
+void MSMFStreamChannel::stop()
+{
+    if (streamState_ == STREAM_STARTING || streamState_ == STREAM_STARTED)
+    {
+        streamState_ = STREAM_STOPPING;
+        streamReader_->SetStreamSelection(currentStreamIndex_, false);
+        streamReader_->Flush(currentStreamIndex_);
+        std::unique_lock<std::mutex> lk(streamStateMutex_);
+        streamStateCv_.wait_for(lk, std::chrono::milliseconds(1000), [&]() {
+            return streamState_ == STREAM_STOPED;
+        });
+    }
+}
+
+bool  MSMFStreamChannel::setXu(uint8_t ctrl, const uint8_t* data, uint32_t len)
+{
+    if (xuSendBuf_.size() < XU_MAX_DATA_LENGTH) {
+        xuSendBuf_.resize(XU_MAX_DATA_LENGTH);
+    }
+    memcpy(xuSendBuf_.data(), data, len);
+
+    KSP_NODE                              node;
+    memset(&node, 0, sizeof(KSP_NODE));
+    node.Property.Set = { 0xA55751A1, 0xF3C5, 0x4A5E, {0x8D, 0x5A, 0x68, 0x54, 0xB8, 0xFA, 0x27, 0x16} };
+    node.Property.Id = ctrl;
+    node.Property.Flags = KSPROPERTY_TYPE_SET | KSPROPERTY_TYPE_TOPOLOGY;
+    node.NodeId = xuNodeId_;
+
+    ULONG bytes_received = 0;
+    HR_FAILED_EXEC(xuKsControl_->KsProperty(reinterpret_cast<PKSPROPERTY>(&node), sizeof(KSP_NODE), (void*)xuSendBuf_.data(), XU_MAX_DATA_LENGTH, &bytes_received), {
+        return false;
+    });
+    return true;
+}
+
+bool  MSMFStreamChannel::getXu(uint8_t ctrl, uint8_t** data, uint32_t* len)
+{
+    if (xuRecvBuf_.size() < XU_MAX_DATA_LENGTH) {
+        xuRecvBuf_.resize(XU_MAX_DATA_LENGTH);
+    }
+    KSP_NODE node;
+    memset(&node, 0, sizeof(KSP_NODE));
+    node.Property.Set = { 0xA55751A1, 0xF3C5, 0x4A5E, {0x8D, 0x5A, 0x68, 0x54, 0xB8, 0xFA, 0x27, 0x16} };
+    node.Property.Id = ctrl;
+    node.Property.Flags = KSPROPERTY_TYPE_GET | KSPROPERTY_TYPE_TOPOLOGY;
+    node.NodeId = xuNodeId_;
+
+    ULONG bytes_received = 0;
+    HR_FAILED_EXEC(xuKsControl_->KsProperty(reinterpret_cast<PKSPROPERTY>(&node), sizeof(node), xuRecvBuf_.data(), XU_MAX_DATA_LENGTH, &bytes_received), {
+        *len = 0;
+        data = nullptr;
+        return false;
+    });
+    *data = xuRecvBuf_.data();
+    *len = bytes_received;
+    return true;
+}
+
+STDMETHODIMP MSMFStreamChannel::QueryInterface(REFIID iid, void** ppv)
+{
+#pragma warning(push)
+#pragma warning(disable : 4838)
+    static const QITAB qit[] = {
+        QITABENT(MSMFStreamChannel, IMFSourceReaderCallback),
+        {nullptr},
+    };
+    return QISearch(this, qit, iid, ppv);
+#pragma warning(pop)
+}
+
+STDMETHODIMP_(ULONG)
+MSMFStreamChannel::AddRef()
+{
+    return InterlockedIncrement(&refCount_);
+}
+
+STDMETHODIMP_(ULONG)
+MSMFStreamChannel::Release()
+{
+    ULONG count = InterlockedDecrement(&refCount_);
+    if (count <= 0)
+    {
+        delete this;
+    }
+    return count;
+}
+
+STDMETHODIMP MSMFStreamChannel::OnReadSample(HRESULT hrStatus, DWORD dwStreamIndex, DWORD /*dwStreamFlags*/, LONGLONG /*timeStamp*/, IMFSample* sample)
+{
+    HR_FAILED_LOG(hrStatus);
+
+    if (streamState_ == STREAM_STARTING)
+    {
+        std::unique_lock<std::mutex> lock(streamStateMutex_);
+        streamState_ = STREAM_STARTED;
+        streamStateCv_.notify_all();
+    }
+
+    if (streamState_ != STREAM_STOPPING && streamState_ != STREAM_STOPED)
+    {
+        HR_FAILED_LOG(streamReader_->ReadSample(dwStreamIndex, 0, nullptr, nullptr, nullptr, nullptr));
+        if (sample)
+        {
+            ComPtr<IMFMediaBuffer> buffer = nullptr;
+            DWORD max_length, current_length;
+            byte* byte_buffer = nullptr;
+
+            HR_FAILED_EXEC(sample->GetBufferByIndex(0, &buffer), { return S_OK; });
+
+            buffer->Lock(&byte_buffer, &max_length, &current_length);
+            Frame fo = { currentProfile_.format, currentProfile_.width, currentProfile_.height, current_length, (uint8_t*)byte_buffer };
+            if (depthFrameProcessor_)
+            {
+                depthFrameProcessor_->process(&fo);
+            }
+            frameCallback_(&fo);
+            buffer->Unlock();
+        }
+    }
+    return S_OK;
+}
+
+STDMETHODIMP MSMFStreamChannel::OnEvent(DWORD /*sidx*/, IMFMediaEvent* /*event*/)
+{
+    return S_OK;
+}
+
+STDMETHODIMP MSMFStreamChannel::OnFlush(DWORD)
+{
+    if (streamState_ == STREAM_STARTING)
+    {
+        std::unique_lock<std::mutex> lock(streamStateMutex_);
+        streamState_ = STREAM_STOPED;
+        streamStateCv_.notify_all();
+    }
+    return S_OK;
+}
+}} // namespace cv::obsensor::
+#endif // HAVE_OBSENSOR_MSMF
diff --git a/modules/videoio/src/cap_obsensor/obsensor_stream_channel_msmf.hpp b/modules/videoio/src/cap_obsensor/obsensor_stream_channel_msmf.hpp
new file mode 100644
index 0000000000..aa40fe53be
--- /dev/null
+++ b/modules/videoio/src/cap_obsensor/obsensor_stream_channel_msmf.hpp
@@ -0,0 +1,180 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+/*
+* Copyright(C) 2022 by ORBBEC Technology., Inc.
+* Authors:
+*   Huang Zhenchang <yufeng@orbbec.com>
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef OPENCV_VIDEOIO_OBSENSOR_STREAM_CHANNEL_MSMF_HPP
+#define OPENCV_VIDEOIO_OBSENSOR_STREAM_CHANNEL_MSMF_HPP
+#ifdef HAVE_OBSENSOR_MSMF
+
+#include "obsensor_uvc_stream_channel.hpp"
+
+#include <windows.h>
+#include <guiddef.h>
+#include <mfapi.h>
+#include <mfidl.h>
+#include <mfplay.h>
+#include <mfobjects.h>
+#include <mfreadwrite.h>
+#include <tchar.h>
+#include <strsafe.h>
+#include <codecvt>
+#include <ks.h>
+#include <comdef.h>
+#include <mutex>
+#include <vidcap.h> //IKsTopologyInfo
+#include <ksproxy.h> //IKsControl
+#include <ksmedia.h> 
+
+namespace cv {
+namespace obsensor {
+template <class T>
+class ComPtr
+{
+public:
+    ComPtr() {}
+    ComPtr(T* lp)
+    {
+        p = lp;
+    }
+    ComPtr(_In_ const ComPtr<T>& lp)
+    {
+        p = lp.p;
+    }
+    virtual ~ComPtr() {}
+
+    void swap(_In_ ComPtr<T>& lp)
+    {
+        ComPtr<T> tmp(p);
+        p = lp.p;
+        lp.p = tmp.p;
+        tmp = NULL;
+    }
+    T** operator&()
+    {
+        CV_Assert(p == NULL);
+        return p.operator&();
+    }
+    T* operator->() const
+    {
+        CV_Assert(p != NULL);
+        return p.operator->();
+    }
+    operator bool()
+    {
+        return p.operator!=(NULL);
+    }
+
+    T* Get() const
+    {
+        return p;
+    }
+
+    void Release()
+    {
+        if (p)
+            p.Release();
+    }
+
+    // query for U interface
+    template <typename U>
+    HRESULT As(_Out_ ComPtr<U>& lp) const
+    {
+        lp.Release();
+        return p->QueryInterface(__uuidof(U), reinterpret_cast<void**>((T**)&lp));
+    }
+
+private:
+    _COM_SMARTPTR_TYPEDEF(T, __uuidof(T));
+    TPtr p;
+};
+
+class MFContext
+{
+public:
+    ~MFContext(void);
+    static MFContext& getInstance();
+
+    std::vector<UvcDeviceInfo> queryUvcDeviceInfoList();
+    Ptr<IStreamChannel> createStreamChannel(const UvcDeviceInfo& devInfo);
+
+private:
+    MFContext(void);
+};
+
+struct FrameRate
+{
+    unsigned int denominator;
+    unsigned int numerator;
+};
+
+class MSMFStreamChannel : public IUvcStreamChannel, public IMFSourceReaderCallback
+{
+public:
+    MSMFStreamChannel(const UvcDeviceInfo& devInfo);
+    virtual ~MSMFStreamChannel() noexcept;
+
+    virtual void start(const StreamProfile& profile, FrameCallback frameCallback) override;
+    virtual void stop() override;
+
+private:
+    virtual bool setXu(uint8_t ctrl, const uint8_t* data, uint32_t len) override;
+    virtual bool getXu(uint8_t ctrl, uint8_t** data, uint32_t* len) override;
+
+private:
+    MFContext& mfContext_;
+
+    ComPtr<IMFAttributes> deviceAttrs_ = nullptr;
+    ComPtr<IMFMediaSource> deviceSource_ = nullptr;
+    ComPtr<IMFAttributes> readerAttrs_ = nullptr;
+    ComPtr<IMFSourceReader> streamReader_ = nullptr;
+    ComPtr<IAMCameraControl> cameraControl_ = nullptr;
+    ComPtr<IAMVideoProcAmp> videoProcAmp_ = nullptr;
+    ComPtr<IKsTopologyInfo> xuKsTopologyInfo_ = nullptr;
+    ComPtr<IUnknown> xuNodeInstance_ = nullptr;
+    ComPtr<IKsControl> xuKsControl_ = nullptr;
+    int xuNodeId_;
+
+    FrameCallback frameCallback_;
+    StreamProfile currentProfile_;
+    int8_t currentStreamIndex_;
+
+    StreamState streamState_;
+    std::mutex streamStateMutex_;
+    std::condition_variable streamStateCv_;
+
+    std::vector<uint8_t> xuRecvBuf_;
+    std::vector<uint8_t> xuSendBuf_;
+
+public:
+    STDMETHODIMP QueryInterface(REFIID iid, void** ppv) override;
+    STDMETHODIMP_(ULONG)
+        AddRef() override;
+    STDMETHODIMP_(ULONG)
+        Release() override;
+    STDMETHODIMP OnReadSample(HRESULT /*hrStatus*/, DWORD dwStreamIndex, DWORD /*dwStreamFlags*/, LONGLONG /*llTimestamp*/, IMFSample* sample) override;
+    STDMETHODIMP OnEvent(DWORD /*sidx*/, IMFMediaEvent* /*event*/) override;
+    STDMETHODIMP OnFlush(DWORD) override;
+
+private:
+    long refCount_ = 1;
+};
+}} // namespace cv::obsensor::
+#endif // HAVE_OBSENSOR_MSMF
+#endif // OPENCV_VIDEOIO_OBSENSOR_STREAM_CHANNEL_MSMF_HPP
diff --git a/modules/videoio/src/cap_obsensor/obsensor_stream_channel_v4l2.cpp b/modules/videoio/src/cap_obsensor/obsensor_stream_channel_v4l2.cpp
new file mode 100644
index 0000000000..e947be41d7
--- /dev/null
+++ b/modules/videoio/src/cap_obsensor/obsensor_stream_channel_v4l2.cpp
@@ -0,0 +1,379 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+/*
+* Copyright(C) 2022 by ORBBEC Technology., Inc.
+* Authors:
+*   Huang Zhenchang <yufeng@orbbec.com>
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifdef HAVE_OBSENSOR_V4L2
+#include "obsensor_stream_channel_v4l2.hpp"
+
+#include <errno.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <linux/videodev2.h>
+#include <linux/uvcvideo.h>
+#include <linux/usb/video.h>
+#include <fstream>
+#include <map>
+#include <vector>
+
+#include "opencv2/core/utils/filesystem.hpp"
+
+namespace cv {
+namespace obsensor {
+
+#define IOCTL_FAILED_RETURN(x)                                 \
+    if (x < 0)                                                 \
+    {                                                          \
+        CV_LOG_WARNING(NULL, "ioctl error return: " << errno); \
+        return;                                                \
+    }
+
+#define IOCTL_FAILED_LOG(x)                                    \
+    if (x < 0)                                                 \
+    {                                                          \
+        CV_LOG_WARNING(NULL, "ioctl error return: " << errno); \
+    }
+
+#define IOCTL_FAILED_CONTINUE(x)                               \
+    if (x < 0)                                                 \
+    {                                                          \
+        CV_LOG_WARNING(NULL, "ioctl error return: " << errno); \
+        continue;                                              \
+    }
+
+#define IOCTL_FAILED_EXEC(x, statement)                        \
+    if (x < 0)                                                 \
+    {                                                          \
+        CV_LOG_WARNING(NULL, "ioctl error return: " << errno); \
+        statement;                                             \
+    }
+
+int xioctl(int fd, int req, void* arg)
+{
+    int rst;
+    int retry = 5;
+    do
+    {
+        rst = ioctl(fd, req, arg);
+        retry--;
+    } while (rst == -1 && (errno == EAGAIN || (errno == EBUSY && retry > 0)));
+
+    if (rst < 0)
+    {
+        CV_LOG_WARNING(NULL, "ioctl: fd=" << fd << ", req=" << req);
+    }
+    return rst;
+}
+
+V4L2Context& V4L2Context::getInstance()
+{
+    static V4L2Context instance;
+    return instance;
+}
+
+std::vector<UvcDeviceInfo> V4L2Context::queryUvcDeviceInfoList()
+{
+    std::vector<UvcDeviceInfo> uvcDevList;
+    std::map<std::string, UvcDeviceInfo> uvcDevMap;
+    const cv::String videosDir = "/sys/class/video4linux";
+    cv::utils::Paths videos;
+    if (cv::utils::fs::isDirectory(videosDir))
+    {
+        cv::utils::fs::glob(videosDir, "*", videos, false, true);
+        for (const auto& video : videos)
+        {
+            UvcDeviceInfo uvcDev;
+            cv::String videoName = video.substr(video.find_last_of("/") + 1);
+            char buf[PATH_MAX];
+            if (realpath(video.c_str(), buf) == nullptr || cv::String(buf).find("virtual") != std::string::npos)
+            {
+                continue;
+            }
+            cv::String videoRealPath = buf;
+            cv::String interfaceRealPath = videoRealPath.substr(0, videoRealPath.find_last_of("/"));
+
+            std::string busNum, devNum, devPath;
+            while (videoRealPath.find_last_of("/") != std::string::npos)
+            {
+                videoRealPath = videoRealPath.substr(0, videoRealPath.find_last_of("/"));
+                if (!(std::ifstream(videoRealPath + "/busnum") >> busNum))
+                {
+                    continue;
+                }
+                if (!(std::ifstream(videoRealPath + "/devpath") >> devPath))
+                {
+                    continue;
+                }
+                if (!(std::ifstream(videoRealPath + "/devnum") >> devNum))
+                {
+                    continue;
+                }
+                uvcDev.uid = busNum + "-" + devPath + "-" + devNum;
+                break;
+                /* code */
+            }
+
+            uvcDev.id = cv::String("/dev/") + videoName;
+            v4l2_capability caps = {};
+            int videoFd = open(uvcDev.id.c_str(), O_RDONLY);
+            IOCTL_FAILED_EXEC(xioctl(videoFd, VIDIOC_QUERYCAP, &caps), {
+                close(videoFd);
+                continue;
+            });
+            close(videoFd);
+
+            if (caps.capabilities & V4L2_CAP_VIDEO_CAPTURE)
+            {
+                cv::String modalias;
+                if (!(std::ifstream(video + "/device/modalias") >> modalias) ||
+                    modalias.size() < 14 ||
+                    modalias.substr(0, 5) != "usb:v" ||
+                    modalias[9] != 'p')
+                {
+                    continue;
+                }
+                std::istringstream(modalias.substr(5, 4)) >> std::hex >> uvcDev.vid;
+                std::istringstream(modalias.substr(10, 4)) >> std::hex >> uvcDev.pid;
+                std::getline(std::ifstream(video + "/device/interface"), uvcDev.name);
+                std::ifstream(video + "/device/bInterfaceNumber") >> uvcDev.mi;
+                uvcDevMap.insert({ interfaceRealPath, uvcDev });
+            }
+        }
+    }
+    for (const auto& item : uvcDevMap)
+    {
+        const auto uvcDev = item.second; // alias
+        CV_LOG_INFO(NULL, "UVC device found: name=" << uvcDev.name << ", vid=" << uvcDev.vid << ", pid=" << uvcDev.pid << ", mi=" << uvcDev.mi << ", uid=" << uvcDev.uid << ", id=" << uvcDev.id);
+        uvcDevList.push_back(uvcDev);
+    }
+    return uvcDevList;
+}
+
+Ptr<IStreamChannel> V4L2Context::createStreamChannel(const UvcDeviceInfo& devInfo)
+{
+    return makePtr<V4L2StreamChannel>(devInfo);
+}
+
+V4L2StreamChannel::V4L2StreamChannel(const UvcDeviceInfo &devInfo) : IUvcStreamChannel(devInfo),
+                                                                     devFd_(-1),
+                                                                     streamState_(STREAM_STOPED)
+{
+
+    devFd_ = open(devInfo_.id.c_str(), O_RDWR | O_NONBLOCK, 0);
+    if (devFd_ < 0)
+    {
+        CV_LOG_ERROR(NULL, "Open " << devInfo_.id << " failed ! errno=" << errno)
+    }
+    else if (streamType_ == OBSENSOR_STREAM_DEPTH)
+    {
+        initDepthFrameProcessor();
+    }
+
+}
+
+V4L2StreamChannel::~V4L2StreamChannel() noexcept
+{
+    stop();
+    if (devFd_)
+    {
+        close(devFd_);
+        devFd_ = -1;
+    }
+}
+
+void V4L2StreamChannel::start(const StreamProfile& profile, FrameCallback frameCallback)
+{
+    if (streamState_ != STREAM_STOPED)
+    {
+        CV_LOG_ERROR(NULL, devInfo_.id << ": repetitive operation!")
+            return;
+    }
+    frameCallback_ = frameCallback;
+    currentProfile_ = profile;
+
+    struct v4l2_format fmt = {};
+    fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+    fmt.fmt.pix.width = profile.width;
+    fmt.fmt.pix.height = profile.height;
+    fmt.fmt.pix.pixelformat = frameFormatToFourcc(profile.format);
+    IOCTL_FAILED_RETURN(xioctl(devFd_, VIDIOC_S_FMT, &fmt));
+    IOCTL_FAILED_RETURN(xioctl(devFd_, VIDIOC_G_FMT, &fmt));
+
+    struct v4l2_streamparm streamParm = {};
+    streamParm.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+    streamParm.parm.capture.timeperframe.numerator = 1;
+    streamParm.parm.capture.timeperframe.denominator = profile.fps;
+    IOCTL_FAILED_RETURN(xioctl(devFd_, VIDIOC_S_PARM, &streamParm));
+    IOCTL_FAILED_RETURN(xioctl(devFd_, VIDIOC_G_PARM, &streamParm));
+
+    struct v4l2_requestbuffers req = {};
+    req.count = MAX_FRAME_BUFFER_NUM;
+    req.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+    req.memory = V4L2_MEMORY_MMAP;
+    IOCTL_FAILED_RETURN(xioctl(devFd_, VIDIOC_REQBUFS, &req));
+
+    for (uint32_t i = 0; i < req.count && i < MAX_FRAME_BUFFER_NUM; i++)
+    {
+        struct v4l2_buffer buf = {};
+        buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+        buf.memory = V4L2_MEMORY_MMAP;
+        buf.index = i; // only one buffer
+        IOCTL_FAILED_RETURN(xioctl(devFd_, VIDIOC_QUERYBUF, &buf));
+        frameBuffList[i].ptr = (uint8_t*)mmap(NULL, buf.length, PROT_READ | PROT_WRITE, MAP_SHARED, devFd_, buf.m.offset);
+        frameBuffList[i].length = buf.length;
+    }
+
+    // stream on
+    std::unique_lock<std::mutex> lk(streamStateMutex_);
+    streamState_ = STREAM_STARTING;
+    uint32_t type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+    IOCTL_FAILED_EXEC(xioctl(devFd_, VIDIOC_STREAMON, &type), {
+        streamState_ = STREAM_STOPED;
+        for (uint32_t i = 0; i < MAX_FRAME_BUFFER_NUM; i++)
+        {
+            if (frameBuffList[i].ptr)
+            {
+                munmap(frameBuffList[i].ptr, frameBuffList[i].length);
+                frameBuffList[i].ptr = nullptr;
+                frameBuffList[i].length = 0;
+            }
+        }
+        return;
+    });
+    grabFrameThread_ = std::thread(&V4L2StreamChannel::grabFrame, this);
+}
+
+void V4L2StreamChannel::grabFrame()
+{
+    fd_set fds;
+    FD_ZERO(&fds);
+    FD_SET(devFd_, &fds);
+    struct timeval tv = {};
+    tv.tv_sec = 0;
+    tv.tv_usec = 100000; // 100ms
+
+    struct v4l2_buffer buf = {};
+    buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+    buf.memory = V4L2_MEMORY_MMAP;
+    // buf.index = 0;
+
+    IOCTL_FAILED_EXEC(xioctl(devFd_, VIDIOC_QBUF, &buf), {
+        std::unique_lock<std::mutex> lk(streamStateMutex_);
+        streamState_ = STREAM_STOPED;
+        streamStateCv_.notify_all();
+        return;
+    });
+
+    while (streamState_ == STREAM_STARTING || streamState_ == STREAM_STARTED)
+    {
+        IOCTL_FAILED_CONTINUE(select(devFd_ + 1, &fds, NULL, NULL, &tv));
+        IOCTL_FAILED_CONTINUE(xioctl(devFd_, VIDIOC_DQBUF, &buf));
+        if (streamState_ == STREAM_STARTING)
+        {
+            std::unique_lock<std::mutex> lk(streamStateMutex_);
+            streamState_ = STREAM_STARTED;
+            streamStateCv_.notify_all();
+        }
+        Frame fo = { currentProfile_.format, currentProfile_.width, currentProfile_.height, buf.length, frameBuffList[buf.index].ptr };
+        if (depthFrameProcessor_)
+        {
+            depthFrameProcessor_->process(&fo);
+        }
+        frameCallback_(&fo);
+        IOCTL_FAILED_CONTINUE(xioctl(devFd_, VIDIOC_QBUF, &buf));
+    }
+    std::unique_lock<std::mutex> lk(streamStateMutex_);
+    streamState_ = STREAM_STOPED;
+    streamStateCv_.notify_all();
+}
+
+bool V4L2StreamChannel::setXu(uint8_t ctrl, const uint8_t* data, uint32_t len)
+{
+    if (xuSendBuf_.size() < XU_MAX_DATA_LENGTH) {
+        xuSendBuf_.resize(XU_MAX_DATA_LENGTH);
+    }
+    memcpy(xuSendBuf_.data(), data, len);
+    struct uvc_xu_control_query xu_ctrl_query = {
+        .unit = XU_UNIT_ID,
+        .selector = ctrl,
+        .query = UVC_SET_CUR,
+        .size = (__u16)(ctrl == 1 ? 512 : (ctrl == 2 ? 64 : 1024)),
+        .data = xuSendBuf_.data()
+    };
+    if (devFd_ > 0)
+    {
+        IOCTL_FAILED_EXEC(xioctl(devFd_, UVCIOC_CTRL_QUERY, &xu_ctrl_query), { return false; });
+    }
+    return true;
+}
+
+bool V4L2StreamChannel::getXu(uint8_t ctrl, uint8_t** data, uint32_t* len)
+{
+    if (xuRecvBuf_.size() < XU_MAX_DATA_LENGTH) {
+        xuRecvBuf_.resize(XU_MAX_DATA_LENGTH);
+    }
+    struct uvc_xu_control_query xu_ctrl_query = {
+        .unit = XU_UNIT_ID,
+        .selector = ctrl,
+        .query = UVC_GET_CUR,
+        .size = (__u16)(ctrl == 1 ? 512 : (ctrl == 2 ? 64 : 1024)),
+        .data = xuRecvBuf_.data()
+    };
+
+    IOCTL_FAILED_EXEC(xioctl(devFd_, UVCIOC_CTRL_QUERY, &xu_ctrl_query), {
+        *len = 0;
+        return false;
+    });
+
+    *len = xu_ctrl_query.size;
+    *data = xuRecvBuf_.data();
+    return true;
+}
+
+void V4L2StreamChannel::stop()
+{
+    if (streamState_ == STREAM_STARTING || streamState_ == STREAM_STARTED)
+    {
+        streamState_ = STREAM_STOPPING;
+        std::unique_lock<std::mutex> lk(streamStateMutex_);
+        streamStateCv_.wait_for(lk, std::chrono::milliseconds(1000), [&](){
+            return streamState_ == STREAM_STOPED;
+        });
+        uint32_t type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+        IOCTL_FAILED_LOG(xioctl(devFd_, VIDIOC_STREAMOFF, &type));
+    }
+    if (grabFrameThread_.joinable())
+    {
+        grabFrameThread_.join();
+    }
+    for (uint32_t i = 0; i < MAX_FRAME_BUFFER_NUM; i++)
+    {
+        if (frameBuffList[i].ptr)
+        {
+            munmap(frameBuffList[i].ptr, frameBuffList[i].length);
+            frameBuffList[i].ptr = nullptr;
+            frameBuffList[i].length = 0;
+        }
+    }
+}
+}} // namespace cv::obsensor::
+#endif // HAVE_OBSENSOR_V4L2
diff --git a/modules/videoio/src/cap_obsensor/obsensor_stream_channel_v4l2.hpp b/modules/videoio/src/cap_obsensor/obsensor_stream_channel_v4l2.hpp
new file mode 100644
index 0000000000..efb535a42d
--- /dev/null
+++ b/modules/videoio/src/cap_obsensor/obsensor_stream_channel_v4l2.hpp
@@ -0,0 +1,90 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+/*
+* Copyright(C) 2022 by ORBBEC Technology., Inc.
+* Authors:
+*   Huang Zhenchang <yufeng@orbbec.com>
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef OPENCV_VIDEOIO_OBSENSOR_STREAM_CHANNEL_V4L2_HPP
+#define OPENCV_VIDEOIO_OBSENSOR_STREAM_CHANNEL_V4L2_HPP
+#ifdef HAVE_OBSENSOR_V4L2
+
+#include "obsensor_uvc_stream_channel.hpp"
+
+#include <mutex>
+#include <condition_variable>
+#include <thread>
+
+namespace cv {
+namespace obsensor {
+#define MAX_FRAME_BUFFER_NUM 4
+struct V4L2FrameBuffer
+{
+    uint32_t length = 0;
+    uint8_t* ptr = nullptr;
+};
+
+int xioctl(int fd, int req, void* arg);
+
+class V4L2Context
+{
+public:
+    ~V4L2Context() {}
+    static V4L2Context& getInstance();
+
+    std::vector<UvcDeviceInfo> queryUvcDeviceInfoList();
+    Ptr<IStreamChannel> createStreamChannel(const UvcDeviceInfo& devInfo);
+
+private:
+    V4L2Context() noexcept {}
+};
+
+class V4L2StreamChannel : public IUvcStreamChannel
+{
+public:
+    V4L2StreamChannel(const UvcDeviceInfo& devInfo);
+    virtual ~V4L2StreamChannel() noexcept;
+
+    virtual void start(const StreamProfile& profile, FrameCallback frameCallback) override;
+    virtual void stop() override;
+
+private:
+    void grabFrame();
+
+    virtual bool setXu(uint8_t ctrl, const uint8_t* data, uint32_t len) override;
+    virtual bool getXu(uint8_t ctrl, uint8_t** data, uint32_t* len) override;
+
+private:
+    int devFd_;
+
+    V4L2FrameBuffer frameBuffList[MAX_FRAME_BUFFER_NUM];
+
+    StreamState streamState_;
+    std::mutex streamStateMutex_;
+    std::condition_variable streamStateCv_;
+
+    std::thread grabFrameThread_;
+
+    FrameCallback frameCallback_;
+    StreamProfile currentProfile_;
+
+    std::vector<uint8_t> xuRecvBuf_;
+    std::vector<uint8_t> xuSendBuf_;
+};
+}} // namespace cv::obsensor::
+#endif // HAVE_OBSENSOR_V4L2
+#endif // OPENCV_VIDEOIO_OBSENSOR_STREAM_CHANNEL_V4L2_HPP
diff --git a/modules/videoio/src/cap_obsensor/obsensor_uvc_stream_channel.cpp b/modules/videoio/src/cap_obsensor/obsensor_uvc_stream_channel.cpp
new file mode 100644
index 0000000000..ff038f8330
--- /dev/null
+++ b/modules/videoio/src/cap_obsensor/obsensor_uvc_stream_channel.cpp
@@ -0,0 +1,265 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+/*
+* Copyright(C) 2022 by ORBBEC Technology., Inc.
+* Authors:
+*   Huang Zhenchang <yufeng@orbbec.com>
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#if defined(HAVE_OBSENSOR_V4L2) ||  defined(HAVE_OBSENSOR_MSMF)
+
+#include <map>
+#include <vector>
+#include <string>
+#include <algorithm>
+#include <iterator>
+
+#if defined(HAVE_OBSENSOR_V4L2)
+#include "obsensor_stream_channel_v4l2.hpp"
+#elif defined(HAVE_OBSENSOR_MSMF)
+#include "obsensor_stream_channel_msmf.hpp"
+#endif // HAVE_OBSENSOR_V4L2
+
+namespace cv {
+namespace obsensor {
+const uint8_t OB_EXT_CMD0[16] = { 0x47, 0x4d, 0x04, 0x00, 0x02, 0x00, 0x52, 0x00, 0x5B, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00 };
+const uint8_t OB_EXT_CMD1[16] = { 0x47, 0x4d, 0x04, 0x00, 0x02, 0x00, 0x54, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+const uint8_t OB_EXT_CMD2[16] = { 0x47, 0x4d, 0x04, 0x00, 0x02, 0x00, 0x56, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00 };
+const uint8_t OB_EXT_CMD3[16] = { 0x47, 0x4d, 0x04, 0x00, 0x02, 0x00, 0x58, 0x00, 0x2a, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00 };
+const uint8_t OB_EXT_CMD4[16] = { 0x47, 0x4d, 0x02, 0x00, 0x03, 0x00, 0x60, 0x00, 0xed, 0x03, 0x00, 0x00 };
+const uint8_t OB_EXT_CMD5[16] = { 0x47, 0x4d, 0x02, 0x00, 0x03, 0x00, 0x62, 0x00, 0xe9, 0x03, 0x00, 0x00 };
+
+#if defined(HAVE_OBSENSOR_V4L2)
+#define fourCc2Int(a, b, c, d) \
+    ((uint32_t)(a) | ((uint32_t)(b) << 8) | ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24))
+#elif defined(HAVE_OBSENSOR_MSMF)
+#define fourCc2Int(a, b, c, d) \
+    (((uint32_t)(a) <<24) | ((uint32_t)(b) << 16) | ((uint32_t)(c) << 8) | (uint32_t)(d))
+#endif // HAVE_OBSENSOR_V4L2
+
+const std::map<uint32_t, FrameFormat> fourccToOBFormat = {
+    {fourCc2Int('Y', 'U', 'Y', '2'), FRAME_FORMAT_YUYV},
+    {fourCc2Int('M', 'J', 'P', 'G'), FRAME_FORMAT_MJPG},
+    {fourCc2Int('Y', '1', '6', ' '), FRAME_FORMAT_Y16},
+};
+
+StreamType parseUvcDeviceNameToStreamType(const std::string& devName)
+{
+    std::string uvcDevName = devName;
+    for (size_t i = 0; i < uvcDevName.length(); i++)
+    {
+        uvcDevName[i] = (char)tolower(uvcDevName[i]); 
+    }
+    if (uvcDevName.find(" depth") != std::string::npos)
+    {
+        return OBSENSOR_STREAM_DEPTH;
+    }
+    else if (uvcDevName.find(" ir") != std::string::npos)
+    {
+        return OBSENSOR_STREAM_IR;
+    }
+
+    return OBSENSOR_STREAM_COLOR; // else
+}
+
+FrameFormat frameFourccToFormat(uint32_t fourcc)
+{
+    for (const auto& item : fourccToOBFormat)
+    {
+        if (item.first == fourcc)
+        {
+            return item.second;
+        }
+    }
+    return FRAME_FORMAT_UNKNOWN;
+}
+
+uint32_t frameFormatToFourcc(FrameFormat fmt)
+{
+    for (const auto& item : fourccToOBFormat)
+    {
+        if (item.second == fmt)
+        {
+            return item.first;
+        }
+    }
+    return 0;
+}
+
+std::vector<Ptr<IStreamChannel>> getStreamChannelGroup(uint32_t groupIdx)
+{
+    std::vector<Ptr<IStreamChannel>> streamChannelGroup;
+
+#if defined(HAVE_OBSENSOR_V4L2)
+    auto& ctx = V4L2Context::getInstance();
+#elif defined(HAVE_OBSENSOR_MSMF)
+    auto& ctx = MFContext::getInstance();
+#endif // HAVE_OBSENSOR_V4L2
+
+    auto uvcDevInfoList = ctx.queryUvcDeviceInfoList();
+
+    std::map<std::string, std::vector<UvcDeviceInfo>> uvcDevInfoGroupMap;
+
+    auto devInfoIter = uvcDevInfoList.begin();
+    while (devInfoIter != uvcDevInfoList.end())
+    {
+        if (devInfoIter->vid != OBSENSOR_CAM_VID)
+        {
+            devInfoIter = uvcDevInfoList.erase(devInfoIter); // drop it
+            continue;
+        }
+        devInfoIter++;
+    }
+
+    if (!uvcDevInfoList.empty() && uvcDevInfoList.size() <= 3)
+    {
+        uvcDevInfoGroupMap.insert({ "default", uvcDevInfoList });
+    }
+    else {
+        for (auto& devInfo : uvcDevInfoList)
+        {
+            // group by uid
+            uvcDevInfoGroupMap[devInfo.uid].push_back(devInfo); // todo: group by sn
+        }
+    }
+
+    if (uvcDevInfoGroupMap.size() > groupIdx)
+    {
+        auto uvcDevInfoGroupIter = uvcDevInfoGroupMap.begin();
+        std::advance(uvcDevInfoGroupIter, groupIdx);
+        for (const auto& devInfo : uvcDevInfoGroupIter->second)
+        {
+            streamChannelGroup.emplace_back(ctx.createStreamChannel(devInfo));
+        }
+    }
+    else
+    {
+        CV_LOG_ERROR(NULL, "Camera index out of range");
+    }
+    return streamChannelGroup;
+}
+
+DepthFrameProcessor::DepthFrameProcessor(const OBExtensionParam& param) : param_(param)
+{
+    double tempValue = 0;
+    double rstValue = 0;
+    lookUpTable_ = new uint16_t[4096];
+    memset(lookUpTable_, 0, 4096 * 2);
+    for (uint16_t oriValue = 0; oriValue < 4096; oriValue++)
+    {
+        if (oriValue == 0)
+        {
+            continue;
+        }
+        tempValue = 200.375 - (double)oriValue / 8;
+        rstValue = (double)param_.pd / (1 + tempValue * param_.ps / param_.bl) * 10;
+        if ((rstValue >= 40) && (rstValue <= 10000) && rstValue < 65536)
+        {
+            lookUpTable_[oriValue] = (uint16_t)rstValue;
+        }
+    }
+}
+
+DepthFrameProcessor::~DepthFrameProcessor()
+{
+    delete[] lookUpTable_;
+}
+
+void DepthFrameProcessor::process(Frame* frame)
+{
+    uint16_t* data = (uint16_t*)frame->data;
+    for (uint32_t i = 0; i < frame->dataSize / 2; i++)
+    {
+        data[i] = lookUpTable_[data[i] & 0x0fff];
+    }
+}
+
+IUvcStreamChannel::IUvcStreamChannel(const UvcDeviceInfo& devInfo) :
+    devInfo_(devInfo),
+    streamType_(parseUvcDeviceNameToStreamType(devInfo_.name))
+{
+
+}
+
+StreamType IUvcStreamChannel::streamType() const {
+    return streamType_;
+}
+
+bool IUvcStreamChannel::setProperty(int propId, const uint8_t* /*data*/, uint32_t /*dataSize*/)
+{
+    uint8_t* rcvData;
+    uint32_t rcvLen;
+    bool rst = true;
+    switch (propId)
+    {
+    case DEPTH_TO_COLOR_ALIGN:
+        // todo: value filling
+        rst &= setXu(2, OB_EXT_CMD0, sizeof(OB_EXT_CMD0));
+        rst &= getXu(2, &rcvData, &rcvLen);
+        rst &= setXu(2, OB_EXT_CMD1, sizeof(OB_EXT_CMD1));
+        rst &= getXu(2, &rcvData, &rcvLen);
+        rst &= setXu(2, OB_EXT_CMD2, sizeof(OB_EXT_CMD2));
+        rst &= getXu(2, &rcvData, &rcvLen);
+        rst &= setXu(2, OB_EXT_CMD3, sizeof(OB_EXT_CMD3));
+        rst &= getXu(2, &rcvData, &rcvLen);
+        break;
+    default:
+        rst = false;
+        break;
+    }
+    return rst;
+}
+
+bool IUvcStreamChannel::getProperty(int propId, uint8_t* recvData, uint32_t* recvDataSize)
+{
+    bool rst = true;
+    uint8_t* rcvData;
+    uint32_t rcvLen;
+    switch (propId)
+    {
+    case CAMERA_PARAM:
+        rst &= setXu(2, OB_EXT_CMD5, sizeof(OB_EXT_CMD5));
+        rst &= getXu(2, &rcvData, &rcvLen);
+        if (rst && OB_EXT_CMD5[6] == rcvData[6] && rcvData[8] == 0 && rcvData[9] == 0)
+        {
+            memcpy(recvData, rcvData + 10, rcvLen - 10);
+            *recvDataSize = rcvLen - 10;
+        }
+        break;
+    default:
+        rst = false;
+        break;
+    }
+
+    return rst;
+}
+
+bool IUvcStreamChannel::initDepthFrameProcessor()
+{
+    if (streamType_ == OBSENSOR_STREAM_DEPTH && setXu(2, OB_EXT_CMD4, sizeof(OB_EXT_CMD4)))
+    {
+        uint8_t* rcvData;
+        uint32_t rcvLen;
+        if (getXu(1, &rcvData, &rcvLen) && OB_EXT_CMD4[6] == rcvData[6] && rcvData[8] == 0 && rcvData[9] == 0)
+        {
+            depthFrameProcessor_ = makePtr<DepthFrameProcessor>(*(OBExtensionParam*)(rcvData + 10));
+            return true;
+        }
+    }
+    return false;
+}
+}} // namespace cv::obsensor::
+#endif // HAVE_OBSENSOR_V4L2 || HAVE_OBSENSOR_MSMF
diff --git a/modules/videoio/src/cap_obsensor/obsensor_uvc_stream_channel.hpp b/modules/videoio/src/cap_obsensor/obsensor_uvc_stream_channel.hpp
new file mode 100644
index 0000000000..9bc7e1a563
--- /dev/null
+++ b/modules/videoio/src/cap_obsensor/obsensor_uvc_stream_channel.hpp
@@ -0,0 +1,96 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+/*
+* Copyright(C) 2022 by ORBBEC Technology., Inc.
+* Authors:
+*   Huang Zhenchang <yufeng@orbbec.com>
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef OPENCV_VIDEOIO_OBSENSOR_UVC_STREAM_CHANNEL_HPP
+#define OPENCV_VIDEOIO_OBSENSOR_UVC_STREAM_CHANNEL_HPP
+#include "obsensor_stream_channel_interface.hpp"
+
+#ifdef HAVE_OBSENSOR
+namespace cv {
+namespace obsensor {
+
+#define OBSENSOR_CAM_VID 0x2bc5 // usb vid
+#define XU_MAX_DATA_LENGTH 1024
+#define XU_UNIT_ID 4
+
+struct UvcDeviceInfo
+{
+    std::string id = ""; // uvc sub-device id
+    std::string name = "";
+    std::string uid = ""; // parent usb device id
+    uint16_t vid = 0;
+    uint16_t pid = 0;
+    uint16_t mi = 0; // uvc interface index
+};
+
+enum StreamState
+{
+    STREAM_STOPED = 0, // stoped or ready
+    STREAM_STARTING = 1,
+    STREAM_STARTED = 2,
+    STREAM_STOPPING = 3,
+};
+
+StreamType parseUvcDeviceNameToStreamType(const std::string& devName);
+FrameFormat frameFourccToFormat(uint32_t fourcc);
+uint32_t frameFormatToFourcc(FrameFormat);
+
+struct OBExtensionParam {
+    float bl;
+    float bl2;
+    float pd;
+    float ps;
+};
+
+class DepthFrameProcessor {
+public:
+    DepthFrameProcessor(const OBExtensionParam& parma);
+    ~DepthFrameProcessor() noexcept;
+    void process(Frame* frame);
+
+private:
+    const OBExtensionParam param_;
+    uint16_t* lookUpTable_;
+};
+
+class IUvcStreamChannel : public IStreamChannel {
+public:
+    IUvcStreamChannel(const UvcDeviceInfo& devInfo);
+    virtual ~IUvcStreamChannel() noexcept {}
+
+    virtual bool setProperty(int propId, const uint8_t* data, uint32_t dataSize) override;
+    virtual bool getProperty(int propId, uint8_t* recvData, uint32_t* recvDataSize) override;
+    virtual StreamType streamType() const override;
+
+protected:
+    virtual bool setXu(uint8_t ctrl, const uint8_t* data, uint32_t len) = 0;
+    virtual bool getXu(uint8_t ctrl, uint8_t** data, uint32_t* len) = 0;
+
+    bool initDepthFrameProcessor();
+
+protected:
+    const UvcDeviceInfo devInfo_;
+    StreamType streamType_;
+    Ptr<DepthFrameProcessor> depthFrameProcessor_;
+};
+}} // namespace cv::obsensor::
+#endif // HAVE_OBSENSOR
+#endif // OPENCV_VIDEOIO_OBSENSOR_UVC_STREAM_CHANNEL_HPP
diff --git a/modules/videoio/src/cap_obsensor_capture.cpp b/modules/videoio/src/cap_obsensor_capture.cpp
new file mode 100644
index 0000000000..632f75dc6c
--- /dev/null
+++ b/modules/videoio/src/cap_obsensor_capture.cpp
@@ -0,0 +1,150 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+/*
+* Copyright(C) 2022 by ORBBEC Technology., Inc.
+* Authors:
+*   Huang Zhenchang <yufeng@orbbec.com>
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include "precomp.hpp"
+
+#include "cap_obsensor_capture.hpp"
+#include "cap_obsensor/obsensor_stream_channel_interface.hpp"
+#ifdef HAVE_OBSENSOR
+namespace cv {
+Ptr<IVideoCapture> create_obsensor_capture(int index)
+{
+    return makePtr<VideoCapture_obsensor>(index);
+}
+
+VideoCapture_obsensor::VideoCapture_obsensor(int index) : isOpened_(false)
+{
+    static const obsensor::StreamProfile colorProfile = { 640, 480, 30, obsensor::FRAME_FORMAT_MJPG };
+    static const obsensor::StreamProfile depthProfile = {640, 480, 30, obsensor::FRAME_FORMAT_Y16};
+
+    streamChannelGroup_ = obsensor::getStreamChannelGroup(index);
+    if (!streamChannelGroup_.empty())
+    {
+        for (auto& channel : streamChannelGroup_)
+        {
+            auto streamType = channel->streamType();
+            switch (streamType)
+            {
+            case obsensor::OBSENSOR_STREAM_COLOR:
+                channel->start(colorProfile, [&](obsensor::Frame* frame) {
+                    std::unique_lock<std::mutex> lk(frameMutex_);
+                    colorFrame_ = Mat(1, frame->dataSize, CV_8UC1, frame->data).clone();
+                });
+                break;
+            case obsensor::OBSENSOR_STREAM_DEPTH:
+            {
+                uint8_t data = 1;
+                channel->setProperty(obsensor::DEPTH_TO_COLOR_ALIGN, &data, 1);
+                channel->start(depthProfile, [&](obsensor::Frame* frame) {
+                    std::unique_lock<std::mutex> lk(frameMutex_);
+                    depthFrame_ = Mat(frame->height, frame->width, CV_16UC1, frame->data, frame->width * 2).clone();
+                });
+
+                uint32_t len;
+                memset(&camParam_, 0, sizeof(camParam_));
+                channel->getProperty(obsensor::CAMERA_PARAM, (uint8_t*)&camParam_, &len);
+                camParamScale_ = (int)(camParam_.p1[2] * 2 / 640 + 0.5);
+            }
+                break;
+            default:
+                break;
+            }
+        }
+        isOpened_ = true;
+    }
+}
+
+bool VideoCapture_obsensor::grabFrame()
+{
+    std::unique_lock<std::mutex> lk(frameMutex_);
+
+    grabbedDepthFrame_ = depthFrame_;
+    grabbedColorFrame_ = colorFrame_;
+
+    depthFrame_.release();
+    colorFrame_.release();
+
+    return !grabbedDepthFrame_.empty() || !grabbedColorFrame_.empty();
+}
+
+bool VideoCapture_obsensor::retrieveFrame(int outputType, OutputArray frame)
+{
+    std::unique_lock<std::mutex> lk(frameMutex_);
+    switch (outputType)
+    {
+    case CAP_OBSENSOR_DEPTH_MAP:
+        if (!grabbedDepthFrame_.empty())
+        {
+            grabbedDepthFrame_.copyTo(frame);
+            grabbedDepthFrame_.release();
+            return true;
+        }
+        break;
+    case CAP_OBSENSOR_BGR_IMAGE:
+        if (!grabbedColorFrame_.empty())
+        {
+            auto mat = imdecode(grabbedColorFrame_, IMREAD_COLOR);
+            grabbedColorFrame_.release();
+
+            if (!mat.empty())
+            {
+                mat.copyTo(frame);
+                return true;
+            }
+        }
+        break;
+    default:
+        break;
+    }
+
+    return false;
+}
+
+double VideoCapture_obsensor::getProperty(int propIdx) const {
+    double rst = 0.0;
+    propIdx = propIdx & (~CAP_OBSENSOR_GENERATORS_MASK);
+    // int gen = propIdx & CAP_OBSENSOR_GENERATORS_MASK;
+    switch (propIdx)
+    {
+    case CAP_PROP_OBSENSOR_INTRINSIC_FX:
+        rst = camParam_.p1[0] / camParamScale_;
+        break;
+    case CAP_PROP_OBSENSOR_INTRINSIC_FY:
+        rst = camParam_.p1[1] / camParamScale_;
+        break;
+    case CAP_PROP_OBSENSOR_INTRINSIC_CX:
+        rst = camParam_.p1[2] / camParamScale_;
+        break;
+    case CAP_PROP_OBSENSOR_INTRINSIC_CY:
+        rst = camParam_.p1[3] / camParamScale_;
+        break;
+    }
+    return rst;
+}
+
+bool VideoCapture_obsensor::setProperty(int propIdx, double /*propVal*/)
+{
+    CV_LOG_WARNING(NULL, "Unsupported or read only property, id=" << propIdx);
+    return false;
+}
+
+} // namespace cv::
+#endif // HAVE_OBSENSOR
diff --git a/modules/videoio/src/cap_obsensor_capture.hpp b/modules/videoio/src/cap_obsensor_capture.hpp
new file mode 100644
index 0000000000..6256876737
--- /dev/null
+++ b/modules/videoio/src/cap_obsensor_capture.hpp
@@ -0,0 +1,66 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+/*
+* Copyright(C) 2022 by ORBBEC Technology., Inc.
+* Authors:
+*   Huang Zhenchang <yufeng@orbbec.com>
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef OPENCV_VIDEOIO_CAP_OBSENSOR_CAPTURE_HPP
+#define OPENCV_VIDEOIO_CAP_OBSENSOR_CAPTURE_HPP
+
+#include <map>
+#include <mutex>
+
+#include "cap_obsensor/obsensor_stream_channel_interface.hpp"
+
+#ifdef HAVE_OBSENSOR
+namespace cv {
+class VideoCapture_obsensor : public IVideoCapture
+{
+public:
+    VideoCapture_obsensor(int index);
+    virtual ~VideoCapture_obsensor() {}
+
+    virtual double getProperty(int propIdx) const CV_OVERRIDE;
+    virtual bool setProperty(int propIdx, double propVal) CV_OVERRIDE;
+    virtual bool grabFrame() CV_OVERRIDE;
+    virtual bool retrieveFrame(int outputType, OutputArray frame) CV_OVERRIDE;
+    virtual int getCaptureDomain() CV_OVERRIDE {
+        return CAP_OBSENSOR;
+    }
+    virtual bool isOpened() const CV_OVERRIDE {
+        return isOpened_;
+    }
+
+private:
+    bool isOpened_;
+    std::vector<Ptr<obsensor::IStreamChannel>> streamChannelGroup_;
+
+    std::mutex frameMutex_;
+
+    Mat depthFrame_;
+    Mat colorFrame_;
+
+    Mat grabbedDepthFrame_;
+    Mat grabbedColorFrame_;
+
+    obsensor::CameraParam camParam_;
+    int camParamScale_;
+};
+} // namespace cv::
+#endif // HAVE_OBSENSOR
+#endif // OPENCV_VIDEOIO_CAP_OBSENSOR_CAPTURE_HPP
diff --git a/modules/videoio/src/videoio_registry.cpp b/modules/videoio/src/videoio_registry.cpp
index 6cd32edc27..52f4227396 100644
--- a/modules/videoio/src/videoio_registry.cpp
+++ b/modules/videoio/src/videoio_registry.cpp
@@ -53,7 +53,7 @@ namespace {
 /** Ordering guidelines:
 - modern optimized, multi-platform libraries: ffmpeg, gstreamer, Media SDK
 - platform specific universal SDK: WINRT, AVFOUNDATION, MSMF/DSHOW, V4L/V4L2
-- RGB-D: OpenNI/OpenNI2, REALSENSE
+- RGB-D: OpenNI/OpenNI2, REALSENSE, OBSENSOR
 - special OpenCV (file-based): "images", "mjpeg"
 - special camera SDKs, including stereo: other special SDKs: FIREWIRE/1394, XIMEA/ARAVIS/GIGANETIX/PVAPI(GigE)/uEye
 - other: XINE, gphoto2, etc
@@ -171,6 +171,11 @@ static const struct VideoBackendInfo builtin_backends[] =
 #endif
                            0)
 #endif
+
+#ifdef HAVE_OBSENSOR
+    DECLARE_STATIC_BACKEND(CAP_OBSENSOR, "OBSENSOR", MODE_CAPTURE_BY_INDEX, 0, create_obsensor_capture, 0)
+#endif
+
     // dropped backends: MIL, TYZX
 };
 
diff --git a/samples/cpp/videocapture_obsensor.cpp b/samples/cpp/videocapture_obsensor.cpp
new file mode 100644
index 0000000000..6af73de1d1
--- /dev/null
+++ b/samples/cpp/videocapture_obsensor.cpp
@@ -0,0 +1,74 @@
+#include <opencv2/videoio.hpp>
+#include <opencv2/highgui.hpp>
+#include <opencv2/imgproc.hpp>
+#include <iostream>
+
+using namespace cv;
+int main()
+{
+    VideoCapture obsensorCapture(0, CAP_OBSENSOR);
+    if(!obsensorCapture.isOpened()){
+        std::cerr << "Failed to open obsensor capture! Index out of range or no response from device";
+        return -1;
+    }
+
+    double fx = obsensorCapture.get(CAP_PROP_OBSENSOR_INTRINSIC_FX);
+    double fy = obsensorCapture.get(CAP_PROP_OBSENSOR_INTRINSIC_FY);
+    double cx = obsensorCapture.get(CAP_PROP_OBSENSOR_INTRINSIC_CX);
+    double cy = obsensorCapture.get(CAP_PROP_OBSENSOR_INTRINSIC_CY);
+    std::cout << "obsensor camera intrinsic params: fx=" << fx << ", fy=" << fy << ", cx=" << cx << ", cy=" << cy << std::endl;
+   
+    Mat image;
+    Mat depthMap;
+    Mat adjDepthMap;
+    while (true)
+    {
+        // Grab depth map like this:
+        // obsensorCapture >> depthMap;
+
+        // Another way to grab depth map (and bgr image).
+        if (obsensorCapture.grab())
+        {
+            if (obsensorCapture.retrieve(image, CAP_OBSENSOR_BGR_IMAGE))
+            {
+                imshow("RGB", image);
+            }
+
+            if (obsensorCapture.retrieve(depthMap, CAP_OBSENSOR_DEPTH_MAP))
+            {
+                normalize(depthMap, adjDepthMap, 0, 255, NORM_MINMAX, CV_8UC1);
+                applyColorMap(adjDepthMap, adjDepthMap, COLORMAP_JET);
+                imshow("DEPTH", adjDepthMap);
+            }
+
+            // depth map overlay on bgr image
+            static const float alpha = 0.6f;
+            if (!image.empty() && !depthMap.empty())
+            {
+                normalize(depthMap, adjDepthMap, 0, 255, NORM_MINMAX, CV_8UC1);
+                cv::resize(adjDepthMap, adjDepthMap, cv::Size(image.cols, image.rows));
+                for (int i = 0; i < image.rows; i++)
+                {
+                    for (int j = 0; j < image.cols; j++)
+                    {
+                        cv::Vec3b& outRgb = image.at<cv::Vec3b>(i, j);
+                        uint8_t depthValue = 255 - adjDepthMap.at<uint8_t>(i, j);
+                        if (depthValue != 0 && depthValue != 255)
+                        {
+                            outRgb[0] = (uint8_t)(outRgb[0] * (1.0f - alpha) + depthValue * alpha);
+                            outRgb[1] = (uint8_t)(outRgb[1] * (1.0f - alpha) + depthValue *  alpha);
+                            outRgb[2] = (uint8_t)(outRgb[2] * (1.0f - alpha) + depthValue *  alpha);
+                        }
+                    }
+                }
+                imshow("DepthToColor", image);
+            }
+            image.release();
+            depthMap.release();
+        }
+
+        if (pollKey() >= 0)
+            break;
+    }
+    return 0;
+}
\ No newline at end of file

From 2c2466fb25b2b769368563f60db76042c5aadb59 Mon Sep 17 00:00:00 2001
From: Smirnov Egor <s.e.a.98@yandex.ru>
Date: Wed, 22 Jun 2022 15:04:46 +0300
Subject: [PATCH 152/178] Add symmetric circles board

---
 apps/interactive-calibration/calibCommon.hpp  |  2 +-
 .../frameProcessor.cpp                        | 22 +++++++++++++++++++
 .../frameProcessor.hpp                        |  1 +
 apps/interactive-calibration/main.cpp         |  2 +-
 .../parametersController.cpp                  |  6 ++++-
 5 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/apps/interactive-calibration/calibCommon.hpp b/apps/interactive-calibration/calibCommon.hpp
index 617c17dbcf..f94ee4ecf9 100644
--- a/apps/interactive-calibration/calibCommon.hpp
+++ b/apps/interactive-calibration/calibCommon.hpp
@@ -21,7 +21,7 @@ namespace calib
 
     enum InputType { Video, Pictures };
     enum InputVideoSource { Camera, File };
-    enum TemplateType { AcirclesGrid, Chessboard, chAruco, DoubleAcirclesGrid };
+    enum TemplateType { AcirclesGrid, Chessboard, chAruco, DoubleAcirclesGrid, CirclesGrid };
 
     static const std::string mainWindowName = "Calibration";
     static const std::string gridWindowName = "Board locations";
diff --git a/apps/interactive-calibration/frameProcessor.cpp b/apps/interactive-calibration/frameProcessor.cpp
index 2c2a1cd5aa..85514a9136 100644
--- a/apps/interactive-calibration/frameProcessor.cpp
+++ b/apps/interactive-calibration/frameProcessor.cpp
@@ -108,6 +108,16 @@ bool CalibProcessor::detectAndParseChAruco(const cv::Mat &frame)
     return false;
 }
 
+bool CalibProcessor::detectAndParseCircles(const cv::Mat &frame)
+{
+    bool isTemplateFound = findCirclesGrid(frame, mBoardSize, mCurrentImagePoints, cv::CALIB_CB_SYMMETRIC_GRID, mBlobDetectorPtr);
+    if(isTemplateFound) {
+        mTemplateLocations.insert(mTemplateLocations.begin(), mCurrentImagePoints[0]);
+        cv::drawChessboardCorners(frame, mBoardSize, cv::Mat(mCurrentImagePoints), isTemplateFound);
+    }
+    return isTemplateFound;
+}
+
 bool CalibProcessor::detectAndParseACircles(const cv::Mat &frame)
 {
     bool isTemplateFound = findCirclesGrid(frame, mBoardSize, mCurrentImagePoints, cv::CALIB_CB_ASYMMETRIC_GRID, mBlobDetectorPtr);
@@ -160,6 +170,14 @@ void CalibProcessor::saveFrameData()
         mCalibData->allCharucoCorners.push_back(mCurrentCharucoCorners);
         mCalibData->allCharucoIds.push_back(mCurrentCharucoIds);
         break;
+    case CirclesGrid:
+        objectPoints.reserve(mBoardSize.height*mBoardSize.width);
+        for( int i = 0; i < mBoardSize.height; i++ )
+            for( int j = 0; j < mBoardSize.width; j++ )
+                objectPoints.push_back(cv::Point3f(j*mSquareSize, i*mSquareSize, 0));
+        mCalibData->imagePoints.push_back(mCurrentImagePoints);
+        mCalibData->objectPoints.push_back(objectPoints);
+        break;
     case AcirclesGrid:
         objectPoints.reserve(mBoardSize.height*mBoardSize.width);
         for( int i = 0; i < mBoardSize.height; i++ )
@@ -278,6 +296,7 @@ CalibProcessor::CalibProcessor(cv::Ptr<calibrationData> data, captureParameters
                                                         capParams.charucoMarkerSize, mArucoDictionary);
 #endif
         break;
+    case CirclesGrid:
     case AcirclesGrid:
         mBlobDetectorPtr = cv::SimpleBlobDetector::create();
         break;
@@ -308,6 +327,9 @@ cv::Mat CalibProcessor::processFrame(const cv::Mat &frame)
     case chAruco:
         isTemplateFound = detectAndParseChAruco(frameCopy);
         break;
+    case CirclesGrid:
+        isTemplateFound = detectAndParseCircles(frameCopy);
+        break;
     case AcirclesGrid:
         isTemplateFound = detectAndParseACircles(frameCopy);
         break;
diff --git a/apps/interactive-calibration/frameProcessor.hpp b/apps/interactive-calibration/frameProcessor.hpp
index 88e87f7b98..b5418cdecf 100644
--- a/apps/interactive-calibration/frameProcessor.hpp
+++ b/apps/interactive-calibration/frameProcessor.hpp
@@ -54,6 +54,7 @@ protected:
 
     bool detectAndParseChessboard(const cv::Mat& frame);
     bool detectAndParseChAruco(const cv::Mat& frame);
+    bool detectAndParseCircles(const cv::Mat& frame);
     bool detectAndParseACircles(const cv::Mat& frame);
     bool detectAndParseDualACircles(const cv::Mat& frame);
     void saveFrameData();
diff --git a/apps/interactive-calibration/main.cpp b/apps/interactive-calibration/main.cpp
index b5c3642bb6..27e8467d1f 100644
--- a/apps/interactive-calibration/main.cpp
+++ b/apps/interactive-calibration/main.cpp
@@ -30,7 +30,7 @@ const std::string keys  =
         "{v        |         | Input from video file }"
         "{ci       | 0       | Default camera id }"
         "{flip     | false   | Vertical flip of input frames }"
-        "{t        | circles | Template for calibration (circles, chessboard, dualCircles, charuco) }"
+        "{t        | circles | Template for calibration (circles, chessboard, dualCircles, charuco, symcircles) }"
         "{sz       | 16.3    | Distance between two nearest centers of circles or squares on calibration board}"
         "{dst      | 295     | Distance between white and black parts of daulCircles template}"
         "{w        |         | Width of template (in corners or circles)}"
diff --git a/apps/interactive-calibration/parametersController.cpp b/apps/interactive-calibration/parametersController.cpp
index 5659b0e469..6124831465 100644
--- a/apps/interactive-calibration/parametersController.cpp
+++ b/apps/interactive-calibration/parametersController.cpp
@@ -107,7 +107,11 @@ bool calib::parametersController::loadFromParser(cv::CommandLineParser &parser)
 
     std::string templateType = parser.get<std::string>("t");
 
-    if(templateType.find("circles", 0) == 0) {
+    if(templateType.find("symcircles", 0) == 0) {
+        mCapParams.board = CirclesGrid;
+        mCapParams.boardSize = cv::Size(4, 11);
+    }
+    else if(templateType.find("circles", 0) == 0) {
         mCapParams.board = AcirclesGrid;
         mCapParams.boardSize = cv::Size(4, 11);
     }

From 08696d92eab96ba14a16fafa4c5be96de26c1305 Mon Sep 17 00:00:00 2001
From: Smirnov Egor <s.e.a.98@yandex.ru>
Date: Mon, 4 Jul 2022 15:56:28 +0300
Subject: [PATCH 153/178] Add option to force reopen camera in interactive
 calibration tool

---
 apps/interactive-calibration/calibCommon.hpp  |  1 +
 .../interactive-calibration/calibPipeline.cpp | 71 +++++++++++++------
 apps/interactive-calibration/main.cpp         |  1 +
 .../parametersController.cpp                  |  1 +
 4 files changed, 51 insertions(+), 23 deletions(-)

diff --git a/apps/interactive-calibration/calibCommon.hpp b/apps/interactive-calibration/calibCommon.hpp
index f3fffa1729..d47d04eb56 100644
--- a/apps/interactive-calibration/calibCommon.hpp
+++ b/apps/interactive-calibration/calibCommon.hpp
@@ -95,6 +95,7 @@ namespace calib
         int minFramesNum;
         bool saveFrames;
         float zoom;
+        bool forceReopen;
 
         captureParameters()
         {
diff --git a/apps/interactive-calibration/calibPipeline.cpp b/apps/interactive-calibration/calibPipeline.cpp
index 3b0eedffec..ac54764e41 100644
--- a/apps/interactive-calibration/calibPipeline.cpp
+++ b/apps/interactive-calibration/calibPipeline.cpp
@@ -6,6 +6,7 @@
 
 #include <opencv2/highgui.hpp>
 #include <opencv2/imgproc.hpp>
+#include <opencv2/core/utils/logger.hpp>
 
 #include <stdexcept>
 
@@ -30,37 +31,61 @@ CalibPipeline::CalibPipeline(captureParameters params) :
 
 PipelineExitStatus CalibPipeline::start(std::vector<cv::Ptr<FrameProcessor> > processors)
 {
-    if(mCaptureParams.source == Camera && !mCapture.isOpened())
-    {
-        mCapture.open(mCaptureParams.camID);
-        cv::Size maxRes = getCameraResolution();
-        cv::Size neededRes = mCaptureParams.cameraResolution;
-
-        if(maxRes.width < neededRes.width) {
-            double aR = (double)maxRes.width / maxRes.height;
-            mCapture.set(cv::CAP_PROP_FRAME_WIDTH, neededRes.width);
-            mCapture.set(cv::CAP_PROP_FRAME_HEIGHT, neededRes.width/aR);
-        }
-        else if(maxRes.height < neededRes.height) {
-            double aR = (double)maxRes.width / maxRes.height;
-            mCapture.set(cv::CAP_PROP_FRAME_HEIGHT, neededRes.height);
-            mCapture.set(cv::CAP_PROP_FRAME_WIDTH, neededRes.height*aR);
-        }
-        else {
-            mCapture.set(cv::CAP_PROP_FRAME_HEIGHT, neededRes.height);
-            mCapture.set(cv::CAP_PROP_FRAME_WIDTH, neededRes.width);
+    auto open_camera = [this] () {
+        if(mCaptureParams.source == Camera)
+        {
+            mCapture.open(mCaptureParams.camID);
+            cv::Size maxRes = getCameraResolution();
+            cv::Size neededRes = mCaptureParams.cameraResolution;
+
+            if(maxRes.width < neededRes.width) {
+                double aR = (double)maxRes.width / maxRes.height;
+                mCapture.set(cv::CAP_PROP_FRAME_WIDTH, neededRes.width);
+                mCapture.set(cv::CAP_PROP_FRAME_HEIGHT, neededRes.width/aR);
+            }
+            else if(maxRes.height < neededRes.height) {
+                double aR = (double)maxRes.width / maxRes.height;
+                mCapture.set(cv::CAP_PROP_FRAME_HEIGHT, neededRes.height);
+                mCapture.set(cv::CAP_PROP_FRAME_WIDTH, neededRes.height*aR);
+            }
+            else {
+                mCapture.set(cv::CAP_PROP_FRAME_HEIGHT, neededRes.height);
+                mCapture.set(cv::CAP_PROP_FRAME_WIDTH, neededRes.width);
+            }
+            mCapture.set(cv::CAP_PROP_AUTOFOCUS, 0);
         }
-        mCapture.set(cv::CAP_PROP_AUTOFOCUS, 0);
+        else if (mCaptureParams.source == File)
+            mCapture.open(mCaptureParams.videoFileName);
+    };
+
+    if(!mCapture.isOpened()) {
+        open_camera();
     }
-    else if (mCaptureParams.source == File && !mCapture.isOpened())
-        mCapture.open(mCaptureParams.videoFileName);
     mImageSize = cv::Size((int)mCapture.get(cv::CAP_PROP_FRAME_WIDTH), (int)mCapture.get(cv::CAP_PROP_FRAME_HEIGHT));
 
     if(!mCapture.isOpened())
         throw std::runtime_error("Unable to open video source");
 
     cv::Mat frame, processedFrame, resizedFrame;
-    while(mCapture.grab()) {
+    while (true) {
+        if (!mCapture.grab())
+        {
+            if (!mCaptureParams.forceReopen)
+            {
+                CV_LOG_ERROR(NULL, "VideoCapture error: could not grab the frame.");
+                break;
+            }
+
+            CV_LOG_INFO(NULL, "VideoCapture error: trying to reopen...");
+            do
+            {
+                open_camera();
+            } while (!mCapture.isOpened() || !mCapture.grab());
+
+            CV_LOG_INFO(NULL, "VideoCapture error: reopened successfully.");
+            auto newSize = cv::Size((int)mCapture.get(cv::CAP_PROP_FRAME_WIDTH), (int)mCapture.get(cv::CAP_PROP_FRAME_HEIGHT));
+            CV_CheckEQ(mImageSize, newSize, "Camera image size changed after reopening.");
+        }
         mCapture.retrieve(frame);
         if(mCaptureParams.flipVertical)
             cv::flip(frame, frame, -1);
diff --git a/apps/interactive-calibration/main.cpp b/apps/interactive-calibration/main.cpp
index c2a6aa7298..d8b31db18a 100644
--- a/apps/interactive-calibration/main.cpp
+++ b/apps/interactive-calibration/main.cpp
@@ -42,6 +42,7 @@ const std::string keys  =
         "{pf       | defaultConfig.xml| Advanced application parameters}"
         "{save_frames | false   | Save frames that contribute to final calibration}"
         "{zoom     | 1       | Zoom factor applied to the image}"
+        "{force_reopen | false   | Forcefully reopen camera in case of errors}"
         "{help     |         | Print help}";
 
 bool calib::showOverlayMessage(const std::string& message)
diff --git a/apps/interactive-calibration/parametersController.cpp b/apps/interactive-calibration/parametersController.cpp
index b5a01b1ea8..232a16c748 100644
--- a/apps/interactive-calibration/parametersController.cpp
+++ b/apps/interactive-calibration/parametersController.cpp
@@ -91,6 +91,7 @@ bool calib::parametersController::loadFromParser(cv::CommandLineParser &parser)
     mCapParams.templDst = parser.get<float>("dst");
     mCapParams.saveFrames = parser.has("save_frames");
     mCapParams.zoom = parser.get<float>("zoom");
+    mCapParams.forceReopen = parser.get<bool>("force_reopen");
 
     if(!checkAssertion(mCapParams.squareSize > 0, "Distance between corners or circles must be positive"))
         return false;

From 5bf3991f55ae264d53ce623e2b4763cc315145e3 Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <andrey.senyaev@gmail.com>
Date: Wed, 27 Jul 2022 10:19:34 +0300
Subject: [PATCH 154/178] Workflow on Ubuntu 14.04 for 3.4 branch

---
 .github/workflows/PR-3.4.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/PR-3.4.yaml b/.github/workflows/PR-3.4.yaml
index 9658e4ef86..03267d987c 100644
--- a/.github/workflows/PR-3.4.yaml
+++ b/.github/workflows/PR-3.4.yaml
@@ -9,6 +9,9 @@ jobs:
   Ubuntu2004-ARM64:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-3.4-ARM64.yaml@main
 
+  Ubuntu1404-x64:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-3.4-U14.yaml@main
+
   Ubuntu2004-x64:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-3.4-U20.yaml@main
 

From a89868928b5fcc1a7b70b8036f726434e49777af Mon Sep 17 00:00:00 2001
From: Mateusz Tabaka <mateusz.tabaka@intel.com>
Date: Wed, 27 Jul 2022 11:21:50 +0000
Subject: [PATCH 155/178] Fix compilation error on CentOS 7 (gcc 4.8)

libstdc++ that comes with gcc 4.8 doesn't
define `getline(basic_istream<char>&&, std::string&)`
even if it's part of the c++11 standard.
However we can still use the following:
`getline(basic_istream<char>&, std::string&)`.
---
 .../videoio/src/cap_obsensor/obsensor_stream_channel_v4l2.cpp  | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/videoio/src/cap_obsensor/obsensor_stream_channel_v4l2.cpp b/modules/videoio/src/cap_obsensor/obsensor_stream_channel_v4l2.cpp
index e947be41d7..14e9086922 100644
--- a/modules/videoio/src/cap_obsensor/obsensor_stream_channel_v4l2.cpp
+++ b/modules/videoio/src/cap_obsensor/obsensor_stream_channel_v4l2.cpp
@@ -153,7 +153,8 @@ std::vector<UvcDeviceInfo> V4L2Context::queryUvcDeviceInfoList()
                 }
                 std::istringstream(modalias.substr(5, 4)) >> std::hex >> uvcDev.vid;
                 std::istringstream(modalias.substr(10, 4)) >> std::hex >> uvcDev.pid;
-                std::getline(std::ifstream(video + "/device/interface"), uvcDev.name);
+                std::ifstream iface(video + "/device/interface");
+                std::getline(iface, uvcDev.name);
                 std::ifstream(video + "/device/bInterfaceNumber") >> uvcDev.mi;
                 uvcDevMap.insert({ interfaceRealPath, uvcDev });
             }

From 3c5377ca1b8bcc8ad395080a1e1ad52800347bb0 Mon Sep 17 00:00:00 2001
From: Zihao Mu <zihaomu@outlook.com>
Date: Thu, 28 Jul 2022 11:21:29 +0800
Subject: [PATCH 156/178] add another Mish graph simplifier.

---
 .../dnn/src/onnx/onnx_graph_simplifier.cpp    | 34 +++++++++++++++++++
 modules/dnn/test/test_onnx_importer.cpp       |  1 +
 2 files changed, 35 insertions(+)

diff --git a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
index c6e54d6a92..5aad1c135c 100644
--- a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
+++ b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
@@ -531,6 +531,38 @@ public:
     }
 };
 
+class MishSubgraph2 : public Subgraph
+{
+public:
+    MishSubgraph2()
+    {
+        int input = addNodeToMatch("");
+        int exp = addNodeToMatch("Exp", input);
+        int addVal = addNodeToMatch("");
+        int add = addNodeToMatch("Add", addVal, exp);
+        int log = addNodeToMatch("Log", add);
+        int tanh = addNodeToMatch("Tanh", log);
+        addNodeToMatch("Mul", input, tanh);
+        setFusedNode("Mish", input);
+    }
+};
+
+class MishSubgraph3 : public Subgraph
+{
+public:
+    MishSubgraph3()
+    {
+        int input = addNodeToMatch("");
+        int exp = addNodeToMatch("Exp", input);
+        int addVal = addNodeToMatch("");
+        int add = addNodeToMatch("Add", exp, addVal);
+        int log = addNodeToMatch("Log", add);
+        int tanh = addNodeToMatch("Tanh", log);
+        addNodeToMatch("Mul", input, tanh);
+        setFusedNode("Mish", input);
+    }
+};
+
 class MulCastSubgraph : public Subgraph
 {
 public:
@@ -735,6 +767,8 @@ void simplifySubgraphs(opencv_onnx::GraphProto& net)
     subgraphs.push_back(makePtr<BatchNormalizationSubgraph2>());
     subgraphs.push_back(makePtr<ExpandSubgraph>());
     subgraphs.push_back(makePtr<MishSubgraph>());
+    subgraphs.push_back(makePtr<MishSubgraph2>());
+    subgraphs.push_back(makePtr<MishSubgraph3>());
     subgraphs.push_back(makePtr<NormalizeSubgraph4>());
     subgraphs.push_back(makePtr<NormalizeSubgraph5>());
 
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index 578e0442b2..39c635a095 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -1325,6 +1325,7 @@ TEST_P(Test_ONNX_layers, ResizeOpset11_Torch1_6)
 TEST_P(Test_ONNX_layers, Mish)
 {
     testONNXModels("mish");
+    testONNXModels("mish_no_softplus");
 }
 
 TEST_P(Test_ONNX_layers, CalculatePads)

From 57545653b1fa06f5fb1803915b9b1b148e720acf Mon Sep 17 00:00:00 2001
From: Zihao Mu <zihaomu@outlook.com>
Date: Thu, 28 Jul 2022 13:19:06 +0800
Subject: [PATCH 157/178] replace new mish impl with softplus

---
 .../dnn/src/onnx/onnx_graph_simplifier.cpp    | 25 ++++++++-----------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
index 5aad1c135c..091d2d4ae9 100644
--- a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
+++ b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
@@ -531,35 +531,32 @@ public:
     }
 };
 
-class MishSubgraph2 : public Subgraph
+// softplus(x) = log(exp(x) + 1)
+class SoftplusSubgraph: public Subgraph
 {
 public:
-    MishSubgraph2()
+    SoftplusSubgraph()
     {
         int input = addNodeToMatch("");
         int exp = addNodeToMatch("Exp", input);
         int addVal = addNodeToMatch("");
         int add = addNodeToMatch("Add", addVal, exp);
-        int log = addNodeToMatch("Log", add);
-        int tanh = addNodeToMatch("Tanh", log);
-        addNodeToMatch("Mul", input, tanh);
-        setFusedNode("Mish", input);
+        addNodeToMatch("Log", add);
+        setFusedNode("Softplus", input);
     }
 };
 
-class MishSubgraph3 : public Subgraph
+class SoftplusSubgraph2: public Subgraph
 {
 public:
-    MishSubgraph3()
+    SoftplusSubgraph2()
     {
         int input = addNodeToMatch("");
         int exp = addNodeToMatch("Exp", input);
         int addVal = addNodeToMatch("");
         int add = addNodeToMatch("Add", exp, addVal);
-        int log = addNodeToMatch("Log", add);
-        int tanh = addNodeToMatch("Tanh", log);
-        addNodeToMatch("Mul", input, tanh);
-        setFusedNode("Mish", input);
+        addNodeToMatch("Log", add);
+        setFusedNode("Softplus", input);
     }
 };
 
@@ -766,9 +763,9 @@ void simplifySubgraphs(opencv_onnx::GraphProto& net)
     subgraphs.push_back(makePtr<BatchNormalizationSubgraph1>());
     subgraphs.push_back(makePtr<BatchNormalizationSubgraph2>());
     subgraphs.push_back(makePtr<ExpandSubgraph>());
+    subgraphs.push_back(makePtr<SoftplusSubgraph>());
+    subgraphs.push_back(makePtr<SoftplusSubgraph2>());
     subgraphs.push_back(makePtr<MishSubgraph>());
-    subgraphs.push_back(makePtr<MishSubgraph2>());
-    subgraphs.push_back(makePtr<MishSubgraph3>());
     subgraphs.push_back(makePtr<NormalizeSubgraph4>());
     subgraphs.push_back(makePtr<NormalizeSubgraph5>());
 

From 759cbd74862775822783d0846d0e3555c8153e21 Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <andrey.senyaev@gmail.com>
Date: Thu, 28 Jul 2022 17:46:01 +0300
Subject: [PATCH 158/178] Docs workflow in GHA for 4.x

---
 .github/workflows/PR-4.x.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/PR-4.x.yaml b/.github/workflows/PR-4.x.yaml
index cf4f8f6bc8..2142e0fb6c 100644
--- a/.github/workflows/PR-4.x.yaml
+++ b/.github/workflows/PR-4.x.yaml
@@ -29,3 +29,6 @@ jobs:
 
   TIM-VX:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-timvx-backend-tests-4.x.yml@main
+
+  docs:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-docs.yaml@main

From 500e24d6cdaeca6a165ccb62e9be032970b29988 Mon Sep 17 00:00:00 2001
From: Giles Payne <gilespayne@gmail.com>
Date: Sun, 31 Jul 2022 21:04:14 +0900
Subject: [PATCH 159/178] Fix bug ObjC/Swift Quicklook function; check only one
 channel when drawing Mat as matrix

---
 modules/imgcodecs/misc/objc/ios/Mat+QuickLook.mm    | 2 +-
 modules/imgcodecs/misc/objc/macosx/Mat+QuickLook.mm | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/imgcodecs/misc/objc/ios/Mat+QuickLook.mm b/modules/imgcodecs/misc/objc/ios/Mat+QuickLook.mm
index 84cf25879b..f68a9e154e 100644
--- a/modules/imgcodecs/misc/objc/ios/Mat+QuickLook.mm
+++ b/modules/imgcodecs/misc/objc/ios/Mat+QuickLook.mm
@@ -62,7 +62,7 @@ typedef UIFont* (*FontGetter)();
 }
 
 - (id)debugQuickLookObject {
-    if ([self dims] == 2 && [self rows] <= 10 && [self cols] <= 10) {
+    if ([self dims] == 2 && [self rows] <= 10 && [self cols] <= 10 && [self channels] == 1) {
         FontGetter fontGetters[] = { getCMU, getBodoni72, getAnySerif, getSystemFont };
         UIFont* font = nil;
         for (int fontGetterIndex = 0; font==nil && fontGetterIndex < (sizeof(fontGetters)) / (sizeof(fontGetters[0])); fontGetterIndex++) {
diff --git a/modules/imgcodecs/misc/objc/macosx/Mat+QuickLook.mm b/modules/imgcodecs/misc/objc/macosx/Mat+QuickLook.mm
index bdc5eda811..21854d67d9 100644
--- a/modules/imgcodecs/misc/objc/macosx/Mat+QuickLook.mm
+++ b/modules/imgcodecs/misc/objc/macosx/Mat+QuickLook.mm
@@ -58,7 +58,7 @@ typedef NSFont* (*FontGetter)();
 
 - (id)debugQuickLookObject {
     // for smallish Mat objects display as a matrix
-    if ([self dims] == 2 && [self rows] <= 10 && [self cols] <= 10) {
+    if ([self dims] == 2 && [self rows] <= 10 && [self cols] <= 10 && [self channels] == 1) {
         FontGetter fontGetters[] = { getCMU, getBodoni72, getAnySerif, getSystemFont };
         NSFont* font = nil;
         for (int fontGetterIndex = 0; font==nil && fontGetterIndex < (sizeof(fontGetters)) / (sizeof(fontGetters[0])); fontGetterIndex++) {

From e7a787aa419225d571e1ac72c071dcf8b9dc1c37 Mon Sep 17 00:00:00 2001
From: "gbowser3@gmail.com" <gbowser3@gmail.com>
Date: Sun, 31 Jul 2022 19:22:10 -0500
Subject: [PATCH 160/178] Imgproc: Fix doxygen typo

---
 modules/imgproc/include/opencv2/imgproc.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index 9ab8999e30..c04b5b67a9 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -2199,7 +2199,7 @@ case of multi-channel images, each channel is processed independently.
 @param src input image; the number of channels can be arbitrary, but the depth should be one of
 CV_8U, CV_16U, CV_16S, CV_32F or CV_64F.
 @param dst output image of the same size and type as src.
-@param kernel structuring element used for dilation; if elemenat=Mat(), a 3 x 3 rectangular
+@param kernel structuring element used for dilation; if element=Mat(), a 3 x 3 rectangular
 structuring element is used. Kernel can be created using #getStructuringElement
 @param anchor position of the anchor within the element; default value (-1, -1) means that the
 anchor is at the element center.

From e951edeed38c55f874d0b87a5e164c3f9cfeddf5 Mon Sep 17 00:00:00 2001
From: hzcyf <1591563365@qq.com>
Date: Mon, 1 Aug 2022 14:38:22 +0800
Subject: [PATCH 161/178] fix linkage issue when enable BUILD_opencv_world
 (#22312)

---
 .../videoio/src/cap_obsensor/obsensor_stream_channel_msmf.cpp    | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/videoio/src/cap_obsensor/obsensor_stream_channel_msmf.cpp b/modules/videoio/src/cap_obsensor/obsensor_stream_channel_msmf.cpp
index b374424050..b9b1f05a47 100644
--- a/modules/videoio/src/cap_obsensor/obsensor_stream_channel_msmf.cpp
+++ b/modules/videoio/src/cap_obsensor/obsensor_stream_channel_msmf.cpp
@@ -33,6 +33,7 @@
 #pragma comment(lib, "Strmiids")
 #pragma comment(lib, "Mfreadwrite")
 #pragma comment(lib, "dxgi")
+#pragma comment(lib, "Shlwapi")
 
 namespace cv {
 namespace obsensor {

From 4c9364a8036edfaaa87f18e389e50037739730ce Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <andrey.senyaev@gmail.com>
Date: Mon, 1 Aug 2022 10:04:57 +0300
Subject: [PATCH 162/178] Docs workflow in GHA for 3.4

---
 .github/workflows/PR-3.4.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/PR-3.4.yaml b/.github/workflows/PR-3.4.yaml
index 9658e4ef86..26201fb6d6 100644
--- a/.github/workflows/PR-3.4.yaml
+++ b/.github/workflows/PR-3.4.yaml
@@ -26,3 +26,6 @@ jobs:
 
   Android:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-3.4-Android.yaml@main
+
+  docs:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-3.4-docs.yaml@main

From d4640f464797d8b1abafc4d0c657534940db3453 Mon Sep 17 00:00:00 2001
From: Zihao Mu <zihaomu@outlook.com>
Date: Tue, 2 Aug 2022 10:32:31 +0800
Subject: [PATCH 163/178] support ReduceLayer without reshape layer.

---
 .../dnn/include/opencv2/dnn/all_layers.hpp    |  3 +-
 modules/dnn/src/int8layers/reduce_layer.cpp   | 29 ++++++++++++++++---
 modules/dnn/src/layers/reduce_layer.cpp       | 28 +++++++++++++++---
 modules/dnn/src/onnx/onnx_importer.cpp        | 28 +++---------------
 modules/dnn/test/test_onnx_importer.cpp       |  1 -
 5 files changed, 55 insertions(+), 34 deletions(-)

diff --git a/modules/dnn/include/opencv2/dnn/all_layers.hpp b/modules/dnn/include/opencv2/dnn/all_layers.hpp
index 5c86da2be4..26fa66ebe5 100644
--- a/modules/dnn/include/opencv2/dnn/all_layers.hpp
+++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp
@@ -334,7 +334,8 @@ CV__DNN_INLINE_NS_BEGIN
     {
     public:
         int reduceType;
-        std::vector<size_t> reduceDims;
+        // reduceDims contains the dimensions that need to be reduced, targetDims is the target output dimension.
+        std::vector<size_t> reduceDims, targetDims;
         static Ptr<ReduceLayer> create(const LayerParams& params);
     };
 
diff --git a/modules/dnn/src/int8layers/reduce_layer.cpp b/modules/dnn/src/int8layers/reduce_layer.cpp
index 935bdc0659..9ffb4897a0 100644
--- a/modules/dnn/src/int8layers/reduce_layer.cpp
+++ b/modules/dnn/src/int8layers/reduce_layer.cpp
@@ -38,6 +38,15 @@ public:
         {
             reduceDims[i] = tempDims.get<int>(i);
         }
+
+        CV_Assert(params.has("target_dims"));
+        tempDims = params.get("target_dims");
+        n = tempDims.size();
+        targetDims.resize(n);
+        for (i = 0; i < n; i++)
+        {
+            targetDims[i] = tempDims.get<int>(i);
+        }
     }
 
     virtual bool supportBackend(int backendId) CV_OVERRIDE
@@ -161,18 +170,30 @@ public:
                          std::vector<MatShape> &internals) const CV_OVERRIDE
     {
         CV_Assert(inputs.size() > 0);
-        CV_Assert(reduceDims.size() != 0 && inputs[0].size() >= reduceDims.size());
+        CV_Assert( reduceDims.size() !=0 && targetDims.size() != 0 && inputs[0].size() >= reduceDims.size());
 
-        std::vector<int> outShape;
+        // outShapeTmp can save the right number of `total(outShapeTmp)`. And the outShape is used as the final output shape.
+        std::vector<int> outShapeTmp, outShape;
+        outShape.assign(targetDims.begin(), targetDims.end());
         if (inputs[0].size() == reduceDims.size())
-            outShape.push_back(1);
+            outShapeTmp.push_back(1);
         else
         {
             for (int i = 0; i < inputs[0].size() - reduceDims.size(); i++)
             {
-                outShape.push_back(inputs[0][i]);
+                outShapeTmp.push_back(inputs[0][i]);
             }
         }
+
+        // Support dynamic shape of Batch size.
+        // Note that: when there are multiple dynamic inputs, we will give an error.
+        if (total(outShape) != total(outShapeTmp))
+        {
+            if (outShape[0] != outShapeTmp[0])
+                outShape[0] = outShapeTmp[0];
+        }
+
+        CV_Assert(total(outShape) == total(outShapeTmp));
         outputs.assign(1, outShape);
 
         return false;
diff --git a/modules/dnn/src/layers/reduce_layer.cpp b/modules/dnn/src/layers/reduce_layer.cpp
index 47aec237c7..c1f74f1cc1 100644
--- a/modules/dnn/src/layers/reduce_layer.cpp
+++ b/modules/dnn/src/layers/reduce_layer.cpp
@@ -61,6 +61,15 @@ public:
         {
             reduceDims[i] = tempDims.get<int>(i);
         }
+
+        CV_Assert(params.has("target_dims"));
+        tempDims = params.get("target_dims");
+        n = tempDims.size();
+        targetDims.resize(n);
+        for (i = 0; i < n; i++)
+        {
+            targetDims[i] = tempDims.get<int>(i);
+        }
     }
 
     virtual bool supportBackend(int backendId) CV_OVERRIDE
@@ -325,18 +334,29 @@ public:
                          std::vector<MatShape> &internals) const CV_OVERRIDE
     {
         CV_Assert(inputs.size() > 0);
-        CV_Assert(reduceDims.size() != 0 && inputs[0].size() >= reduceDims.size());
+        CV_Assert( reduceDims.size() !=0 && targetDims.size() != 0 && inputs[0].size() >= reduceDims.size());
 
-        std::vector<int> outShape;
+        // outShapeTmp can save the right number of `total(outShapeTmp)`. And the outShape is used as the final output shape.
+        std::vector<int> outShapeTmp, outShape;
+        outShape.assign(targetDims.begin(), targetDims.end());
         if (inputs[0].size() == reduceDims.size())
-            outShape.push_back(1);
+            outShapeTmp.push_back(1);
         else
         {
             for (int i = 0; i < inputs[0].size() - reduceDims.size(); i++)
             {
-                outShape.push_back(inputs[0][i]);
+                outShapeTmp.push_back(inputs[0][i]);
             }
         }
+
+        // Support dynamic shape of Batch size.
+        // Note that: when there are multiple dynamic inputs, we will give an error.
+        if (total(outShape) != total(outShapeTmp) && outShape[0] != outShapeTmp[0])
+        {
+                outShape[0] = outShapeTmp[0];
+        }
+
+        CV_Assert(total(outShape) == total(outShapeTmp));
         outputs.assign(1, outShape);
 
         return false;
diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index e90581eeb5..25d4ed94a1 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -1191,7 +1191,7 @@ void ONNXImporter::parseReduce(LayerParams& layerParams, const opencv_onnx::Node
             int axesNum = axesMat.total();
             for (int i = 0; i < axesNum; i++)
             {
-                int axis = normalize_axis(static_cast<int>(axesMat.at<float>(i)), inpShape.size());
+                int axis = normalize_axis(axesMat.at<int>(i), inpShape.size());
                 shouldDelete[axis] = true;
             }
         }
@@ -1220,7 +1220,7 @@ void ONNXImporter::parseReduce(LayerParams& layerParams, const opencv_onnx::Node
         }
     }
 
-    MatShape targetShape;
+    std::vector<int> targetShape;
     for (int i = 0; i < inpShape.size(); ++i)
     {
         if (!shouldDelete[i])
@@ -1290,30 +1290,10 @@ void ONNXImporter::parseReduce(LayerParams& layerParams, const opencv_onnx::Node
         }
     }
 
-    LayerParams reduceLp = layerParams;
-    reduceLp.name = layerParams.name + "/reduce";
-    CV_Assert(layer_id.find(reduceLp.name) == layer_id.end());
-    reduceLp.set("deleted_dims", DictValue::arrayInt(&deletedDims[0], deletedDims.size()));
+    layerParams.set("deleted_dims", DictValue::arrayInt(&deletedDims[0], deletedDims.size()));
+    layerParams.set("target_dims", DictValue::arrayInt(&targetShape[0], targetShape.size()));
 
     node_proto.set_input(0, inputString);
-    node_proto.set_output(0, reduceLp.name);
-    addLayer(reduceLp, node_proto);
-
-    layerParams.type = (depth == CV_8S) ? "ReshapeInt8" : "Reshape";
-    layerParams.set("dim", DictValue::arrayInt(&targetShape[0], targetShape.size()));
-
-    // Set batchsize dim as dynamic to be compatible with batch size >= 2.
-    if (targetShape.size() > 1)
-    {
-        std::vector<int> dynamicAxes = {0};  // The index of batchsize dim is 0.
-        std::vector<int> inputIndices = {0};
-
-        layerParams.set("has_dynamic_shapes", true);
-        layerParams.set("dynamic_axes", DictValue::arrayInt(dynamicAxes.data(), dynamicAxes.size()));
-        layerParams.set("input_indices", DictValue::arrayInt(inputIndices.data(), inputIndices.size()));
-    }
-
-    node_proto.set_input(0, node_proto.output(0));
     node_proto.set_output(0, output_name);
 
     addLayer(layerParams, node_proto);
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index 5f94f9884d..6a0de29c38 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -411,7 +411,6 @@ TEST_P(Test_ONNX_layers, ReduceMean)
 TEST_P(Test_ONNX_layers, ReduceSum)
 {
     testONNXModels("reduce_sum");
-    testONNXModels("reduce_sum_axis");
     testONNXModels("reduce_sum_axis_dynamic_batch");
 }
 

From 0614c40b42c965b930a08bb04b3636c794729f8f Mon Sep 17 00:00:00 2001
From: Zihao Mu <zihaomu@outlook.com>
Date: Tue, 2 Aug 2022 14:58:05 +0800
Subject: [PATCH 164/178] add more skip for very long test case in test_dnn.

---
 modules/dnn/test/test_darknet_importer.cpp | 23 ++++++++++++++++++----
 modules/dnn/test/test_int8_layers.cpp      | 18 +++++++++++++----
 modules/dnn/test/test_onnx_importer.cpp    |  1 +
 3 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/modules/dnn/test/test_darknet_importer.cpp b/modules/dnn/test/test_darknet_importer.cpp
index 75942b4f10..f0765c955d 100644
--- a/modules/dnn/test/test_darknet_importer.cpp
+++ b/modules/dnn/test/test_darknet_importer.cpp
@@ -67,7 +67,10 @@ TEST(Test_Darknet, read_yolo_voc)
 
 TEST(Test_Darknet, read_yolo_voc_stream)
 {
-    applyTestTag(CV_TEST_TAG_MEMORY_1GB);
+    applyTestTag(
+            CV_TEST_TAG_MEMORY_1GB,
+            CV_TEST_TAG_DEBUG_VERYLONG
+            );
     Mat ref;
     Mat sample = imread(_tf("dog416.png"));
     Mat inp = blobFromImage(sample, 1.0/255, Size(416, 416), Scalar(), true, false);
@@ -594,7 +597,11 @@ INSTANTIATE_TEST_CASE_P(/**/, Test_Darknet_nets_async, Combine(
 
 TEST_P(Test_Darknet_nets, YOLOv3)
 {
-    applyTestTag(CV_TEST_TAG_LONG, (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB));
+    applyTestTag(
+            CV_TEST_TAG_LONG,
+            (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB),
+            CV_TEST_TAG_DEBUG_VERYLONG
+    );
 
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000)  // nGraph compilation failure
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
@@ -673,7 +680,11 @@ TEST_P(Test_Darknet_nets, YOLOv3)
 
 TEST_P(Test_Darknet_nets, YOLOv4)
 {
-    applyTestTag(CV_TEST_TAG_LONG, (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB));
+    applyTestTag(
+            CV_TEST_TAG_LONG,
+            (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB),
+            CV_TEST_TAG_DEBUG_VERYLONG
+            );
 
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000)  // nGraph compilation failure
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
@@ -864,7 +875,11 @@ TEST_P(Test_Darknet_nets, YOLOv4_tiny)
 
 TEST_P(Test_Darknet_nets, YOLOv4x_mish)
 {
-    applyTestTag(CV_TEST_TAG_LONG, (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB));
+    applyTestTag(
+            CV_TEST_TAG_LONG,
+            (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB),
+            CV_TEST_TAG_DEBUG_VERYLONG
+            );
 
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2021040000)
     // IE exception: Ngraph operation Transpose with name permute_168 has dynamic output shape on 0 port, but CPU plug-in supports only static shape
diff --git a/modules/dnn/test/test_int8_layers.cpp b/modules/dnn/test/test_int8_layers.cpp
index ab00bfba76..54db5a39ea 100644
--- a/modules/dnn/test/test_int8_layers.cpp
+++ b/modules/dnn/test/test_int8_layers.cpp
@@ -974,6 +974,7 @@ TEST_P(Test_Int8_nets, opencv_face_detector)
 
 TEST_P(Test_Int8_nets, EfficientDet)
 {
+    applyTestTag(CV_TEST_TAG_DEBUG_VERYLONG);
     if (target == DNN_TARGET_OPENCL_FP16 && !ocl::Device::getDefault().isIntel())
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
     if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
@@ -1095,7 +1096,7 @@ TEST_P(Test_Int8_nets, FasterRCNN_zf)
 #else
         (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB),
 #endif
-        CV_TEST_TAG_DEBUG_LONG
+        CV_TEST_TAG_DEBUG_VERYLONG
     );
 
     if (target == DNN_TARGET_OPENCL_FP16 && !ocl::Device::getDefault().isIntel())
@@ -1145,7 +1146,8 @@ TEST_P(Test_Int8_nets, YoloVoc)
 #else
         CV_TEST_TAG_MEMORY_1GB,
 #endif
-        CV_TEST_TAG_LONG
+        CV_TEST_TAG_LONG,
+        CV_TEST_TAG_DEBUG_VERYLONG
     );
 
     if (target == DNN_TARGET_OPENCL_FP16 && !ocl::Device::getDefault().isIntel())
@@ -1215,7 +1217,11 @@ TEST_P(Test_Int8_nets, TinyYoloVoc)
 
 TEST_P(Test_Int8_nets, YOLOv3)
 {
-    applyTestTag(CV_TEST_TAG_LONG, (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB));
+    applyTestTag(
+            CV_TEST_TAG_LONG,
+            (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB),
+            CV_TEST_TAG_DEBUG_VERYLONG
+            );
 
     if (target == DNN_TARGET_OPENCL_FP16 && !ocl::Device::getDefault().isIntel())
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
@@ -1255,7 +1261,11 @@ TEST_P(Test_Int8_nets, YOLOv3)
 
 TEST_P(Test_Int8_nets, YOLOv4)
 {
-    applyTestTag(CV_TEST_TAG_LONG, (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB));
+    applyTestTag(
+            CV_TEST_TAG_LONG,
+            (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB),
+            CV_TEST_TAG_DEBUG_VERYLONG
+            );
 
     if (target == DNN_TARGET_OPENCL_FP16 && !ocl::Device::getDefault().isIntel())
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index 578e0442b2..702a69da2a 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -2214,6 +2214,7 @@ TEST_P(Test_ONNX_nets, Shufflenet)
 
 TEST_P(Test_ONNX_nets, Resnet34_kinetics)
 {
+    applyTestTag(CV_TEST_TAG_DEBUG_VERYLONG);
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000)
     // IE exception: Failed to allocate graph: MYRIAD device is not opened
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD)

From 129319b0bc10bdeaf3dac390585ddb8fc0e804ba Mon Sep 17 00:00:00 2001
From: Andrey Senyaev <andrey.senyaev@gmail.com>
Date: Wed, 3 Aug 2022 12:14:15 +0300
Subject: [PATCH 165/178] Fix in obsensor for VS 14

---
 .../cap_obsensor/obsensor_stream_channel_msmf.cpp    |  2 +-
 .../cap_obsensor/obsensor_stream_channel_v4l2.cpp    |  2 +-
 .../src/cap_obsensor/obsensor_uvc_stream_channel.hpp | 12 ++++++------
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/modules/videoio/src/cap_obsensor/obsensor_stream_channel_msmf.cpp b/modules/videoio/src/cap_obsensor/obsensor_stream_channel_msmf.cpp
index b9b1f05a47..5ac204e504 100644
--- a/modules/videoio/src/cap_obsensor/obsensor_stream_channel_msmf.cpp
+++ b/modules/videoio/src/cap_obsensor/obsensor_stream_channel_msmf.cpp
@@ -195,7 +195,7 @@ std::vector<UvcDeviceInfo> MFContext::queryUvcDeviceInfoList()
         std::string uid, guid;
         if (!parseUvcDeviceSymbolicLink(symbolicLink, vid, pid, mi, uid, guid))
             continue;
-        uvcDevList.emplace_back(UvcDeviceInfo({ symbolicLink, name, uid, vid, pid, mi }));
+        uvcDevList.emplace_back(UvcDeviceInfo{ symbolicLink, name, uid, vid, pid, mi });
         CV_LOG_INFO(NULL, "UVC device found: name=" << name << ", vid=" << vid << ", pid=" << pid << ", mi=" << mi << ", uid=" << uid << ", guid=" << guid);
     }
     return uvcDevList;
diff --git a/modules/videoio/src/cap_obsensor/obsensor_stream_channel_v4l2.cpp b/modules/videoio/src/cap_obsensor/obsensor_stream_channel_v4l2.cpp
index e947be41d7..9c1686a10e 100644
--- a/modules/videoio/src/cap_obsensor/obsensor_stream_channel_v4l2.cpp
+++ b/modules/videoio/src/cap_obsensor/obsensor_stream_channel_v4l2.cpp
@@ -101,7 +101,7 @@ std::vector<UvcDeviceInfo> V4L2Context::queryUvcDeviceInfoList()
         cv::utils::fs::glob(videosDir, "*", videos, false, true);
         for (const auto& video : videos)
         {
-            UvcDeviceInfo uvcDev;
+            UvcDeviceInfo uvcDev{};
             cv::String videoName = video.substr(video.find_last_of("/") + 1);
             char buf[PATH_MAX];
             if (realpath(video.c_str(), buf) == nullptr || cv::String(buf).find("virtual") != std::string::npos)
diff --git a/modules/videoio/src/cap_obsensor/obsensor_uvc_stream_channel.hpp b/modules/videoio/src/cap_obsensor/obsensor_uvc_stream_channel.hpp
index 9bc7e1a563..dfbb36d7e7 100644
--- a/modules/videoio/src/cap_obsensor/obsensor_uvc_stream_channel.hpp
+++ b/modules/videoio/src/cap_obsensor/obsensor_uvc_stream_channel.hpp
@@ -33,12 +33,12 @@ namespace obsensor {
 
 struct UvcDeviceInfo
 {
-    std::string id = ""; // uvc sub-device id
-    std::string name = "";
-    std::string uid = ""; // parent usb device id
-    uint16_t vid = 0;
-    uint16_t pid = 0;
-    uint16_t mi = 0; // uvc interface index
+    std::string id; // uvc sub-device id
+    std::string name;
+    std::string uid; // parent usb device id
+    uint16_t vid;
+    uint16_t pid;
+    uint16_t mi; // uvc interface index
 };
 
 enum StreamState

From 04e01e2b3143079674ef18339a61865ab06287c0 Mon Sep 17 00:00:00 2001
From: rogday <s.e.a.98@yandex.ru>
Date: Sun, 24 Jul 2022 23:33:07 +0300
Subject: [PATCH 166/178] Add new params

---
 apps/interactive-calibration/main.cpp                     | 2 +-
 .../interactive_calibration.markdown                      | 8 ++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/apps/interactive-calibration/main.cpp b/apps/interactive-calibration/main.cpp
index 071c111997..3bf4a0776e 100644
--- a/apps/interactive-calibration/main.cpp
+++ b/apps/interactive-calibration/main.cpp
@@ -41,7 +41,7 @@ const std::string keys  =
         "{d        | 0.8     | Min delay between captures}"
         "{pf       | defaultConfig.xml| Advanced application parameters}"
         "{save_frames | false   | Save frames that contribute to final calibration}"
-        "{zoom     | 1       | Zoom factor applied to the image}"
+        "{zoom     | 1       | Zoom factor applied to the preview image}"
         "{force_reopen | false   | Forcefully reopen camera in case of errors}"
         "{help     |         | Print help}";
 
diff --git a/doc/tutorials/calib3d/interactive_calibration/interactive_calibration.markdown b/doc/tutorials/calib3d/interactive_calibration/interactive_calibration.markdown
index af19fb2c01..a50058ba84 100644
--- a/doc/tutorials/calib3d/interactive_calibration/interactive_calibration.markdown
+++ b/doc/tutorials/calib3d/interactive_calibration/interactive_calibration.markdown
@@ -40,6 +40,7 @@ Supported patterns:
 -   Asymmetrical circle pattern
 -   Dual asymmetrical circle pattern
 -   chAruco (chessboard with Aruco markers)
+-   Symmetrical circle pattern
 
 Description of parameters
 ------
@@ -55,9 +56,9 @@ All of this parameters are passed to application through a command line.
 -  -v=[filename]: get video from filename, default input -- camera with id=0
 -  -ci=[0]: get video from camera with specified id
 -  -flip=[false]: vertical flip of input frames
--  -t=[circles]: pattern for calibration (circles, chessboard, dualCircles, chAruco)
+-  -t=[circles]: pattern for calibration (circles, chessboard, dualCircles, chAruco, symcircles)
 -  -sz=[16.3]: distance between two nearest centers of circles or squares on calibration board
--  -dst=[295] distance between white and black parts of daulCircles pattern
+-  -dst=[295] distance between white and black parts of dualCircles pattern
 -  -w=[width]: width of pattern (in corners or circles)
 -  -h=[height]: height of pattern (in corners or circles)
 -  -of=[camParams.xml]: output file name
@@ -65,6 +66,9 @@ All of this parameters are passed to application through a command line.
 -  -vis=[grid]: captured boards visualization (grid, window)
 -  -d=[0.8]: delay between captures in seconds
 -  -pf=[defaultConfig.xml]: advanced application parameters file
+-  -force_reopen=[false]: Forcefully reopen camera in case of errors. Can be helpful for ip cameras with unstable connection.
+-  -save_frames=[false]: Save frames that contribute to final calibration
+-  -zoom=[1]: Zoom factor applied to the preview image
 
 ### Advanced parameters:
 

From 06431655197b1cec869d3291436b0610dd15e1a7 Mon Sep 17 00:00:00 2001
From: ocpalo <iamberkeyavas@gmail.com>
Date: Fri, 5 Aug 2022 23:13:06 +0300
Subject: [PATCH 167/178] update libjpeg-turbo version to 2.1.3

---
 3rdparty/libjpeg-turbo/CMakeLists.txt         |   19 +-
 3rdparty/libjpeg-turbo/LICENSE.md             |    2 +-
 3rdparty/libjpeg-turbo/src/jcapimin.c         |    6 +-
 3rdparty/libjpeg-turbo/src/jcarith.c          |   12 +-
 3rdparty/libjpeg-turbo/src/jchuff.c           |   22 +-
 3rdparty/libjpeg-turbo/src/jcphuff.c          |   10 +-
 3rdparty/libjpeg-turbo/src/jcprepct.c         |    8 +-
 3rdparty/libjpeg-turbo/src/jctrans.c          |    6 +-
 3rdparty/libjpeg-turbo/src/jdapimin.c         |    6 +-
 3rdparty/libjpeg-turbo/src/jdapistd.c         |    8 +-
 3rdparty/libjpeg-turbo/src/jdarith.c          |   22 +-
 3rdparty/libjpeg-turbo/src/jdatadst.c         |   13 +-
 3rdparty/libjpeg-turbo/src/jdatasrc.c         |    4 +-
 3rdparty/libjpeg-turbo/src/jddctmgr.c         |    4 +-
 3rdparty/libjpeg-turbo/src/jdicc.c            |    4 -
 3rdparty/libjpeg-turbo/src/jdinput.c          |    4 +-
 3rdparty/libjpeg-turbo/src/jdmarker.c         |    8 +-
 3rdparty/libjpeg-turbo/src/jdmaster.c         |   12 +-
 3rdparty/libjpeg-turbo/src/jdphuff.c          |   10 +-
 3rdparty/libjpeg-turbo/src/jerror.c           |   16 +-
 3rdparty/libjpeg-turbo/src/jerror.h           |    6 +-
 3rdparty/libjpeg-turbo/src/jinclude.h         |  133 +-
 3rdparty/libjpeg-turbo/src/jmemmgr.c          |   16 +-
 3rdparty/libjpeg-turbo/src/jmemnobs.c         |    5 -
 3rdparty/libjpeg-turbo/src/jmorecfg.h         |    4 -
 3rdparty/libjpeg-turbo/src/jpegint.h          |    9 -
 3rdparty/libjpeg-turbo/src/jstdhuff.c         |    9 +-
 3rdparty/libjpeg-turbo/src/jutils.c           |   10 +-
 3rdparty/libjpeg-turbo/src/jversion.h.in      |   54 +
 .../libjpeg-turbo/src/simd/CMakeLists.txt     |  540 ++
 .../src/simd/arm/aarch32/jccolext-neon.c      |  148 +
 .../src/simd/arm/aarch32/jchuff-neon.c        |  334 ++
 .../src/simd/arm/aarch32/jsimd.c              |  980 ++++
 .../src/simd/arm/aarch32/jsimd_neon.S         | 1200 +++++
 .../src/simd/arm/aarch64/jccolext-neon.c      |  316 ++
 .../src/simd/arm/aarch64/jchuff-neon.c        |  411 ++
 .../src/simd/arm/aarch64/jsimd.c              | 1058 ++++
 .../src/simd/arm/aarch64/jsimd_neon.S         | 2254 ++++++++
 3rdparty/libjpeg-turbo/src/simd/arm/align.h   |   28 +
 .../libjpeg-turbo/src/simd/arm/jccolor-neon.c |  160 +
 .../libjpeg-turbo/src/simd/arm/jcgray-neon.c  |  120 +
 .../src/simd/arm/jcgryext-neon.c              |  106 +
 3rdparty/libjpeg-turbo/src/simd/arm/jchuff.h  |  131 +
 .../libjpeg-turbo/src/simd/arm/jcphuff-neon.c |  622 +++
 .../src/simd/arm/jcsample-neon.c              |  192 +
 .../src/simd/arm/jdcolext-neon.c              |  374 ++
 .../libjpeg-turbo/src/simd/arm/jdcolor-neon.c |  142 +
 .../libjpeg-turbo/src/simd/arm/jdmerge-neon.c |  145 +
 .../src/simd/arm/jdmrgext-neon.c              |  723 +++
 .../src/simd/arm/jdsample-neon.c              |  569 +++
 .../src/simd/arm/jfdctfst-neon.c              |  214 +
 .../src/simd/arm/jfdctint-neon.c              |  376 ++
 .../src/simd/arm/jidctfst-neon.c              |  472 ++
 .../src/simd/arm/jidctint-neon.c              |  802 +++
 .../src/simd/arm/jidctred-neon.c              |  486 ++
 .../libjpeg-turbo/src/simd/arm/jquanti-neon.c |  193 +
 .../src/simd/arm/neon-compat.h.in             |   37 +
 .../src/simd/i386/jccolext-avx2.asm           |  578 +++
 .../src/simd/i386/jccolext-mmx.asm            |  476 ++
 .../src/simd/i386/jccolext-sse2.asm           |  503 ++
 .../src/simd/i386/jccolor-avx2.asm            |  121 +
 .../src/simd/i386/jccolor-mmx.asm             |  121 +
 .../src/simd/i386/jccolor-sse2.asm            |  120 +
 .../src/simd/i386/jcgray-avx2.asm             |  113 +
 .../src/simd/i386/jcgray-mmx.asm              |  113 +
 .../src/simd/i386/jcgray-sse2.asm             |  112 +
 .../src/simd/i386/jcgryext-avx2.asm           |  457 ++
 .../src/simd/i386/jcgryext-mmx.asm            |  355 ++
 .../src/simd/i386/jcgryext-sse2.asm           |  382 ++
 .../src/simd/i386/jchuff-sse2.asm             |  761 +++
 .../src/simd/i386/jcphuff-sse2.asm            |  662 +++
 .../src/simd/i386/jcsample-avx2.asm           |  388 ++
 .../src/simd/i386/jcsample-mmx.asm            |  324 ++
 .../src/simd/i386/jcsample-sse2.asm           |  351 ++
 .../src/simd/i386/jdcolext-avx2.asm           |  515 ++
 .../src/simd/i386/jdcolext-mmx.asm            |  404 ++
 .../src/simd/i386/jdcolext-sse2.asm           |  458 ++
 .../src/simd/i386/jdcolor-avx2.asm            |  118 +
 .../src/simd/i386/jdcolor-mmx.asm             |  117 +
 .../src/simd/i386/jdcolor-sse2.asm            |  117 +
 .../src/simd/i386/jdmerge-avx2.asm            |  136 +
 .../src/simd/i386/jdmerge-mmx.asm             |  123 +
 .../src/simd/i386/jdmerge-sse2.asm            |  135 +
 .../src/simd/i386/jdmrgext-avx2.asm           |  575 +++
 .../src/simd/i386/jdmrgext-mmx.asm            |  460 ++
 .../src/simd/i386/jdmrgext-sse2.asm           |  517 ++
 .../src/simd/i386/jdsample-avx2.asm           |  760 +++
 .../src/simd/i386/jdsample-mmx.asm            |  731 +++
 .../src/simd/i386/jdsample-sse2.asm           |  724 +++
 .../src/simd/i386/jfdctflt-3dn.asm            |  318 ++
 .../src/simd/i386/jfdctflt-sse.asm            |  369 ++
 .../src/simd/i386/jfdctfst-mmx.asm            |  395 ++
 .../src/simd/i386/jfdctfst-sse2.asm           |  403 ++
 .../src/simd/i386/jfdctint-avx2.asm           |  331 ++
 .../src/simd/i386/jfdctint-mmx.asm            |  620 +++
 .../src/simd/i386/jfdctint-sse2.asm           |  633 +++
 .../src/simd/i386/jidctflt-3dn.asm            |  451 ++
 .../src/simd/i386/jidctflt-sse.asm            |  571 +++
 .../src/simd/i386/jidctflt-sse2.asm           |  497 ++
 .../src/simd/i386/jidctfst-mmx.asm            |  499 ++
 .../src/simd/i386/jidctfst-sse2.asm           |  501 ++
 .../src/simd/i386/jidctint-avx2.asm           |  453 ++
 .../src/simd/i386/jidctint-mmx.asm            |  851 +++
 .../src/simd/i386/jidctint-sse2.asm           |  858 ++++
 .../src/simd/i386/jidctred-mmx.asm            |  704 +++
 .../src/simd/i386/jidctred-sse2.asm           |  592 +++
 .../src/simd/i386/jquant-3dn.asm              |  230 +
 .../src/simd/i386/jquant-mmx.asm              |  276 +
 .../src/simd/i386/jquant-sse.asm              |  208 +
 .../src/simd/i386/jquantf-sse2.asm            |  168 +
 .../src/simd/i386/jquanti-avx2.asm            |  188 +
 .../src/simd/i386/jquanti-sse2.asm            |  201 +
 3rdparty/libjpeg-turbo/src/simd/i386/jsimd.c  | 1246 +++++
 .../libjpeg-turbo/src/simd/i386/jsimdcpu.asm  |  135 +
 3rdparty/libjpeg-turbo/src/simd/jsimd.h       | 1258 +++++
 3rdparty/libjpeg-turbo/src/simd/mips/jsimd.c  | 1147 +++++
 .../libjpeg-turbo/src/simd/mips/jsimd_dspr2.S | 4543 +++++++++++++++++
 .../src/simd/mips/jsimd_dspr2_asm.h           |  292 ++
 .../src/simd/mips64/jccolext-mmi.c            |  455 ++
 .../src/simd/mips64/jccolor-mmi.c             |  148 +
 .../src/simd/mips64/jcgray-mmi.c              |  132 +
 .../src/simd/mips64/jcgryext-mmi.c            |  374 ++
 .../src/simd/mips64/jcsample-mmi.c            |   98 +
 .../libjpeg-turbo/src/simd/mips64/jcsample.h  |   28 +
 .../src/simd/mips64/jdcolext-mmi.c            |  415 ++
 .../src/simd/mips64/jdcolor-mmi.c             |  139 +
 .../src/simd/mips64/jdmerge-mmi.c             |  149 +
 .../src/simd/mips64/jdmrgext-mmi.c            |  615 +++
 .../src/simd/mips64/jdsample-mmi.c            |  304 ++
 .../src/simd/mips64/jfdctfst-mmi.c            |  255 +
 .../src/simd/mips64/jfdctint-mmi.c            |  398 ++
 .../src/simd/mips64/jidctfst-mmi.c            |  395 ++
 .../src/simd/mips64/jidctint-mmi.c            |  571 +++
 .../src/simd/mips64/jquanti-mmi.c             |  124 +
 .../libjpeg-turbo/src/simd/mips64/jsimd.c     |  870 ++++
 .../libjpeg-turbo/src/simd/mips64/jsimd_mmi.h |   69 +
 .../src/simd/mips64/loongson-mmintrin.h       | 1334 +++++
 .../libjpeg-turbo/src/simd/nasm/jcolsamp.inc  |  135 +
 3rdparty/libjpeg-turbo/src/simd/nasm/jdct.inc |   31 +
 .../libjpeg-turbo/src/simd/nasm/jsimdcfg.inc  |   93 +
 .../src/simd/nasm/jsimdcfg.inc.h              |  133 +
 .../libjpeg-turbo/src/simd/nasm/jsimdext.inc  |  520 ++
 .../src/simd/powerpc/jccolext-altivec.c       |  269 +
 .../src/simd/powerpc/jccolor-altivec.c        |  116 +
 .../src/simd/powerpc/jcgray-altivec.c         |  111 +
 .../src/simd/powerpc/jcgryext-altivec.c       |  228 +
 .../src/simd/powerpc/jcsample-altivec.c       |  159 +
 .../libjpeg-turbo/src/simd/powerpc/jcsample.h |   28 +
 .../src/simd/powerpc/jdcolext-altivec.c       |  276 +
 .../src/simd/powerpc/jdcolor-altivec.c        |  106 +
 .../src/simd/powerpc/jdmerge-altivec.c        |  130 +
 .../src/simd/powerpc/jdmrgext-altivec.c       |  329 ++
 .../src/simd/powerpc/jdsample-altivec.c       |  400 ++
 .../src/simd/powerpc/jfdctfst-altivec.c       |  154 +
 .../src/simd/powerpc/jfdctint-altivec.c       |  258 +
 .../src/simd/powerpc/jidctfst-altivec.c       |  255 +
 .../src/simd/powerpc/jidctint-altivec.c       |  357 ++
 .../src/simd/powerpc/jquanti-altivec.c        |  250 +
 .../libjpeg-turbo/src/simd/powerpc/jsimd.c    |  881 ++++
 .../src/simd/powerpc/jsimd_altivec.h          |   98 +
 .../src/simd/x86_64/jccolext-avx2.asm         |  559 ++
 .../src/simd/x86_64/jccolext-sse2.asm         |  484 ++
 .../src/simd/x86_64/jccolor-avx2.asm          |  121 +
 .../src/simd/x86_64/jccolor-sse2.asm          |  120 +
 .../src/simd/x86_64/jcgray-avx2.asm           |  113 +
 .../src/simd/x86_64/jcgray-sse2.asm           |  112 +
 .../src/simd/x86_64/jcgryext-avx2.asm         |  438 ++
 .../src/simd/x86_64/jcgryext-sse2.asm         |  363 ++
 .../src/simd/x86_64/jchuff-sse2.asm           |  583 +++
 .../src/simd/x86_64/jcphuff-sse2.asm          |  639 +++
 .../src/simd/x86_64/jcsample-avx2.asm         |  367 ++
 .../src/simd/x86_64/jcsample-sse2.asm         |  330 ++
 .../src/simd/x86_64/jdcolext-avx2.asm         |  496 ++
 .../src/simd/x86_64/jdcolext-sse2.asm         |  439 ++
 .../src/simd/x86_64/jdcolor-avx2.asm          |  118 +
 .../src/simd/x86_64/jdcolor-sse2.asm          |  117 +
 .../src/simd/x86_64/jdmerge-avx2.asm          |  136 +
 .../src/simd/x86_64/jdmerge-sse2.asm          |  135 +
 .../src/simd/x86_64/jdmrgext-avx2.asm         |  596 +++
 .../src/simd/x86_64/jdmrgext-sse2.asm         |  538 ++
 .../src/simd/x86_64/jdsample-avx2.asm         |  696 +++
 .../src/simd/x86_64/jdsample-sse2.asm         |  665 +++
 .../src/simd/x86_64/jfdctflt-sse.asm          |  355 ++
 .../src/simd/x86_64/jfdctfst-sse2.asm         |  389 ++
 .../src/simd/x86_64/jfdctint-avx2.asm         |  320 ++
 .../src/simd/x86_64/jfdctint-sse2.asm         |  619 +++
 .../src/simd/x86_64/jidctflt-sse2.asm         |  482 ++
 .../src/simd/x86_64/jidctfst-sse2.asm         |  491 ++
 .../src/simd/x86_64/jidctint-avx2.asm         |  418 ++
 .../src/simd/x86_64/jidctint-sse2.asm         |  847 +++
 .../src/simd/x86_64/jidctred-sse2.asm         |  574 +++
 .../src/simd/x86_64/jquantf-sse2.asm          |  155 +
 .../src/simd/x86_64/jquanti-avx2.asm          |  163 +
 .../src/simd/x86_64/jquanti-sse2.asm          |  188 +
 .../libjpeg-turbo/src/simd/x86_64/jsimd.c     | 1068 ++++
 .../src/simd/x86_64/jsimdcpu.asm              |   86 +
 196 files changed, 70668 insertions(+), 182 deletions(-)
 create mode 100644 3rdparty/libjpeg-turbo/src/jversion.h.in
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/CMakeLists.txt
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jccolext-neon.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jchuff-neon.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jsimd.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jsimd_neon.S
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jccolext-neon.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jchuff-neon.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jsimd.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jsimd_neon.S
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/align.h
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/jccolor-neon.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/jcgray-neon.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/jcgryext-neon.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/jchuff.h
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/jcphuff-neon.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/jcsample-neon.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/jdcolext-neon.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/jdcolor-neon.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/jdmerge-neon.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/jdmrgext-neon.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/jdsample-neon.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/jfdctfst-neon.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/jfdctint-neon.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/jidctfst-neon.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/jidctint-neon.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/jidctred-neon.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/jquanti-neon.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/arm/neon-compat.h.in
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jccolext-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jccolext-mmx.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jccolext-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jccolor-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jccolor-mmx.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jccolor-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jcgray-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jcgray-mmx.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jcgray-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-mmx.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jchuff-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jcphuff-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jcsample-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jcsample-mmx.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jcsample-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-mmx.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-mmx.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-mmx.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-mmx.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jdsample-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jdsample-mmx.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jdsample-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jfdctflt-3dn.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jfdctflt-sse.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jfdctfst-mmx.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jfdctfst-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-mmx.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-3dn.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-sse.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jidctfst-mmx.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jidctfst-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jidctint-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jidctint-mmx.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jidctint-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jidctred-mmx.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jidctred-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jquant-3dn.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jquant-mmx.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jquant-sse.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jquantf-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jquanti-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jquanti-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jsimd.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/i386/jsimdcpu.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/jsimd.h
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/mips/jsimd.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/mips/jsimd_dspr2.S
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/mips/jsimd_dspr2_asm.h
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/mips64/jccolext-mmi.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/mips64/jccolor-mmi.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/mips64/jcgray-mmi.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/mips64/jcgryext-mmi.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/mips64/jcsample-mmi.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/mips64/jcsample.h
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/mips64/jdcolext-mmi.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/mips64/jdcolor-mmi.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/mips64/jdmerge-mmi.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/mips64/jdmrgext-mmi.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/mips64/jdsample-mmi.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/mips64/jfdctfst-mmi.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/mips64/jfdctint-mmi.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/mips64/jidctfst-mmi.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/mips64/jidctint-mmi.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/mips64/jquanti-mmi.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/mips64/jsimd.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/mips64/jsimd_mmi.h
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/mips64/loongson-mmintrin.h
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/nasm/jcolsamp.inc
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/nasm/jdct.inc
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/nasm/jsimdcfg.inc
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/nasm/jsimdcfg.inc.h
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/nasm/jsimdext.inc
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/powerpc/jccolext-altivec.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/powerpc/jccolor-altivec.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/powerpc/jcgray-altivec.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/powerpc/jcgryext-altivec.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/powerpc/jcsample-altivec.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/powerpc/jcsample.h
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/powerpc/jdcolext-altivec.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/powerpc/jdcolor-altivec.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/powerpc/jdmerge-altivec.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/powerpc/jdmrgext-altivec.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/powerpc/jdsample-altivec.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/powerpc/jfdctfst-altivec.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/powerpc/jfdctint-altivec.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/powerpc/jidctfst-altivec.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/powerpc/jidctint-altivec.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/powerpc/jquanti-altivec.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/powerpc/jsimd.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/powerpc/jsimd_altivec.h
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jccolext-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jccolext-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jccolor-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jccolor-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jcgray-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jcgray-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jcgryext-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jcgryext-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jchuff-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jcphuff-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jcsample-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jcsample-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolext-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolext-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolor-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolor-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jdmerge-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jdmerge-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jdmrgext-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jdmrgext-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jdsample-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jdsample-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctflt-sse.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctfst-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctint-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctint-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jidctflt-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jidctfst-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jidctint-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jidctint-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jidctred-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jquantf-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jquanti-avx2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jquanti-sse2.asm
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jsimd.c
 create mode 100644 3rdparty/libjpeg-turbo/src/simd/x86_64/jsimdcpu.asm

diff --git a/3rdparty/libjpeg-turbo/CMakeLists.txt b/3rdparty/libjpeg-turbo/CMakeLists.txt
index 73e1eee141..4dd3095f94 100644
--- a/3rdparty/libjpeg-turbo/CMakeLists.txt
+++ b/3rdparty/libjpeg-turbo/CMakeLists.txt
@@ -4,9 +4,9 @@ ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-parameter -Wsign-compare -Wshorten-6
 
 set(VERSION_MAJOR 2)
 set(VERSION_MINOR 1)
-set(VERSION_REVISION 2)
+set(VERSION_REVISION 3)
 set(VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_REVISION})
-set(LIBJPEG_TURBO_VERSION_NUMBER 2001002)
+set(LIBJPEG_TURBO_VERSION_NUMBER 2001003)
 
 string(TIMESTAMP BUILD "opencv-${OPENCV_VERSION}-libjpeg-turbo")
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
@@ -79,14 +79,13 @@ configure_file(jconfigint.h.in jconfigint.h)
 
 include_directories(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/src)
 
-set(JPEG_SOURCES
-  jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c jcicc.c
-  jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c jcphuff.c
-  jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c jdatadst.c jdatasrc.c
-  jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c jdicc.c jdinput.c jdmainct.c jdmarker.c
-  jdmaster.c jdmerge.c jdphuff.c jdpostct.c jdsample.c jdtrans.c jerror.c
-  jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c
-  jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c)
+set(JPEG_SOURCES jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c
+        jcicc.c jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c
+        jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c jdatadst.c
+        jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c jdicc.c jdinput.c
+        jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c jdpostct.c jdsample.c
+        jdtrans.c jerror.c jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c
+        jidctint.c jidctred.c jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c)
 
 if(WITH_ARITH_ENC OR WITH_ARITH_DEC)
   set(JPEG_SOURCES ${JPEG_SOURCES} jaricom.c)
diff --git a/3rdparty/libjpeg-turbo/LICENSE.md b/3rdparty/libjpeg-turbo/LICENSE.md
index a1cdad52fa..d753e1d76a 100644
--- a/3rdparty/libjpeg-turbo/LICENSE.md
+++ b/3rdparty/libjpeg-turbo/LICENSE.md
@@ -91,7 +91,7 @@ best of our understanding.
 The Modified (3-clause) BSD License
 ===================================
 
-Copyright (C)2009-2021 D. R. Commander.  All Rights Reserved.<br>
+Copyright (C)2009-2022 D. R. Commander.  All Rights Reserved.<br>
 Copyright (C)2015 Viktor Szathmáry.  All Rights Reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/3rdparty/libjpeg-turbo/src/jcapimin.c b/3rdparty/libjpeg-turbo/src/jcapimin.c
index 178c55ba47..84e7ecc9a7 100644
--- a/3rdparty/libjpeg-turbo/src/jcapimin.c
+++ b/3rdparty/libjpeg-turbo/src/jcapimin.c
@@ -4,8 +4,8 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1998, Thomas G. Lane.
  * Modified 2003-2010 by Guido Vollbeding.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -52,7 +52,7 @@ jpeg_CreateCompress(j_compress_ptr cinfo, int version, size_t structsize)
   {
     struct jpeg_error_mgr *err = cinfo->err;
     void *client_data = cinfo->client_data; /* ignore Purify complaint here */
-    MEMZERO(cinfo, sizeof(struct jpeg_compress_struct));
+    memset(cinfo, 0, sizeof(struct jpeg_compress_struct));
     cinfo->err = err;
     cinfo->client_data = client_data;
   }
diff --git a/3rdparty/libjpeg-turbo/src/jcarith.c b/3rdparty/libjpeg-turbo/src/jcarith.c
index b6d093f70e..b1720521bf 100644
--- a/3rdparty/libjpeg-turbo/src/jcarith.c
+++ b/3rdparty/libjpeg-turbo/src/jcarith.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Developed 1997-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015, 2018, D. R. Commander.
+ * Copyright (C) 2015, 2018, 2021-2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -338,14 +338,14 @@ emit_restart(j_compress_ptr cinfo, int restart_num)
     compptr = cinfo->cur_comp_info[ci];
     /* DC needs no table for refinement scan */
     if (cinfo->progressive_mode == 0 || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
-      MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS);
+      memset(entropy->dc_stats[compptr->dc_tbl_no], 0, DC_STAT_BINS);
       /* Reset DC predictions to 0 */
       entropy->last_dc_val[ci] = 0;
       entropy->dc_context[ci] = 0;
     }
     /* AC needs no table when not present */
     if (cinfo->progressive_mode == 0 || cinfo->Se) {
-      MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS);
+      memset(entropy->ac_stats[compptr->ac_tbl_no], 0, AC_STAT_BINS);
     }
   }
 
@@ -836,7 +836,7 @@ start_pass(j_compress_ptr cinfo, boolean gather_statistics)
      * We are fully adaptive here and need no extra
      * statistics gathering pass!
      */
-    ERREXIT(cinfo, JERR_NOT_COMPILED);
+    ERREXIT(cinfo, JERR_NOTIMPL);
 
   /* We assume jcmaster.c already validated the progressive scan parameters. */
 
@@ -867,7 +867,7 @@ start_pass(j_compress_ptr cinfo, boolean gather_statistics)
       if (entropy->dc_stats[tbl] == NULL)
         entropy->dc_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
           ((j_common_ptr)cinfo, JPOOL_IMAGE, DC_STAT_BINS);
-      MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS);
+      memset(entropy->dc_stats[tbl], 0, DC_STAT_BINS);
       /* Initialize DC predictions to 0 */
       entropy->last_dc_val[ci] = 0;
       entropy->dc_context[ci] = 0;
@@ -880,7 +880,7 @@ start_pass(j_compress_ptr cinfo, boolean gather_statistics)
       if (entropy->ac_stats[tbl] == NULL)
         entropy->ac_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
           ((j_common_ptr)cinfo, JPOOL_IMAGE, AC_STAT_BINS);
-      MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
+      memset(entropy->ac_stats[tbl], 0, AC_STAT_BINS);
 #ifdef CALCULATE_SPECTRAL_CONDITIONING
       if (cinfo->progressive_mode)
         /* Section G.1.3.2: Set appropriate arithmetic conditioning value Kx */
diff --git a/3rdparty/libjpeg-turbo/src/jchuff.c b/3rdparty/libjpeg-turbo/src/jchuff.c
index 8ff817b151..f4dfa1cb54 100644
--- a/3rdparty/libjpeg-turbo/src/jchuff.c
+++ b/3rdparty/libjpeg-turbo/src/jchuff.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2014-2016, 2018-2021, D. R. Commander.
+ * Copyright (C) 2009-2011, 2014-2016, 2018-2022, D. R. Commander.
  * Copyright (C) 2015, Matthieu Darbois.
  * Copyright (C) 2018, Matthias Räncker.
  * Copyright (C) 2020, Arm Limited.
@@ -200,12 +200,12 @@ start_pass_huff(j_compress_ptr cinfo, boolean gather_statistics)
         entropy->dc_count_ptrs[dctbl] = (long *)
           (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                       257 * sizeof(long));
-      MEMZERO(entropy->dc_count_ptrs[dctbl], 257 * sizeof(long));
+      memset(entropy->dc_count_ptrs[dctbl], 0, 257 * sizeof(long));
       if (entropy->ac_count_ptrs[actbl] == NULL)
         entropy->ac_count_ptrs[actbl] = (long *)
           (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                       257 * sizeof(long));
-      MEMZERO(entropy->ac_count_ptrs[actbl], 257 * sizeof(long));
+      memset(entropy->ac_count_ptrs[actbl], 0, 257 * sizeof(long));
 #endif
     } else {
       /* Compute derived values for Huffman tables */
@@ -315,8 +315,8 @@ jpeg_make_c_derived_tbl(j_compress_ptr cinfo, boolean isDC, int tblno,
    * this lets us detect duplicate VAL entries here, and later
    * allows emit_bits to detect any attempt to emit such symbols.
    */
-  MEMZERO(dtbl->ehufco, sizeof(dtbl->ehufco));
-  MEMZERO(dtbl->ehufsi, sizeof(dtbl->ehufsi));
+  memset(dtbl->ehufco, 0, sizeof(dtbl->ehufco));
+  memset(dtbl->ehufsi, 0, sizeof(dtbl->ehufsi));
 
   /* This is also a convenient place to check for out-of-range
    * and duplicated VAL entries.  We allow 0..255 for AC symbols
@@ -478,7 +478,7 @@ dump_buffer(working_state *state)
     buffer = _buffer; \
     while (bytes > 0) { \
       bytestocopy = MIN(bytes, state->free_in_buffer); \
-      MEMCOPY(state->next_output_byte, buffer, bytestocopy); \
+      memcpy(state->next_output_byte, buffer, bytestocopy); \
       state->next_output_byte += bytestocopy; \
       buffer += bytestocopy; \
       state->free_in_buffer -= bytestocopy; \
@@ -941,8 +941,8 @@ jpeg_gen_optimal_table(j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
 
   /* This algorithm is explained in section K.2 of the JPEG standard */
 
-  MEMZERO(bits, sizeof(bits));
-  MEMZERO(codesize, sizeof(codesize));
+  memset(bits, 0, sizeof(bits));
+  memset(codesize, 0, sizeof(codesize));
   for (i = 0; i < 257; i++)
     others[i] = -1;             /* init links to empty */
 
@@ -1044,7 +1044,7 @@ jpeg_gen_optimal_table(j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
   bits[i]--;
 
   /* Return final symbol counts (only for lengths 0..16) */
-  MEMCOPY(htbl->bits, bits, sizeof(htbl->bits));
+  memcpy(htbl->bits, bits, sizeof(htbl->bits));
 
   /* Return a list of the symbols sorted by code length */
   /* It's not real clear to me why we don't need to consider the codelength
@@ -1083,8 +1083,8 @@ finish_pass_gather(j_compress_ptr cinfo)
   /* It's important not to apply jpeg_gen_optimal_table more than once
    * per table, because it clobbers the input frequency counts!
    */
-  MEMZERO(did_dc, sizeof(did_dc));
-  MEMZERO(did_ac, sizeof(did_ac));
+  memset(did_dc, 0, sizeof(did_dc));
+  memset(did_ac, 0, sizeof(did_ac));
 
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
diff --git a/3rdparty/libjpeg-turbo/src/jcphuff.c b/3rdparty/libjpeg-turbo/src/jcphuff.c
index 1101987180..872e570bff 100644
--- a/3rdparty/libjpeg-turbo/src/jcphuff.c
+++ b/3rdparty/libjpeg-turbo/src/jcphuff.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2011, 2015, 2018, 2021, D. R. Commander.
+ * Copyright (C) 2011, 2015, 2018, 2021-2022, D. R. Commander.
  * Copyright (C) 2016, 2018, Matthieu Darbois.
  * Copyright (C) 2020, Arm Limited.
  * Copyright (C) 2021, Alex Richardson.
@@ -275,7 +275,7 @@ start_pass_phuff(j_compress_ptr cinfo, boolean gather_statistics)
         entropy->count_ptrs[tbl] = (long *)
           (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                       257 * sizeof(long));
-      MEMZERO(entropy->count_ptrs[tbl], 257 * sizeof(long));
+      memset(entropy->count_ptrs[tbl], 0, 257 * sizeof(long));
     } else {
       /* Compute derived values for Huffman table */
       /* We may do this more than once for a table, but it's not expensive */
@@ -584,8 +584,8 @@ encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
       continue; \
     /* For a negative coef, want temp2 = bitwise complement of abs(coef) */ \
     temp2 ^= temp; \
-    values[k] = temp; \
-    values[k + DCTSIZE2] = temp2; \
+    values[k] = (JCOEF)temp; \
+    values[k + DCTSIZE2] = (JCOEF)temp2; \
     zerobits |= ((size_t)1U) << k; \
   } \
 }
@@ -1062,7 +1062,7 @@ finish_pass_gather_phuff(j_compress_ptr cinfo)
   /* It's important not to apply jpeg_gen_optimal_table more than once
    * per table, because it clobbers the input frequency counts!
    */
-  MEMZERO(did, sizeof(did));
+  memset(did, 0, sizeof(did));
 
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
diff --git a/3rdparty/libjpeg-turbo/src/jcprepct.c b/3rdparty/libjpeg-turbo/src/jcprepct.c
index d59713ae68..f27cc34507 100644
--- a/3rdparty/libjpeg-turbo/src/jcprepct.c
+++ b/3rdparty/libjpeg-turbo/src/jcprepct.c
@@ -3,8 +3,8 @@
  *
  * This file is part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -289,8 +289,8 @@ create_context_buffer(j_compress_ptr cinfo)
                      cinfo->max_h_samp_factor) / compptr->h_samp_factor),
        (JDIMENSION)(3 * rgroup_height));
     /* Copy true buffer row pointers into the middle of the fake row array */
-    MEMCOPY(fake_buffer + rgroup_height, true_buffer,
-            3 * rgroup_height * sizeof(JSAMPROW));
+    memcpy(fake_buffer + rgroup_height, true_buffer,
+           3 * rgroup_height * sizeof(JSAMPROW));
     /* Fill in the above and below wraparound pointers */
     for (i = 0; i < rgroup_height; i++) {
       fake_buffer[i] = true_buffer[2 * rgroup_height + i];
diff --git a/3rdparty/libjpeg-turbo/src/jctrans.c b/3rdparty/libjpeg-turbo/src/jctrans.c
index ab6a2186db..e121028ec7 100644
--- a/3rdparty/libjpeg-turbo/src/jctrans.c
+++ b/3rdparty/libjpeg-turbo/src/jctrans.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1995-1998, Thomas G. Lane.
  * Modified 2000-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2020, D. R. Commander.
+ * Copyright (C) 2020, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -100,8 +100,8 @@ jpeg_copy_critical_parameters(j_decompress_ptr srcinfo, j_compress_ptr dstinfo)
       qtblptr = &dstinfo->quant_tbl_ptrs[tblno];
       if (*qtblptr == NULL)
         *qtblptr = jpeg_alloc_quant_table((j_common_ptr)dstinfo);
-      MEMCOPY((*qtblptr)->quantval, srcinfo->quant_tbl_ptrs[tblno]->quantval,
-              sizeof((*qtblptr)->quantval));
+      memcpy((*qtblptr)->quantval, srcinfo->quant_tbl_ptrs[tblno]->quantval,
+             sizeof((*qtblptr)->quantval));
       (*qtblptr)->sent_table = FALSE;
     }
   }
diff --git a/3rdparty/libjpeg-turbo/src/jdapimin.c b/3rdparty/libjpeg-turbo/src/jdapimin.c
index 4609b1322f..f50c27edc3 100644
--- a/3rdparty/libjpeg-turbo/src/jdapimin.c
+++ b/3rdparty/libjpeg-turbo/src/jdapimin.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1998, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2016, D. R. Commander.
+ * Copyright (C) 2016, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -53,7 +53,7 @@ jpeg_CreateDecompress(j_decompress_ptr cinfo, int version, size_t structsize)
   {
     struct jpeg_error_mgr *err = cinfo->err;
     void *client_data = cinfo->client_data; /* ignore Purify complaint here */
-    MEMZERO(cinfo, sizeof(struct jpeg_decompress_struct));
+    memset(cinfo, 0, sizeof(struct jpeg_decompress_struct));
     cinfo->err = err;
     cinfo->client_data = client_data;
   }
@@ -92,7 +92,7 @@ jpeg_CreateDecompress(j_decompress_ptr cinfo, int version, size_t structsize)
   cinfo->master = (struct jpeg_decomp_master *)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
                                 sizeof(my_decomp_master));
-  MEMZERO(cinfo->master, sizeof(my_decomp_master));
+  memset(cinfo->master, 0, sizeof(my_decomp_master));
 }
 
 
diff --git a/3rdparty/libjpeg-turbo/src/jdapistd.c b/3rdparty/libjpeg-turbo/src/jdapistd.c
index 695a620099..8827d8abf5 100644
--- a/3rdparty/libjpeg-turbo/src/jdapistd.c
+++ b/3rdparty/libjpeg-turbo/src/jdapistd.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2015-2020, D. R. Commander.
+ * Copyright (C) 2010, 2015-2020, 2022, D. R. Commander.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -159,6 +159,7 @@ jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
   JDIMENSION input_xoffset;
   boolean reinit_upsampler = FALSE;
   jpeg_component_info *compptr;
+  my_master_ptr master = (my_master_ptr)cinfo->master;
 
   if (cinfo->global_state != DSTATE_SCANNING || cinfo->output_scanline != 0)
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
@@ -208,6 +209,11 @@ jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
    */
   *width = *width + input_xoffset - *xoffset;
   cinfo->output_width = *width;
+  if (master->using_merged_upsample && cinfo->max_v_samp_factor == 2) {
+    my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
+    upsample->out_row_width =
+      cinfo->output_width * cinfo->out_color_components;
+  }
 
   /* Set the first and last iMCU columns that we must decompress.  These values
    * will be used in single-scan decompressions.
diff --git a/3rdparty/libjpeg-turbo/src/jdarith.c b/3rdparty/libjpeg-turbo/src/jdarith.c
index 7f0d3a785c..21575e80c7 100644
--- a/3rdparty/libjpeg-turbo/src/jdarith.c
+++ b/3rdparty/libjpeg-turbo/src/jdarith.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Developed 1997-2015 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015-2020, D. R. Commander.
+ * Copyright (C) 2015-2020, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -210,13 +210,13 @@ process_restart(j_decompress_ptr cinfo)
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
     if (!cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
-      MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS);
+      memset(entropy->dc_stats[compptr->dc_tbl_no], 0, DC_STAT_BINS);
       /* Reset DC predictions to 0 */
       entropy->last_dc_val[ci] = 0;
       entropy->dc_context[ci] = 0;
     }
     if (!cinfo->progressive_mode || cinfo->Ss) {
-      MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS);
+      memset(entropy->ac_stats[compptr->ac_tbl_no], 0, AC_STAT_BINS);
     }
   }
 
@@ -471,17 +471,17 @@ decode_mcu_AC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
       if (*thiscoef) {                          /* previously nonzero coef */
         if (arith_decode(cinfo, st + 2)) {
           if (*thiscoef < 0)
-            *thiscoef += m1;
+            *thiscoef += (JCOEF)m1;
           else
-            *thiscoef += p1;
+            *thiscoef += (JCOEF)p1;
         }
         break;
       }
       if (arith_decode(cinfo, st + 1)) {        /* newly nonzero coef */
         if (arith_decode(cinfo, entropy->fixed_bin))
-          *thiscoef = m1;
+          *thiscoef = (JCOEF)m1;
         else
-          *thiscoef = p1;
+          *thiscoef = (JCOEF)p1;
         break;
       }
       st += 3;  k++;
@@ -698,8 +698,8 @@ bad:
     /* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG.
      * This ought to be an error condition, but we make it a warning.
      */
-    if (cinfo->Ss != 0 || cinfo->Ah != 0 || cinfo->Al != 0 ||
-        (cinfo->Se < DCTSIZE2 && cinfo->Se != DCTSIZE2 - 1))
+    if (cinfo->Ss != 0 || cinfo->Se != DCTSIZE2 - 1 ||
+        cinfo->Ah != 0 || cinfo->Al != 0)
       WARNMS(cinfo, JWRN_NOT_SEQUENTIAL);
     /* Select MCU decoding routine */
     entropy->pub.decode_mcu = decode_mcu;
@@ -715,7 +715,7 @@ bad:
       if (entropy->dc_stats[tbl] == NULL)
         entropy->dc_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
           ((j_common_ptr)cinfo, JPOOL_IMAGE, DC_STAT_BINS);
-      MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS);
+      memset(entropy->dc_stats[tbl], 0, DC_STAT_BINS);
       /* Initialize DC predictions to 0 */
       entropy->last_dc_val[ci] = 0;
       entropy->dc_context[ci] = 0;
@@ -727,7 +727,7 @@ bad:
       if (entropy->ac_stats[tbl] == NULL)
         entropy->ac_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
           ((j_common_ptr)cinfo, JPOOL_IMAGE, AC_STAT_BINS);
-      MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
+      memset(entropy->ac_stats[tbl], 0, AC_STAT_BINS);
     }
   }
 
diff --git a/3rdparty/libjpeg-turbo/src/jdatadst.c b/3rdparty/libjpeg-turbo/src/jdatadst.c
index 246fffb58a..6b4fed2339 100644
--- a/3rdparty/libjpeg-turbo/src/jdatadst.c
+++ b/3rdparty/libjpeg-turbo/src/jdatadst.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * Modified 2009-2012 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2013, 2016, D. R. Commander.
+ * Copyright (C) 2013, 2016, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -23,11 +23,6 @@
 #include "jpeglib.h"
 #include "jerror.h"
 
-#ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare malloc(),free() */
-extern void *malloc(size_t size);
-extern void free(void *ptr);
-#endif
-
 
 /* Expanded data destination object for stdio output */
 
@@ -116,7 +111,7 @@ empty_output_buffer(j_compress_ptr cinfo)
 {
   my_dest_ptr dest = (my_dest_ptr)cinfo->dest;
 
-  if (JFWRITE(dest->outfile, dest->buffer, OUTPUT_BUF_SIZE) !=
+  if (fwrite(dest->buffer, 1, OUTPUT_BUF_SIZE, dest->outfile) !=
       (size_t)OUTPUT_BUF_SIZE)
     ERREXIT(cinfo, JERR_FILE_WRITE);
 
@@ -141,7 +136,7 @@ empty_mem_output_buffer(j_compress_ptr cinfo)
   if (nextbuffer == NULL)
     ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
 
-  MEMCOPY(nextbuffer, dest->buffer, dest->bufsize);
+  memcpy(nextbuffer, dest->buffer, dest->bufsize);
 
   free(dest->newbuffer);
 
@@ -175,7 +170,7 @@ term_destination(j_compress_ptr cinfo)
 
   /* Write any data remaining in the buffer */
   if (datacount > 0) {
-    if (JFWRITE(dest->outfile, dest->buffer, datacount) != datacount)
+    if (fwrite(dest->buffer, 1, datacount, dest->outfile) != datacount)
       ERREXIT(cinfo, JERR_FILE_WRITE);
   }
   fflush(dest->outfile);
diff --git a/3rdparty/libjpeg-turbo/src/jdatasrc.c b/3rdparty/libjpeg-turbo/src/jdatasrc.c
index eadb4a2c90..e36a30d894 100644
--- a/3rdparty/libjpeg-turbo/src/jdatasrc.c
+++ b/3rdparty/libjpeg-turbo/src/jdatasrc.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * Modified 2009-2011 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2013, 2016, D. R. Commander.
+ * Copyright (C) 2013, 2016, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -104,7 +104,7 @@ fill_input_buffer(j_decompress_ptr cinfo)
   my_src_ptr src = (my_src_ptr)cinfo->src;
   size_t nbytes;
 
-  nbytes = JFREAD(src->infile, src->buffer, INPUT_BUF_SIZE);
+  nbytes = fread(src->buffer, 1, INPUT_BUF_SIZE, src->infile);
 
   if (nbytes <= 0) {
     if (src->start_of_file)     /* Treat empty input file as fatal error */
diff --git a/3rdparty/libjpeg-turbo/src/jddctmgr.c b/3rdparty/libjpeg-turbo/src/jddctmgr.c
index 266f446623..e78d7bebe2 100644
--- a/3rdparty/libjpeg-turbo/src/jddctmgr.c
+++ b/3rdparty/libjpeg-turbo/src/jddctmgr.c
@@ -6,7 +6,7 @@
  * Modified 2002-2010 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2010, 2015, D. R. Commander.
+ * Copyright (C) 2010, 2015, 2022, D. R. Commander.
  * Copyright (C) 2013, MIPS Technologies, Inc., California.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -345,7 +345,7 @@ jinit_inverse_dct(j_decompress_ptr cinfo)
     compptr->dct_table =
       (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   sizeof(multiplier_table));
-    MEMZERO(compptr->dct_table, sizeof(multiplier_table));
+    memset(compptr->dct_table, 0, sizeof(multiplier_table));
     /* Mark multiplier table not yet set up for any method */
     idct->cur_method[ci] = -1;
   }
diff --git a/3rdparty/libjpeg-turbo/src/jdicc.c b/3rdparty/libjpeg-turbo/src/jdicc.c
index a1a5b867ae..50aa9a9676 100644
--- a/3rdparty/libjpeg-turbo/src/jdicc.c
+++ b/3rdparty/libjpeg-turbo/src/jdicc.c
@@ -18,10 +18,6 @@
 #include "jpeglib.h"
 #include "jerror.h"
 
-#ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare malloc() */
-extern void *malloc(size_t size);
-#endif
-
 
 #define ICC_MARKER  (JPEG_APP0 + 2)     /* JPEG marker code for ICC */
 #define ICC_OVERHEAD_LEN  14            /* size of non-profile data in APP2 */
diff --git a/3rdparty/libjpeg-turbo/src/jdinput.c b/3rdparty/libjpeg-turbo/src/jdinput.c
index deec618f26..1bc5aff1a7 100644
--- a/3rdparty/libjpeg-turbo/src/jdinput.c
+++ b/3rdparty/libjpeg-turbo/src/jdinput.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2016, 2018, D. R. Commander.
+ * Copyright (C) 2010, 2016, 2018, 2022, D. R. Commander.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -264,7 +264,7 @@ latch_quant_tables(j_decompress_ptr cinfo)
     qtbl = (JQUANT_TBL *)
       (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   sizeof(JQUANT_TBL));
-    MEMCOPY(qtbl, cinfo->quant_tbl_ptrs[qtblno], sizeof(JQUANT_TBL));
+    memcpy(qtbl, cinfo->quant_tbl_ptrs[qtblno], sizeof(JQUANT_TBL));
     compptr->quant_table = qtbl;
   }
 }
diff --git a/3rdparty/libjpeg-turbo/src/jdmarker.c b/3rdparty/libjpeg-turbo/src/jdmarker.c
index b964c3a1a6..f7eba615fd 100644
--- a/3rdparty/libjpeg-turbo/src/jdmarker.c
+++ b/3rdparty/libjpeg-turbo/src/jdmarker.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2012, 2015, D. R. Commander.
+ * Copyright (C) 2012, 2015, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -473,7 +473,7 @@ get_dht(j_decompress_ptr cinfo)
     for (i = 0; i < count; i++)
       INPUT_BYTE(cinfo, huffval[i], return FALSE);
 
-    MEMZERO(&huffval[count], (256 - count) * sizeof(UINT8));
+    memset(&huffval[count], 0, (256 - count) * sizeof(UINT8));
 
     length -= count;
 
@@ -491,8 +491,8 @@ get_dht(j_decompress_ptr cinfo)
     if (*htblptr == NULL)
       *htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo);
 
-    MEMCOPY((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
-    MEMCOPY((*htblptr)->huffval, huffval, sizeof((*htblptr)->huffval));
+    memcpy((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
+    memcpy((*htblptr)->huffval, huffval, sizeof((*htblptr)->huffval));
   }
 
   if (length != 0)
diff --git a/3rdparty/libjpeg-turbo/src/jdmaster.c b/3rdparty/libjpeg-turbo/src/jdmaster.c
index cbc8774b1f..a3690bf560 100644
--- a/3rdparty/libjpeg-turbo/src/jdmaster.c
+++ b/3rdparty/libjpeg-turbo/src/jdmaster.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 2002-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2016, 2019, D. R. Commander.
+ * Copyright (C) 2009-2011, 2016, 2019, 2022, D. R. Commander.
  * Copyright (C) 2013, Linaro Limited.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
@@ -417,7 +417,7 @@ prepare_range_limit_table(j_decompress_ptr cinfo)
   table += (MAXJSAMPLE + 1);    /* allow negative subscripts of simple table */
   cinfo->sample_range_limit = table;
   /* First segment of "simple" table: limit[x] = 0 for x < 0 */
-  MEMZERO(table - (MAXJSAMPLE + 1), (MAXJSAMPLE + 1) * sizeof(JSAMPLE));
+  memset(table - (MAXJSAMPLE + 1), 0, (MAXJSAMPLE + 1) * sizeof(JSAMPLE));
   /* Main part of "simple" table: limit[x] = x */
   for (i = 0; i <= MAXJSAMPLE; i++)
     table[i] = (JSAMPLE)i;
@@ -426,10 +426,10 @@ prepare_range_limit_table(j_decompress_ptr cinfo)
   for (i = CENTERJSAMPLE; i < 2 * (MAXJSAMPLE + 1); i++)
     table[i] = MAXJSAMPLE;
   /* Second half of post-IDCT table */
-  MEMZERO(table + (2 * (MAXJSAMPLE + 1)),
-          (2 * (MAXJSAMPLE + 1) - CENTERJSAMPLE) * sizeof(JSAMPLE));
-  MEMCOPY(table + (4 * (MAXJSAMPLE + 1) - CENTERJSAMPLE),
-          cinfo->sample_range_limit, CENTERJSAMPLE * sizeof(JSAMPLE));
+  memset(table + (2 * (MAXJSAMPLE + 1)), 0,
+         (2 * (MAXJSAMPLE + 1) - CENTERJSAMPLE) * sizeof(JSAMPLE));
+  memcpy(table + (4 * (MAXJSAMPLE + 1) - CENTERJSAMPLE),
+         cinfo->sample_range_limit, CENTERJSAMPLE * sizeof(JSAMPLE));
 }
 
 
diff --git a/3rdparty/libjpeg-turbo/src/jdphuff.c b/3rdparty/libjpeg-turbo/src/jdphuff.c
index c6d82ca14b..9680ebcbd0 100644
--- a/3rdparty/libjpeg-turbo/src/jdphuff.c
+++ b/3rdparty/libjpeg-turbo/src/jdphuff.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015-2016, 2018-2021, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018-2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -578,9 +578,9 @@ decode_mcu_AC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
             if (GET_BITS(1)) {
               if ((*thiscoef & p1) == 0) { /* do nothing if already set it */
                 if (*thiscoef >= 0)
-                  *thiscoef += p1;
+                  *thiscoef += (JCOEF)p1;
                 else
-                  *thiscoef += m1;
+                  *thiscoef += (JCOEF)m1;
               }
             }
           } else {
@@ -612,9 +612,9 @@ decode_mcu_AC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
           if (GET_BITS(1)) {
             if ((*thiscoef & p1) == 0) { /* do nothing if already changed it */
               if (*thiscoef >= 0)
-                *thiscoef += p1;
+                *thiscoef += (JCOEF)p1;
               else
-                *thiscoef += m1;
+                *thiscoef += (JCOEF)m1;
             }
           }
         }
diff --git a/3rdparty/libjpeg-turbo/src/jerror.c b/3rdparty/libjpeg-turbo/src/jerror.c
index 936c4f5d80..d544702937 100644
--- a/3rdparty/libjpeg-turbo/src/jerror.c
+++ b/3rdparty/libjpeg-turbo/src/jerror.c
@@ -3,8 +3,8 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1998, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -189,13 +189,13 @@ format_message(j_common_ptr cinfo, char *buffer)
 
   /* Format the message into the passed buffer */
   if (isstring)
-    sprintf(buffer, msgtext, err->msg_parm.s);
+    snprintf(buffer, JMSG_LENGTH_MAX, msgtext, err->msg_parm.s);
   else
-    sprintf(buffer, msgtext,
-            err->msg_parm.i[0], err->msg_parm.i[1],
-            err->msg_parm.i[2], err->msg_parm.i[3],
-            err->msg_parm.i[4], err->msg_parm.i[5],
-            err->msg_parm.i[6], err->msg_parm.i[7]);
+    snprintf(buffer, JMSG_LENGTH_MAX, msgtext,
+             err->msg_parm.i[0], err->msg_parm.i[1],
+             err->msg_parm.i[2], err->msg_parm.i[3],
+             err->msg_parm.i[4], err->msg_parm.i[5],
+             err->msg_parm.i[6], err->msg_parm.i[7]);
 }
 
 
diff --git a/3rdparty/libjpeg-turbo/src/jerror.h b/3rdparty/libjpeg-turbo/src/jerror.h
index 4476df2c93..eb44a1140a 100644
--- a/3rdparty/libjpeg-turbo/src/jerror.h
+++ b/3rdparty/libjpeg-turbo/src/jerror.h
@@ -5,7 +5,7 @@
  * Copyright (C) 1994-1997, Thomas G. Lane.
  * Modified 1997-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2014, 2017, D. R. Commander.
+ * Copyright (C) 2014, 2017, 2021-2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -103,7 +103,7 @@ JMESSAGE(JERR_MISMATCHED_QUANT_TABLE,
          "Cannot transcode due to multiple use of quantization table %d")
 JMESSAGE(JERR_MISSING_DATA, "Scan script does not transmit all data")
 JMESSAGE(JERR_MODE_CHANGE, "Invalid color quantization mode change")
-JMESSAGE(JERR_NOTIMPL, "Not implemented yet")
+JMESSAGE(JERR_NOTIMPL, "Requested features are incompatible")
 JMESSAGE(JERR_NOT_COMPILED, "Requested feature was omitted at compile time")
 #if JPEG_LIB_VERSION >= 70
 JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined")
@@ -268,6 +268,7 @@ JMESSAGE(JERR_BAD_DROP_SAMPLING,
 #define ERREXITS(cinfo, code, str) \
   ((cinfo)->err->msg_code = (code), \
    strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
+   (cinfo)->err->msg_parm.s[JMSG_STR_PARM_MAX - 1] = '\0', \
    (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
 
 #define MAKESTMT(stuff)         do { stuff } while (0)
@@ -324,6 +325,7 @@ JMESSAGE(JERR_BAD_DROP_SAMPLING,
 #define TRACEMSS(cinfo, lvl, code, str) \
   ((cinfo)->err->msg_code = (code), \
    strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
+   (cinfo)->err->msg_parm.s[JMSG_STR_PARM_MAX - 1] = '\0', \
    (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
 
 #endif /* JERROR_H */
diff --git a/3rdparty/libjpeg-turbo/src/jinclude.h b/3rdparty/libjpeg-turbo/src/jinclude.h
index c1bcf7d9da..120614b25c 100644
--- a/3rdparty/libjpeg-turbo/src/jinclude.h
+++ b/3rdparty/libjpeg-turbo/src/jinclude.h
@@ -3,8 +3,8 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1994, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -17,72 +17,117 @@
  * JPEG library.  Most applications need only include jpeglib.h.
  */
 
+#ifndef __JINCLUDE_H__
+#define __JINCLUDE_H__
 
 /* Include auto-config file to find out which system include files we need. */
 
 #include "jconfig.h"            /* auto configuration options */
+#include "jconfigint.h"
 #define JCONFIG_INCLUDED        /* so that jpeglib.h doesn't do it again */
 
 /*
- * We need the NULL macro and size_t typedef.
- * On an ANSI-conforming system it is sufficient to include <stddef.h>.
- * Otherwise, we get them from <stdlib.h> or <stdio.h>; we may have to
- * pull in <sys/types.h> as well.
  * Note that the core JPEG library does not require <stdio.h>;
  * only the default error handler and data source/destination modules do.
  * But we must pull it in because of the references to FILE in jpeglib.h.
  * You can remove those references if you want to compile without <stdio.h>.
  */
 
-#ifdef HAVE_STDDEF_H
 #include <stddef.h>
-#endif
-
-#ifdef HAVE_STDLIB_H
 #include <stdlib.h>
-#endif
-
-#ifdef NEED_SYS_TYPES_H
-#include <sys/types.h>
-#endif
-
 #include <stdio.h>
+#include <string.h>
 
 /*
- * We need memory copying and zeroing functions, plus strncpy().
- * ANSI and System V implementations declare these in <string.h>.
- * BSD doesn't have the mem() functions, but it does have bcopy()/bzero().
- * Some systems may declare memset and memcpy in <memory.h>.
- *
- * NOTE: we assume the size parameters to these functions are of type size_t.
- * Change the casts in these macros if not!
+ * These macros/inline functions facilitate using Microsoft's "safe string"
+ * functions with Visual Studio builds without the need to scatter #ifdefs
+ * throughout the code base.
  */
 
-#ifdef NEED_BSD_STRINGS
 
-#include <strings.h>
-#define MEMZERO(target, size) \
-  bzero((void *)(target), (size_t)(size))
-#define MEMCOPY(dest, src, size) \
-  bcopy((const void *)(src), (void *)(dest), (size_t)(size))
+#ifndef NO_GETENV
 
-#else /* not BSD, assume ANSI/SysV string lib */
+#ifdef _MSC_VER
 
-#include <string.h>
-#define MEMZERO(target, size) \
-  memset((void *)(target), 0, (size_t)(size))
-#define MEMCOPY(dest, src, size) \
-  memcpy((void *)(dest), (const void *)(src), (size_t)(size))
+static INLINE int GETENV_S(char *buffer, size_t buffer_size, const char *name)
+{
+  size_t required_size;
 
-#endif
+  return (int)getenv_s(&required_size, buffer, buffer_size, name);
+}
 
-/*
- * The modules that use fread() and fwrite() always invoke them through
- * these macros.  On some systems you may need to twiddle the argument casts.
- * CAUTION: argument order is different from underlying functions!
+#else /* _MSC_VER */
+
+#include <errno.h>
+
+/* This provides a similar interface to the Microsoft/C11 getenv_s() function,
+ * but other than parameter validation, it has no advantages over getenv().
+ */
+
+static INLINE int GETENV_S(char *buffer, size_t buffer_size, const char *name)
+{
+  char *env;
+
+  if (!buffer) {
+    if (buffer_size == 0)
+      return 0;
+    else
+      return (errno = EINVAL);
+  }
+  if (buffer_size == 0)
+    return (errno = EINVAL);
+  if (!name) {
+    *buffer = 0;
+    return 0;
+  }
+
+  env = getenv(name);
+  if (!env)
+  {
+    *buffer = 0;
+    return 0;
+  }
+
+  if (strlen(env) + 1 > buffer_size) {
+    *buffer = 0;
+    return ERANGE;
+  }
+
+  strncpy(buffer, env, buffer_size);
+
+  return 0;
+}
+
+#endif /* _MSC_VER */
+
+#endif /* NO_GETENV */
+
+
+#ifndef NO_PUTENV
+
+#ifdef _WIN32
+
+#define PUTENV_S(name, value)  _putenv_s(name, value)
+
+#else
+
+/* This provides a similar interface to the Microsoft _putenv_s() function, but
+ * other than parameter validation, it has no advantages over setenv().
  */
 
-#define JFREAD(file, buf, sizeofbuf) \
-  ((size_t)fread((void *)(buf), (size_t)1, (size_t)(sizeofbuf), (file)))
-#define JFWRITE(file, buf, sizeofbuf) \
-  ((size_t)fwrite((const void *)(buf), (size_t)1, (size_t)(sizeofbuf), (file)))
+static INLINE int PUTENV_S(const char *name, const char *value)
+{
+  if (!name || !value)
+    return (errno = EINVAL);
+
+  setenv(name, value, 1);
+
+  return errno;
+}
+
+#endif /* _WIN32 */
+
+#endif /* NO_PUTENV */
+
+
+#endif /* JINCLUDE_H */
diff --git a/3rdparty/libjpeg-turbo/src/jmemmgr.c b/3rdparty/libjpeg-turbo/src/jmemmgr.c
index 70b8ec0c49..8f5a4ab1c7 100644
--- a/3rdparty/libjpeg-turbo/src/jmemmgr.c
+++ b/3rdparty/libjpeg-turbo/src/jmemmgr.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2016, 2021, D. R. Commander.
+ * Copyright (C) 2016, 2021-2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -37,12 +37,6 @@
 #endif
 #include <limits.h>
 
-#ifndef NO_GETENV
-#ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare getenv() */
-extern char *getenv(const char *name);
-#endif
-#endif
-
 
 LOCAL(size_t)
 round_up_pow2(size_t a, size_t b)
@@ -1162,12 +1156,16 @@ jinit_memory_mgr(j_common_ptr cinfo)
    */
 #ifndef NO_GETENV
   {
-    char *memenv;
+    char memenv[30] = { 0 };
 
-    if ((memenv = getenv("JPEGMEM")) != NULL) {
+    if (!GETENV_S(memenv, 30, "JPEGMEM") && strlen(memenv) > 0) {
       char ch = 'x';
 
+#ifdef _MSC_VER
+      if (sscanf_s(memenv, "%ld%c", &max_to_use, &ch, 1) > 0) {
+#else
       if (sscanf(memenv, "%ld%c", &max_to_use, &ch) > 0) {
+#endif
         if (ch == 'm' || ch == 'M')
           max_to_use *= 1000L;
         mem->pub.max_memory_to_use = max_to_use * 1000L;
diff --git a/3rdparty/libjpeg-turbo/src/jmemnobs.c b/3rdparty/libjpeg-turbo/src/jmemnobs.c
index 089be8f500..cd6571ba1c 100644
--- a/3rdparty/libjpeg-turbo/src/jmemnobs.c
+++ b/3rdparty/libjpeg-turbo/src/jmemnobs.c
@@ -22,11 +22,6 @@
 #include "jpeglib.h"
 #include "jmemsys.h"            /* import the system-dependent declarations */
 
-#ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare malloc(),free() */
-extern void *malloc(size_t size);
-extern void free(void *ptr);
-#endif
-
 
 /*
  * Memory allocation and freeing are controlled by the regular library
diff --git a/3rdparty/libjpeg-turbo/src/jmorecfg.h b/3rdparty/libjpeg-turbo/src/jmorecfg.h
index fb3a9cf411..b33a991914 100644
--- a/3rdparty/libjpeg-turbo/src/jmorecfg.h
+++ b/3rdparty/libjpeg-turbo/src/jmorecfg.h
@@ -100,11 +100,7 @@ typedef unsigned char UINT8;
 
 /* UINT16 must hold at least the values 0..65535. */
 
-#ifdef HAVE_UNSIGNED_SHORT
 typedef unsigned short UINT16;
-#else /* not HAVE_UNSIGNED_SHORT */
-typedef unsigned int UINT16;
-#endif /* HAVE_UNSIGNED_SHORT */
 
 /* INT16 must hold at least the values -32768..32767. */
 
diff --git a/3rdparty/libjpeg-turbo/src/jpegint.h b/3rdparty/libjpeg-turbo/src/jpegint.h
index 8c8534793a..6af9e2a179 100644
--- a/3rdparty/libjpeg-turbo/src/jpegint.h
+++ b/3rdparty/libjpeg-turbo/src/jpegint.h
@@ -373,12 +373,3 @@ extern const int jpeg_natural_order[]; /* zigzag coef order to natural order */
 
 /* Arithmetic coding probability estimation tables in jaricom.c */
 extern const JLONG jpeg_aritab[];
-
-/* Suppress undefined-structure complaints if necessary. */
-
-#ifdef INCOMPLETE_TYPES_BROKEN
-#ifndef AM_MEMORY_MANAGER       /* only jmemmgr.c defines these */
-struct jvirt_sarray_control { long dummy; };
-struct jvirt_barray_control { long dummy; };
-#endif
-#endif /* INCOMPLETE_TYPES_BROKEN */
diff --git a/3rdparty/libjpeg-turbo/src/jstdhuff.c b/3rdparty/libjpeg-turbo/src/jstdhuff.c
index 036d6495a5..345b513d4d 100644
--- a/3rdparty/libjpeg-turbo/src/jstdhuff.c
+++ b/3rdparty/libjpeg-turbo/src/jstdhuff.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2013, D. R. Commander.
+ * Copyright (C) 2013, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -29,7 +29,7 @@ add_huff_table(j_common_ptr cinfo, JHUFF_TBL **htblptr, const UINT8 *bits,
     return;
 
   /* Copy the number-of-symbols-of-each-code-length counts */
-  MEMCOPY((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
+  memcpy((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
 
   /* Validate the counts.  We do this here mainly so we can copy the right
    * number of symbols from the val[] array, without risking marching off
@@ -41,8 +41,9 @@ add_huff_table(j_common_ptr cinfo, JHUFF_TBL **htblptr, const UINT8 *bits,
   if (nsymbols < 1 || nsymbols > 256)
     ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
 
-  MEMCOPY((*htblptr)->huffval, val, nsymbols * sizeof(UINT8));
-  MEMZERO(&((*htblptr)->huffval[nsymbols]), (256 - nsymbols) * sizeof(UINT8));
+  memcpy((*htblptr)->huffval, val, nsymbols * sizeof(UINT8));
+  memset(&((*htblptr)->huffval[nsymbols]), 0,
+         (256 - nsymbols) * sizeof(UINT8));
 
   /* Initialize sent_table FALSE so table will be written to JPEG file. */
   (*htblptr)->sent_table = FALSE;
diff --git a/3rdparty/libjpeg-turbo/src/jutils.c b/3rdparty/libjpeg-turbo/src/jutils.c
index 5c5bb17dc5..d86271624a 100644
--- a/3rdparty/libjpeg-turbo/src/jutils.c
+++ b/3rdparty/libjpeg-turbo/src/jutils.c
@@ -3,8 +3,8 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code
- * relevant to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -110,7 +110,7 @@ jcopy_sample_rows(JSAMPARRAY input_array, int source_row,
   for (row = num_rows; row > 0; row--) {
     inptr = *input_array++;
     outptr = *output_array++;
-    MEMCOPY(outptr, inptr, count);
+    memcpy(outptr, inptr, count);
   }
 }
 
@@ -120,7 +120,7 @@ jcopy_block_row(JBLOCKROW input_row, JBLOCKROW output_row,
                 JDIMENSION num_blocks)
 /* Copy a row of coefficient blocks from one place to another. */
 {
-  MEMCOPY(output_row, input_row, num_blocks * (DCTSIZE2 * sizeof(JCOEF)));
+  memcpy(output_row, input_row, num_blocks * (DCTSIZE2 * sizeof(JCOEF)));
 }
 
 
@@ -129,5 +129,5 @@ jzero_far(void *target, size_t bytestozero)
 /* Zero out a chunk of memory. */
 /* This might be sample-array data, block-array data, or alloc_large data. */
 {
-  MEMZERO(target, bytestozero);
+  memset(target, 0, bytestozero);
 }
diff --git a/3rdparty/libjpeg-turbo/src/jversion.h.in b/3rdparty/libjpeg-turbo/src/jversion.h.in
new file mode 100644
index 0000000000..dca4f08fdb
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/jversion.h.in
@@ -0,0 +1,54 @@
+/*
+ * jversion.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-2020, Thomas G. Lane, Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2010, 2012-2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains software version identification.
+ */
+
+
+#if JPEG_LIB_VERSION >= 80
+
+#define JVERSION        "8d  15-Jan-2012"
+
+#elif JPEG_LIB_VERSION >= 70
+
+#define JVERSION        "7  27-Jun-2009"
+
+#else
+
+#define JVERSION        "6b  27-Mar-1998"
+
+#endif
+
+/*
+ * NOTE: It is our convention to place the authors in the following order:
+ * - libjpeg-turbo authors (2009-) in descending order of the date of their
+ *   most recent contribution to the project, then in ascending order of the
+ *   date of their first contribution to the project, then in alphabetical
+ *   order
+ * - Upstream authors in descending order of the date of the first inclusion of
+ *   their code
+ */
+
+#define JCOPYRIGHT \
+  "Copyright (C) 2009-2022 D. R. Commander\n" \
+  "Copyright (C) 2015, 2020 Google, Inc.\n" \
+  "Copyright (C) 2019-2020 Arm Limited\n" \
+  "Copyright (C) 2015-2016, 2018 Matthieu Darbois\n" \
+  "Copyright (C) 2011-2016 Siarhei Siamashka\n" \
+  "Copyright (C) 2015 Intel Corporation\n" \
+  "Copyright (C) 2013-2014 Linaro Limited\n" \
+  "Copyright (C) 2013-2014 MIPS Technologies, Inc.\n" \
+  "Copyright (C) 2009, 2012 Pierre Ossman for Cendio AB\n" \
+  "Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \
+  "Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
+  "Copyright (C) 1991-2020 Thomas G. Lane, Guido Vollbeding"
+
+#define JCOPYRIGHT_SHORT \
+  "Copyright (C) @COPYRIGHT_YEAR@ The libjpeg-turbo Project and many others"
diff --git a/3rdparty/libjpeg-turbo/src/simd/CMakeLists.txt b/3rdparty/libjpeg-turbo/src/simd/CMakeLists.txt
new file mode 100644
index 0000000000..8521e42b44
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/CMakeLists.txt
@@ -0,0 +1,540 @@
+macro(simd_fail message)
+  if(REQUIRE_SIMD)
+    message(FATAL_ERROR "${message}.")
+  else()
+    message(WARNING "${message}.  Performance will suffer.")
+    set(WITH_SIMD 0 PARENT_SCOPE)
+  endif()
+endmacro()
+
+
+###############################################################################
+# x86[-64] (NASM)
+###############################################################################
+
+if(CPU_TYPE STREQUAL "x86_64" OR CPU_TYPE STREQUAL "i386")
+
+set(CMAKE_ASM_NASM_FLAGS_DEBUG_INIT "-g")
+set(CMAKE_ASM_NASM_FLAGS_RELWITHDEBINFO_INIT "-g")
+
+# Allow the location of the NASM executable to be specified using the ASM_NASM
+# environment variable.  This should happen automatically, but unfortunately
+# enable_language(ASM_NASM) doesn't parse the ASM_NASM environment variable
+# until after CMAKE_ASM_NASM_COMPILER has been populated with the results of
+# searching for NASM or Yasm in the PATH.
+if(NOT DEFINED CMAKE_ASM_NASM_COMPILER AND DEFINED ENV{ASM_NASM})
+  set(CMAKE_ASM_NASM_COMPILER $ENV{ASM_NASM})
+endif()
+
+if(CPU_TYPE STREQUAL "x86_64")
+  if(CYGWIN)
+    set(CMAKE_ASM_NASM_OBJECT_FORMAT win64)
+  endif()
+  if(CMAKE_C_COMPILER_ABI MATCHES "ELF X32")
+    set(CMAKE_ASM_NASM_OBJECT_FORMAT elfx32)
+  endif()
+elseif(CPU_TYPE STREQUAL "i386")
+  if(BORLAND)
+    set(CMAKE_ASM_NASM_OBJECT_FORMAT obj)
+  elseif(CYGWIN)
+    set(CMAKE_ASM_NASM_OBJECT_FORMAT win32)
+  endif()
+endif()
+
+if(NOT REQUIRE_SIMD)
+  include(CheckLanguage)
+  check_language(ASM_NASM)
+  if(NOT CMAKE_ASM_NASM_COMPILER)
+    simd_fail("SIMD extensions disabled: could not find NASM compiler")
+    return()
+  endif()
+endif()
+enable_language(ASM_NASM)
+message(STATUS "CMAKE_ASM_NASM_COMPILER = ${CMAKE_ASM_NASM_COMPILER}")
+
+if(CMAKE_ASM_NASM_OBJECT_FORMAT MATCHES "^macho")
+  set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DMACHO")
+elseif(CMAKE_ASM_NASM_OBJECT_FORMAT MATCHES "^elf")
+  set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DELF")
+  set(CMAKE_ASM_NASM_DEBUG_FORMAT "dwarf2")
+endif()
+if(CPU_TYPE STREQUAL "x86_64")
+  if(WIN32 OR CYGWIN)
+    set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DWIN64")
+  endif()
+  set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -D__x86_64__")
+elseif(CPU_TYPE STREQUAL "i386")
+  if(BORLAND)
+    set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DOBJ32")
+  elseif(WIN32 OR CYGWIN)
+    set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DWIN32")
+  endif()
+endif()
+
+message(STATUS "CMAKE_ASM_NASM_OBJECT_FORMAT = ${CMAKE_ASM_NASM_OBJECT_FORMAT}")
+
+if(NOT CMAKE_ASM_NASM_OBJECT_FORMAT)
+  simd_fail("SIMD extensions disabled: could not determine NASM object format")
+  return()
+endif()
+
+get_filename_component(CMAKE_ASM_NASM_COMPILER_TYPE
+  "${CMAKE_ASM_NASM_COMPILER}" NAME_WE)
+if(CMAKE_ASM_NASM_COMPILER_TYPE MATCHES "yasm")
+  foreach(var CMAKE_ASM_NASM_FLAGS_DEBUG CMAKE_ASM_NASM_FLAGS_RELWITHDEBINFO)
+    if(${var} STREQUAL "-g")
+      if(CMAKE_ASM_NASM_DEBUG_FORMAT)
+        set_property(CACHE ${var} PROPERTY VALUE "-g ${CMAKE_ASM_NASM_DEBUG_FORMAT}")
+      else()
+        set_property(CACHE ${var} PROPERTY VALUE "")
+      endif()
+    endif()
+  endforeach()
+endif()
+
+if(NOT WIN32 AND (CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED))
+  set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DPIC")
+endif()
+
+string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
+set(EFFECTIVE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} ${CMAKE_ASM_NASM_FLAGS_${CMAKE_BUILD_TYPE_UC}}")
+message(STATUS "CMAKE_ASM_NASM_FLAGS = ${EFFECTIVE_ASM_NASM_FLAGS}")
+
+set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -I\"${CMAKE_CURRENT_SOURCE_DIR}/nasm/\" -I\"${CMAKE_CURRENT_SOURCE_DIR}/${CPU_TYPE}/\"")
+
+set(GREP grep)
+if(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
+  set(GREP ggrep)
+endif()
+add_custom_target(jsimdcfg COMMAND
+  ${CMAKE_C_COMPILER} -E -I${CMAKE_BINARY_DIR} -I${CMAKE_CURRENT_BINARY_DIR}
+    -I${CMAKE_CURRENT_SOURCE_DIR}
+    ${CMAKE_CURRENT_SOURCE_DIR}/nasm/jsimdcfg.inc.h |
+  ${GREP} -E '^[\;%]|^\ %' | sed 's%_cpp_protection_%%' |
+  sed 's@% define@%define@g' >${CMAKE_CURRENT_SOURCE_DIR}/nasm/jsimdcfg.inc)
+
+if(CPU_TYPE STREQUAL "x86_64")
+  set(SIMD_SOURCES x86_64/jsimdcpu.asm x86_64/jfdctflt-sse.asm
+    x86_64/jccolor-sse2.asm x86_64/jcgray-sse2.asm x86_64/jchuff-sse2.asm
+    x86_64/jcphuff-sse2.asm x86_64/jcsample-sse2.asm x86_64/jdcolor-sse2.asm
+    x86_64/jdmerge-sse2.asm x86_64/jdsample-sse2.asm x86_64/jfdctfst-sse2.asm
+    x86_64/jfdctint-sse2.asm x86_64/jidctflt-sse2.asm x86_64/jidctfst-sse2.asm
+    x86_64/jidctint-sse2.asm x86_64/jidctred-sse2.asm x86_64/jquantf-sse2.asm
+    x86_64/jquanti-sse2.asm
+    x86_64/jccolor-avx2.asm x86_64/jcgray-avx2.asm x86_64/jcsample-avx2.asm
+    x86_64/jdcolor-avx2.asm x86_64/jdmerge-avx2.asm x86_64/jdsample-avx2.asm
+    x86_64/jfdctint-avx2.asm x86_64/jidctint-avx2.asm x86_64/jquanti-avx2.asm)
+else()
+  set(SIMD_SOURCES i386/jsimdcpu.asm i386/jfdctflt-3dn.asm
+    i386/jidctflt-3dn.asm i386/jquant-3dn.asm
+    i386/jccolor-mmx.asm i386/jcgray-mmx.asm i386/jcsample-mmx.asm
+    i386/jdcolor-mmx.asm i386/jdmerge-mmx.asm i386/jdsample-mmx.asm
+    i386/jfdctfst-mmx.asm i386/jfdctint-mmx.asm i386/jidctfst-mmx.asm
+    i386/jidctint-mmx.asm i386/jidctred-mmx.asm i386/jquant-mmx.asm
+    i386/jfdctflt-sse.asm i386/jidctflt-sse.asm i386/jquant-sse.asm
+    i386/jccolor-sse2.asm i386/jcgray-sse2.asm i386/jchuff-sse2.asm
+    i386/jcphuff-sse2.asm i386/jcsample-sse2.asm i386/jdcolor-sse2.asm
+    i386/jdmerge-sse2.asm i386/jdsample-sse2.asm i386/jfdctfst-sse2.asm
+    i386/jfdctint-sse2.asm i386/jidctflt-sse2.asm i386/jidctfst-sse2.asm
+    i386/jidctint-sse2.asm i386/jidctred-sse2.asm i386/jquantf-sse2.asm
+    i386/jquanti-sse2.asm
+    i386/jccolor-avx2.asm i386/jcgray-avx2.asm i386/jcsample-avx2.asm
+    i386/jdcolor-avx2.asm i386/jdmerge-avx2.asm i386/jdsample-avx2.asm
+    i386/jfdctint-avx2.asm i386/jidctint-avx2.asm i386/jquanti-avx2.asm)
+endif()
+
+if(MSVC_IDE)
+  set(OBJDIR "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}")
+  string(REGEX REPLACE " " ";" CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS}")
+elseif(XCODE)
+  set(OBJDIR "${CMAKE_CURRENT_BINARY_DIR}")
+  string(REGEX REPLACE " " ";" CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS}")
+endif()
+
+file(GLOB INC_FILES nasm/*.inc)
+
+foreach(file ${SIMD_SOURCES})
+  set(OBJECT_DEPENDS "")
+  if(${file} MATCHES jccolor)
+    string(REGEX REPLACE "jccolor" "jccolext" DEPFILE ${file})
+    set(OBJECT_DEPENDS ${OBJECT_DEPENDS}
+      ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE})
+  endif()
+  if(${file} MATCHES jcgray)
+    string(REGEX REPLACE "jcgray" "jcgryext" DEPFILE ${file})
+    set(OBJECT_DEPENDS ${OBJECT_DEPENDS}
+      ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE})
+  endif()
+  if(${file} MATCHES jdcolor)
+    string(REGEX REPLACE "jdcolor" "jdcolext" DEPFILE ${file})
+    set(OBJECT_DEPENDS ${OBJECT_DEPENDS}
+      ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE})
+  endif()
+  if(${file} MATCHES jdmerge)
+    string(REGEX REPLACE "jdmerge" "jdmrgext" DEPFILE ${file})
+    set(OBJECT_DEPENDS ${OBJECT_DEPENDS}
+      ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE})
+  endif()
+  set(OBJECT_DEPENDS ${OBJECT_DEPENDS} ${INC_FILES})
+  if(MSVC_IDE OR XCODE)
+    # The CMake Visual Studio generators do not work properly with the ASM_NASM
+    # language, so we have to go rogue here and use a custom command like we
+    # did in prior versions of libjpeg-turbo.  (This is why we can't have nice
+    # things.)
+    string(REGEX REPLACE "${CPU_TYPE}/" "" filename ${file})
+    set(SIMD_OBJ ${OBJDIR}/${filename}${CMAKE_C_OUTPUT_EXTENSION})
+    add_custom_command(OUTPUT ${SIMD_OBJ} DEPENDS ${file} ${OBJECT_DEPENDS}
+      COMMAND ${CMAKE_ASM_NASM_COMPILER} -f${CMAKE_ASM_NASM_OBJECT_FORMAT}
+        ${CMAKE_ASM_NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/${file}
+        -o${SIMD_OBJ})
+    set(SIMD_OBJS ${SIMD_OBJS} ${SIMD_OBJ})
+  else()
+    set_source_files_properties(${file} PROPERTIES OBJECT_DEPENDS
+      "${OBJECT_DEPENDS}")
+  endif()
+endforeach()
+
+if(MSVC_IDE OR XCODE)
+  set(SIMD_OBJS ${SIMD_OBJS} PARENT_SCOPE)
+  add_library(simd OBJECT ${CPU_TYPE}/jsimd.c)
+  add_custom_target(simd-objs DEPENDS ${SIMD_OBJS})
+  add_dependencies(simd simd-objs)
+else()
+  add_library(simd OBJECT ${SIMD_SOURCES} ${CPU_TYPE}/jsimd.c)
+endif()
+if(NOT WIN32 AND (CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED))
+  set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
+endif()
+
+
+###############################################################################
+# Arm (Intrinsics or GAS)
+###############################################################################
+
+elseif(CPU_TYPE STREQUAL "arm64" OR CPU_TYPE STREQUAL "arm")
+
+# If Neon instructions are not explicitly enabled at compile time (e.g. using
+# -mfpu=neon) with an AArch32 Linux or Android build, then the AArch32 SIMD
+# dispatcher will parse /proc/cpuinfo to determine whether the Neon SIMD
+# extensions can be enabled at run time.  In order to support all AArch32 CPUs
+# using the same code base, i.e. to support run-time FPU and Neon
+# auto-detection, it is necessary to compile the scalar C source code using
+# -mfloat-abi=soft (which is usually the default) but compile the intrinsics
+# implementation of the Neon SIMD extensions using -mfloat-abi=softfp.  The
+# following test determines whether -mfloat-abi=softfp should be explicitly
+# added to the compile flags for the intrinsics implementation of the Neon SIMD
+# extensions.
+if(BITS EQUAL 32)
+  check_c_source_compiles("
+    #if defined(__ARM_NEON__) || (!defined(__linux__) && !defined(ANDROID) && !defined(__ANDROID__))
+    #error \"Neon run-time auto-detection will not be used\"
+    #endif
+    #if __ARM_PCS_VFP == 1
+    #error \"float ABI = hard\"
+    #endif
+    #if __SOFTFP__ != 1
+    #error \"float ABI = softfp\"
+    #endif
+    int main(void) { return 0; }" NEED_SOFTFP_FOR_INTRINSICS)
+  if(NEED_SOFTFP_FOR_INTRINSICS)
+    set(SOFTFP_FLAG -mfloat-abi=softfp)
+  endif()
+endif()
+
+if(BITS EQUAL 32)
+  set(CMAKE_REQUIRED_FLAGS "-mfpu=neon ${SOFTFP_FLAG}")
+  check_c_source_compiles("
+    #include <arm_neon.h>
+    int main(int argc, char **argv) {
+      uint16x8_t input = vdupq_n_u16((uint16_t)argc);
+      uint8x8_t output = vmovn_u16(input);
+      return (int)output[0];
+    }" HAVE_NEON)
+  if(NOT HAVE_NEON)
+    simd_fail("SIMD extensions not available for this architecture")
+    return()
+  endif()
+endif()
+check_c_source_compiles("
+  #include <arm_neon.h>
+  int main(int argc, char **argv) {
+    int16_t input[] = {
+      (int16_t)argc, (int16_t)argc, (int16_t)argc, (int16_t)argc,
+      (int16_t)argc, (int16_t)argc, (int16_t)argc, (int16_t)argc,
+      (int16_t)argc, (int16_t)argc, (int16_t)argc, (int16_t)argc
+    };
+    int16x4x3_t output = vld1_s16_x3(input);
+    vst3_s16(input, output);
+    return (int)input[0];
+  }" HAVE_VLD1_S16_X3)
+check_c_source_compiles("
+  #include <arm_neon.h>
+  int main(int argc, char **argv) {
+    uint16_t input[] = {
+      (uint16_t)argc, (uint16_t)argc, (uint16_t)argc, (uint16_t)argc,
+      (uint16_t)argc, (uint16_t)argc, (uint16_t)argc, (uint16_t)argc
+    };
+    uint16x4x2_t output = vld1_u16_x2(input);
+    vst2_u16(input, output);
+    return (int)input[0];
+  }" HAVE_VLD1_U16_X2)
+check_c_source_compiles("
+  #include <arm_neon.h>
+  int main(int argc, char **argv) {
+    uint8_t input[] = {
+      (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
+      (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
+      (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
+      (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
+      (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
+      (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
+      (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
+      (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
+      (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
+      (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
+      (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
+      (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
+      (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
+      (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
+      (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
+      (uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc
+    };
+    uint8x16x4_t output = vld1q_u8_x4(input);
+    vst4q_u8(input, output);
+    return (int)input[0];
+  }" HAVE_VLD1Q_U8_X4)
+if(BITS EQUAL 32)
+  unset(CMAKE_REQUIRED_FLAGS)
+endif()
+configure_file(arm/neon-compat.h.in arm/neon-compat.h @ONLY)
+include_directories(${CMAKE_CURRENT_BINARY_DIR}/arm)
+
+# GCC 11 and earlier and some older versions of Clang do not have a full or
+# optimal set of Neon intrinsics, so for performance reasons, when using those
+# compilers, we default to using the older GAS implementation of the Neon SIMD
+# extensions for certain algorithms.  The presence or absence of the three
+# intrinsics we tested above is a reasonable proxy for this, except with GCC 10
+# and 11.
+if((HAVE_VLD1_S16_X3 AND HAVE_VLD1_U16_X2 AND HAVE_VLD1Q_U8_X4 AND
+  (NOT CMAKE_COMPILER_IS_GNUCC OR
+  CMAKE_C_COMPILER_VERSION VERSION_EQUAL 12.0.0 OR
+  CMAKE_C_COMPILER_VERSION VERSION_GREATER 12.0.0)))
+  set(DEFAULT_NEON_INTRINSICS 1)
+else()
+  set(DEFAULT_NEON_INTRINSICS 0)
+endif()
+option(NEON_INTRINSICS
+  "Because GCC (as of this writing) and some older versions of Clang do not have a full or optimal set of Neon intrinsics, for performance reasons, the default when building libjpeg-turbo with those compilers is to continue using the older GAS implementation of the Neon SIMD extensions for certain algorithms.  Setting this option forces the full Neon intrinsics implementation to be used with all compilers.  Unsetting this option forces the hybrid GAS/intrinsics implementation to be used with all compilers."
+  ${DEFAULT_NEON_INTRINSICS})
+if(NOT NEON_INTRINSICS)
+  enable_language(ASM)
+
+  set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_ASM_FLAGS}")
+
+  # Test whether gas-preprocessor.pl would be needed to build the GAS
+  # implementation of the Neon SIMD extensions.  If so, then automatically
+  # enable the full Neon intrinsics implementation.
+  if(CPU_TYPE STREQUAL "arm")
+    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/gastest.S "
+      .text
+      .fpu neon
+      .arch armv7a
+      .object_arch armv4
+      .arm
+      pld [r0]
+      vmovn.u16 d0, q0")
+  else()
+    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/gastest.S "
+      .text
+      MYVAR .req x0
+      movi v0.16b, #100
+      mov MYVAR, #100
+      .unreq MYVAR")
+  endif()
+  separate_arguments(CMAKE_ASM_FLAGS_SEP UNIX_COMMAND "${CMAKE_ASM_FLAGS}")
+  execute_process(COMMAND ${CMAKE_ASM_COMPILER} ${CMAKE_ASM_FLAGS_SEP}
+      -x assembler-with-cpp -c ${CMAKE_CURRENT_BINARY_DIR}/gastest.S
+    RESULT_VARIABLE RESULT OUTPUT_VARIABLE OUTPUT ERROR_VARIABLE ERROR)
+  if(NOT RESULT EQUAL 0)
+    message(WARNING "GAS appears to be broken.  Using the full Neon SIMD intrinsics implementation.")
+    set(NEON_INTRINSICS 1 CACHE INTERNAL "" FORCE)
+  endif()
+endif()
+boolean_number(NEON_INTRINSICS PARENT_SCOPE)
+if(NEON_INTRINSICS)
+  add_definitions(-DNEON_INTRINSICS)
+  message(STATUS "Use full Neon SIMD intrinsics implementation (NEON_INTRINSICS = ${NEON_INTRINSICS})")
+else()
+  message(STATUS "Use partial Neon SIMD intrinsics implementation (NEON_INTRINSICS = ${NEON_INTRINSICS})")
+endif()
+
+set(SIMD_SOURCES arm/jcgray-neon.c arm/jcphuff-neon.c arm/jcsample-neon.c
+  arm/jdmerge-neon.c arm/jdsample-neon.c arm/jfdctfst-neon.c
+  arm/jidctred-neon.c arm/jquanti-neon.c)
+if(NEON_INTRINSICS)
+  set(SIMD_SOURCES ${SIMD_SOURCES} arm/jccolor-neon.c arm/jidctint-neon.c)
+endif()
+if(NEON_INTRINSICS OR BITS EQUAL 64)
+  set(SIMD_SOURCES ${SIMD_SOURCES} arm/jidctfst-neon.c)
+endif()
+if(NEON_INTRINSICS OR BITS EQUAL 32)
+  set(SIMD_SOURCES ${SIMD_SOURCES} arm/aarch${BITS}/jchuff-neon.c
+    arm/jdcolor-neon.c arm/jfdctint-neon.c)
+endif()
+if(BITS EQUAL 32)
+  set_source_files_properties(${SIMD_SOURCES} COMPILE_FLAGS "-mfpu=neon ${SOFTFP_FLAG}")
+endif()
+if(NOT NEON_INTRINSICS)
+  string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
+  set(EFFECTIVE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CMAKE_ASM_FLAGS_${CMAKE_BUILD_TYPE_UC}}")
+  message(STATUS "CMAKE_ASM_FLAGS = ${EFFECTIVE_ASM_FLAGS}")
+
+  set(SIMD_SOURCES ${SIMD_SOURCES} arm/aarch${BITS}/jsimd_neon.S)
+endif()
+
+add_library(simd OBJECT ${SIMD_SOURCES} arm/aarch${BITS}/jsimd.c)
+
+if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
+  set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
+endif()
+
+
+###############################################################################
+# MIPS (GAS)
+###############################################################################
+
+elseif(CPU_TYPE STREQUAL "mips" OR CPU_TYPE STREQUAL "mipsel")
+
+enable_language(ASM)
+
+string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
+set(EFFECTIVE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CMAKE_ASM_FLAGS_${CMAKE_BUILD_TYPE_UC}}")
+message(STATUS "CMAKE_ASM_FLAGS = ${EFFECTIVE_ASM_FLAGS}")
+
+set(CMAKE_REQUIRED_FLAGS -mdspr2)
+
+check_c_source_compiles("
+  #if !(defined(__mips__) && __mips_isa_rev >= 2)
+  #error MIPS DSPr2 is currently only available on MIPS32r2 platforms.
+  #endif
+  int main(void) {
+    int c = 0, a = 0, b = 0;
+    __asm__ __volatile__ (
+      \"precr.qb.ph %[c], %[a], %[b]\"
+      : [c] \"=r\" (c)
+      : [a] \"r\" (a), [b] \"r\" (b)
+    );
+    return c;
+  }" HAVE_DSPR2)
+
+unset(CMAKE_REQUIRED_FLAGS)
+
+if(NOT HAVE_DSPR2)
+  simd_fail("SIMD extensions not available for this CPU")
+  return()
+endif()
+
+add_library(simd OBJECT mips/jsimd_dspr2.S mips/jsimd.c)
+
+if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
+  set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
+endif()
+
+###############################################################################
+# MIPS64 (Intrinsics)
+###############################################################################
+
+elseif(CPU_TYPE STREQUAL "loongson" OR CPU_TYPE MATCHES "^mips64")
+
+set(CMAKE_REQUIRED_FLAGS -Wa,-mloongson-mmi,-mloongson-ext)
+
+check_c_source_compiles("
+  int main(void) {
+    int c = 0, a = 0, b = 0;
+    asm (
+      \"paddb %0, %1, %2\"
+      : \"=f\" (c)
+      : \"f\" (a), \"f\" (b)
+    );
+    return c;
+  }" HAVE_MMI)
+
+unset(CMAKE_REQUIRED_FLAGS)
+
+if(NOT HAVE_MMI)
+  simd_fail("SIMD extensions not available for this CPU")
+  return()
+endif()
+
+set(SIMD_SOURCES mips64/jccolor-mmi.c mips64/jcgray-mmi.c mips64/jcsample-mmi.c
+  mips64/jdcolor-mmi.c mips64/jdmerge-mmi.c mips64/jdsample-mmi.c
+  mips64/jfdctfst-mmi.c mips64/jfdctint-mmi.c mips64/jidctfst-mmi.c
+  mips64/jidctint-mmi.c mips64/jquanti-mmi.c)
+
+if(CMAKE_COMPILER_IS_GNUCC)
+  foreach(file ${SIMD_SOURCES})
+    set_property(SOURCE ${file} APPEND_STRING PROPERTY COMPILE_FLAGS
+      " -fno-strict-aliasing")
+  endforeach()
+endif()
+foreach(file ${SIMD_SOURCES})
+  set_property(SOURCE ${file} APPEND_STRING PROPERTY COMPILE_FLAGS
+    " -Wa,-mloongson-mmi,-mloongson-ext")
+endforeach()
+
+add_library(simd OBJECT ${SIMD_SOURCES} mips64/jsimd.c)
+
+if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
+  set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
+endif()
+
+###############################################################################
+# PowerPC (Intrinsics)
+###############################################################################
+
+elseif(CPU_TYPE STREQUAL "powerpc")
+
+set(CMAKE_REQUIRED_FLAGS -maltivec)
+
+check_c_source_compiles("
+  #include <altivec.h>
+  int main(void) {
+    __vector int vi = { 0, 0, 0, 0 };
+    int i[4];
+    vec_st(vi, 0, i);
+    return i[0];
+  }" HAVE_ALTIVEC)
+
+unset(CMAKE_REQUIRED_FLAGS)
+
+if(NOT HAVE_ALTIVEC)
+  simd_fail("SIMD extensions not available for this CPU (PowerPC SPE)")
+  return()
+endif()
+
+set(SIMD_SOURCES powerpc/jccolor-altivec.c powerpc/jcgray-altivec.c
+  powerpc/jcsample-altivec.c powerpc/jdcolor-altivec.c
+  powerpc/jdmerge-altivec.c powerpc/jdsample-altivec.c
+  powerpc/jfdctfst-altivec.c powerpc/jfdctint-altivec.c
+  powerpc/jidctfst-altivec.c powerpc/jidctint-altivec.c
+  powerpc/jquanti-altivec.c)
+
+set_source_files_properties(${SIMD_SOURCES} PROPERTIES
+  COMPILE_FLAGS -maltivec)
+
+add_library(simd OBJECT ${SIMD_SOURCES} powerpc/jsimd.c)
+
+if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
+  set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
+endif()
+
+
+###############################################################################
+# None
+###############################################################################
+
+else()
+
+simd_fail("SIMD extensions not available for this CPU (${CMAKE_SYSTEM_PROCESSOR})")
+
+endif() # CPU_TYPE
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jccolext-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jccolext-neon.c
new file mode 100644
index 0000000000..362102d2b2
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jccolext-neon.c
@@ -0,0 +1,148 @@
+/*
+ * jccolext-neon.c - colorspace conversion (32-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-neon.c */
+
+
+/* RGB -> YCbCr conversion is defined by the following equations:
+ *    Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+ *    Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128
+ *    Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B  + 128
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ *    0.29899597 = 19595 * 2^-16
+ *    0.58700561 = 38470 * 2^-16
+ *    0.11399841 =  7471 * 2^-16
+ *    0.16874695 = 11059 * 2^-16
+ *    0.33125305 = 21709 * 2^-16
+ *    0.50000000 = 32768 * 2^-16
+ *    0.41868592 = 27439 * 2^-16
+ *    0.08131409 =  5329 * 2^-16
+ * These constants are defined in jccolor-neon.c
+ *
+ * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
+ * rounds up or down the result via integer truncation.
+ */
+
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+                                JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                int num_rows)
+{
+  /* Pointer to RGB(X/A) input data */
+  JSAMPROW inptr;
+  /* Pointers to Y, Cb, and Cr output data */
+  JSAMPROW outptr0, outptr1, outptr2;
+  /* Allocate temporary buffer for final (image_width % 8) pixels in row. */
+  ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
+
+  /* Set up conversion constants. */
+#ifdef HAVE_VLD1_U16_X2
+  const uint16x4x2_t consts = vld1_u16_x2(jsimd_rgb_ycc_neon_consts);
+#else
+  /* GCC does not currently support the intrinsic vld1_<type>_x2(). */
+  const uint16x4_t consts1 = vld1_u16(jsimd_rgb_ycc_neon_consts);
+  const uint16x4_t consts2 = vld1_u16(jsimd_rgb_ycc_neon_consts + 4);
+  const uint16x4x2_t consts = { { consts1, consts2 } };
+#endif
+  const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+
+    int cols_remaining = image_width;
+    for (; cols_remaining > 0; cols_remaining -= 8) {
+
+      /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 8) columns of data are first memcopied to a temporary
+       * buffer large enough to accommodate the vector load.
+       */
+      if (cols_remaining < 8) {
+        memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+        inptr = tmp_buf;
+      }
+
+#if RGB_PIXELSIZE == 4
+      uint8x8x4_t input_pixels = vld4_u8(inptr);
+#else
+      uint8x8x3_t input_pixels = vld3_u8(inptr);
+#endif
+      uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
+      uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
+      uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_low = vmull_lane_u16(vget_low_u16(r), consts.val[0], 0);
+      y_low = vmlal_lane_u16(y_low, vget_low_u16(g), consts.val[0], 1);
+      y_low = vmlal_lane_u16(y_low, vget_low_u16(b), consts.val[0], 2);
+      uint32x4_t y_high = vmull_lane_u16(vget_high_u16(r), consts.val[0], 0);
+      y_high = vmlal_lane_u16(y_high, vget_high_u16(g), consts.val[0], 1);
+      y_high = vmlal_lane_u16(y_high, vget_high_u16(b), consts.val[0], 2);
+
+      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
+      uint32x4_t cb_low = scaled_128_5;
+      cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(r), consts.val[0], 3);
+      cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(g), consts.val[1], 0);
+      cb_low = vmlal_lane_u16(cb_low, vget_low_u16(b), consts.val[1], 1);
+      uint32x4_t cb_high = scaled_128_5;
+      cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(r), consts.val[0], 3);
+      cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(g), consts.val[1], 0);
+      cb_high = vmlal_lane_u16(cb_high, vget_high_u16(b), consts.val[1], 1);
+
+      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
+      uint32x4_t cr_low = scaled_128_5;
+      cr_low = vmlal_lane_u16(cr_low, vget_low_u16(r), consts.val[1], 1);
+      cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(g), consts.val[1], 2);
+      cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(b), consts.val[1], 3);
+      uint32x4_t cr_high = scaled_128_5;
+      cr_high = vmlal_lane_u16(cr_high, vget_high_u16(r), consts.val[1], 1);
+      cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(g), consts.val[1], 2);
+      cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(b), consts.val[1], 3);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_low, 16),
+                                      vrshrn_n_u32(y_high, 16));
+      /* Descale Cb values (right shift) and narrow to 16-bit. */
+      uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_low, 16),
+                                       vshrn_n_u32(cb_high, 16));
+      /* Descale Cr values (right shift) and narrow to 16-bit. */
+      uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_low, 16),
+                                       vshrn_n_u32(cr_high, 16));
+      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
+       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1_u8(outptr0, vmovn_u16(y_u16));
+      vst1_u8(outptr1, vmovn_u16(cb_u16));
+      vst1_u8(outptr2, vmovn_u16(cr_u16));
+
+      /* Increment pointers. */
+      inptr += (8 * RGB_PIXELSIZE);
+      outptr0 += 8;
+      outptr1 += 8;
+      outptr2 += 8;
+    }
+  }
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jchuff-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jchuff-neon.c
new file mode 100644
index 0000000000..19d94f720d
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jchuff-neon.c
@@ -0,0 +1,334 @@
+/*
+ * jchuff-neon.c - Huffman entropy encoding (32-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "../jchuff.h"
+#include "neon-compat.h"
+
+#include <limits.h>
+
+#include <arm_neon.h>
+
+
+JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
+                                         JCOEFPTR block, int last_dc_val,
+                                         c_derived_tbl *dctbl,
+                                         c_derived_tbl *actbl)
+{
+  uint8_t block_nbits[DCTSIZE2];
+  uint16_t block_diff[DCTSIZE2];
+
+  /* Load rows of coefficients from DCT block in zig-zag order. */
+
+  /* Compute DC coefficient difference value. (F.1.1.5.1) */
+  int16x8_t row0 = vdupq_n_s16(block[0] - last_dc_val);
+  row0 = vld1q_lane_s16(block +  1, row0, 1);
+  row0 = vld1q_lane_s16(block +  8, row0, 2);
+  row0 = vld1q_lane_s16(block + 16, row0, 3);
+  row0 = vld1q_lane_s16(block +  9, row0, 4);
+  row0 = vld1q_lane_s16(block +  2, row0, 5);
+  row0 = vld1q_lane_s16(block +  3, row0, 6);
+  row0 = vld1q_lane_s16(block + 10, row0, 7);
+
+  int16x8_t row1 = vld1q_dup_s16(block + 17);
+  row1 = vld1q_lane_s16(block + 24, row1, 1);
+  row1 = vld1q_lane_s16(block + 32, row1, 2);
+  row1 = vld1q_lane_s16(block + 25, row1, 3);
+  row1 = vld1q_lane_s16(block + 18, row1, 4);
+  row1 = vld1q_lane_s16(block + 11, row1, 5);
+  row1 = vld1q_lane_s16(block +  4, row1, 6);
+  row1 = vld1q_lane_s16(block +  5, row1, 7);
+
+  int16x8_t row2 = vld1q_dup_s16(block + 12);
+  row2 = vld1q_lane_s16(block + 19, row2, 1);
+  row2 = vld1q_lane_s16(block + 26, row2, 2);
+  row2 = vld1q_lane_s16(block + 33, row2, 3);
+  row2 = vld1q_lane_s16(block + 40, row2, 4);
+  row2 = vld1q_lane_s16(block + 48, row2, 5);
+  row2 = vld1q_lane_s16(block + 41, row2, 6);
+  row2 = vld1q_lane_s16(block + 34, row2, 7);
+
+  int16x8_t row3 = vld1q_dup_s16(block + 27);
+  row3 = vld1q_lane_s16(block + 20, row3, 1);
+  row3 = vld1q_lane_s16(block + 13, row3, 2);
+  row3 = vld1q_lane_s16(block +  6, row3, 3);
+  row3 = vld1q_lane_s16(block +  7, row3, 4);
+  row3 = vld1q_lane_s16(block + 14, row3, 5);
+  row3 = vld1q_lane_s16(block + 21, row3, 6);
+  row3 = vld1q_lane_s16(block + 28, row3, 7);
+
+  int16x8_t abs_row0 = vabsq_s16(row0);
+  int16x8_t abs_row1 = vabsq_s16(row1);
+  int16x8_t abs_row2 = vabsq_s16(row2);
+  int16x8_t abs_row3 = vabsq_s16(row3);
+
+  int16x8_t row0_lz = vclzq_s16(abs_row0);
+  int16x8_t row1_lz = vclzq_s16(abs_row1);
+  int16x8_t row2_lz = vclzq_s16(abs_row2);
+  int16x8_t row3_lz = vclzq_s16(abs_row3);
+
+  /* Compute number of bits required to represent each coefficient. */
+  uint8x8_t row0_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row0_lz)));
+  uint8x8_t row1_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row1_lz)));
+  uint8x8_t row2_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row2_lz)));
+  uint8x8_t row3_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row3_lz)));
+
+  vst1_u8(block_nbits + 0 * DCTSIZE, row0_nbits);
+  vst1_u8(block_nbits + 1 * DCTSIZE, row1_nbits);
+  vst1_u8(block_nbits + 2 * DCTSIZE, row2_nbits);
+  vst1_u8(block_nbits + 3 * DCTSIZE, row3_nbits);
+
+  uint16x8_t row0_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row0, 15)),
+              vnegq_s16(row0_lz));
+  uint16x8_t row1_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row1, 15)),
+              vnegq_s16(row1_lz));
+  uint16x8_t row2_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row2, 15)),
+              vnegq_s16(row2_lz));
+  uint16x8_t row3_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row3, 15)),
+              vnegq_s16(row3_lz));
+
+  uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
+  uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1), row1_mask);
+  uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2), row2_mask);
+  uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3), row3_mask);
+
+  /* Store diff values for rows 0, 1, 2, and 3. */
+  vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+  vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+  vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+  vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+
+  /* Load last four rows of coefficients from DCT block in zig-zag order. */
+  int16x8_t row4 = vld1q_dup_s16(block + 35);
+  row4 = vld1q_lane_s16(block + 42, row4, 1);
+  row4 = vld1q_lane_s16(block + 49, row4, 2);
+  row4 = vld1q_lane_s16(block + 56, row4, 3);
+  row4 = vld1q_lane_s16(block + 57, row4, 4);
+  row4 = vld1q_lane_s16(block + 50, row4, 5);
+  row4 = vld1q_lane_s16(block + 43, row4, 6);
+  row4 = vld1q_lane_s16(block + 36, row4, 7);
+
+  int16x8_t row5 = vld1q_dup_s16(block + 29);
+  row5 = vld1q_lane_s16(block + 22, row5, 1);
+  row5 = vld1q_lane_s16(block + 15, row5, 2);
+  row5 = vld1q_lane_s16(block + 23, row5, 3);
+  row5 = vld1q_lane_s16(block + 30, row5, 4);
+  row5 = vld1q_lane_s16(block + 37, row5, 5);
+  row5 = vld1q_lane_s16(block + 44, row5, 6);
+  row5 = vld1q_lane_s16(block + 51, row5, 7);
+
+  int16x8_t row6 = vld1q_dup_s16(block + 58);
+  row6 = vld1q_lane_s16(block + 59, row6, 1);
+  row6 = vld1q_lane_s16(block + 52, row6, 2);
+  row6 = vld1q_lane_s16(block + 45, row6, 3);
+  row6 = vld1q_lane_s16(block + 38, row6, 4);
+  row6 = vld1q_lane_s16(block + 31, row6, 5);
+  row6 = vld1q_lane_s16(block + 39, row6, 6);
+  row6 = vld1q_lane_s16(block + 46, row6, 7);
+
+  int16x8_t row7 = vld1q_dup_s16(block + 53);
+  row7 = vld1q_lane_s16(block + 60, row7, 1);
+  row7 = vld1q_lane_s16(block + 61, row7, 2);
+  row7 = vld1q_lane_s16(block + 54, row7, 3);
+  row7 = vld1q_lane_s16(block + 47, row7, 4);
+  row7 = vld1q_lane_s16(block + 55, row7, 5);
+  row7 = vld1q_lane_s16(block + 62, row7, 6);
+  row7 = vld1q_lane_s16(block + 63, row7, 7);
+
+  int16x8_t abs_row4 = vabsq_s16(row4);
+  int16x8_t abs_row5 = vabsq_s16(row5);
+  int16x8_t abs_row6 = vabsq_s16(row6);
+  int16x8_t abs_row7 = vabsq_s16(row7);
+
+  int16x8_t row4_lz = vclzq_s16(abs_row4);
+  int16x8_t row5_lz = vclzq_s16(abs_row5);
+  int16x8_t row6_lz = vclzq_s16(abs_row6);
+  int16x8_t row7_lz = vclzq_s16(abs_row7);
+
+  /* Compute number of bits required to represent each coefficient. */
+  uint8x8_t row4_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row4_lz)));
+  uint8x8_t row5_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row5_lz)));
+  uint8x8_t row6_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row6_lz)));
+  uint8x8_t row7_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row7_lz)));
+
+  vst1_u8(block_nbits + 4 * DCTSIZE, row4_nbits);
+  vst1_u8(block_nbits + 5 * DCTSIZE, row5_nbits);
+  vst1_u8(block_nbits + 6 * DCTSIZE, row6_nbits);
+  vst1_u8(block_nbits + 7 * DCTSIZE, row7_nbits);
+
+  uint16x8_t row4_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row4, 15)),
+              vnegq_s16(row4_lz));
+  uint16x8_t row5_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row5, 15)),
+              vnegq_s16(row5_lz));
+  uint16x8_t row6_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row6, 15)),
+              vnegq_s16(row6_lz));
+  uint16x8_t row7_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row7, 15)),
+              vnegq_s16(row7_lz));
+
+  uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4), row4_mask);
+  uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5), row5_mask);
+  uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6), row6_mask);
+  uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7), row7_mask);
+
+  /* Store diff values for rows 4, 5, 6, and 7. */
+  vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+  vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+  vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+  vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+  /* Construct bitmap to accelerate encoding of AC coefficients.  A set bit
+   * means that the corresponding coefficient != 0.
+   */
+  uint8x8_t row0_nbits_gt0 = vcgt_u8(row0_nbits, vdup_n_u8(0));
+  uint8x8_t row1_nbits_gt0 = vcgt_u8(row1_nbits, vdup_n_u8(0));
+  uint8x8_t row2_nbits_gt0 = vcgt_u8(row2_nbits, vdup_n_u8(0));
+  uint8x8_t row3_nbits_gt0 = vcgt_u8(row3_nbits, vdup_n_u8(0));
+  uint8x8_t row4_nbits_gt0 = vcgt_u8(row4_nbits, vdup_n_u8(0));
+  uint8x8_t row5_nbits_gt0 = vcgt_u8(row5_nbits, vdup_n_u8(0));
+  uint8x8_t row6_nbits_gt0 = vcgt_u8(row6_nbits, vdup_n_u8(0));
+  uint8x8_t row7_nbits_gt0 = vcgt_u8(row7_nbits, vdup_n_u8(0));
+
+  /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
+  const uint8x8_t bitmap_mask =
+    vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));
+
+  row0_nbits_gt0 = vand_u8(row0_nbits_gt0, bitmap_mask);
+  row1_nbits_gt0 = vand_u8(row1_nbits_gt0, bitmap_mask);
+  row2_nbits_gt0 = vand_u8(row2_nbits_gt0, bitmap_mask);
+  row3_nbits_gt0 = vand_u8(row3_nbits_gt0, bitmap_mask);
+  row4_nbits_gt0 = vand_u8(row4_nbits_gt0, bitmap_mask);
+  row5_nbits_gt0 = vand_u8(row5_nbits_gt0, bitmap_mask);
+  row6_nbits_gt0 = vand_u8(row6_nbits_gt0, bitmap_mask);
+  row7_nbits_gt0 = vand_u8(row7_nbits_gt0, bitmap_mask);
+
+  uint8x8_t bitmap_rows_10 = vpadd_u8(row1_nbits_gt0, row0_nbits_gt0);
+  uint8x8_t bitmap_rows_32 = vpadd_u8(row3_nbits_gt0, row2_nbits_gt0);
+  uint8x8_t bitmap_rows_54 = vpadd_u8(row5_nbits_gt0, row4_nbits_gt0);
+  uint8x8_t bitmap_rows_76 = vpadd_u8(row7_nbits_gt0, row6_nbits_gt0);
+  uint8x8_t bitmap_rows_3210 = vpadd_u8(bitmap_rows_32, bitmap_rows_10);
+  uint8x8_t bitmap_rows_7654 = vpadd_u8(bitmap_rows_76, bitmap_rows_54);
+  uint8x8_t bitmap = vpadd_u8(bitmap_rows_7654, bitmap_rows_3210);
+
+  /* Shift left to remove DC bit. */
+  bitmap = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap), 1));
+  /* Move bitmap to 32-bit scalar registers. */
+  uint32_t bitmap_1_32 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 1);
+  uint32_t bitmap_33_63 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 0);
+
+  /* Set up state and bit buffer for output bitstream. */
+  working_state *state_ptr = (working_state *)state;
+  int free_bits = state_ptr->cur.free_bits;
+  size_t put_buffer = state_ptr->cur.put_buffer;
+
+  /* Encode DC coefficient. */
+
+  unsigned int nbits = block_nbits[0];
+  /* Emit Huffman-coded symbol and additional diff bits. */
+  unsigned int diff = block_diff[0];
+  PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
+
+  /* Encode AC coefficients. */
+
+  unsigned int r = 0;  /* r = run length of zeros */
+  unsigned int i = 1;  /* i = number of coefficients encoded */
+  /* Code and size information for a run length of 16 zero coefficients */
+  const unsigned int code_0xf0 = actbl->ehufco[0xf0];
+  const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
+
+  while (bitmap_1_32 != 0) {
+    r = BUILTIN_CLZ(bitmap_1_32);
+    i += r;
+    bitmap_1_32 <<= r;
+    nbits = block_nbits[i];
+    diff = block_diff[i];
+    while (r > 15) {
+      /* If run length > 15, emit special run-length-16 codes. */
+      PUT_BITS(code_0xf0, size_0xf0)
+      r -= 16;
+    }
+    /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+    unsigned int rs = (r << 4) + nbits;
+    PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+    i++;
+    bitmap_1_32 <<= 1;
+  }
+
+  r = 33 - i;
+  i = 33;
+
+  while (bitmap_33_63 != 0) {
+    unsigned int leading_zeros = BUILTIN_CLZ(bitmap_33_63);
+    r += leading_zeros;
+    i += leading_zeros;
+    bitmap_33_63 <<= leading_zeros;
+    nbits = block_nbits[i];
+    diff = block_diff[i];
+    while (r > 15) {
+      /* If run length > 15, emit special run-length-16 codes. */
+      PUT_BITS(code_0xf0, size_0xf0)
+      r -= 16;
+    }
+    /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+    unsigned int rs = (r << 4) + nbits;
+    PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+    r = 0;
+    i++;
+    bitmap_33_63 <<= 1;
+  }
+
+  /* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
+   * The value of RS for the EOB code is 0.
+   */
+  if (i != 64) {
+    PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+  }
+
+  state_ptr->cur.put_buffer = put_buffer;
+  state_ptr->cur.free_bits = free_bits;
+
+  return buffer;
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jsimd.c b/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jsimd.c
new file mode 100644
index 0000000000..e3adf23d50
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jsimd.c
@@ -0,0 +1,980 @@
+/*
+ * jsimd_arm.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2019, Google LLC.
+ * Copyright (C) 2020, Arm Limited.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 32-bit Arm architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+static unsigned int simd_support = ~0;
+static unsigned int simd_huffman = 1;
+
+#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT  (1024 * 1024)
+
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+  char *p;
+
+  if (*feature == 0)
+    return 0;
+  if (strncmp(buffer, "Features", 8) != 0)
+    return 0;
+  buffer += 8;
+  while (isspace(*buffer))
+    buffer++;
+
+  /* Check if 'feature' is present in the buffer as a separate word */
+  while ((p = strstr(buffer, feature))) {
+    if (p > buffer && !isspace(*(p - 1))) {
+      buffer++;
+      continue;
+    }
+    p += strlen(feature);
+    if (*p != 0 && !isspace(*p)) {
+      buffer++;
+      continue;
+    }
+    return 1;
+  }
+  return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+  char *buffer = (char *)malloc(bufsize);
+  FILE *fd;
+
+  simd_support = 0;
+
+  if (!buffer)
+    return 0;
+
+  fd = fopen("/proc/cpuinfo", "r");
+  if (fd) {
+    while (fgets(buffer, bufsize, fd)) {
+      if (!strchr(buffer, '\n') && !feof(fd)) {
+        /* "impossible" happened - insufficient size of the buffer! */
+        fclose(fd);
+        free(buffer);
+        return 0;
+      }
+      if (check_feature(buffer, "neon"))
+        simd_support |= JSIMD_NEON;
+    }
+    fclose(fd);
+  }
+  free(buffer);
+  return 1;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char env[2] = { 0 };
+#endif
+#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+  int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+#if defined(__ARM_NEON__)
+  simd_support |= JSIMD_NEON;
+#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  /* We still have a chance to use Neon regardless of globally used
+   * -mcpu/-mfpu options passed to gcc by performing runtime detection via
+   * /proc/cpuinfo parsing on linux/android */
+  while (!parse_proc_cpuinfo(bufsize)) {
+    bufsize *= 2;
+    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+      break;
+  }
+#endif
+
+#ifndef NO_GETENV
+  /* Force different settings through environment variables */
+  if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && !strcmp(env, "1"))
+    simd_support = JSIMD_NEON;
+  if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
+    simd_support = 0;
+  if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
+    simd_huffman = 0;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    neonfct = jsimd_extrgb_ycc_convert_neon;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_extrgbx_ycc_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_extbgr_ycc_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_extbgrx_ycc_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_extxbgr_ycc_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_extxrgb_ycc_convert_neon;
+    break;
+  default:
+    neonfct = jsimd_extrgb_ycc_convert_neon;
+    break;
+  }
+
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    neonfct = jsimd_extrgb_gray_convert_neon;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_extrgbx_gray_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_extbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_extbgrx_gray_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_extxbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_extxrgb_gray_convert_neon;
+    break;
+  default:
+    neonfct = jsimd_extrgb_gray_convert_neon;
+    break;
+  }
+
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    neonfct = jsimd_ycc_extrgb_convert_neon;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_ycc_extrgbx_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_ycc_extbgr_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_ycc_extbgrx_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_ycc_extxbgr_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_ycc_extxrgb_convert_neon;
+    break;
+  default:
+    neonfct = jsimd_ycc_extrgb_convert_neon;
+    break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+  jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
+                                output_buf, num_rows);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGR:
+      neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
+      break;
+    default:
+      neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+      break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGR:
+      neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
+      break;
+    default:
+      neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+      break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  jsimd_convsamp_neon(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  jsimd_fdct_islow_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  jsimd_quantize_neon(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON && simd_huffman)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+                                          dctbl, actbl);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, JCOEF *values, size_t *zerobits)
+{
+  jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
+                                         Sl, Al, values, zerobits);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, JCOEF *absvalues, size_t *bits)
+{
+  return jsimd_encode_mcu_AC_refine_prepare_neon(block,
+                                                 jpeg_natural_order_start, Sl,
+                                                 Al, absvalues, bits);
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jsimd_neon.S b/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jsimd_neon.S
new file mode 100644
index 0000000000..7e1e2b1451
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jsimd_neon.S
@@ -0,0 +1,1200 @@
+/*
+ * Armv7 Neon optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
+ *                          All Rights Reserved.
+ * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ * Copyright (C) 2014, Siarhei Siamashka.  All Rights Reserved.
+ * Copyright (C) 2014, Linaro Limited.  All Rights Reserved.
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
+#endif
+
+.text
+.fpu neon
+.arch armv7a
+.object_arch armv4
+.arm
+.syntax unified
+
+
+/*****************************************************************************/
+
+/* Supplementary macro for setting function attributes */
+.macro asm_function fname
+#ifdef __APPLE__
+    .private_extern _\fname
+    .globl _\fname
+_\fname:
+#else
+    .global \fname
+#ifdef __ELF__
+    .hidden \fname
+    .type \fname, %function
+#endif
+\fname:
+#endif
+.endm
+
+
+#define CENTERJSAMPLE  128
+
+/*****************************************************************************/
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients.
+ *
+ * GLOBAL(void)
+ * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+ *                       JSAMPARRAY output_buf, JDIMENSION output_col)
+ */
+
+#define FIX_0_298631336  (2446)
+#define FIX_0_390180644  (3196)
+#define FIX_0_541196100  (4433)
+#define FIX_0_765366865  (6270)
+#define FIX_0_899976223  (7373)
+#define FIX_1_175875602  (9633)
+#define FIX_1_501321110  (12299)
+#define FIX_1_847759065  (15137)
+#define FIX_1_961570560  (16069)
+#define FIX_2_053119869  (16819)
+#define FIX_2_562915447  (20995)
+#define FIX_3_072711026  (25172)
+
+#define FIX_1_175875602_MINUS_1_961570560  (FIX_1_175875602 - FIX_1_961570560)
+#define FIX_1_175875602_MINUS_0_390180644  (FIX_1_175875602 - FIX_0_390180644)
+#define FIX_0_541196100_MINUS_1_847759065  (FIX_0_541196100 - FIX_1_847759065)
+#define FIX_3_072711026_MINUS_2_562915447  (FIX_3_072711026 - FIX_2_562915447)
+#define FIX_0_298631336_MINUS_0_899976223  (FIX_0_298631336 - FIX_0_899976223)
+#define FIX_1_501321110_MINUS_0_899976223  (FIX_1_501321110 - FIX_0_899976223)
+#define FIX_2_053119869_MINUS_2_562915447  (FIX_2_053119869 - FIX_2_562915447)
+#define FIX_0_541196100_PLUS_0_765366865   (FIX_0_541196100 + FIX_0_765366865)
+
+/*
+ * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
+ * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
+ */
+#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
+  DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
+  JLONG   q1, q2, q3, q4, q5, q6, q7; \
+  JLONG   tmp11_plus_tmp2, tmp11_minus_tmp2; \
+  \
+  /* 1-D iDCT input data */ \
+  row0 = xrow0; \
+  row1 = xrow1; \
+  row2 = xrow2; \
+  row3 = xrow3; \
+  row4 = xrow4; \
+  row5 = xrow5; \
+  row6 = xrow6; \
+  row7 = xrow7; \
+  \
+  q5 = row7 + row3; \
+  q4 = row5 + row1; \
+  q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
+       MULTIPLY(q4, FIX_1_175875602); \
+  q7 = MULTIPLY(q5, FIX_1_175875602) + \
+       MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
+  q2 = MULTIPLY(row2, FIX_0_541196100) + \
+       MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
+  q4 = q6; \
+  q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
+  q6 += MULTIPLY(row5, -FIX_2_562915447) + \
+        MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
+  /* now we can use q1 (reloadable constants have been used up) */ \
+  q1 = q3 + q2; \
+  q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
+        MULTIPLY(row1, -FIX_0_899976223); \
+  q5 = q7; \
+  q1 = q1 + q6; \
+  q7 += MULTIPLY(row7, -FIX_0_899976223) + \
+        MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
+  \
+  /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
+  tmp11_plus_tmp2 = q1; \
+  row1 = 0; \
+  \
+  q1 = q1 - q6; \
+  q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
+        MULTIPLY(row3, -FIX_2_562915447); \
+  q1 = q1 - q6; \
+  q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
+       MULTIPLY(row6, FIX_0_541196100); \
+  q3 = q3 - q2; \
+  \
+  /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
+  tmp11_minus_tmp2 = q1; \
+  \
+  q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
+  q2 = q1 + q6; \
+  q1 = q1 - q6; \
+  \
+  /* pick up the results */ \
+  tmp0  = q4; \
+  tmp1  = q5; \
+  tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
+  tmp3  = q7; \
+  tmp10 = q2; \
+  tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
+  tmp12 = q3; \
+  tmp13 = q1; \
+}
+
+#define XFIX_0_899976223                    d0[0]
+#define XFIX_0_541196100                    d0[1]
+#define XFIX_2_562915447                    d0[2]
+#define XFIX_0_298631336_MINUS_0_899976223  d0[3]
+#define XFIX_1_501321110_MINUS_0_899976223  d1[0]
+#define XFIX_2_053119869_MINUS_2_562915447  d1[1]
+#define XFIX_0_541196100_PLUS_0_765366865   d1[2]
+#define XFIX_1_175875602                    d1[3]
+#define XFIX_1_175875602_MINUS_0_390180644  d2[0]
+#define XFIX_0_541196100_MINUS_1_847759065  d2[1]
+#define XFIX_3_072711026_MINUS_2_562915447  d2[2]
+#define XFIX_1_175875602_MINUS_1_961570560  d2[3]
+
+.balign 16
+jsimd_idct_islow_neon_consts:
+  .short FIX_0_899976223                    /* d0[0] */
+  .short FIX_0_541196100                    /* d0[1] */
+  .short FIX_2_562915447                    /* d0[2] */
+  .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
+  .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
+  .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
+  .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
+  .short FIX_1_175875602                    /* d1[3] */
+  /* reloadable constants */
+  .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
+  .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
+  .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
+  .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
+
+asm_function jsimd_idct_islow_neon
+
+    DCT_TABLE       .req r0
+    COEF_BLOCK      .req r1
+    OUTPUT_BUF      .req r2
+    OUTPUT_COL      .req r3
+    TMP1            .req r0
+    TMP2            .req r1
+    TMP3            .req r2
+    TMP4            .req ip
+
+    ROW0L           .req d16
+    ROW0R           .req d17
+    ROW1L           .req d18
+    ROW1R           .req d19
+    ROW2L           .req d20
+    ROW2R           .req d21
+    ROW3L           .req d22
+    ROW3R           .req d23
+    ROW4L           .req d24
+    ROW4R           .req d25
+    ROW5L           .req d26
+    ROW5R           .req d27
+    ROW6L           .req d28
+    ROW6R           .req d29
+    ROW7L           .req d30
+    ROW7R           .req d31
+
+    /* Load and dequantize coefficients into Neon registers
+     * with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | d16     | d17     ( q8  )
+     *   1 | d18     | d19     ( q9  )
+     *   2 | d20     | d21     ( q10 )
+     *   3 | d22     | d23     ( q11 )
+     *   4 | d24     | d25     ( q12 )
+     *   5 | d26     | d27     ( q13 )
+     *   6 | d28     | d29     ( q14 )
+     *   7 | d30     | d31     ( q15 )
+     */
+    adr             ip, jsimd_idct_islow_neon_consts
+    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
+    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
+    vmul.s16        q8, q8, q0
+    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+    vmul.s16        q9, q9, q1
+    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
+    vmul.s16        q10, q10, q2
+    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+    vmul.s16        q11, q11, q3
+    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
+    vmul.s16        q12, q12, q0
+    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+    vmul.s16        q14, q14, q2
+    vmul.s16        q13, q13, q1
+    vld1.16         {d0, d1, d2, d3}, [ip, :128]  /* load constants */
+    add             ip, ip, #16
+    vmul.s16        q15, q15, q3
+    vpush           {d8 - d15}                    /* save Neon registers */
+    /* 1-D IDCT, pass 1, left 4x8 half */
+    vadd.s16        d4, ROW7L, ROW3L
+    vadd.s16        d5, ROW5L, ROW1L
+    vmull.s16       q6, d4, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, d5, XFIX_1_175875602
+    vmull.s16       q7, d4, XFIX_1_175875602
+      /* Check for the zero coefficients in the right 4x8 half */
+      push            {r4, r5}
+    vmlal.s16       q7, d5, XFIX_1_175875602_MINUS_0_390180644
+    vsubl.s16       q3, ROW0L, ROW4L
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vmlal.s16       q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
+      orr             r0, r4, r5
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5L, XFIX_2_562915447
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vshl.s32        q3, q3, #13
+      orr             r0, r0, r4
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+      orr             r0, r0, r5
+    vadd.s32        q1, q3, q2
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+      orr             r0, r0, r4
+    vmlsl.s16       q7, ROW7L, XFIX_0_899976223
+      orr             r0, r0, r5
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vrshrn.s32      ROW1L, q1, #11
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
+      orr             r0, r0, r4
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+      orr             r0, r0, r5
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
+    vmlal.s16       q6, ROW6L, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+      orr             r0, r0, r4
+    vrshrn.s32      ROW6L, q1, #11
+      orr             r0, r0, r5
+    vadd.s32        q1, q3, q5
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0L, ROW4L
+      orr             r0, r0, r4
+    vrshrn.s32      ROW2L, q1, #11
+      orr             r0, r0, r5
+    vrshrn.s32      ROW5L, q3, #11
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
+      orr             r0, r0, r4
+    vadd.s32        q2, q5, q6
+      orrs            r0, r0, r5
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+      orr             r0, r4, r5
+    vsub.s32        q3, q1, q4
+      pop             {r4, r5}
+    vrshrn.s32      ROW7L, q2, #11
+    vrshrn.s32      ROW3L, q5, #11
+    vrshrn.s32      ROW0L, q6, #11
+    vrshrn.s32      ROW4L, q3, #11
+
+      beq             3f  /* Go to do some special handling for the sparse
+                             right 4x8 half */
+
+    /* 1-D IDCT, pass 1, right 4x8 half */
+    vld1.s16        {d2}, [ip, :64]  /* reload constants */
+    vadd.s16        d10, ROW7R, ROW3R
+    vadd.s16        d8, ROW5R, ROW1R
+      /* Transpose left 4x8 half */
+      vtrn.16         ROW6L, ROW7L
+    vmull.s16       q6, d10, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, d8, XFIX_1_175875602
+      vtrn.16         ROW2L, ROW3L
+    vmull.s16       q7, d10, XFIX_1_175875602
+    vmlal.s16       q7, d8, XFIX_1_175875602_MINUS_0_390180644
+      vtrn.16         ROW0L, ROW1L
+    vsubl.s16       q3, ROW0R, ROW4R
+    vmull.s16       q2, ROW2R, XFIX_0_541196100
+    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
+      vtrn.16         ROW4L, ROW5L
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
+    vmlal.s16       q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
+      vtrn.32         ROW1L, ROW3L
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW1R, XFIX_0_899976223
+      vtrn.32         ROW4L, ROW6L
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+      vtrn.32         ROW0L, ROW2L
+    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
+    vmlal.s16       q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
+    vrshrn.s32      ROW1R, q1, #11
+      vtrn.32         ROW5L, ROW7L
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+    vmlsl.s16       q5, ROW3R, XFIX_2_562915447
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
+    vmlal.s16       q6, ROW6R, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+    vrshrn.s32      ROW6R, q1, #11
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0R, ROW4R
+    vrshrn.s32      ROW2R, q1, #11
+    vrshrn.s32      ROW5R, q3, #11
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vrshrn.s32      ROW7R, q2, #11
+    vrshrn.s32      ROW3R, q5, #11
+    vrshrn.s32      ROW0R, q6, #11
+    vrshrn.s32      ROW4R, q3, #11
+    /* Transpose right 4x8 half */
+    vtrn.16         ROW6R, ROW7R
+    vtrn.16         ROW2R, ROW3R
+    vtrn.16         ROW0R, ROW1R
+    vtrn.16         ROW4R, ROW5R
+    vtrn.32         ROW1R, ROW3R
+    vtrn.32         ROW4R, ROW6R
+    vtrn.32         ROW0R, ROW2R
+    vtrn.32         ROW5R, ROW7R
+
+1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW1R, XFIX_1_175875602   /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW1L, XFIX_1_175875602
+    vmlal.s16       q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
+    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW3R, XFIX_1_175875602   /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW3L, XFIX_1_175875602
+    vmlal.s16       q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
+    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+    vsubl.s16       q3, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vmlal.s16       q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L <-> ROW2R */
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW1R, XFIX_2_562915447   /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+    vmlsl.s16       q7, ROW3R, XFIX_0_899976223   /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vshrn.s32       ROW1L, q1, #16
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L <-> ROW1R */
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+    vmlal.s16       q6, ROW2R, XFIX_0_541196100   /* ROW6L <-> ROW2R */
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW2L, q1, #16
+    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L <-> ROW3R */
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW3L, q5, #16
+    vshrn.s32       ROW0L, q6, #16
+    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
+    /* 1-D IDCT, pass 2, right 4x8 half */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW5R, XFIX_1_175875602
+    vmlal.s16       q6, ROW5L, XFIX_1_175875602   /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
+    vmull.s16       q7, ROW7R, XFIX_1_175875602
+    vmlal.s16       q7, ROW7L, XFIX_1_175875602   /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
+    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
+    vsubl.s16       q3, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
+    vmull.s16       q2, ROW6L, XFIX_0_541196100   /* ROW6L <-> ROW2R */
+    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
+    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L <-> ROW3R */
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW5L, XFIX_0_899976223   /* ROW5L <-> ROW1R */
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
+    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L <-> ROW1R */
+    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+    vmlsl.s16       q5, ROW7L, XFIX_2_562915447   /* ROW7L <-> ROW3R */
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L <-> ROW2R */
+    vmlal.s16       q6, ROW6R, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW6R, q1, #16
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
+    vshrn.s32       ROW5R, q3, #16
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW7R, q2, #16
+    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW4R, q3, #16
+
+2:  /* Descale to 8-bit and range limit */
+    vqrshrn.s16     d16, q8, #2
+    vqrshrn.s16     d17, q9, #2
+    vqrshrn.s16     d18, q10, #2
+    vqrshrn.s16     d19, q11, #2
+    vpop            {d8 - d15}                    /* restore Neon registers */
+    vqrshrn.s16     d20, q12, #2
+      /* Transpose the final 8-bit samples and do signed->unsigned conversion */
+      vtrn.16         q8, q9
+    vqrshrn.s16     d21, q13, #2
+    vqrshrn.s16     d22, q14, #2
+      vmov.u8         q0, #(CENTERJSAMPLE)
+    vqrshrn.s16     d23, q15, #2
+      vtrn.8          d16, d17
+      vtrn.8          d18, d19
+      vadd.u8         q8, q8, q0
+      vadd.u8         q9, q9, q0
+      vtrn.16         q10, q11
+        /* Store results to the output buffer */
+        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
+        add             TMP1, TMP1, OUTPUT_COL
+        add             TMP2, TMP2, OUTPUT_COL
+        vst1.8          {d16}, [TMP1]
+      vtrn.8          d20, d21
+        vst1.8          {d17}, [TMP2]
+        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
+        add             TMP1, TMP1, OUTPUT_COL
+        add             TMP2, TMP2, OUTPUT_COL
+        vst1.8          {d18}, [TMP1]
+      vadd.u8         q10, q10, q0
+        vst1.8          {d19}, [TMP2]
+        ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
+        add             TMP1, TMP1, OUTPUT_COL
+        add             TMP2, TMP2, OUTPUT_COL
+        add             TMP3, TMP3, OUTPUT_COL
+        add             TMP4, TMP4, OUTPUT_COL
+      vtrn.8          d22, d23
+        vst1.8          {d20}, [TMP1]
+      vadd.u8         q11, q11, q0
+        vst1.8          {d21}, [TMP2]
+        vst1.8          {d22}, [TMP3]
+        vst1.8          {d23}, [TMP4]
+    bx              lr
+
+3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
+
+    /* Transpose left 4x8 half */
+    vtrn.16         ROW6L, ROW7L
+    vtrn.16         ROW2L, ROW3L
+    vtrn.16         ROW0L, ROW1L
+    vtrn.16         ROW4L, ROW5L
+    vshl.s16        ROW0R, ROW0R, #2  /* PASS1_BITS */
+    vtrn.32         ROW1L, ROW3L
+    vtrn.32         ROW4L, ROW6L
+    vtrn.32         ROW0L, ROW2L
+    vtrn.32         ROW5L, ROW7L
+
+    cmp             r0, #0
+    beq             4f  /* Right 4x8 half has all zeros, go to 'sparse' second
+                           pass */
+
+    /* Only row 0 is non-zero for the right 4x8 half  */
+    vdup.s16        ROW1R, ROW0R[1]
+    vdup.s16        ROW2R, ROW0R[2]
+    vdup.s16        ROW3R, ROW0R[3]
+    vdup.s16        ROW4R, ROW0R[0]
+    vdup.s16        ROW5R, ROW0R[1]
+    vdup.s16        ROW6R, ROW0R[2]
+    vdup.s16        ROW7R, ROW0R[3]
+    vdup.s16        ROW0R, ROW0R[0]
+    b               1b  /* Go to 'normal' second pass */
+
+4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW1L, XFIX_1_175875602
+    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW3L, XFIX_1_175875602
+    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vshll.s16       q3, ROW0L, #13
+    vmov            q4, q6
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vadd.s32        q1, q1, q6
+    vadd.s32        q6, q6, q6
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+    vshrn.s32       ROW1L, q1, #16
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vshll.s16       q5, ROW0L, #13
+    vshrn.s32       ROW2L, q1, #16
+    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW3L, q5, #16
+    vshrn.s32       ROW0L, q6, #16
+    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
+    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW5L, XFIX_1_175875602
+    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW7L, XFIX_1_175875602
+    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
+    vmull.s16       q2, ROW6L, XFIX_0_541196100
+    vshll.s16       q3, ROW4L, #13
+    vmov            q4, q6
+    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
+    vmlsl.s16       q4, ROW5L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
+    vadd.s32        q1, q1, q6
+    vadd.s32        q6, q6, q6
+    vmlsl.s16       q5, ROW7L, XFIX_2_562915447
+    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW6R, q1, #16
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vshll.s16       q5, ROW4L, #13
+    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
+    vshrn.s32       ROW5R, q3, #16
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW7R, q2, #16
+    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW4R, q3, #16
+    b               2b                            /* Go to epilogue */
+
+    .unreq          DCT_TABLE
+    .unreq          COEF_BLOCK
+    .unreq          OUTPUT_BUF
+    .unreq          OUTPUT_COL
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMP4
+
+    .unreq          ROW0L
+    .unreq          ROW0R
+    .unreq          ROW1L
+    .unreq          ROW1R
+    .unreq          ROW2L
+    .unreq          ROW2R
+    .unreq          ROW3L
+    .unreq          ROW3R
+    .unreq          ROW4L
+    .unreq          ROW4R
+    .unreq          ROW5L
+    .unreq          ROW5R
+    .unreq          ROW6L
+    .unreq          ROW6R
+    .unreq          ROW7L
+    .unreq          ROW7R
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_idct_ifast_neon
+ *
+ * This function contains a fast, not so accurate integer implementation of
+ * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
+ * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
+ * function from jidctfst.c
+ *
+ * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
+ * But in Arm Neon case some extra additions are required because VQDMULH
+ * instruction can't handle the constants larger than 1. So the expressions
+ * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
+ * which introduces an extra addition. Overall, there are 6 extra additions
+ * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
+ */
+
+#define XFIX_1_082392200  d0[0]
+#define XFIX_1_414213562  d0[1]
+#define XFIX_1_847759065  d0[2]
+#define XFIX_2_613125930  d0[3]
+
+.balign 16
+jsimd_idct_ifast_neon_consts:
+  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
+  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
+  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
+  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
+
+asm_function jsimd_idct_ifast_neon
+
+    DCT_TABLE       .req r0
+    COEF_BLOCK      .req r1
+    OUTPUT_BUF      .req r2
+    OUTPUT_COL      .req r3
+    TMP1            .req r0
+    TMP2            .req r1
+    TMP3            .req r2
+    TMP4            .req ip
+
+    /* Load and dequantize coefficients into Neon registers
+     * with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | d16     | d17     ( q8  )
+     *   1 | d18     | d19     ( q9  )
+     *   2 | d20     | d21     ( q10 )
+     *   3 | d22     | d23     ( q11 )
+     *   4 | d24     | d25     ( q12 )
+     *   5 | d26     | d27     ( q13 )
+     *   6 | d28     | d29     ( q14 )
+     *   7 | d30     | d31     ( q15 )
+     */
+    adr             ip, jsimd_idct_ifast_neon_consts
+    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
+    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
+    vmul.s16        q8, q8, q0
+    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+    vmul.s16        q9, q9, q1
+    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
+    vmul.s16        q10, q10, q2
+    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+    vmul.s16        q11, q11, q3
+    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
+    vmul.s16        q12, q12, q0
+    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+    vmul.s16        q14, q14, q2
+    vmul.s16        q13, q13, q1
+    vld1.16         {d0}, [ip, :64]  /* load constants */
+    vmul.s16        q15, q15, q3
+    vpush           {d8 - d13}       /* save Neon registers */
+    /* 1-D IDCT, pass 1 */
+    vsub.s16        q2, q10, q14
+    vadd.s16        q14, q10, q14
+    vsub.s16        q1, q11, q13
+    vadd.s16        q13, q11, q13
+    vsub.s16        q5, q9, q15
+    vadd.s16        q15, q9, q15
+    vqdmulh.s16     q4, q2, XFIX_1_414213562
+    vqdmulh.s16     q6, q1, XFIX_2_613125930
+    vadd.s16        q3, q1, q1
+    vsub.s16        q1, q5, q1
+    vadd.s16        q10, q2, q4
+    vqdmulh.s16     q4, q1, XFIX_1_847759065
+    vsub.s16        q2, q15, q13
+    vadd.s16        q3, q3, q6
+    vqdmulh.s16     q6, q2, XFIX_1_414213562
+    vadd.s16        q1, q1, q4
+    vqdmulh.s16     q4, q5, XFIX_1_082392200
+    vsub.s16        q10, q10, q14
+    vadd.s16        q2, q2, q6
+    vsub.s16        q6, q8, q12
+    vadd.s16        q12, q8, q12
+    vadd.s16        q9, q5, q4
+    vadd.s16        q5, q6, q10
+    vsub.s16        q10, q6, q10
+    vadd.s16        q6, q15, q13
+    vadd.s16        q8, q12, q14
+    vsub.s16        q3, q6, q3
+    vsub.s16        q12, q12, q14
+    vsub.s16        q3, q3, q1
+    vsub.s16        q1, q9, q1
+    vadd.s16        q2, q3, q2
+    vsub.s16        q15, q8, q6
+    vadd.s16        q1, q1, q2
+    vadd.s16        q8, q8, q6
+    vadd.s16        q14, q5, q3
+    vsub.s16        q9, q5, q3
+    vsub.s16        q13, q10, q2
+    vadd.s16        q10, q10, q2
+      /* Transpose */
+      vtrn.16         q8, q9
+    vsub.s16        q11, q12, q1
+      vtrn.16         q14, q15
+    vadd.s16        q12, q12, q1
+      vtrn.16         q10, q11
+      vtrn.16         q12, q13
+      vtrn.32         q9, q11
+      vtrn.32         q12, q14
+      vtrn.32         q8, q10
+      vtrn.32         q13, q15
+      vswp            d28, d21
+      vswp            d26, d19
+    /* 1-D IDCT, pass 2 */
+    vsub.s16        q2, q10, q14
+      vswp            d30, d23
+    vadd.s16        q14, q10, q14
+      vswp            d24, d17
+    vsub.s16        q1, q11, q13
+    vadd.s16        q13, q11, q13
+    vsub.s16        q5, q9, q15
+    vadd.s16        q15, q9, q15
+    vqdmulh.s16     q4, q2, XFIX_1_414213562
+    vqdmulh.s16     q6, q1, XFIX_2_613125930
+    vadd.s16        q3, q1, q1
+    vsub.s16        q1, q5, q1
+    vadd.s16        q10, q2, q4
+    vqdmulh.s16     q4, q1, XFIX_1_847759065
+    vsub.s16        q2, q15, q13
+    vadd.s16        q3, q3, q6
+    vqdmulh.s16     q6, q2, XFIX_1_414213562
+    vadd.s16        q1, q1, q4
+    vqdmulh.s16     q4, q5, XFIX_1_082392200
+    vsub.s16        q10, q10, q14
+    vadd.s16        q2, q2, q6
+    vsub.s16        q6, q8, q12
+    vadd.s16        q12, q8, q12
+    vadd.s16        q9, q5, q4
+    vadd.s16        q5, q6, q10
+    vsub.s16        q10, q6, q10
+    vadd.s16        q6, q15, q13
+    vadd.s16        q8, q12, q14
+    vsub.s16        q3, q6, q3
+    vsub.s16        q12, q12, q14
+    vsub.s16        q3, q3, q1
+    vsub.s16        q1, q9, q1
+    vadd.s16        q2, q3, q2
+    vsub.s16        q15, q8, q6
+    vadd.s16        q1, q1, q2
+    vadd.s16        q8, q8, q6
+    vadd.s16        q14, q5, q3
+    vsub.s16        q9, q5, q3
+    vsub.s16        q13, q10, q2
+    vpop            {d8 - d13}    /* restore Neon registers */
+    vadd.s16        q10, q10, q2
+    vsub.s16        q11, q12, q1
+    vadd.s16        q12, q12, q1
+    /* Descale to 8-bit and range limit */
+    vmov.u8         q0, #0x80
+    vqshrn.s16      d16, q8, #5
+    vqshrn.s16      d17, q9, #5
+    vqshrn.s16      d18, q10, #5
+    vqshrn.s16      d19, q11, #5
+    vqshrn.s16      d20, q12, #5
+    vqshrn.s16      d21, q13, #5
+    vqshrn.s16      d22, q14, #5
+    vqshrn.s16      d23, q15, #5
+    vadd.u8         q8, q8, q0
+    vadd.u8         q9, q9, q0
+    vadd.u8         q10, q10, q0
+    vadd.u8         q11, q11, q0
+    /* Transpose the final 8-bit samples */
+    vtrn.16         q8, q9
+    vtrn.16         q10, q11
+    vtrn.32         q8, q10
+    vtrn.32         q9, q11
+    vtrn.8          d16, d17
+    vtrn.8          d18, d19
+      /* Store results to the output buffer */
+      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
+      add             TMP1, TMP1, OUTPUT_COL
+      add             TMP2, TMP2, OUTPUT_COL
+      vst1.8          {d16}, [TMP1]
+      vst1.8          {d17}, [TMP2]
+      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
+      add             TMP1, TMP1, OUTPUT_COL
+      add             TMP2, TMP2, OUTPUT_COL
+      vst1.8          {d18}, [TMP1]
+    vtrn.8          d20, d21
+      vst1.8          {d19}, [TMP2]
+      ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
+      add             TMP1, TMP1, OUTPUT_COL
+      add             TMP2, TMP2, OUTPUT_COL
+      add             TMP3, TMP3, OUTPUT_COL
+      add             TMP4, TMP4, OUTPUT_COL
+      vst1.8          {d20}, [TMP1]
+    vtrn.8          d22, d23
+      vst1.8          {d21}, [TMP2]
+      vst1.8          {d22}, [TMP3]
+      vst1.8          {d23}, [TMP4]
+    bx              lr
+
+    .unreq          DCT_TABLE
+    .unreq          COEF_BLOCK
+    .unreq          OUTPUT_BUF
+    .unreq          OUTPUT_COL
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMP4
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_extrgb_ycc_convert_neon
+ * jsimd_extbgr_ycc_convert_neon
+ * jsimd_extrgbx_ycc_convert_neon
+ * jsimd_extbgrx_ycc_convert_neon
+ * jsimd_extxbgr_ycc_convert_neon
+ * jsimd_extxrgb_ycc_convert_neon
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+.macro do_store size
+  .if \size == 8
+    vst1.8          {d20}, [Y]!
+    vst1.8          {d21}, [U]!
+    vst1.8          {d22}, [V]!
+  .elseif \size == 4
+    vst1.8          {d20[0]}, [Y]!
+    vst1.8          {d20[1]}, [Y]!
+    vst1.8          {d20[2]}, [Y]!
+    vst1.8          {d20[3]}, [Y]!
+    vst1.8          {d21[0]}, [U]!
+    vst1.8          {d21[1]}, [U]!
+    vst1.8          {d21[2]}, [U]!
+    vst1.8          {d21[3]}, [U]!
+    vst1.8          {d22[0]}, [V]!
+    vst1.8          {d22[1]}, [V]!
+    vst1.8          {d22[2]}, [V]!
+    vst1.8          {d22[3]}, [V]!
+  .elseif \size == 2
+    vst1.8          {d20[4]}, [Y]!
+    vst1.8          {d20[5]}, [Y]!
+    vst1.8          {d21[4]}, [U]!
+    vst1.8          {d21[5]}, [U]!
+    vst1.8          {d22[4]}, [V]!
+    vst1.8          {d22[5]}, [V]!
+  .elseif \size == 1
+    vst1.8          {d20[6]}, [Y]!
+    vst1.8          {d21[6]}, [U]!
+    vst1.8          {d22[6]}, [V]!
+  .else
+    .error unsupported macroblock size
+  .endif
+.endm
+
+.macro do_load bpp, size
+  .if \bpp == 24
+    .if \size == 8
+      vld3.8        {d10, d11, d12}, [RGB]!
+      pld           [RGB, #128]
+    .elseif \size == 4
+      vld3.8        {d10[0], d11[0], d12[0]}, [RGB]!
+      vld3.8        {d10[1], d11[1], d12[1]}, [RGB]!
+      vld3.8        {d10[2], d11[2], d12[2]}, [RGB]!
+      vld3.8        {d10[3], d11[3], d12[3]}, [RGB]!
+    .elseif \size == 2
+      vld3.8        {d10[4], d11[4], d12[4]}, [RGB]!
+      vld3.8        {d10[5], d11[5], d12[5]}, [RGB]!
+    .elseif \size == 1
+      vld3.8        {d10[6], d11[6], d12[6]}, [RGB]!
+    .else
+      .error unsupported macroblock size
+    .endif
+  .elseif \bpp == 32
+    .if \size == 8
+      vld4.8        {d10, d11, d12, d13}, [RGB]!
+      pld           [RGB, #128]
+    .elseif \size == 4
+      vld4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
+      vld4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
+      vld4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
+      vld4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
+    .elseif \size == 2
+      vld4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
+      vld4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
+    .elseif \size == 1
+      vld4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
+    .else
+      .error unsupported macroblock size
+    .endif
+  .else
+    .error unsupported bpp
+  .endif
+.endm
+
+.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
+
+/*
+ * 2-stage pipelined RGB->YCbCr conversion
+ */
+
+.macro do_rgb_to_yuv_stage1
+    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
+    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
+    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
+    vmull.u16       q7, d4, d0[0]
+    vmlal.u16       q7, d6, d0[1]
+    vmlal.u16       q7, d8, d0[2]
+    vmull.u16       q8, d5, d0[0]
+    vmlal.u16       q8, d7, d0[1]
+    vmlal.u16       q8, d9, d0[2]
+    vrev64.32       q9, q1
+    vrev64.32       q13, q1
+    vmlsl.u16       q9, d4, d0[3]
+    vmlsl.u16       q9, d6, d1[0]
+    vmlal.u16       q9, d8, d1[1]
+    vmlsl.u16       q13, d5, d0[3]
+    vmlsl.u16       q13, d7, d1[0]
+    vmlal.u16       q13, d9, d1[1]
+    vrev64.32       q14, q1
+    vrev64.32       q15, q1
+    vmlal.u16       q14, d4, d1[1]
+    vmlsl.u16       q14, d6, d1[2]
+    vmlsl.u16       q14, d8, d1[3]
+    vmlal.u16       q15, d5, d1[1]
+    vmlsl.u16       q15, d7, d1[2]
+    vmlsl.u16       q15, d9, d1[3]
+.endm
+
+.macro do_rgb_to_yuv_stage2
+    vrshrn.u32      d20, q7, #16
+    vrshrn.u32      d21, q8, #16
+    vshrn.u32       d22, q9, #16
+    vshrn.u32       d23, q13, #16
+    vshrn.u32       d24, q14, #16
+    vshrn.u32       d25, q15, #16
+    vmovn.u16       d20, q10       /* d20 = y */
+    vmovn.u16       d21, q11       /* d21 = u */
+    vmovn.u16       d22, q12       /* d22 = v */
+.endm
+
+.macro do_rgb_to_yuv
+    do_rgb_to_yuv_stage1
+    do_rgb_to_yuv_stage2
+.endm
+
+.macro do_rgb_to_yuv_stage2_store_load_stage1
+      vrshrn.u32      d20, q7, #16
+      vrshrn.u32      d21, q8, #16
+      vshrn.u32       d22, q9, #16
+    vrev64.32       q9, q1
+      vshrn.u32       d23, q13, #16
+    vrev64.32       q13, q1
+      vshrn.u32       d24, q14, #16
+      vshrn.u32       d25, q15, #16
+    do_load         \bpp, 8
+      vmovn.u16       d20, q10     /* d20 = y */
+    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
+      vmovn.u16       d21, q11     /* d21 = u */
+    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
+      vmovn.u16       d22, q12     /* d22 = v */
+    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
+    vmull.u16       q7, d4, d0[0]
+    vmlal.u16       q7, d6, d0[1]
+    vmlal.u16       q7, d8, d0[2]
+      vst1.8          {d20}, [Y]!
+    vmull.u16       q8, d5, d0[0]
+    vmlal.u16       q8, d7, d0[1]
+    vmlal.u16       q8, d9, d0[2]
+    vmlsl.u16       q9, d4, d0[3]
+    vmlsl.u16       q9, d6, d1[0]
+    vmlal.u16       q9, d8, d1[1]
+      vst1.8          {d21}, [U]!
+    vmlsl.u16       q13, d5, d0[3]
+    vmlsl.u16       q13, d7, d1[0]
+    vmlal.u16       q13, d9, d1[1]
+    vrev64.32       q14, q1
+    vrev64.32       q15, q1
+    vmlal.u16       q14, d4, d1[1]
+    vmlsl.u16       q14, d6, d1[2]
+    vmlsl.u16       q14, d8, d1[3]
+      vst1.8          {d22}, [V]!
+    vmlal.u16       q15, d5, d1[1]
+    vmlsl.u16       q15, d7, d1[2]
+    vmlsl.u16       q15, d9, d1[3]
+.endm
+
+.balign 16
+jsimd_\colorid\()_ycc_neon_consts:
+  .short 19595, 38470, 7471,  11059
+  .short 21709, 32768, 27439, 5329
+  .short 32767, 128,   32767, 128
+  .short 32767, 128,   32767, 128
+
+asm_function jsimd_\colorid\()_ycc_convert_neon
+    OUTPUT_WIDTH    .req r0
+    INPUT_BUF       .req r1
+    OUTPUT_BUF      .req r2
+    OUTPUT_ROW      .req r3
+    NUM_ROWS        .req r4
+
+    OUTPUT_BUF0     .req r5
+    OUTPUT_BUF1     .req r6
+    OUTPUT_BUF2     .req OUTPUT_BUF
+
+    RGB             .req r7
+    Y               .req r8
+    U               .req r9
+    V               .req r10
+    N               .req ip
+
+    /* Load constants to d0, d1, d2, d3 */
+    adr             ip, jsimd_\colorid\()_ycc_neon_consts
+    vld1.16         {d0, d1, d2, d3}, [ip, :128]
+
+    /* Save Arm registers and handle input arguments */
+    push            {r4, r5, r6, r7, r8, r9, r10, lr}
+    ldr             NUM_ROWS, [sp, #(4 * 8)]
+    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
+    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]
+    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]
+    .unreq          OUTPUT_BUF
+
+    /* Save Neon registers */
+    vpush           {d8 - d15}
+
+    /* Outer loop over scanlines */
+    cmp             NUM_ROWS, #1
+    blt             9f
+0:
+    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
+    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
+    mov             N, OUTPUT_WIDTH
+    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
+    add             OUTPUT_ROW, OUTPUT_ROW, #1
+    ldr             RGB, [INPUT_BUF], #4
+
+    /* Inner loop over pixels */
+    subs            N, N, #8
+    blt             3f
+    do_load         \bpp, 8
+    do_rgb_to_yuv_stage1
+    subs            N, N, #8
+    blt             2f
+1:
+    do_rgb_to_yuv_stage2_store_load_stage1
+    subs            N, N, #8
+    bge             1b
+2:
+    do_rgb_to_yuv_stage2
+    do_store        8
+    tst             N, #7
+    beq             8f
+3:
+    tst             N, #4
+    beq             3f
+    do_load         \bpp, 4
+3:
+    tst             N, #2
+    beq             4f
+    do_load         \bpp, 2
+4:
+    tst             N, #1
+    beq             5f
+    do_load         \bpp, 1
+5:
+    do_rgb_to_yuv
+    tst             N, #4
+    beq             6f
+    do_store        4
+6:
+    tst             N, #2
+    beq             7f
+    do_store        2
+7:
+    tst             N, #1
+    beq             8f
+    do_store        1
+8:
+    subs            NUM_ROWS, NUM_ROWS, #1
+    bgt             0b
+9:
+    /* Restore all registers and return */
+    vpop            {d8 - d15}
+    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
+
+    .unreq          OUTPUT_WIDTH
+    .unreq          OUTPUT_ROW
+    .unreq          INPUT_BUF
+    .unreq          NUM_ROWS
+    .unreq          OUTPUT_BUF0
+    .unreq          OUTPUT_BUF1
+    .unreq          OUTPUT_BUF2
+    .unreq          RGB
+    .unreq          Y
+    .unreq          U
+    .unreq          V
+    .unreq          N
+
+.purgem do_rgb_to_yuv
+.purgem do_rgb_to_yuv_stage1
+.purgem do_rgb_to_yuv_stage2
+.purgem do_rgb_to_yuv_stage2_store_load_stage1
+
+.endm
+
+/*--------------------------------- id ----- bpp R  G  B */
+generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
+generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
+generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
+generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
+generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
+generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
+
+.purgem do_load
+.purgem do_store
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jccolext-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jccolext-neon.c
new file mode 100644
index 0000000000..37130c225e
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jccolext-neon.c
@@ -0,0 +1,316 @@
+/*
+ * jccolext-neon.c - colorspace conversion (64-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-neon.c */
+
+
+/* RGB -> YCbCr conversion is defined by the following equations:
+ *    Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+ *    Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128
+ *    Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B  + 128
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ *    0.29899597 = 19595 * 2^-16
+ *    0.58700561 = 38470 * 2^-16
+ *    0.11399841 =  7471 * 2^-16
+ *    0.16874695 = 11059 * 2^-16
+ *    0.33125305 = 21709 * 2^-16
+ *    0.50000000 = 32768 * 2^-16
+ *    0.41868592 = 27439 * 2^-16
+ *    0.08131409 =  5329 * 2^-16
+ * These constants are defined in jccolor-neon.c
+ *
+ * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
+ * rounds up or down the result via integer truncation.
+ */
+
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+                                JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                int num_rows)
+{
+  /* Pointer to RGB(X/A) input data */
+  JSAMPROW inptr;
+  /* Pointers to Y, Cb, and Cr output data */
+  JSAMPROW outptr0, outptr1, outptr2;
+  /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
+  ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
+
+  /* Set up conversion constants. */
+  const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
+  const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+
+    int cols_remaining = image_width;
+    for (; cols_remaining >= 16; cols_remaining -= 16) {
+
+#if RGB_PIXELSIZE == 4
+      uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+      uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+      uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+      uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
+      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
+      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
+      uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
+      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
+      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
+      uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
+      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
+      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
+      uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
+      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
+      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
+
+      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
+      uint32x4_t cb_ll = scaled_128_5;
+      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
+      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
+      cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
+      uint32x4_t cb_lh = scaled_128_5;
+      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
+      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
+      cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
+      uint32x4_t cb_hl = scaled_128_5;
+      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
+      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
+      cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
+      uint32x4_t cb_hh = scaled_128_5;
+      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
+      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
+      cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
+
+      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
+      uint32x4_t cr_ll = scaled_128_5;
+      cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
+      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
+      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
+      uint32x4_t cr_lh = scaled_128_5;
+      cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
+      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
+      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
+      uint32x4_t cr_hl = scaled_128_5;
+      cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
+      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
+      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
+      uint32x4_t cr_hh = scaled_128_5;
+      cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
+      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
+      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+                                    vrshrn_n_u32(y_lh, 16));
+      uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+                                    vrshrn_n_u32(y_hh, 16));
+      /* Descale Cb values (right shift) and narrow to 16-bit. */
+      uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
+                                     vshrn_n_u32(cb_lh, 16));
+      uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
+                                     vshrn_n_u32(cb_hh, 16));
+      /* Descale Cr values (right shift) and narrow to 16-bit. */
+      uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
+                                     vshrn_n_u32(cr_lh, 16));
+      uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
+                                     vshrn_n_u32(cr_hh, 16));
+      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
+       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+      vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
+      vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
+
+      /* Increment pointers. */
+      inptr += (16 * RGB_PIXELSIZE);
+      outptr0 += 16;
+      outptr1 += 16;
+      outptr2 += 16;
+    }
+
+    if (cols_remaining > 8) {
+      /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 16) columns of data are first memcopied to a temporary
+       * buffer large enough to accommodate the vector load.
+       */
+      memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+      inptr = tmp_buf;
+
+#if RGB_PIXELSIZE == 4
+      uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+      uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+      uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+      uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
+      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
+      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
+      uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
+      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
+      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
+      uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
+      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
+      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
+      uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
+      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
+      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
+
+      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
+      uint32x4_t cb_ll = scaled_128_5;
+      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
+      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
+      cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
+      uint32x4_t cb_lh = scaled_128_5;
+      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
+      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
+      cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
+      uint32x4_t cb_hl = scaled_128_5;
+      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
+      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
+      cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
+      uint32x4_t cb_hh = scaled_128_5;
+      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
+      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
+      cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
+
+      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
+      uint32x4_t cr_ll = scaled_128_5;
+      cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
+      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
+      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
+      uint32x4_t cr_lh = scaled_128_5;
+      cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
+      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
+      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
+      uint32x4_t cr_hl = scaled_128_5;
+      cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
+      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
+      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
+      uint32x4_t cr_hh = scaled_128_5;
+      cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
+      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
+      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+                                    vrshrn_n_u32(y_lh, 16));
+      uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+                                    vrshrn_n_u32(y_hh, 16));
+      /* Descale Cb values (right shift) and narrow to 16-bit. */
+      uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
+                                     vshrn_n_u32(cb_lh, 16));
+      uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
+                                     vshrn_n_u32(cb_hh, 16));
+      /* Descale Cr values (right shift) and narrow to 16-bit. */
+      uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
+                                     vshrn_n_u32(cr_lh, 16));
+      uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
+                                     vshrn_n_u32(cr_hh, 16));
+      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
+       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+      vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
+      vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
+
+    } else if (cols_remaining > 0) {
+      /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 8) columns of data are first memcopied to a temporary
+       * buffer large enough to accommodate the vector load.
+       */
+      memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+      inptr = tmp_buf;
+
+#if RGB_PIXELSIZE == 4
+      uint8x8x4_t input_pixels = vld4_u8(inptr);
+#else
+      uint8x8x3_t input_pixels = vld3_u8(inptr);
+#endif
+      uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
+      uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
+      uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_l = vmull_laneq_u16(vget_low_u16(r), consts, 0);
+      y_l = vmlal_laneq_u16(y_l, vget_low_u16(g), consts, 1);
+      y_l = vmlal_laneq_u16(y_l, vget_low_u16(b), consts, 2);
+      uint32x4_t y_h = vmull_laneq_u16(vget_high_u16(r), consts, 0);
+      y_h = vmlal_laneq_u16(y_h, vget_high_u16(g), consts, 1);
+      y_h = vmlal_laneq_u16(y_h, vget_high_u16(b), consts, 2);
+
+      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
+      uint32x4_t cb_l = scaled_128_5;
+      cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(r), consts, 3);
+      cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(g), consts, 4);
+      cb_l = vmlal_laneq_u16(cb_l, vget_low_u16(b), consts, 5);
+      uint32x4_t cb_h = scaled_128_5;
+      cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(r), consts, 3);
+      cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(g), consts, 4);
+      cb_h = vmlal_laneq_u16(cb_h, vget_high_u16(b), consts, 5);
+
+      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
+      uint32x4_t cr_l = scaled_128_5;
+      cr_l = vmlal_laneq_u16(cr_l, vget_low_u16(r), consts, 5);
+      cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(g), consts, 6);
+      cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(b), consts, 7);
+      uint32x4_t cr_h = scaled_128_5;
+      cr_h = vmlal_laneq_u16(cr_h, vget_high_u16(r), consts, 5);
+      cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(g), consts, 6);
+      cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(b), consts, 7);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_l, 16),
+                                      vrshrn_n_u32(y_h, 16));
+      /* Descale Cb values (right shift) and narrow to 16-bit. */
+      uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_l, 16),
+                                       vshrn_n_u32(cb_h, 16));
+      /* Descale Cr values (right shift) and narrow to 16-bit. */
+      uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_l, 16),
+                                       vshrn_n_u32(cr_h, 16));
+      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
+       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1_u8(outptr0, vmovn_u16(y_u16));
+      vst1_u8(outptr1, vmovn_u16(cb_u16));
+      vst1_u8(outptr2, vmovn_u16(cr_u16));
+    }
+  }
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jchuff-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jchuff-neon.c
new file mode 100644
index 0000000000..607a116070
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jchuff-neon.c
@@ -0,0 +1,411 @@
+/*
+ * jchuff-neon.c - Huffman entropy encoding (64-bit Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, 2022, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "../align.h"
+#include "../jchuff.h"
+#include "neon-compat.h"
+
+#include <limits.h>
+
+#include <arm_neon.h>
+
+
+ALIGN(16) static const uint8_t jsimd_huff_encode_one_block_consts[] = {
+    0,   1,   2,   3,  16,  17,  32,  33,
+   18,  19,   4,   5,   6,   7,  20,  21,
+   34,  35,  48,  49, 255, 255,  50,  51,
+   36,  37,  22,  23,   8,   9,  10,  11,
+  255, 255,   6,   7,  20,  21,  34,  35,
+   48,  49, 255, 255,  50,  51,  36,  37,
+   54,  55,  40,  41,  26,  27,  12,  13,
+   14,  15,  28,  29,  42,  43,  56,  57,
+    6,   7,  20,  21,  34,  35,  48,  49,
+   50,  51,  36,  37,  22,  23,   8,   9,
+   26,  27,  12,  13, 255, 255,  14,  15,
+   28,  29,  42,  43,  56,  57, 255, 255,
+   52,  53,  54,  55,  40,  41,  26,  27,
+   12,  13, 255, 255,  14,  15,  28,  29,
+   26,  27,  40,  41,  42,  43,  28,  29,
+   14,  15,  30,  31,  44,  45,  46,  47
+};
+
+/* The AArch64 implementation of the FLUSH() macro triggers a UBSan misaligned
+ * address warning because the macro sometimes writes a 64-bit value to a
+ * non-64-bit-aligned address.  That behavior is technically undefined per
+ * the C specification, but it is supported by the AArch64 architecture and
+ * compilers.
+ */
+#if defined(__has_feature)
+#if __has_feature(undefined_behavior_sanitizer)
+__attribute__((no_sanitize("alignment")))
+#endif
+#endif
+JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
+                                         JCOEFPTR block, int last_dc_val,
+                                         c_derived_tbl *dctbl,
+                                         c_derived_tbl *actbl)
+{
+  uint16_t block_diff[DCTSIZE2];
+
+  /* Load lookup table indices for rows of zig-zag ordering. */
+#ifdef HAVE_VLD1Q_U8_X4
+  const uint8x16x4_t idx_rows_0123 =
+    vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE);
+  const uint8x16x4_t idx_rows_4567 =
+    vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE);
+#else
+  /* GCC does not currently support intrinsics vl1dq_<type>_x4(). */
+  const uint8x16x4_t idx_rows_0123 = { {
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 2 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 4 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 6 * DCTSIZE)
+  } };
+  const uint8x16x4_t idx_rows_4567 = { {
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 10 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 12 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 14 * DCTSIZE)
+  } };
+#endif
+
+  /* Load 8x8 block of DCT coefficients. */
+#ifdef HAVE_VLD1Q_U8_X4
+  const int8x16x4_t tbl_rows_0123 =
+    vld1q_s8_x4((int8_t *)(block + 0 * DCTSIZE));
+  const int8x16x4_t tbl_rows_4567 =
+    vld1q_s8_x4((int8_t *)(block + 4 * DCTSIZE));
+#else
+  const int8x16x4_t tbl_rows_0123 = { {
+    vld1q_s8((int8_t *)(block + 0 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 1 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 2 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 3 * DCTSIZE))
+  } };
+  const int8x16x4_t tbl_rows_4567 = { {
+    vld1q_s8((int8_t *)(block + 4 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 5 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 6 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 7 * DCTSIZE))
+  } };
+#endif
+
+  /* Initialise extra lookup tables. */
+  const int8x16x4_t tbl_rows_2345 = { {
+    tbl_rows_0123.val[2], tbl_rows_0123.val[3],
+    tbl_rows_4567.val[0], tbl_rows_4567.val[1]
+  } };
+  const int8x16x3_t tbl_rows_567 =
+    { { tbl_rows_4567.val[1], tbl_rows_4567.val[2], tbl_rows_4567.val[3] } };
+
+  /* Shuffle coefficients into zig-zag order. */
+  int16x8_t row0 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[0]));
+  int16x8_t row1 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[1]));
+  int16x8_t row2 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_0123.val[2]));
+  int16x8_t row3 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[3]));
+  int16x8_t row4 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[0]));
+  int16x8_t row5 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_4567.val[1]));
+  int16x8_t row6 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[2]));
+  int16x8_t row7 =
+    vreinterpretq_s16_s8(vqtbl3q_s8(tbl_rows_567, idx_rows_4567.val[3]));
+
+  /* Compute DC coefficient difference value (F.1.1.5.1). */
+  row0 = vsetq_lane_s16(block[0] - last_dc_val, row0, 0);
+  /* Initialize AC coefficient lanes not reachable by lookup tables. */
+  row1 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[0]),
+                                  0), row1, 2);
+  row2 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
+                                  4), row2, 0);
+  row2 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
+                                  0), row2, 5);
+  row5 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
+                                  7), row5, 2);
+  row5 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
+                                  3), row5, 7);
+  row6 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[3]),
+                                  7), row6, 5);
+
+  /* DCT block is now in zig-zag order; start Huffman encoding process. */
+
+  /* Construct bitmap to accelerate encoding of AC coefficients.  A set bit
+   * means that the corresponding coefficient != 0.
+   */
+  uint16x8_t row0_ne_0 = vtstq_s16(row0, row0);
+  uint16x8_t row1_ne_0 = vtstq_s16(row1, row1);
+  uint16x8_t row2_ne_0 = vtstq_s16(row2, row2);
+  uint16x8_t row3_ne_0 = vtstq_s16(row3, row3);
+  uint16x8_t row4_ne_0 = vtstq_s16(row4, row4);
+  uint16x8_t row5_ne_0 = vtstq_s16(row5, row5);
+  uint16x8_t row6_ne_0 = vtstq_s16(row6, row6);
+  uint16x8_t row7_ne_0 = vtstq_s16(row7, row7);
+
+  uint8x16_t row10_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row1_ne_0),
+                                    vreinterpretq_u8_u16(row0_ne_0));
+  uint8x16_t row32_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row3_ne_0),
+                                    vreinterpretq_u8_u16(row2_ne_0));
+  uint8x16_t row54_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row5_ne_0),
+                                    vreinterpretq_u8_u16(row4_ne_0));
+  uint8x16_t row76_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row7_ne_0),
+                                    vreinterpretq_u8_u16(row6_ne_0));
+
+  /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
+  const uint8x16_t bitmap_mask =
+    vreinterpretq_u8_u64(vdupq_n_u64(0x0102040810204080));
+
+  uint8x16_t bitmap_rows_10 = vandq_u8(row10_ne_0, bitmap_mask);
+  uint8x16_t bitmap_rows_32 = vandq_u8(row32_ne_0, bitmap_mask);
+  uint8x16_t bitmap_rows_54 = vandq_u8(row54_ne_0, bitmap_mask);
+  uint8x16_t bitmap_rows_76 = vandq_u8(row76_ne_0, bitmap_mask);
+
+  uint8x16_t bitmap_rows_3210 = vpaddq_u8(bitmap_rows_32, bitmap_rows_10);
+  uint8x16_t bitmap_rows_7654 = vpaddq_u8(bitmap_rows_76, bitmap_rows_54);
+  uint8x16_t bitmap_rows_76543210 = vpaddq_u8(bitmap_rows_7654,
+                                              bitmap_rows_3210);
+  uint8x8_t bitmap_all = vpadd_u8(vget_low_u8(bitmap_rows_76543210),
+                                  vget_high_u8(bitmap_rows_76543210));
+
+  /* Shift left to remove DC bit. */
+  bitmap_all =
+    vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap_all), 1));
+  /* Count bits set (number of non-zero coefficients) in bitmap. */
+  unsigned int non_zero_coefficients = vaddv_u8(vcnt_u8(bitmap_all));
+  /* Move bitmap to 64-bit scalar register. */
+  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+
+  /* Set up state and bit buffer for output bitstream. */
+  working_state *state_ptr = (working_state *)state;
+  int free_bits = state_ptr->cur.free_bits;
+  size_t put_buffer = state_ptr->cur.put_buffer;
+
+  /* Encode DC coefficient. */
+
+  /* For negative coeffs: diff = abs(coeff) -1 = ~abs(coeff) */
+  int16x8_t abs_row0 = vabsq_s16(row0);
+  int16x8_t row0_lz = vclzq_s16(abs_row0);
+  uint16x8_t row0_mask = vshlq_u16(vcltzq_s16(row0), vnegq_s16(row0_lz));
+  uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
+  /* Find nbits required to specify sign and amplitude of coefficient. */
+  unsigned int lz = vgetq_lane_u16(vreinterpretq_u16_s16(row0_lz), 0);
+  unsigned int nbits = 16 - lz;
+  /* Emit Huffman-coded symbol and additional diff bits. */
+  unsigned int diff = vgetq_lane_u16(row0_diff, 0);
+  PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
+
+  /* Encode AC coefficients. */
+
+  unsigned int r = 0;  /* r = run length of zeros */
+  unsigned int i = 1;  /* i = number of coefficients encoded */
+  /* Code and size information for a run length of 16 zero coefficients */
+  const unsigned int code_0xf0 = actbl->ehufco[0xf0];
+  const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
+
+  /* The most efficient method of computing nbits and diff depends on the
+   * number of non-zero coefficients.  If the bitmap is not too sparse (> 8
+   * non-zero AC coefficients), it is beneficial to do all of the work using
+   * Neon; else we do some of the work using Neon and the rest on demand using
+   * scalar code.
+   */
+  if (non_zero_coefficients > 8) {
+    uint8_t block_nbits[DCTSIZE2];
+
+    int16x8_t abs_row1 = vabsq_s16(row1);
+    int16x8_t abs_row2 = vabsq_s16(row2);
+    int16x8_t abs_row3 = vabsq_s16(row3);
+    int16x8_t abs_row4 = vabsq_s16(row4);
+    int16x8_t abs_row5 = vabsq_s16(row5);
+    int16x8_t abs_row6 = vabsq_s16(row6);
+    int16x8_t abs_row7 = vabsq_s16(row7);
+    int16x8_t row1_lz = vclzq_s16(abs_row1);
+    int16x8_t row2_lz = vclzq_s16(abs_row2);
+    int16x8_t row3_lz = vclzq_s16(abs_row3);
+    int16x8_t row4_lz = vclzq_s16(abs_row4);
+    int16x8_t row5_lz = vclzq_s16(abs_row5);
+    int16x8_t row6_lz = vclzq_s16(abs_row6);
+    int16x8_t row7_lz = vclzq_s16(abs_row7);
+    /* Narrow leading zero count to 8 bits. */
+    uint8x16_t row01_lz = vuzp1q_u8(vreinterpretq_u8_s16(row0_lz),
+                                    vreinterpretq_u8_s16(row1_lz));
+    uint8x16_t row23_lz = vuzp1q_u8(vreinterpretq_u8_s16(row2_lz),
+                                    vreinterpretq_u8_s16(row3_lz));
+    uint8x16_t row45_lz = vuzp1q_u8(vreinterpretq_u8_s16(row4_lz),
+                                    vreinterpretq_u8_s16(row5_lz));
+    uint8x16_t row67_lz = vuzp1q_u8(vreinterpretq_u8_s16(row6_lz),
+                                    vreinterpretq_u8_s16(row7_lz));
+    /* Compute nbits needed to specify magnitude of each coefficient. */
+    uint8x16_t row01_nbits = vsubq_u8(vdupq_n_u8(16), row01_lz);
+    uint8x16_t row23_nbits = vsubq_u8(vdupq_n_u8(16), row23_lz);
+    uint8x16_t row45_nbits = vsubq_u8(vdupq_n_u8(16), row45_lz);
+    uint8x16_t row67_nbits = vsubq_u8(vdupq_n_u8(16), row67_lz);
+    /* Store nbits. */
+    vst1q_u8(block_nbits + 0 * DCTSIZE, row01_nbits);
+    vst1q_u8(block_nbits + 2 * DCTSIZE, row23_nbits);
+    vst1q_u8(block_nbits + 4 * DCTSIZE, row45_nbits);
+    vst1q_u8(block_nbits + 6 * DCTSIZE, row67_nbits);
+    /* Mask bits not required to specify sign and amplitude of diff. */
+    uint16x8_t row1_mask = vshlq_u16(vcltzq_s16(row1), vnegq_s16(row1_lz));
+    uint16x8_t row2_mask = vshlq_u16(vcltzq_s16(row2), vnegq_s16(row2_lz));
+    uint16x8_t row3_mask = vshlq_u16(vcltzq_s16(row3), vnegq_s16(row3_lz));
+    uint16x8_t row4_mask = vshlq_u16(vcltzq_s16(row4), vnegq_s16(row4_lz));
+    uint16x8_t row5_mask = vshlq_u16(vcltzq_s16(row5), vnegq_s16(row5_lz));
+    uint16x8_t row6_mask = vshlq_u16(vcltzq_s16(row6), vnegq_s16(row6_lz));
+    uint16x8_t row7_mask = vshlq_u16(vcltzq_s16(row7), vnegq_s16(row7_lz));
+    /* diff = abs(coeff) ^ sign(coeff) [no-op for positive coefficients] */
+    uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
+                                     row1_mask);
+    uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
+                                     row2_mask);
+    uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
+                                     row3_mask);
+    uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
+                                     row4_mask);
+    uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
+                                     row5_mask);
+    uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
+                                     row6_mask);
+    uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
+                                     row7_mask);
+    /* Store diff bits. */
+    vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+    vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+    vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+    vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+    vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+    vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+    vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+    vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+    while (bitmap != 0) {
+      r = BUILTIN_CLZLL(bitmap);
+      i += r;
+      bitmap <<= r;
+      nbits = block_nbits[i];
+      diff = block_diff[i];
+      while (r > 15) {
+        /* If run length > 15, emit special run-length-16 codes. */
+        PUT_BITS(code_0xf0, size_0xf0)
+        r -= 16;
+      }
+      /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+      unsigned int rs = (r << 4) + nbits;
+      PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+      i++;
+      bitmap <<= 1;
+    }
+  } else if (bitmap != 0) {
+    uint16_t block_abs[DCTSIZE2];
+    /* Compute and store absolute value of coefficients. */
+    int16x8_t abs_row1 = vabsq_s16(row1);
+    int16x8_t abs_row2 = vabsq_s16(row2);
+    int16x8_t abs_row3 = vabsq_s16(row3);
+    int16x8_t abs_row4 = vabsq_s16(row4);
+    int16x8_t abs_row5 = vabsq_s16(row5);
+    int16x8_t abs_row6 = vabsq_s16(row6);
+    int16x8_t abs_row7 = vabsq_s16(row7);
+    vst1q_u16(block_abs + 0 * DCTSIZE, vreinterpretq_u16_s16(abs_row0));
+    vst1q_u16(block_abs + 1 * DCTSIZE, vreinterpretq_u16_s16(abs_row1));
+    vst1q_u16(block_abs + 2 * DCTSIZE, vreinterpretq_u16_s16(abs_row2));
+    vst1q_u16(block_abs + 3 * DCTSIZE, vreinterpretq_u16_s16(abs_row3));
+    vst1q_u16(block_abs + 4 * DCTSIZE, vreinterpretq_u16_s16(abs_row4));
+    vst1q_u16(block_abs + 5 * DCTSIZE, vreinterpretq_u16_s16(abs_row5));
+    vst1q_u16(block_abs + 6 * DCTSIZE, vreinterpretq_u16_s16(abs_row6));
+    vst1q_u16(block_abs + 7 * DCTSIZE, vreinterpretq_u16_s16(abs_row7));
+    /* Compute diff bits (without nbits mask) and store. */
+    uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
+                                     vcltzq_s16(row1));
+    uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
+                                     vcltzq_s16(row2));
+    uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
+                                     vcltzq_s16(row3));
+    uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
+                                     vcltzq_s16(row4));
+    uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
+                                     vcltzq_s16(row5));
+    uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
+                                     vcltzq_s16(row6));
+    uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
+                                     vcltzq_s16(row7));
+    vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+    vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+    vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+    vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+    vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+    vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+    vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+    vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+    /* Same as above but must mask diff bits and compute nbits on demand. */
+    while (bitmap != 0) {
+      r = BUILTIN_CLZLL(bitmap);
+      i += r;
+      bitmap <<= r;
+      lz = BUILTIN_CLZ(block_abs[i]);
+      nbits = 32 - lz;
+      diff = ((unsigned int)block_diff[i] << lz) >> lz;
+      while (r > 15) {
+        /* If run length > 15, emit special run-length-16 codes. */
+        PUT_BITS(code_0xf0, size_0xf0)
+        r -= 16;
+      }
+      /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+      unsigned int rs = (r << 4) + nbits;
+      PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+      i++;
+      bitmap <<= 1;
+    }
+  }
+
+  /* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
+   * The value of RS for the EOB code is 0.
+   */
+  if (i != 64) {
+    PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+  }
+
+  state_ptr->cur.put_buffer = put_buffer;
+  state_ptr->cur.free_bits = free_bits;
+
+  return buffer;
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jsimd.c b/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jsimd.c
new file mode 100644
index 0000000000..604d5472f6
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jsimd.c
@@ -0,0 +1,1058 @@
+/*
+ * jsimd_arm64.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2020, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 64-bit Arm architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "jconfigint.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+#define JSIMD_FASTLD3  1
+#define JSIMD_FASTST3  2
+#define JSIMD_FASTTBL  4
+
+static unsigned int simd_support = ~0;
+static unsigned int simd_huffman = 1;
+static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 |
+                                    JSIMD_FASTTBL;
+
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT  (1024 * 1024)
+
+LOCAL(int)
+check_cpuinfo(char *buffer, const char *field, char *value)
+{
+  char *p;
+
+  if (*value == 0)
+    return 0;
+  if (strncmp(buffer, field, strlen(field)) != 0)
+    return 0;
+  buffer += strlen(field);
+  while (isspace(*buffer))
+    buffer++;
+
+  /* Check if 'value' is present in the buffer as a separate word */
+  while ((p = strstr(buffer, value))) {
+    if (p > buffer && !isspace(*(p - 1))) {
+      buffer++;
+      continue;
+    }
+    p += strlen(value);
+    if (*p != 0 && !isspace(*p)) {
+      buffer++;
+      continue;
+    }
+    return 1;
+  }
+  return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+  char *buffer = (char *)malloc(bufsize);
+  FILE *fd;
+
+  if (!buffer)
+    return 0;
+
+  fd = fopen("/proc/cpuinfo", "r");
+  if (fd) {
+    while (fgets(buffer, bufsize, fd)) {
+      if (!strchr(buffer, '\n') && !feof(fd)) {
+        /* "impossible" happened - insufficient size of the buffer! */
+        fclose(fd);
+        free(buffer);
+        return 0;
+      }
+      if (check_cpuinfo(buffer, "CPU part", "0xd03") ||
+          check_cpuinfo(buffer, "CPU part", "0xd07"))
+        /* The Cortex-A53 has a slow tbl implementation.  We can gain a few
+           percent speedup by disabling the use of that instruction.  The
+           speedup on Cortex-A57 is more subtle but still measurable. */
+        simd_features &= ~JSIMD_FASTTBL;
+      else if (check_cpuinfo(buffer, "CPU part", "0x0a1"))
+        /* The SIMD version of Huffman encoding is slower than the C version on
+           Cavium ThunderX.  Also, ld3 and st3 are abyssmally slow on that
+           CPU. */
+        simd_huffman = simd_features = 0;
+    }
+    fclose(fd);
+  }
+  free(buffer);
+  return 1;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+
+/*
+ * Armv8 architectures support Neon extensions by default.
+ * It is no longer optional as it was with Armv7.
+ */
+
+
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char env[2] = { 0 };
+#endif
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+  simd_support |= JSIMD_NEON;
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  while (!parse_proc_cpuinfo(bufsize)) {
+    bufsize *= 2;
+    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+      break;
+  }
+#endif
+
+#ifndef NO_GETENV
+  /* Force different settings through environment variables */
+  if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && !strcmp(env, "1"))
+    simd_support = JSIMD_NEON;
+  if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
+    simd_support = 0;
+  if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
+    simd_huffman = 0;
+  if (!GETENV_S(env, 2, "JSIMD_FASTLD3") && !strcmp(env, "1"))
+    simd_features |= JSIMD_FASTLD3;
+  if (!GETENV_S(env, 2, "JSIMD_FASTLD3") && !strcmp(env, "0"))
+    simd_features &= ~JSIMD_FASTLD3;
+  if (!GETENV_S(env, 2, "JSIMD_FASTST3") && !strcmp(env, "1"))
+    simd_features |= JSIMD_FASTST3;
+  if (!GETENV_S(env, 2, "JSIMD_FASTST3") && !strcmp(env, "0"))
+    simd_features &= ~JSIMD_FASTST3;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+#ifndef NEON_INTRINSICS
+    if (simd_features & JSIMD_FASTLD3)
+#endif
+      neonfct = jsimd_extrgb_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
+    else
+      neonfct = jsimd_extrgb_ycc_convert_neon_slowld3;
+#endif
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_extrgbx_ycc_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+#ifndef NEON_INTRINSICS
+    if (simd_features & JSIMD_FASTLD3)
+#endif
+      neonfct = jsimd_extbgr_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
+    else
+      neonfct = jsimd_extbgr_ycc_convert_neon_slowld3;
+#endif
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_extbgrx_ycc_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_extxbgr_ycc_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_extxrgb_ycc_convert_neon;
+    break;
+  default:
+#ifndef NEON_INTRINSICS
+    if (simd_features & JSIMD_FASTLD3)
+#endif
+      neonfct = jsimd_extrgb_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
+    else
+      neonfct = jsimd_extrgb_ycc_convert_neon_slowld3;
+#endif
+    break;
+  }
+
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    neonfct = jsimd_extrgb_gray_convert_neon;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_extrgbx_gray_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_extbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_extbgrx_gray_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_extxbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_extxrgb_gray_convert_neon;
+    break;
+  default:
+    neonfct = jsimd_extrgb_gray_convert_neon;
+    break;
+  }
+
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+#ifndef NEON_INTRINSICS
+    if (simd_features & JSIMD_FASTST3)
+#endif
+      neonfct = jsimd_ycc_extrgb_convert_neon;
+#ifndef NEON_INTRINSICS
+    else
+      neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
+#endif
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_ycc_extrgbx_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+#ifndef NEON_INTRINSICS
+    if (simd_features & JSIMD_FASTST3)
+#endif
+      neonfct = jsimd_ycc_extbgr_convert_neon;
+#ifndef NEON_INTRINSICS
+    else
+      neonfct = jsimd_ycc_extbgr_convert_neon_slowst3;
+#endif
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_ycc_extbgrx_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_ycc_extxbgr_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_ycc_extxrgb_convert_neon;
+    break;
+  default:
+#ifndef NEON_INTRINSICS
+    if (simd_features & JSIMD_FASTST3)
+#endif
+      neonfct = jsimd_ycc_extrgb_convert_neon;
+#ifndef NEON_INTRINSICS
+    else
+      neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
+#endif
+    break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+  jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
+                                output_buf, num_rows);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGR:
+      neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
+      break;
+    default:
+      neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+      break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGR:
+      neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
+      break;
+    default:
+      neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+      break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  jsimd_convsamp_neon(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  jsimd_fdct_islow_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  jsimd_quantize_neon(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON && simd_huffman)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+#ifndef NEON_INTRINSICS
+  if (simd_features & JSIMD_FASTTBL)
+#endif
+    return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+                                            dctbl, actbl);
+#ifndef NEON_INTRINSICS
+  else
+    return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
+                                                    last_dc_val, dctbl, actbl);
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (SIZEOF_SIZE_T != 8)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, JCOEF *values, size_t *zerobits)
+{
+  jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
+                                         Sl, Al, values, zerobits);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (SIZEOF_SIZE_T != 8)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, JCOEF *absvalues, size_t *bits)
+{
+  return jsimd_encode_mcu_AC_refine_prepare_neon(block,
+                                                 jpeg_natural_order_start,
+                                                 Sl, Al, absvalues, bits);
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jsimd_neon.S b/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jsimd_neon.S
new file mode 100644
index 0000000000..738a4f0658
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jsimd_neon.S
@@ -0,0 +1,2254 @@
+/*
+ * Armv8 Neon optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
+ *                          All Rights Reserved.
+ * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ * Copyright (C) 2013-2014, Linaro Limited.  All Rights Reserved.
+ * Author:  Ragesh Radhakrishnan <ragesh.r@linaro.org>
+ * Copyright (C) 2014-2016, 2020, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
+ * Copyright (C) 2016, Siarhei Siamashka.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
+#endif
+
+#if defined(__APPLE__)
+.section __DATA, __const
+#elif defined(_WIN32)
+.section .rdata
+#else
+.section .rodata, "a", %progbits
+#endif
+
+/* Constants for jsimd_idct_islow_neon() */
+
+#define F_0_298   2446  /* FIX(0.298631336) */
+#define F_0_390   3196  /* FIX(0.390180644) */
+#define F_0_541   4433  /* FIX(0.541196100) */
+#define F_0_765   6270  /* FIX(0.765366865) */
+#define F_0_899   7373  /* FIX(0.899976223) */
+#define F_1_175   9633  /* FIX(1.175875602) */
+#define F_1_501  12299  /* FIX(1.501321110) */
+#define F_1_847  15137  /* FIX(1.847759065) */
+#define F_1_961  16069  /* FIX(1.961570560) */
+#define F_2_053  16819  /* FIX(2.053119869) */
+#define F_2_562  20995  /* FIX(2.562915447) */
+#define F_3_072  25172  /* FIX(3.072711026) */
+
+.balign 16
+Ljsimd_idct_islow_neon_consts:
+  .short F_0_298
+  .short -F_0_390
+  .short F_0_541
+  .short F_0_765
+  .short - F_0_899
+  .short F_1_175
+  .short F_1_501
+  .short - F_1_847
+  .short - F_1_961
+  .short F_2_053
+  .short - F_2_562
+  .short F_3_072
+  .short 0          /* padding */
+  .short 0
+  .short 0
+  .short 0
+
+#undef F_0_298
+#undef F_0_390
+#undef F_0_541
+#undef F_0_765
+#undef F_0_899
+#undef F_1_175
+#undef F_1_501
+#undef F_1_847
+#undef F_1_961
+#undef F_2_053
+#undef F_2_562
+#undef F_3_072
+
+/* Constants for jsimd_ycc_*_neon() */
+
+.balign 16
+Ljsimd_ycc_rgb_neon_consts:
+  .short 0,      0,     0,      0
+  .short 22971, -11277, -23401, 29033
+  .short -128,  -128,   -128,   -128
+  .short -128,  -128,   -128,   -128
+
+/* Constants for jsimd_*_ycc_neon() */
+
+.balign 16
+Ljsimd_rgb_ycc_neon_consts:
+  .short 19595, 38470, 7471, 11059
+  .short 21709, 32768, 27439, 5329
+  .short 32767, 128, 32767, 128
+  .short 32767, 128, 32767, 128
+
+/* Constants for jsimd_fdct_islow_neon() */
+
+#define F_0_298   2446  /* FIX(0.298631336) */
+#define F_0_390   3196  /* FIX(0.390180644) */
+#define F_0_541   4433  /* FIX(0.541196100) */
+#define F_0_765   6270  /* FIX(0.765366865) */
+#define F_0_899   7373  /* FIX(0.899976223) */
+#define F_1_175   9633  /* FIX(1.175875602) */
+#define F_1_501  12299  /* FIX(1.501321110) */
+#define F_1_847  15137  /* FIX(1.847759065) */
+#define F_1_961  16069  /* FIX(1.961570560) */
+#define F_2_053  16819  /* FIX(2.053119869) */
+#define F_2_562  20995  /* FIX(2.562915447) */
+#define F_3_072  25172  /* FIX(3.072711026) */
+
+.balign 16
+Ljsimd_fdct_islow_neon_consts:
+  .short F_0_298
+  .short -F_0_390
+  .short F_0_541
+  .short F_0_765
+  .short - F_0_899
+  .short F_1_175
+  .short F_1_501
+  .short - F_1_847
+  .short - F_1_961
+  .short F_2_053
+  .short - F_2_562
+  .short F_3_072
+  .short 0          /* padding */
+  .short 0
+  .short 0
+  .short 0
+
+#undef F_0_298
+#undef F_0_390
+#undef F_0_541
+#undef F_0_765
+#undef F_0_899
+#undef F_1_175
+#undef F_1_501
+#undef F_1_847
+#undef F_1_961
+#undef F_2_053
+#undef F_2_562
+#undef F_3_072
+
+/* Constants for jsimd_huff_encode_one_block_neon() */
+
+.balign 16
+Ljsimd_huff_encode_one_block_neon_consts:
+    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
+          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
+    .byte    0,   1,   2,   3,  16,  17,  32,  33, \
+            18,  19,   4,   5,   6,   7,  20,  21  /* L0 => L3 : 4 lines OK */
+    .byte   34,  35,  48,  49, 255, 255,  50,  51, \
+            36,  37,  22,  23,   8,   9,  10,  11  /* L0 => L3 : 4 lines OK */
+    .byte    8,   9,  22,  23,  36,  37,  50,  51, \
+           255, 255, 255, 255, 255, 255,  52,  53  /* L1 => L4 : 4 lines OK */
+    .byte   54,  55,  40,  41,  26,  27,  12,  13, \
+            14,  15,  28,  29,  42,  43,  56,  57  /* L0 => L3 : 4 lines OK */
+    .byte    6,   7,  20,  21,  34,  35,  48,  49, \
+            50,  51,  36,  37,  22,  23,   8,   9  /* L4 => L7 : 4 lines OK */
+    .byte   42,  43,  28,  29,  14,  15,  30,  31, \
+            44,  45,  58,  59, 255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
+    .byte  255, 255, 255, 255,  56,  57,  42,  43, \
+            28,  29,  14,  15,  30,  31,  44,  45  /* L3 => L6 : 4 lines OK */
+    .byte   26,  27,  40,  41,  42,  43,  28,  29, \
+            14,  15,  30,  31,  44,  45,  46,  47  /* L5 => L7 : 3 lines OK */
+    .byte  255, 255, 255, 255,   0,   1, 255, 255, \
+           255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 lines OK */
+    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
+             0,   1,  16,  17,   2,   3, 255, 255  /* L5 => L6 : 2 lines OK */
+    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
+           255, 255, 255, 255,   8,   9,  22,  23  /* L5 => L6 : 2 lines OK */
+    .byte    4,   5,   6,   7, 255, 255, 255, 255, \
+           255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
+
+.text
+
+
+/*****************************************************************************/
+
+/* Supplementary macro for setting function attributes */
+.macro asm_function fname
+#ifdef __APPLE__
+    .private_extern _\fname
+    .globl _\fname
+_\fname:
+#else
+    .global \fname
+#ifdef __ELF__
+    .hidden \fname
+    .type \fname, %function
+#endif
+\fname:
+#endif
+.endm
+
+/* Get symbol location */
+.macro get_symbol_loc reg, symbol
+#ifdef __APPLE__
+    adrp            \reg, \symbol@PAGE
+    add             \reg, \reg, \symbol@PAGEOFF
+#else
+    adrp            \reg, \symbol
+    add             \reg, \reg, :lo12:\symbol
+#endif
+.endm
+
+.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
+    trn1            \t0\().8h, \l0\().8h, \l1\().8h
+    trn1            \t1\().8h, \l2\().8h, \l3\().8h
+    trn1            \t2\().8h, \l4\().8h, \l5\().8h
+    trn1            \t3\().8h, \l6\().8h, \l7\().8h
+    trn2            \l1\().8h, \l0\().8h, \l1\().8h
+    trn2            \l3\().8h, \l2\().8h, \l3\().8h
+    trn2            \l5\().8h, \l4\().8h, \l5\().8h
+    trn2            \l7\().8h, \l6\().8h, \l7\().8h
+
+    trn1            \l4\().4s, \t2\().4s, \t3\().4s
+    trn2            \t3\().4s, \t2\().4s, \t3\().4s
+    trn1            \t2\().4s, \t0\().4s, \t1\().4s
+    trn2            \l2\().4s, \t0\().4s, \t1\().4s
+    trn1            \t0\().4s, \l1\().4s, \l3\().4s
+    trn2            \l3\().4s, \l1\().4s, \l3\().4s
+    trn2            \t1\().4s, \l5\().4s, \l7\().4s
+    trn1            \l5\().4s, \l5\().4s, \l7\().4s
+
+    trn2            \l6\().2d, \l2\().2d, \t3\().2d
+    trn1            \l0\().2d, \t2\().2d, \l4\().2d
+    trn1            \l1\().2d, \t0\().2d, \l5\().2d
+    trn2            \l7\().2d, \l3\().2d, \t1\().2d
+    trn1            \l2\().2d, \l2\().2d, \t3\().2d
+    trn2            \l4\().2d, \t2\().2d, \l4\().2d
+    trn1            \l3\().2d, \l3\().2d, \t1\().2d
+    trn2            \l5\().2d, \t0\().2d, \l5\().2d
+.endm
+
+
+#define CENTERJSAMPLE  128
+
+/*****************************************************************************/
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients.
+ *
+ * GLOBAL(void)
+ * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+ *                       JSAMPARRAY output_buf, JDIMENSION output_col)
+ */
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+
+#define XFIX_P_0_298  v0.h[0]
+#define XFIX_N_0_390  v0.h[1]
+#define XFIX_P_0_541  v0.h[2]
+#define XFIX_P_0_765  v0.h[3]
+#define XFIX_N_0_899  v0.h[4]
+#define XFIX_P_1_175  v0.h[5]
+#define XFIX_P_1_501  v0.h[6]
+#define XFIX_N_1_847  v0.h[7]
+#define XFIX_N_1_961  v1.h[0]
+#define XFIX_P_2_053  v1.h[1]
+#define XFIX_N_2_562  v1.h[2]
+#define XFIX_P_3_072  v1.h[3]
+
+asm_function jsimd_idct_islow_neon
+    DCT_TABLE       .req x0
+    COEF_BLOCK      .req x1
+    OUTPUT_BUF      .req x2
+    OUTPUT_COL      .req x3
+    TMP1            .req x0
+    TMP2            .req x1
+    TMP3            .req x9
+    TMP4            .req x10
+    TMP5            .req x11
+    TMP6            .req x12
+    TMP7            .req x13
+    TMP8            .req x14
+
+    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
+       guarantee that the upper (unused) 32 bits of x3 are valid.  This
+       instruction ensures that those bits are set to zero. */
+    uxtw x3, w3
+
+    sub             sp, sp, #64
+    get_symbol_loc  x15, Ljsimd_idct_islow_neon_consts
+    mov             x10, sp
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
+    ld1             {v0.8h, v1.8h}, [x15]
+    ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
+    ld1             {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
+    ld1             {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
+    ld1             {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64
+
+    cmeq            v16.8h, v3.8h, #0
+    cmeq            v26.8h, v4.8h, #0
+    cmeq            v27.8h, v5.8h, #0
+    cmeq            v28.8h, v6.8h, #0
+    cmeq            v29.8h, v7.8h, #0
+    cmeq            v30.8h, v8.8h, #0
+    cmeq            v31.8h, v9.8h, #0
+
+    and             v10.16b, v16.16b, v26.16b
+    and             v11.16b, v27.16b, v28.16b
+    and             v12.16b, v29.16b, v30.16b
+    and             v13.16b, v31.16b, v10.16b
+    and             v14.16b, v11.16b, v12.16b
+    mul             v2.8h, v2.8h, v18.8h
+    and             v15.16b, v13.16b, v14.16b
+    shl             v10.8h, v2.8h, #(PASS1_BITS)
+    sqxtn           v16.8b, v15.8h
+    mov             TMP1, v16.d[0]
+    mvn             TMP2, TMP1
+
+    cbnz            TMP2, 2f
+    /* case all AC coeffs are zeros */
+    dup             v2.2d, v10.d[0]
+    dup             v6.2d, v10.d[1]
+    mov             v3.16b, v2.16b
+    mov             v7.16b, v6.16b
+    mov             v4.16b, v2.16b
+    mov             v8.16b, v6.16b
+    mov             v5.16b, v2.16b
+    mov             v9.16b, v6.16b
+1:
+    /* for this transpose, we should organise data like this:
+     * 00, 01, 02, 03, 40, 41, 42, 43
+     * 10, 11, 12, 13, 50, 51, 52, 53
+     * 20, 21, 22, 23, 60, 61, 62, 63
+     * 30, 31, 32, 33, 70, 71, 72, 73
+     * 04, 05, 06, 07, 44, 45, 46, 47
+     * 14, 15, 16, 17, 54, 55, 56, 57
+     * 24, 25, 26, 27, 64, 65, 66, 67
+     * 34, 35, 36, 37, 74, 75, 76, 77
+     */
+    trn1            v28.8h, v2.8h, v3.8h
+    trn1            v29.8h, v4.8h, v5.8h
+    trn1            v30.8h, v6.8h, v7.8h
+    trn1            v31.8h, v8.8h, v9.8h
+    trn2            v16.8h, v2.8h, v3.8h
+    trn2            v17.8h, v4.8h, v5.8h
+    trn2            v18.8h, v6.8h, v7.8h
+    trn2            v19.8h, v8.8h, v9.8h
+    trn1            v2.4s, v28.4s, v29.4s
+    trn1            v6.4s, v30.4s, v31.4s
+    trn1            v3.4s, v16.4s, v17.4s
+    trn1            v7.4s, v18.4s, v19.4s
+    trn2            v4.4s, v28.4s, v29.4s
+    trn2            v8.4s, v30.4s, v31.4s
+    trn2            v5.4s, v16.4s, v17.4s
+    trn2            v9.4s, v18.4s, v19.4s
+    /* Even part: reverse the even part of the forward DCT. */
+    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    mov             v21.16b, v19.16b               /* tmp3 = z1 */
+    mov             v20.16b, v18.16b               /* tmp3 = z1 */
+    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
+    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
+    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
+    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
+    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
+    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
+    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
+    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
+
+    /* Odd part per figure 8; the matrix is unitary and hence its
+     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+     */
+
+    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
+
+    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
+
+    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
+
+    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
+    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
+    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
+    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
+
+    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
+    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
+    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
+    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
+    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
+    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
+    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
+    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
+
+    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
+    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
+    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
+    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
+    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
+    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
+    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
+    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
+    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
+    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
+    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
+    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
+    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
+    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
+    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
+    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
+    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
+    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
+    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
+    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
+    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
+    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
+    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
+
+    shrn            v2.4h, v18.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn            v9.4h, v20.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn            v3.4h, v22.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn            v8.4h, v24.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn            v4.4h, v26.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn            v7.4h, v28.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn            v5.4h, v14.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
+    shrn            v6.4h, v16.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v2.8h, v19.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v9.8h, v21.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v3.8h, v23.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v8.8h, v25.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v4.8h, v27.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v7.8h, v29.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v5.8h, v15.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v6.8h, v17.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
+    movi            v0.16b, #(CENTERJSAMPLE)
+    /* Prepare pointers (dual-issue with Neon instructions) */
+      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
+    sqrshrn         v28.8b, v2.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
+    sqrshrn         v29.8b, v3.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+      add             TMP1, TMP1, OUTPUT_COL
+    sqrshrn         v30.8b, v4.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+      add             TMP2, TMP2, OUTPUT_COL
+    sqrshrn         v31.8b, v5.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+      add             TMP3, TMP3, OUTPUT_COL
+    sqrshrn2        v28.16b, v6.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+      add             TMP4, TMP4, OUTPUT_COL
+    sqrshrn2        v29.16b, v7.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
+    sqrshrn2        v30.16b, v8.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
+    sqrshrn2        v31.16b, v9.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+      add             TMP5, TMP5, OUTPUT_COL
+    add             v16.16b, v28.16b, v0.16b
+      add             TMP6, TMP6, OUTPUT_COL
+    add             v18.16b, v29.16b, v0.16b
+      add             TMP7, TMP7, OUTPUT_COL
+    add             v20.16b, v30.16b, v0.16b
+      add             TMP8, TMP8, OUTPUT_COL
+    add             v22.16b, v31.16b, v0.16b
+
+    /* Transpose the final 8-bit samples */
+    trn1            v28.16b, v16.16b, v18.16b
+    trn1            v30.16b, v20.16b, v22.16b
+    trn2            v29.16b, v16.16b, v18.16b
+    trn2            v31.16b, v20.16b, v22.16b
+
+    trn1            v16.8h, v28.8h, v30.8h
+    trn2            v18.8h, v28.8h, v30.8h
+    trn1            v20.8h, v29.8h, v31.8h
+    trn2            v22.8h, v29.8h, v31.8h
+
+    uzp1            v28.4s, v16.4s, v18.4s
+    uzp2            v30.4s, v16.4s, v18.4s
+    uzp1            v29.4s, v20.4s, v22.4s
+    uzp2            v31.4s, v20.4s, v22.4s
+
+    /* Store results to the output buffer */
+    st1             {v28.d}[0], [TMP1]
+    st1             {v29.d}[0], [TMP2]
+    st1             {v28.d}[1], [TMP3]
+    st1             {v29.d}[1], [TMP4]
+    st1             {v30.d}[0], [TMP5]
+    st1             {v31.d}[0], [TMP6]
+    st1             {v30.d}[1], [TMP7]
+    st1             {v31.d}[1], [TMP8]
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
+    blr             x30
+
+.balign 16
+2:
+    mul             v3.8h, v3.8h, v19.8h
+    mul             v4.8h, v4.8h, v20.8h
+    mul             v5.8h, v5.8h, v21.8h
+    add             TMP4, xzr, TMP2, LSL #32
+    mul             v6.8h, v6.8h, v22.8h
+    mul             v7.8h, v7.8h, v23.8h
+    adds            TMP3, xzr, TMP2, LSR #32
+    mul             v8.8h, v8.8h, v24.8h
+    mul             v9.8h, v9.8h, v25.8h
+    b.ne            3f
+    /* Right AC coef is zero */
+    dup             v15.2d, v10.d[1]
+    /* Even part: reverse the even part of the forward DCT. */
+    add             v18.4h, v4.4h, v8.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+    add             v22.4h, v2.4h, v6.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    sub             v26.4h, v2.4h, v6.4h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    mov             v20.16b, v18.16b               /* tmp3 = z1 */
+    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
+    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
+    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
+    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
+
+    /* Odd part per figure 8; the matrix is unitary and hence its
+     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+     */
+
+    add             v22.4h, v9.4h, v5.4h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v24.4h, v7.4h, v3.4h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v18.4h, v9.4h, v3.4h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v20.4h, v7.4h, v5.4h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v26.4h, v22.4h, v24.4h  /* z5 = z3 + z4 */
+
+    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
+
+    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
+    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
+
+    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
+    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
+    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
+    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
+
+    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
+    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
+    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
+    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+    add             v18.4s, v2.4s, v16.4s  /* tmp10 + tmp3 */
+    sub             v20.4s, v2.4s, v16.4s  /* tmp10 - tmp3 */
+    add             v22.4s, v8.4s, v14.4s  /* tmp11 + tmp2 */
+    sub             v24.4s, v8.4s, v14.4s  /* tmp11 - tmp2 */
+    add             v26.4s, v4.4s, v12.4s  /* tmp12 + tmp1 */
+    sub             v28.4s, v4.4s, v12.4s  /* tmp12 - tmp1 */
+    add             v14.4s, v6.4s, v10.4s  /* tmp13 + tmp0 */
+    sub             v16.4s, v6.4s, v10.4s  /* tmp13 - tmp0 */
+
+    rshrn           v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    mov             v6.16b, v15.16b
+    mov             v7.16b, v15.16b
+    mov             v8.16b, v15.16b
+    mov             v9.16b, v15.16b
+    b               1b
+
+.balign 16
+3:
+    cbnz            TMP4, 4f
+    /* Left AC coef is zero */
+    dup             v14.2d, v10.d[0]
+    /* Even part: reverse the even part of the forward DCT. */
+    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    mov             v21.16b, v19.16b               /* tmp3 = z1 */
+    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
+    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
+    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
+    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
+
+    /* Odd part per figure 8; the matrix is unitary and hence its
+     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+     */
+
+    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
+
+    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
+
+    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
+    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
+    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
+    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
+
+    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
+    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
+    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
+    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
+
+    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
+    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
+    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
+    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
+    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
+    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
+    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
+    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
+    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
+    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
+    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
+
+    mov             v2.16b, v14.16b
+    mov             v3.16b, v14.16b
+    mov             v4.16b, v14.16b
+    mov             v5.16b, v14.16b
+    rshrn           v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    b               1b
+
+.balign 16
+4:
+    /* "No" AC coef is zero */
+    /* Even part: reverse the even part of the forward DCT. */
+    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    mov             v21.16b, v19.16b               /* tmp3 = z1 */
+    mov             v20.16b, v18.16b               /* tmp3 = z1 */
+    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
+    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
+    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
+    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
+    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
+    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
+    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
+    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
+
+    /* Odd part per figure 8; the matrix is unitary and hence its
+     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+     */
+
+    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
+
+    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
+
+    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
+
+    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
+    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
+    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
+    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
+
+    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
+    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
+    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
+    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
+    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
+    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
+    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
+    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
+
+    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
+    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
+    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
+    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
+    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
+    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
+    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
+    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
+    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
+    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
+    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
+    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
+    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
+    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
+    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
+    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
+    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
+    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
+    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
+    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
+    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
+    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
+    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
+
+    rshrn           v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn           v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn2          v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    b               1b
+
+    .unreq          DCT_TABLE
+    .unreq          COEF_BLOCK
+    .unreq          OUTPUT_BUF
+    .unreq          OUTPUT_COL
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMP4
+    .unreq          TMP5
+    .unreq          TMP6
+    .unreq          TMP7
+    .unreq          TMP8
+
+#undef CENTERJSAMPLE
+#undef CONST_BITS
+#undef PASS1_BITS
+#undef XFIX_P_0_298
+#undef XFIX_N_0_390
+#undef XFIX_P_0_541
+#undef XFIX_P_0_765
+#undef XFIX_N_0_899
+#undef XFIX_P_1_175
+#undef XFIX_P_1_501
+#undef XFIX_N_1_847
+#undef XFIX_N_1_961
+#undef XFIX_P_2_053
+#undef XFIX_N_2_562
+#undef XFIX_P_3_072
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_ycc_extrgb_convert_neon
+ * jsimd_ycc_extbgr_convert_neon
+ * jsimd_ycc_extrgbx_convert_neon
+ * jsimd_ycc_extbgrx_convert_neon
+ * jsimd_ycc_extxbgr_convert_neon
+ * jsimd_ycc_extxrgb_convert_neon
+ *
+ * Colorspace conversion YCbCr -> RGB
+ */
+
+.macro do_load size
+  .if \size == 8
+    ld1             {v4.8b}, [U], 8
+    ld1             {v5.8b}, [V], 8
+    ld1             {v0.8b}, [Y], 8
+    prfm            pldl1keep, [U, #64]
+    prfm            pldl1keep, [V, #64]
+    prfm            pldl1keep, [Y, #64]
+  .elseif \size == 4
+    ld1             {v4.b}[0], [U], 1
+    ld1             {v4.b}[1], [U], 1
+    ld1             {v4.b}[2], [U], 1
+    ld1             {v4.b}[3], [U], 1
+    ld1             {v5.b}[0], [V], 1
+    ld1             {v5.b}[1], [V], 1
+    ld1             {v5.b}[2], [V], 1
+    ld1             {v5.b}[3], [V], 1
+    ld1             {v0.b}[0], [Y], 1
+    ld1             {v0.b}[1], [Y], 1
+    ld1             {v0.b}[2], [Y], 1
+    ld1             {v0.b}[3], [Y], 1
+  .elseif \size == 2
+    ld1             {v4.b}[4], [U], 1
+    ld1             {v4.b}[5], [U], 1
+    ld1             {v5.b}[4], [V], 1
+    ld1             {v5.b}[5], [V], 1
+    ld1             {v0.b}[4], [Y], 1
+    ld1             {v0.b}[5], [Y], 1
+  .elseif \size == 1
+    ld1             {v4.b}[6], [U], 1
+    ld1             {v5.b}[6], [V], 1
+    ld1             {v0.b}[6], [Y], 1
+  .else
+    .error unsupported macroblock size
+  .endif
+.endm
+
+.macro do_store bpp, size, fast_st3
+  .if \bpp == 24
+    .if \size == 8
+      .if \fast_st3 == 1
+        st3         {v10.8b, v11.8b, v12.8b}, [RGB], 24
+      .else
+        st1         {v10.b}[0], [RGB], #1
+        st1         {v11.b}[0], [RGB], #1
+        st1         {v12.b}[0], [RGB], #1
+
+        st1         {v10.b}[1], [RGB], #1
+        st1         {v11.b}[1], [RGB], #1
+        st1         {v12.b}[1], [RGB], #1
+
+        st1         {v10.b}[2], [RGB], #1
+        st1         {v11.b}[2], [RGB], #1
+        st1         {v12.b}[2], [RGB], #1
+
+        st1         {v10.b}[3], [RGB], #1
+        st1         {v11.b}[3], [RGB], #1
+        st1         {v12.b}[3], [RGB], #1
+
+        st1         {v10.b}[4], [RGB], #1
+        st1         {v11.b}[4], [RGB], #1
+        st1         {v12.b}[4], [RGB], #1
+
+        st1         {v10.b}[5], [RGB], #1
+        st1         {v11.b}[5], [RGB], #1
+        st1         {v12.b}[5], [RGB], #1
+
+        st1         {v10.b}[6], [RGB], #1
+        st1         {v11.b}[6], [RGB], #1
+        st1         {v12.b}[6], [RGB], #1
+
+        st1         {v10.b}[7], [RGB], #1
+        st1         {v11.b}[7], [RGB], #1
+        st1         {v12.b}[7], [RGB], #1
+      .endif
+    .elseif \size == 4
+      st3           {v10.b, v11.b, v12.b}[0], [RGB], 3
+      st3           {v10.b, v11.b, v12.b}[1], [RGB], 3
+      st3           {v10.b, v11.b, v12.b}[2], [RGB], 3
+      st3           {v10.b, v11.b, v12.b}[3], [RGB], 3
+    .elseif \size == 2
+      st3           {v10.b, v11.b, v12.b}[4], [RGB], 3
+      st3           {v10.b, v11.b, v12.b}[5], [RGB], 3
+    .elseif \size == 1
+      st3           {v10.b, v11.b, v12.b}[6], [RGB], 3
+    .else
+     .error unsupported macroblock size
+    .endif
+  .elseif \bpp == 32
+    .if \size == 8
+      st4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
+    .elseif \size == 4
+      st4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
+      st4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
+      st4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
+      st4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
+    .elseif \size == 2
+      st4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
+      st4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
+    .elseif \size == 1
+      st4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
+    .else
+      .error unsupported macroblock size
+    .endif
+  .elseif \bpp == 16
+    .if \size == 8
+      st1           {v25.8h}, [RGB], 16
+    .elseif \size == 4
+      st1           {v25.4h}, [RGB], 8
+    .elseif \size == 2
+      st1           {v25.h}[4], [RGB], 2
+      st1           {v25.h}[5], [RGB], 2
+    .elseif \size == 1
+      st1           {v25.h}[6], [RGB], 2
+    .else
+      .error unsupported macroblock size
+    .endif
+  .else
+    .error unsupported bpp
+  .endif
+.endm
+
+.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
+                                           g_offs, gsize, b_offs, bsize, \
+                                           defsize, fast_st3
+
+/*
+ * 2-stage pipelined YCbCr->RGB conversion
+ */
+
+.macro do_yuv_to_rgb_stage1
+    uaddw           v6.8h, v2.8h, v4.8b     /* q3 = u - 128 */
+    uaddw           v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
+    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
+    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
+    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
+    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
+    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
+    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
+    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
+    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
+.endm
+
+.macro do_yuv_to_rgb_stage2
+    rshrn           v20.4h, v20.4s, #15
+    rshrn2          v20.8h, v22.4s, #15
+    rshrn           v24.4h, v24.4s, #14
+    rshrn2          v24.8h, v26.4s, #14
+    rshrn           v28.4h, v28.4s, #14
+    rshrn2          v28.8h, v30.4s, #14
+    uaddw           v20.8h, v20.8h, v0.8b
+    uaddw           v24.8h, v24.8h, v0.8b
+    uaddw           v28.8h, v28.8h, v0.8b
+  .if \bpp != 16
+    sqxtun          v1\g_offs\defsize, v20.8h
+    sqxtun          v1\r_offs\defsize, v24.8h
+    sqxtun          v1\b_offs\defsize, v28.8h
+  .else
+    sqshlu          v21.8h, v20.8h, #8
+    sqshlu          v25.8h, v24.8h, #8
+    sqshlu          v29.8h, v28.8h, #8
+    sri             v25.8h, v21.8h, #5
+    sri             v25.8h, v29.8h, #11
+  .endif
+.endm
+
+.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
+    rshrn           v20.4h, v20.4s, #15
+    rshrn           v24.4h, v24.4s, #14
+    rshrn           v28.4h, v28.4s, #14
+    ld1             {v4.8b}, [U], 8
+    rshrn2          v20.8h, v22.4s, #15
+    rshrn2          v24.8h, v26.4s, #14
+    rshrn2          v28.8h, v30.4s, #14
+    ld1             {v5.8b}, [V], 8
+    uaddw           v20.8h, v20.8h, v0.8b
+    uaddw           v24.8h, v24.8h, v0.8b
+    uaddw           v28.8h, v28.8h, v0.8b
+  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
+    sqxtun          v1\g_offs\defsize, v20.8h
+    ld1             {v0.8b}, [Y], 8
+    sqxtun          v1\r_offs\defsize, v24.8h
+    prfm            pldl1keep, [U, #64]
+    prfm            pldl1keep, [V, #64]
+    prfm            pldl1keep, [Y, #64]
+    sqxtun          v1\b_offs\defsize, v28.8h
+    uaddw           v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
+    uaddw           v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
+    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
+    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
+    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
+    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
+    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
+    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
+  .else  /**************************** rgb565 ********************************/
+    sqshlu          v21.8h, v20.8h, #8
+    sqshlu          v25.8h, v24.8h, #8
+    sqshlu          v29.8h, v28.8h, #8
+    uaddw           v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
+    uaddw           v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
+    ld1             {v0.8b}, [Y], 8
+    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
+    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
+    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
+    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
+    sri             v25.8h, v21.8h, #5
+    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
+    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
+    prfm            pldl1keep, [U, #64]
+    prfm            pldl1keep, [V, #64]
+    prfm            pldl1keep, [Y, #64]
+    sri             v25.8h, v29.8h, #11
+  .endif
+    do_store        \bpp, 8, \fast_st3
+    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
+    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
+.endm
+
+.macro do_yuv_to_rgb
+    do_yuv_to_rgb_stage1
+    do_yuv_to_rgb_stage2
+.endm
+
+.if \fast_st3 == 1
+asm_function jsimd_ycc_\colorid\()_convert_neon
+.else
+asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
+.endif
+    OUTPUT_WIDTH    .req w0
+    INPUT_BUF       .req x1
+    INPUT_ROW       .req w2
+    OUTPUT_BUF      .req x3
+    NUM_ROWS        .req w4
+
+    INPUT_BUF0      .req x5
+    INPUT_BUF1      .req x6
+    INPUT_BUF2      .req x1
+
+    RGB             .req x7
+    Y               .req x9
+    U               .req x10
+    V               .req x11
+    N               .req w15
+
+    sub             sp, sp, 64
+    mov             x9, sp
+
+    /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
+    get_symbol_loc  x15, Ljsimd_ycc_rgb_neon_consts
+
+    /* Save Neon registers */
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
+    ld1             {v0.4h, v1.4h}, [x15], 16
+    ld1             {v2.8h}, [x15]
+
+    ldr             INPUT_BUF0, [INPUT_BUF]
+    ldr             INPUT_BUF1, [INPUT_BUF, #8]
+    ldr             INPUT_BUF2, [INPUT_BUF, #16]
+    .unreq          INPUT_BUF
+
+    /* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
+    movi            v10.16b, #255
+    movi            v13.16b, #255
+
+    /* Outer loop over scanlines */
+    cmp             NUM_ROWS, #1
+    b.lt            9f
+0:
+    ldr             Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
+    ldr             U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
+    mov             N, OUTPUT_WIDTH
+    ldr             V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
+    add             INPUT_ROW, INPUT_ROW, #1
+    ldr             RGB, [OUTPUT_BUF], #8
+
+    /* Inner loop over pixels */
+    subs            N, N, #8
+    b.lt            3f
+    do_load         8
+    do_yuv_to_rgb_stage1
+    subs            N, N, #8
+    b.lt            2f
+1:
+    do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
+    subs            N, N, #8
+    b.ge            1b
+2:
+    do_yuv_to_rgb_stage2
+    do_store        \bpp, 8, \fast_st3
+    tst             N, #7
+    b.eq            8f
+3:
+    tst             N, #4
+    b.eq            3f
+    do_load         4
+3:
+    tst             N, #2
+    b.eq            4f
+    do_load         2
+4:
+    tst             N, #1
+    b.eq            5f
+    do_load         1
+5:
+    do_yuv_to_rgb
+    tst             N, #4
+    b.eq            6f
+    do_store        \bpp, 4, \fast_st3
+6:
+    tst             N, #2
+    b.eq            7f
+    do_store        \bpp, 2, \fast_st3
+7:
+    tst             N, #1
+    b.eq            8f
+    do_store        \bpp, 1, \fast_st3
+8:
+    subs            NUM_ROWS, NUM_ROWS, #1
+    b.gt            0b
+9:
+    /* Restore all registers and return */
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    br              x30
+    .unreq          OUTPUT_WIDTH
+    .unreq          INPUT_ROW
+    .unreq          OUTPUT_BUF
+    .unreq          NUM_ROWS
+    .unreq          INPUT_BUF0
+    .unreq          INPUT_BUF1
+    .unreq          INPUT_BUF2
+    .unreq          RGB
+    .unreq          Y
+    .unreq          U
+    .unreq          V
+    .unreq          N
+
+.purgem do_yuv_to_rgb
+.purgem do_yuv_to_rgb_stage1
+.purgem do_yuv_to_rgb_stage2
+.purgem do_yuv_to_rgb_stage2_store_load_stage1
+
+.endm
+
+/*--------------------------------- id ----- bpp R  rsize G  gsize B  bsize defsize fast_st3*/
+generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
+generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
+generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
+generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
+generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,  2, .4h,  1, .4h,  .8b,    1
+generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,  2, .4h,  3, .4h,  .8b,    1
+generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,  0, .4h,  0, .4h,  .8b,    1
+
+generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    0
+generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    0
+
+.purgem do_load
+.purgem do_store
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_extrgb_ycc_convert_neon
+ * jsimd_extbgr_ycc_convert_neon
+ * jsimd_extrgbx_ycc_convert_neon
+ * jsimd_extbgrx_ycc_convert_neon
+ * jsimd_extxbgr_ycc_convert_neon
+ * jsimd_extxrgb_ycc_convert_neon
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+.macro do_store size
+  .if \size == 8
+    st1             {v20.8b}, [Y], #8
+    st1             {v21.8b}, [U], #8
+    st1             {v22.8b}, [V], #8
+  .elseif \size == 4
+    st1             {v20.b}[0], [Y], #1
+    st1             {v20.b}[1], [Y], #1
+    st1             {v20.b}[2], [Y], #1
+    st1             {v20.b}[3], [Y], #1
+    st1             {v21.b}[0], [U], #1
+    st1             {v21.b}[1], [U], #1
+    st1             {v21.b}[2], [U], #1
+    st1             {v21.b}[3], [U], #1
+    st1             {v22.b}[0], [V], #1
+    st1             {v22.b}[1], [V], #1
+    st1             {v22.b}[2], [V], #1
+    st1             {v22.b}[3], [V], #1
+  .elseif \size == 2
+    st1             {v20.b}[4], [Y], #1
+    st1             {v20.b}[5], [Y], #1
+    st1             {v21.b}[4], [U], #1
+    st1             {v21.b}[5], [U], #1
+    st1             {v22.b}[4], [V], #1
+    st1             {v22.b}[5], [V], #1
+  .elseif \size == 1
+    st1             {v20.b}[6], [Y], #1
+    st1             {v21.b}[6], [U], #1
+    st1             {v22.b}[6], [V], #1
+  .else
+    .error unsupported macroblock size
+  .endif
+.endm
+
+.macro do_load bpp, size, fast_ld3
+  .if \bpp == 24
+    .if \size == 8
+      .if \fast_ld3 == 1
+        ld3         {v10.8b, v11.8b, v12.8b}, [RGB], #24
+      .else
+        ld1         {v10.b}[0], [RGB], #1
+        ld1         {v11.b}[0], [RGB], #1
+        ld1         {v12.b}[0], [RGB], #1
+
+        ld1         {v10.b}[1], [RGB], #1
+        ld1         {v11.b}[1], [RGB], #1
+        ld1         {v12.b}[1], [RGB], #1
+
+        ld1         {v10.b}[2], [RGB], #1
+        ld1         {v11.b}[2], [RGB], #1
+        ld1         {v12.b}[2], [RGB], #1
+
+        ld1         {v10.b}[3], [RGB], #1
+        ld1         {v11.b}[3], [RGB], #1
+        ld1         {v12.b}[3], [RGB], #1
+
+        ld1         {v10.b}[4], [RGB], #1
+        ld1         {v11.b}[4], [RGB], #1
+        ld1         {v12.b}[4], [RGB], #1
+
+        ld1         {v10.b}[5], [RGB], #1
+        ld1         {v11.b}[5], [RGB], #1
+        ld1         {v12.b}[5], [RGB], #1
+
+        ld1         {v10.b}[6], [RGB], #1
+        ld1         {v11.b}[6], [RGB], #1
+        ld1         {v12.b}[6], [RGB], #1
+
+        ld1         {v10.b}[7], [RGB], #1
+        ld1         {v11.b}[7], [RGB], #1
+        ld1         {v12.b}[7], [RGB], #1
+      .endif
+      prfm          pldl1keep, [RGB, #128]
+    .elseif \size == 4
+      ld3           {v10.b, v11.b, v12.b}[0], [RGB], #3
+      ld3           {v10.b, v11.b, v12.b}[1], [RGB], #3
+      ld3           {v10.b, v11.b, v12.b}[2], [RGB], #3
+      ld3           {v10.b, v11.b, v12.b}[3], [RGB], #3
+    .elseif \size == 2
+      ld3           {v10.b, v11.b, v12.b}[4], [RGB], #3
+      ld3           {v10.b, v11.b, v12.b}[5], [RGB], #3
+    .elseif \size == 1
+      ld3           {v10.b, v11.b, v12.b}[6], [RGB], #3
+    .else
+      .error unsupported macroblock size
+    .endif
+  .elseif \bpp == 32
+    .if \size == 8
+      ld4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
+      prfm          pldl1keep, [RGB, #128]
+    .elseif \size == 4
+      ld4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
+      ld4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
+      ld4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
+      ld4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
+    .elseif \size == 2
+      ld4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
+      ld4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
+    .elseif \size == 1
+      ld4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
+    .else
+      .error unsupported macroblock size
+    .endif
+  .else
+    .error unsupported bpp
+  .endif
+.endm
+
+.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
+                                           b_offs, fast_ld3
+
+/*
+ * 2-stage pipelined RGB->YCbCr conversion
+ */
+
+.macro do_rgb_to_yuv_stage1
+    ushll           v4.8h, v1\r_offs\().8b, #0  /* r = v4 */
+    ushll           v6.8h, v1\g_offs\().8b, #0  /* g = v6 */
+    ushll           v8.8h, v1\b_offs\().8b, #0  /* b = v8 */
+    rev64           v18.4s, v1.4s
+    rev64           v26.4s, v1.4s
+    rev64           v28.4s, v1.4s
+    rev64           v30.4s, v1.4s
+    umull           v14.4s, v4.4h, v0.h[0]
+    umull2          v16.4s, v4.8h, v0.h[0]
+    umlsl           v18.4s, v4.4h, v0.h[3]
+    umlsl2          v26.4s, v4.8h, v0.h[3]
+    umlal           v28.4s, v4.4h, v0.h[5]
+    umlal2          v30.4s, v4.8h, v0.h[5]
+    umlal           v14.4s, v6.4h, v0.h[1]
+    umlal2          v16.4s, v6.8h, v0.h[1]
+    umlsl           v18.4s, v6.4h, v0.h[4]
+    umlsl2          v26.4s, v6.8h, v0.h[4]
+    umlsl           v28.4s, v6.4h, v0.h[6]
+    umlsl2          v30.4s, v6.8h, v0.h[6]
+    umlal           v14.4s, v8.4h, v0.h[2]
+    umlal2          v16.4s, v8.8h, v0.h[2]
+    umlal           v18.4s, v8.4h, v0.h[5]
+    umlal2          v26.4s, v8.8h, v0.h[5]
+    umlsl           v28.4s, v8.4h, v0.h[7]
+    umlsl2          v30.4s, v8.8h, v0.h[7]
+.endm
+
+.macro do_rgb_to_yuv_stage2
+    rshrn           v20.4h, v14.4s, #16
+    shrn            v22.4h, v18.4s, #16
+    shrn            v24.4h, v28.4s, #16
+    rshrn2          v20.8h, v16.4s, #16
+    shrn2           v22.8h, v26.4s, #16
+    shrn2           v24.8h, v30.4s, #16
+    xtn             v20.8b, v20.8h       /* v20 = y */
+    xtn             v21.8b, v22.8h       /* v21 = u */
+    xtn             v22.8b, v24.8h       /* v22 = v */
+.endm
+
+.macro do_rgb_to_yuv
+    do_rgb_to_yuv_stage1
+    do_rgb_to_yuv_stage2
+.endm
+
+/* TODO: expand macros and interleave instructions if some in-order
+ *       AArch64 processor actually can dual-issue LOAD/STORE with ALU */
+.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
+    do_rgb_to_yuv_stage2
+    do_load         \bpp, 8, \fast_ld3
+    st1             {v20.8b}, [Y], #8
+    st1             {v21.8b}, [U], #8
+    st1             {v22.8b}, [V], #8
+    do_rgb_to_yuv_stage1
+.endm
+
+.if \fast_ld3 == 1
+asm_function jsimd_\colorid\()_ycc_convert_neon
+.else
+asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
+.endif
+    OUTPUT_WIDTH    .req w0
+    INPUT_BUF       .req x1
+    OUTPUT_BUF      .req x2
+    OUTPUT_ROW      .req w3
+    NUM_ROWS        .req w4
+
+    OUTPUT_BUF0     .req x5
+    OUTPUT_BUF1     .req x6
+    OUTPUT_BUF2     .req x2  /* OUTPUT_BUF */
+
+    RGB             .req x7
+    Y               .req x9
+    U               .req x10
+    V               .req x11
+    N               .req w12
+
+    /* Load constants to d0, d1, d2, d3 */
+    get_symbol_loc  x13, Ljsimd_rgb_ycc_neon_consts
+    ld1             {v0.8h, v1.8h}, [x13]
+
+    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
+    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #8]
+    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #16]
+    .unreq          OUTPUT_BUF
+
+    /* Save Neon registers */
+    sub             sp, sp, #64
+    mov             x9, sp
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
+
+    /* Outer loop over scanlines */
+    cmp             NUM_ROWS, #1
+    b.lt            9f
+0:
+    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
+    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
+    mov             N, OUTPUT_WIDTH
+    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
+    add             OUTPUT_ROW, OUTPUT_ROW, #1
+    ldr             RGB, [INPUT_BUF], #8
+
+    /* Inner loop over pixels */
+    subs            N, N, #8
+    b.lt            3f
+    do_load         \bpp, 8, \fast_ld3
+    do_rgb_to_yuv_stage1
+    subs            N, N, #8
+    b.lt            2f
+1:
+    do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
+    subs            N, N, #8
+    b.ge            1b
+2:
+    do_rgb_to_yuv_stage2
+    do_store        8
+    tst             N, #7
+    b.eq            8f
+3:
+    tbz             N, #2, 3f
+    do_load         \bpp, 4, \fast_ld3
+3:
+    tbz             N, #1, 4f
+    do_load         \bpp, 2, \fast_ld3
+4:
+    tbz             N, #0, 5f
+    do_load         \bpp, 1, \fast_ld3
+5:
+    do_rgb_to_yuv
+    tbz             N, #2, 6f
+    do_store        4
+6:
+    tbz             N, #1, 7f
+    do_store        2
+7:
+    tbz             N, #0, 8f
+    do_store        1
+8:
+    subs            NUM_ROWS, NUM_ROWS, #1
+    b.gt            0b
+9:
+    /* Restore all registers and return */
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    br              x30
+
+    .unreq          OUTPUT_WIDTH
+    .unreq          OUTPUT_ROW
+    .unreq          INPUT_BUF
+    .unreq          NUM_ROWS
+    .unreq          OUTPUT_BUF0
+    .unreq          OUTPUT_BUF1
+    .unreq          OUTPUT_BUF2
+    .unreq          RGB
+    .unreq          Y
+    .unreq          U
+    .unreq          V
+    .unreq          N
+
+.purgem do_rgb_to_yuv
+.purgem do_rgb_to_yuv_stage1
+.purgem do_rgb_to_yuv_stage2
+.purgem do_rgb_to_yuv_stage2_store_load_stage1
+
+.endm
+
+/*--------------------------------- id ----- bpp R  G  B  Fast LD3 */
+generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 1
+generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 1
+generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
+generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
+generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
+generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1
+
+generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 0
+generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 0
+
+.purgem do_load
+.purgem do_store
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_fdct_islow_neon
+ *
+ * This file contains a slower but more accurate integer implementation of the
+ * forward DCT (Discrete Cosine Transform). The following code is based
+ * directly on the IJG''s original jfdctint.c; see the jfdctint.c for
+ * more details.
+ *
+ * TODO: can be combined with 'jsimd_convsamp_neon' to get
+ *       rid of a bunch of VLD1.16 instructions
+ */
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+#define XFIX_P_0_298  v0.h[0]
+#define XFIX_N_0_390  v0.h[1]
+#define XFIX_P_0_541  v0.h[2]
+#define XFIX_P_0_765  v0.h[3]
+#define XFIX_N_0_899  v0.h[4]
+#define XFIX_P_1_175  v0.h[5]
+#define XFIX_P_1_501  v0.h[6]
+#define XFIX_N_1_847  v0.h[7]
+#define XFIX_N_1_961  v1.h[0]
+#define XFIX_P_2_053  v1.h[1]
+#define XFIX_N_2_562  v1.h[2]
+#define XFIX_P_3_072  v1.h[3]
+
+asm_function jsimd_fdct_islow_neon
+
+    DATA            .req x0
+    TMP             .req x9
+
+    /* Load constants */
+    get_symbol_loc  TMP, Ljsimd_fdct_islow_neon_consts
+    ld1             {v0.8h, v1.8h}, [TMP]
+
+    /* Save Neon registers */
+    sub             sp, sp, #64
+    mov             x10, sp
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
+
+    /* Load all DATA into Neon registers with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | d16     | d17    | v16.8h
+     *   1 | d18     | d19    | v17.8h
+     *   2 | d20     | d21    | v18.8h
+     *   3 | d22     | d23    | v19.8h
+     *   4 | d24     | d25    | v20.8h
+     *   5 | d26     | d27    | v21.8h
+     *   6 | d28     | d29    | v22.8h
+     *   7 | d30     | d31    | v23.8h
+     */
+
+    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+    sub             DATA, DATA, #64
+
+    /* Transpose */
+    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
+    /* 1-D FDCT */
+    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
+    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
+    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
+    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
+    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
+    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
+    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
+    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
+
+    /* even part */
+
+    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
+    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
+    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
+    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
+
+    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
+    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
+
+    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
+
+    shl             v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
+    shl             v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
+
+    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+    mov             v22.16b, v18.16b
+    mov             v25.16b, v24.16b
+
+    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+
+    rshrn           v18.4h, v18.4s, #DESCALE_P1
+    rshrn           v22.4h, v22.4s, #DESCALE_P1
+    rshrn2          v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
+    rshrn2          v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
+
+    /* Odd part */
+
+    add             v8.8h, v28.8h, v31.8h        /* z1 = tmp4 + tmp7; */
+    add             v9.8h, v29.8h, v30.8h        /* z2 = tmp5 + tmp6; */
+    add             v10.8h, v28.8h, v30.8h       /* z3 = tmp4 + tmp6; */
+    add             v11.8h, v29.8h, v31.8h       /* z4 = tmp5 + tmp7; */
+    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
+    smull2          v5.4s, v10.8h, XFIX_P_1_175
+    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
+    smlal2          v5.4s, v11.8h, XFIX_P_1_175
+
+    smull2          v24.4s, v28.8h, XFIX_P_0_298
+    smull2          v25.4s, v29.8h, XFIX_P_2_053
+    smull2          v26.4s, v30.8h, XFIX_P_3_072
+    smull2          v27.4s, v31.8h, XFIX_P_1_501
+    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
+    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
+    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
+    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
+
+    smull2          v12.4s, v8.8h, XFIX_N_0_899
+    smull2          v13.4s, v9.8h, XFIX_N_2_562
+    smull2          v14.4s, v10.8h, XFIX_N_1_961
+    smull2          v15.4s, v11.8h, XFIX_N_0_390
+    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
+    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
+    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
+    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
+
+    add             v10.4s, v10.4s, v4.4s  /* z3 += z5 */
+    add             v14.4s, v14.4s, v5.4s
+    add             v11.4s, v11.4s, v4.4s  /* z4 += z5 */
+    add             v15.4s, v15.4s, v5.4s
+
+    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
+    add             v24.4s, v24.4s, v12.4s
+    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
+    add             v25.4s, v25.4s, v13.4s
+    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
+    add             v26.4s, v26.4s, v14.4s
+    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
+    add             v27.4s, v27.4s, v15.4s
+
+    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
+    add             v24.4s, v24.4s, v14.4s
+    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
+    add             v25.4s, v25.4s, v15.4s
+    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
+    add             v26.4s, v26.4s, v13.4s
+    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
+    add             v27.4s, v27.4s, v12.4s
+
+    rshrn           v23.4h, v28.4s, #DESCALE_P1
+    rshrn           v21.4h, v29.4s, #DESCALE_P1
+    rshrn           v19.4h, v30.4s, #DESCALE_P1
+    rshrn           v17.4h, v31.4s, #DESCALE_P1
+    rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
+    rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
+
+    /* Transpose */
+    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
+
+    /* 1-D FDCT */
+    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
+    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
+    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
+    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
+    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
+    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
+    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
+    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
+
+    /* even part */
+    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
+    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
+    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
+    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
+
+    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
+    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
+
+    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
+
+    srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */
+    srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */
+
+    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+    mov             v22.16b, v18.16b
+    mov             v25.16b, v24.16b
+
+    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+
+    rshrn           v18.4h, v18.4s, #DESCALE_P2
+    rshrn           v22.4h, v22.4s, #DESCALE_P2
+    rshrn2          v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
+    rshrn2          v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
+
+    /* Odd part */
+    add             v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
+    add             v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
+    add             v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
+    add             v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */
+
+    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
+    smull2          v5.4s, v10.8h, XFIX_P_1_175
+    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
+    smlal2          v5.4s, v11.8h, XFIX_P_1_175
+
+    smull2          v24.4s, v28.8h, XFIX_P_0_298
+    smull2          v25.4s, v29.8h, XFIX_P_2_053
+    smull2          v26.4s, v30.8h, XFIX_P_3_072
+    smull2          v27.4s, v31.8h, XFIX_P_1_501
+    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
+    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
+    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
+    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
+
+    smull2          v12.4s, v8.8h, XFIX_N_0_899
+    smull2          v13.4s, v9.8h, XFIX_N_2_562
+    smull2          v14.4s, v10.8h, XFIX_N_1_961
+    smull2          v15.4s, v11.8h, XFIX_N_0_390
+    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
+    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
+    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
+    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
+
+    add             v10.4s, v10.4s, v4.4s
+    add             v14.4s, v14.4s, v5.4s
+    add             v11.4s, v11.4s, v4.4s
+    add             v15.4s, v15.4s, v5.4s
+
+    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
+    add             v24.4s, v24.4s, v12.4s
+    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
+    add             v25.4s, v25.4s, v13.4s
+    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
+    add             v26.4s, v26.4s, v14.4s
+    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
+    add             v27.4s, v27.4s, v15.4s
+
+    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
+    add             v24.4s, v24.4s, v14.4s
+    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
+    add             v25.4s, v25.4s, v15.4s
+    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
+    add             v26.4s, v26.4s, v13.4s
+    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
+    add             v27.4s, v27.4s, v12.4s
+
+    rshrn           v23.4h, v28.4s, #DESCALE_P2
+    rshrn           v21.4h, v29.4s, #DESCALE_P2
+    rshrn           v19.4h, v30.4s, #DESCALE_P2
+    rshrn           v17.4h, v31.4s, #DESCALE_P2
+    rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
+    rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
+
+    /* store results */
+    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+
+    /* Restore Neon registers */
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+
+    br              x30
+
+    .unreq          DATA
+    .unreq          TMP
+
+#undef XFIX_P_0_298
+#undef XFIX_N_0_390
+#undef XFIX_P_0_541
+#undef XFIX_P_0_765
+#undef XFIX_N_0_899
+#undef XFIX_P_1_175
+#undef XFIX_P_1_501
+#undef XFIX_N_1_847
+#undef XFIX_N_1_961
+#undef XFIX_P_2_053
+#undef XFIX_N_2_562
+#undef XFIX_P_3_072
+
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(JOCTET *)
+ * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
+ *                             JCOEFPTR block, int last_dc_val,
+ *                             c_derived_tbl *dctbl, c_derived_tbl *actbl)
+ *
+ */
+
+    BUFFER          .req x1
+    PUT_BUFFER      .req x6
+    PUT_BITS        .req x7
+    PUT_BITSw       .req w7
+
+.macro emit_byte
+    sub             PUT_BITS, PUT_BITS, #0x8
+    lsr             x19, PUT_BUFFER, PUT_BITS
+    uxtb            w19, w19
+    strb            w19, [BUFFER, #1]!
+    cmp             w19, #0xff
+    b.ne            14f
+    strb            wzr, [BUFFER, #1]!
+14:
+.endm
+.macro put_bits CODE, SIZE
+    lsl             PUT_BUFFER, PUT_BUFFER, \SIZE
+    add             PUT_BITS, PUT_BITS, \SIZE
+    orr             PUT_BUFFER, PUT_BUFFER, \CODE
+.endm
+.macro checkbuf31
+    cmp             PUT_BITS, #0x20
+    b.lt            31f
+    emit_byte
+    emit_byte
+    emit_byte
+    emit_byte
+31:
+.endm
+.macro checkbuf47
+    cmp             PUT_BITS, #0x30
+    b.lt            47f
+    emit_byte
+    emit_byte
+    emit_byte
+    emit_byte
+    emit_byte
+    emit_byte
+47:
+.endm
+
+.macro generate_jsimd_huff_encode_one_block fast_tbl
+
+.if \fast_tbl == 1
+asm_function jsimd_huff_encode_one_block_neon
+.else
+asm_function jsimd_huff_encode_one_block_neon_slowtbl
+.endif
+    sub             sp, sp, 272
+    sub             BUFFER, BUFFER, #0x1    /* BUFFER=buffer-- */
+    /* Save Arm registers */
+    stp             x19, x20, [sp]
+    get_symbol_loc  x15, Ljsimd_huff_encode_one_block_neon_consts
+    ldr             PUT_BUFFER, [x0, #0x10]
+    ldr             PUT_BITSw, [x0, #0x18]
+    ldrsh           w12, [x2]               /* load DC coeff in w12 */
+    /* prepare data */
+.if \fast_tbl == 1
+    ld1             {v23.16b}, [x15], #16
+    ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
+    ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
+    ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
+    ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
+    ld1             {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
+    sub             w12, w12, w3      /* last_dc_val, not used afterwards */
+    /* ZigZag 8x8 */
+    tbl             v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
+    tbl             v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
+    tbl             v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
+    tbl             v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
+    tbl             v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
+    tbl             v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
+    tbl             v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
+    tbl             v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
+    ins             v0.h[0], w12
+    tbx             v1.16b, {v28.16b}, v16.16b
+    tbx             v2.16b, {v29.16b, v30.16b}, v17.16b
+    tbx             v5.16b, {v29.16b, v30.16b}, v18.16b
+    tbx             v6.16b, {v31.16b}, v19.16b
+.else
+      add             x13, x2, #0x22
+      sub             w12, w12, w3    /* last_dc_val, not used afterwards */
+    ld1             {v23.16b}, [x15]
+      add             x14, x2, #0x18
+      add             x3, x2, #0x36
+    ins             v0.h[0], w12
+      add             x9, x2, #0x2
+    ld1             {v1.h}[0], [x13]
+      add             x15, x2, #0x30
+    ld1             {v2.h}[0], [x14]
+      add             x19, x2, #0x26
+    ld1             {v3.h}[0], [x3]
+      add             x20, x2, #0x28
+    ld1             {v0.h}[1], [x9]
+      add             x12, x2, #0x10
+    ld1             {v1.h}[1], [x15]
+      add             x13, x2, #0x40
+    ld1             {v2.h}[1], [x19]
+      add             x14, x2, #0x34
+    ld1             {v3.h}[1], [x20]
+      add             x3, x2, #0x1a
+    ld1             {v0.h}[2], [x12]
+      add             x9, x2, #0x20
+    ld1             {v1.h}[2], [x13]
+      add             x15, x2, #0x32
+    ld1             {v2.h}[2], [x14]
+      add             x19, x2, #0x42
+    ld1             {v3.h}[2], [x3]
+      add             x20, x2, #0xc
+    ld1             {v0.h}[3], [x9]
+      add             x12, x2, #0x12
+    ld1             {v1.h}[3], [x15]
+      add             x13, x2, #0x24
+    ld1             {v2.h}[3], [x19]
+      add             x14, x2, #0x50
+    ld1             {v3.h}[3], [x20]
+      add             x3, x2, #0xe
+    ld1             {v0.h}[4], [x12]
+      add             x9, x2, #0x4
+    ld1             {v1.h}[4], [x13]
+      add             x15, x2, #0x16
+    ld1             {v2.h}[4], [x14]
+      add             x19, x2, #0x60
+    ld1             {v3.h}[4], [x3]
+      add             x20, x2, #0x1c
+    ld1             {v0.h}[5], [x9]
+      add             x12, x2, #0x6
+    ld1             {v1.h}[5], [x15]
+      add             x13, x2, #0x8
+    ld1             {v2.h}[5], [x19]
+      add             x14, x2, #0x52
+    ld1             {v3.h}[5], [x20]
+      add             x3, x2, #0x2a
+    ld1             {v0.h}[6], [x12]
+      add             x9, x2, #0x14
+    ld1             {v1.h}[6], [x13]
+      add             x15, x2, #0xa
+    ld1             {v2.h}[6], [x14]
+      add             x19, x2, #0x44
+    ld1             {v3.h}[6], [x3]
+      add             x20, x2, #0x38
+    ld1             {v0.h}[7], [x9]
+      add             x12, x2, #0x46
+    ld1             {v1.h}[7], [x15]
+      add             x13, x2, #0x3a
+    ld1             {v2.h}[7], [x19]
+      add             x14, x2, #0x74
+    ld1             {v3.h}[7], [x20]
+      add             x3, x2, #0x6a
+    ld1             {v4.h}[0], [x12]
+      add             x9, x2, #0x54
+    ld1             {v5.h}[0], [x13]
+      add             x15, x2, #0x2c
+    ld1             {v6.h}[0], [x14]
+      add             x19, x2, #0x76
+    ld1             {v7.h}[0], [x3]
+      add             x20, x2, #0x78
+    ld1             {v4.h}[1], [x9]
+      add             x12, x2, #0x62
+    ld1             {v5.h}[1], [x15]
+      add             x13, x2, #0x1e
+    ld1             {v6.h}[1], [x19]
+      add             x14, x2, #0x68
+    ld1             {v7.h}[1], [x20]
+      add             x3, x2, #0x7a
+    ld1             {v4.h}[2], [x12]
+      add             x9, x2, #0x70
+    ld1             {v5.h}[2], [x13]
+      add             x15, x2, #0x2e
+    ld1             {v6.h}[2], [x14]
+      add             x19, x2, #0x5a
+    ld1             {v7.h}[2], [x3]
+      add             x20, x2, #0x6c
+    ld1             {v4.h}[3], [x9]
+      add             x12, x2, #0x72
+    ld1             {v5.h}[3], [x15]
+      add             x13, x2, #0x3c
+    ld1             {v6.h}[3], [x19]
+      add             x14, x2, #0x4c
+    ld1             {v7.h}[3], [x20]
+      add             x3, x2, #0x5e
+    ld1             {v4.h}[4], [x12]
+      add             x9, x2, #0x64
+    ld1             {v5.h}[4], [x13]
+      add             x15, x2, #0x4a
+    ld1             {v6.h}[4], [x14]
+      add             x19, x2, #0x3e
+    ld1             {v7.h}[4], [x3]
+      add             x20, x2, #0x6e
+    ld1             {v4.h}[5], [x9]
+      add             x12, x2, #0x56
+    ld1             {v5.h}[5], [x15]
+      add             x13, x2, #0x58
+    ld1             {v6.h}[5], [x19]
+      add             x14, x2, #0x4e
+    ld1             {v7.h}[5], [x20]
+      add             x3, x2, #0x7c
+    ld1             {v4.h}[6], [x12]
+      add             x9, x2, #0x48
+    ld1             {v5.h}[6], [x13]
+      add             x15, x2, #0x66
+    ld1             {v6.h}[6], [x14]
+      add             x19, x2, #0x5c
+    ld1             {v7.h}[6], [x3]
+      add             x20, x2, #0x7e
+    ld1             {v4.h}[7], [x9]
+    ld1             {v5.h}[7], [x15]
+    ld1             {v6.h}[7], [x19]
+    ld1             {v7.h}[7], [x20]
+.endif
+    cmlt            v24.8h, v0.8h, #0
+    cmlt            v25.8h, v1.8h, #0
+    cmlt            v26.8h, v2.8h, #0
+    cmlt            v27.8h, v3.8h, #0
+    cmlt            v28.8h, v4.8h, #0
+    cmlt            v29.8h, v5.8h, #0
+    cmlt            v30.8h, v6.8h, #0
+    cmlt            v31.8h, v7.8h, #0
+    abs             v0.8h, v0.8h
+    abs             v1.8h, v1.8h
+    abs             v2.8h, v2.8h
+    abs             v3.8h, v3.8h
+    abs             v4.8h, v4.8h
+    abs             v5.8h, v5.8h
+    abs             v6.8h, v6.8h
+    abs             v7.8h, v7.8h
+    eor             v24.16b, v24.16b, v0.16b
+    eor             v25.16b, v25.16b, v1.16b
+    eor             v26.16b, v26.16b, v2.16b
+    eor             v27.16b, v27.16b, v3.16b
+    eor             v28.16b, v28.16b, v4.16b
+    eor             v29.16b, v29.16b, v5.16b
+    eor             v30.16b, v30.16b, v6.16b
+    eor             v31.16b, v31.16b, v7.16b
+    cmeq            v16.8h, v0.8h, #0
+    cmeq            v17.8h, v1.8h, #0
+    cmeq            v18.8h, v2.8h, #0
+    cmeq            v19.8h, v3.8h, #0
+    cmeq            v20.8h, v4.8h, #0
+    cmeq            v21.8h, v5.8h, #0
+    cmeq            v22.8h, v6.8h, #0
+    xtn             v16.8b, v16.8h
+    xtn             v18.8b, v18.8h
+    xtn             v20.8b, v20.8h
+    xtn             v22.8b, v22.8h
+      umov            w14, v0.h[0]
+    xtn2            v16.16b, v17.8h
+      umov            w13, v24.h[0]
+    xtn2            v18.16b, v19.8h
+      clz             w14, w14
+    xtn2            v20.16b, v21.8h
+      lsl             w13, w13, w14
+    cmeq            v17.8h, v7.8h, #0
+      sub             w12, w14, #32
+    xtn2            v22.16b, v17.8h
+      lsr             w13, w13, w14
+    and             v16.16b, v16.16b, v23.16b
+      neg             w12, w12
+    and             v18.16b, v18.16b, v23.16b
+      add             x3, x4, #0x400           /* r1 = dctbl->ehufsi */
+    and             v20.16b, v20.16b, v23.16b
+      add             x15, sp, #0x90           /* x15 = t2 */
+    and             v22.16b, v22.16b, v23.16b
+      ldr             w10, [x4, x12, lsl #2]
+    addp            v16.16b, v16.16b, v18.16b
+      ldrb            w11, [x3, x12]
+    addp            v20.16b, v20.16b, v22.16b
+      checkbuf47
+    addp            v16.16b, v16.16b, v20.16b
+      put_bits        x10, x11
+    addp            v16.16b, v16.16b, v18.16b
+      checkbuf47
+    umov            x9, v16.D[0]
+      put_bits        x13, x12
+    cnt             v17.8b, v16.8b
+      mvn             x9, x9
+    addv            B18, v17.8b
+      add             x4, x5, #0x400   /* x4 = actbl->ehufsi */
+    umov            w12, v18.b[0]
+      lsr             x9, x9, #0x1     /* clear AC coeff */
+    ldr             w13, [x5, #0x3c0]  /* x13 = actbl->ehufco[0xf0] */
+    rbit            x9, x9             /* x9 = index0 */
+    ldrb            w14, [x4, #0xf0]   /* x14 = actbl->ehufsi[0xf0] */
+    cmp             w12, #(64-8)
+    add             x11, sp, #16
+    b.lt            4f
+    cbz             x9, 6f
+    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
+    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
+    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
+    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
+1:
+    clz             x2, x9
+    add             x15, x15, x2, lsl #1
+    lsl             x9, x9, x2
+    ldrh            w20, [x15, #-126]
+2:
+    cmp             x2, #0x10
+    b.lt            3f
+    sub             x2, x2, #0x10
+    checkbuf47
+    put_bits        x13, x14
+    b               2b
+3:
+    clz             w20, w20
+    ldrh            w3, [x15, #2]!
+    sub             w11, w20, #32
+    lsl             w3, w3, w20
+    neg             w11, w11
+    lsr             w3, w3, w20
+    add             x2, x11, x2, lsl #4
+    lsl             x9, x9, #0x1
+    ldr             w12, [x5, x2, lsl #2]
+    ldrb            w10, [x4, x2]
+    checkbuf31
+    put_bits        x12, x10
+    put_bits        x3, x11
+    cbnz            x9, 1b
+    b               6f
+4:
+    movi            v21.8h, #0x0010
+    clz             v0.8h, v0.8h
+    clz             v1.8h, v1.8h
+    clz             v2.8h, v2.8h
+    clz             v3.8h, v3.8h
+    clz             v4.8h, v4.8h
+    clz             v5.8h, v5.8h
+    clz             v6.8h, v6.8h
+    clz             v7.8h, v7.8h
+    ushl            v24.8h, v24.8h, v0.8h
+    ushl            v25.8h, v25.8h, v1.8h
+    ushl            v26.8h, v26.8h, v2.8h
+    ushl            v27.8h, v27.8h, v3.8h
+    ushl            v28.8h, v28.8h, v4.8h
+    ushl            v29.8h, v29.8h, v5.8h
+    ushl            v30.8h, v30.8h, v6.8h
+    ushl            v31.8h, v31.8h, v7.8h
+    neg             v0.8h, v0.8h
+    neg             v1.8h, v1.8h
+    neg             v2.8h, v2.8h
+    neg             v3.8h, v3.8h
+    neg             v4.8h, v4.8h
+    neg             v5.8h, v5.8h
+    neg             v6.8h, v6.8h
+    neg             v7.8h, v7.8h
+    ushl            v24.8h, v24.8h, v0.8h
+    ushl            v25.8h, v25.8h, v1.8h
+    ushl            v26.8h, v26.8h, v2.8h
+    ushl            v27.8h, v27.8h, v3.8h
+    ushl            v28.8h, v28.8h, v4.8h
+    ushl            v29.8h, v29.8h, v5.8h
+    ushl            v30.8h, v30.8h, v6.8h
+    ushl            v31.8h, v31.8h, v7.8h
+    add             v0.8h, v21.8h, v0.8h
+    add             v1.8h, v21.8h, v1.8h
+    add             v2.8h, v21.8h, v2.8h
+    add             v3.8h, v21.8h, v3.8h
+    add             v4.8h, v21.8h, v4.8h
+    add             v5.8h, v21.8h, v5.8h
+    add             v6.8h, v21.8h, v6.8h
+    add             v7.8h, v21.8h, v7.8h
+    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
+    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
+    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
+    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
+1:
+    clz             x2, x9
+    add             x15, x15, x2, lsl #1
+    lsl             x9, x9, x2
+    ldrh            w11, [x15, #-126]
+2:
+    cmp             x2, #0x10
+    b.lt            3f
+    sub             x2, x2, #0x10
+    checkbuf47
+    put_bits        x13, x14
+    b               2b
+3:
+    ldrh            w3, [x15, #2]!
+    add             x2, x11, x2, lsl #4
+    lsl             x9, x9, #0x1
+    ldr             w12, [x5, x2, lsl #2]
+    ldrb            w10, [x4, x2]
+    checkbuf31
+    put_bits        x12, x10
+    put_bits        x3, x11
+    cbnz            x9, 1b
+6:
+    add             x13, sp, #0x10e
+    cmp             x15, x13
+    b.hs            1f
+    ldr             w12, [x5]
+    ldrb            w14, [x4]
+    checkbuf47
+    put_bits        x12, x14
+1:
+    str             PUT_BUFFER, [x0, #0x10]
+    str             PUT_BITSw, [x0, #0x18]
+    ldp             x19, x20, [sp], 16
+    add             x0, BUFFER, #0x1
+    add             sp, sp, 256
+    br              x30
+
+.endm
+
+generate_jsimd_huff_encode_one_block 1
+generate_jsimd_huff_encode_one_block 0
+
+    .unreq          BUFFER
+    .unreq          PUT_BUFFER
+    .unreq          PUT_BITS
+    .unreq          PUT_BITSw
+
+.purgem emit_byte
+.purgem put_bits
+.purgem checkbuf31
+.purgem checkbuf47
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/align.h b/3rdparty/libjpeg-turbo/src/simd/arm/align.h
new file mode 100644
index 0000000000..cff4241e84
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/align.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* How to obtain memory alignment for structures and variables */
+#if defined(_MSC_VER)
+#define ALIGN(alignment)  __declspec(align(alignment))
+#elif defined(__clang__) || defined(__GNUC__)
+#define ALIGN(alignment)  __attribute__((aligned(alignment)))
+#else
+#error "Unknown compiler"
+#endif
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jccolor-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jccolor-neon.c
new file mode 100644
index 0000000000..9fcc62dd25
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/jccolor-neon.c
@@ -0,0 +1,160 @@
+/*
+ * jccolor-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* RGB -> YCbCr conversion constants */
+
+#define F_0_298  19595
+#define F_0_587  38470
+#define F_0_113  7471
+#define F_0_168  11059
+#define F_0_331  21709
+#define F_0_500  32768
+#define F_0_418  27439
+#define F_0_081  5329
+
+ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
+  F_0_298, F_0_587, F_0_113, F_0_168,
+  F_0_331, F_0_500, F_0_418, F_0_081
+};
+
+
+/* Include inline routines for colorspace extensions. */
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extrgb_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extrgbx_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extbgr_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extbgrx_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extxbgr_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extxrgb_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jcgray-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jcgray-neon.c
new file mode 100644
index 0000000000..71c7b2de21
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/jcgray-neon.c
@@ -0,0 +1,120 @@
+/*
+ * jcgray-neon.c - grayscale colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* RGB -> Grayscale conversion constants */
+
+#define F_0_298  19595
+#define F_0_587  38470
+#define F_0_113  7471
+
+
+/* Include inline routines for colorspace extensions. */
+
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extrgb_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extrgbx_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extbgr_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extbgrx_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extxbgr_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extxrgb_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jcgryext-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jcgryext-neon.c
new file mode 100644
index 0000000000..416a7385df
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/jcgryext-neon.c
@@ -0,0 +1,106 @@
+/*
+ * jcgryext-neon.c - grayscale colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-neon.c */
+
+
+/* RGB -> Grayscale conversion is defined by the following equation:
+ *    Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ *    0.29899597 = 19595 * 2^-16
+ *    0.58700561 = 38470 * 2^-16
+ *    0.11399841 =  7471 * 2^-16
+ * These constants are defined in jcgray-neon.c
+ *
+ * This is the same computation as the RGB -> Y portion of RGB -> YCbCr.
+ */
+
+void jsimd_rgb_gray_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+                                 JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                 int num_rows)
+{
+  JSAMPROW inptr;
+  JSAMPROW outptr;
+  /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
+  ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr = output_buf[0][output_row];
+    output_row++;
+
+    int cols_remaining = image_width;
+    for (; cols_remaining > 0; cols_remaining -= 16) {
+
+      /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 16) columns of data are first memcopied to a temporary
+       * buffer large enough to accommodate the vector load.
+       */
+      if (cols_remaining < 16) {
+        memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+        inptr = tmp_buf;
+      }
+
+#if RGB_PIXELSIZE == 4
+      uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+      uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+      uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+      uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_ll = vmull_n_u16(vget_low_u16(r_l), F_0_298);
+      uint32x4_t y_lh = vmull_n_u16(vget_high_u16(r_l), F_0_298);
+      uint32x4_t y_hl = vmull_n_u16(vget_low_u16(r_h), F_0_298);
+      uint32x4_t y_hh = vmull_n_u16(vget_high_u16(r_h), F_0_298);
+      y_ll = vmlal_n_u16(y_ll, vget_low_u16(g_l), F_0_587);
+      y_lh = vmlal_n_u16(y_lh, vget_high_u16(g_l), F_0_587);
+      y_hl = vmlal_n_u16(y_hl, vget_low_u16(g_h), F_0_587);
+      y_hh = vmlal_n_u16(y_hh, vget_high_u16(g_h), F_0_587);
+      y_ll = vmlal_n_u16(y_ll, vget_low_u16(b_l), F_0_113);
+      y_lh = vmlal_n_u16(y_lh, vget_high_u16(b_l), F_0_113);
+      y_hl = vmlal_n_u16(y_hl, vget_low_u16(b_h), F_0_113);
+      y_hh = vmlal_n_u16(y_hh, vget_high_u16(b_h), F_0_113);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+                                    vrshrn_n_u32(y_lh, 16));
+      uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+                                    vrshrn_n_u32(y_hh, 16));
+
+      /* Narrow Y values to 8-bit and store to memory.  Buffer overwrite is
+       * permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1q_u8(outptr, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+
+      /* Increment pointers. */
+      inptr += (16 * RGB_PIXELSIZE);
+      outptr += 16;
+    }
+  }
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jchuff.h b/3rdparty/libjpeg-turbo/src/simd/arm/jchuff.h
new file mode 100644
index 0000000000..2fbd252b9b
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/jchuff.h
@@ -0,0 +1,131 @@
+/*
+ * jchuff.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2009, 2018, 2021, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
+ * Copyright (C) 2020-2021, Arm Limited.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+/* Expanded entropy encoder object for Huffman encoding.
+ *
+ * The savable_state subrecord contains fields that change within an MCU,
+ * but must not be updated permanently until we complete the MCU.
+ */
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#define BIT_BUF_SIZE  64
+#else
+#define BIT_BUF_SIZE  32
+#endif
+
+typedef struct {
+  size_t put_buffer;                    /* current bit accumulation buffer */
+  int free_bits;                        /* # of bits available in it */
+  int last_dc_val[MAX_COMPS_IN_SCAN];   /* last DC coef for each component */
+} savable_state;
+
+typedef struct {
+  JOCTET *next_output_byte;     /* => next byte to write in buffer */
+  size_t free_in_buffer;        /* # of byte spaces remaining in buffer */
+  savable_state cur;            /* Current bit buffer & DC state */
+  j_compress_ptr cinfo;         /* dump_buffer needs access to this */
+  int simd;
+} working_state;
+
+/* Outputting bits to the file */
+
+/* Output byte b and, speculatively, an additional 0 byte. 0xFF must be encoded
+ * as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the byte is
+ * 0xFF.  Otherwise, the output buffer pointer is advanced by 1, and the
+ * speculative 0 byte will be overwritten by the next byte.
+ */
+#define EMIT_BYTE(b) { \
+  buffer[0] = (JOCTET)(b); \
+  buffer[1] = 0; \
+  buffer -= -2 + ((JOCTET)(b) < 0xFF); \
+}
+
+/* Output the entire bit buffer.  If there are no 0xFF bytes in it, then write
+ * directly to the output buffer.  Otherwise, use the EMIT_BYTE() macro to
+ * encode 0xFF as 0xFF 0x00.
+ */
+#if defined(__aarch64__) || defined(_M_ARM64)
+
+#define FLUSH() { \
+  if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
+    EMIT_BYTE(put_buffer >> 56) \
+    EMIT_BYTE(put_buffer >> 48) \
+    EMIT_BYTE(put_buffer >> 40) \
+    EMIT_BYTE(put_buffer >> 32) \
+    EMIT_BYTE(put_buffer >> 24) \
+    EMIT_BYTE(put_buffer >> 16) \
+    EMIT_BYTE(put_buffer >>  8) \
+    EMIT_BYTE(put_buffer      ) \
+  } else { \
+    *((uint64_t *)buffer) = BUILTIN_BSWAP64(put_buffer); \
+    buffer += 8; \
+  } \
+}
+
+#else
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#define SPLAT() { \
+  buffer[0] = (JOCTET)(put_buffer >> 24); \
+  buffer[1] = (JOCTET)(put_buffer >> 16); \
+  buffer[2] = (JOCTET)(put_buffer >>  8); \
+  buffer[3] = (JOCTET)(put_buffer      ); \
+  buffer += 4; \
+}
+#else
+#define SPLAT() { \
+  put_buffer = __builtin_bswap32(put_buffer); \
+  __asm__("str %1, [%0], #4" : "+r" (buffer) : "r" (put_buffer)); \
+}
+#endif
+
+#define FLUSH() { \
+  if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
+    EMIT_BYTE(put_buffer >> 24) \
+    EMIT_BYTE(put_buffer >> 16) \
+    EMIT_BYTE(put_buffer >>  8) \
+    EMIT_BYTE(put_buffer      ) \
+  } else { \
+    SPLAT(); \
+  } \
+}
+
+#endif
+
+/* Fill the bit buffer to capacity with the leading bits from code, then output
+ * the bit buffer and put the remaining bits from code into the bit buffer.
+ */
+#define PUT_AND_FLUSH(code, size) { \
+  put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \
+  FLUSH() \
+  free_bits += BIT_BUF_SIZE; \
+  put_buffer = code; \
+}
+
+/* Insert code into the bit buffer and output the bit buffer if needed.
+ * NOTE: We can't flush with free_bits == 0, since the left shift in
+ * PUT_AND_FLUSH() would have undefined behavior.
+ */
+#define PUT_BITS(code, size) { \
+  free_bits -= size; \
+  if (free_bits < 0) \
+    PUT_AND_FLUSH(code, size) \
+  else \
+    put_buffer = (put_buffer << size) | code; \
+}
+
+#define PUT_CODE(code, size, diff) { \
+  diff |= code << nbits; \
+  nbits += size; \
+  PUT_BITS(diff, nbits) \
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jcphuff-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jcphuff-neon.c
new file mode 100644
index 0000000000..b91c5db478
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/jcphuff-neon.c
@@ -0,0 +1,622 @@
+/*
+ * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "jconfigint.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* Data preparation for encode_mcu_AC_first().
+ *
+ * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
+ * found in jcphuff.c.
+ */
+
+void jsimd_encode_mcu_AC_first_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *values, size_t *zerobits)
+{
+  JCOEF *values_ptr = values;
+  JCOEF *diff_values_ptr = values + DCTSIZE2;
+
+  /* Rows of coefficients to zero (since they haven't been processed) */
+  int i, rows_to_zero = 8;
+
+  for (i = 0; i < Sl / 16; i++) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+    /* Isolate sign of coefficients. */
+    int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
+    int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+
+    /* Compute diff values. */
+    int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
+    int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+
+    /* Store transformed coefficients and diff values. */
+    vst1q_s16(values_ptr, coefs1);
+    vst1q_s16(values_ptr + DCTSIZE, coefs2);
+    vst1q_s16(diff_values_ptr, diff1);
+    vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+    values_ptr += 16;
+    diff_values_ptr += 16;
+    jpeg_natural_order_start += 16;
+    rows_to_zero -= 2;
+  }
+
+  /* Same operation but for remaining partial vector */
+  int remaining_coefs = Sl % 16;
+  if (remaining_coefs > 8) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vdupq_n_s16(0);
+    switch (remaining_coefs) {
+    case 15:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 14:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 13:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 12:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 11:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 10:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 9:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+
+    /* Isolate sign of coefficients. */
+    int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
+    int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+
+    /* Compute diff values. */
+    int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
+    int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+
+    /* Store transformed coefficients and diff values. */
+    vst1q_s16(values_ptr, coefs1);
+    vst1q_s16(values_ptr + DCTSIZE, coefs2);
+    vst1q_s16(diff_values_ptr, diff1);
+    vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+    values_ptr += 16;
+    diff_values_ptr += 16;
+    rows_to_zero -= 2;
+
+  } else if (remaining_coefs > 0) {
+    int16x8_t coefs = vdupq_n_s16(0);
+
+    switch (remaining_coefs) {
+    case 8:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 7:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 6:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 5:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 4:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 3:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 2:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 1:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+
+    /* Isolate sign of coefficients. */
+    int16x8_t sign_coefs = vshrq_n_s16(coefs, 15);
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs = vabsq_s16(coefs);
+    coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
+
+    /* Compute diff values. */
+    int16x8_t diff = veorq_s16(coefs, sign_coefs);
+
+    /* Store transformed coefficients and diff values. */
+    vst1q_s16(values_ptr, coefs);
+    vst1q_s16(diff_values_ptr, diff);
+    values_ptr += 8;
+    diff_values_ptr += 8;
+    rows_to_zero--;
+  }
+
+  /* Zero remaining memory in the values and diff_values blocks. */
+  for (i = 0; i < rows_to_zero; i++) {
+    vst1q_s16(values_ptr, vdupq_n_s16(0));
+    vst1q_s16(diff_values_ptr, vdupq_n_s16(0));
+    values_ptr += 8;
+    diff_values_ptr += 8;
+  }
+
+  /* Construct zerobits bitmap.  A set bit means that the corresponding
+   * coefficient != 0.
+   */
+  int16x8_t row0 = vld1q_s16(values + 0 * DCTSIZE);
+  int16x8_t row1 = vld1q_s16(values + 1 * DCTSIZE);
+  int16x8_t row2 = vld1q_s16(values + 2 * DCTSIZE);
+  int16x8_t row3 = vld1q_s16(values + 3 * DCTSIZE);
+  int16x8_t row4 = vld1q_s16(values + 4 * DCTSIZE);
+  int16x8_t row5 = vld1q_s16(values + 5 * DCTSIZE);
+  int16x8_t row6 = vld1q_s16(values + 6 * DCTSIZE);
+  int16x8_t row7 = vld1q_s16(values + 7 * DCTSIZE);
+
+  uint8x8_t row0_eq0 = vmovn_u16(vceqq_s16(row0, vdupq_n_s16(0)));
+  uint8x8_t row1_eq0 = vmovn_u16(vceqq_s16(row1, vdupq_n_s16(0)));
+  uint8x8_t row2_eq0 = vmovn_u16(vceqq_s16(row2, vdupq_n_s16(0)));
+  uint8x8_t row3_eq0 = vmovn_u16(vceqq_s16(row3, vdupq_n_s16(0)));
+  uint8x8_t row4_eq0 = vmovn_u16(vceqq_s16(row4, vdupq_n_s16(0)));
+  uint8x8_t row5_eq0 = vmovn_u16(vceqq_s16(row5, vdupq_n_s16(0)));
+  uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0)));
+  uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0)));
+
+  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
+  const uint8x8_t bitmap_mask =
+    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
+
+  row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
+  row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
+  row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
+  row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
+  row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
+  row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
+  row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
+  row7_eq0 = vand_u8(row7_eq0, bitmap_mask);
+
+  uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
+  uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
+  uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
+  uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
+  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+  /* Move bitmap to a 64-bit scalar register. */
+  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+  /* Store zerobits bitmap. */
+  *zerobits = ~bitmap;
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+  /* Store zerobits bitmap. */
+  zerobits[0] = ~bitmap0;
+  zerobits[1] = ~bitmap1;
+#endif
+}
+
+
+/* Data preparation for encode_mcu_AC_refine().
+ *
+ * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
+ * found in jcphuff.c.
+ */
+
+int jsimd_encode_mcu_AC_refine_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *absvalues, size_t *bits)
+{
+  /* Temporary storage buffers for data used to compute the signbits bitmap and
+   * the end-of-block (EOB) position
+   */
+  uint8_t coef_sign_bits[64];
+  uint8_t coef_eq1_bits[64];
+
+  JCOEF *absvalues_ptr = absvalues;
+  uint8_t *coef_sign_bits_ptr = coef_sign_bits;
+  uint8_t *eq1_bits_ptr = coef_eq1_bits;
+
+  /* Rows of coefficients to zero (since they haven't been processed) */
+  int i, rows_to_zero = 8;
+
+  for (i = 0; i < Sl / 16; i++) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+    /* Compute and store data for signbits bitmap. */
+    uint8x8_t sign_coefs1 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
+    uint8x8_t sign_coefs2 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+    vst1q_s16(absvalues_ptr, coefs1);
+    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+
+    /* Test whether transformed coefficient values == 1 (used to find EOB
+     * position.)
+     */
+    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
+    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+    vst1_u8(eq1_bits_ptr, coefs_eq11);
+    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+    absvalues_ptr += 16;
+    coef_sign_bits_ptr += 16;
+    eq1_bits_ptr += 16;
+    jpeg_natural_order_start += 16;
+    rows_to_zero -= 2;
+  }
+
+  /* Same operation but for remaining partial vector */
+  int remaining_coefs = Sl % 16;
+  if (remaining_coefs > 8) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vdupq_n_s16(0);
+    switch (remaining_coefs) {
+    case 15:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 14:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 13:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 12:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 11:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 10:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 9:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+
+    /* Compute and store data for signbits bitmap. */
+    uint8x8_t sign_coefs1 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
+    uint8x8_t sign_coefs2 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+    vst1q_s16(absvalues_ptr, coefs1);
+    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+
+    /* Test whether transformed coefficient values == 1 (used to find EOB
+     * position.)
+     */
+    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
+    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+    vst1_u8(eq1_bits_ptr, coefs_eq11);
+    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+    absvalues_ptr += 16;
+    coef_sign_bits_ptr += 16;
+    eq1_bits_ptr += 16;
+    jpeg_natural_order_start += 16;
+    rows_to_zero -= 2;
+
+  } else if (remaining_coefs > 0) {
+    int16x8_t coefs = vdupq_n_s16(0);
+
+    switch (remaining_coefs) {
+    case 8:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 7:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 6:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 5:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 4:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 3:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 2:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 1:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+
+    /* Compute and store data for signbits bitmap. */
+    uint8x8_t sign_coefs =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
+    vst1_u8(coef_sign_bits_ptr, sign_coefs);
+
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs = vabsq_s16(coefs);
+    coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
+    vst1q_s16(absvalues_ptr, coefs);
+
+    /* Test whether transformed coefficient values == 1 (used to find EOB
+     * position.)
+     */
+    uint8x8_t coefs_eq1 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(1)));
+    vst1_u8(eq1_bits_ptr, coefs_eq1);
+
+    absvalues_ptr += 8;
+    coef_sign_bits_ptr += 8;
+    eq1_bits_ptr += 8;
+    rows_to_zero--;
+  }
+
+  /* Zero remaining memory in blocks. */
+  for (i = 0; i < rows_to_zero; i++) {
+    vst1q_s16(absvalues_ptr, vdupq_n_s16(0));
+    vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
+    vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
+    absvalues_ptr += 8;
+    coef_sign_bits_ptr += 8;
+    eq1_bits_ptr += 8;
+  }
+
+  /* Construct zerobits bitmap. */
+  int16x8_t abs_row0 = vld1q_s16(absvalues + 0 * DCTSIZE);
+  int16x8_t abs_row1 = vld1q_s16(absvalues + 1 * DCTSIZE);
+  int16x8_t abs_row2 = vld1q_s16(absvalues + 2 * DCTSIZE);
+  int16x8_t abs_row3 = vld1q_s16(absvalues + 3 * DCTSIZE);
+  int16x8_t abs_row4 = vld1q_s16(absvalues + 4 * DCTSIZE);
+  int16x8_t abs_row5 = vld1q_s16(absvalues + 5 * DCTSIZE);
+  int16x8_t abs_row6 = vld1q_s16(absvalues + 6 * DCTSIZE);
+  int16x8_t abs_row7 = vld1q_s16(absvalues + 7 * DCTSIZE);
+
+  uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0)));
+  uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_s16(abs_row1, vdupq_n_s16(0)));
+  uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_s16(abs_row2, vdupq_n_s16(0)));
+  uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_s16(abs_row3, vdupq_n_s16(0)));
+  uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_s16(abs_row4, vdupq_n_s16(0)));
+  uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_s16(abs_row5, vdupq_n_s16(0)));
+  uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0)));
+  uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0)));
+
+  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
+  const uint8x8_t bitmap_mask =
+    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
+
+  abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
+  abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
+  abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
+  abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
+  abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
+  abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
+  abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
+  abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);
+
+  uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
+  uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
+  uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
+  uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
+  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+  /* Move bitmap to a 64-bit scalar register. */
+  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+  /* Store zerobits bitmap. */
+  bits[0] = ~bitmap;
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+  /* Store zerobits bitmap. */
+  bits[0] = ~bitmap0;
+  bits[1] = ~bitmap1;
+#endif
+
+  /* Construct signbits bitmap. */
+  uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
+  uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
+  uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
+  uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
+  uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
+  uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
+  uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
+  uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);
+
+  signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
+  signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
+  signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
+  signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
+  signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
+  signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
+  signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
+  signbits_row7 = vand_u8(signbits_row7, bitmap_mask);
+
+  bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
+  bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
+  bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
+  bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
+  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+  /* Move bitmap to a 64-bit scalar register. */
+  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+  /* Store signbits bitmap. */
+  bits[1] = ~bitmap;
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+  /* Store signbits bitmap. */
+  bits[2] = ~bitmap0;
+  bits[3] = ~bitmap1;
+#endif
+
+  /* Construct bitmap to find EOB position (the index of the last coefficient
+   * equal to 1.)
+   */
+  uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
+  uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
+  uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
+  uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
+  uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
+  uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
+  uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
+  uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);
+
+  row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
+  row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
+  row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
+  row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
+  row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
+  row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
+  row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
+  row7_eq1 = vand_u8(row7_eq1, bitmap_mask);
+
+  bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
+  bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
+  bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
+  bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
+  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+  /* Move bitmap to a 64-bit scalar register. */
+  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+
+  /* Return EOB position. */
+  if (bitmap == 0) {
+    /* EOB position is defined to be 0 if all coefficients != 1. */
+    return 0;
+  } else {
+    return 63 - BUILTIN_CLZLL(bitmap);
+  }
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+
+  /* Return EOB position. */
+  if (bitmap0 == 0 && bitmap1 == 0) {
+    return 0;
+  } else if (bitmap1 != 0) {
+    return 63 - BUILTIN_CLZ(bitmap1);
+  } else {
+    return 31 - BUILTIN_CLZ(bitmap0);
+  }
+#endif
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jcsample-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jcsample-neon.c
new file mode 100644
index 0000000000..8a3e237838
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/jcsample-neon.c
@@ -0,0 +1,192 @@
+/*
+ * jcsample-neon.c - downsampling (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = {
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 0 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 1 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 2 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 3 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 4 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 5 */
+  0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 6 */
+  0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 7 */
+  0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 8 */
+  0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06,   /* Pad 9 */
+  0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05,   /* Pad 10 */
+  0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04,   /* Pad 11 */
+  0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
+  0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03,   /* Pad 12 */
+  0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+  0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,   /* Pad 13 */
+  0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+  0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,   /* Pad 14 */
+  0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,   /* Pad 15 */
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+
+/* Downsample pixel values of a single component.
+ * This version handles the common case of 2:1 horizontal and 1:1 vertical,
+ * without smoothing.
+ */
+
+void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
+                                JDIMENSION v_samp_factor,
+                                JDIMENSION width_in_blocks,
+                                JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  JSAMPROW inptr, outptr;
+  /* Load expansion mask to pad remaining elements of last DCT block. */
+  const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
+  const uint8x16_t expand_mask =
+    vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
+  /* Load bias pattern (alternating every pixel.) */
+  /* { 0, 1, 0, 1, 0, 1, 0, 1 } */
+  const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000));
+  unsigned i, outrow;
+
+  for (outrow = 0; outrow < v_samp_factor; outrow++) {
+    outptr = output_data[outrow];
+    inptr = input_data[outrow];
+
+    /* Downsample all but the last DCT block of pixels. */
+    for (i = 0; i < width_in_blocks - 1; i++) {
+      uint8x16_t pixels = vld1q_u8(inptr + i * 2 * DCTSIZE);
+      /* Add adjacent pixel values, widen to 16-bit, and add bias. */
+      uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
+      /* Divide total by 2 and narrow to 8-bit. */
+      uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
+      /* Store samples to memory. */
+      vst1_u8(outptr + i * DCTSIZE, samples_u8);
+    }
+
+    /* Load pixels in last DCT block into a table. */
+    uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE);
+#if defined(__aarch64__) || defined(_M_ARM64)
+    /* Pad the empty elements with the value of the last pixel. */
+    pixels = vqtbl1q_u8(pixels, expand_mask);
+#else
+    uint8x8x2_t table = { { vget_low_u8(pixels), vget_high_u8(pixels) } };
+    pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)),
+                         vtbl2_u8(table, vget_high_u8(expand_mask)));
+#endif
+    /* Add adjacent pixel values, widen to 16-bit, and add bias. */
+    uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
+    /* Divide total by 2, narrow to 8-bit, and store. */
+    uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
+    vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
+  }
+}
+
+
+/* Downsample pixel values of a single component.
+ * This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+ * without smoothing.
+ */
+
+void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
+                                JDIMENSION v_samp_factor,
+                                JDIMENSION width_in_blocks,
+                                JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  JSAMPROW inptr0, inptr1, outptr;
+  /* Load expansion mask to pad remaining elements of last DCT block. */
+  const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
+  const uint8x16_t expand_mask =
+    vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
+  /* Load bias pattern (alternating every pixel.) */
+  /* { 1, 2, 1, 2, 1, 2, 1, 2 } */
+  const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001));
+  unsigned i, outrow;
+
+  for (outrow = 0; outrow < v_samp_factor; outrow++) {
+    outptr = output_data[outrow];
+    inptr0 = input_data[outrow];
+    inptr1 = input_data[outrow + 1];
+
+    /* Downsample all but the last DCT block of pixels. */
+    for (i = 0; i < width_in_blocks - 1; i++) {
+      uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE);
+      uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE);
+      /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
+      uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
+      /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate.
+       */
+      samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
+      /* Divide total by 4 and narrow to 8-bit. */
+      uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
+      /* Store samples to memory and increment pointers. */
+      vst1_u8(outptr + i * DCTSIZE, samples_u8);
+    }
+
+    /* Load pixels in last DCT block into a table. */
+    uint8x16_t pixels_r0 =
+      vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
+    uint8x16_t pixels_r1 =
+      vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
+#if defined(__aarch64__) || defined(_M_ARM64)
+    /* Pad the empty elements with the value of the last pixel. */
+    pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
+    pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
+#else
+    uint8x8x2_t table_r0 =
+      { { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) } };
+    uint8x8x2_t table_r1 =
+      { { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) } };
+    pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)),
+                            vtbl2_u8(table_r0, vget_high_u8(expand_mask)));
+    pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)),
+                            vtbl2_u8(table_r1, vget_high_u8(expand_mask)));
+#endif
+    /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
+    uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
+    /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate. */
+    samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
+    /* Divide total by 4, narrow to 8-bit, and store. */
+    uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
+    vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
+  }
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jdcolext-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jdcolext-neon.c
new file mode 100644
index 0000000000..c3c07a1964
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/jdcolext-neon.c
@@ -0,0 +1,374 @@
+/*
+ * jdcolext-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-neon.c. */
+
+
+/* YCbCr -> RGB conversion is defined by the following equations:
+ *    R = Y                        + 1.40200 * (Cr - 128)
+ *    G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
+ *    B = Y + 1.77200 * (Cb - 128)
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.3441467 = 11277 * 2^-15
+ *    0.7141418 = 23401 * 2^-15
+ *    1.4020386 = 22971 * 2^-14
+ *    1.7720337 = 29033 * 2^-14
+ * These constants are defined in jdcolor-neon.c.
+ *
+ * To ensure correct results, rounding is used when descaling.
+ */
+
+/* Notes on safe memory access for YCbCr -> RGB conversion routines:
+ *
+ * Input memory buffers can be safely overread up to the next multiple of
+ * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
+ * jmemmgr.c.
+ *
+ * The output buffer cannot safely be written beyond output_width, since
+ * output_buf points to a possibly unpadded row in the decompressed image
+ * buffer allocated by the calling program.
+ */
+
+void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, JSAMPIMAGE input_buf,
+                                JDIMENSION input_row, JSAMPARRAY output_buf,
+                                int num_rows)
+{
+  JSAMPROW outptr;
+  /* Pointers to Y, Cb, and Cr data */
+  JSAMPROW inptr0, inptr1, inptr2;
+
+  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+  const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+    int cols_remaining = output_width;
+    for (; cols_remaining >= 16; cols_remaining -= 16) {
+      uint8x16_t y  = vld1q_u8(inptr0);
+      uint8x16_t cb = vld1q_u8(inptr1);
+      uint8x16_t cr = vld1q_u8(inptr2);
+      /* Subtract 128 from Cb and Cr. */
+      int16x8_t cr_128_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+                                       vget_low_u8(cr)));
+      int16x8_t cr_128_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+                                       vget_high_u8(cr)));
+      int16x8_t cb_128_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+                                       vget_low_u8(cb)));
+      int16x8_t cb_128_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+                                       vget_high_u8(cb)));
+      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+      int32x4_t g_sub_y_ll = vmull_lane_s16(vget_low_s16(cb_128_l), consts, 0);
+      int32x4_t g_sub_y_lh = vmull_lane_s16(vget_high_s16(cb_128_l),
+                                            consts, 0);
+      int32x4_t g_sub_y_hl = vmull_lane_s16(vget_low_s16(cb_128_h), consts, 0);
+      int32x4_t g_sub_y_hh = vmull_lane_s16(vget_high_s16(cb_128_h),
+                                            consts, 0);
+      g_sub_y_ll = vmlsl_lane_s16(g_sub_y_ll, vget_low_s16(cr_128_l),
+                                  consts, 1);
+      g_sub_y_lh = vmlsl_lane_s16(g_sub_y_lh, vget_high_s16(cr_128_l),
+                                  consts, 1);
+      g_sub_y_hl = vmlsl_lane_s16(g_sub_y_hl, vget_low_s16(cr_128_h),
+                                  consts, 1);
+      g_sub_y_hh = vmlsl_lane_s16(g_sub_y_hh, vget_high_s16(cr_128_h),
+                                  consts, 1);
+      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+      int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15),
+                                         vrshrn_n_s32(g_sub_y_lh, 15));
+      int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15),
+                                         vrshrn_n_s32(g_sub_y_hh, 15));
+      /* Compute R-Y: 1.40200 * (Cr - 128) */
+      int16x8_t r_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_l, 1),
+                                               consts, 2);
+      int16x8_t r_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_h, 1),
+                                               consts, 2);
+      /* Compute B-Y: 1.77200 * (Cb - 128) */
+      int16x8_t b_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_l, 1),
+                                               consts, 3);
+      int16x8_t b_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_h, 1),
+                                               consts, 3);
+      /* Add Y. */
+      int16x8_t r_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l),
+                                       vget_low_u8(y)));
+      int16x8_t r_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h),
+                                       vget_high_u8(y)));
+      int16x8_t b_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l),
+                                       vget_low_u8(y)));
+      int16x8_t b_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h),
+                                       vget_high_u8(y)));
+      int16x8_t g_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l),
+                                       vget_low_u8(y)));
+      int16x8_t g_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h),
+                                       vget_high_u8(y)));
+
+#if RGB_PIXELSIZE == 4
+      uint8x16x4_t rgba;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
+      rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
+      rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
+      /* Set alpha channel to opaque (0xFF). */
+      rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+      /* Store RGBA pixel data to memory. */
+      vst4q_u8(outptr, rgba);
+#elif RGB_PIXELSIZE == 3
+      uint8x16x3_t rgb;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
+      rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
+      rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
+      /* Store RGB pixel data to memory. */
+      vst3q_u8(outptr, rgb);
+#else
+      /* Pack R, G, and B values in ratio 5:6:5. */
+      uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8);
+      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5);
+      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11);
+      uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8);
+      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5);
+      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11);
+      /* Store RGB pixel data to memory. */
+      vst1q_u16((uint16_t *)outptr, rgb565_l);
+      vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h);
+#endif
+
+      /* Increment pointers. */
+      inptr0 += 16;
+      inptr1 += 16;
+      inptr2 += 16;
+      outptr += (RGB_PIXELSIZE * 16);
+    }
+
+    if (cols_remaining >= 8) {
+      uint8x8_t y  = vld1_u8(inptr0);
+      uint8x8_t cb = vld1_u8(inptr1);
+      uint8x8_t cr = vld1_u8(inptr2);
+      /* Subtract 128 from Cb and Cr. */
+      int16x8_t cr_128 =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+      int16x8_t cb_128 =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+      int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+      int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+      g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+      g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+      int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                       vrshrn_n_s32(g_sub_y_h, 15));
+      /* Compute R-Y: 1.40200 * (Cr - 128) */
+      int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
+                                             consts, 2);
+      /* Compute B-Y: 1.77200 * (Cb - 128) */
+      int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
+                                             consts, 3);
+      /* Add Y. */
+      int16x8_t r =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
+      int16x8_t b =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
+      int16x8_t g =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
+
+#if RGB_PIXELSIZE == 4
+      uint8x8x4_t rgba;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgba.val[RGB_RED] = vqmovun_s16(r);
+      rgba.val[RGB_GREEN] = vqmovun_s16(g);
+      rgba.val[RGB_BLUE] = vqmovun_s16(b);
+      /* Set alpha channel to opaque (0xFF). */
+      rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+      /* Store RGBA pixel data to memory. */
+      vst4_u8(outptr, rgba);
+#elif RGB_PIXELSIZE == 3
+      uint8x8x3_t rgb;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgb.val[RGB_RED] = vqmovun_s16(r);
+      rgb.val[RGB_GREEN] = vqmovun_s16(g);
+      rgb.val[RGB_BLUE] = vqmovun_s16(b);
+      /* Store RGB pixel data to memory. */
+      vst3_u8(outptr, rgb);
+#else
+      /* Pack R, G, and B values in ratio 5:6:5. */
+      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+      /* Store RGB pixel data to memory. */
+      vst1q_u16((uint16_t *)outptr, rgb565);
+#endif
+
+      /* Increment pointers. */
+      inptr0 += 8;
+      inptr1 += 8;
+      inptr2 += 8;
+      outptr += (RGB_PIXELSIZE * 8);
+      cols_remaining -= 8;
+    }
+
+    /* Handle the tail elements. */
+    if (cols_remaining > 0) {
+      uint8x8_t y  = vld1_u8(inptr0);
+      uint8x8_t cb = vld1_u8(inptr1);
+      uint8x8_t cr = vld1_u8(inptr2);
+      /* Subtract 128 from Cb and Cr. */
+      int16x8_t cr_128 =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+      int16x8_t cb_128 =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+      int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+      int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+      g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+      g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+      int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                       vrshrn_n_s32(g_sub_y_h, 15));
+      /* Compute R-Y: 1.40200 * (Cr - 128) */
+      int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
+                                             consts, 2);
+      /* Compute B-Y: 1.77200 * (Cb - 128) */
+      int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
+                                             consts, 3);
+      /* Add Y. */
+      int16x8_t r =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
+      int16x8_t b =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
+      int16x8_t g =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
+
+#if RGB_PIXELSIZE == 4
+      uint8x8x4_t rgba;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgba.val[RGB_RED] = vqmovun_s16(r);
+      rgba.val[RGB_GREEN] = vqmovun_s16(g);
+      rgba.val[RGB_BLUE] = vqmovun_s16(b);
+      /* Set alpha channel to opaque (0xFF). */
+      rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+      /* Store RGBA pixel data to memory. */
+      switch (cols_remaining) {
+      case 7:
+        vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 6:
+        vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 5:
+        vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 4:
+        vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 3:
+        vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 2:
+        vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 1:
+        vst4_lane_u8(outptr, rgba, 0);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      default:
+        break;
+      }
+#elif RGB_PIXELSIZE == 3
+      uint8x8x3_t rgb;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgb.val[RGB_RED] = vqmovun_s16(r);
+      rgb.val[RGB_GREEN] = vqmovun_s16(g);
+      rgb.val[RGB_BLUE] = vqmovun_s16(b);
+      /* Store RGB pixel data to memory. */
+      switch (cols_remaining) {
+      case 7:
+        vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 6:
+        vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 5:
+        vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 4:
+        vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 3:
+        vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 2:
+        vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 1:
+        vst3_lane_u8(outptr, rgb, 0);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      default:
+        break;
+      }
+#else
+      /* Pack R, G, and B values in ratio 5:6:5. */
+      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+      /* Store RGB565 pixel data to memory. */
+      switch (cols_remaining) {
+      case 7:
+        vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 6:
+        vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 5:
+        vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 4:
+        vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 3:
+        vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 2:
+        vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 1:
+        vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      default:
+        break;
+      }
+#endif
+    }
+  }
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jdcolor-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jdcolor-neon.c
new file mode 100644
index 0000000000..ea4668f1d3
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/jdcolor-neon.c
@@ -0,0 +1,142 @@
+/*
+ * jdcolor-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "jconfigint.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* YCbCr -> RGB conversion constants */
+
+#define F_0_344  11277  /* 0.3441467 = 11277 * 2^-15 */
+#define F_0_714  23401  /* 0.7141418 = 23401 * 2^-15 */
+#define F_1_402  22971  /* 1.4020386 = 22971 * 2^-14 */
+#define F_1_772  29033  /* 1.7720337 = 29033 * 2^-14 */
+
+ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
+  -F_0_344, F_0_714, F_1_402, F_1_772
+};
+
+
+/* Include inline routines for colorspace extensions. */
+
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extrgb_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extrgbx_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extbgr_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extbgrx_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extxbgr_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extxrgb_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+/* YCbCr -> RGB565 Conversion */
+
+#define RGB_PIXELSIZE  2
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_rgb565_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jdmerge-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jdmerge-neon.c
new file mode 100644
index 0000000000..e4f91fdc0e
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/jdmerge-neon.c
@@ -0,0 +1,145 @@
+/*
+ * jdmerge-neon.c - merged upsampling/color conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "jconfigint.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* YCbCr -> RGB conversion constants */
+
+#define F_0_344  11277  /* 0.3441467 = 11277 * 2^-15 */
+#define F_0_714  23401  /* 0.7141418 = 23401 * 2^-15 */
+#define F_1_402  22971  /* 1.4020386 = 22971 * 2^-14 */
+#define F_1_772  29033  /* 1.7720337 = 29033 * 2^-14 */
+
+ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
+  -F_0_344, F_0_714, F_1_402, F_1_772
+};
+
+
+/* Include inline routines for colorspace extensions. */
+
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extrgb_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extrgb_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extrgbx_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extrgbx_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extbgr_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extbgr_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extbgrx_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extbgrx_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extxbgr_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extxbgr_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extxrgb_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extxrgb_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jdmrgext-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jdmrgext-neon.c
new file mode 100644
index 0000000000..5b89bdb339
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/jdmrgext-neon.c
@@ -0,0 +1,723 @@
+/*
+ * jdmrgext-neon.c - merged upsampling/color conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-neon.c. */
+
+
+/* These routines combine simple (non-fancy, i.e. non-smooth) h2v1 or h2v2
+ * chroma upsampling and YCbCr -> RGB color conversion into a single function.
+ *
+ * As with the standalone functions, YCbCr -> RGB conversion is defined by the
+ * following equations:
+ *    R = Y                        + 1.40200 * (Cr - 128)
+ *    G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
+ *    B = Y + 1.77200 * (Cb - 128)
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.3441467 = 11277 * 2^-15
+ *    0.7141418 = 23401 * 2^-15
+ *    1.4020386 = 22971 * 2^-14
+ *    1.7720337 = 29033 * 2^-14
+ * These constants are defined in jdmerge-neon.c.
+ *
+ * To ensure correct results, rounding is used when descaling.
+ */
+
+/* Notes on safe memory access for merged upsampling/YCbCr -> RGB conversion
+ * routines:
+ *
+ * Input memory buffers can be safely overread up to the next multiple of
+ * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
+ * jmemmgr.c.
+ *
+ * The output buffer cannot safely be written beyond output_width, since
+ * output_buf points to a possibly unpadded row in the decompressed image
+ * buffer allocated by the calling program.
+ */
+
+/* Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+ */
+
+void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
+                                     JSAMPIMAGE input_buf,
+                                     JDIMENSION in_row_group_ctr,
+                                     JSAMPARRAY output_buf)
+{
+  JSAMPROW outptr;
+  /* Pointers to Y, Cb, and Cr data */
+  JSAMPROW inptr0, inptr1, inptr2;
+
+  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+  const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+  inptr0 = input_buf[0][in_row_group_ctr];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr = output_buf[0];
+
+  int cols_remaining = output_width;
+  for (; cols_remaining >= 16; cols_remaining -= 16) {
+    /* De-interleave Y component values into two separate vectors, one
+     * containing the component values with even-numbered indices and one
+     * containing the component values with odd-numbered indices.
+     */
+    uint8x8x2_t y = vld2_u8(inptr0);
+    uint8x8_t cb = vld1_u8(inptr1);
+    uint8x8_t cr = vld1_u8(inptr2);
+    /* Subtract 128 from Cb and Cr. */
+    int16x8_t cr_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+    int16x8_t cb_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                     vrshrn_n_s32(g_sub_y_h, 15));
+    /* Compute R-Y: 1.40200 * (Cr - 128) */
+    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+    /* Compute B-Y: 1.77200 * (Cb - 128) */
+    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+    /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
+     * "odd" Y component values.  This effectively upsamples the chroma
+     * components horizontally.
+     */
+    int16x8_t g_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y.val[0]));
+    int16x8_t r_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y.val[0]));
+    int16x8_t b_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y.val[0]));
+    int16x8_t g_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y.val[1]));
+    int16x8_t r_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y.val[1]));
+    int16x8_t b_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y.val[1]));
+    /* Convert each component to unsigned and narrow, clamping to [0-255].
+     * Re-interleave the "even" and "odd" component values.
+     */
+    uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
+    uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
+    uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
+
+#ifdef RGB_ALPHA
+    uint8x16x4_t rgba;
+    rgba.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
+    rgba.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
+    rgba.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
+    /* Set alpha channel to opaque (0xFF). */
+    rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+    /* Store RGBA pixel data to memory. */
+    vst4q_u8(outptr, rgba);
+#else
+    uint8x16x3_t rgb;
+    rgb.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
+    rgb.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
+    rgb.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
+    /* Store RGB pixel data to memory. */
+    vst3q_u8(outptr, rgb);
+#endif
+
+    /* Increment pointers. */
+    inptr0 += 16;
+    inptr1 += 8;
+    inptr2 += 8;
+    outptr += (RGB_PIXELSIZE * 16);
+  }
+
+  if (cols_remaining > 0) {
+    /* De-interleave Y component values into two separate vectors, one
+     * containing the component values with even-numbered indices and one
+     * containing the component values with odd-numbered indices.
+     */
+    uint8x8x2_t y = vld2_u8(inptr0);
+    uint8x8_t cb = vld1_u8(inptr1);
+    uint8x8_t cr = vld1_u8(inptr2);
+    /* Subtract 128 from Cb and Cr. */
+    int16x8_t cr_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+    int16x8_t cb_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                     vrshrn_n_s32(g_sub_y_h, 15));
+    /* Compute R-Y: 1.40200 * (Cr - 128) */
+    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+    /* Compute B-Y: 1.77200 * (Cb - 128) */
+    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+    /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
+     * "odd" Y component values.  This effectively upsamples the chroma
+     * components horizontally.
+     */
+    int16x8_t g_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y.val[0]));
+    int16x8_t r_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y.val[0]));
+    int16x8_t b_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y.val[0]));
+    int16x8_t g_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y.val[1]));
+    int16x8_t r_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y.val[1]));
+    int16x8_t b_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y.val[1]));
+    /* Convert each component to unsigned and narrow, clamping to [0-255].
+     * Re-interleave the "even" and "odd" component values.
+     */
+    uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
+    uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
+    uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
+
+#ifdef RGB_ALPHA
+    uint8x8x4_t rgba_h;
+    rgba_h.val[RGB_RED] = r.val[1];
+    rgba_h.val[RGB_GREEN] = g.val[1];
+    rgba_h.val[RGB_BLUE] = b.val[1];
+    /* Set alpha channel to opaque (0xFF). */
+    rgba_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    uint8x8x4_t rgba_l;
+    rgba_l.val[RGB_RED] = r.val[0];
+    rgba_l.val[RGB_GREEN] = g.val[0];
+    rgba_l.val[RGB_BLUE] = b.val[0];
+    /* Set alpha channel to opaque (0xFF). */
+    rgba_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    /* Store RGBA pixel data to memory. */
+    switch (cols_remaining) {
+    case 15:
+      vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 14:
+      vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 13:
+      vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 12:
+      vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 11:
+      vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 10:
+      vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 9:
+      vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 8:
+      vst4_u8(outptr, rgba_l);
+      break;
+    case 7:
+      vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 6:
+      vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 5:
+      vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 4:
+      vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 3:
+      vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 2:
+      vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 1:
+      vst4_lane_u8(outptr, rgba_l, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+#else
+    uint8x8x3_t rgb_h;
+    rgb_h.val[RGB_RED] = r.val[1];
+    rgb_h.val[RGB_GREEN] = g.val[1];
+    rgb_h.val[RGB_BLUE] = b.val[1];
+    uint8x8x3_t rgb_l;
+    rgb_l.val[RGB_RED] = r.val[0];
+    rgb_l.val[RGB_GREEN] = g.val[0];
+    rgb_l.val[RGB_BLUE] = b.val[0];
+    /* Store RGB pixel data to memory. */
+    switch (cols_remaining) {
+    case 15:
+      vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 14:
+      vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 13:
+      vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 12:
+      vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 11:
+      vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 10:
+      vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 9:
+      vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 8:
+      vst3_u8(outptr, rgb_l);
+      break;
+    case 7:
+      vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 6:
+      vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 5:
+      vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 4:
+      vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 3:
+      vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 2:
+      vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 1:
+      vst3_lane_u8(outptr, rgb_l, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+#endif
+  }
+}
+
+
+/* Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+ *
+ * See comments above for details regarding color conversion and safe memory
+ * access.
+ */
+
+void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
+                                     JSAMPIMAGE input_buf,
+                                     JDIMENSION in_row_group_ctr,
+                                     JSAMPARRAY output_buf)
+{
+  JSAMPROW outptr0, outptr1;
+  /* Pointers to Y (both rows), Cb, and Cr data */
+  JSAMPROW inptr0_0, inptr0_1, inptr1, inptr2;
+
+  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+  const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+  inptr0_0 = input_buf[0][in_row_group_ctr * 2];
+  inptr0_1 = input_buf[0][in_row_group_ctr * 2 + 1];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr0 = output_buf[0];
+  outptr1 = output_buf[1];
+
+  int cols_remaining = output_width;
+  for (; cols_remaining >= 16; cols_remaining -= 16) {
+    /* For each row, de-interleave Y component values into two separate
+     * vectors, one containing the component values with even-numbered indices
+     * and one containing the component values with odd-numbered indices.
+     */
+    uint8x8x2_t y0 = vld2_u8(inptr0_0);
+    uint8x8x2_t y1 = vld2_u8(inptr0_1);
+    uint8x8_t cb = vld1_u8(inptr1);
+    uint8x8_t cr = vld1_u8(inptr2);
+    /* Subtract 128 from Cb and Cr. */
+    int16x8_t cr_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+    int16x8_t cb_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                     vrshrn_n_s32(g_sub_y_h, 15));
+    /* Compute R-Y: 1.40200 * (Cr - 128) */
+    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+    /* Compute B-Y: 1.77200 * (Cb - 128) */
+    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+    /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
+     * the "even" and "odd" Y component values.  This effectively upsamples the
+     * chroma components both horizontally and vertically.
+     */
+    int16x8_t g0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y0.val[0]));
+    int16x8_t r0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y0.val[0]));
+    int16x8_t b0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y0.val[0]));
+    int16x8_t g0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y0.val[1]));
+    int16x8_t r0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y0.val[1]));
+    int16x8_t b0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y0.val[1]));
+    int16x8_t g1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y1.val[0]));
+    int16x8_t r1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y1.val[0]));
+    int16x8_t b1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y1.val[0]));
+    int16x8_t g1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y1.val[1]));
+    int16x8_t r1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y1.val[1]));
+    int16x8_t b1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y1.val[1]));
+    /* Convert each component to unsigned and narrow, clamping to [0-255].
+     * Re-interleave the "even" and "odd" component values.
+     */
+    uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
+    uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
+    uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
+    uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
+    uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
+    uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
+
+#ifdef RGB_ALPHA
+    uint8x16x4_t rgba0, rgba1;
+    rgba0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
+    rgba1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
+    rgba0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
+    rgba1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
+    rgba0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
+    rgba1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
+    /* Set alpha channel to opaque (0xFF). */
+    rgba0.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+    rgba1.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+    /* Store RGBA pixel data to memory. */
+    vst4q_u8(outptr0, rgba0);
+    vst4q_u8(outptr1, rgba1);
+#else
+    uint8x16x3_t rgb0, rgb1;
+    rgb0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
+    rgb1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
+    rgb0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
+    rgb1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
+    rgb0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
+    rgb1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
+    /* Store RGB pixel data to memory. */
+    vst3q_u8(outptr0, rgb0);
+    vst3q_u8(outptr1, rgb1);
+#endif
+
+    /* Increment pointers. */
+    inptr0_0 += 16;
+    inptr0_1 += 16;
+    inptr1 += 8;
+    inptr2 += 8;
+    outptr0 += (RGB_PIXELSIZE * 16);
+    outptr1 += (RGB_PIXELSIZE * 16);
+  }
+
+  if (cols_remaining > 0) {
+    /* For each row, de-interleave Y component values into two separate
+     * vectors, one containing the component values with even-numbered indices
+     * and one containing the component values with odd-numbered indices.
+     */
+    uint8x8x2_t y0 = vld2_u8(inptr0_0);
+    uint8x8x2_t y1 = vld2_u8(inptr0_1);
+    uint8x8_t cb = vld1_u8(inptr1);
+    uint8x8_t cr = vld1_u8(inptr2);
+    /* Subtract 128 from Cb and Cr. */
+    int16x8_t cr_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+    int16x8_t cb_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                     vrshrn_n_s32(g_sub_y_h, 15));
+    /* Compute R-Y: 1.40200 * (Cr - 128) */
+    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+    /* Compute B-Y: 1.77200 * (Cb - 128) */
+    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+    /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
+     * the "even" and "odd" Y component values.  This effectively upsamples the
+     * chroma components both horizontally and vertically.
+     */
+    int16x8_t g0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y0.val[0]));
+    int16x8_t r0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y0.val[0]));
+    int16x8_t b0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y0.val[0]));
+    int16x8_t g0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y0.val[1]));
+    int16x8_t r0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y0.val[1]));
+    int16x8_t b0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y0.val[1]));
+    int16x8_t g1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y1.val[0]));
+    int16x8_t r1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y1.val[0]));
+    int16x8_t b1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y1.val[0]));
+    int16x8_t g1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y1.val[1]));
+    int16x8_t r1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y1.val[1]));
+    int16x8_t b1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y1.val[1]));
+    /* Convert each component to unsigned and narrow, clamping to [0-255].
+     * Re-interleave the "even" and "odd" component values.
+     */
+    uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
+    uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
+    uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
+    uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
+    uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
+    uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
+
+#ifdef RGB_ALPHA
+    uint8x8x4_t rgba0_h, rgba1_h;
+    rgba0_h.val[RGB_RED] = r0.val[1];
+    rgba1_h.val[RGB_RED] = r1.val[1];
+    rgba0_h.val[RGB_GREEN] = g0.val[1];
+    rgba1_h.val[RGB_GREEN] = g1.val[1];
+    rgba0_h.val[RGB_BLUE] = b0.val[1];
+    rgba1_h.val[RGB_BLUE] = b1.val[1];
+    /* Set alpha channel to opaque (0xFF). */
+    rgba0_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    rgba1_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+
+    uint8x8x4_t rgba0_l, rgba1_l;
+    rgba0_l.val[RGB_RED] = r0.val[0];
+    rgba1_l.val[RGB_RED] = r1.val[0];
+    rgba0_l.val[RGB_GREEN] = g0.val[0];
+    rgba1_l.val[RGB_GREEN] = g1.val[0];
+    rgba0_l.val[RGB_BLUE] = b0.val[0];
+    rgba1_l.val[RGB_BLUE] = b1.val[0];
+    /* Set alpha channel to opaque (0xFF). */
+    rgba0_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    rgba1_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    /* Store RGBA pixel data to memory. */
+    switch (cols_remaining) {
+    case 15:
+      vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6);
+      vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 14:
+      vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5);
+      vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 13:
+      vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4);
+      vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 12:
+      vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3);
+      vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 11:
+      vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2);
+      vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 10:
+      vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1);
+      vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 9:
+      vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0);
+      vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 8:
+      vst4_u8(outptr0, rgba0_l);
+      vst4_u8(outptr1, rgba1_l);
+      break;
+    case 7:
+      vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6);
+      vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 6:
+      vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5);
+      vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 5:
+      vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4);
+      vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 4:
+      vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3);
+      vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 3:
+      vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2);
+      vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 2:
+      vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1);
+      vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 1:
+      vst4_lane_u8(outptr0, rgba0_l, 0);
+      vst4_lane_u8(outptr1, rgba1_l, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+#else
+    uint8x8x3_t rgb0_h, rgb1_h;
+    rgb0_h.val[RGB_RED] = r0.val[1];
+    rgb1_h.val[RGB_RED] = r1.val[1];
+    rgb0_h.val[RGB_GREEN] = g0.val[1];
+    rgb1_h.val[RGB_GREEN] = g1.val[1];
+    rgb0_h.val[RGB_BLUE] = b0.val[1];
+    rgb1_h.val[RGB_BLUE] = b1.val[1];
+
+    uint8x8x3_t rgb0_l, rgb1_l;
+    rgb0_l.val[RGB_RED] = r0.val[0];
+    rgb1_l.val[RGB_RED] = r1.val[0];
+    rgb0_l.val[RGB_GREEN] = g0.val[0];
+    rgb1_l.val[RGB_GREEN] = g1.val[0];
+    rgb0_l.val[RGB_BLUE] = b0.val[0];
+    rgb1_l.val[RGB_BLUE] = b1.val[0];
+    /* Store RGB pixel data to memory. */
+    switch (cols_remaining) {
+    case 15:
+      vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6);
+      vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 14:
+      vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5);
+      vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 13:
+      vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4);
+      vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 12:
+      vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3);
+      vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 11:
+      vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2);
+      vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 10:
+      vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1);
+      vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 9:
+      vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0);
+      vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 8:
+      vst3_u8(outptr0, rgb0_l);
+      vst3_u8(outptr1, rgb1_l);
+      break;
+    case 7:
+      vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6);
+      vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 6:
+      vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5);
+      vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 5:
+      vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4);
+      vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 4:
+      vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3);
+      vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 3:
+      vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2);
+      vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 2:
+      vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1);
+      vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 1:
+      vst3_lane_u8(outptr0, rgb0_l, 0);
+      vst3_lane_u8(outptr1, rgb1_l, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+#endif
+  }
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jdsample-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jdsample-neon.c
new file mode 100644
index 0000000000..90ec6782c4
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/jdsample-neon.c
@@ -0,0 +1,569 @@
+/*
+ * jdsample-neon.c - upsampling (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <arm_neon.h>
+
+
+/* The diagram below shows a row of samples produced by h2v1 downsampling.
+ *
+ *                s0        s1        s2
+ *            +---------+---------+---------+
+ *            |         |         |         |
+ *            | p0   p1 | p2   p3 | p4   p5 |
+ *            |         |         |         |
+ *            +---------+---------+---------+
+ *
+ * Samples s0-s2 were created by averaging the original pixel component values
+ * centered at positions p0-p5 above.  To approximate those original pixel
+ * component values, we proportionally blend the adjacent samples in each row.
+ *
+ * An upsampled pixel component value is computed by blending the sample
+ * containing the pixel center with the nearest neighboring sample, in the
+ * ratio 3:1.  For example:
+ *     p1(upsampled) = 3/4 * s0 + 1/4 * s1
+ *     p2(upsampled) = 3/4 * s1 + 1/4 * s0
+ * When computing the first and last pixel component values in the row, there
+ * is no adjacent sample to blend, so:
+ *     p0(upsampled) = s0
+ *     p5(upsampled) = s2
+ */
+
+void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
+                                    JDIMENSION downsampled_width,
+                                    JSAMPARRAY input_data,
+                                    JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr;
+  int inrow;
+  unsigned colctr;
+  /* Set up constants. */
+  const uint16x8_t one_u16 = vdupq_n_u16(1);
+  const uint8x8_t three_u8 = vdup_n_u8(3);
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr = output_data[inrow];
+    /* First pixel component value in this row of the original image */
+    *outptr = (JSAMPLE)GETJSAMPLE(*inptr);
+
+    /*    3/4 * containing sample + 1/4 * nearest neighboring sample
+     * For p1: containing sample = s0, nearest neighboring sample = s1
+     * For p2: containing sample = s1, nearest neighboring sample = s0
+     */
+    uint8x16_t s0 = vld1q_u8(inptr);
+    uint8x16_t s1 = vld1q_u8(inptr + 1);
+    /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
+     * denote low half and high half respectively.
+     */
+    uint16x8_t s1_add_3s0_l =
+      vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
+    uint16x8_t s1_add_3s0_h =
+      vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
+    uint16x8_t s0_add_3s1_l =
+      vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
+    uint16x8_t s0_add_3s1_h =
+      vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
+    /* Add ordered dithering bias to odd pixel values. */
+    s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
+    s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
+
+    /* The offset is initially 1, because the first pixel component has already
+     * been stored.  However, in subsequent iterations of the SIMD loop, this
+     * offset is (2 * colctr - 1) to stay within the bounds of the sample
+     * buffers without having to resort to a slow scalar tail case for the last
+     * (downsampled_width % 16) samples.  See "Creation of 2-D sample arrays"
+     * in jmemmgr.c for more details.
+     */
+    unsigned outptr_offset = 1;
+    uint8x16x2_t output_pixels;
+
+    /* We use software pipelining to maximise performance.  The code indented
+     * an extra two spaces begins the next iteration of the loop.
+     */
+    for (colctr = 16; colctr < downsampled_width; colctr += 16) {
+
+        s0 = vld1q_u8(inptr + colctr - 1);
+        s1 = vld1q_u8(inptr + colctr);
+
+      /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+      output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
+                                         vrshrn_n_u16(s1_add_3s0_h, 2));
+      output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
+                                         vshrn_n_u16(s0_add_3s1_h, 2));
+
+        /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
+         * denote low half and high half respectively.
+         */
+        s1_add_3s0_l =
+          vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
+        s1_add_3s0_h =
+          vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
+        s0_add_3s1_l =
+          vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
+        s0_add_3s1_h =
+          vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
+        /* Add ordered dithering bias to odd pixel values. */
+        s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
+        s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
+
+      /* Store pixel component values to memory. */
+      vst2q_u8(outptr + outptr_offset, output_pixels);
+      outptr_offset = 2 * colctr - 1;
+    }
+
+    /* Complete the last iteration of the loop. */
+
+    /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+    output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
+                                       vrshrn_n_u16(s1_add_3s0_h, 2));
+    output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
+                                       vshrn_n_u16(s0_add_3s1_h, 2));
+    /* Store pixel component values to memory. */
+    vst2q_u8(outptr + outptr_offset, output_pixels);
+
+    /* Last pixel component value in this row of the original image */
+    outptr[2 * downsampled_width - 1] =
+      GETJSAMPLE(inptr[downsampled_width - 1]);
+  }
+}
+
+
+/* The diagram below shows an array of samples produced by h2v2 downsampling.
+ *
+ *                s0        s1        s2
+ *            +---------+---------+---------+
+ *            | p0   p1 | p2   p3 | p4   p5 |
+ *       sA   |         |         |         |
+ *            | p6   p7 | p8   p9 | p10  p11|
+ *            +---------+---------+---------+
+ *            | p12  p13| p14  p15| p16  p17|
+ *       sB   |         |         |         |
+ *            | p18  p19| p20  p21| p22  p23|
+ *            +---------+---------+---------+
+ *            | p24  p25| p26  p27| p28  p29|
+ *       sC   |         |         |         |
+ *            | p30  p31| p32  p33| p34  p35|
+ *            +---------+---------+---------+
+ *
+ * Samples s0A-s2C were created by averaging the original pixel component
+ * values centered at positions p0-p35 above.  To approximate one of those
+ * original pixel component values, we proportionally blend the sample
+ * containing the pixel center with the nearest neighboring samples in each
+ * row, column, and diagonal.
+ *
+ * An upsampled pixel component value is computed by first blending the sample
+ * containing the pixel center with the nearest neighboring samples in the
+ * same column, in the ratio 3:1, and then blending each column sum with the
+ * nearest neighboring column sum, in the ratio 3:1.  For example:
+ *     p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) +
+ *                      1/4 * (3/4 * s0B + 1/4 * s0A)
+ *                    = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A
+ * When computing the first and last pixel component values in the row, there
+ * is no horizontally adjacent sample to blend, so:
+ *     p12(upsampled) = 3/4 * s0B + 1/4 * s0A
+ *     p23(upsampled) = 3/4 * s2B + 1/4 * s2C
+ * When computing the first and last pixel component values in the column,
+ * there is no vertically adjacent sample to blend, so:
+ *     p2(upsampled) = 3/4 * s1A + 1/4 * s0A
+ *     p33(upsampled) = 3/4 * s1C + 1/4 * s2C
+ * When computing the corner pixel component values, there is no adjacent
+ * sample to blend, so:
+ *     p0(upsampled) = s0A
+ *     p35(upsampled) = s2C
+ */
+
+void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
+                                    JDIMENSION downsampled_width,
+                                    JSAMPARRAY input_data,
+                                    JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+  int inrow, outrow;
+  unsigned colctr;
+  /* Set up constants. */
+  const uint16x8_t seven_u16 = vdupq_n_u16(7);
+  const uint8x8_t three_u8 = vdup_n_u8(3);
+  const uint16x8_t three_u16 = vdupq_n_u16(3);
+
+  inrow = outrow = 0;
+  while (outrow < max_v_samp_factor) {
+    inptr0 = input_data[inrow - 1];
+    inptr1 = input_data[inrow];
+    inptr2 = input_data[inrow + 1];
+    /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
+     * respectively.
+     */
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    /* First pixel component value in this row of the original image */
+    int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
+    *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
+    int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
+    *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);
+
+    /* Step 1: Blend samples vertically in columns s0 and s1.
+     * Leave the divide by 4 until the end, when it can be done for both
+     * dimensions at once, right-shifting by 4.
+     */
+
+    /* Load and compute s0colsum0 and s0colsum1. */
+    uint8x16_t s0A = vld1q_u8(inptr0);
+    uint8x16_t s0B = vld1q_u8(inptr1);
+    uint8x16_t s0C = vld1q_u8(inptr2);
+    /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
+     * denote low half and high half respectively.
+     */
+    uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)),
+                                      vget_low_u8(s0B), three_u8);
+    uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)),
+                                      vget_high_u8(s0B), three_u8);
+    uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)),
+                                      vget_low_u8(s0B), three_u8);
+    uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)),
+                                      vget_high_u8(s0B), three_u8);
+    /* Load and compute s1colsum0 and s1colsum1. */
+    uint8x16_t s1A = vld1q_u8(inptr0 + 1);
+    uint8x16_t s1B = vld1q_u8(inptr1 + 1);
+    uint8x16_t s1C = vld1q_u8(inptr2 + 1);
+    uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)),
+                                      vget_low_u8(s1B), three_u8);
+    uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)),
+                                      vget_high_u8(s1B), three_u8);
+    uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)),
+                                      vget_low_u8(s1B), three_u8);
+    uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)),
+                                      vget_high_u8(s1B), three_u8);
+
+    /* Step 2: Blend the already-blended columns. */
+
+    uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+    uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+    uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+    uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+    uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+    uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+    uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+    uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+    /* Add ordered dithering bias to odd pixel values. */
+    output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+    output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+    output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+    output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+    /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
+    uint8x16x2_t output_pixels0 = { {
+      vcombine_u8(vshrn_n_u16(output0_p1_l, 4), vshrn_n_u16(output0_p1_h, 4)),
+      vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), vrshrn_n_u16(output0_p2_h, 4))
+    } };
+    uint8x16x2_t output_pixels1 = { {
+      vcombine_u8(vshrn_n_u16(output1_p1_l, 4), vshrn_n_u16(output1_p1_h, 4)),
+      vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), vrshrn_n_u16(output1_p2_h, 4))
+    } };
+
+    /* Store pixel component values to memory.
+     * The minimum size of the output buffer for each row is 64 bytes => no
+     * need to worry about buffer overflow here.  See "Creation of 2-D sample
+     * arrays" in jmemmgr.c for more details.
+     */
+    vst2q_u8(outptr0 + 1, output_pixels0);
+    vst2q_u8(outptr1 + 1, output_pixels1);
+
+    /* The first pixel of the image shifted our loads and stores by one byte.
+     * We have to re-align on a 32-byte boundary at some point before the end
+     * of the row (we do it now on the 32/33 pixel boundary) to stay within the
+     * bounds of the sample buffers without having to resort to a slow scalar
+     * tail case for the last (downsampled_width % 16) samples.  See "Creation
+     * of 2-D sample arrays" in jmemmgr.c for more details.
+     */
+    for (colctr = 16; colctr < downsampled_width; colctr += 16) {
+      /* Step 1: Blend samples vertically in columns s0 and s1. */
+
+      /* Load and compute s0colsum0 and s0colsum1. */
+      s0A = vld1q_u8(inptr0 + colctr - 1);
+      s0B = vld1q_u8(inptr1 + colctr - 1);
+      s0C = vld1q_u8(inptr2 + colctr - 1);
+      s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)), vget_low_u8(s0B),
+                             three_u8);
+      s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)), vget_high_u8(s0B),
+                             three_u8);
+      s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)), vget_low_u8(s0B),
+                             three_u8);
+      s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)), vget_high_u8(s0B),
+                             three_u8);
+      /* Load and compute s1colsum0 and s1colsum1. */
+      s1A = vld1q_u8(inptr0 + colctr);
+      s1B = vld1q_u8(inptr1 + colctr);
+      s1C = vld1q_u8(inptr2 + colctr);
+      s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)), vget_low_u8(s1B),
+                             three_u8);
+      s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)), vget_high_u8(s1B),
+                             three_u8);
+      s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)), vget_low_u8(s1B),
+                             three_u8);
+      s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)), vget_high_u8(s1B),
+                             three_u8);
+
+      /* Step 2: Blend the already-blended columns. */
+
+      output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+      output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+      output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+      output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+      output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+      output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+      output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+      output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+      /* Add ordered dithering bias to odd pixel values. */
+      output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+      output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+      output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+      output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+      /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
+      output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
+                                          vshrn_n_u16(output0_p1_h, 4));
+      output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
+                                          vrshrn_n_u16(output0_p2_h, 4));
+      output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
+                                          vshrn_n_u16(output1_p1_h, 4));
+      output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
+                                          vrshrn_n_u16(output1_p2_h, 4));
+      /* Store pixel component values to memory. */
+      vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
+      vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
+    }
+
+    /* Last pixel component value in this row of the original image */
+    int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+                    GETJSAMPLE(inptr0[downsampled_width - 1]);
+    outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
+    int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+                    GETJSAMPLE(inptr2[downsampled_width - 1]);
+    outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
+    inrow++;
+  }
+}
+
+
+/* The diagram below shows a column of samples produced by h1v2 downsampling
+ * (or by losslessly rotating or transposing an h2v1-downsampled image.)
+ *
+ *            +---------+
+ *            |   p0    |
+ *     sA     |         |
+ *            |   p1    |
+ *            +---------+
+ *            |   p2    |
+ *     sB     |         |
+ *            |   p3    |
+ *            +---------+
+ *            |   p4    |
+ *     sC     |         |
+ *            |   p5    |
+ *            +---------+
+ *
+ * Samples sA-sC were created by averaging the original pixel component values
+ * centered at positions p0-p5 above.  To approximate those original pixel
+ * component values, we proportionally blend the adjacent samples in each
+ * column.
+ *
+ * An upsampled pixel component value is computed by blending the sample
+ * containing the pixel center with the nearest neighboring sample, in the
+ * ratio 3:1.  For example:
+ *     p1(upsampled) = 3/4 * sA + 1/4 * sB
+ *     p2(upsampled) = 3/4 * sB + 1/4 * sA
+ * When computing the first and last pixel component values in the column,
+ * there is no adjacent sample to blend, so:
+ *     p0(upsampled) = sA
+ *     p5(upsampled) = sC
+ */
+
+void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
+                                    JDIMENSION downsampled_width,
+                                    JSAMPARRAY input_data,
+                                    JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+  int inrow, outrow;
+  unsigned colctr;
+  /* Set up constants. */
+  const uint16x8_t one_u16 = vdupq_n_u16(1);
+  const uint8x8_t three_u8 = vdup_n_u8(3);
+
+  inrow = outrow = 0;
+  while (outrow < max_v_samp_factor) {
+    inptr0 = input_data[inrow - 1];
+    inptr1 = input_data[inrow];
+    inptr2 = input_data[inrow + 1];
+    /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
+     * respectively.
+     */
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+    inrow++;
+
+    /* The size of the input and output buffers is always a multiple of 32
+     * bytes => no need to worry about buffer overflow when reading/writing
+     * memory.  See "Creation of 2-D sample arrays" in jmemmgr.c for more
+     * details.
+     */
+    for (colctr = 0; colctr < downsampled_width; colctr += 16) {
+      /* Load samples. */
+      uint8x16_t sA = vld1q_u8(inptr0 + colctr);
+      uint8x16_t sB = vld1q_u8(inptr1 + colctr);
+      uint8x16_t sC = vld1q_u8(inptr2 + colctr);
+      /* Blend samples vertically. */
+      uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)),
+                                      vget_low_u8(sB), three_u8);
+      uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)),
+                                      vget_high_u8(sB), three_u8);
+      uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)),
+                                      vget_low_u8(sB), three_u8);
+      uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)),
+                                      vget_high_u8(sB), three_u8);
+      /* Add ordered dithering bias to pixel values in even output rows. */
+      colsum0_l = vaddq_u16(colsum0_l, one_u16);
+      colsum0_h = vaddq_u16(colsum0_h, one_u16);
+      /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+      uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
+                                              vshrn_n_u16(colsum0_h, 2));
+      uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
+                                              vrshrn_n_u16(colsum1_h, 2));
+      /* Store pixel component values to memory. */
+      vst1q_u8(outptr0 + colctr, output_pixels0);
+      vst1q_u8(outptr1 + colctr, output_pixels1);
+    }
+  }
+}
+
+
+/* The diagram below shows a row of samples produced by h2v1 downsampling.
+ *
+ *                s0        s1
+ *            +---------+---------+
+ *            |         |         |
+ *            | p0   p1 | p2   p3 |
+ *            |         |         |
+ *            +---------+---------+
+ *
+ * Samples s0 and s1 were created by averaging the original pixel component
+ * values centered at positions p0-p3 above.  To approximate those original
+ * pixel component values, we duplicate the samples horizontally:
+ *     p0(upsampled) = p1(upsampled) = s0
+ *     p2(upsampled) = p3(upsampled) = s1
+ */
+
+void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
+                              JSAMPARRAY input_data,
+                              JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr;
+  int inrow;
+  unsigned colctr;
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr = output_data[inrow];
+    for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
+      uint8x16_t samples = vld1q_u8(inptr + colctr);
+      /* Duplicate the samples.  The store operation below interleaves them so
+       * that adjacent pixel component values take on the same sample value,
+       * per above.
+       */
+      uint8x16x2_t output_pixels = { { samples, samples } };
+      /* Store pixel component values to memory.
+       * Due to the way sample buffers are allocated, we don't need to worry
+       * about tail cases when output_width is not a multiple of 32.  See
+       * "Creation of 2-D sample arrays" in jmemmgr.c for details.
+       */
+      vst2q_u8(outptr + 2 * colctr, output_pixels);
+    }
+  }
+}
+
+
+/* The diagram below shows an array of samples produced by h2v2 downsampling.
+ *
+ *                s0        s1
+ *            +---------+---------+
+ *            | p0   p1 | p2   p3 |
+ *       sA   |         |         |
+ *            | p4   p5 | p6   p7 |
+ *            +---------+---------+
+ *            | p8   p9 | p10  p11|
+ *       sB   |         |         |
+ *            | p12  p13| p14  p15|
+ *            +---------+---------+
+ *
+ * Samples s0A-s1B were created by averaging the original pixel component
+ * values centered at positions p0-p15 above.  To approximate those original
+ * pixel component values, we duplicate the samples both horizontally and
+ * vertically:
+ *     p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
+ *     p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
+ *     p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
+ *     p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
+ */
+
+void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
+                              JSAMPARRAY input_data,
+                              JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr0, outptr1;
+  int inrow, outrow;
+  unsigned colctr;
+
+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
+      uint8x16_t samples = vld1q_u8(inptr + colctr);
+      /* Duplicate the samples.  The store operation below interleaves them so
+       * that adjacent pixel component values take on the same sample value,
+       * per above.
+       */
+      uint8x16x2_t output_pixels = { { samples, samples } };
+      /* Store pixel component values for both output rows to memory.
+       * Due to the way sample buffers are allocated, we don't need to worry
+       * about tail cases when output_width is not a multiple of 32.  See
+       * "Creation of 2-D sample arrays" in jmemmgr.c for details.
+       */
+      vst2q_u8(outptr0 + 2 * colctr, output_pixels);
+      vst2q_u8(outptr1 + 2 * colctr, output_pixels);
+    }
+  }
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jfdctfst-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jfdctfst-neon.c
new file mode 100644
index 0000000000..bb371be399
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/jfdctfst-neon.c
@@ -0,0 +1,214 @@
+/*
+ * jfdctfst-neon.c - fast integer FDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_fdct_ifast_neon() performs a fast, not so accurate forward DCT
+ * (Discrete Cosine Transform) on one block of samples.  It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * jpeg_fdct_ifast() function, which can be found in jfdctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.382683433 = 12544 * 2^-15
+ *    0.541196100 = 17795 * 2^-15
+ *    0.707106781 = 23168 * 2^-15
+ *    0.306562965 =  9984 * 2^-15
+ *
+ * See jfdctfst.c for further details of the DCT algorithm.  Where possible,
+ * the variable names and comments here in jsimd_fdct_ifast_neon() match up
+ * with those in jpeg_fdct_ifast().
+ */
+
+#define F_0_382  12544
+#define F_0_541  17792
+#define F_0_707  23168
+#define F_0_306  9984
+
+
+ALIGN(16) static const int16_t jsimd_fdct_ifast_neon_consts[] = {
+  F_0_382, F_0_541, F_0_707, F_0_306
+};
+
+void jsimd_fdct_ifast_neon(DCTELEM *data)
+{
+  /* Load an 8x8 block of samples into Neon registers.  De-interleaving loads
+   * are used, followed by vuzp to transpose the block such that we have a
+   * column of samples per vector - allowing all rows to be processed at once.
+   */
+  int16x8x4_t data1 = vld4q_s16(data);
+  int16x8x4_t data2 = vld4q_s16(data + 4 * DCTSIZE);
+
+  int16x8x2_t cols_04 = vuzpq_s16(data1.val[0], data2.val[0]);
+  int16x8x2_t cols_15 = vuzpq_s16(data1.val[1], data2.val[1]);
+  int16x8x2_t cols_26 = vuzpq_s16(data1.val[2], data2.val[2]);
+  int16x8x2_t cols_37 = vuzpq_s16(data1.val[3], data2.val[3]);
+
+  int16x8_t col0 = cols_04.val[0];
+  int16x8_t col1 = cols_15.val[0];
+  int16x8_t col2 = cols_26.val[0];
+  int16x8_t col3 = cols_37.val[0];
+  int16x8_t col4 = cols_04.val[1];
+  int16x8_t col5 = cols_15.val[1];
+  int16x8_t col6 = cols_26.val[1];
+  int16x8_t col7 = cols_37.val[1];
+
+  /* Pass 1: process rows. */
+
+  /* Load DCT conversion constants. */
+  const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts);
+
+  int16x8_t tmp0 = vaddq_s16(col0, col7);
+  int16x8_t tmp7 = vsubq_s16(col0, col7);
+  int16x8_t tmp1 = vaddq_s16(col1, col6);
+  int16x8_t tmp6 = vsubq_s16(col1, col6);
+  int16x8_t tmp2 = vaddq_s16(col2, col5);
+  int16x8_t tmp5 = vsubq_s16(col2, col5);
+  int16x8_t tmp3 = vaddq_s16(col3, col4);
+  int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+  /* Even part */
+  int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);    /* phase 2 */
+  int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+  int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+  int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+  col0 = vaddq_s16(tmp10, tmp11);             /* phase 3 */
+  col4 = vsubq_s16(tmp10, tmp11);
+
+  int16x8_t z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+  col2 = vaddq_s16(tmp13, z1);                /* phase 5 */
+  col6 = vsubq_s16(tmp13, z1);
+
+  /* Odd part */
+  tmp10 = vaddq_s16(tmp4, tmp5);              /* phase 2 */
+  tmp11 = vaddq_s16(tmp5, tmp6);
+  tmp12 = vaddq_s16(tmp6, tmp7);
+
+  int16x8_t z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+  int16x8_t z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+  z2 = vaddq_s16(z2, z5);
+  int16x8_t z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+  z5 = vaddq_s16(tmp12, z5);
+  z4 = vaddq_s16(z4, z5);
+  int16x8_t z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+  int16x8_t z11 = vaddq_s16(tmp7, z3);        /* phase 5 */
+  int16x8_t z13 = vsubq_s16(tmp7, z3);
+
+  col5 = vaddq_s16(z13, z2);                  /* phase 6 */
+  col3 = vsubq_s16(z13, z2);
+  col1 = vaddq_s16(z11, z4);
+  col7 = vsubq_s16(z11, z4);
+
+  /* Transpose to work on columns in pass 2. */
+  int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+  int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+  int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+  int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+  int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+                                      vreinterpretq_s32_s16(cols_45.val[0]));
+  int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+                                      vreinterpretq_s32_s16(cols_45.val[1]));
+  int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+                                      vreinterpretq_s32_s16(cols_67.val[0]));
+  int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+                                      vreinterpretq_s32_s16(cols_67.val[1]));
+
+  int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+  int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+  int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+  int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+  int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+  int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+  int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+  int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+  int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+  int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+  int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+  int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+  /* Pass 2: process columns. */
+
+  tmp0 = vaddq_s16(row0, row7);
+  tmp7 = vsubq_s16(row0, row7);
+  tmp1 = vaddq_s16(row1, row6);
+  tmp6 = vsubq_s16(row1, row6);
+  tmp2 = vaddq_s16(row2, row5);
+  tmp5 = vsubq_s16(row2, row5);
+  tmp3 = vaddq_s16(row3, row4);
+  tmp4 = vsubq_s16(row3, row4);
+
+  /* Even part */
+  tmp10 = vaddq_s16(tmp0, tmp3);              /* phase 2 */
+  tmp13 = vsubq_s16(tmp0, tmp3);
+  tmp11 = vaddq_s16(tmp1, tmp2);
+  tmp12 = vsubq_s16(tmp1, tmp2);
+
+  row0 = vaddq_s16(tmp10, tmp11);             /* phase 3 */
+  row4 = vsubq_s16(tmp10, tmp11);
+
+  z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+  row2 = vaddq_s16(tmp13, z1);                /* phase 5 */
+  row6 = vsubq_s16(tmp13, z1);
+
+  /* Odd part */
+  tmp10 = vaddq_s16(tmp4, tmp5);              /* phase 2 */
+  tmp11 = vaddq_s16(tmp5, tmp6);
+  tmp12 = vaddq_s16(tmp6, tmp7);
+
+  z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+  z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+  z2 = vaddq_s16(z2, z5);
+  z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+  z5 = vaddq_s16(tmp12, z5);
+  z4 = vaddq_s16(z4, z5);
+  z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+  z11 = vaddq_s16(tmp7, z3);                  /* phase 5 */
+  z13 = vsubq_s16(tmp7, z3);
+
+  row5 = vaddq_s16(z13, z2);                  /* phase 6 */
+  row3 = vsubq_s16(z13, z2);
+  row1 = vaddq_s16(z11, z4);
+  row7 = vsubq_s16(z11, z4);
+
+  vst1q_s16(data + 0 * DCTSIZE, row0);
+  vst1q_s16(data + 1 * DCTSIZE, row1);
+  vst1q_s16(data + 2 * DCTSIZE, row2);
+  vst1q_s16(data + 3 * DCTSIZE, row3);
+  vst1q_s16(data + 4 * DCTSIZE, row4);
+  vst1q_s16(data + 5 * DCTSIZE, row5);
+  vst1q_s16(data + 6 * DCTSIZE, row6);
+  vst1q_s16(data + 7 * DCTSIZE, row7);
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jfdctint-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jfdctint-neon.c
new file mode 100644
index 0000000000..ccfc07b15d
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/jfdctint-neon.c
@@ -0,0 +1,376 @@
+/*
+ * jfdctint-neon.c - accurate integer FDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_fdct_islow_neon() performs a slower but more accurate forward DCT
+ * (Discrete Cosine Transform) on one block of samples.  It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * jpeg_fdct_islow() function, which can be found in jfdctint.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.298631336 =  2446 * 2^-13
+ *    0.390180644 =  3196 * 2^-13
+ *    0.541196100 =  4433 * 2^-13
+ *    0.765366865 =  6270 * 2^-13
+ *    0.899976223 =  7373 * 2^-13
+ *    1.175875602 =  9633 * 2^-13
+ *    1.501321110 = 12299 * 2^-13
+ *    1.847759065 = 15137 * 2^-13
+ *    1.961570560 = 16069 * 2^-13
+ *    2.053119869 = 16819 * 2^-13
+ *    2.562915447 = 20995 * 2^-13
+ *    3.072711026 = 25172 * 2^-13
+ *
+ * See jfdctint.c for further details of the DCT algorithm.  Where possible,
+ * the variable names and comments here in jsimd_fdct_islow_neon() match up
+ * with those in jpeg_fdct_islow().
+ */
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+#define F_0_298  2446
+#define F_0_390  3196
+#define F_0_541  4433
+#define F_0_765  6270
+#define F_0_899  7373
+#define F_1_175  9633
+#define F_1_501  12299
+#define F_1_847  15137
+#define F_1_961  16069
+#define F_2_053  16819
+#define F_2_562  20995
+#define F_3_072  25172
+
+
+ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = {
+  F_0_298, -F_0_390,  F_0_541,  F_0_765,
+ -F_0_899,  F_1_175,  F_1_501, -F_1_847,
+ -F_1_961,  F_2_053, -F_2_562,  F_3_072
+};
+
+void jsimd_fdct_islow_neon(DCTELEM *data)
+{
+  /* Load DCT constants. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts);
+#else
+  /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
+  const int16x4_t consts1 = vld1_s16(jsimd_fdct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_fdct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_fdct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Load an 8x8 block of samples into Neon registers.  De-interleaving loads
+   * are used, followed by vuzp to transpose the block such that we have a
+   * column of samples per vector - allowing all rows to be processed at once.
+   */
+  int16x8x4_t s_rows_0123 = vld4q_s16(data);
+  int16x8x4_t s_rows_4567 = vld4q_s16(data + 4 * DCTSIZE);
+
+  int16x8x2_t cols_04 = vuzpq_s16(s_rows_0123.val[0], s_rows_4567.val[0]);
+  int16x8x2_t cols_15 = vuzpq_s16(s_rows_0123.val[1], s_rows_4567.val[1]);
+  int16x8x2_t cols_26 = vuzpq_s16(s_rows_0123.val[2], s_rows_4567.val[2]);
+  int16x8x2_t cols_37 = vuzpq_s16(s_rows_0123.val[3], s_rows_4567.val[3]);
+
+  int16x8_t col0 = cols_04.val[0];
+  int16x8_t col1 = cols_15.val[0];
+  int16x8_t col2 = cols_26.val[0];
+  int16x8_t col3 = cols_37.val[0];
+  int16x8_t col4 = cols_04.val[1];
+  int16x8_t col5 = cols_15.val[1];
+  int16x8_t col6 = cols_26.val[1];
+  int16x8_t col7 = cols_37.val[1];
+
+  /* Pass 1: process rows. */
+
+  int16x8_t tmp0 = vaddq_s16(col0, col7);
+  int16x8_t tmp7 = vsubq_s16(col0, col7);
+  int16x8_t tmp1 = vaddq_s16(col1, col6);
+  int16x8_t tmp6 = vsubq_s16(col1, col6);
+  int16x8_t tmp2 = vaddq_s16(col2, col5);
+  int16x8_t tmp5 = vsubq_s16(col2, col5);
+  int16x8_t tmp3 = vaddq_s16(col3, col4);
+  int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+  /* Even part */
+  int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);
+  int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+  int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+  int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+  col0 = vshlq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
+  col4 = vshlq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
+
+  int16x8_t tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
+  int32x4_t z1_l =
+    vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
+  int32x4_t z1_h =
+    vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
+
+  int32x4_t col2_scaled_l =
+    vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
+  int32x4_t col2_scaled_h =
+    vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
+  col2 = vcombine_s16(vrshrn_n_s32(col2_scaled_l, DESCALE_P1),
+                      vrshrn_n_s32(col2_scaled_h, DESCALE_P1));
+
+  int32x4_t col6_scaled_l =
+    vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
+  int32x4_t col6_scaled_h =
+    vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
+  col6 = vcombine_s16(vrshrn_n_s32(col6_scaled_l, DESCALE_P1),
+                      vrshrn_n_s32(col6_scaled_h, DESCALE_P1));
+
+  /* Odd part */
+  int16x8_t z1 = vaddq_s16(tmp4, tmp7);
+  int16x8_t z2 = vaddq_s16(tmp5, tmp6);
+  int16x8_t z3 = vaddq_s16(tmp4, tmp6);
+  int16x8_t z4 = vaddq_s16(tmp5, tmp7);
+  /* sqrt(2) * c3 */
+  int32x4_t z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
+  int32x4_t z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
+  z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
+  z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
+
+  /* sqrt(2) * (-c1+c3+c5-c7) */
+  int32x4_t tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
+  int32x4_t tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
+  /* sqrt(2) * ( c1+c3-c5+c7) */
+  int32x4_t tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
+  int32x4_t tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
+  /* sqrt(2) * ( c1+c3+c5-c7) */
+  int32x4_t tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
+  int32x4_t tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
+  /* sqrt(2) * ( c1+c3-c5-c7) */
+  int32x4_t tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
+  int32x4_t tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
+
+  /* sqrt(2) * (c7-c3) */
+  z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
+  z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
+  /* sqrt(2) * (-c1-c3) */
+  int32x4_t z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
+  int32x4_t z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
+  /* sqrt(2) * (-c3-c5) */
+  int32x4_t z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
+  int32x4_t z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
+  /* sqrt(2) * (c5-c3) */
+  int32x4_t z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
+  int32x4_t z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
+
+  z3_l = vaddq_s32(z3_l, z5_l);
+  z3_h = vaddq_s32(z3_h, z5_h);
+  z4_l = vaddq_s32(z4_l, z5_l);
+  z4_h = vaddq_s32(z4_h, z5_h);
+
+  tmp4_l = vaddq_s32(tmp4_l, z1_l);
+  tmp4_h = vaddq_s32(tmp4_h, z1_h);
+  tmp4_l = vaddq_s32(tmp4_l, z3_l);
+  tmp4_h = vaddq_s32(tmp4_h, z3_h);
+  col7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp4_h, DESCALE_P1));
+
+  tmp5_l = vaddq_s32(tmp5_l, z2_l);
+  tmp5_h = vaddq_s32(tmp5_h, z2_h);
+  tmp5_l = vaddq_s32(tmp5_l, z4_l);
+  tmp5_h = vaddq_s32(tmp5_h, z4_h);
+  col5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp5_h, DESCALE_P1));
+
+  tmp6_l = vaddq_s32(tmp6_l, z2_l);
+  tmp6_h = vaddq_s32(tmp6_h, z2_h);
+  tmp6_l = vaddq_s32(tmp6_l, z3_l);
+  tmp6_h = vaddq_s32(tmp6_h, z3_h);
+  col3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp6_h, DESCALE_P1));
+
+  tmp7_l = vaddq_s32(tmp7_l, z1_l);
+  tmp7_h = vaddq_s32(tmp7_h, z1_h);
+  tmp7_l = vaddq_s32(tmp7_l, z4_l);
+  tmp7_h = vaddq_s32(tmp7_h, z4_h);
+  col1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp7_h, DESCALE_P1));
+
+  /* Transpose to work on columns in pass 2. */
+  int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+  int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+  int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+  int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+  int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+                                      vreinterpretq_s32_s16(cols_45.val[0]));
+  int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+                                      vreinterpretq_s32_s16(cols_45.val[1]));
+  int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+                                      vreinterpretq_s32_s16(cols_67.val[0]));
+  int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+                                      vreinterpretq_s32_s16(cols_67.val[1]));
+
+  int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+  int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+  int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+  int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+  int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+  int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+  int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+  int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+  int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+  int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+  int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+  int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+  /* Pass 2: process columns. */
+
+  tmp0 = vaddq_s16(row0, row7);
+  tmp7 = vsubq_s16(row0, row7);
+  tmp1 = vaddq_s16(row1, row6);
+  tmp6 = vsubq_s16(row1, row6);
+  tmp2 = vaddq_s16(row2, row5);
+  tmp5 = vsubq_s16(row2, row5);
+  tmp3 = vaddq_s16(row3, row4);
+  tmp4 = vsubq_s16(row3, row4);
+
+  /* Even part */
+  tmp10 = vaddq_s16(tmp0, tmp3);
+  tmp13 = vsubq_s16(tmp0, tmp3);
+  tmp11 = vaddq_s16(tmp1, tmp2);
+  tmp12 = vsubq_s16(tmp1, tmp2);
+
+  row0 = vrshrq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
+  row4 = vrshrq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
+
+  tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
+  z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
+  z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
+
+  int32x4_t row2_scaled_l =
+    vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
+  int32x4_t row2_scaled_h =
+    vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
+  row2 = vcombine_s16(vrshrn_n_s32(row2_scaled_l, DESCALE_P2),
+                      vrshrn_n_s32(row2_scaled_h, DESCALE_P2));
+
+  int32x4_t row6_scaled_l =
+    vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
+  int32x4_t row6_scaled_h =
+    vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
+  row6 = vcombine_s16(vrshrn_n_s32(row6_scaled_l, DESCALE_P2),
+                      vrshrn_n_s32(row6_scaled_h, DESCALE_P2));
+
+  /* Odd part */
+  z1 = vaddq_s16(tmp4, tmp7);
+  z2 = vaddq_s16(tmp5, tmp6);
+  z3 = vaddq_s16(tmp4, tmp6);
+  z4 = vaddq_s16(tmp5, tmp7);
+  /* sqrt(2) * c3 */
+  z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
+  z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
+  z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
+  z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
+
+  /* sqrt(2) * (-c1+c3+c5-c7) */
+  tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
+  tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
+  /* sqrt(2) * ( c1+c3-c5+c7) */
+  tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
+  tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
+  /* sqrt(2) * ( c1+c3+c5-c7) */
+  tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
+  tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
+  /* sqrt(2) * ( c1+c3-c5-c7) */
+  tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
+  tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
+
+  /* sqrt(2) * (c7-c3) */
+  z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
+  z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
+  /* sqrt(2) * (-c1-c3) */
+  z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
+  z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
+  /* sqrt(2) * (-c3-c5) */
+  z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
+  z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
+  /* sqrt(2) * (c5-c3) */
+  z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
+  z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
+
+  z3_l = vaddq_s32(z3_l, z5_l);
+  z3_h = vaddq_s32(z3_h, z5_h);
+  z4_l = vaddq_s32(z4_l, z5_l);
+  z4_h = vaddq_s32(z4_h, z5_h);
+
+  tmp4_l = vaddq_s32(tmp4_l, z1_l);
+  tmp4_h = vaddq_s32(tmp4_h, z1_h);
+  tmp4_l = vaddq_s32(tmp4_l, z3_l);
+  tmp4_h = vaddq_s32(tmp4_h, z3_h);
+  row7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp4_h, DESCALE_P2));
+
+  tmp5_l = vaddq_s32(tmp5_l, z2_l);
+  tmp5_h = vaddq_s32(tmp5_h, z2_h);
+  tmp5_l = vaddq_s32(tmp5_l, z4_l);
+  tmp5_h = vaddq_s32(tmp5_h, z4_h);
+  row5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp5_h, DESCALE_P2));
+
+  tmp6_l = vaddq_s32(tmp6_l, z2_l);
+  tmp6_h = vaddq_s32(tmp6_h, z2_h);
+  tmp6_l = vaddq_s32(tmp6_l, z3_l);
+  tmp6_h = vaddq_s32(tmp6_h, z3_h);
+  row3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp6_h, DESCALE_P2));
+
+  tmp7_l = vaddq_s32(tmp7_l, z1_l);
+  tmp7_h = vaddq_s32(tmp7_h, z1_h);
+  tmp7_l = vaddq_s32(tmp7_l, z4_l);
+  tmp7_h = vaddq_s32(tmp7_h, z4_h);
+  row1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp7_h, DESCALE_P2));
+
+  vst1q_s16(data + 0 * DCTSIZE, row0);
+  vst1q_s16(data + 1 * DCTSIZE, row1);
+  vst1q_s16(data + 2 * DCTSIZE, row2);
+  vst1q_s16(data + 3 * DCTSIZE, row3);
+  vst1q_s16(data + 4 * DCTSIZE, row4);
+  vst1q_s16(data + 5 * DCTSIZE, row5);
+  vst1q_s16(data + 6 * DCTSIZE, row6);
+  vst1q_s16(data + 7 * DCTSIZE, row7);
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jidctfst-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jidctfst-neon.c
new file mode 100644
index 0000000000..a91be5362e
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/jidctfst-neon.c
@@ -0,0 +1,472 @@
+/*
+ * jidctfst-neon.c - fast integer IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_idct_ifast_neon() performs dequantization and a fast, not so accurate
+ * inverse DCT (Discrete Cosine Transform) on one block of coefficients.  It
+ * uses the same calculations and produces exactly the same output as IJG's
+ * original jpeg_idct_ifast() function, which can be found in jidctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.082392200 =  2688 * 2^-15
+ *    0.414213562 = 13568 * 2^-15
+ *    0.847759065 = 27776 * 2^-15
+ *    0.613125930 = 20096 * 2^-15
+ *
+ * See jidctfst.c for further details of the IDCT algorithm.  Where possible,
+ * the variable names and comments here in jsimd_idct_ifast_neon() match up
+ * with those in jpeg_idct_ifast().
+ */
+
+#define PASS1_BITS  2
+
+#define F_0_082  2688
+#define F_0_414  13568
+#define F_0_847  27776
+#define F_0_613  20096
+
+
+ALIGN(16) static const int16_t jsimd_idct_ifast_neon_consts[] = {
+  F_0_082, F_0_414, F_0_847, F_0_613
+};
+
+void jsimd_idct_ifast_neon(void *dct_table, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  IFAST_MULT_TYPE *quantptr = dct_table;
+
+  /* Load DCT coefficients. */
+  int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+  int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+  int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
+  int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+  int16x8_t row4 = vld1q_s16(coef_block + 4 * DCTSIZE);
+  int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+  int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
+  int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table values for DC coefficients. */
+  int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+  /* Dequantize DC coefficients. */
+  row0 = vmulq_s16(row0, quant_row0);
+
+  /* Construct bitmap to test if all AC coefficients are 0. */
+  int16x8_t bitmap = vorrq_s16(row1, row2);
+  bitmap = vorrq_s16(bitmap, row3);
+  bitmap = vorrq_s16(bitmap, row4);
+  bitmap = vorrq_s16(bitmap, row5);
+  bitmap = vorrq_s16(bitmap, row6);
+  bitmap = vorrq_s16(bitmap, row7);
+
+  int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
+  int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
+
+  /* Load IDCT conversion constants. */
+  const int16x4_t consts = vld1_s16(jsimd_idct_ifast_neon_consts);
+
+  if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
+    /* All AC coefficients are zero.
+     * Compute DC values and duplicate into vectors.
+     */
+    int16x8_t dcval = row0;
+    row1 = dcval;
+    row2 = dcval;
+    row3 = dcval;
+    row4 = dcval;
+    row5 = dcval;
+    row6 = dcval;
+    row7 = dcval;
+  } else if (left_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 0, 1, 2, and 3.
+     * Use DC values for these columns.
+     */
+    int16x4_t dcval = vget_low_s16(row0);
+
+    /* Commence regular fast IDCT computation for columns 4, 5, 6, and 7. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+    int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+    /* Even part: dequantize DCT coefficients. */
+    int16x4_t tmp0 = vget_high_s16(row0);
+    int16x4_t tmp1 = vmul_s16(vget_high_s16(row2), quant_row2);
+    int16x4_t tmp2 = vmul_s16(vget_high_s16(row4), quant_row4);
+    int16x4_t tmp3 = vmul_s16(vget_high_s16(row6), quant_row6);
+
+    int16x4_t tmp10 = vadd_s16(tmp0, tmp2);   /* phase 3 */
+    int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
+
+    int16x4_t tmp13 = vadd_s16(tmp1, tmp3);   /* phases 5-3 */
+    int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
+    int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
+    tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
+    tmp12 = vsub_s16(tmp12, tmp13);
+
+    tmp0 = vadd_s16(tmp10, tmp13);            /* phase 2 */
+    tmp3 = vsub_s16(tmp10, tmp13);
+    tmp1 = vadd_s16(tmp11, tmp12);
+    tmp2 = vsub_s16(tmp11, tmp12);
+
+    /* Odd part: dequantize DCT coefficients. */
+    int16x4_t tmp4 = vmul_s16(vget_high_s16(row1), quant_row1);
+    int16x4_t tmp5 = vmul_s16(vget_high_s16(row3), quant_row3);
+    int16x4_t tmp6 = vmul_s16(vget_high_s16(row5), quant_row5);
+    int16x4_t tmp7 = vmul_s16(vget_high_s16(row7), quant_row7);
+
+    int16x4_t z13 = vadd_s16(tmp6, tmp5);     /* phase 6 */
+    int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
+    int16x4_t z11 = vadd_s16(tmp4, tmp7);
+    int16x4_t z12 = vsub_s16(tmp4, tmp7);
+
+    tmp7 = vadd_s16(z11, z13);                /* phase 5 */
+    int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
+    tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
+    tmp11 = vadd_s16(tmp11, z11_sub_z13);
+
+    int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
+    int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
+    z5 = vadd_s16(z5, z10_add_z12);
+    tmp10 = vqdmulh_lane_s16(z12, consts, 0);
+    tmp10 = vadd_s16(tmp10, z12);
+    tmp10 = vsub_s16(tmp10, z5);
+    tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
+    tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
+    tmp12 = vadd_s16(tmp12, z5);
+
+    tmp6 = vsub_s16(tmp12, tmp7);             /* phase 2 */
+    tmp5 = vsub_s16(tmp11, tmp6);
+    tmp4 = vadd_s16(tmp10, tmp5);
+
+    row0 = vcombine_s16(dcval, vadd_s16(tmp0, tmp7));
+    row7 = vcombine_s16(dcval, vsub_s16(tmp0, tmp7));
+    row1 = vcombine_s16(dcval, vadd_s16(tmp1, tmp6));
+    row6 = vcombine_s16(dcval, vsub_s16(tmp1, tmp6));
+    row2 = vcombine_s16(dcval, vadd_s16(tmp2, tmp5));
+    row5 = vcombine_s16(dcval, vsub_s16(tmp2, tmp5));
+    row4 = vcombine_s16(dcval, vadd_s16(tmp3, tmp4));
+    row3 = vcombine_s16(dcval, vsub_s16(tmp3, tmp4));
+  } else if (right_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 4, 5, 6, and 7.
+     * Use DC values for these columns.
+     */
+    int16x4_t dcval = vget_high_s16(row0);
+
+    /* Commence regular fast IDCT computation for columns 0, 1, 2, and 3. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+    int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part: dequantize DCT coefficients. */
+    int16x4_t tmp0 = vget_low_s16(row0);
+    int16x4_t tmp1 = vmul_s16(vget_low_s16(row2), quant_row2);
+    int16x4_t tmp2 = vmul_s16(vget_low_s16(row4), quant_row4);
+    int16x4_t tmp3 = vmul_s16(vget_low_s16(row6), quant_row6);
+
+    int16x4_t tmp10 = vadd_s16(tmp0, tmp2);   /* phase 3 */
+    int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
+
+    int16x4_t tmp13 = vadd_s16(tmp1, tmp3);   /* phases 5-3 */
+    int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
+    int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
+    tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
+    tmp12 = vsub_s16(tmp12, tmp13);
+
+    tmp0 = vadd_s16(tmp10, tmp13);            /* phase 2 */
+    tmp3 = vsub_s16(tmp10, tmp13);
+    tmp1 = vadd_s16(tmp11, tmp12);
+    tmp2 = vsub_s16(tmp11, tmp12);
+
+    /* Odd part: dequantize DCT coefficients. */
+    int16x4_t tmp4 = vmul_s16(vget_low_s16(row1), quant_row1);
+    int16x4_t tmp5 = vmul_s16(vget_low_s16(row3), quant_row3);
+    int16x4_t tmp6 = vmul_s16(vget_low_s16(row5), quant_row5);
+    int16x4_t tmp7 = vmul_s16(vget_low_s16(row7), quant_row7);
+
+    int16x4_t z13 = vadd_s16(tmp6, tmp5);     /* phase 6 */
+    int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
+    int16x4_t z11 = vadd_s16(tmp4, tmp7);
+    int16x4_t z12 = vsub_s16(tmp4, tmp7);
+
+    tmp7 = vadd_s16(z11, z13);                /* phase 5 */
+    int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
+    tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
+    tmp11 = vadd_s16(tmp11, z11_sub_z13);
+
+    int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
+    int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
+    z5 = vadd_s16(z5, z10_add_z12);
+    tmp10 = vqdmulh_lane_s16(z12, consts, 0);
+    tmp10 = vadd_s16(tmp10, z12);
+    tmp10 = vsub_s16(tmp10, z5);
+    tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
+    tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
+    tmp12 = vadd_s16(tmp12, z5);
+
+    tmp6 = vsub_s16(tmp12, tmp7);             /* phase 2 */
+    tmp5 = vsub_s16(tmp11, tmp6);
+    tmp4 = vadd_s16(tmp10, tmp5);
+
+    row0 = vcombine_s16(vadd_s16(tmp0, tmp7), dcval);
+    row7 = vcombine_s16(vsub_s16(tmp0, tmp7), dcval);
+    row1 = vcombine_s16(vadd_s16(tmp1, tmp6), dcval);
+    row6 = vcombine_s16(vsub_s16(tmp1, tmp6), dcval);
+    row2 = vcombine_s16(vadd_s16(tmp2, tmp5), dcval);
+    row5 = vcombine_s16(vsub_s16(tmp2, tmp5), dcval);
+    row4 = vcombine_s16(vadd_s16(tmp3, tmp4), dcval);
+    row3 = vcombine_s16(vsub_s16(tmp3, tmp4), dcval);
+  } else {
+    /* Some AC coefficients are non-zero; full IDCT calculation required. */
+
+    /* Load quantization table. */
+    int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+    int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
+    int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+    int16x8_t quant_row4 = vld1q_s16(quantptr + 4 * DCTSIZE);
+    int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+    int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
+    int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part: dequantize DCT coefficients. */
+    int16x8_t tmp0 = row0;
+    int16x8_t tmp1 = vmulq_s16(row2, quant_row2);
+    int16x8_t tmp2 = vmulq_s16(row4, quant_row4);
+    int16x8_t tmp3 = vmulq_s16(row6, quant_row6);
+
+    int16x8_t tmp10 = vaddq_s16(tmp0, tmp2);   /* phase 3 */
+    int16x8_t tmp11 = vsubq_s16(tmp0, tmp2);
+
+    int16x8_t tmp13 = vaddq_s16(tmp1, tmp3);   /* phases 5-3 */
+    int16x8_t tmp1_sub_tmp3 = vsubq_s16(tmp1, tmp3);
+    int16x8_t tmp12 = vqdmulhq_lane_s16(tmp1_sub_tmp3, consts, 1);
+    tmp12 = vaddq_s16(tmp12, tmp1_sub_tmp3);
+    tmp12 = vsubq_s16(tmp12, tmp13);
+
+    tmp0 = vaddq_s16(tmp10, tmp13);            /* phase 2 */
+    tmp3 = vsubq_s16(tmp10, tmp13);
+    tmp1 = vaddq_s16(tmp11, tmp12);
+    tmp2 = vsubq_s16(tmp11, tmp12);
+
+    /* Odd part: dequantize DCT coefficients. */
+    int16x8_t tmp4 = vmulq_s16(row1, quant_row1);
+    int16x8_t tmp5 = vmulq_s16(row3, quant_row3);
+    int16x8_t tmp6 = vmulq_s16(row5, quant_row5);
+    int16x8_t tmp7 = vmulq_s16(row7, quant_row7);
+
+    int16x8_t z13 = vaddq_s16(tmp6, tmp5);     /* phase 6 */
+    int16x8_t neg_z10 = vsubq_s16(tmp5, tmp6);
+    int16x8_t z11 = vaddq_s16(tmp4, tmp7);
+    int16x8_t z12 = vsubq_s16(tmp4, tmp7);
+
+    tmp7 = vaddq_s16(z11, z13);                /* phase 5 */
+    int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
+    tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
+    tmp11 = vaddq_s16(tmp11, z11_sub_z13);
+
+    int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
+    int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
+    z5 = vaddq_s16(z5, z10_add_z12);
+    tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
+    tmp10 = vaddq_s16(tmp10, z12);
+    tmp10 = vsubq_s16(tmp10, z5);
+    tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
+    tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
+    tmp12 = vaddq_s16(tmp12, z5);
+
+    tmp6 = vsubq_s16(tmp12, tmp7);             /* phase 2 */
+    tmp5 = vsubq_s16(tmp11, tmp6);
+    tmp4 = vaddq_s16(tmp10, tmp5);
+
+    row0 = vaddq_s16(tmp0, tmp7);
+    row7 = vsubq_s16(tmp0, tmp7);
+    row1 = vaddq_s16(tmp1, tmp6);
+    row6 = vsubq_s16(tmp1, tmp6);
+    row2 = vaddq_s16(tmp2, tmp5);
+    row5 = vsubq_s16(tmp2, tmp5);
+    row4 = vaddq_s16(tmp3, tmp4);
+    row3 = vsubq_s16(tmp3, tmp4);
+  }
+
+  /* Transpose rows to work on columns in pass 2. */
+  int16x8x2_t rows_01 = vtrnq_s16(row0, row1);
+  int16x8x2_t rows_23 = vtrnq_s16(row2, row3);
+  int16x8x2_t rows_45 = vtrnq_s16(row4, row5);
+  int16x8x2_t rows_67 = vtrnq_s16(row6, row7);
+
+  int32x4x2_t rows_0145_l = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[0]),
+                                      vreinterpretq_s32_s16(rows_45.val[0]));
+  int32x4x2_t rows_0145_h = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[1]),
+                                      vreinterpretq_s32_s16(rows_45.val[1]));
+  int32x4x2_t rows_2367_l = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[0]),
+                                      vreinterpretq_s32_s16(rows_67.val[0]));
+  int32x4x2_t rows_2367_h = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[1]),
+                                      vreinterpretq_s32_s16(rows_67.val[1]));
+
+  int32x4x2_t cols_04 = vzipq_s32(rows_0145_l.val[0], rows_2367_l.val[0]);
+  int32x4x2_t cols_15 = vzipq_s32(rows_0145_h.val[0], rows_2367_h.val[0]);
+  int32x4x2_t cols_26 = vzipq_s32(rows_0145_l.val[1], rows_2367_l.val[1]);
+  int32x4x2_t cols_37 = vzipq_s32(rows_0145_h.val[1], rows_2367_h.val[1]);
+
+  int16x8_t col0 = vreinterpretq_s16_s32(cols_04.val[0]);
+  int16x8_t col1 = vreinterpretq_s16_s32(cols_15.val[0]);
+  int16x8_t col2 = vreinterpretq_s16_s32(cols_26.val[0]);
+  int16x8_t col3 = vreinterpretq_s16_s32(cols_37.val[0]);
+  int16x8_t col4 = vreinterpretq_s16_s32(cols_04.val[1]);
+  int16x8_t col5 = vreinterpretq_s16_s32(cols_15.val[1]);
+  int16x8_t col6 = vreinterpretq_s16_s32(cols_26.val[1]);
+  int16x8_t col7 = vreinterpretq_s16_s32(cols_37.val[1]);
+
+  /* 1-D IDCT, pass 2 */
+
+  /* Even part */
+  int16x8_t tmp10 = vaddq_s16(col0, col4);
+  int16x8_t tmp11 = vsubq_s16(col0, col4);
+
+  int16x8_t tmp13 = vaddq_s16(col2, col6);
+  int16x8_t col2_sub_col6 = vsubq_s16(col2, col6);
+  int16x8_t tmp12 = vqdmulhq_lane_s16(col2_sub_col6, consts, 1);
+  tmp12 = vaddq_s16(tmp12, col2_sub_col6);
+  tmp12 = vsubq_s16(tmp12, tmp13);
+
+  int16x8_t tmp0 = vaddq_s16(tmp10, tmp13);
+  int16x8_t tmp3 = vsubq_s16(tmp10, tmp13);
+  int16x8_t tmp1 = vaddq_s16(tmp11, tmp12);
+  int16x8_t tmp2 = vsubq_s16(tmp11, tmp12);
+
+  /* Odd part */
+  int16x8_t z13 = vaddq_s16(col5, col3);
+  int16x8_t neg_z10 = vsubq_s16(col3, col5);
+  int16x8_t z11 = vaddq_s16(col1, col7);
+  int16x8_t z12 = vsubq_s16(col1, col7);
+
+  int16x8_t tmp7 = vaddq_s16(z11, z13);      /* phase 5 */
+  int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
+  tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
+  tmp11 = vaddq_s16(tmp11, z11_sub_z13);
+
+  int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
+  int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
+  z5 = vaddq_s16(z5, z10_add_z12);
+  tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
+  tmp10 = vaddq_s16(tmp10, z12);
+  tmp10 = vsubq_s16(tmp10, z5);
+  tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
+  tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
+  tmp12 = vaddq_s16(tmp12, z5);
+
+  int16x8_t tmp6 = vsubq_s16(tmp12, tmp7);   /* phase 2 */
+  int16x8_t tmp5 = vsubq_s16(tmp11, tmp6);
+  int16x8_t tmp4 = vaddq_s16(tmp10, tmp5);
+
+  col0 = vaddq_s16(tmp0, tmp7);
+  col7 = vsubq_s16(tmp0, tmp7);
+  col1 = vaddq_s16(tmp1, tmp6);
+  col6 = vsubq_s16(tmp1, tmp6);
+  col2 = vaddq_s16(tmp2, tmp5);
+  col5 = vsubq_s16(tmp2, tmp5);
+  col4 = vaddq_s16(tmp3, tmp4);
+  col3 = vsubq_s16(tmp3, tmp4);
+
+  /* Scale down by a factor of 8, narrowing to 8-bit. */
+  int8x16_t cols_01_s8 = vcombine_s8(vqshrn_n_s16(col0, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col1, PASS1_BITS + 3));
+  int8x16_t cols_45_s8 = vcombine_s8(vqshrn_n_s16(col4, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col5, PASS1_BITS + 3));
+  int8x16_t cols_23_s8 = vcombine_s8(vqshrn_n_s16(col2, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col3, PASS1_BITS + 3));
+  int8x16_t cols_67_s8 = vcombine_s8(vqshrn_n_s16(col6, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col7, PASS1_BITS + 3));
+  /* Clamp to range [0-255]. */
+  uint8x16_t cols_01 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_01_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+  uint8x16_t cols_45 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_45_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+  uint8x16_t cols_23 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_23_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+  uint8x16_t cols_67 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_67_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+
+  /* Transpose block to prepare for store. */
+  uint32x4x2_t cols_0415 = vzipq_u32(vreinterpretq_u32_u8(cols_01),
+                                     vreinterpretq_u32_u8(cols_45));
+  uint32x4x2_t cols_2637 = vzipq_u32(vreinterpretq_u32_u8(cols_23),
+                                     vreinterpretq_u32_u8(cols_67));
+
+  uint8x16x2_t cols_0145 = vtrnq_u8(vreinterpretq_u8_u32(cols_0415.val[0]),
+                                    vreinterpretq_u8_u32(cols_0415.val[1]));
+  uint8x16x2_t cols_2367 = vtrnq_u8(vreinterpretq_u8_u32(cols_2637.val[0]),
+                                    vreinterpretq_u8_u32(cols_2637.val[1]));
+  uint16x8x2_t rows_0426 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[0]),
+                                     vreinterpretq_u16_u8(cols_2367.val[0]));
+  uint16x8x2_t rows_1537 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[1]),
+                                     vreinterpretq_u16_u8(cols_2367.val[1]));
+
+  uint8x16_t rows_04 = vreinterpretq_u8_u16(rows_0426.val[0]);
+  uint8x16_t rows_15 = vreinterpretq_u8_u16(rows_1537.val[0]);
+  uint8x16_t rows_26 = vreinterpretq_u8_u16(rows_0426.val[1]);
+  uint8x16_t rows_37 = vreinterpretq_u8_u16(rows_1537.val[1]);
+
+  JSAMPROW outptr0 = output_buf[0] + output_col;
+  JSAMPROW outptr1 = output_buf[1] + output_col;
+  JSAMPROW outptr2 = output_buf[2] + output_col;
+  JSAMPROW outptr3 = output_buf[3] + output_col;
+  JSAMPROW outptr4 = output_buf[4] + output_col;
+  JSAMPROW outptr5 = output_buf[5] + output_col;
+  JSAMPROW outptr6 = output_buf[6] + output_col;
+  JSAMPROW outptr7 = output_buf[7] + output_col;
+
+  /* Store DCT block to memory. */
+  vst1q_lane_u64((uint64_t *)outptr0, vreinterpretq_u64_u8(rows_04), 0);
+  vst1q_lane_u64((uint64_t *)outptr1, vreinterpretq_u64_u8(rows_15), 0);
+  vst1q_lane_u64((uint64_t *)outptr2, vreinterpretq_u64_u8(rows_26), 0);
+  vst1q_lane_u64((uint64_t *)outptr3, vreinterpretq_u64_u8(rows_37), 0);
+  vst1q_lane_u64((uint64_t *)outptr4, vreinterpretq_u64_u8(rows_04), 1);
+  vst1q_lane_u64((uint64_t *)outptr5, vreinterpretq_u64_u8(rows_15), 1);
+  vst1q_lane_u64((uint64_t *)outptr6, vreinterpretq_u64_u8(rows_26), 1);
+  vst1q_lane_u64((uint64_t *)outptr7, vreinterpretq_u64_u8(rows_37), 1);
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jidctint-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jidctint-neon.c
new file mode 100644
index 0000000000..043b652e6c
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/jidctint-neon.c
@@ -0,0 +1,802 @@
+/*
+ * jidctint-neon.c - accurate integer IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "jconfigint.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+
+/* The computation of the inverse DCT requires the use of constants known at
+ * compile time.  Scaled integer constants are used to avoid floating-point
+ * arithmetic:
+ *    0.298631336 =  2446 * 2^-13
+ *    0.390180644 =  3196 * 2^-13
+ *    0.541196100 =  4433 * 2^-13
+ *    0.765366865 =  6270 * 2^-13
+ *    0.899976223 =  7373 * 2^-13
+ *    1.175875602 =  9633 * 2^-13
+ *    1.501321110 = 12299 * 2^-13
+ *    1.847759065 = 15137 * 2^-13
+ *    1.961570560 = 16069 * 2^-13
+ *    2.053119869 = 16819 * 2^-13
+ *    2.562915447 = 20995 * 2^-13
+ *    3.072711026 = 25172 * 2^-13
+ */
+
+#define F_0_298  2446
+#define F_0_390  3196
+#define F_0_541  4433
+#define F_0_765  6270
+#define F_0_899  7373
+#define F_1_175  9633
+#define F_1_501  12299
+#define F_1_847  15137
+#define F_1_961  16069
+#define F_2_053  16819
+#define F_2_562  20995
+#define F_3_072  25172
+
+#define F_1_175_MINUS_1_961  (F_1_175 - F_1_961)
+#define F_1_175_MINUS_0_390  (F_1_175 - F_0_390)
+#define F_0_541_MINUS_1_847  (F_0_541 - F_1_847)
+#define F_3_072_MINUS_2_562  (F_3_072 - F_2_562)
+#define F_0_298_MINUS_0_899  (F_0_298 - F_0_899)
+#define F_1_501_MINUS_0_899  (F_1_501 - F_0_899)
+#define F_2_053_MINUS_2_562  (F_2_053 - F_2_562)
+#define F_0_541_PLUS_0_765   (F_0_541 + F_0_765)
+
+
+ALIGN(16) static const int16_t jsimd_idct_islow_neon_consts[] = {
+  F_0_899,             F_0_541,
+  F_2_562,             F_0_298_MINUS_0_899,
+  F_1_501_MINUS_0_899, F_2_053_MINUS_2_562,
+  F_0_541_PLUS_0_765,  F_1_175,
+  F_1_175_MINUS_0_390, F_0_541_MINUS_1_847,
+  F_3_072_MINUS_2_562, F_1_175_MINUS_1_961,
+  0, 0, 0, 0
+};
+
+
+/* Forward declaration of regular and sparse IDCT helper functions */
+
+static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+                                                  int16x4_t row1,
+                                                  int16x4_t row2,
+                                                  int16x4_t row3,
+                                                  int16x4_t row4,
+                                                  int16x4_t row5,
+                                                  int16x4_t row6,
+                                                  int16x4_t row7,
+                                                  int16x4_t quant_row0,
+                                                  int16x4_t quant_row1,
+                                                  int16x4_t quant_row2,
+                                                  int16x4_t quant_row3,
+                                                  int16x4_t quant_row4,
+                                                  int16x4_t quant_row5,
+                                                  int16x4_t quant_row6,
+                                                  int16x4_t quant_row7,
+                                                  int16_t *workspace_1,
+                                                  int16_t *workspace_2);
+
+static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
+                                                 int16x4_t row1,
+                                                 int16x4_t row2,
+                                                 int16x4_t row3,
+                                                 int16x4_t quant_row0,
+                                                 int16x4_t quant_row1,
+                                                 int16x4_t quant_row2,
+                                                 int16x4_t quant_row3,
+                                                 int16_t *workspace_1,
+                                                 int16_t *workspace_2);
+
+static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+                                                  JSAMPARRAY output_buf,
+                                                  JDIMENSION output_col,
+                                                  unsigned buf_offset);
+
+static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+                                                 JSAMPARRAY output_buf,
+                                                 JDIMENSION output_col,
+                                                 unsigned buf_offset);
+
+
+/* Perform dequantization and inverse DCT on one block of coefficients.  For
+ * reference, the C implementation (jpeg_idct_slow()) can be found in
+ * jidctint.c.
+ *
+ * Optimization techniques used for fast data access:
+ *
+ * In each pass, the inverse DCT is computed for the left and right 4x8 halves
+ * of the DCT block.  This avoids spilling due to register pressure, and the
+ * increased granularity allows for an optimized calculation depending on the
+ * values of the DCT coefficients.  Between passes, intermediate data is stored
+ * in 4x8 workspace buffers.
+ *
+ * Transposing the 8x8 DCT block after each pass can be achieved by transposing
+ * each of the four 4x4 quadrants and swapping quadrants 1 and 2 (refer to the
+ * diagram below.)  Swapping quadrants is cheap, since the second pass can just
+ * swap the workspace buffer pointers.
+ *
+ *      +-------+-------+                   +-------+-------+
+ *      |       |       |                   |       |       |
+ *      |   0   |   1   |                   |   0   |   2   |
+ *      |       |       |    transpose      |       |       |
+ *      +-------+-------+     ------>       +-------+-------+
+ *      |       |       |                   |       |       |
+ *      |   2   |   3   |                   |   1   |   3   |
+ *      |       |       |                   |       |       |
+ *      +-------+-------+                   +-------+-------+
+ *
+ * Optimization techniques used to accelerate the inverse DCT calculation:
+ *
+ * In a DCT coefficient block, the coefficients are increasingly likely to be 0
+ * as you move diagonally from top left to bottom right.  If whole rows of
+ * coefficients are 0, then the inverse DCT calculation can be simplified.  On
+ * the first pass of the inverse DCT, we test for three special cases before
+ * defaulting to a full "regular" inverse DCT:
+ *
+ * 1) Coefficients in rows 4-7 are all zero.  In this case, we perform a
+ *    "sparse" simplified inverse DCT on rows 0-3.
+ * 2) AC coefficients (rows 1-7) are all zero.  In this case, the inverse DCT
+ *    result is equal to the dequantized DC coefficients.
+ * 3) AC and DC coefficients are all zero.  In this case, the inverse DCT
+ *    result is all zero.  For the left 4x8 half, this is handled identically
+ *    to Case 2 above.  For the right 4x8 half, we do no work and signal that
+ *    the "sparse" algorithm is required for the second pass.
+ *
+ * In the second pass, only a single special case is tested: whether the AC and
+ * DC coefficients were all zero in the right 4x8 block during the first pass
+ * (refer to Case 3 above.)  If this is the case, then a "sparse" variant of
+ * the second pass is performed for both the left and right halves of the DCT
+ * block.  (The transposition after the first pass means that the right 4x8
+ * block during the first pass becomes rows 4-7 during the second pass.)
+ */
+
+void jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  ISLOW_MULT_TYPE *quantptr = dct_table;
+
+  int16_t workspace_l[8 * DCTSIZE / 2];
+  int16_t workspace_r[8 * DCTSIZE / 2];
+
+  /* Compute IDCT first pass on left 4x8 coefficient block. */
+
+  /* Load DCT coefficients in left 4x8 block. */
+  int16x4_t row0 = vld1_s16(coef_block + 0 * DCTSIZE);
+  int16x4_t row1 = vld1_s16(coef_block + 1 * DCTSIZE);
+  int16x4_t row2 = vld1_s16(coef_block + 2 * DCTSIZE);
+  int16x4_t row3 = vld1_s16(coef_block + 3 * DCTSIZE);
+  int16x4_t row4 = vld1_s16(coef_block + 4 * DCTSIZE);
+  int16x4_t row5 = vld1_s16(coef_block + 5 * DCTSIZE);
+  int16x4_t row6 = vld1_s16(coef_block + 6 * DCTSIZE);
+  int16x4_t row7 = vld1_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table for left 4x8 block. */
+  int16x4_t quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE);
+  int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+  int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+  int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+  int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
+  int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+  int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+  int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+  /* Construct bitmap to test if DCT coefficients in left 4x8 block are 0. */
+  int16x4_t bitmap = vorr_s16(row7, row6);
+  bitmap = vorr_s16(bitmap, row5);
+  bitmap = vorr_s16(bitmap, row4);
+  int64_t bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+  if (bitmap_rows_4567 == 0) {
+    bitmap = vorr_s16(bitmap, row3);
+    bitmap = vorr_s16(bitmap, row2);
+    bitmap = vorr_s16(bitmap, row1);
+    int64_t left_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+    if (left_ac_bitmap == 0) {
+      int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
+      int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
+      /* Store 4x4 blocks to workspace, transposing in the process. */
+      vst4_s16(workspace_l, quadrant);
+      vst4_s16(workspace_r, quadrant);
+    } else {
+      jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
+                                    quant_row1, quant_row2, quant_row3,
+                                    workspace_l, workspace_r);
+    }
+  } else {
+    jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
+                                   row6, row7, quant_row0, quant_row1,
+                                   quant_row2, quant_row3, quant_row4,
+                                   quant_row5, quant_row6, quant_row7,
+                                   workspace_l, workspace_r);
+  }
+
+  /* Compute IDCT first pass on right 4x8 coefficient block. */
+
+  /* Load DCT coefficients in right 4x8 block. */
+  row0 = vld1_s16(coef_block + 0 * DCTSIZE + 4);
+  row1 = vld1_s16(coef_block + 1 * DCTSIZE + 4);
+  row2 = vld1_s16(coef_block + 2 * DCTSIZE + 4);
+  row3 = vld1_s16(coef_block + 3 * DCTSIZE + 4);
+  row4 = vld1_s16(coef_block + 4 * DCTSIZE + 4);
+  row5 = vld1_s16(coef_block + 5 * DCTSIZE + 4);
+  row6 = vld1_s16(coef_block + 6 * DCTSIZE + 4);
+  row7 = vld1_s16(coef_block + 7 * DCTSIZE + 4);
+
+  /* Load quantization table for right 4x8 block. */
+  quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE + 4);
+  quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+  quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+  quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+  quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
+  quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+  quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+  quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+  /* Construct bitmap to test if DCT coefficients in right 4x8 block are 0. */
+  bitmap = vorr_s16(row7, row6);
+  bitmap = vorr_s16(bitmap, row5);
+  bitmap = vorr_s16(bitmap, row4);
+  bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+  bitmap = vorr_s16(bitmap, row3);
+  bitmap = vorr_s16(bitmap, row2);
+  bitmap = vorr_s16(bitmap, row1);
+  int64_t right_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+  /* If this remains non-zero, a "regular" second pass will be performed. */
+  int64_t right_ac_dc_bitmap = 1;
+
+  if (right_ac_bitmap == 0) {
+    bitmap = vorr_s16(bitmap, row0);
+    right_ac_dc_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+    if (right_ac_dc_bitmap != 0) {
+      int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
+      int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
+      /* Store 4x4 blocks to workspace, transposing in the process. */
+      vst4_s16(workspace_l + 4 * DCTSIZE / 2, quadrant);
+      vst4_s16(workspace_r + 4 * DCTSIZE / 2, quadrant);
+    }
+  } else {
+    if (bitmap_rows_4567 == 0) {
+      jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
+                                    quant_row1, quant_row2, quant_row3,
+                                    workspace_l + 4 * DCTSIZE / 2,
+                                    workspace_r + 4 * DCTSIZE / 2);
+    } else {
+      jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
+                                     row6, row7, quant_row0, quant_row1,
+                                     quant_row2, quant_row3, quant_row4,
+                                     quant_row5, quant_row6, quant_row7,
+                                     workspace_l + 4 * DCTSIZE / 2,
+                                     workspace_r + 4 * DCTSIZE / 2);
+    }
+  }
+
+  /* Second pass: compute IDCT on rows in workspace. */
+
+  /* If all coefficients in right 4x8 block are 0, use "sparse" second pass. */
+  if (right_ac_dc_bitmap == 0) {
+    jsimd_idct_islow_pass2_sparse(workspace_l, output_buf, output_col, 0);
+    jsimd_idct_islow_pass2_sparse(workspace_r, output_buf, output_col, 4);
+  } else {
+    jsimd_idct_islow_pass2_regular(workspace_l, output_buf, output_col, 0);
+    jsimd_idct_islow_pass2_regular(workspace_r, output_buf, output_col, 4);
+  }
+}
+
+
+/* Perform dequantization and the first pass of the accurate inverse DCT on a
+ * 4x8 block of coefficients.  (To process the full 8x8 DCT block, this
+ * function-- or some other optimized variant-- needs to be called for both the
+ * left and right 4x8 blocks.)
+ *
+ * This "regular" version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of AC coefficients is all 0.
+ *
+ * The original C implementation of the accurate IDCT (jpeg_idct_slow()) can be
+ * found in jidctint.c.  Algorithmic changes made here are documented inline.
+ */
+
+static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+                                                  int16x4_t row1,
+                                                  int16x4_t row2,
+                                                  int16x4_t row3,
+                                                  int16x4_t row4,
+                                                  int16x4_t row5,
+                                                  int16x4_t row6,
+                                                  int16x4_t row7,
+                                                  int16x4_t quant_row0,
+                                                  int16x4_t quant_row1,
+                                                  int16x4_t quant_row2,
+                                                  int16x4_t quant_row3,
+                                                  int16x4_t quant_row4,
+                                                  int16x4_t quant_row5,
+                                                  int16x4_t quant_row6,
+                                                  int16x4_t quant_row7,
+                                                  int16_t *workspace_1,
+                                                  int16_t *workspace_2)
+{
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Even part */
+  int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
+  int16x4_t z3_s16 = vmul_s16(row6, quant_row6);
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+  tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
+  tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
+
+  z2_s16 = vmul_s16(row0, quant_row0);
+  z3_s16 = vmul_s16(row4, quant_row4);
+
+  int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
+  int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part */
+  int16x4_t tmp0_s16 = vmul_s16(row7, quant_row7);
+  int16x4_t tmp1_s16 = vmul_s16(row5, quant_row5);
+  int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
+  int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
+
+  z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
+  int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
+
+  /* Implementation as per jpeg_idct_islow() in jidctint.c:
+   *   z5 = (z3 + z4) * 1.175875602;
+   *   z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+   *   z3 += z5;  z4 += z5;
+   *
+   * This implementation:
+   *   z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+   *   z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+   */
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+  /* Implementation as per jpeg_idct_islow() in jidctint.c:
+   *   z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+   *   tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+   *   tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+   *   z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+   *   tmp0 += z1 + z3;  tmp1 += z2 + z4;
+   *   tmp2 += z2 + z3;  tmp3 += z1 + z4;
+   *
+   * This implementation:
+   *   tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+   *   tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+   *   tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+   *   tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+   *   tmp0 += z3;  tmp1 += z4;
+   *   tmp2 += z3;  tmp3 += z4;
+   */
+
+  tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
+  tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
+  tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
+  tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
+
+  tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
+  tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
+  tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
+  tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
+
+  tmp0 = vaddq_s32(tmp0, z3);
+  tmp1 = vaddq_s32(tmp1, z4);
+  tmp2 = vaddq_s32(tmp2, z3);
+  tmp3 = vaddq_s32(tmp3, z4);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  int16x4x4_t rows_0123 = { {
+    vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
+  } };
+  int16x4x4_t rows_4567 = { {
+    vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
+  } };
+
+  /* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
+   * (VST4 transposes the blocks.  We need to operate on rows in the next
+   * pass.)
+   */
+  vst4_s16(workspace_1, rows_0123);
+  vst4_s16(workspace_2, rows_4567);
+}
+
+
+/* Perform dequantization and the first pass of the accurate inverse DCT on a
+ * 4x8 block of coefficients.
+ *
+ * This "sparse" version assumes that the AC coefficients in rows 4-7 are all
+ * 0.  This simplifies the IDCT calculation, accelerating overall performance.
+ */
+
+static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
+                                                 int16x4_t row1,
+                                                 int16x4_t row2,
+                                                 int16x4_t row3,
+                                                 int16x4_t quant_row0,
+                                                 int16x4_t quant_row1,
+                                                 int16x4_t quant_row2,
+                                                 int16x4_t quant_row3,
+                                                 int16_t *workspace_1,
+                                                 int16_t *workspace_2)
+{
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Even part (z3 is all 0) */
+  int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+
+  z2_s16 = vmul_s16(row0, quant_row0);
+  int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
+  int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part (tmp0 and tmp1 are both all 0) */
+  int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
+  int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
+
+  int16x4_t z3_s16 = tmp2_s16;
+  int16x4_t z4_s16 = tmp3_s16;
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+  tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
+  tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
+  tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
+  tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  int16x4x4_t rows_0123 = { {
+    vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
+  } };
+  int16x4x4_t rows_4567 = { {
+    vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
+  } };
+
+  /* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
+   * (VST4 transposes the blocks.  We need to operate on rows in the next
+   * pass.)
+   */
+  vst4_s16(workspace_1, rows_0123);
+  vst4_s16(workspace_2, rows_4567);
+}
+
+
+/* Perform the second pass of the accurate inverse DCT on a 4x8 block of
+ * coefficients.  (To process the full 8x8 DCT block, this function-- or some
+ * other optimized variant-- needs to be called for both the right and left 4x8
+ * blocks.)
+ *
+ * This "regular" version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of coefficient values are all 0 after the
+ * first pass.
+ *
+ * Again, the original C implementation of the accurate IDCT (jpeg_idct_slow())
+ * can be found in jidctint.c.  Algorithmic changes made here are documented
+ * inline.
+ */
+
+static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+                                                  JSAMPARRAY output_buf,
+                                                  JDIMENSION output_col,
+                                                  unsigned buf_offset)
+{
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Even part */
+  int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
+  int16x4_t z3_s16 = vld1_s16(workspace + 6 * DCTSIZE / 2);
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+  tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
+  tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
+
+  z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
+  z3_s16 = vld1_s16(workspace + 4 * DCTSIZE / 2);
+
+  int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
+  int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part */
+  int16x4_t tmp0_s16 = vld1_s16(workspace + 7 * DCTSIZE / 2);
+  int16x4_t tmp1_s16 = vld1_s16(workspace + 5 * DCTSIZE / 2);
+  int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
+  int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
+
+  z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
+  int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
+
+  /* Implementation as per jpeg_idct_islow() in jidctint.c:
+   *   z5 = (z3 + z4) * 1.175875602;
+   *   z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+   *   z3 += z5;  z4 += z5;
+   *
+   * This implementation:
+   *   z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+   *   z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+   */
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+  /* Implementation as per jpeg_idct_islow() in jidctint.c:
+   *   z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+   *   tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+   *   tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+   *   z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+   *   tmp0 += z1 + z3;  tmp1 += z2 + z4;
+   *   tmp2 += z2 + z3;  tmp3 += z1 + z4;
+   *
+   * This implementation:
+   *   tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+   *   tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+   *   tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+   *   tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+   *   tmp0 += z3;  tmp1 += z4;
+   *   tmp2 += z3;  tmp3 += z4;
+   */
+
+  tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
+  tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
+  tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
+  tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
+
+  tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
+  tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
+  tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
+  tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
+
+  tmp0 = vaddq_s32(tmp0, z3);
+  tmp1 = vaddq_s32(tmp1, z4);
+  tmp2 = vaddq_s32(tmp2, z3);
+  tmp3 = vaddq_s32(tmp3, z4);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
+                                       vaddhn_s32(tmp12, tmp1));
+  int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
+                                       vaddhn_s32(tmp13, tmp0));
+  int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
+                                       vsubhn_s32(tmp11, tmp2));
+  int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
+                                       vsubhn_s32(tmp10, tmp3));
+  /* Descale and narrow to 8-bit. */
+  int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
+  int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
+  int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
+  int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
+  /* Clamp to range [0-255]. */
+  uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+
+  /* Transpose 4x8 block and store to memory.  (Zipping adjacent columns
+   * together allows us to store 16-bit elements.)
+   */
+  uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
+  uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
+  uint16x4x4_t cols_01_23_45_67 = { {
+    vreinterpret_u16_u8(cols_01_23.val[0]),
+    vreinterpret_u16_u8(cols_01_23.val[1]),
+    vreinterpret_u16_u8(cols_45_67.val[0]),
+    vreinterpret_u16_u8(cols_45_67.val[1])
+  } };
+
+  JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+  JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+  JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+  JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
+  /* VST4 of 16-bit elements completes the transpose. */
+  vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+  vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+  vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+  vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
+}
+
+
+/* Performs the second pass of the accurate inverse DCT on a 4x8 block
+ * of coefficients.
+ *
+ * This "sparse" version assumes that the coefficient values (after the first
+ * pass) in rows 4-7 are all 0.  This simplifies the IDCT calculation,
+ * accelerating overall performance.
+ */
+
+static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+                                                 JSAMPARRAY output_buf,
+                                                 JDIMENSION output_col,
+                                                 unsigned buf_offset)
+{
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Even part (z3 is all 0) */
+  int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+
+  z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
+  int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
+  int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part (tmp0 and tmp1 are both all 0) */
+  int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
+  int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
+
+  int16x4_t z3_s16 = tmp2_s16;
+  int16x4_t z4_s16 = tmp3_s16;
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+  tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
+  tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
+  tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
+  tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
+                                       vaddhn_s32(tmp12, tmp1));
+  int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
+                                       vaddhn_s32(tmp13, tmp0));
+  int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
+                                       vsubhn_s32(tmp11, tmp2));
+  int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
+                                       vsubhn_s32(tmp10, tmp3));
+  /* Descale and narrow to 8-bit. */
+  int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
+  int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
+  int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
+  int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
+  /* Clamp to range [0-255]. */
+  uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+
+  /* Transpose 4x8 block and store to memory.  (Zipping adjacent columns
+   * together allows us to store 16-bit elements.)
+   */
+  uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
+  uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
+  uint16x4x4_t cols_01_23_45_67 = { {
+    vreinterpret_u16_u8(cols_01_23.val[0]),
+    vreinterpret_u16_u8(cols_01_23.val[1]),
+    vreinterpret_u16_u8(cols_45_67.val[0]),
+    vreinterpret_u16_u8(cols_45_67.val[1])
+  } };
+
+  JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+  JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+  JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+  JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
+  /* VST4 of 16-bit elements completes the transpose. */
+  vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+  vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+  vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+  vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jidctred-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jidctred-neon.c
new file mode 100644
index 0000000000..be9627e61d
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/jidctred-neon.c
@@ -0,0 +1,486 @@
+/*
+ * jidctred-neon.c - reduced-size IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+
+#define F_0_211  1730
+#define F_0_509  4176
+#define F_0_601  4926
+#define F_0_720  5906
+#define F_0_765  6270
+#define F_0_850  6967
+#define F_0_899  7373
+#define F_1_061  8697
+#define F_1_272  10426
+#define F_1_451  11893
+#define F_1_847  15137
+#define F_2_172  17799
+#define F_2_562  20995
+#define F_3_624  29692
+
+
+/* jsimd_idct_2x2_neon() is an inverse DCT function that produces reduced-size
+ * 2x2 output from an 8x8 DCT block.  It uses the same calculations and
+ * produces exactly the same output as IJG's original jpeg_idct_2x2() function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.720959822 =  5906 * 2^-13
+ *    0.850430095 =  6967 * 2^-13
+ *    1.272758580 = 10426 * 2^-13
+ *    3.624509785 = 29692 * 2^-13
+ *
+ * See jidctred.c for further details of the 2x2 IDCT algorithm.  Where
+ * possible, the variable names and comments here in jsimd_idct_2x2_neon()
+ * match up with those in jpeg_idct_2x2().
+ */
+
+ALIGN(16) static const int16_t jsimd_idct_2x2_neon_consts[] = {
+  -F_0_720, F_0_850, -F_1_272, F_3_624
+};
+
+void jsimd_idct_2x2_neon(void *dct_table, JCOEFPTR coef_block,
+                         JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  ISLOW_MULT_TYPE *quantptr = dct_table;
+
+  /* Load DCT coefficients. */
+  int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+  int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+  int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+  int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+  int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table values. */
+  int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+  int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+  int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+  int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+  int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+  /* Dequantize DCT coefficients. */
+  row0 = vmulq_s16(row0, quant_row0);
+  row1 = vmulq_s16(row1, quant_row1);
+  row3 = vmulq_s16(row3, quant_row3);
+  row5 = vmulq_s16(row5, quant_row5);
+  row7 = vmulq_s16(row7, quant_row7);
+
+  /* Load IDCT conversion constants. */
+  const int16x4_t consts = vld1_s16(jsimd_idct_2x2_neon_consts);
+
+  /* Pass 1: process columns from input, put results in vectors row0 and
+   * row1.
+   */
+
+  /* Even part */
+  int32x4_t tmp10_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 2);
+  int32x4_t tmp10_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 2);
+
+  /* Odd part */
+  int32x4_t tmp0_l = vmull_lane_s16(vget_low_s16(row1), consts, 3);
+  tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row3), consts, 2);
+  tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row5), consts, 1);
+  tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row7), consts, 0);
+  int32x4_t tmp0_h = vmull_lane_s16(vget_high_s16(row1), consts, 3);
+  tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row3), consts, 2);
+  tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row5), consts, 1);
+  tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row7), consts, 0);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp0_l), CONST_BITS),
+                      vrshrn_n_s32(vaddq_s32(tmp10_h, tmp0_h), CONST_BITS));
+  row1 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp0_l), CONST_BITS),
+                      vrshrn_n_s32(vsubq_s32(tmp10_h, tmp0_h), CONST_BITS));
+
+  /* Transpose two rows, ready for second pass. */
+  int16x8x2_t cols_0246_1357 = vtrnq_s16(row0, row1);
+  int16x8_t cols_0246 = cols_0246_1357.val[0];
+  int16x8_t cols_1357 = cols_0246_1357.val[1];
+  /* Duplicate columns such that each is accessible in its own vector. */
+  int32x4x2_t cols_1155_3377 = vtrnq_s32(vreinterpretq_s32_s16(cols_1357),
+                                         vreinterpretq_s32_s16(cols_1357));
+  int16x8_t cols_1155 = vreinterpretq_s16_s32(cols_1155_3377.val[0]);
+  int16x8_t cols_3377 = vreinterpretq_s16_s32(cols_1155_3377.val[1]);
+
+  /* Pass 2: process two rows, store to output array. */
+
+  /* Even part: we're only interested in col0; the top half of tmp10 is "don't
+   * care."
+   */
+  int32x4_t tmp10 = vshll_n_s16(vget_low_s16(cols_0246), CONST_BITS + 2);
+
+  /* Odd part: we're only interested in the bottom half of tmp0. */
+  int32x4_t tmp0 = vmull_lane_s16(vget_low_s16(cols_1155), consts, 3);
+  tmp0 = vmlal_lane_s16(tmp0, vget_low_s16(cols_3377), consts, 2);
+  tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_1155), consts, 1);
+  tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_3377), consts, 0);
+
+  /* Final output stage: descale and clamp to range [0-255]. */
+  int16x8_t output_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp0),
+                                      vsubhn_s32(tmp10, tmp0));
+  output_s16 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_s16,
+                            CONST_BITS + PASS1_BITS + 3 + 2 - 16);
+  /* Narrow to 8-bit and convert to unsigned. */
+  uint8x8_t output_u8 = vqmovun_s16(output_s16);
+
+  /* Store 2x2 block to memory. */
+  vst1_lane_u8(output_buf[0] + output_col, output_u8, 0);
+  vst1_lane_u8(output_buf[1] + output_col, output_u8, 1);
+  vst1_lane_u8(output_buf[0] + output_col + 1, output_u8, 4);
+  vst1_lane_u8(output_buf[1] + output_col + 1, output_u8, 5);
+}
+
+
+/* jsimd_idct_4x4_neon() is an inverse DCT function that produces reduced-size
+ * 4x4 output from an 8x8 DCT block.  It uses the same calculations and
+ * produces exactly the same output as IJG's original jpeg_idct_4x4() function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.211164243 =  1730 * 2^-13
+ *    0.509795579 =  4176 * 2^-13
+ *    0.601344887 =  4926 * 2^-13
+ *    0.765366865 =  6270 * 2^-13
+ *    0.899976223 =  7373 * 2^-13
+ *    1.061594337 =  8697 * 2^-13
+ *    1.451774981 = 11893 * 2^-13
+ *    1.847759065 = 15137 * 2^-13
+ *    2.172734803 = 17799 * 2^-13
+ *    2.562915447 = 20995 * 2^-13
+ *
+ * See jidctred.c for further details of the 4x4 IDCT algorithm.  Where
+ * possible, the variable names and comments here in jsimd_idct_4x4_neon()
+ * match up with those in jpeg_idct_4x4().
+ */
+
+ALIGN(16) static const int16_t jsimd_idct_4x4_neon_consts[] = {
+  F_1_847, -F_0_765, -F_0_211,  F_1_451,
+ -F_2_172,  F_1_061, -F_0_509, -F_0_601,
+  F_0_899,  F_2_562,        0,        0
+};
+
+void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block,
+                         JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  ISLOW_MULT_TYPE *quantptr = dct_table;
+
+  /* Load DCT coefficients. */
+  int16x8_t row0  = vld1q_s16(coef_block + 0 * DCTSIZE);
+  int16x8_t row1  = vld1q_s16(coef_block + 1 * DCTSIZE);
+  int16x8_t row2  = vld1q_s16(coef_block + 2 * DCTSIZE);
+  int16x8_t row3  = vld1q_s16(coef_block + 3 * DCTSIZE);
+  int16x8_t row5  = vld1q_s16(coef_block + 5 * DCTSIZE);
+  int16x8_t row6  = vld1q_s16(coef_block + 6 * DCTSIZE);
+  int16x8_t row7  = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table values for DC coefficients. */
+  int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+  /* Dequantize DC coefficients. */
+  row0 = vmulq_s16(row0, quant_row0);
+
+  /* Construct bitmap to test if all AC coefficients are 0. */
+  int16x8_t bitmap = vorrq_s16(row1, row2);
+  bitmap = vorrq_s16(bitmap, row3);
+  bitmap = vorrq_s16(bitmap, row5);
+  bitmap = vorrq_s16(bitmap, row6);
+  bitmap = vorrq_s16(bitmap, row7);
+
+  int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
+  int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
+
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_4x4_neon_consts);
+#else
+  /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_4x4_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_4x4_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_4x4_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
+    /* All AC coefficients are zero.
+     * Compute DC values and duplicate into row vectors 0, 1, 2, and 3.
+     */
+    int16x8_t dcval = vshlq_n_s16(row0, PASS1_BITS);
+    row0 = dcval;
+    row1 = dcval;
+    row2 = dcval;
+    row3 = dcval;
+  } else if (left_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 0, 1, 2, and 3.
+     * Compute DC values for these columns.
+     */
+    int16x4_t dcval = vshl_n_s16(vget_low_s16(row0), PASS1_BITS);
+
+    /* Commence regular IDCT computation for columns 4, 5, 6, and 7. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+    /* Even part */
+    int32x4_t tmp0 = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
+
+    int16x4_t z2 = vmul_s16(vget_high_s16(row2), quant_row2);
+    int16x4_t z3 = vmul_s16(vget_high_s16(row6), quant_row6);
+
+    int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
+
+    int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+    int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+    /* Odd part */
+    int16x4_t z1 = vmul_s16(vget_high_s16(row7), quant_row7);
+    z2 = vmul_s16(vget_high_s16(row5), quant_row5);
+    z3 = vmul_s16(vget_high_s16(row3), quant_row3);
+    int16x4_t z4 = vmul_s16(vget_high_s16(row1), quant_row1);
+
+    tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
+    tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
+    tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
+    tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
+
+    tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
+    tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
+
+    /* Final output stage: descale and narrow to 16-bit. */
+    row0 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
+                                            CONST_BITS - PASS1_BITS + 1));
+    row3 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
+                                            CONST_BITS - PASS1_BITS + 1));
+    row1 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
+                                            CONST_BITS - PASS1_BITS + 1));
+    row2 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
+                                            CONST_BITS - PASS1_BITS + 1));
+  } else if (right_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 4, 5, 6, and 7.
+     * Compute DC values for these columns.
+     */
+    int16x4_t dcval = vshl_n_s16(vget_high_s16(row0), PASS1_BITS);
+
+    /* Commence regular IDCT computation for columns 0, 1, 2, and 3. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part */
+    int32x4_t tmp0 = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
+
+    int16x4_t z2 = vmul_s16(vget_low_s16(row2), quant_row2);
+    int16x4_t z3 = vmul_s16(vget_low_s16(row6), quant_row6);
+
+    int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
+
+    int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+    int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+    /* Odd part */
+    int16x4_t z1 = vmul_s16(vget_low_s16(row7), quant_row7);
+    z2 = vmul_s16(vget_low_s16(row5), quant_row5);
+    z3 = vmul_s16(vget_low_s16(row3), quant_row3);
+    int16x4_t z4 = vmul_s16(vget_low_s16(row1), quant_row1);
+
+    tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
+    tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
+    tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
+    tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
+
+    tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
+    tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
+
+    /* Final output stage: descale and narrow to 16-bit. */
+    row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+    row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+    row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+    row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+  } else {
+    /* All AC coefficients are non-zero; full IDCT calculation required. */
+    int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+    int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
+    int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+    int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+    int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
+    int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part */
+    int32x4_t tmp0_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
+    int32x4_t tmp0_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
+
+    int16x8_t z2 = vmulq_s16(row2, quant_row2);
+    int16x8_t z3 = vmulq_s16(row6, quant_row6);
+
+    int32x4_t tmp2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[0], 0);
+    int32x4_t tmp2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[0], 0);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[0], 1);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[0], 1);
+
+    int32x4_t tmp10_l = vaddq_s32(tmp0_l, tmp2_l);
+    int32x4_t tmp10_h = vaddq_s32(tmp0_h, tmp2_h);
+    int32x4_t tmp12_l = vsubq_s32(tmp0_l, tmp2_l);
+    int32x4_t tmp12_h = vsubq_s32(tmp0_h, tmp2_h);
+
+    /* Odd part */
+    int16x8_t z1 = vmulq_s16(row7, quant_row7);
+    z2 = vmulq_s16(row5, quant_row5);
+    z3 = vmulq_s16(row3, quant_row3);
+    int16x8_t z4 = vmulq_s16(row1, quant_row1);
+
+    tmp0_l = vmull_lane_s16(vget_low_s16(z1), consts.val[0], 2);
+    tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z2), consts.val[0], 3);
+    tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z3), consts.val[1], 0);
+    tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z4), consts.val[1], 1);
+    tmp0_h = vmull_lane_s16(vget_high_s16(z1), consts.val[0], 2);
+    tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z2), consts.val[0], 3);
+    tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z3), consts.val[1], 0);
+    tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z4), consts.val[1], 1);
+
+    tmp2_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 2);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z2), consts.val[1], 3);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[2], 0);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z4), consts.val[2], 1);
+    tmp2_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 2);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z2), consts.val[1], 3);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[2], 0);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z4), consts.val[2], 1);
+
+    /* Final output stage: descale and narrow to 16-bit. */
+    row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp2_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vaddq_s32(tmp10_h, tmp2_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+    row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp2_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vsubq_s32(tmp10_h, tmp2_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+    row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12_l, tmp0_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vaddq_s32(tmp12_h, tmp0_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+    row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12_l, tmp0_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vsubq_s32(tmp12_h, tmp0_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+  }
+
+  /* Transpose 8x4 block to perform IDCT on rows in second pass. */
+  int16x8x2_t row_01 = vtrnq_s16(row0, row1);
+  int16x8x2_t row_23 = vtrnq_s16(row2, row3);
+
+  int32x4x2_t cols_0426 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[0]),
+                                    vreinterpretq_s32_s16(row_23.val[0]));
+  int32x4x2_t cols_1537 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[1]),
+                                    vreinterpretq_s32_s16(row_23.val[1]));
+
+  int16x4_t col0 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[0]));
+  int16x4_t col1 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[0]));
+  int16x4_t col2 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[1]));
+  int16x4_t col3 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[1]));
+  int16x4_t col5 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[0]));
+  int16x4_t col6 = vreinterpret_s16_s32(vget_high_s32(cols_0426.val[1]));
+  int16x4_t col7 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[1]));
+
+  /* Commence second pass of IDCT. */
+
+  /* Even part */
+  int32x4_t tmp0 = vshll_n_s16(col0, CONST_BITS + 1);
+  int32x4_t tmp2 = vmull_lane_s16(col2, consts.val[0], 0);
+  tmp2 = vmlal_lane_s16(tmp2, col6, consts.val[0], 1);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+  /* Odd part */
+  tmp0 = vmull_lane_s16(col7, consts.val[0], 2);
+  tmp0 = vmlal_lane_s16(tmp0, col5, consts.val[0], 3);
+  tmp0 = vmlal_lane_s16(tmp0, col3, consts.val[1], 0);
+  tmp0 = vmlal_lane_s16(tmp0, col1, consts.val[1], 1);
+
+  tmp2 = vmull_lane_s16(col7, consts.val[1], 2);
+  tmp2 = vmlal_lane_s16(tmp2, col5, consts.val[1], 3);
+  tmp2 = vmlal_lane_s16(tmp2, col3, consts.val[2], 0);
+  tmp2 = vmlal_lane_s16(tmp2, col1, consts.val[2], 1);
+
+  /* Final output stage: descale and clamp to range [0-255]. */
+  int16x8_t output_cols_02 = vcombine_s16(vaddhn_s32(tmp10, tmp2),
+                                          vsubhn_s32(tmp12, tmp0));
+  int16x8_t output_cols_13 = vcombine_s16(vaddhn_s32(tmp12, tmp0),
+                                          vsubhn_s32(tmp10, tmp2));
+  output_cols_02 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_02,
+                                CONST_BITS + PASS1_BITS + 3 + 1 - 16);
+  output_cols_13 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_13,
+                                CONST_BITS + PASS1_BITS + 3 + 1 - 16);
+  /* Narrow to 8-bit and convert to unsigned while zipping 8-bit elements.
+   * An interleaving store completes the transpose.
+   */
+  uint8x8x2_t output_0123 = vzip_u8(vqmovun_s16(output_cols_02),
+                                    vqmovun_s16(output_cols_13));
+  uint16x4x2_t output_01_23 = { {
+    vreinterpret_u16_u8(output_0123.val[0]),
+    vreinterpret_u16_u8(output_0123.val[1])
+  } };
+
+  /* Store 4x4 block to memory. */
+  JSAMPROW outptr0 = output_buf[0] + output_col;
+  JSAMPROW outptr1 = output_buf[1] + output_col;
+  JSAMPROW outptr2 = output_buf[2] + output_col;
+  JSAMPROW outptr3 = output_buf[3] + output_col;
+  vst2_lane_u16((uint16_t *)outptr0, output_01_23, 0);
+  vst2_lane_u16((uint16_t *)outptr1, output_01_23, 1);
+  vst2_lane_u16((uint16_t *)outptr2, output_01_23, 2);
+  vst2_lane_u16((uint16_t *)outptr3, output_01_23, 3);
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jquanti-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jquanti-neon.c
new file mode 100644
index 0000000000..d5d95d89f6
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/jquanti-neon.c
@@ -0,0 +1,193 @@
+/*
+ * jquanti-neon.c - sample data conversion and quantization (Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <arm_neon.h>
+
+
+/* After downsampling, the resulting sample values are in the range [0, 255],
+ * but the Discrete Cosine Transform (DCT) operates on values centered around
+ * 0.
+ *
+ * To prepare sample values for the DCT, load samples into a DCT workspace,
+ * subtracting CENTERJSAMPLE (128).  The samples, now in the range [-128, 127],
+ * are also widened from 8- to 16-bit.
+ *
+ * The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
+ */
+
+void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
+                         DCTELEM *workspace)
+{
+  uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
+  uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
+  uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
+  uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
+  uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
+  uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
+  uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
+  uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);
+
+  int16x8_t row0 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row1 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row2 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row3 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row4 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row5 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row6 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row7 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));
+
+  vst1q_s16(workspace + 0 * DCTSIZE, row0);
+  vst1q_s16(workspace + 1 * DCTSIZE, row1);
+  vst1q_s16(workspace + 2 * DCTSIZE, row2);
+  vst1q_s16(workspace + 3 * DCTSIZE, row3);
+  vst1q_s16(workspace + 4 * DCTSIZE, row4);
+  vst1q_s16(workspace + 5 * DCTSIZE, row5);
+  vst1q_s16(workspace + 6 * DCTSIZE, row6);
+  vst1q_s16(workspace + 7 * DCTSIZE, row7);
+}
+
+
+/* After the DCT, the resulting array of coefficient values needs to be divided
+ * by an array of quantization values.
+ *
+ * To avoid a slow division operation, the DCT coefficients are multiplied by
+ * the (scaled) reciprocals of the quantization values and then right-shifted.
+ *
+ * The equivalent scalar C function quantize() can be found in jcdctmgr.c.
+ */
+
+void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
+                         DCTELEM *workspace)
+{
+  JCOEFPTR out_ptr = coef_block;
+  UDCTELEM *recip_ptr = (UDCTELEM *)divisors;
+  UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2;
+  DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
+  int i;
+
+#if defined(__clang__) && (defined(__aarch64__) || defined(_M_ARM64))
+#pragma unroll
+#endif
+  for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
+    /* Load DCT coefficients. */
+    int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
+    int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
+    int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);
+    int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);
+    /* Load reciprocals of quantization values. */
+    uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);
+    uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);
+    uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);
+    uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);
+    uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);
+    uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);
+    uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);
+    uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);
+    int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);
+    int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);
+    int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);
+    int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);
+
+    /* Extract sign from coefficients. */
+    int16x8_t sign_row0 = vshrq_n_s16(row0, 15);
+    int16x8_t sign_row1 = vshrq_n_s16(row1, 15);
+    int16x8_t sign_row2 = vshrq_n_s16(row2, 15);
+    int16x8_t sign_row3 = vshrq_n_s16(row3, 15);
+    /* Get absolute value of DCT coefficients. */
+    uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));
+    uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));
+    uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));
+    uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));
+    /* Add correction. */
+    abs_row0 = vaddq_u16(abs_row0, corr0);
+    abs_row1 = vaddq_u16(abs_row1, corr1);
+    abs_row2 = vaddq_u16(abs_row2, corr2);
+    abs_row3 = vaddq_u16(abs_row3, corr3);
+
+    /* Multiply DCT coefficients by quantization reciprocals. */
+    int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
+                                                       vget_low_u16(recip0)));
+    int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
+                                                       vget_high_u16(recip0)));
+    int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),
+                                                       vget_low_u16(recip1)));
+    int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),
+                                                       vget_high_u16(recip1)));
+    int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),
+                                                       vget_low_u16(recip2)));
+    int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),
+                                                       vget_high_u16(recip2)));
+    int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),
+                                                       vget_low_u16(recip3)));
+    int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),
+                                                       vget_high_u16(recip3)));
+    /* Narrow back to 16-bit. */
+    row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));
+    row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));
+    row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
+    row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));
+
+    /* Since VSHR only supports an immediate as its second argument, negate the
+     * shift value and shift left.
+     */
+    row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
+                                           vnegq_s16(shift0)));
+    row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
+                                           vnegq_s16(shift1)));
+    row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),
+                                           vnegq_s16(shift2)));
+    row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),
+                                           vnegq_s16(shift3)));
+
+    /* Restore sign to original product. */
+    row0 = veorq_s16(row0, sign_row0);
+    row0 = vsubq_s16(row0, sign_row0);
+    row1 = veorq_s16(row1, sign_row1);
+    row1 = vsubq_s16(row1, sign_row1);
+    row2 = veorq_s16(row2, sign_row2);
+    row2 = vsubq_s16(row2, sign_row2);
+    row3 = veorq_s16(row3, sign_row3);
+    row3 = vsubq_s16(row3, sign_row3);
+
+    /* Store quantized coefficients to memory. */
+    vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);
+    vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);
+    vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);
+    vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);
+  }
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/neon-compat.h.in b/3rdparty/libjpeg-turbo/src/simd/arm/neon-compat.h.in
new file mode 100644
index 0000000000..d403f2289f
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/neon-compat.h.in
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#cmakedefine HAVE_VLD1_S16_X3
+#cmakedefine HAVE_VLD1_U16_X2
+#cmakedefine HAVE_VLD1Q_U8_X4
+
+/* Define compiler-independent count-leading-zeros and byte-swap macros */
+#if defined(_MSC_VER) && !defined(__clang__)
+#define BUILTIN_CLZ(x)  _CountLeadingZeros(x)
+#define BUILTIN_CLZLL(x)  _CountLeadingZeros64(x)
+#define BUILTIN_BSWAP64(x)  _byteswap_uint64(x)
+#elif defined(__clang__) || defined(__GNUC__)
+#define BUILTIN_CLZ(x)  __builtin_clz(x)
+#define BUILTIN_CLZLL(x)  __builtin_clzll(x)
+#define BUILTIN_BSWAP64(x)  __builtin_bswap64(x)
+#else
+#error "Unknown compiler"
+#endif
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-avx2.asm
new file mode 100644
index 0000000000..c46d684436
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-avx2.asm
@@ -0,0 +1,578 @@
+;
+; jccolext.asm - colorspace conversion (AVX2)
+;
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                            int num_rows);
+;
+
+%define img_width(b)   (b) + 8          ; JDIMENSION img_width
+%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
+%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
+%define output_row(b)  (b) + 20         ; JDIMENSION output_row
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+                                        ; ymmword wk[WK_NUM]
+%define WK_NUM         8
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2)
+
+EXTN(jsimd_rgb_ycc_convert_avx2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [img_width(eax)]
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         esi, JSAMPIMAGE [output_buf(eax)]
+    mov         ecx, JDIMENSION [output_row(eax)]
+    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]
+    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         esi, JSAMPARRAY [input_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    pushpic     eax
+    push        edx
+    push        ebx
+    push        edi
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr0
+    mov         ebx, JSAMPROW [ebx]     ; outptr1
+    mov         edx, JSAMPROW [edx]     ; outptr2
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    alignx      16, 7
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        eax
+    push        edx
+    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_BYTE
+    movzx       eax, byte [esi+ecx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_WORD
+    movzx       edx, word [esi+ecx]
+    shl         eax, WORD_BIT
+    or          eax, edx
+.column_ld4:
+    vmovd       xmmA, eax
+    pop         edx
+    pop         eax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_DWORD
+    vmovd       xmmF, XMM_DWORD [esi+ecx]
+    vpslldq     xmmA, xmmA, SIZEOF_DWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         ecx, byte SIZEOF_MMWORD
+    vmovq       xmmB, XMM_MMWORD [esi+ecx]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    sub         ecx, byte SIZEOF_XMMWORD
+    vmovdqu     xmmB, XMM_MMWORD [esi+ecx]
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    vpor        ymmA, ymmB
+.column_ld32:
+    test        cl, SIZEOF_YMMWORD
+    jz          short .column_ld64
+    sub         ecx, byte SIZEOF_YMMWORD
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+.column_ld64:
+    test        cl, 2*SIZEOF_YMMWORD
+    mov         ecx, SIZEOF_YMMWORD
+    jz          short .rgb_ycc_cnv
+    vmovdqa     ymmB, ymmA
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_ycc_cnv
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmB, YMMWORD [esi+2*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vmovdqu     ymmC, ymmA
+    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+    vmovdqa     ymmG, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+    vmovdqa     ymmD, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+    vmovdqa     ymmE, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+    vpxor       ymmH, ymmH, ymmH
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmB, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+    vmovdqa     ymmF, ymmD
+    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_XMMWORD/16
+    vmovd       xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_XMMWORD/8
+    vmovq       xmmF, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_XMMWORD/4
+    vmovdqa     xmmF, xmmA
+    vperm2i128  ymmF, ymmF, ymmF, 1
+    vmovdqu     xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+    vpor        ymmA, ymmA, ymmF
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    jz          short .column_ld16
+    sub         ecx, byte SIZEOF_XMMWORD/2
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    mov         ecx, SIZEOF_YMMWORD
+    jz          short .rgb_ycc_cnv
+    vmovdqa     ymmE, ymmA
+    vmovdqa     ymmH, ymmF
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_ycc_cnv
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmE, YMMWORD [esi+2*SIZEOF_YMMWORD]
+    vmovdqu     ymmH, YMMWORD [esi+3*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmB, ymmA
+    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+    vmovdqa     ymmB, ymmF
+    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmD, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+                                      ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+    vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+                                      ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+    vmovdqa     ymmC, ymmF
+    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+                                      ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+    vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+                                      ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+    vmovdqa     ymmB, ymmA
+    vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+                                      ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+    vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+                                      ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+    vmovdqa     ymmG, ymmD
+    vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+                                      ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+    vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+                                      ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+    vmovdqa     ymmE, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                      ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+                                      ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vmovdqa     ymmH, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+                                      ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+                                      ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+    vpxor       ymmF, ymmF, ymmF
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmD, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+    vmovdqa     ymmG, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vpunpcklbw  ymmF, ymmF, ymmH
+    vpunpckhbw  ymmH, ymmH, ymmH
+    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=RE
+    vmovdqa     YMMWORD [wk(1)], ymm1   ; wk(1)=RO
+    vmovdqa     YMMWORD [wk(2)], ymm4   ; wk(2)=BE
+    vmovdqa     YMMWORD [wk(3)], ymm5   ; wk(3)=BO
+
+    vmovdqa     ymm6, ymm1
+    vpunpcklwd  ymm1, ymm1, ymm3
+    vpunpckhwd  ymm6, ymm6, ymm3
+    vmovdqa     ymm7, ymm1
+    vmovdqa     ymm4, ymm6
+    vpmaddwd    ymm1, ymm1, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+    vpmaddwd    ymm7, ymm7, [GOTOFF(eax,PW_MF016_MF033)]  ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_MF016_MF033)]  ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+    vmovdqa     YMMWORD [wk(4)], ymm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+    vmovdqa     YMMWORD [wk(5)], ymm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vpxor       ymm1, ymm1, ymm1
+    vpxor       ymm6, ymm6, ymm6
+    vpunpcklwd  ymm1, ymm1, ymm5        ; ymm1=BOL
+    vpunpckhwd  ymm6, ymm6, ymm5        ; ymm6=BOH
+    vpsrld      ymm1, ymm1, 1           ; ymm1=BOL*FIX(0.500)
+    vpsrld      ymm6, ymm6, 1           ; ymm6=BOH*FIX(0.500)
+
+    vmovdqa     ymm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; ymm5=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm7, ymm7, ymm1
+    vpaddd      ymm4, ymm4, ymm6
+    vpaddd      ymm7, ymm7, ymm5
+    vpaddd      ymm4, ymm4, ymm5
+    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CbOL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbOH
+    vpackssdw   ymm7, ymm7, ymm4        ; ymm7=CbO
+
+    vmovdqa     ymm1, YMMWORD [wk(2)]   ; ymm1=BE
+
+    vmovdqa     ymm6, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm2
+    vpunpckhwd  ymm6, ymm6, ymm2
+    vmovdqa     ymm5, ymm0
+    vmovdqa     ymm4, ymm6
+    vpmaddwd    ymm0, ymm0, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+    vpmaddwd    ymm5, ymm5, [GOTOFF(eax,PW_MF016_MF033)]  ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_MF016_MF033)]  ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+    vmovdqa     YMMWORD [wk(6)], ymm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+    vmovdqa     YMMWORD [wk(7)], ymm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vpxor       ymm0, ymm0, ymm0
+    vpxor       ymm6, ymm6, ymm6
+    vpunpcklwd  ymm0, ymm0, ymm1        ; ymm0=BEL
+    vpunpckhwd  ymm6, ymm6, ymm1        ; ymm6=BEH
+    vpsrld      ymm0, ymm0, 1           ; ymm0=BEL*FIX(0.500)
+    vpsrld      ymm6, ymm6, 1           ; ymm6=BEH*FIX(0.500)
+
+    vmovdqa     ymm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; ymm1=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm5, ymm5, ymm0
+    vpaddd      ymm4, ymm4, ymm6
+    vpaddd      ymm5, ymm5, ymm1
+    vpaddd      ymm4, ymm4, ymm1
+    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CbEL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbEH
+    vpackssdw   ymm5, ymm5, ymm4        ; ymm5=CbE
+
+    vpsllw      ymm7, ymm7, BYTE_BIT
+    vpor        ymm5, ymm5, ymm7        ; ymm5=Cb
+    vmovdqu     YMMWORD [ebx], ymm5     ; Save Cb
+
+    vmovdqa     ymm0, YMMWORD [wk(3)]   ; ymm0=BO
+    vmovdqa     ymm6, YMMWORD [wk(2)]   ; ymm6=BE
+    vmovdqa     ymm1, YMMWORD [wk(1)]   ; ymm1=RO
+
+    vmovdqa     ymm4, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm3
+    vpunpckhwd  ymm4, ymm4, ymm3
+    vmovdqa     ymm7, ymm0
+    vmovdqa     ymm5, ymm4
+    vpmaddwd    ymm0, ymm0, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+    vpmaddwd    ymm7, ymm7, [GOTOFF(eax,PW_MF008_MF041)]  ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+    vpmaddwd    ymm5, ymm5, [GOTOFF(eax,PW_MF008_MF041)]  ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+    vmovdqa     ymm3, [GOTOFF(eax,PD_ONEHALF)]            ; ymm3=[PD_ONEHALF]
+
+    vpaddd      ymm0, ymm0, YMMWORD [wk(4)]
+    vpaddd      ymm4, ymm4, YMMWORD [wk(5)]
+    vpaddd      ymm0, ymm0, ymm3
+    vpaddd      ymm4, ymm4, ymm3
+    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
+    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO
+
+    vpxor       ymm3, ymm3, ymm3
+    vpxor       ymm4, ymm4, ymm4
+    vpunpcklwd  ymm3, ymm3, ymm1        ; ymm3=ROL
+    vpunpckhwd  ymm4, ymm4, ymm1        ; ymm4=ROH
+    vpsrld      ymm3, ymm3, 1           ; ymm3=ROL*FIX(0.500)
+    vpsrld      ymm4, ymm4, 1           ; ymm4=ROH*FIX(0.500)
+
+    vmovdqa     ymm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; ymm1=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm7, ymm7, ymm3
+    vpaddd      ymm5, ymm5, ymm4
+    vpaddd      ymm7, ymm7, ymm1
+    vpaddd      ymm5, ymm5, ymm1
+    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CrOL
+    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrOH
+    vpackssdw   ymm7, ymm7, ymm5        ; ymm7=CrO
+
+    vmovdqa     ymm3, YMMWORD [wk(0)]   ; ymm3=RE
+
+    vmovdqa     ymm4, ymm6
+    vpunpcklwd  ymm6, ymm6, ymm2
+    vpunpckhwd  ymm4, ymm4, ymm2
+    vmovdqa     ymm1, ymm6
+    vmovdqa     ymm5, ymm4
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+    vpmaddwd    ymm1, ymm1, [GOTOFF(eax,PW_MF008_MF041)]  ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+    vpmaddwd    ymm5, ymm5, [GOTOFF(eax,PW_MF008_MF041)]  ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+    vmovdqa     ymm2, [GOTOFF(eax,PD_ONEHALF)]            ; ymm2=[PD_ONEHALF]
+
+    vpaddd      ymm6, ymm6, YMMWORD [wk(6)]
+    vpaddd      ymm4, ymm4, YMMWORD [wk(7)]
+    vpaddd      ymm6, ymm6, ymm2
+    vpaddd      ymm4, ymm4, ymm2
+    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
+    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE
+
+    vpsllw      ymm0, ymm0, BYTE_BIT
+    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
+    vmovdqu     YMMWORD [edi], ymm6     ; Save Y
+
+    vpxor       ymm2, ymm2, ymm2
+    vpxor       ymm4, ymm4, ymm4
+    vpunpcklwd  ymm2, ymm2, ymm3        ; ymm2=REL
+    vpunpckhwd  ymm4, ymm4, ymm3        ; ymm4=REH
+    vpsrld      ymm2, ymm2, 1           ; ymm2=REL*FIX(0.500)
+    vpsrld      ymm4, ymm4, 1           ; ymm4=REH*FIX(0.500)
+
+    vmovdqa     ymm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; ymm0=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm1, ymm1, ymm2
+    vpaddd      ymm5, ymm5, ymm4
+    vpaddd      ymm1, ymm1, ymm0
+    vpaddd      ymm5, ymm5, ymm0
+    vpsrld      ymm1, ymm1, SCALEBITS   ; ymm1=CrEL
+    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrEH
+    vpackssdw   ymm1, ymm1, ymm5        ; ymm1=CrE
+
+    vpsllw      ymm7, ymm7, BYTE_BIT
+    vpor        ymm1, ymm1, ymm7        ; ymm1=Cr
+    vmovdqu     YMMWORD [edx], ymm1     ; Save Cr
+
+    sub         ecx, byte SIZEOF_YMMWORD
+    add         esi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
+    add         edi, byte SIZEOF_YMMWORD           ; outptr0
+    add         ebx, byte SIZEOF_YMMWORD           ; outptr1
+    add         edx, byte SIZEOF_YMMWORD           ; outptr2
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .column_ld1
+
+    pop         ecx                     ; col
+    pop         esi
+    pop         edi
+    pop         ebx
+    pop         edx
+    poppic      eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         edi, byte SIZEOF_JSAMPROW
+    add         ebx, byte SIZEOF_JSAMPROW
+    add         edx, byte SIZEOF_JSAMPROW
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-mmx.asm
new file mode 100644
index 0000000000..6357a42b2c
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-mmx.asm
@@ -0,0 +1,476 @@
+;
+; jccolext.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                           JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                           int num_rows);
+;
+
+%define img_width(b)   (b) + 8          ; JDIMENSION img_width
+%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
+%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
+%define output_row(b)  (b) + 20         ; JDIMENSION output_row
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         8
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_mmx)
+
+EXTN(jsimd_rgb_ycc_convert_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [img_width(eax)]  ; num_cols
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         esi, JSAMPIMAGE [output_buf(eax)]
+    mov         ecx, JDIMENSION [output_row(eax)]
+    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]
+    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         esi, JSAMPARRAY [input_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    pushpic     eax
+    push        edx
+    push        ebx
+    push        edi
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr0
+    mov         ebx, JSAMPROW [ebx]     ; outptr1
+    mov         edx, JSAMPROW [edx]     ; outptr2
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    cmp         ecx, byte SIZEOF_MMWORD
+    jae         short .columnloop
+    alignx      16, 7
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        eax
+    push        edx
+    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_BYTE
+    xor         eax, eax
+    mov         al, byte [esi+ecx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_WORD
+    xor         edx, edx
+    mov         dx, word [esi+ecx]
+    shl         eax, WORD_BIT
+    or          eax, edx
+.column_ld4:
+    movd        mmA, eax
+    pop         edx
+    pop         eax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_DWORD
+    movd        mmG, dword [esi+ecx]
+    psllq       mmA, DWORD_BIT
+    por         mmA, mmG
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    movq        mmG, mmA
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    mov         ecx, SIZEOF_MMWORD
+    jmp         short .rgb_ycc_cnv
+.column_ld16:
+    test        cl, 2*SIZEOF_MMWORD
+    mov         ecx, SIZEOF_MMWORD
+    jz          short .rgb_ycc_cnv
+    movq        mmF, mmA
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+    jmp         short .rgb_ycc_cnv
+    alignx      16, 7
+
+.columnloop:
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+    movq        mmF, MMWORD [esi+2*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+    ; mmA=(00 10 20 01 11 21 02 12)
+    ; mmG=(22 03 13 23 04 14 24 05)
+    ; mmF=(15 25 06 16 26 07 17 27)
+
+    movq        mmD, mmA
+    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 10 20 01)
+    psrlq       mmD, 4*BYTE_BIT         ; mmD=(11 21 02 12 -- -- -- --)
+
+    punpckhbw   mmA, mmG                ; mmA=(00 04 10 14 20 24 01 05)
+    psllq       mmG, 4*BYTE_BIT         ; mmG=(-- -- -- -- 22 03 13 23)
+
+    punpcklbw   mmD, mmF                ; mmD=(11 15 21 25 02 06 12 16)
+    punpckhbw   mmG, mmF                ; mmG=(22 26 03 07 13 17 23 27)
+
+    movq        mmE, mmA
+    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 04 10 14)
+    psrlq       mmE, 4*BYTE_BIT         ; mmE=(20 24 01 05 -- -- -- --)
+
+    punpckhbw   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
+    psllq       mmD, 4*BYTE_BIT         ; mmD=(-- -- -- -- 11 15 21 25)
+
+    punpcklbw   mmE, mmG                ; mmE=(20 22 24 26 01 03 05 07)
+    punpckhbw   mmD, mmG                ; mmD=(11 13 15 17 21 23 25 27)
+
+    pxor        mmH, mmH
+
+    movq        mmC, mmA
+    punpcklbw   mmA, mmH                ; mmA=(00 02 04 06)
+    punpckhbw   mmC, mmH                ; mmC=(10 12 14 16)
+
+    movq        mmB, mmE
+    punpcklbw   mmE, mmH                ; mmE=(20 22 24 26)
+    punpckhbw   mmB, mmH                ; mmB=(01 03 05 07)
+
+    movq        mmF, mmD
+    punpcklbw   mmD, mmH                ; mmD=(11 13 15 17)
+    punpckhbw   mmF, mmH                ; mmF=(21 23 25 27)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_MMWORD/8
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_MMWORD/8
+    movd        mmA, dword [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_MMWORD/4
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_MMWORD/4
+    movq        mmF, mmA
+    movq        mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld4:
+    test        cl, SIZEOF_MMWORD/2
+    mov         ecx, SIZEOF_MMWORD
+    jz          short .rgb_ycc_cnv
+    movq        mmD, mmA
+    movq        mmC, mmF
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+    jmp         short .rgb_ycc_cnv
+    alignx      16, 7
+
+.columnloop:
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+    movq        mmD, MMWORD [esi+2*SIZEOF_MMWORD]
+    movq        mmC, MMWORD [esi+3*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+    ; mmA=(00 10 20 30 01 11 21 31)
+    ; mmF=(02 12 22 32 03 13 23 33)
+    ; mmD=(04 14 24 34 05 15 25 35)
+    ; mmC=(06 16 26 36 07 17 27 37)
+
+    movq        mmB, mmA
+    punpcklbw   mmA, mmF                ; mmA=(00 02 10 12 20 22 30 32)
+    punpckhbw   mmB, mmF                ; mmB=(01 03 11 13 21 23 31 33)
+
+    movq        mmG, mmD
+    punpcklbw   mmD, mmC                ; mmD=(04 06 14 16 24 26 34 36)
+    punpckhbw   mmG, mmC                ; mmG=(05 07 15 17 25 27 35 37)
+
+    movq        mmE, mmA
+    punpcklwd   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
+    punpckhwd   mmE, mmD                ; mmE=(20 22 24 26 30 32 34 36)
+
+    movq        mmH, mmB
+    punpcklwd   mmB, mmG                ; mmB=(01 03 05 07 11 13 15 17)
+    punpckhwd   mmH, mmG                ; mmH=(21 23 25 27 31 33 35 37)
+
+    pxor        mmF, mmF
+
+    movq        mmC, mmA
+    punpcklbw   mmA, mmF                ; mmA=(00 02 04 06)
+    punpckhbw   mmC, mmF                ; mmC=(10 12 14 16)
+
+    movq        mmD, mmB
+    punpcklbw   mmB, mmF                ; mmB=(01 03 05 07)
+    punpckhbw   mmD, mmF                ; mmD=(11 13 15 17)
+
+    movq        mmG, mmE
+    punpcklbw   mmE, mmF                ; mmE=(20 22 24 26)
+    punpckhbw   mmG, mmF                ; mmG=(30 32 34 36)
+
+    punpcklbw   mmF, mmH
+    punpckhbw   mmH, mmH
+    psrlw       mmF, BYTE_BIT           ; mmF=(21 23 25 27)
+    psrlw       mmH, BYTE_BIT           ; mmH=(31 33 35 37)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
+    ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+    movq        MMWORD [wk(0)], mm0     ; wk(0)=RE
+    movq        MMWORD [wk(1)], mm1     ; wk(1)=RO
+    movq        MMWORD [wk(2)], mm4     ; wk(2)=BE
+    movq        MMWORD [wk(3)], mm5     ; wk(3)=BO
+
+    movq        mm6, mm1
+    punpcklwd   mm1, mm3
+    punpckhwd   mm6, mm3
+    movq        mm7, mm1
+    movq        mm4, mm6
+    pmaddwd     mm1, [GOTOFF(eax,PW_F0299_F0337)]  ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+    pmaddwd     mm7, [GOTOFF(eax,PW_MF016_MF033)]  ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+    pmaddwd     mm4, [GOTOFF(eax,PW_MF016_MF033)]  ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+    movq        MMWORD [wk(4)], mm1     ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+    movq        MMWORD [wk(5)], mm6     ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    pxor        mm1, mm1
+    pxor        mm6, mm6
+    punpcklwd   mm1, mm5                ; mm1=BOL
+    punpckhwd   mm6, mm5                ; mm6=BOH
+    psrld       mm1, 1                  ; mm1=BOL*FIX(0.500)
+    psrld       mm6, 1                  ; mm6=BOH*FIX(0.500)
+
+    movq        mm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm5=[PD_ONEHALFM1_CJ]
+
+    paddd       mm7, mm1
+    paddd       mm4, mm6
+    paddd       mm7, mm5
+    paddd       mm4, mm5
+    psrld       mm7, SCALEBITS          ; mm7=CbOL
+    psrld       mm4, SCALEBITS          ; mm4=CbOH
+    packssdw    mm7, mm4                ; mm7=CbO
+
+    movq        mm1, MMWORD [wk(2)]     ; mm1=BE
+
+    movq        mm6, mm0
+    punpcklwd   mm0, mm2
+    punpckhwd   mm6, mm2
+    movq        mm5, mm0
+    movq        mm4, mm6
+    pmaddwd     mm0, [GOTOFF(eax,PW_F0299_F0337)]  ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
+    pmaddwd     mm5, [GOTOFF(eax,PW_MF016_MF033)]  ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+    pmaddwd     mm4, [GOTOFF(eax,PW_MF016_MF033)]  ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+    movq        MMWORD [wk(6)], mm0     ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+    movq        MMWORD [wk(7)], mm6     ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    pxor        mm0, mm0
+    pxor        mm6, mm6
+    punpcklwd   mm0, mm1                ; mm0=BEL
+    punpckhwd   mm6, mm1                ; mm6=BEH
+    psrld       mm0, 1                  ; mm0=BEL*FIX(0.500)
+    psrld       mm6, 1                  ; mm6=BEH*FIX(0.500)
+
+    movq        mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm1=[PD_ONEHALFM1_CJ]
+
+    paddd       mm5, mm0
+    paddd       mm4, mm6
+    paddd       mm5, mm1
+    paddd       mm4, mm1
+    psrld       mm5, SCALEBITS          ; mm5=CbEL
+    psrld       mm4, SCALEBITS          ; mm4=CbEH
+    packssdw    mm5, mm4                ; mm5=CbE
+
+    psllw       mm7, BYTE_BIT
+    por         mm5, mm7                ; mm5=Cb
+    movq        MMWORD [ebx], mm5       ; Save Cb
+
+    movq        mm0, MMWORD [wk(3)]     ; mm0=BO
+    movq        mm6, MMWORD [wk(2)]     ; mm6=BE
+    movq        mm1, MMWORD [wk(1)]     ; mm1=RO
+
+    movq        mm4, mm0
+    punpcklwd   mm0, mm3
+    punpckhwd   mm4, mm3
+    movq        mm7, mm0
+    movq        mm5, mm4
+    pmaddwd     mm0, [GOTOFF(eax,PW_F0114_F0250)]  ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+    pmaddwd     mm7, [GOTOFF(eax,PW_MF008_MF041)]  ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+    pmaddwd     mm5, [GOTOFF(eax,PW_MF008_MF041)]  ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+    movq        mm3, [GOTOFF(eax,PD_ONEHALF)]  ; mm3=[PD_ONEHALF]
+
+    paddd       mm0, MMWORD [wk(4)]
+    paddd       mm4, MMWORD [wk(5)]
+    paddd       mm0, mm3
+    paddd       mm4, mm3
+    psrld       mm0, SCALEBITS          ; mm0=YOL
+    psrld       mm4, SCALEBITS          ; mm4=YOH
+    packssdw    mm0, mm4                ; mm0=YO
+
+    pxor        mm3, mm3
+    pxor        mm4, mm4
+    punpcklwd   mm3, mm1                ; mm3=ROL
+    punpckhwd   mm4, mm1                ; mm4=ROH
+    psrld       mm3, 1                  ; mm3=ROL*FIX(0.500)
+    psrld       mm4, 1                  ; mm4=ROH*FIX(0.500)
+
+    movq        mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm1=[PD_ONEHALFM1_CJ]
+
+    paddd       mm7, mm3
+    paddd       mm5, mm4
+    paddd       mm7, mm1
+    paddd       mm5, mm1
+    psrld       mm7, SCALEBITS          ; mm7=CrOL
+    psrld       mm5, SCALEBITS          ; mm5=CrOH
+    packssdw    mm7, mm5                ; mm7=CrO
+
+    movq        mm3, MMWORD [wk(0)]     ; mm3=RE
+
+    movq        mm4, mm6
+    punpcklwd   mm6, mm2
+    punpckhwd   mm4, mm2
+    movq        mm1, mm6
+    movq        mm5, mm4
+    pmaddwd     mm6, [GOTOFF(eax,PW_F0114_F0250)]  ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+    pmaddwd     mm1, [GOTOFF(eax,PW_MF008_MF041)]  ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+    pmaddwd     mm5, [GOTOFF(eax,PW_MF008_MF041)]  ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+    movq        mm2, [GOTOFF(eax,PD_ONEHALF)]      ; mm2=[PD_ONEHALF]
+
+    paddd       mm6, MMWORD [wk(6)]
+    paddd       mm4, MMWORD [wk(7)]
+    paddd       mm6, mm2
+    paddd       mm4, mm2
+    psrld       mm6, SCALEBITS          ; mm6=YEL
+    psrld       mm4, SCALEBITS          ; mm4=YEH
+    packssdw    mm6, mm4                ; mm6=YE
+
+    psllw       mm0, BYTE_BIT
+    por         mm6, mm0                ; mm6=Y
+    movq        MMWORD [edi], mm6       ; Save Y
+
+    pxor        mm2, mm2
+    pxor        mm4, mm4
+    punpcklwd   mm2, mm3                ; mm2=REL
+    punpckhwd   mm4, mm3                ; mm4=REH
+    psrld       mm2, 1                  ; mm2=REL*FIX(0.500)
+    psrld       mm4, 1                  ; mm4=REH*FIX(0.500)
+
+    movq        mm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm0=[PD_ONEHALFM1_CJ]
+
+    paddd       mm1, mm2
+    paddd       mm5, mm4
+    paddd       mm1, mm0
+    paddd       mm5, mm0
+    psrld       mm1, SCALEBITS          ; mm1=CrEL
+    psrld       mm5, SCALEBITS          ; mm5=CrEH
+    packssdw    mm1, mm5                ; mm1=CrE
+
+    psllw       mm7, BYTE_BIT
+    por         mm1, mm7                ; mm1=Cr
+    movq        MMWORD [edx], mm1       ; Save Cr
+
+    sub         ecx, byte SIZEOF_MMWORD
+    add         esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; inptr
+    add         edi, byte SIZEOF_MMWORD                ; outptr0
+    add         ebx, byte SIZEOF_MMWORD                ; outptr1
+    add         edx, byte SIZEOF_MMWORD                ; outptr2
+    cmp         ecx, byte SIZEOF_MMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .column_ld1
+
+    pop         ecx                     ; col
+    pop         esi
+    pop         edi
+    pop         ebx
+    pop         edx
+    poppic      eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         edi, byte SIZEOF_JSAMPROW
+    add         ebx, byte SIZEOF_JSAMPROW
+    add         edx, byte SIZEOF_JSAMPROW
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-sse2.asm
new file mode 100644
index 0000000000..c6c80852ac
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-sse2.asm
@@ -0,0 +1,503 @@
+;
+; jccolext.asm - colorspace conversion (SSE2)
+;
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                            int num_rows);
+;
+
+%define img_width(b)   (b) + 8          ; JDIMENSION img_width
+%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
+%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
+%define output_row(b)  (b) + 20         ; JDIMENSION output_row
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         8
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [img_width(eax)]
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         esi, JSAMPIMAGE [output_buf(eax)]
+    mov         ecx, JDIMENSION [output_row(eax)]
+    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]
+    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         esi, JSAMPARRAY [input_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    pushpic     eax
+    push        edx
+    push        ebx
+    push        edi
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr0
+    mov         ebx, JSAMPROW [ebx]     ; outptr1
+    mov         edx, JSAMPROW [edx]     ; outptr2
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+    alignx      16, 7
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        eax
+    push        edx
+    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_BYTE
+    movzx       eax, byte [esi+ecx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_WORD
+    movzx       edx, word [esi+ecx]
+    shl         eax, WORD_BIT
+    or          eax, edx
+.column_ld4:
+    movd        xmmA, eax
+    pop         edx
+    pop         eax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_DWORD
+    movd        xmmF, XMM_DWORD [esi+ecx]
+    pslldq      xmmA, SIZEOF_DWORD
+    por         xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         ecx, byte SIZEOF_MMWORD
+    movq        xmmB, XMM_MMWORD [esi+ecx]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    movdqa      xmmF, xmmA
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    mov         ecx, SIZEOF_XMMWORD
+    jmp         short .rgb_ycc_cnv
+.column_ld32:
+    test        cl, 2*SIZEOF_XMMWORD
+    mov         ecx, SIZEOF_XMMWORD
+    jz          short .rgb_ycc_cnv
+    movdqa      xmmB, xmmA
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_ycc_cnv
+    alignx      16, 7
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    movdqu      xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    movdqa      xmmG, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+    movdqa      xmmD, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+    movdqa      xmmE, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+    pxor        xmmH, xmmH
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmB, xmmE
+    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+    movdqa      xmmF, xmmD
+    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_XMMWORD/16
+    movd        xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_XMMWORD/8
+    movq        xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmE
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_XMMWORD/4
+    movdqa      xmmE, xmmA
+    movdqu      xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    mov         ecx, SIZEOF_XMMWORD
+    jz          short .rgb_ycc_cnv
+    movdqa      xmmF, xmmA
+    movdqa      xmmH, xmmE
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_ycc_cnv
+    alignx      16, 7
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+    movdqu      xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+    punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+    movdqa      xmmC, xmmF
+    punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+    punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+    movdqa      xmmB, xmmA
+    punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+    punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+    movdqa      xmmG, xmmD
+    punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+    punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+    movdqa      xmmE, xmmA
+    punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+    movdqa      xmmH, xmmB
+    punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+    pxor        xmmF, xmmF
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmD, xmmB
+    punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+    movdqa      xmmG, xmmE
+    punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+    punpcklbw   xmmF, xmmH
+    punpckhbw   xmmH, xmmH
+    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=RE
+    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=RO
+    movdqa      XMMWORD [wk(2)], xmm4   ; wk(2)=BE
+    movdqa      XMMWORD [wk(3)], xmm5   ; wk(3)=BO
+
+    movdqa      xmm6, xmm1
+    punpcklwd   xmm1, xmm3
+    punpckhwd   xmm6, xmm3
+    movdqa      xmm7, xmm1
+    movdqa      xmm4, xmm6
+    pmaddwd     xmm1, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    pmaddwd     xmm6, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+    pmaddwd     xmm7, [GOTOFF(eax,PW_MF016_MF033)]  ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+    pmaddwd     xmm4, [GOTOFF(eax,PW_MF016_MF033)]  ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+    movdqa      XMMWORD [wk(4)], xmm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+    movdqa      XMMWORD [wk(5)], xmm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    pxor        xmm1, xmm1
+    pxor        xmm6, xmm6
+    punpcklwd   xmm1, xmm5              ; xmm1=BOL
+    punpckhwd   xmm6, xmm5              ; xmm6=BOH
+    psrld       xmm1, 1                 ; xmm1=BOL*FIX(0.500)
+    psrld       xmm6, 1                 ; xmm6=BOH*FIX(0.500)
+
+    movdqa      xmm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; xmm5=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm7, xmm1
+    paddd       xmm4, xmm6
+    paddd       xmm7, xmm5
+    paddd       xmm4, xmm5
+    psrld       xmm7, SCALEBITS         ; xmm7=CbOL
+    psrld       xmm4, SCALEBITS         ; xmm4=CbOH
+    packssdw    xmm7, xmm4              ; xmm7=CbO
+
+    movdqa      xmm1, XMMWORD [wk(2)]   ; xmm1=BE
+
+    movdqa      xmm6, xmm0
+    punpcklwd   xmm0, xmm2
+    punpckhwd   xmm6, xmm2
+    movdqa      xmm5, xmm0
+    movdqa      xmm4, xmm6
+    pmaddwd     xmm0, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    pmaddwd     xmm6, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+    pmaddwd     xmm5, [GOTOFF(eax,PW_MF016_MF033)]  ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+    pmaddwd     xmm4, [GOTOFF(eax,PW_MF016_MF033)]  ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+    movdqa      XMMWORD [wk(6)], xmm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+    movdqa      XMMWORD [wk(7)], xmm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    pxor        xmm0, xmm0
+    pxor        xmm6, xmm6
+    punpcklwd   xmm0, xmm1              ; xmm0=BEL
+    punpckhwd   xmm6, xmm1              ; xmm6=BEH
+    psrld       xmm0, 1                 ; xmm0=BEL*FIX(0.500)
+    psrld       xmm6, 1                 ; xmm6=BEH*FIX(0.500)
+
+    movdqa      xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; xmm1=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm5, xmm0
+    paddd       xmm4, xmm6
+    paddd       xmm5, xmm1
+    paddd       xmm4, xmm1
+    psrld       xmm5, SCALEBITS         ; xmm5=CbEL
+    psrld       xmm4, SCALEBITS         ; xmm4=CbEH
+    packssdw    xmm5, xmm4              ; xmm5=CbE
+
+    psllw       xmm7, BYTE_BIT
+    por         xmm5, xmm7              ; xmm5=Cb
+    movdqa      XMMWORD [ebx], xmm5     ; Save Cb
+
+    movdqa      xmm0, XMMWORD [wk(3)]   ; xmm0=BO
+    movdqa      xmm6, XMMWORD [wk(2)]   ; xmm6=BE
+    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=RO
+
+    movdqa      xmm4, xmm0
+    punpcklwd   xmm0, xmm3
+    punpckhwd   xmm4, xmm3
+    movdqa      xmm7, xmm0
+    movdqa      xmm5, xmm4
+    pmaddwd     xmm0, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    pmaddwd     xmm4, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+    pmaddwd     xmm7, [GOTOFF(eax,PW_MF008_MF041)]  ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+    pmaddwd     xmm5, [GOTOFF(eax,PW_MF008_MF041)]  ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+    movdqa      xmm3, [GOTOFF(eax,PD_ONEHALF)]  ; xmm3=[PD_ONEHALF]
+
+    paddd       xmm0, XMMWORD [wk(4)]
+    paddd       xmm4, XMMWORD [wk(5)]
+    paddd       xmm0, xmm3
+    paddd       xmm4, xmm3
+    psrld       xmm0, SCALEBITS         ; xmm0=YOL
+    psrld       xmm4, SCALEBITS         ; xmm4=YOH
+    packssdw    xmm0, xmm4              ; xmm0=YO
+
+    pxor        xmm3, xmm3
+    pxor        xmm4, xmm4
+    punpcklwd   xmm3, xmm1              ; xmm3=ROL
+    punpckhwd   xmm4, xmm1              ; xmm4=ROH
+    psrld       xmm3, 1                 ; xmm3=ROL*FIX(0.500)
+    psrld       xmm4, 1                 ; xmm4=ROH*FIX(0.500)
+
+    movdqa      xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; xmm1=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm7, xmm3
+    paddd       xmm5, xmm4
+    paddd       xmm7, xmm1
+    paddd       xmm5, xmm1
+    psrld       xmm7, SCALEBITS         ; xmm7=CrOL
+    psrld       xmm5, SCALEBITS         ; xmm5=CrOH
+    packssdw    xmm7, xmm5              ; xmm7=CrO
+
+    movdqa      xmm3, XMMWORD [wk(0)]   ; xmm3=RE
+
+    movdqa      xmm4, xmm6
+    punpcklwd   xmm6, xmm2
+    punpckhwd   xmm4, xmm2
+    movdqa      xmm1, xmm6
+    movdqa      xmm5, xmm4
+    pmaddwd     xmm6, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    pmaddwd     xmm4, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+    pmaddwd     xmm1, [GOTOFF(eax,PW_MF008_MF041)]  ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+    pmaddwd     xmm5, [GOTOFF(eax,PW_MF008_MF041)]  ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+    movdqa      xmm2, [GOTOFF(eax,PD_ONEHALF)]      ; xmm2=[PD_ONEHALF]
+
+    paddd       xmm6, XMMWORD [wk(6)]
+    paddd       xmm4, XMMWORD [wk(7)]
+    paddd       xmm6, xmm2
+    paddd       xmm4, xmm2
+    psrld       xmm6, SCALEBITS         ; xmm6=YEL
+    psrld       xmm4, SCALEBITS         ; xmm4=YEH
+    packssdw    xmm6, xmm4              ; xmm6=YE
+
+    psllw       xmm0, BYTE_BIT
+    por         xmm6, xmm0              ; xmm6=Y
+    movdqa      XMMWORD [edi], xmm6     ; Save Y
+
+    pxor        xmm2, xmm2
+    pxor        xmm4, xmm4
+    punpcklwd   xmm2, xmm3              ; xmm2=REL
+    punpckhwd   xmm4, xmm3              ; xmm4=REH
+    psrld       xmm2, 1                 ; xmm2=REL*FIX(0.500)
+    psrld       xmm4, 1                 ; xmm4=REH*FIX(0.500)
+
+    movdqa      xmm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; xmm0=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm1, xmm2
+    paddd       xmm5, xmm4
+    paddd       xmm1, xmm0
+    paddd       xmm5, xmm0
+    psrld       xmm1, SCALEBITS         ; xmm1=CrEL
+    psrld       xmm5, SCALEBITS         ; xmm5=CrEH
+    packssdw    xmm1, xmm5              ; xmm1=CrE
+
+    psllw       xmm7, BYTE_BIT
+    por         xmm1, xmm7              ; xmm1=Cr
+    movdqa      XMMWORD [edx], xmm1     ; Save Cr
+
+    sub         ecx, byte SIZEOF_XMMWORD
+    add         esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
+    add         edi, byte SIZEOF_XMMWORD                ; outptr0
+    add         ebx, byte SIZEOF_XMMWORD                ; outptr1
+    add         edx, byte SIZEOF_XMMWORD                ; outptr2
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .column_ld1
+
+    pop         ecx                     ; col
+    pop         esi
+    pop         edi
+    pop         ebx
+    pop         edx
+    poppic      eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         edi, byte SIZEOF_JSAMPROW
+    add         ebx, byte SIZEOF_JSAMPROW
+    add         edx, byte SIZEOF_JSAMPROW
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-avx2.asm
new file mode 100644
index 0000000000..14944e952f
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-avx2.asm
@@ -0,0 +1,121 @@
+;
+; jccolor.asm - colorspace conversion (AVX2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_081 equ  5329                ; FIX(0.08131)
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_168 equ 11059                ; FIX(0.16874)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_331 equ 21709                ; FIX(0.33126)
+F_0_418 equ 27439                ; FIX(0.41869)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_ycc_convert_avx2)
+
+EXTN(jconst_rgb_ycc_convert_avx2):
+
+PW_F0299_F0337  times 8 dw  F_0_299,  F_0_337
+PW_F0114_F0250  times 8 dw  F_0_114,  F_0_250
+PW_MF016_MF033  times 8 dw -F_0_168, -F_0_331
+PW_MF008_MF041  times 8 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 8 dd  (1 << (SCALEBITS - 1)) - 1 + \
+                            (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF      times 8 dd  (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extrgb_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extrgbx_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extbgr_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extbgrx_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extxbgr_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extxrgb_ycc_convert_avx2
+%include "jccolext-avx2.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-mmx.asm
new file mode 100644
index 0000000000..8cb399bdc4
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-mmx.asm
@@ -0,0 +1,121 @@
+;
+; jccolor.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_081 equ  5329                ; FIX(0.08131)
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_168 equ 11059                ; FIX(0.16874)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_331 equ 21709                ; FIX(0.33126)
+F_0_418 equ 27439                ; FIX(0.41869)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_ycc_convert_mmx)
+
+EXTN(jconst_rgb_ycc_convert_mmx):
+
+PW_F0299_F0337  times 2 dw  F_0_299,  F_0_337
+PW_F0114_F0250  times 2 dw  F_0_114,  F_0_250
+PW_MF016_MF033  times 2 dw -F_0_168, -F_0_331
+PW_MF008_MF041  times 2 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 2 dd  (1 << (SCALEBITS - 1)) - 1 + \
+                            (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF      times 2 dd  (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx  jsimd_extrgb_ycc_convert_mmx
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx  jsimd_extrgbx_ycc_convert_mmx
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx  jsimd_extbgr_ycc_convert_mmx
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx  jsimd_extbgrx_ycc_convert_mmx
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx  jsimd_extxbgr_ycc_convert_mmx
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx  jsimd_extxrgb_ycc_convert_mmx
+%include "jccolext-mmx.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-sse2.asm
new file mode 100644
index 0000000000..686d222ff7
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-sse2.asm
@@ -0,0 +1,120 @@
+;
+; jccolor.asm - colorspace conversion (SSE2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_081 equ  5329                ; FIX(0.08131)
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_168 equ 11059                ; FIX(0.16874)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_331 equ 21709                ; FIX(0.33126)
+F_0_418 equ 27439                ; FIX(0.41869)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_ycc_convert_sse2)
+
+EXTN(jconst_rgb_ycc_convert_sse2):
+
+PW_F0299_F0337  times 4 dw  F_0_299,  F_0_337
+PW_F0114_F0250  times 4 dw  F_0_114,  F_0_250
+PW_MF016_MF033  times 4 dw -F_0_168, -F_0_331
+PW_MF008_MF041  times 4 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 4 dd  (1 << (SCALEBITS - 1)) - 1 + \
+                            (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF      times 4 dd  (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extrgb_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extrgbx_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extbgr_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extbgrx_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extxbgr_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extxrgb_ycc_convert_sse2
+%include "jccolext-sse2.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-avx2.asm
new file mode 100644
index 0000000000..560ee0c71e
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-avx2.asm
@@ -0,0 +1,113 @@
+;
+; jcgray.asm - grayscale colorspace conversion (AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_gray_convert_avx2)
+
+EXTN(jconst_rgb_gray_convert_avx2):
+
+PW_F0299_F0337 times 8 dw F_0_299, F_0_337
+PW_F0114_F0250 times 8 dw F_0_114, F_0_250
+PD_ONEHALF     times 8 dd (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extrgb_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extrgbx_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extbgr_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extbgrx_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extxbgr_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extxrgb_gray_convert_avx2
+%include "jcgryext-avx2.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-mmx.asm
new file mode 100644
index 0000000000..79fdf082a8
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-mmx.asm
@@ -0,0 +1,113 @@
+;
+; jcgray.asm - grayscale colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_gray_convert_mmx)
+
+EXTN(jconst_rgb_gray_convert_mmx):
+
+PW_F0299_F0337 times 2 dw F_0_299, F_0_337
+PW_F0114_F0250 times 2 dw F_0_114, F_0_250
+PD_ONEHALF     times 2 dd (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx  jsimd_extrgb_gray_convert_mmx
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx  jsimd_extrgbx_gray_convert_mmx
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx  jsimd_extbgr_gray_convert_mmx
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx  jsimd_extbgrx_gray_convert_mmx
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx  jsimd_extxbgr_gray_convert_mmx
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx  jsimd_extxrgb_gray_convert_mmx
+%include "jcgryext-mmx.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-sse2.asm
new file mode 100644
index 0000000000..cb4b28e8f4
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-sse2.asm
@@ -0,0 +1,112 @@
+;
+; jcgray.asm - grayscale colorspace conversion (SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_gray_convert_sse2)
+
+EXTN(jconst_rgb_gray_convert_sse2):
+
+PW_F0299_F0337 times 4 dw F_0_299, F_0_337
+PW_F0114_F0250 times 4 dw F_0_114, F_0_250
+PD_ONEHALF     times 4 dd (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extrgb_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extrgbx_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extbgr_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extbgrx_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extxbgr_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extxrgb_gray_convert_sse2
+%include "jcgryext-sse2.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-avx2.asm
new file mode 100644
index 0000000000..3fa7973d72
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-avx2.asm
@@ -0,0 +1,457 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                             int num_rows);
+;
+
+%define img_width(b)   (b) + 8          ; JDIMENSION img_width
+%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
+%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
+%define output_row(b)  (b) + 20         ; JDIMENSION output_row
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+                                        ; ymmword wk[WK_NUM]
+%define WK_NUM         2
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2)
+
+EXTN(jsimd_rgb_gray_convert_avx2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [img_width(eax)]
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         esi, JSAMPIMAGE [output_buf(eax)]
+    mov         ecx, JDIMENSION [output_row(eax)]
+    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         esi, JSAMPARRAY [input_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    pushpic     eax
+    push        edi
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr0
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    alignx      16, 7
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        eax
+    push        edx
+    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_BYTE
+    movzx       eax, byte [esi+ecx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_WORD
+    movzx       edx, word [esi+ecx]
+    shl         eax, WORD_BIT
+    or          eax, edx
+.column_ld4:
+    vmovd       xmmA, eax
+    pop         edx
+    pop         eax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_DWORD
+    vmovd       xmmF, XMM_DWORD [esi+ecx]
+    vpslldq     xmmA, xmmA, SIZEOF_DWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         ecx, byte SIZEOF_MMWORD
+    vmovq       xmmB, XMM_MMWORD [esi+ecx]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    sub         ecx, byte SIZEOF_XMMWORD
+    vmovdqu     xmmB, XMM_MMWORD [esi+ecx]
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    vpor        ymmA, ymmB
+.column_ld32:
+    test        cl, SIZEOF_YMMWORD
+    jz          short .column_ld64
+    sub         ecx, byte SIZEOF_YMMWORD
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+.column_ld64:
+    test        cl, 2*SIZEOF_YMMWORD
+    mov         ecx, SIZEOF_YMMWORD
+    jz          short .rgb_gray_cnv
+    vmovdqa     ymmB, ymmA
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_gray_cnv
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmB, YMMWORD [esi+2*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vmovdqu     ymmC, ymmA
+    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+    vmovdqa     ymmG, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+    vmovdqa     ymmD, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+    vmovdqa     ymmE, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+    vpxor       ymmH, ymmH, ymmH
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmB, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+    vmovdqa     ymmF, ymmD
+    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_XMMWORD/16
+    vmovd       xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_XMMWORD/8
+    vmovq       xmmF, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_XMMWORD/4
+    vmovdqa     xmmF, xmmA
+    vperm2i128  ymmF, ymmF, ymmF, 1
+    vmovdqu     xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+    vpor        ymmA, ymmA, ymmF
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    jz          short .column_ld16
+    sub         ecx, byte SIZEOF_XMMWORD/2
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    mov         ecx, SIZEOF_YMMWORD
+    jz          short .rgb_gray_cnv
+    vmovdqa     ymmE, ymmA
+    vmovdqa     ymmH, ymmF
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_gray_cnv
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmE, YMMWORD [esi+2*SIZEOF_YMMWORD]
+    vmovdqu     ymmH, YMMWORD [esi+3*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmB, ymmA
+    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+    vmovdqa     ymmB, ymmF
+    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmD, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+                                      ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+    vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+                                      ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+    vmovdqa     ymmC, ymmF
+    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+                                      ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+    vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+                                      ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+    vmovdqa     ymmB, ymmA
+    vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+                                      ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+    vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+                                      ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+    vmovdqa     ymmG, ymmD
+    vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+                                      ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+    vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+                                      ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+    vmovdqa     ymmE, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                      ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+                                      ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vmovdqa     ymmH, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+                                      ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+                                      ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+    vpxor       ymmF, ymmF, ymmF
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmD, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+    vmovdqa     ymmG, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vpunpcklbw  ymmF, ymmF, ymmH
+    vpunpckhbw  ymmH, ymmH, ymmH
+    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+    vmovdqa     ymm6, ymm1
+    vpunpcklwd  ymm1, ymm1, ymm3
+    vpunpckhwd  ymm6, ymm6, ymm3
+    vpmaddwd    ymm1, ymm1, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vmovdqa     ymm7, ymm6              ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vmovdqa     ymm6, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm2
+    vpunpckhwd  ymm6, ymm6, ymm2
+    vpmaddwd    ymm0, ymm0, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+    vmovdqa     YMMWORD [wk(1)], ymm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vmovdqa     ymm0, ymm5              ; ymm0=BO
+    vmovdqa     ymm6, ymm4              ; ymm6=BE
+
+    vmovdqa     ymm4, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm3
+    vpunpckhwd  ymm4, ymm4, ymm3
+    vpmaddwd    ymm0, ymm0, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+    vmovdqa     ymm3, [GOTOFF(eax,PD_ONEHALF)]            ; ymm3=[PD_ONEHALF]
+
+    vpaddd      ymm0, ymm0, ymm1
+    vpaddd      ymm4, ymm4, ymm7
+    vpaddd      ymm0, ymm0, ymm3
+    vpaddd      ymm4, ymm4, ymm3
+    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
+    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO
+
+    vmovdqa     ymm4, ymm6
+    vpunpcklwd  ymm6, ymm6, ymm2
+    vpunpckhwd  ymm4, ymm4, ymm2
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+    vmovdqa     ymm2, [GOTOFF(eax,PD_ONEHALF)]            ; ymm2=[PD_ONEHALF]
+
+    vpaddd      ymm6, ymm6, YMMWORD [wk(0)]
+    vpaddd      ymm4, ymm4, YMMWORD [wk(1)]
+    vpaddd      ymm6, ymm6, ymm2
+    vpaddd      ymm4, ymm4, ymm2
+    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
+    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE
+
+    vpsllw      ymm0, ymm0, BYTE_BIT
+    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
+    vmovdqu     YMMWORD [edi], ymm6     ; Save Y
+
+    sub         ecx, byte SIZEOF_YMMWORD
+    add         esi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
+    add         edi, byte SIZEOF_YMMWORD           ; outptr0
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .column_ld1
+
+    pop         ecx                     ; col
+    pop         esi
+    pop         edi
+    poppic      eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         edi, byte SIZEOF_JSAMPROW
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-mmx.asm
new file mode 100644
index 0000000000..8af42e5a33
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-mmx.asm
@@ -0,0 +1,355 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                            int num_rows);
+;
+
+%define img_width(b)   (b) + 8          ; JDIMENSION img_width
+%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
+%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
+%define output_row(b)  (b) + 20         ; JDIMENSION output_row
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         2
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_mmx)
+
+EXTN(jsimd_rgb_gray_convert_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [img_width(eax)]  ; num_cols
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         esi, JSAMPIMAGE [output_buf(eax)]
+    mov         ecx, JDIMENSION [output_row(eax)]
+    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         esi, JSAMPARRAY [input_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    pushpic     eax
+    push        edi
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr0
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    cmp         ecx, byte SIZEOF_MMWORD
+    jae         short .columnloop
+    alignx      16, 7
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        eax
+    push        edx
+    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_BYTE
+    xor         eax, eax
+    mov         al, byte [esi+ecx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_WORD
+    xor         edx, edx
+    mov         dx, word [esi+ecx]
+    shl         eax, WORD_BIT
+    or          eax, edx
+.column_ld4:
+    movd        mmA, eax
+    pop         edx
+    pop         eax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_DWORD
+    movd        mmG, dword [esi+ecx]
+    psllq       mmA, DWORD_BIT
+    por         mmA, mmG
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    movq        mmG, mmA
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    mov         ecx, SIZEOF_MMWORD
+    jmp         short .rgb_gray_cnv
+.column_ld16:
+    test        cl, 2*SIZEOF_MMWORD
+    mov         ecx, SIZEOF_MMWORD
+    jz          short .rgb_gray_cnv
+    movq        mmF, mmA
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+    jmp         short .rgb_gray_cnv
+    alignx      16, 7
+
+.columnloop:
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+    movq        mmF, MMWORD [esi+2*SIZEOF_MMWORD]
+
+.rgb_gray_cnv:
+    ; mmA=(00 10 20 01 11 21 02 12)
+    ; mmG=(22 03 13 23 04 14 24 05)
+    ; mmF=(15 25 06 16 26 07 17 27)
+
+    movq        mmD, mmA
+    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 10 20 01)
+    psrlq       mmD, 4*BYTE_BIT         ; mmD=(11 21 02 12 -- -- -- --)
+
+    punpckhbw   mmA, mmG                ; mmA=(00 04 10 14 20 24 01 05)
+    psllq       mmG, 4*BYTE_BIT         ; mmG=(-- -- -- -- 22 03 13 23)
+
+    punpcklbw   mmD, mmF                ; mmD=(11 15 21 25 02 06 12 16)
+    punpckhbw   mmG, mmF                ; mmG=(22 26 03 07 13 17 23 27)
+
+    movq        mmE, mmA
+    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 04 10 14)
+    psrlq       mmE, 4*BYTE_BIT         ; mmE=(20 24 01 05 -- -- -- --)
+
+    punpckhbw   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
+    psllq       mmD, 4*BYTE_BIT         ; mmD=(-- -- -- -- 11 15 21 25)
+
+    punpcklbw   mmE, mmG                ; mmE=(20 22 24 26 01 03 05 07)
+    punpckhbw   mmD, mmG                ; mmD=(11 13 15 17 21 23 25 27)
+
+    pxor        mmH, mmH
+
+    movq        mmC, mmA
+    punpcklbw   mmA, mmH                ; mmA=(00 02 04 06)
+    punpckhbw   mmC, mmH                ; mmC=(10 12 14 16)
+
+    movq        mmB, mmE
+    punpcklbw   mmE, mmH                ; mmE=(20 22 24 26)
+    punpckhbw   mmB, mmH                ; mmB=(01 03 05 07)
+
+    movq        mmF, mmD
+    punpcklbw   mmD, mmH                ; mmD=(11 13 15 17)
+    punpckhbw   mmF, mmH                ; mmF=(21 23 25 27)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_MMWORD/8
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_MMWORD/8
+    movd        mmA, dword [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_MMWORD/4
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_MMWORD/4
+    movq        mmF, mmA
+    movq        mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld4:
+    test        cl, SIZEOF_MMWORD/2
+    mov         ecx, SIZEOF_MMWORD
+    jz          short .rgb_gray_cnv
+    movq        mmD, mmA
+    movq        mmC, mmF
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+    jmp         short .rgb_gray_cnv
+    alignx      16, 7
+
+.columnloop:
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+    movq        mmD, MMWORD [esi+2*SIZEOF_MMWORD]
+    movq        mmC, MMWORD [esi+3*SIZEOF_MMWORD]
+
+.rgb_gray_cnv:
+    ; mmA=(00 10 20 30 01 11 21 31)
+    ; mmF=(02 12 22 32 03 13 23 33)
+    ; mmD=(04 14 24 34 05 15 25 35)
+    ; mmC=(06 16 26 36 07 17 27 37)
+
+    movq        mmB, mmA
+    punpcklbw   mmA, mmF                ; mmA=(00 02 10 12 20 22 30 32)
+    punpckhbw   mmB, mmF                ; mmB=(01 03 11 13 21 23 31 33)
+
+    movq        mmG, mmD
+    punpcklbw   mmD, mmC                ; mmD=(04 06 14 16 24 26 34 36)
+    punpckhbw   mmG, mmC                ; mmG=(05 07 15 17 25 27 35 37)
+
+    movq        mmE, mmA
+    punpcklwd   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
+    punpckhwd   mmE, mmD                ; mmE=(20 22 24 26 30 32 34 36)
+
+    movq        mmH, mmB
+    punpcklwd   mmB, mmG                ; mmB=(01 03 05 07 11 13 15 17)
+    punpckhwd   mmH, mmG                ; mmH=(21 23 25 27 31 33 35 37)
+
+    pxor        mmF, mmF
+
+    movq        mmC, mmA
+    punpcklbw   mmA, mmF                ; mmA=(00 02 04 06)
+    punpckhbw   mmC, mmF                ; mmC=(10 12 14 16)
+
+    movq        mmD, mmB
+    punpcklbw   mmB, mmF                ; mmB=(01 03 05 07)
+    punpckhbw   mmD, mmF                ; mmD=(11 13 15 17)
+
+    movq        mmG, mmE
+    punpcklbw   mmE, mmF                ; mmE=(20 22 24 26)
+    punpckhbw   mmG, mmF                ; mmG=(30 32 34 36)
+
+    punpcklbw   mmF, mmH
+    punpckhbw   mmH, mmH
+    psrlw       mmF, BYTE_BIT           ; mmF=(21 23 25 27)
+    psrlw       mmH, BYTE_BIT           ; mmH=(31 33 35 37)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
+    ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+    movq        mm6, mm1
+    punpcklwd   mm1, mm3
+    punpckhwd   mm6, mm3
+    pmaddwd     mm1, [GOTOFF(eax,PW_F0299_F0337)]  ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    movq        mm7,  mm6               ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    movq        mm6, mm0
+    punpcklwd   mm0, mm2
+    punpckhwd   mm6, mm2
+    pmaddwd     mm0, [GOTOFF(eax,PW_F0299_F0337)]  ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    movq        MMWORD [wk(0)], mm0     ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+    movq        MMWORD [wk(1)], mm6     ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    movq        mm0, mm5                ; mm0=BO
+    movq        mm6, mm4                ; mm6=BE
+
+    movq        mm4, mm0
+    punpcklwd   mm0, mm3
+    punpckhwd   mm4, mm3
+    pmaddwd     mm0, [GOTOFF(eax,PW_F0114_F0250)]  ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+    movq        mm3, [GOTOFF(eax,PD_ONEHALF)]  ; mm3=[PD_ONEHALF]
+
+    paddd       mm0, mm1
+    paddd       mm4, mm7
+    paddd       mm0, mm3
+    paddd       mm4, mm3
+    psrld       mm0, SCALEBITS          ; mm0=YOL
+    psrld       mm4, SCALEBITS          ; mm4=YOH
+    packssdw    mm0, mm4                ; mm0=YO
+
+    movq        mm4, mm6
+    punpcklwd   mm6, mm2
+    punpckhwd   mm4, mm2
+    pmaddwd     mm6, [GOTOFF(eax,PW_F0114_F0250)]  ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+    movq        mm2, [GOTOFF(eax,PD_ONEHALF)]      ; mm2=[PD_ONEHALF]
+
+    paddd       mm6, MMWORD [wk(0)]
+    paddd       mm4, MMWORD [wk(1)]
+    paddd       mm6, mm2
+    paddd       mm4, mm2
+    psrld       mm6, SCALEBITS          ; mm6=YEL
+    psrld       mm4, SCALEBITS          ; mm4=YEH
+    packssdw    mm6, mm4                ; mm6=YE
+
+    psllw       mm0, BYTE_BIT
+    por         mm6, mm0                ; mm6=Y
+    movq        MMWORD [edi], mm6       ; Save Y
+
+    sub         ecx, byte SIZEOF_MMWORD
+    add         esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; inptr
+    add         edi, byte SIZEOF_MMWORD                ; outptr0
+    cmp         ecx, byte SIZEOF_MMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .column_ld1
+
+    pop         ecx                     ; col
+    pop         esi
+    pop         edi
+    poppic      eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         edi, byte SIZEOF_JSAMPROW
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-sse2.asm
new file mode 100644
index 0000000000..c9d6ff1e35
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-sse2.asm
@@ -0,0 +1,382 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                             int num_rows);
+;
+
+%define img_width(b)   (b) + 8          ; JDIMENSION img_width
+%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
+%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
+%define output_row(b)  (b) + 20         ; JDIMENSION output_row
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)
+
+EXTN(jsimd_rgb_gray_convert_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [img_width(eax)]
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         esi, JSAMPIMAGE [output_buf(eax)]
+    mov         ecx, JDIMENSION [output_row(eax)]
+    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         esi, JSAMPARRAY [input_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    pushpic     eax
+    push        edi
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr0
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+    alignx      16, 7
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        eax
+    push        edx
+    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_BYTE
+    movzx       eax, byte [esi+ecx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_WORD
+    movzx       edx, word [esi+ecx]
+    shl         eax, WORD_BIT
+    or          eax, edx
+.column_ld4:
+    movd        xmmA, eax
+    pop         edx
+    pop         eax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_DWORD
+    movd        xmmF, XMM_DWORD [esi+ecx]
+    pslldq      xmmA, SIZEOF_DWORD
+    por         xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         ecx, byte SIZEOF_MMWORD
+    movq        xmmB, XMM_MMWORD [esi+ecx]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    movdqa      xmmF, xmmA
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    mov         ecx, SIZEOF_XMMWORD
+    jmp         short .rgb_gray_cnv
+.column_ld32:
+    test        cl, 2*SIZEOF_XMMWORD
+    mov         ecx, SIZEOF_XMMWORD
+    jz          short .rgb_gray_cnv
+    movdqa      xmmB, xmmA
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_gray_cnv
+    alignx      16, 7
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    movdqu      xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    movdqa      xmmG, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+    movdqa      xmmD, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+    movdqa      xmmE, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+    pxor        xmmH, xmmH
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmB, xmmE
+    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+    movdqa      xmmF, xmmD
+    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_XMMWORD/16
+    movd        xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_XMMWORD/8
+    movq        xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmE
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_XMMWORD/4
+    movdqa      xmmE, xmmA
+    movdqu      xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    mov         ecx, SIZEOF_XMMWORD
+    jz          short .rgb_gray_cnv
+    movdqa      xmmF, xmmA
+    movdqa      xmmH, xmmE
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_gray_cnv
+    alignx      16, 7
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+    movdqu      xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+    punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+    movdqa      xmmC, xmmF
+    punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+    punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+    movdqa      xmmB, xmmA
+    punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+    punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+    movdqa      xmmG, xmmD
+    punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+    punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+    movdqa      xmmE, xmmA
+    punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+    movdqa      xmmH, xmmB
+    punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+    pxor        xmmF, xmmF
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmD, xmmB
+    punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+    movdqa      xmmG, xmmE
+    punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+    punpcklbw   xmmF, xmmH
+    punpckhbw   xmmH, xmmH
+    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+    movdqa      xmm6, xmm1
+    punpcklwd   xmm1, xmm3
+    punpckhwd   xmm6, xmm3
+    pmaddwd     xmm1, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    pmaddwd     xmm6, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    movdqa      xmm7, xmm6              ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    movdqa      xmm6, xmm0
+    punpcklwd   xmm0, xmm2
+    punpckhwd   xmm6, xmm2
+    pmaddwd     xmm0, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    pmaddwd     xmm6, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    movdqa      xmm0, xmm5              ; xmm0=BO
+    movdqa      xmm6, xmm4              ; xmm6=BE
+
+    movdqa      xmm4, xmm0
+    punpcklwd   xmm0, xmm3
+    punpckhwd   xmm4, xmm3
+    pmaddwd     xmm0, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    pmaddwd     xmm4, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+    movdqa      xmm3, [GOTOFF(eax,PD_ONEHALF)]      ; xmm3=[PD_ONEHALF]
+
+    paddd       xmm0, xmm1
+    paddd       xmm4, xmm7
+    paddd       xmm0, xmm3
+    paddd       xmm4, xmm3
+    psrld       xmm0, SCALEBITS         ; xmm0=YOL
+    psrld       xmm4, SCALEBITS         ; xmm4=YOH
+    packssdw    xmm0, xmm4              ; xmm0=YO
+
+    movdqa      xmm4, xmm6
+    punpcklwd   xmm6, xmm2
+    punpckhwd   xmm4, xmm2
+    pmaddwd     xmm6, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    pmaddwd     xmm4, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+    movdqa      xmm2, [GOTOFF(eax,PD_ONEHALF)]      ; xmm2=[PD_ONEHALF]
+
+    paddd       xmm6, XMMWORD [wk(0)]
+    paddd       xmm4, XMMWORD [wk(1)]
+    paddd       xmm6, xmm2
+    paddd       xmm4, xmm2
+    psrld       xmm6, SCALEBITS         ; xmm6=YEL
+    psrld       xmm4, SCALEBITS         ; xmm4=YEH
+    packssdw    xmm6, xmm4              ; xmm6=YE
+
+    psllw       xmm0, BYTE_BIT
+    por         xmm6, xmm0              ; xmm6=Y
+    movdqa      XMMWORD [edi], xmm6     ; Save Y
+
+    sub         ecx, byte SIZEOF_XMMWORD
+    add         esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
+    add         edi, byte SIZEOF_XMMWORD                ; outptr0
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .column_ld1
+
+    pop         ecx                     ; col
+    pop         esi
+    pop         edi
+    poppic      eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         edi, byte SIZEOF_JSAMPROW
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jchuff-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jchuff-sse2.asm
new file mode 100644
index 0000000000..278cf5e83a
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jchuff-sse2.asm
@@ -0,0 +1,761 @@
+;
+; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
+;
+; Copyright (C) 2009-2011, 2014-2017, 2019, D. R. Commander.
+; Copyright (C) 2015, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation for Huffman coding of one block.
+; The following code is based on jchuff.c; see jchuff.c for more details.
+
+%include "jsimdext.inc"
+
+struc working_state
+.next_output_byte:   resp 1     ; => next byte to write in buffer
+.free_in_buffer:     resp 1     ; # of byte spaces remaining in buffer
+.cur.put_buffer.simd resq 1     ; current bit accumulation buffer
+.cur.free_bits       resd 1     ; # of bits available in it
+.cur.last_dc_val     resd 4     ; last DC coef for each component
+.cinfo:              resp 1     ; dump_buffer needs access to this
+endstruc
+
+struc c_derived_tbl
+.ehufco:             resd 256   ; code for each symbol
+.ehufsi:             resb 256   ; length of code for each symbol
+; If no code has been allocated for a symbol S, ehufsi[S] contains 0
+endstruc
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    GLOBAL_DATA(jconst_huff_encode_one_block)
+
+EXTN(jconst_huff_encode_one_block):
+
+    alignz      32
+
+jpeg_mask_bits dq 0x0000, 0x0001, 0x0003, 0x0007
+               dq 0x000f, 0x001f, 0x003f, 0x007f
+               dq 0x00ff, 0x01ff, 0x03ff, 0x07ff
+               dq 0x0fff, 0x1fff, 0x3fff, 0x7fff
+
+times 1 << 14 db 15
+times 1 << 13 db 14
+times 1 << 12 db 13
+times 1 << 11 db 12
+times 1 << 10 db 11
+times 1 <<  9 db 10
+times 1 <<  8 db  9
+times 1 <<  7 db  8
+times 1 <<  6 db  7
+times 1 <<  5 db  6
+times 1 <<  4 db  5
+times 1 <<  3 db  4
+times 1 <<  2 db  3
+times 1 <<  1 db  2
+times 1 <<  0 db  1
+times 1       db  0
+jpeg_nbits_table:
+times 1       db  0
+times 1 <<  0 db  1
+times 1 <<  1 db  2
+times 1 <<  2 db  3
+times 1 <<  3 db  4
+times 1 <<  4 db  5
+times 1 <<  5 db  6
+times 1 <<  6 db  7
+times 1 <<  7 db  8
+times 1 <<  8 db  9
+times 1 <<  9 db 10
+times 1 << 10 db 11
+times 1 << 11 db 12
+times 1 << 12 db 13
+times 1 << 13 db 14
+times 1 << 14 db 15
+
+    alignz      32
+
+%ifdef PIC
+%define NBITS(x)      nbits_base + x
+%else
+%define NBITS(x)      jpeg_nbits_table + x
+%endif
+%define MASK_BITS(x)  NBITS((x) * 8) + (jpeg_mask_bits - jpeg_nbits_table)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%define mm_put_buffer     mm0
+%define mm_all_0xff       mm1
+%define mm_temp           mm2
+%define mm_nbits          mm3
+%define mm_code_bits      mm3
+%define mm_code           mm4
+%define mm_overflow_bits  mm5
+%define mm_save_nbits     mm6
+
+; Shorthand used to describe SIMD operations:
+; wN:  xmmN treated as eight signed 16-bit values
+; wN[i]:  perform the same operation on all eight signed 16-bit values, i=0..7
+; bN:  xmmN treated as 16 unsigned 8-bit values, or
+;      mmN treated as eight unsigned 8-bit values
+; bN[i]:  perform the same operation on all unsigned 8-bit values,
+;         i=0..15 (SSE register) or i=0..7 (MMX register)
+; Contents of SIMD registers are shown in memory order.
+
+; Fill the bit buffer to capacity with the leading bits from code, then output
+; the bit buffer and put the remaining bits from code into the bit buffer.
+;
+; Usage:
+; code - contains the bits to shift into the bit buffer (LSB-aligned)
+; %1 - temp register
+; %2 - low byte of temp register
+; %3 - second byte of temp register
+; %4-%8 (optional) - extra instructions to execute before the macro completes
+; %9 - the label to which to jump when the macro completes
+;
+; Upon completion, free_bits will be set to the number of remaining bits from
+; code, and put_buffer will contain those remaining bits.  temp and code will
+; be clobbered.
+;
+; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
+; macro in jchuff.c.
+
+%macro EMIT_QWORD 9
+%define %%temp   %1
+%define %%tempb  %2
+%define %%temph  %3
+    add         nbits, free_bits             ; nbits += free_bits;
+    neg         free_bits                    ; free_bits = -free_bits;
+    movq        mm_temp, mm_code             ; temp = code;
+    movd        mm_nbits, nbits              ; nbits --> MMX register
+    movd        mm_overflow_bits, free_bits  ; overflow_bits (temp register) = free_bits;
+    neg         free_bits                    ; free_bits = -free_bits;
+    psllq       mm_put_buffer, mm_nbits      ; put_buffer <<= nbits;
+    psrlq       mm_temp, mm_overflow_bits    ; temp >>= overflow_bits;
+    add         free_bits, 64                ; free_bits += 64;
+    por         mm_temp, mm_put_buffer       ; temp |= put_buffer;
+%ifidn %%temp, nbits_base
+    movd        mm_save_nbits, nbits_base    ; save nbits_base
+%endif
+    movq        mm_code_bits, mm_temp        ; code_bits (temp register) = temp;
+    movq        mm_put_buffer, mm_code       ; put_buffer = code;
+    pcmpeqb     mm_temp, mm_all_0xff         ; b_temp[i] = (b_temp[i] == 0xFF ? 0xFF : 0);
+    movq        mm_code, mm_code_bits        ; code = code_bits;
+    psrlq       mm_code_bits, 32             ; code_bits >>= 32;
+    pmovmskb    nbits, mm_temp               ; nbits = 0;  nbits |= ((b_temp[i] >> 7) << i);
+    movd        %%temp, mm_code_bits         ; temp = code_bits;
+    bswap       %%temp                       ; temp = htonl(temp);
+    test        nbits, nbits                 ; if (nbits != 0)  /* Some 0xFF bytes */
+    jnz         %%.SLOW                      ;   goto %%.SLOW
+    mov         dword [buffer], %%temp       ; *(uint32_t)buffer = temp;
+%ifidn %%temp, nbits_base
+    movd        nbits_base, mm_save_nbits    ; restore nbits_base
+%endif
+    %4
+    movd        nbits, mm_code               ; nbits = (uint32_t)(code);
+    %5
+    bswap       nbits                        ; nbits = htonl(nbits);
+    mov         dword [buffer + 4], nbits    ; *(uint32_t)(buffer + 4) = nbits;
+    lea         buffer, [buffer + 8]         ; buffer += 8;
+    %6
+    %7
+    %8
+    jmp %9                                   ; return
+%%.SLOW:
+    ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
+    ; bytes in the qword.
+    mov         byte [buffer], %%tempb     ; buffer[0] = temp[0];
+    cmp         %%tempb, 0xFF              ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], %%temph     ; buffer[0] = temp[1];
+    cmp         %%temph, 0xFF              ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    shr         %%temp, 16                 ; temp >>= 16;
+    mov         byte [buffer], %%tempb     ; buffer[0] = temp[0];
+    cmp         %%tempb, 0xFF              ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], %%temph     ; buffer[0] = temp[1];
+    cmp         %%temph, 0xFF              ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    movd        nbits, mm_code             ; nbits (temp register) = (uint32_t)(code)
+%ifidn %%temp, nbits_base
+    movd        nbits_base, mm_save_nbits  ; restore nbits_base
+%endif
+    bswap       nbits                      ; nbits = htonl(nbits)
+    mov         byte [buffer], nbitsb      ; buffer[0] = nbits[0];
+    cmp         nbitsb, 0xFF               ; Set CF if nbits[0] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], nbitsh      ; buffer[0] = nbits[1];
+    cmp         nbitsh, 0xFF               ; Set CF if nbits[1] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
+    shr         nbits, 16                  ; nbits >>= 16;
+    mov         byte [buffer], nbitsb      ; buffer[0] = nbits[0];
+    cmp         nbitsb, 0xFF               ; Set CF if nbits[0] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], nbitsh      ; buffer[0] = nbits[1];
+    %4
+    cmp         nbitsh, 0xFF               ; Set CF if nbits[1] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
+    %5
+    %6
+    %7
+    %8
+    jmp %9                                 ; return;
+%endmacro
+
+%macro PUSH 1
+    push        %1
+%assign stack_offset  stack_offset + 4
+%endmacro
+
+%macro POP 1
+    pop         %1
+%assign stack_offset  stack_offset - 4
+%endmacro
+
+; If PIC is defined, load the address of a symbol defined in this file into a
+; register.  Equivalent to
+;   get_GOT     %1
+;   lea         %1, [GOTOFF(%1, %2)]
+; without using the GOT.
+;
+; Usage:
+; %1 - register into which to load the address of the symbol
+; %2 - symbol whose address should be loaded
+; %3 - optional multi-line macro to execute before the symbol address is loaded
+; %4 - optional multi-line macro to execute after the symbol address is loaded
+;
+; If PIC is not defined, then %3 and %4 are executed in order.
+
+%macro GET_SYM 2-4
+%ifdef PIC
+    call        %%.geteip
+%%.ref:
+    %4
+    add         %1, %2 - %%.ref
+    jmp         short %%.done
+    align       32
+%%.geteip:
+    %3          4               ; must adjust stack pointer because of call
+    mov         %1, POINTER [esp]
+    ret
+    align       32
+%%.done:
+%else
+    %3          0
+    %4
+%endif
+%endmacro
+
+;
+; Encode a single block's worth of coefficients.
+;
+; GLOBAL(JOCTET *)
+; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
+;                                  JCOEFPTR block, int last_dc_val,
+;                                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
+;
+; Stack layout:
+; Function args
+; Return address
+; Saved ebx
+; Saved ebp
+; Saved esi
+; Saved edi <-- esp_save
+; ...
+; esp_save
+; t_ 64*2 bytes (aligned to 128 bytes)
+;
+; esp is used (as t) to point into t_ (data in lower indices is not used once
+; esp passes over them, so this is signal-safe.)  Aligning to 128 bytes allows
+; us to find the rest of the data again.
+;
+; NOTES:
+; When shuffling data, we try to avoid pinsrw as much as possible, since it is
+; slow on many CPUs.  Its reciprocal throughput (issue latency) is 1 even on
+; modern CPUs, so chains of pinsrw instructions (even with different outputs)
+; can limit performance.  pinsrw is a VectorPath instruction on AMD K8 and
+; requires 2 µops (with memory operand) on Intel.  In either case, only one
+; pinsrw instruction can be decoded per cycle (and nothing else if they are
+; back-to-back), so out-of-order execution cannot be used to work around long
+; pinsrw chains (though for Sandy Bridge and later, this may be less of a
+; problem if the code runs from the µop cache.)
+;
+; We use tzcnt instead of bsf without checking for support.  The instruction is
+; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
+; rep bsf.)  The destination (first) operand of bsf (and tzcnt on some CPUs) is
+; an input dependency (although the behavior is not formally defined, Intel
+; CPUs usually leave the destination unmodified if the source is zero.)  This
+; can prevent out-of-order execution, so we clear the destination before
+; invoking tzcnt.
+;
+; Initial register allocation
+; eax - frame --> buffer
+; ebx - nbits_base (PIC) / emit_temp
+; ecx - dctbl --> size --> state
+; edx - block --> nbits
+; esi - code_temp --> state --> actbl
+; edi - index_temp --> free_bits
+; esp - t
+; ebp - index
+
+%define frame       eax
+%ifdef PIC
+%define nbits_base  ebx
+%endif
+%define emit_temp   ebx
+%define emit_tempb  bl
+%define emit_temph  bh
+%define dctbl       ecx
+%define block       edx
+%define code_temp   esi
+%define index_temp  edi
+%define t           esp
+%define index       ebp
+
+%assign save_frame  DCTSIZE2 * SIZEOF_WORD
+
+; Step 1: Re-arrange input data according to jpeg_natural_order
+; xx 01 02 03 04 05 06 07      xx 01 08 16 09 02 03 10
+; 08 09 10 11 12 13 14 15      17 24 32 25 18 11 04 05
+; 16 17 18 19 20 21 22 23      12 19 26 33 40 48 41 34
+; 24 25 26 27 28 29 30 31 ==>  27 20 13 06 07 14 21 28
+; 32 33 34 35 36 37 38 39      35 42 49 56 57 50 43 36
+; 40 41 42 43 44 45 46 47      29 22 15 23 30 37 44 51
+; 48 49 50 51 52 53 54 55      58 59 52 45 38 31 39 46
+; 56 57 58 59 60 61 62 63      53 60 61 54 47 55 62 63
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
+
+EXTN(jsimd_huff_encode_one_block_sse2):
+
+%assign stack_offset      0
+%define arg_state         4 + stack_offset
+%define arg_buffer        8 + stack_offset
+%define arg_block        12 + stack_offset
+%define arg_last_dc_val  16 + stack_offset
+%define arg_dctbl        20 + stack_offset
+%define arg_actbl        24 + stack_offset
+
+                                                          ;X: X = code stream
+    mov         block, [esp + arg_block]
+    PUSH        ebx
+    PUSH        ebp
+    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
+    PUSH        esi
+    PUSH        edi
+    movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
+    mov         frame, esp
+    lea         t, [frame - (save_frame + 4)]
+    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]   ;B: w1 = 08 09 10 11 12 13 14 15
+    and         t, -DCTSIZE2 * SIZEOF_WORD                                             ; t = &t_[0]
+    mov         [t + save_frame], frame
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
+    pshuflw     xmm0, xmm0, 11001001b                     ;A: w0 = 01 08 xx 09 02 03 10 11
+    pinsrw      xmm0, word [block + 16 * SIZEOF_WORD], 2  ;A: w0 = 01 08 16 09 02 03 10 11
+    punpckhdq   xmm3, xmm1                                ;D: w3 = 04 05 12 13 06 07 14 15
+    punpcklqdq  xmm1, xmm3                                ;B: w1 = 08 09 10 11 04 05 12 13
+    pinsrw      xmm0, word [block + 17 * SIZEOF_WORD], 7  ;A: w0 = 01 08 16 09 02 03 10 17
+                                                          ;A:      (Row 0, offset 1)
+    pcmpgtw     xmm4, xmm0                                ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
+    paddw       xmm0, xmm4                                ;A: w0[i] += w4[i];
+    movaps      XMMWORD [t + 0 * SIZEOF_WORD], xmm0       ;A: t[i] = w0[i];
+
+    movq        xmm2, qword [block + 24 * SIZEOF_WORD]    ;B: w2 = 24 25 26 27 -- -- -- --
+    pshuflw     xmm2, xmm2, 11011000b                     ;B: w2 = 24 26 25 27 -- -- -- --
+    pslldq      xmm1, 1 * SIZEOF_WORD                     ;B: w1 = -- 08 09 10 11 04 05 12
+    movups      xmm5, XMMWORD [block + 48 * SIZEOF_WORD]  ;H: w5 = 48 49 50 51 52 53 54 55
+    movsd       xmm1, xmm2                                ;B: w1 = 24 26 25 27 11 04 05 12
+    punpcklqdq  xmm2, xmm5                                ;C: w2 = 24 26 25 27 48 49 50 51
+    pinsrw      xmm1, word [block + 32 * SIZEOF_WORD], 1  ;B: w1 = 24 32 25 27 11 04 05 12
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    psrldq      xmm3, 2 * SIZEOF_WORD                     ;D: w3 = 12 13 06 07 14 15 -- --
+    pcmpeqw     xmm0, xmm4                                ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
+    pinsrw      xmm1, word [block + 18 * SIZEOF_WORD], 3  ;B: w1 = 24 32 25 18 11 04 05 12
+                                                          ;        (Row 1, offset 1)
+    pcmpgtw     xmm4, xmm1                                ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
+    paddw       xmm1, xmm4                                ;B: w1[i] += w4[i];
+    movaps      XMMWORD [t + 8 * SIZEOF_WORD], xmm1       ;B: t[i+8] = w1[i];
+    pxor        xmm4, xmm4                                ;B: w4[i] = 0;
+    pcmpeqw     xmm1, xmm4                                ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
+
+    packsswb    xmm0, xmm1                                ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
+                                                          ;    w/ signed saturation
+
+    pinsrw      xmm3, word [block + 20 * SIZEOF_WORD], 0  ;D: w3 = 20 13 06 07 14 15 -- --
+    pinsrw      xmm3, word [block + 21 * SIZEOF_WORD], 5  ;D: w3 = 20 13 06 07 14 21 -- --
+    pinsrw      xmm3, word [block + 28 * SIZEOF_WORD], 6  ;D: w3 = 20 13 06 07 14 21 28 --
+    pinsrw      xmm3, word [block + 35 * SIZEOF_WORD], 7  ;D: w3 = 20 13 06 07 14 21 28 35
+                                                          ;        (Row 3, offset 1)
+    pcmpgtw     xmm4, xmm3                                ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
+    paddw       xmm3, xmm4                                ;D: w3[i] += w4[i];
+    movaps      XMMWORD [t + 24 * SIZEOF_WORD], xmm3      ;D: t[i+24] = w3[i];
+    pxor        xmm4, xmm4                                ;D: w4[i] = 0;
+    pcmpeqw     xmm3, xmm4                                ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
+
+    pinsrw      xmm2, word [block + 19 * SIZEOF_WORD], 0  ;C: w2 = 19 26 25 27 48 49 50 51
+    pinsrw      xmm2, word [block + 33 * SIZEOF_WORD], 2  ;C: w2 = 19 26 33 27 48 49 50 51
+    pinsrw      xmm2, word [block + 40 * SIZEOF_WORD], 3  ;C: w2 = 19 26 33 40 48 49 50 51
+    pinsrw      xmm2, word [block + 41 * SIZEOF_WORD], 5  ;C: w2 = 19 26 33 40 48 41 50 51
+    pinsrw      xmm2, word [block + 34 * SIZEOF_WORD], 6  ;C: w2 = 19 26 33 40 48 41 34 51
+    pinsrw      xmm2, word [block + 27 * SIZEOF_WORD], 7  ;C: w2 = 19 26 33 40 48 41 34 27
+                                                          ;        (Row 2, offset 1)
+    pcmpgtw     xmm4, xmm2                                ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
+    paddw       xmm2, xmm4                                ;C: w2[i] += w4[i];
+    movsx       code_temp, word [block]                   ;Z:     code_temp = block[0];
+
+; %1 - stack pointer adjustment
+%macro GET_SYM_BEFORE 1
+    movaps      XMMWORD [t + 16 * SIZEOF_WORD + %1], xmm2
+                                                          ;C: t[i+16] = w2[i];
+    pxor        xmm4, xmm4                                ;C: w4[i] = 0;
+    pcmpeqw     xmm2, xmm4                                ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
+    sub         code_temp, [frame + arg_last_dc_val]      ;Z:     code_temp -= last_dc_val;
+
+    packsswb    xmm2, xmm3                                ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
+                                                          ;    w/ signed saturation
+
+    movdqa      xmm3, xmm5                                ;H: w3 = 48 49 50 51 52 53 54 55
+    pmovmskb    index_temp, xmm2                          ;Z:     index_temp = 0;  index_temp |= ((b2[i] >> 7) << i);
+    pmovmskb    index, xmm0                               ;Z:     index = 0;  index |= ((b0[i] >> 7) << i);
+    movups      xmm0, XMMWORD [block + 56 * SIZEOF_WORD]  ;H: w0 = 56 57 58 59 60 61 62 63
+    punpckhdq   xmm3, xmm0                                ;H: w3 = 52 53 60 61 54 55 62 63
+    shl         index_temp, 16                            ;Z:     index_temp <<= 16;
+    psrldq      xmm3, 1 * SIZEOF_WORD                     ;H: w3 = 53 60 61 54 55 62 63 --
+    pxor        xmm2, xmm2                                ;H: w2[i] = 0;
+    pshuflw     xmm3, xmm3, 00111001b                     ;H: w3 = 60 61 54 53 55 62 63 --
+    or          index, index_temp                         ;Z:     index |= index_temp;
+%undef index_temp
+%define free_bits  edi
+%endmacro
+
+%macro GET_SYM_AFTER 0
+    movq        xmm1, qword [block + 44 * SIZEOF_WORD]    ;G: w1 = 44 45 46 47 -- -- -- --
+    unpcklps    xmm5, xmm0                                ;E: w5 = 48 49 56 57 50 51 58 59
+    pxor        xmm0, xmm0                                ;H: w0[i] = 0;
+    not         index                                     ;Z:     index = ~index;
+    pinsrw      xmm3, word [block + 47 * SIZEOF_WORD], 3  ;H: w3 = 60 61 54 47 55 62 63 --
+                                                          ;        (Row 7, offset 1)
+    pcmpgtw     xmm2, xmm3                                ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
+    mov         dctbl, [frame + arg_dctbl]
+    paddw       xmm3, xmm2                                ;H: w3[i] += w2[i];
+    movaps      XMMWORD [t + 56 * SIZEOF_WORD], xmm3      ;H: t[i+56] = w3[i];
+    movq        xmm4, qword [block + 36 * SIZEOF_WORD]    ;G: w4 = 36 37 38 39 -- -- -- --
+    pcmpeqw     xmm3, xmm0                                ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
+    punpckldq   xmm4, xmm1                                ;G: w4 = 36 37 44 45 38 39 46 47
+    movdqa      xmm1, xmm4                                ;F: w1 = 36 37 44 45 38 39 46 47
+    pcmpeqw     mm_all_0xff, mm_all_0xff                  ;Z:     all_0xff[i] = 0xFF;
+%endmacro
+
+    GET_SYM     nbits_base, jpeg_nbits_table, GET_SYM_BEFORE, GET_SYM_AFTER
+
+    psrldq      xmm4, 1 * SIZEOF_WORD                     ;G: w4 = 37 44 45 38 39 46 47 --
+    shufpd      xmm1, xmm5, 10b                           ;F: w1 = 36 37 44 45 50 51 58 59
+    pshufhw     xmm4, xmm4, 11010011b                     ;G: w4 = 37 44 45 38 -- 39 46 --
+    pslldq      xmm1, 1 * SIZEOF_WORD                     ;F: w1 = -- 36 37 44 45 50 51 58
+    pinsrw      xmm4, word [block + 59 * SIZEOF_WORD], 0  ;G: w4 = 59 44 45 38 -- 39 46 --
+    pshufd      xmm1, xmm1, 11011000b                     ;F: w1 = -- 36 45 50 37 44 51 58
+    cmp         code_temp, 1 << 31                        ;Z:     Set CF if code_temp < 0x80000000,
+                                                          ;Z:     i.e. if code_temp is positive
+    pinsrw      xmm4, word [block + 52 * SIZEOF_WORD], 1  ;G: w4 = 59 52 45 38 -- 39 46 --
+    movlps      xmm1, qword [block + 20 * SIZEOF_WORD]    ;F: w1 = 20 21 22 23 37 44 51 58
+    pinsrw      xmm4, word [block + 31 * SIZEOF_WORD], 4  ;G: w4 = 59 52 45 38 31 39 46 --
+    pshuflw     xmm1, xmm1, 01110010b                     ;F: w1 = 22 20 23 21 37 44 51 58
+    pinsrw      xmm4, word [block + 53 * SIZEOF_WORD], 7  ;G: w4 = 59 52 45 38 31 39 46 53
+                                                          ;        (Row 6, offset 1)
+    adc         code_temp, -1                             ;Z:     code_temp += -1 + (code_temp >= 0 ? 1 : 0);
+    pxor        xmm2, xmm2                                ;G: w2[i] = 0;
+    pcmpgtw     xmm0, xmm4                                ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
+    pinsrw      xmm1, word [block + 15 * SIZEOF_WORD], 1  ;F: w1 = 22 15 23 21 37 44 51 58
+    paddw       xmm4, xmm0                                ;G: w4[i] += w0[i];
+    movaps      XMMWORD [t + 48 * SIZEOF_WORD], xmm4      ;G: t[48+i] = w4[i];
+    movd        mm_temp, code_temp                        ;Z:     temp = code_temp
+    pinsrw      xmm1, word [block + 30 * SIZEOF_WORD], 3  ;F: w1 = 22 15 23 30 37 44 51 58
+                                                          ;        (Row 5, offset 1)
+    pcmpeqw     xmm4, xmm2                                ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
+
+    packsswb    xmm4, xmm3                                ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
+                                                          ;    w/ signed saturation
+
+    lea         t, [t - SIZEOF_WORD]                      ;Z:     t = &t[-1]
+    pxor        xmm0, xmm0                                ;F: w0[i] = 0;
+    pcmpgtw     xmm2, xmm1                                ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
+    paddw       xmm1, xmm2                                ;F: w1[i] += w2[i];
+    movaps      XMMWORD [t + (40+1) * SIZEOF_WORD], xmm1  ;F: t[40+i] = w1[i];
+    pcmpeqw     xmm1, xmm0                                ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
+    pinsrw      xmm5, word [block + 42 * SIZEOF_WORD], 0  ;E: w5 = 42 49 56 57 50 51 58 59
+    pinsrw      xmm5, word [block + 43 * SIZEOF_WORD], 5  ;E: w5 = 42 49 56 57 50 43 58 59
+    pinsrw      xmm5, word [block + 36 * SIZEOF_WORD], 6  ;E: w5 = 42 49 56 57 50 43 36 59
+    pinsrw      xmm5, word [block + 29 * SIZEOF_WORD], 7  ;E: w5 = 42 49 56 57 50 43 36 29
+                                                          ;        (Row 4, offset 1)
+%undef block
+%define nbits  edx
+%define nbitsb  dl
+%define nbitsh  dh
+    movzx       nbits, byte [NBITS(code_temp)]            ;Z:     nbits = JPEG_NBITS(code_temp);
+%undef code_temp
+%define state  esi
+    pxor        xmm2, xmm2                                ;E: w2[i] = 0;
+    mov         state, [frame + arg_state]
+    movd        mm_nbits, nbits                           ;Z:     nbits --> MMX register
+    pcmpgtw     xmm0, xmm5                                ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
+    movd        mm_code, dword [dctbl + c_derived_tbl.ehufco + nbits * 4]
+                                                          ;Z:     code = dctbl->ehufco[nbits];
+%define size  ecx
+%define sizeb  cl
+%define sizeh  ch
+    paddw       xmm5, xmm0                                ;E: w5[i] += w0[i];
+    movaps      XMMWORD [t + (32+1) * SIZEOF_WORD], xmm5  ;E: t[32+i] = w5[i];
+    movzx       size, byte [dctbl + c_derived_tbl.ehufsi + nbits]
+                                                          ;Z:     size = dctbl->ehufsi[nbits];
+%undef dctbl
+    pcmpeqw     xmm5, xmm2                                ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
+
+    packsswb    xmm5, xmm1                                ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
+                                                          ;    w/ signed saturation
+
+    movq        mm_put_buffer, [state + working_state.cur.put_buffer.simd]
+                                                          ;Z:     put_buffer = state->cur.put_buffer.simd;
+    mov         free_bits, [state + working_state.cur.free_bits]
+                                                          ;Z:     free_bits = state->cur.free_bits;
+%undef state
+%define actbl  esi
+    mov         actbl, [frame + arg_actbl]
+%define buffer  eax
+    mov         buffer, [frame + arg_buffer]
+%undef frame
+    jmp        .BEGIN
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+; size <= 32, so this is not really a loop
+.BRLOOP1:                                                 ; .BRLOOP1:
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+                                                          ; nbits = actbl->ehufsi[0xf0];
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+                                                          ; code = actbl->ehufco[0xf0];
+    and         index, 0x7ffffff                          ; clear index if size == 32
+    sub         size, 16                                  ; size -= 16;
+    sub         free_bits, nbits                          ; if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_BRLOOP1                             ;   goto .EMIT_BRLOOP1;
+    movd        mm_nbits, nbits                           ; nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ; put_buffer <<= nbits;
+    por         mm_put_buffer, mm_code                    ; put_buffer |= code;
+    jmp         .ERLOOP1                                  ; goto .ERLOOP1;
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+%ifdef PIC
+    times 6     nop
+%else
+    times 2     nop
+%endif
+.BLOOP1:                                                  ; do {  /* size = # of zero bits/elements to skip */
+; if size == 32, index remains unchanged.  Correct in .BRLOOP.
+    shr         index, sizeb                              ;   index >>= size;
+    lea         t, [t + size * SIZEOF_WORD]               ;   t += size;
+    cmp         size, 16                                  ;   if (size > 16)
+    jg          .BRLOOP1                                  ;     goto .BRLOOP1;
+.ERLOOP1:                                                 ; .ERLOOP1:
+    movsx       nbits, word [t]                           ;   nbits = *t;
+%ifdef PIC
+    add         size, size                                ;   size += size;
+%else
+    lea         size, [size * 2]                          ;   size += size;
+%endif
+    movd        mm_temp, nbits                            ;   temp = nbits;
+    movzx       nbits, byte [NBITS(nbits)]                ;   nbits = JPEG_NBITS(nbits);
+    lea         size, [size * 8 + nbits]                  ;   size = size * 8 + nbits;
+    movd        mm_nbits, nbits                           ;   nbits --> MMX register
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
+                                                          ;   code = actbl->ehufco[size-16];
+    movzx       size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
+                                                          ;   size = actbl->ehufsi[size-16];
+.BEGIN:                                                   ; .BEGIN:
+    pand        mm_temp, [MASK_BITS(nbits)]               ;   temp &= (1 << nbits) - 1;
+    psllq       mm_code, mm_nbits                         ;   code <<= nbits;
+    add         nbits, size                               ;   nbits += size;
+    por         mm_code, mm_temp                          ;   code |= temp;
+    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_ERLOOP1                             ;     insert code, flush buffer, init size, goto .BLOOP1
+    xor         size, size                                ;   size = 0;  /* kill tzcnt input dependency */
+    tzcnt       size, index                               ;   size = # of trailing 0 bits in index
+    movd        mm_nbits, nbits                           ;   nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ;   put_buffer <<= nbits;
+    inc         size                                      ;   ++size;
+    por         mm_put_buffer, mm_code                    ;   put_buffer |= code;
+    test        index, index
+    jnz         .BLOOP1                                   ; } while (index != 0);
+; Round 2
+; t points to the last used word, possibly below t_ if the previous index had 32 zero bits.
+.ELOOP1:                                                  ; .ELOOP1:
+    pmovmskb    size, xmm4                                ; size = 0;  size |= ((b4[i] >> 7) << i);
+    pmovmskb    index, xmm5                               ; index = 0;  index |= ((b5[i] >> 7) << i);
+    shl         size, 16                                  ; size <<= 16;
+    or          index, size                               ; index |= size;
+    not         index                                     ; index = ~index;
+    lea         nbits, [t + (1 + DCTSIZE2) * SIZEOF_WORD]
+                                                          ; nbits = t + 1 + 64;
+    and         nbits, -DCTSIZE2 * SIZEOF_WORD            ; nbits &= -128;  /* now points to &t_[64] */
+    sub         nbits, t                                  ; nbits -= t;
+    shr         nbits, 1                                  ; nbits >>= 1;  /* # of leading 0 bits in old index + 33 */
+    tzcnt       size, index                               ; size = # of trailing 0 bits in index
+    inc         size                                      ; ++size;
+    test        index, index                              ; if (index == 0)
+    jz          .ELOOP2                                   ;   goto .ELOOP2;
+; NOTE: size == 32 cannot happen, since the last element is always 0.
+    shr         index, sizeb                              ; index >>= size;
+    lea         size, [size + nbits - 33]                 ; size = size + nbits - 33;
+    lea         t, [t + size * SIZEOF_WORD]               ; t += size;
+    cmp         size, 16                                  ; if (size <= 16)
+    jle         .ERLOOP2                                  ;   goto .ERLOOP2;
+.BRLOOP2:                                                 ; do {
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+                                                          ;   nbits = actbl->ehufsi[0xf0];
+    sub         size, 16                                  ;   size -= 16;
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+                                                          ;   code = actbl->ehufco[0xf0];
+    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_BRLOOP2                             ;     insert code and flush put_buffer
+    movd        mm_nbits, nbits                           ;   else { nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ;     put_buffer <<= nbits;
+    por         mm_put_buffer, mm_code                    ;     put_buffer |= code;
+    cmp         size, 16                                  ;     if (size <= 16)
+    jle        .ERLOOP2                                   ;       goto .ERLOOP2;
+    jmp        .BRLOOP2                                   ; } while (1);
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align      16
+.BLOOP2:                                                  ; do {  /* size = # of zero bits/elements to skip */
+    shr         index, sizeb                              ;   index >>= size;
+    lea         t, [t + size * SIZEOF_WORD]               ;   t += size;
+    cmp         size, 16                                  ;   if (size > 16)
+    jg          .BRLOOP2                                  ;     goto .BRLOOP2;
+.ERLOOP2:                                                 ; .ERLOOP2:
+    movsx       nbits, word [t]                           ;   nbits = *t;
+    add         size, size                                ;   size += size;
+    movd        mm_temp, nbits                            ;   temp = nbits;
+    movzx       nbits, byte [NBITS(nbits)]                ;   nbits = JPEG_NBITS(nbits);
+    movd        mm_nbits, nbits                           ;   nbits --> MMX register
+    lea         size, [size * 8 + nbits]                  ;   size = size * 8 + nbits;
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
+                                                          ;   code = actbl->ehufco[size-16];
+    movzx       size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
+                                                          ;   size = actbl->ehufsi[size-16];
+    psllq       mm_code, mm_nbits                         ;   code <<= nbits;
+    pand        mm_temp, [MASK_BITS(nbits)]               ;   temp &= (1 << nbits) - 1;
+    lea         nbits, [nbits + size]                     ;   nbits += size;
+    por         mm_code, mm_temp                          ;   code |= temp;
+    xor         size, size                                ;   size = 0;  /* kill tzcnt input dependency */
+    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_ERLOOP2                             ;     insert code, flush buffer, init size, goto .BLOOP2
+    tzcnt       size, index                               ;   size = # of trailing 0 bits in index
+    movd        mm_nbits, nbits                           ;   nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ;   put_buffer <<= nbits;
+    inc         size                                      ;   ++size;
+    por         mm_put_buffer, mm_code                    ;   put_buffer |= code;
+    test        index, index
+    jnz         .BLOOP2                                   ; } while (index != 0);
+.ELOOP2:                                                  ; .ELOOP2:
+    mov         nbits, t                                  ; nbits = t;
+    lea         t, [t + SIZEOF_WORD]                      ; t = &t[1];
+    and         nbits, DCTSIZE2 * SIZEOF_WORD - 1         ; nbits &= 127;
+    and         t, -DCTSIZE2 * SIZEOF_WORD                ; t &= -128;  /* t = &t_[0]; */
+    cmp         nbits, (DCTSIZE2 - 2) * SIZEOF_WORD       ; if (nbits != 62 * 2)
+    je          .EFN                                      ; {
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0]
+                                                          ;   code = actbl->ehufco[0];
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
+                                                          ;   nbits = actbl->ehufsi[0];
+    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
+    jg          .EFN_SKIP_EMIT_CODE                       ;   {
+    EMIT_QWORD  size, sizeb, sizeh, , , , , , .EFN        ;     insert code, flush put_buffer
+    align       16
+.EFN_SKIP_EMIT_CODE:                                      ;   } else {
+    movd        mm_nbits, nbits                           ;     nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ;     put_buffer <<= nbits;
+    por         mm_put_buffer, mm_code                    ;     put_buffer |= code;
+.EFN:                                                     ; } }
+%define frame  esp
+    mov         frame, [t + save_frame]
+%define state  ecx
+    mov         state, [frame + arg_state]
+    movq        [state + working_state.cur.put_buffer.simd], mm_put_buffer
+                                                          ; state->cur.put_buffer.simd = put_buffer;
+    emms
+    mov         [state + working_state.cur.free_bits], free_bits
+                                                          ; state->cur.free_bits = free_bits;
+    POP         edi
+    POP         esi
+    POP         ebp
+    POP         ebx
+    ret
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_BRLOOP1:
+    EMIT_QWORD  emit_temp, emit_tempb, emit_temph, , , , , , \
+      .ERLOOP1
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_ERLOOP1:
+    EMIT_QWORD  size, sizeb, sizeh, \
+      { xor     size, size }, \
+      { tzcnt   size, index }, \
+      { inc     size }, \
+      { test    index, index }, \
+      { jnz     .BLOOP1 }, \
+      .ELOOP1
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_BRLOOP2:
+    EMIT_QWORD  emit_temp, emit_tempb, emit_temph, , , , \
+      { cmp     size, 16 }, \
+      { jle     .ERLOOP2 }, \
+      .BRLOOP2
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_ERLOOP2:
+    EMIT_QWORD  size, sizeb, sizeh, \
+      { xor     size, size }, \
+      { tzcnt   size, index }, \
+      { inc     size }, \
+      { test    index, index }, \
+      { jnz     .BLOOP2 }, \
+      .ELOOP2
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcphuff-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcphuff-sse2.asm
new file mode 100644
index 0000000000..c26b48a47d
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcphuff-sse2.asm
@@ -0,0 +1,662 @@
+;
+; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2)
+;
+; Copyright (C) 2016, 2018, Matthieu Darbois
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation of data preparation for progressive
+; Huffman encoding.  See jcphuff.c for more details.
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+; --------------------------------------------------------------------------
+; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
+; jsimd_encode_mcu_AC_refine_prepare_sse2()
+
+%macro LOAD16 0
+    pxor        N0, N0
+    pxor        N1, N1
+
+    mov         T0, INT [LUT +  0*SIZEOF_INT]
+    mov         T1, INT [LUT +  8*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 0
+    pinsrw      X1, word [BLOCK + T1 * 2], 0
+
+    mov         T0, INT [LUT +  1*SIZEOF_INT]
+    mov         T1, INT [LUT +  9*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 1
+    pinsrw      X1, word [BLOCK + T1 * 2], 1
+
+    mov         T0, INT [LUT +  2*SIZEOF_INT]
+    mov         T1, INT [LUT + 10*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 2
+    pinsrw      X1, word [BLOCK + T1 * 2], 2
+
+    mov         T0, INT [LUT +  3*SIZEOF_INT]
+    mov         T1, INT [LUT + 11*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 3
+    pinsrw      X1, word [BLOCK + T1 * 2], 3
+
+    mov         T0, INT [LUT +  4*SIZEOF_INT]
+    mov         T1, INT [LUT + 12*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 4
+    pinsrw      X1, word [BLOCK + T1 * 2], 4
+
+    mov         T0, INT [LUT +  5*SIZEOF_INT]
+    mov         T1, INT [LUT + 13*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 5
+    pinsrw      X1, word [BLOCK + T1 * 2], 5
+
+    mov         T0, INT [LUT +  6*SIZEOF_INT]
+    mov         T1, INT [LUT + 14*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 6
+    pinsrw      X1, word [BLOCK + T1 * 2], 6
+
+    mov         T0, INT [LUT +  7*SIZEOF_INT]
+    mov         T1, INT [LUT + 15*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 7
+    pinsrw      X1, word [BLOCK + T1 * 2], 7
+%endmacro
+
+%macro LOAD15 0
+    pxor        N0, N0
+    pxor        N1, N1
+    pxor        X1, X1
+
+    mov         T0, INT [LUT +  0*SIZEOF_INT]
+    mov         T1, INT [LUT +  8*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 0
+    pinsrw      X1, word [BLOCK + T1 * 2], 0
+
+    mov         T0, INT [LUT +  1*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 1
+
+    mov         T0, INT [LUT +  2*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 2
+
+    mov         T0, INT [LUT +  3*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 3
+
+    mov         T0, INT [LUT +  4*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 4
+
+    mov         T0, INT [LUT +  5*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 5
+
+    mov         T0, INT [LUT +  6*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 6
+
+    mov         T0, INT [LUT +  7*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 7
+
+    cmp         LENEND, 2
+    jl          %%.ELOAD15
+    mov         T1, INT [LUT +  9*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 1
+
+    cmp         LENEND, 3
+    jl          %%.ELOAD15
+    mov         T1, INT [LUT + 10*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 2
+
+    cmp         LENEND, 4
+    jl          %%.ELOAD15
+    mov         T1, INT [LUT + 11*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 3
+
+    cmp         LENEND, 5
+    jl          %%.ELOAD15
+    mov         T1, INT [LUT + 12*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 4
+
+    cmp         LENEND, 6
+    jl          %%.ELOAD15
+    mov         T1, INT [LUT + 13*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 5
+
+    cmp         LENEND, 7
+    jl          %%.ELOAD15
+    mov         T1, INT [LUT + 14*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 6
+%%.ELOAD15:
+%endmacro
+
+%macro LOAD8 0
+    pxor        N0, N0
+
+    mov         T0, INT [LUT +  0*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 0
+
+    mov         T0, INT [LUT +  1*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 1
+
+    mov         T0, INT [LUT +  2*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 2
+
+    mov         T0, INT [LUT +  3*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 3
+
+    mov         T0, INT [LUT +  4*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 4
+
+    mov         T0, INT [LUT +  5*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 5
+
+    mov         T0, INT [LUT +  6*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 6
+
+    mov         T0, INT [LUT +  7*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 7
+%endmacro
+
+%macro LOAD7 0
+    pxor        N0, N0
+    pxor        X0, X0
+
+    mov         T1, INT [LUT +  0*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 0
+
+    cmp         LENEND, 2
+    jl          %%.ELOAD7
+    mov         T1, INT [LUT +  1*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 1
+
+    cmp         LENEND, 3
+    jl          %%.ELOAD7
+    mov         T1, INT [LUT +  2*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 2
+
+    cmp         LENEND, 4
+    jl          %%.ELOAD7
+    mov         T1, INT [LUT +  3*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 3
+
+    cmp         LENEND, 5
+    jl          %%.ELOAD7
+    mov         T1, INT [LUT +  4*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 4
+
+    cmp         LENEND, 6
+    jl          %%.ELOAD7
+    mov         T1, INT [LUT +  5*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 5
+
+    cmp         LENEND, 7
+    jl          %%.ELOAD7
+    mov         T1, INT [LUT +  6*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 6
+%%.ELOAD7:
+%endmacro
+
+%macro REDUCE0 0
+    movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
+    movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
+    movdqa      xmm2, XMMWORD [VALUES + (16*2)]
+    movdqa      xmm3, XMMWORD [VALUES + (24*2)]
+    movdqa      xmm4, XMMWORD [VALUES + (32*2)]
+    movdqa      xmm5, XMMWORD [VALUES + (40*2)]
+    movdqa      xmm6, XMMWORD [VALUES + (48*2)]
+
+    pcmpeqw     xmm0, ZERO
+    pcmpeqw     xmm1, ZERO
+    pcmpeqw     xmm2, ZERO
+    pcmpeqw     xmm3, ZERO
+    pcmpeqw     xmm4, ZERO
+    pcmpeqw     xmm5, ZERO
+    pcmpeqw     xmm6, ZERO
+    pcmpeqw     xmm7, XMMWORD [VALUES + (56*2)]
+
+    packsswb    xmm0, xmm1
+    packsswb    xmm2, xmm3
+    packsswb    xmm4, xmm5
+    packsswb    xmm6, xmm7
+
+    pmovmskb    eax, xmm0
+    pmovmskb    ecx, xmm2
+    pmovmskb    edx, xmm4
+    pmovmskb    esi, xmm6
+
+    shl         ecx, 16
+    shl         esi, 16
+
+    or          eax, ecx
+    or          edx, esi
+
+    not         eax
+    not         edx
+
+    mov         edi, ZEROBITS
+
+    mov         INT [edi], eax
+    mov         INT [edi+SIZEOF_INT], edx
+%endmacro
+
+;
+; Prepare data for jsimd_encode_mcu_AC_first().
+;
+; GLOBAL(void)
+; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
+;                                        const int *jpeg_natural_order_start,
+;                                        int Sl, int Al, JCOEF *values,
+;                                        size_t *zerobits)
+;
+; eax + 8 = const JCOEF *block
+; eax + 12 = const int *jpeg_natural_order_start
+; eax + 16 = int Sl
+; eax + 20 = int Al
+; eax + 24 = JCOEF *values
+; eax + 28 = size_t *zerobits
+
+%define ZERO    xmm7
+%define X0      xmm0
+%define X1      xmm1
+%define N0      xmm2
+%define N1      xmm3
+%define AL      xmm4
+%define K       eax
+%define LENEND  eax
+%define LUT     ebx
+%define T0      ecx
+%define T1      edx
+%define BLOCK   esi
+%define VALUES  edi
+%define LEN     ebp
+
+%define ZEROBITS  INT [esp + 5 * 4]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    sub         esp, 4
+    push        ebx
+    push        ecx
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+    push        ebp
+
+    mov         BLOCK, INT [eax + 8]
+    mov         LUT, INT [eax + 12]
+    mov         VALUES, INT [eax + 24]
+    movd        AL, INT [eax + 20]
+    mov         T0, INT [eax + 28]
+    mov         ZEROBITS, T0
+    mov         LEN, INT [eax + 16]
+    pxor        ZERO, ZERO
+    mov         K, LEN
+    and         K, -16
+    shr         K, 4
+    jz          .ELOOP16
+.BLOOP16:
+    LOAD16
+    pcmpgtw     N0, X0
+    pcmpgtw     N1, X1
+    paddw       X0, N0
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL
+    psrlw       X1, AL
+    pxor        N0, X0
+    pxor        N1, X1
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+    add         VALUES, 16*2
+    add         LUT, 16*SIZEOF_INT
+    dec         K
+    jnz         .BLOOP16
+    test        LEN, 15
+    je          .PADDING
+.ELOOP16:
+    mov         LENEND, LEN
+    and         LENEND, 7
+
+    test        LEN, 8
+    jz          .TRY7
+    test        LEN, 7
+    jz          .TRY8
+
+    LOAD15
+    pcmpgtw     N0, X0
+    pcmpgtw     N1, X1
+    paddw       X0, N0
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL
+    psrlw       X1, AL
+    pxor        N0, X0
+    pxor        N1, X1
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+    add         VALUES, 16*2
+    jmp         .PADDING
+.TRY8:
+    LOAD8
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    pxor        N0, X0
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+    add         VALUES, 8*2
+    jmp         .PADDING
+.TRY7:
+    LOAD7
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    pxor        N0, X0
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+    add         VALUES, 8*2
+.PADDING:
+    mov         K, LEN
+    add         K, 7
+    and         K, -8
+    shr         K, 3
+    sub         K, DCTSIZE2/8
+    jz          .EPADDING
+    align       16
+.ZEROLOOP:
+    movdqa      XMMWORD [VALUES + 0], ZERO
+    add         VALUES, 8*2
+    inc         K
+    jnz         .ZEROLOOP
+.EPADDING:
+    sub         VALUES, DCTSIZE2*2
+
+    REDUCE0
+
+    pop         ebp
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+    pop         ecx
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+%undef ZERO
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef LUT
+%undef T0
+%undef T1
+%undef BLOCK
+%undef VALUES
+%undef LEN
+
+;
+; Prepare data for jsimd_encode_mcu_AC_refine().
+;
+; GLOBAL(int)
+; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
+;                                         const int *jpeg_natural_order_start,
+;                                         int Sl, int Al, JCOEF *absvalues,
+;                                         size_t *bits)
+;
+; eax + 8 = const JCOEF *block
+; eax + 12 = const int *jpeg_natural_order_start
+; eax + 16 = int Sl
+; eax + 20 = int Al
+; eax + 24 = JCOEF *values
+; eax + 28 = size_t *bits
+
+%define ZERO    xmm7
+%define ONE     xmm5
+%define X0      xmm0
+%define X1      xmm1
+%define N0      xmm2
+%define N1      xmm3
+%define AL      xmm4
+%define K       eax
+%define LENEND  eax
+%define LUT     ebx
+%define T0      ecx
+%define T0w      cx
+%define T1      edx
+%define BLOCK   esi
+%define VALUES  edi
+%define KK      ebp
+
+%define ZEROBITS  INT [esp + 5 * 4]
+%define EOB       INT [esp + 5 * 4 + 4]
+%define LEN       INT [esp + 5 * 4 + 8]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    sub         esp, 16
+    push        ebx
+    push        ecx
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+    push        ebp
+
+    pcmpeqw     ONE, ONE
+    psrlw       ONE, 15
+    mov         BLOCK, INT [eax + 8]
+    mov         LUT, INT [eax + 12]
+    mov         VALUES, INT [eax + 24]
+    movd        AL, INT [eax + 20]
+    mov         T0, INT [eax + 28]
+    mov         K,  INT [eax + 16]
+    mov         INT [T0 + 2 * SIZEOF_INT], -1
+    mov         INT [T0 + 3 * SIZEOF_INT], -1
+    mov         ZEROBITS, T0
+    mov         LEN, K
+    pxor        ZERO, ZERO
+    and         K, -16
+    mov         EOB, 0
+    xor         KK, KK
+    shr         K, 4
+    jz          .ELOOPR16
+.BLOOPR16:
+    LOAD16
+    pcmpgtw     N0, X0
+    pcmpgtw     N1, X1
+    paddw       X0, N0
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL
+    psrlw       X1, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    pcmpeqw     X0, ONE
+    pcmpeqw     X1, ONE
+    packsswb    N0, N1
+    packsswb    X0, X1
+    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+    mov         T1, ZEROBITS
+    not         T0
+    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
+    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
+    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
+    jz          .CONTINUER16            ; if (idx) {
+    lea         T1, [T1+KK*8]
+    mov         EOB, T1                 ; EOB = k + idx;
+.CONTINUER16:
+    add         VALUES, 16*2
+    add         LUT, 16*SIZEOF_INT
+    add         KK, 2
+    dec         K
+    jnz         .BLOOPR16
+    test        LEN, 15
+    je          .PADDINGR
+.ELOOPR16:
+    mov         LENEND, LEN
+
+    test        LENEND, 8
+    jz          .TRYR7
+    test        LENEND, 7
+    jz          .TRYR8
+
+    and         LENEND, 7
+    LOAD15
+    pcmpgtw     N0, X0
+    pcmpgtw     N1, X1
+    paddw       X0, N0
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL
+    psrlw       X1, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    pcmpeqw     X0, ONE
+    pcmpeqw     X1, ONE
+    packsswb    N0, N1
+    packsswb    X0, X1
+    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+    mov         T1, ZEROBITS
+    not         T0
+    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
+    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
+    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
+    jz          .CONTINUER15            ; if (idx) {
+    lea         T1, [T1+KK*8]
+    mov         EOB, T1                 ; EOB = k + idx;
+.CONTINUER15:
+    add         VALUES, 16*2
+    jmp         .PADDINGR
+.TRYR8:
+    LOAD8
+
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    pcmpeqw     X0, ONE
+    packsswb    N0, ZERO
+    packsswb    X0, ZERO
+    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+    mov         T1, ZEROBITS
+    not         T0
+    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
+    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
+    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
+    jz          .CONTINUER8             ; if (idx) {
+    lea         T1, [T1+KK*8]
+    mov         EOB, T1                 ; EOB = k + idx;
+.CONTINUER8:
+    add         VALUES, 8*2
+    jmp         .PADDINGR
+.TRYR7:
+    and         LENEND, 7
+    LOAD7
+
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    pcmpeqw     X0, ONE
+    packsswb    N0, ZERO
+    packsswb    X0, ZERO
+    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+    mov         T1, ZEROBITS
+    not         T0
+    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
+    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
+    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
+    jz          .CONTINUER7             ; if (idx) {
+    lea         T1, [T1+KK*8]
+    mov         EOB, T1                 ; EOB = k + idx;
+.CONTINUER7:
+    add         VALUES, 8*2
+.PADDINGR:
+    mov         K, LEN
+    add         K, 7
+    and         K, -8
+    shr         K, 3
+    sub         K, DCTSIZE2/8
+    jz          .EPADDINGR
+    align       16
+.ZEROLOOPR:
+    movdqa      XMMWORD [VALUES + 0], ZERO
+    add         VALUES, 8*2
+    inc         K
+    jnz         .ZEROLOOPR
+.EPADDINGR:
+    sub         VALUES, DCTSIZE2*2
+
+    REDUCE0
+
+    mov         eax, EOB
+
+    pop         ebp
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+    pop         ecx
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+%undef ZERO
+%undef ONE
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef KK
+%undef EOB
+%undef SIGN
+%undef LUT
+%undef T0
+%undef T1
+%undef BLOCK
+%undef VALUES
+%undef LEN
+%undef LENEND
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-avx2.asm
new file mode 100644
index 0000000000..0a20802dd8
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-avx2.asm
@@ -0,0 +1,388 @@
+;
+; jcsample.asm - downsampling (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+%define img_width(b)    (b) + 8         ; JDIMENSION image_width
+%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
+%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
+%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
+%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)
+
+EXTN(jsimd_h2v1_downsample_avx2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         ecx, JDIMENSION [width_blks(ebp)]
+    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
+    jz          near .return
+
+    mov         edx, JDIMENSION [img_width(ebp)]
+
+    ; -- expand_right_edge
+
+    push        ecx
+    shl         ecx, 1                  ; output_cols * 2
+    sub         ecx, edx
+    jle         short .expand_end
+
+    mov         eax, INT [max_v_samp(ebp)]
+    test        eax, eax
+    jle         short .expand_end
+
+    cld
+    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
+    alignx      16, 7
+.expandloop:
+    push        eax
+    push        ecx
+
+    mov         edi, JSAMPROW [esi]
+    add         edi, edx
+    mov         al, JSAMPLE [edi-1]
+
+    rep stosb
+
+    pop         ecx
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    dec         eax
+    jg          short .expandloop
+
+.expand_end:
+    pop         ecx                     ; output_cols
+
+    ; -- h2v1_downsample
+
+    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         edx, 0x00010000         ; bias pattern
+    vmovd       xmm7, edx
+    vpshufd     xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
+    vpcmpeqw    ymm6, ymm6, ymm6
+    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
+    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
+    alignx      16, 7
+.rowloop:
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         short .columnloop
+    alignx      16, 7
+
+.columnloop_r24:
+    ; ecx can possibly be 8, 16, 24
+    cmp         ecx, 24
+    jne         .columnloop_r16
+    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     xmm1, XMMWORD [esi+1*SIZEOF_YMMWORD]
+    mov         ecx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop_r16:
+    cmp         ecx, 16
+    jne         .columnloop_r8
+    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vpxor       ymm1, ymm1, ymm1
+    mov         ecx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop_r8:
+    vmovdqu     xmm0, XMMWORD[esi+0*SIZEOF_YMMWORD]
+    vpxor       ymm1, ymm1, ymm1
+    mov         ecx, SIZEOF_YMMWORD
+    jmp         short .downsample
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [esi+1*SIZEOF_YMMWORD]
+
+.downsample:
+    vpsrlw      ymm2, ymm0, BYTE_BIT
+    vpand       ymm0, ymm0, ymm6
+    vpsrlw      ymm3, ymm1, BYTE_BIT
+    vpand       ymm1, ymm1, ymm6
+
+    vpaddw      ymm0, ymm0, ymm2
+    vpaddw      ymm1, ymm1, ymm3
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm1, ymm1, ymm7
+    vpsrlw      ymm0, ymm0, 1
+    vpsrlw      ymm1, ymm1, 1
+
+    vpackuswb   ymm0, ymm0, ymm1
+    vpermq      ymm0, ymm0, 0xd8
+
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
+
+    sub         ecx, byte SIZEOF_YMMWORD    ; outcol
+    add         esi, byte 2*SIZEOF_YMMWORD  ; inptr
+    add         edi, byte 1*SIZEOF_YMMWORD  ; outptr
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         short .columnloop
+    test        ecx, ecx
+    jnz         near .columnloop_r24
+
+    pop         esi
+    pop         edi
+    pop         ecx
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         eax                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+%define img_width(b)    (b) + 8         ; JDIMENSION image_width
+%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
+%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
+%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
+%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)
+
+EXTN(jsimd_h2v2_downsample_avx2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         ecx, JDIMENSION [width_blks(ebp)]
+    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
+    jz          near .return
+
+    mov         edx, JDIMENSION [img_width(ebp)]
+
+    ; -- expand_right_edge
+
+    push        ecx
+    shl         ecx, 1                  ; output_cols * 2
+    sub         ecx, edx
+    jle         short .expand_end
+
+    mov         eax, INT [max_v_samp(ebp)]
+    test        eax, eax
+    jle         short .expand_end
+
+    cld
+    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
+    alignx      16, 7
+.expandloop:
+    push        eax
+    push        ecx
+
+    mov         edi, JSAMPROW [esi]
+    add         edi, edx
+    mov         al, JSAMPLE [edi-1]
+
+    rep stosb
+
+    pop         ecx
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    dec         eax
+    jg          short .expandloop
+
+.expand_end:
+    pop         ecx                     ; output_cols
+
+    ; -- h2v2_downsample
+
+    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         edx, 0x00020001         ; bias pattern
+    vmovd       xmm7, edx
+    vpcmpeqw    ymm6, ymm6, ymm6
+    vpshufd     xmm7, xmm7, 0x00        ; ymm7={1, 2, 1, 2, 1, 2, 1, 2}
+    vperm2i128  ymm7, ymm7, ymm7, 0
+    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
+    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
+    alignx      16, 7
+.rowloop:
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
+    mov         edi, JSAMPROW [edi]                    ; outptr
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         short .columnloop
+    alignx      16, 7
+
+.columnloop_r24:
+    cmp         ecx, 24
+    jne         .columnloop_r16
+    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     xmm2, XMMWORD [edx+1*SIZEOF_YMMWORD]
+    vmovdqu     xmm3, XMMWORD [esi+1*SIZEOF_YMMWORD]
+    mov         ecx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop_r16:
+    cmp         ecx, 16
+    jne         .columnloop_r8
+    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vpxor       ymm2, ymm2, ymm2
+    vpxor       ymm3, ymm3, ymm3
+    mov         ecx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop_r8:
+    vmovdqu     xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+    vmovdqu     xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    vpxor       ymm2, ymm2, ymm2
+    vpxor       ymm3, ymm3, ymm3
+    mov         ecx, SIZEOF_YMMWORD
+    jmp         short .downsample
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm2, YMMWORD [edx+1*SIZEOF_YMMWORD]
+    vmovdqu     ymm3, YMMWORD [esi+1*SIZEOF_YMMWORD]
+
+.downsample:
+    vpand       ymm4, ymm0, ymm6
+    vpsrlw      ymm0, ymm0, BYTE_BIT
+    vpand       ymm5, ymm1, ymm6
+    vpsrlw      ymm1, ymm1, BYTE_BIT
+    vpaddw      ymm0, ymm0, ymm4
+    vpaddw      ymm1, ymm1, ymm5
+
+    vpand       ymm4, ymm2, ymm6
+    vpsrlw      ymm2, ymm2, BYTE_BIT
+    vpand       ymm5, ymm3, ymm6
+    vpsrlw      ymm3, ymm3, BYTE_BIT
+    vpaddw      ymm2, ymm2, ymm4
+    vpaddw      ymm3, ymm3, ymm5
+
+    vpaddw      ymm0, ymm0, ymm1
+    vpaddw      ymm2, ymm2, ymm3
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm2, ymm2, ymm7
+    vpsrlw      ymm0, ymm0, 2
+    vpsrlw      ymm2, ymm2, 2
+
+    vpackuswb   ymm0, ymm0, ymm2
+    vpermq      ymm0, ymm0, 0xd8
+
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
+
+    sub         ecx, byte SIZEOF_YMMWORD    ; outcol
+    add         edx, byte 2*SIZEOF_YMMWORD  ; inptr0
+    add         esi, byte 2*SIZEOF_YMMWORD  ; inptr1
+    add         edi, byte 1*SIZEOF_YMMWORD  ; outptr
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .columnloop_r24
+
+    pop         esi
+    pop         edi
+    pop         ecx
+
+    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
+    dec         eax                          ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-mmx.asm
new file mode 100644
index 0000000000..2c223eebe8
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-mmx.asm
@@ -0,0 +1,324 @@
+;
+; jcsample.asm - downsampling (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
+;                           JDIMENSION v_samp_factor,
+;                           JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                           JSAMPARRAY output_data);
+;
+
+%define img_width(b)    (b) + 8         ; JDIMENSION image_width
+%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
+%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
+%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
+%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_downsample_mmx)
+
+EXTN(jsimd_h2v1_downsample_mmx):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         ecx, JDIMENSION [width_blks(ebp)]
+    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
+    jz          near .return
+
+    mov         edx, JDIMENSION [img_width(ebp)]
+
+    ; -- expand_right_edge
+
+    push        ecx
+    shl         ecx, 1                  ; output_cols * 2
+    sub         ecx, edx
+    jle         short .expand_end
+
+    mov         eax, INT [max_v_samp(ebp)]
+    test        eax, eax
+    jle         short .expand_end
+
+    cld
+    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
+    alignx      16, 7
+.expandloop:
+    push        eax
+    push        ecx
+
+    mov         edi, JSAMPROW [esi]
+    add         edi, edx
+    mov         al, JSAMPLE [edi-1]
+
+    rep stosb
+
+    pop         ecx
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    dec         eax
+    jg          short .expandloop
+
+.expand_end:
+    pop         ecx                     ; output_cols
+
+    ; -- h2v1_downsample
+
+    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         edx, 0x00010000         ; bias pattern
+    movd        mm7, edx
+    pcmpeqw     mm6, mm6
+    punpckldq   mm7, mm7                ; mm7={0, 1, 0, 1}
+    psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
+    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
+    alignx      16, 7
+.rowloop:
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mm1, MMWORD [esi+1*SIZEOF_MMWORD]
+    movq        mm2, mm0
+    movq        mm3, mm1
+
+    pand        mm0, mm6
+    psrlw       mm2, BYTE_BIT
+    pand        mm1, mm6
+    psrlw       mm3, BYTE_BIT
+
+    paddw       mm0, mm2
+    paddw       mm1, mm3
+    paddw       mm0, mm7
+    paddw       mm1, mm7
+    psrlw       mm0, 1
+    psrlw       mm1, 1
+
+    packuswb    mm0, mm1
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+    add         esi, byte 2*SIZEOF_MMWORD  ; inptr
+    add         edi, byte 1*SIZEOF_MMWORD  ; outptr
+    sub         ecx, byte SIZEOF_MMWORD    ; outcol
+    jnz         short .columnloop
+
+    pop         esi
+    pop         edi
+    pop         ecx
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         eax                        ; rowctr
+    jg          short .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
+;                           JDIMENSION v_samp_factor,
+;                           JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                           JSAMPARRAY output_data);
+;
+
+%define img_width(b)    (b) + 8         ; JDIMENSION image_width
+%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
+%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
+%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
+%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_downsample_mmx)
+
+EXTN(jsimd_h2v2_downsample_mmx):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         ecx, JDIMENSION [width_blks(ebp)]
+    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
+    jz          near .return
+
+    mov         edx, JDIMENSION [img_width(ebp)]
+
+    ; -- expand_right_edge
+
+    push        ecx
+    shl         ecx, 1                  ; output_cols * 2
+    sub         ecx, edx
+    jle         short .expand_end
+
+    mov         eax, INT [max_v_samp(ebp)]
+    test        eax, eax
+    jle         short .expand_end
+
+    cld
+    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
+    alignx      16, 7
+.expandloop:
+    push        eax
+    push        ecx
+
+    mov         edi, JSAMPROW [esi]
+    add         edi, edx
+    mov         al, JSAMPLE [edi-1]
+
+    rep stosb
+
+    pop         ecx
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    dec         eax
+    jg          short .expandloop
+
+.expand_end:
+    pop         ecx                     ; output_cols
+
+    ; -- h2v2_downsample
+
+    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         edx, 0x00020001         ; bias pattern
+    movd        mm7, edx
+    pcmpeqw     mm6, mm6
+    punpckldq   mm7, mm7                ; mm7={1, 2, 1, 2}
+    psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
+    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
+    alignx      16, 7
+.rowloop:
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
+    mov         edi, JSAMPROW [edi]                    ; outptr
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [edx+0*SIZEOF_MMWORD]
+    movq        mm1, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mm2, MMWORD [edx+1*SIZEOF_MMWORD]
+    movq        mm3, MMWORD [esi+1*SIZEOF_MMWORD]
+
+    movq        mm4, mm0
+    movq        mm5, mm1
+    pand        mm0, mm6
+    psrlw       mm4, BYTE_BIT
+    pand        mm1, mm6
+    psrlw       mm5, BYTE_BIT
+    paddw       mm0, mm4
+    paddw       mm1, mm5
+
+    movq        mm4, mm2
+    movq        mm5, mm3
+    pand        mm2, mm6
+    psrlw       mm4, BYTE_BIT
+    pand        mm3, mm6
+    psrlw       mm5, BYTE_BIT
+    paddw       mm2, mm4
+    paddw       mm3, mm5
+
+    paddw       mm0, mm1
+    paddw       mm2, mm3
+    paddw       mm0, mm7
+    paddw       mm2, mm7
+    psrlw       mm0, 2
+    psrlw       mm2, 2
+
+    packuswb    mm0, mm2
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+    add         edx, byte 2*SIZEOF_MMWORD  ; inptr0
+    add         esi, byte 2*SIZEOF_MMWORD  ; inptr1
+    add         edi, byte 1*SIZEOF_MMWORD  ; outptr
+    sub         ecx, byte SIZEOF_MMWORD    ; outcol
+    jnz         near .columnloop
+
+    pop         esi
+    pop         edi
+    pop         ecx
+
+    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
+    dec         eax                          ; rowctr
+    jg          near .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-sse2.asm
new file mode 100644
index 0000000000..4fea60d2e2
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-sse2.asm
@@ -0,0 +1,351 @@
+;
+; jcsample.asm - downsampling (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+%define img_width(b)    (b) + 8         ; JDIMENSION image_width
+%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
+%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
+%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
+%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)
+
+EXTN(jsimd_h2v1_downsample_sse2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         ecx, JDIMENSION [width_blks(ebp)]
+    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
+    jz          near .return
+
+    mov         edx, JDIMENSION [img_width(ebp)]
+
+    ; -- expand_right_edge
+
+    push        ecx
+    shl         ecx, 1                  ; output_cols * 2
+    sub         ecx, edx
+    jle         short .expand_end
+
+    mov         eax, INT [max_v_samp(ebp)]
+    test        eax, eax
+    jle         short .expand_end
+
+    cld
+    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
+    alignx      16, 7
+.expandloop:
+    push        eax
+    push        ecx
+
+    mov         edi, JSAMPROW [esi]
+    add         edi, edx
+    mov         al, JSAMPLE [edi-1]
+
+    rep stosb
+
+    pop         ecx
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    dec         eax
+    jg          short .expandloop
+
+.expand_end:
+    pop         ecx                     ; output_cols
+
+    ; -- h2v1_downsample
+
+    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         edx, 0x00010000         ; bias pattern
+    movd        xmm7, edx
+    pcmpeqw     xmm6, xmm6
+    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
+    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
+    alignx      16, 7
+.rowloop:
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         short .columnloop
+    alignx      16, 7
+
+.columnloop_r8:
+    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    pxor        xmm1, xmm1
+    mov         ecx, SIZEOF_XMMWORD
+    jmp         short .downsample
+    alignx      16, 7
+
+.columnloop:
+    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqa      xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm1
+
+    pand        xmm0, xmm6
+    psrlw       xmm2, BYTE_BIT
+    pand        xmm1, xmm6
+    psrlw       xmm3, BYTE_BIT
+
+    paddw       xmm0, xmm2
+    paddw       xmm1, xmm3
+    paddw       xmm0, xmm7
+    paddw       xmm1, xmm7
+    psrlw       xmm0, 1
+    psrlw       xmm1, 1
+
+    packuswb    xmm0, xmm1
+
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+
+    sub         ecx, byte SIZEOF_XMMWORD    ; outcol
+    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr
+    add         edi, byte 1*SIZEOF_XMMWORD  ; outptr
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         short .columnloop
+    test        ecx, ecx
+    jnz         short .columnloop_r8
+
+    pop         esi
+    pop         edi
+    pop         ecx
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         eax                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+%define img_width(b)    (b) + 8         ; JDIMENSION image_width
+%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
+%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
+%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
+%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)
+
+EXTN(jsimd_h2v2_downsample_sse2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         ecx, JDIMENSION [width_blks(ebp)]
+    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
+    jz          near .return
+
+    mov         edx, JDIMENSION [img_width(ebp)]
+
+    ; -- expand_right_edge
+
+    push        ecx
+    shl         ecx, 1                  ; output_cols * 2
+    sub         ecx, edx
+    jle         short .expand_end
+
+    mov         eax, INT [max_v_samp(ebp)]
+    test        eax, eax
+    jle         short .expand_end
+
+    cld
+    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
+    alignx      16, 7
+.expandloop:
+    push        eax
+    push        ecx
+
+    mov         edi, JSAMPROW [esi]
+    add         edi, edx
+    mov         al, JSAMPLE [edi-1]
+
+    rep stosb
+
+    pop         ecx
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    dec         eax
+    jg          short .expandloop
+
+.expand_end:
+    pop         ecx                     ; output_cols
+
+    ; -- h2v2_downsample
+
+    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         edx, 0x00020001         ; bias pattern
+    movd        xmm7, edx
+    pcmpeqw     xmm6, xmm6
+    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
+    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
+    alignx      16, 7
+.rowloop:
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
+    mov         edi, JSAMPROW [edi]                    ; outptr
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         short .columnloop
+    alignx      16, 7
+
+.columnloop_r8:
+    movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    pxor        xmm2, xmm2
+    pxor        xmm3, xmm3
+    mov         ecx, SIZEOF_XMMWORD
+    jmp         short .downsample
+    alignx      16, 7
+
+.columnloop:
+    movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqa      xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
+    movdqa      xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+    movdqa      xmm4, xmm0
+    movdqa      xmm5, xmm1
+    pand        xmm0, xmm6
+    psrlw       xmm4, BYTE_BIT
+    pand        xmm1, xmm6
+    psrlw       xmm5, BYTE_BIT
+    paddw       xmm0, xmm4
+    paddw       xmm1, xmm5
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm5, xmm3
+    pand        xmm2, xmm6
+    psrlw       xmm4, BYTE_BIT
+    pand        xmm3, xmm6
+    psrlw       xmm5, BYTE_BIT
+    paddw       xmm2, xmm4
+    paddw       xmm3, xmm5
+
+    paddw       xmm0, xmm1
+    paddw       xmm2, xmm3
+    paddw       xmm0, xmm7
+    paddw       xmm2, xmm7
+    psrlw       xmm0, 2
+    psrlw       xmm2, 2
+
+    packuswb    xmm0, xmm2
+
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+
+    sub         ecx, byte SIZEOF_XMMWORD    ; outcol
+    add         edx, byte 2*SIZEOF_XMMWORD  ; inptr0
+    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr1
+    add         edi, byte 1*SIZEOF_XMMWORD  ; outptr
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .columnloop_r8
+
+    pop         esi
+    pop         edi
+    pop         ecx
+
+    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
+    dec         eax                          ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-avx2.asm
new file mode 100644
index 0000000000..015be0416c
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-avx2.asm
@@ -0,0 +1,515 @@
+;
+; jdcolext.asm - colorspace conversion (AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_avx2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+;                            JDIMENSION input_row, JSAMPARRAY output_buf,
+;                            int num_rows)
+;
+
+%define out_width(b)   (b) + 8          ; JDIMENSION out_width
+%define input_buf(b)   (b) + 12         ; JSAMPIMAGE input_buf
+%define input_row(b)   (b) + 16         ; JDIMENSION input_row
+%define output_buf(b)  (b) + 20         ; JSAMPARRAY output_buf
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+                                        ; ymmword wk[WK_NUM]
+%define WK_NUM         2
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_avx2)
+
+EXTN(jsimd_ycc_rgb_convert_avx2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [out_width(eax)]  ; num_cols
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         edi, JSAMPIMAGE [input_buf(eax)]
+    mov         ecx, JDIMENSION [input_row(eax)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]
+    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         edi, JSAMPARRAY [output_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    push        eax
+    push        edi
+    push        edx
+    push        ebx
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr0
+    mov         ebx, JSAMPROW [ebx]     ; inptr1
+    mov         edx, JSAMPROW [edx]     ; inptr2
+    mov         edi, JSAMPROW [edi]     ; outptr
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+    alignx      16, 7
+.columnloop:
+
+    vmovdqu     ymm5, YMMWORD [ebx]     ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+    vmovdqu     ymm1, YMMWORD [edx]     ; ymm1=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpcmpeqw    ymm0, ymm0, ymm0
+    vpcmpeqw    ymm7, ymm7, ymm7
+    vpsrlw      ymm0, ymm0, BYTE_BIT    ; ymm0={0xFF 0x00 0xFF 0x00 ..}
+    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    vpand       ymm4, ymm0, ymm5        ; ymm4=Cb(02468ACEGIKMOQSU)=CbE
+    vpsrlw      ymm5, ymm5, BYTE_BIT    ; ymm5=Cb(13579BDFHJLNPRTV)=CbO
+    vpand       ymm0, ymm0, ymm1        ; ymm0=Cr(02468ACEGIKMOQSU)=CrE
+    vpsrlw      ymm1, ymm1, BYTE_BIT    ; ymm1=Cr(13579BDFHJLNPRTV)=CrO
+
+    vpaddw      ymm2, ymm4, ymm7
+    vpaddw      ymm3, ymm5, ymm7
+    vpaddw      ymm6, ymm0, ymm7
+    vpaddw      ymm7, ymm1, ymm7
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    vpaddw      ymm4, ymm2, ymm2                     ; ymm4=2*CbE
+    vpaddw      ymm5, ymm3, ymm3                     ; ymm5=2*CbO
+    vpaddw      ymm0, ymm6, ymm6                     ; ymm0=2*CrE
+    vpaddw      ymm1, ymm7, ymm7                     ; ymm1=2*CrO
+
+    vpmulhw     ymm4, ymm4, [GOTOFF(eax,PW_MF0228)]  ; ymm4=(2*CbE * -FIX(0.22800))
+    vpmulhw     ymm5, ymm5, [GOTOFF(eax,PW_MF0228)]  ; ymm5=(2*CbO * -FIX(0.22800))
+    vpmulhw     ymm0, ymm0, [GOTOFF(eax,PW_F0402)]   ; ymm0=(2*CrE * FIX(0.40200))
+    vpmulhw     ymm1, ymm1, [GOTOFF(eax,PW_F0402)]   ; ymm1=(2*CrO * FIX(0.40200))
+
+    vpaddw      ymm4, ymm4, [GOTOFF(eax,PW_ONE)]
+    vpaddw      ymm5, ymm5, [GOTOFF(eax,PW_ONE)]
+    vpsraw      ymm4, ymm4, 1                        ; ymm4=(CbE * -FIX(0.22800))
+    vpsraw      ymm5, ymm5, 1                        ; ymm5=(CbO * -FIX(0.22800))
+    vpaddw      ymm0, ymm0, [GOTOFF(eax,PW_ONE)]
+    vpaddw      ymm1, ymm1, [GOTOFF(eax,PW_ONE)]
+    vpsraw      ymm0, ymm0, 1                        ; ymm0=(CrE * FIX(0.40200))
+    vpsraw      ymm1, ymm1, 1                        ; ymm1=(CrO * FIX(0.40200))
+
+    vpaddw      ymm4, ymm4, ymm2
+    vpaddw      ymm5, ymm5, ymm3
+    vpaddw      ymm4, ymm4, ymm2                     ; ymm4=(CbE * FIX(1.77200))=(B-Y)E
+    vpaddw      ymm5, ymm5, ymm3                     ; ymm5=(CbO * FIX(1.77200))=(B-Y)O
+    vpaddw      ymm0, ymm0, ymm6                     ; ymm0=(CrE * FIX(1.40200))=(R-Y)E
+    vpaddw      ymm1, ymm1, ymm7                     ; ymm1=(CrO * FIX(1.40200))=(R-Y)O
+
+    vmovdqa     YMMWORD [wk(0)], ymm4                ; wk(0)=(B-Y)E
+    vmovdqa     YMMWORD [wk(1)], ymm5                ; wk(1)=(B-Y)O
+
+    vpunpckhwd  ymm4, ymm2, ymm6
+    vpunpcklwd  ymm2, ymm2, ymm6
+    vpmaddwd    ymm2, ymm2, [GOTOFF(eax,PW_MF0344_F0285)]
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_MF0344_F0285)]
+    vpunpckhwd  ymm5, ymm3, ymm7
+    vpunpcklwd  ymm3, ymm3, ymm7
+    vpmaddwd    ymm3, ymm3, [GOTOFF(eax,PW_MF0344_F0285)]
+    vpmaddwd    ymm5, ymm5, [GOTOFF(eax,PW_MF0344_F0285)]
+
+    vpaddd      ymm2, ymm2, [GOTOFF(eax,PD_ONEHALF)]
+    vpaddd      ymm4, ymm4, [GOTOFF(eax,PD_ONEHALF)]
+    vpsrad      ymm2, ymm2, SCALEBITS
+    vpsrad      ymm4, ymm4, SCALEBITS
+    vpaddd      ymm3, ymm3, [GOTOFF(eax,PD_ONEHALF)]
+    vpaddd      ymm5, ymm5, [GOTOFF(eax,PD_ONEHALF)]
+    vpsrad      ymm3, ymm3, SCALEBITS
+    vpsrad      ymm5, ymm5, SCALEBITS
+
+    vpackssdw   ymm2, ymm2, ymm4             ; ymm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+    vpackssdw   ymm3, ymm3, ymm5             ; ymm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+    vpsubw      ymm2, ymm2, ymm6             ; ymm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+    vpsubw      ymm3, ymm3, ymm7             ; ymm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+    vmovdqu     ymm5, YMMWORD [esi]          ; ymm5=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpcmpeqw    ymm4, ymm4, ymm4
+    vpsrlw      ymm4, ymm4, BYTE_BIT         ; ymm4={0xFF 0x00 0xFF 0x00 ..}
+    vpand       ymm4, ymm4, ymm5             ; ymm4=Y(02468ACEGIKMOQSU)=YE
+    vpsrlw      ymm5, ymm5, BYTE_BIT         ; ymm5=Y(13579BDFHJLNPRTV)=YO
+
+    vpaddw      ymm0, ymm0, ymm4             ; ymm0=((R-Y)E+YE)=RE=R(02468ACEGIKMOQSU)
+    vpaddw      ymm1, ymm1, ymm5             ; ymm1=((R-Y)O+YO)=RO=R(13579BDFHJLNPRTV)
+    vpackuswb   ymm0, ymm0, ymm0             ; ymm0=R(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm1, ymm1, ymm1             ; ymm1=R(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm2, ymm2, ymm4             ; ymm2=((G-Y)E+YE)=GE=G(02468ACEGIKMOQSU)
+    vpaddw      ymm3, ymm3, ymm5             ; ymm3=((G-Y)O+YO)=GO=G(13579BDFHJLNPRTV)
+    vpackuswb   ymm2, ymm2, ymm2             ; ymm2=G(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm3, ymm3, ymm3             ; ymm3=G(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm4, ymm4, YMMWORD [wk(0)]  ; ymm4=(YE+(B-Y)E)=BE=B(02468ACEGIKMOQSU)
+    vpaddw      ymm5, ymm5, YMMWORD [wk(1)]  ; ymm5=(YO+(B-Y)O)=BO=B(13579BDFHJLNPRTV)
+    vpackuswb   ymm4, ymm4, ymm4             ; ymm4=B(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm5, ymm5, ymm5             ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+    ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmB        ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+                                        ;       2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+    vpunpcklbw  ymmD, ymmD, ymmF        ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+                                        ;       1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+    vpsrldq     ymmH, ymmA, 2           ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+                                        ;       0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+    vpunpckhwd  ymmG, ymmA, ymmE        ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+                                        ;       0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+                                        ;       0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+    vpsrldq     ymmE, ymmE, 2           ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+                                        ;       2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+    vpsrldq     ymmB, ymmD, 2           ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+                                        ;       1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+    vpunpckhwd  ymmC, ymmD, ymmH        ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+                                        ;       1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+    vpunpcklwd  ymmD, ymmD, ymmH        ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+                                        ;       1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+    vpunpckhwd  ymmF, ymmE, ymmB        ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+                                        ;       2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+    vpunpcklwd  ymmE, ymmE, ymmB        ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+                                        ;       2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+    vpshufd     ymmH, ymmA, 0x4E        ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+                                        ;       0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+    vpunpckldq  ymmA, ymmA, ymmD        ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+    vpunpckhdq  ymmD, ymmD, ymmE        ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+    vpunpckldq  ymmE, ymmE, ymmH        ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+                                        ;       2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+    vpshufd     ymmH, ymmG, 0x4E        ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+                                        ;       0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+    vpunpckldq  ymmG, ymmG, ymmC        ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+                                        ;       0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+    vpunpckhdq  ymmC, ymmC, ymmF        ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+                                        ;       1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+    vpunpckldq  ymmF, ymmF, ymmH        ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+    vpunpcklqdq ymmH, ymmA, ymmE        ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vpunpcklqdq ymmG, ymmD, ymmG        ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+    vpunpcklqdq ymmC, ymmF, ymmC        ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vperm2i128  ymmA, ymmH, ymmG, 0x20  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vperm2i128  ymmD, ymmC, ymmH, 0x30  ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vperm2i128  ymmF, ymmG, ymmC, 0x31  ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        edi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_YMMWORD
+    jz          near .nextrow
+
+    add         esi, byte SIZEOF_YMMWORD  ; inptr0
+    add         ebx, byte SIZEOF_YMMWORD  ; inptr1
+    add         edx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st64:
+    lea         ecx, [ecx+ecx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         ecx, byte 2*SIZEOF_YMMWORD
+    jb          short .column_st32
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmF
+    sub         ecx, byte 2*SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st32:
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jb          short .column_st31
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    add         edi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         ecx, byte SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st31:
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    sub         ecx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    vmovq       XMM_MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_MMWORD
+    sub         ecx, byte SIZEOF_MMWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    vmovd       XMM_DWORD [edi], xmmA
+    add         edi, byte SIZEOF_DWORD
+    sub         ecx, byte SIZEOF_DWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of eax to the output when it has enough
+    ; space.
+    vmovd       eax, xmmA
+    cmp         ecx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [edi], ax
+    add         edi, byte SIZEOF_WORD
+    sub         ecx, byte SIZEOF_WORD
+    shr         eax, 16
+.column_st1:
+    ; Store the lower 1 byte of eax to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .nextrow
+    mov         byte [edi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    vpcmpeqb    ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpcmpeqb    ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+    vpxor       ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpxor       ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+    ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmG        ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+                                        ;       2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+    vpunpcklbw  ymmB, ymmB, ymmD        ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+                                        ;       0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+    vpunpcklbw  ymmF, ymmF, ymmH        ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+                                        ;       2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+    vpunpckhwd  ymmC, ymmA, ymmE        ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+                                        ;       0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+                                        ;       0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+    vpunpckhwd  ymmG, ymmB, ymmF        ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+                                        ;       0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+    vpunpcklwd  ymmB, ymmB, ymmF        ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+                                        ;       0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+    vpunpckhdq  ymmE, ymmA, ymmB        ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vpunpckldq  ymmB, ymmA, ymmB        ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vpunpckhdq  ymmF, ymmC, ymmG        ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+    vpunpckldq  ymmG, ymmC, ymmG        ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+    vperm2i128  ymmA, ymmB, ymmE, 0x20  ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    vperm2i128  ymmD, ymmG, ymmF, 0x20  ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    vperm2i128  ymmC, ymmB, ymmE, 0x31  ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vperm2i128  ymmH, ymmG, ymmF, 0x31  ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        edi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+    vmovntdq    YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+    vmovdqu     YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+    add         edi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_YMMWORD
+    jz          near .nextrow
+
+    add         esi, byte SIZEOF_YMMWORD  ; inptr0
+    add         ebx, byte SIZEOF_YMMWORD  ; inptr1
+    add         edx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st64:
+    cmp         ecx, byte SIZEOF_YMMWORD/2
+    jb          short .column_st32
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmC
+    vmovdqa     ymmD, ymmH
+    sub         ecx, byte SIZEOF_YMMWORD/2
+.column_st32:
+    cmp         ecx, byte SIZEOF_YMMWORD/4
+    jb          short .column_st16
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    add         edi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         ecx, byte SIZEOF_YMMWORD/4
+.column_st16:
+    cmp         ecx, byte SIZEOF_YMMWORD/8
+    jb          short .column_st15
+    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    sub         ecx, byte SIZEOF_YMMWORD/8
+.column_st15:
+    ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_YMMWORD/16
+    jb          short .column_st7
+    vmovq       MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_YMMWORD/16*4
+    sub         ecx, byte SIZEOF_YMMWORD/16
+    vpsrldq     xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+    ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .nextrow
+    vmovd       XMM_DWORD [edi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    alignx      16, 7
+
+.nextrow:
+    pop         ecx
+    pop         esi
+    pop         ebx
+    pop         edx
+    pop         edi
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    add         ebx, byte SIZEOF_JSAMPROW
+    add         edx, byte SIZEOF_JSAMPROW
+    add         edi, byte SIZEOF_JSAMPROW  ; output_buf
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+    sfence                              ; flush the write buffer
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-mmx.asm
new file mode 100644
index 0000000000..5813cfcb66
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-mmx.asm
@@ -0,0 +1,404 @@
+;
+; jdcolext.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_mmx(JDIMENSION out_width, JSAMPIMAGE input_buf,
+;                           JDIMENSION input_row, JSAMPARRAY output_buf,
+;                           int num_rows)
+;
+
+%define out_width(b)   (b) + 8          ; JDIMENSION out_width
+%define input_buf(b)   (b) + 12         ; JSAMPIMAGE input_buf
+%define input_row(b)   (b) + 16         ; JDIMENSION input_row
+%define output_buf(b)  (b) + 20         ; JSAMPARRAY output_buf
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         2
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_mmx)
+
+EXTN(jsimd_ycc_rgb_convert_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [out_width(eax)]  ; num_cols
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         edi, JSAMPIMAGE [input_buf(eax)]
+    mov         ecx, JDIMENSION [input_row(eax)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]
+    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         edi, JSAMPARRAY [output_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    push        eax
+    push        edi
+    push        edx
+    push        ebx
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr0
+    mov         ebx, JSAMPROW [ebx]     ; inptr1
+    mov         edx, JSAMPROW [edx]     ; inptr2
+    mov         edi, JSAMPROW [edi]     ; outptr
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+    alignx      16, 7
+.columnloop:
+
+    movq        mm5, MMWORD [ebx]       ; mm5=Cb(01234567)
+    movq        mm1, MMWORD [edx]       ; mm1=Cr(01234567)
+
+    pcmpeqw     mm4, mm4
+    pcmpeqw     mm7, mm7
+    psrlw       mm4, BYTE_BIT
+    psllw       mm7, 7                  ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+    movq        mm0, mm4                ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
+
+    pand        mm4, mm5                ; mm4=Cb(0246)=CbE
+    psrlw       mm5, BYTE_BIT           ; mm5=Cb(1357)=CbO
+    pand        mm0, mm1                ; mm0=Cr(0246)=CrE
+    psrlw       mm1, BYTE_BIT           ; mm1=Cr(1357)=CrO
+
+    paddw       mm4, mm7
+    paddw       mm5, mm7
+    paddw       mm0, mm7
+    paddw       mm1, mm7
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    movq        mm2, mm4                ; mm2=CbE
+    movq        mm3, mm5                ; mm3=CbO
+    paddw       mm4, mm4                ; mm4=2*CbE
+    paddw       mm5, mm5                ; mm5=2*CbO
+    movq        mm6, mm0                ; mm6=CrE
+    movq        mm7, mm1                ; mm7=CrO
+    paddw       mm0, mm0                ; mm0=2*CrE
+    paddw       mm1, mm1                ; mm1=2*CrO
+
+    pmulhw      mm4, [GOTOFF(eax,PW_MF0228)]  ; mm4=(2*CbE * -FIX(0.22800))
+    pmulhw      mm5, [GOTOFF(eax,PW_MF0228)]  ; mm5=(2*CbO * -FIX(0.22800))
+    pmulhw      mm0, [GOTOFF(eax,PW_F0402)]   ; mm0=(2*CrE * FIX(0.40200))
+    pmulhw      mm1, [GOTOFF(eax,PW_F0402)]   ; mm1=(2*CrO * FIX(0.40200))
+
+    paddw       mm4, [GOTOFF(eax,PW_ONE)]
+    paddw       mm5, [GOTOFF(eax,PW_ONE)]
+    psraw       mm4, 1                  ; mm4=(CbE * -FIX(0.22800))
+    psraw       mm5, 1                  ; mm5=(CbO * -FIX(0.22800))
+    paddw       mm0, [GOTOFF(eax,PW_ONE)]
+    paddw       mm1, [GOTOFF(eax,PW_ONE)]
+    psraw       mm0, 1                  ; mm0=(CrE * FIX(0.40200))
+    psraw       mm1, 1                  ; mm1=(CrO * FIX(0.40200))
+
+    paddw       mm4, mm2
+    paddw       mm5, mm3
+    paddw       mm4, mm2                ; mm4=(CbE * FIX(1.77200))=(B-Y)E
+    paddw       mm5, mm3                ; mm5=(CbO * FIX(1.77200))=(B-Y)O
+    paddw       mm0, mm6                ; mm0=(CrE * FIX(1.40200))=(R-Y)E
+    paddw       mm1, mm7                ; mm1=(CrO * FIX(1.40200))=(R-Y)O
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=(B-Y)E
+    movq        MMWORD [wk(1)], mm5     ; wk(1)=(B-Y)O
+
+    movq        mm4, mm2
+    movq        mm5, mm3
+    punpcklwd   mm2, mm6
+    punpckhwd   mm4, mm6
+    pmaddwd     mm2, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     mm4, [GOTOFF(eax,PW_MF0344_F0285)]
+    punpcklwd   mm3, mm7
+    punpckhwd   mm5, mm7
+    pmaddwd     mm3, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     mm5, [GOTOFF(eax,PW_MF0344_F0285)]
+
+    paddd       mm2, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       mm4, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       mm2, SCALEBITS
+    psrad       mm4, SCALEBITS
+    paddd       mm3, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       mm5, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       mm3, SCALEBITS
+    psrad       mm5, SCALEBITS
+
+    packssdw    mm2, mm4                ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+    packssdw    mm3, mm5                ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+    psubw       mm2, mm6                ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+    psubw       mm3, mm7                ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+    movq        mm5, MMWORD [esi]       ; mm5=Y(01234567)
+
+    pcmpeqw     mm4, mm4
+    psrlw       mm4, BYTE_BIT           ; mm4={0xFF 0x00 0xFF 0x00 ..}
+    pand        mm4, mm5                ; mm4=Y(0246)=YE
+    psrlw       mm5, BYTE_BIT           ; mm5=Y(1357)=YO
+
+    paddw       mm0, mm4                ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
+    paddw       mm1, mm5                ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
+    packuswb    mm0, mm0                ; mm0=(R0 R2 R4 R6 ** ** ** **)
+    packuswb    mm1, mm1                ; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+    paddw       mm2, mm4                ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
+    paddw       mm3, mm5                ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
+    packuswb    mm2, mm2                ; mm2=(G0 G2 G4 G6 ** ** ** **)
+    packuswb    mm3, mm3                ; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+    paddw       mm4,  MMWORD [wk(0)]    ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
+    paddw       mm5,  MMWORD [wk(1)]    ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
+    packuswb    mm4, mm4                ; mm4=(B0 B2 B4 B6 ** ** ** **)
+    packuswb    mm5, mm5                ; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+    ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
+    punpcklbw   mmE, mmB                ; mmE=(20 01 22 03 24 05 26 07)
+    punpcklbw   mmD, mmF                ; mmD=(11 21 13 23 15 25 17 27)
+
+    movq        mmG, mmA
+    movq        mmH, mmA
+    punpcklwd   mmA, mmE                ; mmA=(00 10 20 01 02 12 22 03)
+    punpckhwd   mmG, mmE                ; mmG=(04 14 24 05 06 16 26 07)
+
+    psrlq       mmH, 2*BYTE_BIT         ; mmH=(02 12 04 14 06 16 -- --)
+    psrlq       mmE, 2*BYTE_BIT         ; mmE=(22 03 24 05 26 07 -- --)
+
+    movq        mmC, mmD
+    movq        mmB, mmD
+    punpcklwd   mmD, mmH                ; mmD=(11 21 02 12 13 23 04 14)
+    punpckhwd   mmC, mmH                ; mmC=(15 25 06 16 17 27 -- --)
+
+    psrlq       mmB, 2*BYTE_BIT         ; mmB=(13 23 15 25 17 27 -- --)
+
+    movq        mmF, mmE
+    punpcklwd   mmE, mmB                ; mmE=(22 03 13 23 24 05 15 25)
+    punpckhwd   mmF, mmB                ; mmF=(26 07 17 27 -- -- -- --)
+
+    punpckldq   mmA, mmD                ; mmA=(00 10 20 01 11 21 02 12)
+    punpckldq   mmE, mmG                ; mmE=(22 03 13 23 04 14 24 05)
+    punpckldq   mmC, mmF                ; mmC=(15 25 06 16 26 07 17 27)
+
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st16
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+    sub         ecx, byte SIZEOF_MMWORD
+    jz          short .nextrow
+
+    add         esi, byte SIZEOF_MMWORD                ; inptr0
+    add         ebx, byte SIZEOF_MMWORD                ; inptr1
+    add         edx, byte SIZEOF_MMWORD                ; inptr2
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st16:
+    lea         ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
+    cmp         ecx, byte 2*SIZEOF_MMWORD
+    jb          short .column_st8
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
+    movq        mmA, mmC
+    sub         ecx, byte 2*SIZEOF_MMWORD
+    add         edi, byte 2*SIZEOF_MMWORD
+    jmp         short .column_st4
+.column_st8:
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st4
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        mmA, mmE
+    sub         ecx, byte SIZEOF_MMWORD
+    add         edi, byte SIZEOF_MMWORD
+.column_st4:
+    movd        eax, mmA
+    cmp         ecx, byte SIZEOF_DWORD
+    jb          short .column_st2
+    mov         dword [edi+0*SIZEOF_DWORD], eax
+    psrlq       mmA, DWORD_BIT
+    movd        eax, mmA
+    sub         ecx, byte SIZEOF_DWORD
+    add         edi, byte SIZEOF_DWORD
+.column_st2:
+    cmp         ecx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [edi+0*SIZEOF_WORD], ax
+    shr         eax, WORD_BIT
+    sub         ecx, byte SIZEOF_WORD
+    add         edi, byte SIZEOF_WORD
+.column_st1:
+    cmp         ecx, byte SIZEOF_BYTE
+    jb          short .nextrow
+    mov         byte [edi+0*SIZEOF_BYTE], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    pcmpeqb     mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
+    pcmpeqb     mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+    pxor        mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
+    pxor        mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+    ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
+    punpcklbw   mmE, mmG                ; mmE=(20 30 22 32 24 34 26 36)
+    punpcklbw   mmB, mmD                ; mmB=(01 11 03 13 05 15 07 17)
+    punpcklbw   mmF, mmH                ; mmF=(21 31 23 33 25 35 27 37)
+
+    movq        mmC, mmA
+    punpcklwd   mmA, mmE                ; mmA=(00 10 20 30 02 12 22 32)
+    punpckhwd   mmC, mmE                ; mmC=(04 14 24 34 06 16 26 36)
+    movq        mmG, mmB
+    punpcklwd   mmB, mmF                ; mmB=(01 11 21 31 03 13 23 33)
+    punpckhwd   mmG, mmF                ; mmG=(05 15 25 35 07 17 27 37)
+
+    movq        mmD, mmA
+    punpckldq   mmA, mmB                ; mmA=(00 10 20 30 01 11 21 31)
+    punpckhdq   mmD, mmB                ; mmD=(02 12 22 32 03 13 23 33)
+    movq        mmH, mmC
+    punpckldq   mmC, mmG                ; mmC=(04 14 24 34 05 15 25 35)
+    punpckhdq   mmH, mmG                ; mmH=(06 16 26 36 07 17 27 37)
+
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st16
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
+    movq        MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+    sub         ecx, byte SIZEOF_MMWORD
+    jz          short .nextrow
+
+    add         esi, byte SIZEOF_MMWORD                ; inptr0
+    add         ebx, byte SIZEOF_MMWORD                ; inptr1
+    add         edx, byte SIZEOF_MMWORD                ; inptr2
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st16:
+    cmp         ecx, byte SIZEOF_MMWORD/2
+    jb          short .column_st8
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
+    movq        mmA, mmC
+    movq        mmD, mmH
+    sub         ecx, byte SIZEOF_MMWORD/2
+    add         edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+    cmp         ecx, byte SIZEOF_MMWORD/4
+    jb          short .column_st4
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        mmA, mmD
+    sub         ecx, byte SIZEOF_MMWORD/4
+    add         edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+    cmp         ecx, byte SIZEOF_MMWORD/8
+    jb          short .nextrow
+    movd        dword [edi+0*SIZEOF_DWORD], mmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    alignx      16, 7
+
+.nextrow:
+    pop         ecx
+    pop         esi
+    pop         ebx
+    pop         edx
+    pop         edi
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    add         ebx, byte SIZEOF_JSAMPROW
+    add         edx, byte SIZEOF_JSAMPROW
+    add         edi, byte SIZEOF_JSAMPROW  ; output_buf
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-sse2.asm
new file mode 100644
index 0000000000..d5572b3294
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-sse2.asm
@@ -0,0 +1,458 @@
+;
+; jdcolext.asm - colorspace conversion (SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+;                            JDIMENSION input_row, JSAMPARRAY output_buf,
+;                            int num_rows)
+;
+
+%define out_width(b)   (b) + 8          ; JDIMENSION out_width
+%define input_buf(b)   (b) + 12         ; JSAMPIMAGE input_buf
+%define input_row(b)   (b) + 16         ; JDIMENSION input_row
+%define output_buf(b)  (b) + 20         ; JSAMPARRAY output_buf
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2)
+
+EXTN(jsimd_ycc_rgb_convert_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [out_width(eax)]  ; num_cols
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         edi, JSAMPIMAGE [input_buf(eax)]
+    mov         ecx, JDIMENSION [input_row(eax)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]
+    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         edi, JSAMPARRAY [output_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    push        eax
+    push        edi
+    push        edx
+    push        ebx
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr0
+    mov         ebx, JSAMPROW [ebx]     ; inptr1
+    mov         edx, JSAMPROW [edx]     ; inptr2
+    mov         edi, JSAMPROW [edi]     ; outptr
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+    alignx      16, 7
+.columnloop:
+
+    movdqa      xmm5, XMMWORD [ebx]     ; xmm5=Cb(0123456789ABCDEF)
+    movdqa      xmm1, XMMWORD [edx]     ; xmm1=Cr(0123456789ABCDEF)
+
+    pcmpeqw     xmm4, xmm4
+    pcmpeqw     xmm7, xmm7
+    psrlw       xmm4, BYTE_BIT
+    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+    movdqa      xmm0, xmm4              ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+    pand        xmm4, xmm5              ; xmm4=Cb(02468ACE)=CbE
+    psrlw       xmm5, BYTE_BIT          ; xmm5=Cb(13579BDF)=CbO
+    pand        xmm0, xmm1              ; xmm0=Cr(02468ACE)=CrE
+    psrlw       xmm1, BYTE_BIT          ; xmm1=Cr(13579BDF)=CrO
+
+    paddw       xmm4, xmm7
+    paddw       xmm5, xmm7
+    paddw       xmm0, xmm7
+    paddw       xmm1, xmm7
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    movdqa      xmm2, xmm4              ; xmm2=CbE
+    movdqa      xmm3, xmm5              ; xmm3=CbO
+    paddw       xmm4, xmm4              ; xmm4=2*CbE
+    paddw       xmm5, xmm5              ; xmm5=2*CbO
+    movdqa      xmm6, xmm0              ; xmm6=CrE
+    movdqa      xmm7, xmm1              ; xmm7=CrO
+    paddw       xmm0, xmm0              ; xmm0=2*CrE
+    paddw       xmm1, xmm1              ; xmm1=2*CrO
+
+    pmulhw      xmm4, [GOTOFF(eax,PW_MF0228)]  ; xmm4=(2*CbE * -FIX(0.22800))
+    pmulhw      xmm5, [GOTOFF(eax,PW_MF0228)]  ; xmm5=(2*CbO * -FIX(0.22800))
+    pmulhw      xmm0, [GOTOFF(eax,PW_F0402)]   ; xmm0=(2*CrE * FIX(0.40200))
+    pmulhw      xmm1, [GOTOFF(eax,PW_F0402)]   ; xmm1=(2*CrO * FIX(0.40200))
+
+    paddw       xmm4, [GOTOFF(eax,PW_ONE)]
+    paddw       xmm5, [GOTOFF(eax,PW_ONE)]
+    psraw       xmm4, 1                 ; xmm4=(CbE * -FIX(0.22800))
+    psraw       xmm5, 1                 ; xmm5=(CbO * -FIX(0.22800))
+    paddw       xmm0, [GOTOFF(eax,PW_ONE)]
+    paddw       xmm1, [GOTOFF(eax,PW_ONE)]
+    psraw       xmm0, 1                 ; xmm0=(CrE * FIX(0.40200))
+    psraw       xmm1, 1                 ; xmm1=(CrO * FIX(0.40200))
+
+    paddw       xmm4, xmm2
+    paddw       xmm5, xmm3
+    paddw       xmm4, xmm2              ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+    paddw       xmm5, xmm3              ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+    paddw       xmm0, xmm6              ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+    paddw       xmm1, xmm7              ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=(B-Y)E
+    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(B-Y)O
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm5, xmm3
+    punpcklwd   xmm2, xmm6
+    punpckhwd   xmm4, xmm6
+    pmaddwd     xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     xmm4, [GOTOFF(eax,PW_MF0344_F0285)]
+    punpcklwd   xmm3, xmm7
+    punpckhwd   xmm5, xmm7
+    pmaddwd     xmm3, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
+
+    paddd       xmm2, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       xmm4, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       xmm2, SCALEBITS
+    psrad       xmm4, SCALEBITS
+    paddd       xmm3, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       xmm5, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       xmm3, SCALEBITS
+    psrad       xmm5, SCALEBITS
+
+    packssdw    xmm2, xmm4              ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+    packssdw    xmm3, xmm5              ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+    psubw       xmm2, xmm6              ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+    psubw       xmm3, xmm7              ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+    movdqa      xmm5, XMMWORD [esi]     ; xmm5=Y(0123456789ABCDEF)
+
+    pcmpeqw     xmm4, xmm4
+    psrlw       xmm4, BYTE_BIT          ; xmm4={0xFF 0x00 0xFF 0x00 ..}
+    pand        xmm4, xmm5              ; xmm4=Y(02468ACE)=YE
+    psrlw       xmm5, BYTE_BIT          ; xmm5=Y(13579BDF)=YO
+
+    paddw       xmm0, xmm4              ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+    paddw       xmm1, xmm5              ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+    packuswb    xmm0, xmm0              ; xmm0=R(02468ACE********)
+    packuswb    xmm1, xmm1              ; xmm1=R(13579BDF********)
+
+    paddw       xmm2, xmm4              ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+    paddw       xmm3, xmm5              ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+    packuswb    xmm2, xmm2              ; xmm2=G(02468ACE********)
+    packuswb    xmm3, xmm3              ; xmm3=G(13579BDF********)
+
+    paddw       xmm4, XMMWORD [wk(0)]   ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+    paddw       xmm5, XMMWORD [wk(1)]   ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+    packuswb    xmm4, xmm4              ; xmm4=B(02468ACE********)
+    packuswb    xmm5, xmm5              ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+    punpcklbw   xmmA, xmmC        ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmB        ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+    punpcklbw   xmmD, xmmF        ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+    movdqa      xmmG, xmmA
+    movdqa      xmmH, xmmA
+    punpcklwd   xmmA, xmmE        ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+    punpckhwd   xmmG, xmmE        ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+    psrldq      xmmH, 2           ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+    psrldq      xmmE, 2           ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+    movdqa      xmmC, xmmD
+    movdqa      xmmB, xmmD
+    punpcklwd   xmmD, xmmH        ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+    punpckhwd   xmmC, xmmH        ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+    psrldq      xmmB, 2           ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+    movdqa      xmmF, xmmE
+    punpcklwd   xmmE, xmmB        ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+    punpckhwd   xmmF, xmmB        ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+    pshufd      xmmH, xmmA, 0x4E  ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+    movdqa      xmmB, xmmE
+    punpckldq   xmmA, xmmD        ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+    punpckldq   xmmE, xmmH        ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+    punpckhdq   xmmD, xmmB        ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+    pshufd      xmmH, xmmG, 0x4E  ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+    movdqa      xmmB, xmmF
+    punpckldq   xmmG, xmmC        ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+    punpckldq   xmmF, xmmH        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+    punpckhdq   xmmC, xmmB        ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+    punpcklqdq  xmmA, xmmE        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    punpcklqdq  xmmD, xmmG        ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    punpcklqdq  xmmF, xmmC        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        edi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_XMMWORD
+    jz          near .nextrow
+
+    add         esi, byte SIZEOF_XMMWORD  ; inptr0
+    add         ebx, byte SIZEOF_XMMWORD  ; inptr1
+    add         edx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st32:
+    lea         ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
+    cmp         ecx, byte 2*SIZEOF_XMMWORD
+    jb          short .column_st16
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmF
+    sub         ecx, byte 2*SIZEOF_XMMWORD
+    jmp         short .column_st15
+.column_st16:
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         ecx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    movq        XMM_MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_MMWORD
+    sub         ecx, byte SIZEOF_MMWORD
+    psrldq      xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    movd        XMM_DWORD [edi], xmmA
+    add         edi, byte SIZEOF_DWORD
+    sub         ecx, byte SIZEOF_DWORD
+    psrldq      xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of eax to the output when it has enough
+    ; space.
+    movd        eax, xmmA
+    cmp         ecx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [edi], ax
+    add         edi, byte SIZEOF_WORD
+    sub         ecx, byte SIZEOF_WORD
+    shr         eax, 16
+.column_st1:
+    ; Store the lower 1 byte of eax to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .nextrow
+    mov         byte [edi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%else
+    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%endif
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+    punpcklbw   xmmA, xmmC  ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+    punpcklbw   xmmB, xmmD  ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+    punpcklbw   xmmF, xmmH  ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+    movdqa      xmmC, xmmA
+    punpcklwd   xmmA, xmmE  ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+    punpckhwd   xmmC, xmmE  ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+    movdqa      xmmG, xmmB
+    punpcklwd   xmmB, xmmF  ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+    punpckhwd   xmmG, xmmF  ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpckldq   xmmA, xmmB  ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    punpckhdq   xmmD, xmmB  ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    movdqa      xmmH, xmmC
+    punpckldq   xmmC, xmmG  ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    punpckhdq   xmmH, xmmG  ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        edi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+    movntdq     XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+    movdqu      XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_XMMWORD
+    jz          near .nextrow
+
+    add         esi, byte SIZEOF_XMMWORD  ; inptr0
+    add         ebx, byte SIZEOF_XMMWORD  ; inptr1
+    add         edx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st32:
+    cmp         ecx, byte SIZEOF_XMMWORD/2
+    jb          short .column_st16
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmC
+    movdqa      xmmD, xmmH
+    sub         ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+    cmp         ecx, byte SIZEOF_XMMWORD/4
+    jb          short .column_st15
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+    ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_XMMWORD/8
+    jb          short .column_st7
+    movq        XMM_MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_XMMWORD/8*4
+    sub         ecx, byte SIZEOF_XMMWORD/8
+    psrldq      xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+    ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .nextrow
+    movd        XMM_DWORD [edi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    alignx      16, 7
+
+.nextrow:
+    pop         ecx
+    pop         esi
+    pop         ebx
+    pop         edx
+    pop         edi
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    add         ebx, byte SIZEOF_JSAMPROW
+    add         edx, byte SIZEOF_JSAMPROW
+    add         edi, byte SIZEOF_JSAMPROW  ; output_buf
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+    sfence                              ; flush the write buffer
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-avx2.asm
new file mode 100644
index 0000000000..e05b60d001
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-avx2.asm
@@ -0,0 +1,118 @@
+;
+; jdcolor.asm - colorspace conversion (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_ycc_rgb_convert_avx2)
+
+EXTN(jconst_ycc_rgb_convert_avx2):
+
+PW_F0402        times 16 dw  F_0_402
+PW_MF0228       times 16 dw -F_0_228
+PW_MF0344_F0285 times 8  dw -F_0_344, F_0_285
+PW_ONE          times 16 dw  1
+PD_ONEHALF      times 8  dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extrgb_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extrgbx_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extbgrx_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extxbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extxrgb_convert_avx2
+%include "jdcolext-avx2.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-mmx.asm
new file mode 100644
index 0000000000..fb7e7bcce4
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-mmx.asm
@@ -0,0 +1,117 @@
+;
+; jdcolor.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_ycc_rgb_convert_mmx)
+
+EXTN(jconst_ycc_rgb_convert_mmx):
+
+PW_F0402        times 4 dw  F_0_402
+PW_MF0228       times 4 dw -F_0_228
+PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
+PW_ONE          times 4 dw  1
+PD_ONEHALF      times 2 dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx  jsimd_ycc_extrgb_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx  jsimd_ycc_extrgbx_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx  jsimd_ycc_extbgr_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx  jsimd_ycc_extbgrx_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx  jsimd_ycc_extxbgr_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx  jsimd_ycc_extxrgb_convert_mmx
+%include "jdcolext-mmx.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-sse2.asm
new file mode 100644
index 0000000000..b736255317
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-sse2.asm
@@ -0,0 +1,117 @@
+;
+; jdcolor.asm - colorspace conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_ycc_rgb_convert_sse2)
+
+EXTN(jconst_ycc_rgb_convert_sse2):
+
+PW_F0402        times 8 dw  F_0_402
+PW_MF0228       times 8 dw -F_0_228
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
+PW_ONE          times 8 dw  1
+PD_ONEHALF      times 4 dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extrgb_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extrgbx_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extbgr_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extbgrx_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extxbgr_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extxrgb_convert_sse2
+%include "jdcolext-sse2.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-avx2.asm
new file mode 100644
index 0000000000..711e6792d0
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-avx2.asm
@@ -0,0 +1,136 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_merged_upsample_avx2)
+
+EXTN(jconst_merged_upsample_avx2):
+
+PW_F0402        times 16 dw  F_0_402
+PW_MF0228       times 16 dw -F_0_228
+PW_MF0344_F0285 times 8  dw -F_0_344, F_0_285
+PW_ONE          times 16 dw  1
+PD_ONEHALF      times 8  dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extrgbx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extrgbx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extbgrx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extbgrx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extxbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extxbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extxrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extxrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-mmx.asm
new file mode 100644
index 0000000000..6e8311d408
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-mmx.asm
@@ -0,0 +1,123 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_merged_upsample_mmx)
+
+EXTN(jconst_merged_upsample_mmx):
+
+PW_F0402        times 4 dw  F_0_402
+PW_MF0228       times 4 dw -F_0_228
+PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
+PW_ONE          times 4 dw  1
+PD_ONEHALF      times 2 dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx  jsimd_h2v1_extrgb_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx  jsimd_h2v2_extrgb_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx  jsimd_h2v1_extrgbx_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx  jsimd_h2v2_extrgbx_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx  jsimd_h2v1_extbgr_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx  jsimd_h2v2_extbgr_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx  jsimd_h2v1_extbgrx_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx  jsimd_h2v2_extbgrx_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx  jsimd_h2v1_extxbgr_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx  jsimd_h2v2_extxbgr_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx  jsimd_h2v1_extxrgb_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx  jsimd_h2v2_extxrgb_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-sse2.asm
new file mode 100644
index 0000000000..e32f90aa17
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-sse2.asm
@@ -0,0 +1,135 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_merged_upsample_sse2)
+
+EXTN(jconst_merged_upsample_sse2):
+
+PW_F0402        times 8 dw  F_0_402
+PW_MF0228       times 8 dw -F_0_228
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
+PW_ONE          times 8 dw  1
+PD_ONEHALF      times 4 dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extrgbx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extrgbx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extbgrx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extbgrx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extxbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extxbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extxrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extxrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-avx2.asm
new file mode 100644
index 0000000000..e35f7282bc
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-avx2.asm
@@ -0,0 +1,575 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_avx2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+                                        ; ymmword wk[WK_NUM]
+%define WK_NUM        3
+%define gotptr        wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_avx2)
+
+EXTN(jsimd_h2v1_merged_upsample_avx2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [output_width(eax)]  ; col
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         edi, JSAMPIMAGE [input_buf(eax)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(eax)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    mov         edi, JSAMPARRAY [output_buf(eax)]
+    mov         esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]  ; inptr0
+    mov         ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]  ; inptr1
+    mov         edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]  ; inptr2
+    mov         edi, JSAMPROW [edi]                      ; outptr
+
+    pop         ecx                     ; col
+
+    alignx      16, 7
+.columnloop:
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    vmovdqu     ymm6, YMMWORD [ebx]     ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+    vmovdqu     ymm7, YMMWORD [edx]     ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
+    vpcmpeqw    ymm3, ymm3, ymm3
+    vpsllw      ymm3, ymm3, 7           ; ymm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    vpermq      ymm6, ymm6, 0xd8        ; ymm6=Cb(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+    vpermq      ymm7, ymm7, 0xd8        ; ymm7=Cr(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+    vpunpcklbw  ymm4, ymm6, ymm1        ; ymm4=Cb(0123456789ABCDEF)=CbL
+    vpunpckhbw  ymm6, ymm6, ymm1        ; ymm6=Cb(GHIJKLMNOPQRSTUV)=CbH
+    vpunpcklbw  ymm0, ymm7, ymm1        ; ymm0=Cr(0123456789ABCDEF)=CrL
+    vpunpckhbw  ymm7, ymm7, ymm1        ; ymm7=Cr(GHIJKLMNOPQRSTUV)=CrH
+
+    vpaddw      ymm5, ymm6, ymm3
+    vpaddw      ymm2, ymm4, ymm3
+    vpaddw      ymm1, ymm7, ymm3
+    vpaddw      ymm3, ymm0, ymm3
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    vpaddw      ymm6, ymm5, ymm5             ; ymm6=2*CbH
+    vpaddw      ymm4, ymm2, ymm2             ; ymm4=2*CbL
+    vpaddw      ymm7, ymm1, ymm1             ; ymm7=2*CrH
+    vpaddw      ymm0, ymm3, ymm3             ; ymm0=2*CrL
+
+    vpmulhw     ymm6, ymm6, [GOTOFF(eax,PW_MF0228)]  ; ymm6=(2*CbH * -FIX(0.22800))
+    vpmulhw     ymm4, ymm4, [GOTOFF(eax,PW_MF0228)]  ; ymm4=(2*CbL * -FIX(0.22800))
+    vpmulhw     ymm7, ymm7, [GOTOFF(eax,PW_F0402)]   ; ymm7=(2*CrH * FIX(0.40200))
+    vpmulhw     ymm0, ymm0, [GOTOFF(eax,PW_F0402)]   ; ymm0=(2*CrL * FIX(0.40200))
+
+    vpaddw      ymm6, ymm6, [GOTOFF(eax,PW_ONE)]
+    vpaddw      ymm4, ymm4, [GOTOFF(eax,PW_ONE)]
+    vpsraw      ymm6, ymm6, 1                     ; ymm6=(CbH * -FIX(0.22800))
+    vpsraw      ymm4, ymm4, 1                     ; ymm4=(CbL * -FIX(0.22800))
+    vpaddw      ymm7, ymm7, [GOTOFF(eax,PW_ONE)]
+    vpaddw      ymm0, ymm0, [GOTOFF(eax,PW_ONE)]
+    vpsraw      ymm7, ymm7, 1                     ; ymm7=(CrH * FIX(0.40200))
+    vpsraw      ymm0, ymm0, 1                     ; ymm0=(CrL * FIX(0.40200))
+
+    vpaddw      ymm6, ymm6, ymm5
+    vpaddw      ymm4, ymm4, ymm2
+    vpaddw      ymm6, ymm6, ymm5                  ; ymm6=(CbH * FIX(1.77200))=(B-Y)H
+    vpaddw      ymm4, ymm4, ymm2                  ; ymm4=(CbL * FIX(1.77200))=(B-Y)L
+    vpaddw      ymm7, ymm7, ymm1                  ; ymm7=(CrH * FIX(1.40200))=(R-Y)H
+    vpaddw      ymm0, ymm0, ymm3                  ; ymm0=(CrL * FIX(1.40200))=(R-Y)L
+
+    vmovdqa     YMMWORD [wk(0)], ymm6             ; wk(0)=(B-Y)H
+    vmovdqa     YMMWORD [wk(1)], ymm7             ; wk(1)=(R-Y)H
+
+    vpunpckhwd  ymm6, ymm5, ymm1
+    vpunpcklwd  ymm5, ymm5, ymm1
+    vpmaddwd    ymm5, ymm5, [GOTOFF(eax,PW_MF0344_F0285)]
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_MF0344_F0285)]
+    vpunpckhwd  ymm7, ymm2, ymm3
+    vpunpcklwd  ymm2, ymm2, ymm3
+    vpmaddwd    ymm2, ymm2, [GOTOFF(eax,PW_MF0344_F0285)]
+    vpmaddwd    ymm7, ymm7, [GOTOFF(eax,PW_MF0344_F0285)]
+
+    vpaddd      ymm5, ymm5, [GOTOFF(eax,PD_ONEHALF)]
+    vpaddd      ymm6, ymm6, [GOTOFF(eax,PD_ONEHALF)]
+    vpsrad      ymm5, ymm5, SCALEBITS
+    vpsrad      ymm6, ymm6, SCALEBITS
+    vpaddd      ymm2, ymm2, [GOTOFF(eax,PD_ONEHALF)]
+    vpaddd      ymm7, ymm7, [GOTOFF(eax,PD_ONEHALF)]
+    vpsrad      ymm2, ymm2, SCALEBITS
+    vpsrad      ymm7, ymm7, SCALEBITS
+
+    vpackssdw   ymm5, ymm5, ymm6        ; ymm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+    vpackssdw   ymm2, ymm2, ymm7        ; ymm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+    vpsubw      ymm5, ymm5, ymm1        ; ymm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+    vpsubw      ymm2, ymm2, ymm3        ; ymm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+    vmovdqa     YMMWORD [wk(2)], ymm5   ; wk(2)=(G-Y)H
+
+    mov         al, 2                   ; Yctr
+    jmp         short .Yloop_1st
+    alignx      16, 7
+
+.Yloop_2nd:
+    vmovdqa     ymm0, YMMWORD [wk(1)]   ; ymm0=(R-Y)H
+    vmovdqa     ymm2, YMMWORD [wk(2)]   ; ymm2=(G-Y)H
+    vmovdqa     ymm4, YMMWORD [wk(0)]   ; ymm4=(B-Y)H
+    alignx      16, 7
+
+.Yloop_1st:
+    vmovdqu     ymm7, YMMWORD [esi]     ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpcmpeqw    ymm6, ymm6, ymm6
+    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+    vpand       ymm6, ymm6, ymm7        ; ymm6=Y(02468ACEGIKMOQSU)=YE
+    vpsrlw      ymm7, ymm7, BYTE_BIT    ; ymm7=Y(13579BDFHJLNPRTV)=YO
+
+    vmovdqa     ymm1, ymm0              ; ymm1=ymm0=(R-Y)(L/H)
+    vmovdqa     ymm3, ymm2              ; ymm3=ymm2=(G-Y)(L/H)
+    vmovdqa     ymm5, ymm4              ; ymm5=ymm4=(B-Y)(L/H)
+
+    vpaddw      ymm0, ymm0, ymm6        ; ymm0=((R-Y)+YE)=RE=R(02468ACEGIKMOQSU)
+    vpaddw      ymm1, ymm1, ymm7        ; ymm1=((R-Y)+YO)=RO=R(13579BDFHJLNPRTV)
+    vpackuswb   ymm0, ymm0, ymm0        ; ymm0=R(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm1, ymm1, ymm1        ; ymm1=R(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm2, ymm2, ymm6        ; ymm2=((G-Y)+YE)=GE=G(02468ACEGIKMOQSU)
+    vpaddw      ymm3, ymm3, ymm7        ; ymm3=((G-Y)+YO)=GO=G(13579BDFHJLNPRTV)
+    vpackuswb   ymm2, ymm2, ymm2        ; ymm2=G(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm3, ymm3, ymm3        ; ymm3=G(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm4, ymm4, ymm6        ; ymm4=((B-Y)+YE)=BE=B(02468ACEGIKMOQSU)
+    vpaddw      ymm5, ymm5, ymm7        ; ymm5=((B-Y)+YO)=BO=B(13579BDFHJLNPRTV)
+    vpackuswb   ymm4, ymm4, ymm4        ; ymm4=B(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm5, ymm5, ymm5        ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+    ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmB        ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+                                        ;       2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+    vpunpcklbw  ymmD, ymmD, ymmF        ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+                                        ;       1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+    vpsrldq     ymmH, ymmA, 2           ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+                                        ;       0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+    vpunpckhwd  ymmG, ymmA, ymmE        ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+                                        ;       0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+                                        ;       0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+    vpsrldq     ymmE, ymmE, 2           ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+                                        ;       2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+    vpsrldq     ymmB, ymmD, 2           ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+                                        ;       1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+    vpunpckhwd  ymmC, ymmD, ymmH        ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+                                        ;       1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+    vpunpcklwd  ymmD, ymmD, ymmH        ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+                                        ;       1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+    vpunpckhwd  ymmF, ymmE, ymmB        ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+                                        ;       2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+    vpunpcklwd  ymmE, ymmE, ymmB        ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+                                        ;       2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+    vpshufd     ymmH, ymmA, 0x4E        ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+                                        ;       0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+    vpunpckldq  ymmA, ymmA, ymmD        ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+    vpunpckhdq  ymmD, ymmD, ymmE        ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+    vpunpckldq  ymmE, ymmE, ymmH        ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+                                        ;       2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+    vpshufd     ymmH, ymmG, 0x4E        ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+                                        ;       0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+    vpunpckldq  ymmG, ymmG, ymmC        ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+                                        ;       0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+    vpunpckhdq  ymmC, ymmC, ymmF        ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+                                        ;       1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+    vpunpckldq  ymmF, ymmF, ymmH        ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+    vpunpcklqdq ymmH, ymmA, ymmE        ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vpunpcklqdq ymmG, ymmD, ymmG        ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+    vpunpcklqdq ymmC, ymmF, ymmC        ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vperm2i128  ymmA, ymmH, ymmG, 0x20  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vperm2i128  ymmD, ymmC, ymmH, 0x30  ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vperm2i128  ymmF, ymmG, ymmC, 0x31  ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        edi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_YMMWORD
+    jz          near .endcolumn
+
+    add         esi, byte SIZEOF_YMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         ebx, byte SIZEOF_YMMWORD  ; inptr1
+    add         edx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st64:
+    lea         ecx, [ecx+ecx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         ecx, byte 2*SIZEOF_YMMWORD
+    jb          short .column_st32
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmF
+    sub         ecx, byte 2*SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st32:
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jb          short .column_st31
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    add         edi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         ecx, byte SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st31:
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    sub         ecx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    vmovq       XMM_MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_MMWORD
+    sub         ecx, byte SIZEOF_MMWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    vmovd       XMM_DWORD [edi], xmmA
+    add         edi, byte SIZEOF_DWORD
+    sub         ecx, byte SIZEOF_DWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of eax to the output when it has enough
+    ; space.
+    vmovd       eax, xmmA
+    cmp         ecx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [edi], ax
+    add         edi, byte SIZEOF_WORD
+    sub         ecx, byte SIZEOF_WORD
+    shr         eax, 16
+.column_st1:
+    ; Store the lower 1 byte of eax to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .endcolumn
+    mov         byte [edi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    vpcmpeqb    ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpcmpeqb    ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+    vpxor       ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpxor       ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+    ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmG        ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+                                        ;       2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+    vpunpcklbw  ymmB, ymmB, ymmD        ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+                                        ;       0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+    vpunpcklbw  ymmF, ymmF, ymmH        ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+                                        ;       2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+    vpunpckhwd  ymmC, ymmA, ymmE        ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+                                        ;       0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+                                        ;       0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+    vpunpckhwd  ymmG, ymmB, ymmF        ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+                                        ;       0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+    vpunpcklwd  ymmB, ymmB, ymmF        ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+                                        ;       0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+    vpunpckhdq  ymmE, ymmA, ymmB        ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vpunpckldq  ymmB, ymmA, ymmB        ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vpunpckhdq  ymmF, ymmC, ymmG        ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+    vpunpckldq  ymmG, ymmC, ymmG        ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+    vperm2i128  ymmA, ymmB, ymmE, 0x20  ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    vperm2i128  ymmD, ymmG, ymmF, 0x20  ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    vperm2i128  ymmC, ymmB, ymmE, 0x31  ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vperm2i128  ymmH, ymmG, ymmF, 0x31  ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        edi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+    vmovntdq    YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+    vmovdqu     YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+    add         edi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_YMMWORD
+    jz          near .endcolumn
+
+    add         esi, byte SIZEOF_YMMWORD  ; inptr0
+    dec         al
+    jnz         near .Yloop_2nd
+
+    add         ebx, byte SIZEOF_YMMWORD  ; inptr1
+    add         edx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st64:
+    cmp         ecx, byte SIZEOF_YMMWORD/2
+    jb          short .column_st32
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmC
+    vmovdqa     ymmD, ymmH
+    sub         ecx, byte SIZEOF_YMMWORD/2
+.column_st32:
+    cmp         ecx, byte SIZEOF_YMMWORD/4
+    jb          short .column_st16
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    add         edi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         ecx, byte SIZEOF_YMMWORD/4
+.column_st16:
+    cmp         ecx, byte SIZEOF_YMMWORD/8
+    jb          short .column_st15
+    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    sub         ecx, byte SIZEOF_YMMWORD/8
+.column_st15:
+    ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_YMMWORD/16
+    jb          short .column_st7
+    vmovq       MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_YMMWORD/16*4
+    sub         ecx, byte SIZEOF_YMMWORD/16
+    vpsrldq     xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+    ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .endcolumn
+    vmovd       XMM_DWORD [edi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+    sfence                              ; flush the write buffer
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_avx2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_avx2)
+
+EXTN(jsimd_h2v2_merged_upsample_avx2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         eax, POINTER [output_width(ebp)]
+
+    mov         edi, JSAMPIMAGE [input_buf(ebp)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(ebp)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    mov         edi, JSAMPARRAY [output_buf(ebp)]
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]
+
+    push        edx                     ; inptr2
+    push        ebx                     ; inptr1
+    push        esi                     ; inptr00
+    mov         ebx, esp
+
+    push        edi                     ; output_buf (outptr0)
+    push        ecx                     ; in_row_group_ctr
+    push        ebx                     ; input_buf
+    push        eax                     ; output_width
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_avx2)
+
+    add         esi, byte SIZEOF_JSAMPROW  ; inptr01
+    add         edi, byte SIZEOF_JSAMPROW  ; outptr1
+    mov         POINTER [ebx+0*SIZEOF_POINTER], esi
+    mov         POINTER [ebx-1*SIZEOF_POINTER], edi
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_avx2)
+
+    add         esp, byte 7*SIZEOF_DWORD
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-mmx.asm
new file mode 100644
index 0000000000..eb3e36b475
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-mmx.asm
@@ -0,0 +1,460 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf,
+;                                JDIMENSION in_row_group_ctr,
+;                                JSAMPARRAY output_buf);
+;
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM        3
+%define gotptr        wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_mmx)
+
+EXTN(jsimd_h2v1_merged_upsample_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [output_width(eax)]  ; col
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         edi, JSAMPIMAGE [input_buf(eax)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(eax)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    mov         edi, JSAMPARRAY [output_buf(eax)]
+    mov         esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]  ; inptr0
+    mov         ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]  ; inptr1
+    mov         edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]  ; inptr2
+    mov         edi, JSAMPROW [edi]                      ; outptr
+
+    pop         ecx                     ; col
+
+    alignx      16, 7
+.columnloop:
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    movq        mm6, MMWORD [ebx]       ; mm6=Cb(01234567)
+    movq        mm7, MMWORD [edx]       ; mm7=Cr(01234567)
+
+    pxor        mm1, mm1                ; mm1=(all 0's)
+    pcmpeqw     mm3, mm3
+    psllw       mm3, 7                  ; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
+
+    movq        mm4, mm6
+    punpckhbw   mm6, mm1                ; mm6=Cb(4567)=CbH
+    punpcklbw   mm4, mm1                ; mm4=Cb(0123)=CbL
+    movq        mm0, mm7
+    punpckhbw   mm7, mm1                ; mm7=Cr(4567)=CrH
+    punpcklbw   mm0, mm1                ; mm0=Cr(0123)=CrL
+
+    paddw       mm6, mm3
+    paddw       mm4, mm3
+    paddw       mm7, mm3
+    paddw       mm0, mm3
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    movq        mm5, mm6                ; mm5=CbH
+    movq        mm2, mm4                ; mm2=CbL
+    paddw       mm6, mm6                ; mm6=2*CbH
+    paddw       mm4, mm4                ; mm4=2*CbL
+    movq        mm1, mm7                ; mm1=CrH
+    movq        mm3, mm0                ; mm3=CrL
+    paddw       mm7, mm7                ; mm7=2*CrH
+    paddw       mm0, mm0                ; mm0=2*CrL
+
+    pmulhw      mm6, [GOTOFF(eax,PW_MF0228)]  ; mm6=(2*CbH * -FIX(0.22800))
+    pmulhw      mm4, [GOTOFF(eax,PW_MF0228)]  ; mm4=(2*CbL * -FIX(0.22800))
+    pmulhw      mm7, [GOTOFF(eax,PW_F0402)]   ; mm7=(2*CrH * FIX(0.40200))
+    pmulhw      mm0, [GOTOFF(eax,PW_F0402)]   ; mm0=(2*CrL * FIX(0.40200))
+
+    paddw       mm6, [GOTOFF(eax,PW_ONE)]
+    paddw       mm4, [GOTOFF(eax,PW_ONE)]
+    psraw       mm6, 1                  ; mm6=(CbH * -FIX(0.22800))
+    psraw       mm4, 1                  ; mm4=(CbL * -FIX(0.22800))
+    paddw       mm7, [GOTOFF(eax,PW_ONE)]
+    paddw       mm0, [GOTOFF(eax,PW_ONE)]
+    psraw       mm7, 1                  ; mm7=(CrH * FIX(0.40200))
+    psraw       mm0, 1                  ; mm0=(CrL * FIX(0.40200))
+
+    paddw       mm6, mm5
+    paddw       mm4, mm2
+    paddw       mm6, mm5                ; mm6=(CbH * FIX(1.77200))=(B-Y)H
+    paddw       mm4, mm2                ; mm4=(CbL * FIX(1.77200))=(B-Y)L
+    paddw       mm7, mm1                ; mm7=(CrH * FIX(1.40200))=(R-Y)H
+    paddw       mm0, mm3                ; mm0=(CrL * FIX(1.40200))=(R-Y)L
+
+    movq        MMWORD [wk(0)], mm6     ; wk(0)=(B-Y)H
+    movq        MMWORD [wk(1)], mm7     ; wk(1)=(R-Y)H
+
+    movq        mm6, mm5
+    movq        mm7, mm2
+    punpcklwd   mm5, mm1
+    punpckhwd   mm6, mm1
+    pmaddwd     mm5, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     mm6, [GOTOFF(eax,PW_MF0344_F0285)]
+    punpcklwd   mm2, mm3
+    punpckhwd   mm7, mm3
+    pmaddwd     mm2, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     mm7, [GOTOFF(eax,PW_MF0344_F0285)]
+
+    paddd       mm5, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       mm6, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       mm5, SCALEBITS
+    psrad       mm6, SCALEBITS
+    paddd       mm2, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       mm7, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       mm2, SCALEBITS
+    psrad       mm7, SCALEBITS
+
+    packssdw    mm5, mm6                ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+    packssdw    mm2, mm7                ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+    psubw       mm5, mm1                ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+    psubw       mm2, mm3                ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+    movq        MMWORD [wk(2)], mm5     ; wk(2)=(G-Y)H
+
+    mov         al, 2                   ; Yctr
+    jmp         short .Yloop_1st
+    alignx      16, 7
+
+.Yloop_2nd:
+    movq        mm0, MMWORD [wk(1)]     ; mm0=(R-Y)H
+    movq        mm2, MMWORD [wk(2)]     ; mm2=(G-Y)H
+    movq        mm4, MMWORD [wk(0)]     ; mm4=(B-Y)H
+    alignx      16, 7
+
+.Yloop_1st:
+    movq        mm7, MMWORD [esi]       ; mm7=Y(01234567)
+
+    pcmpeqw     mm6, mm6
+    psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}
+    pand        mm6, mm7                ; mm6=Y(0246)=YE
+    psrlw       mm7, BYTE_BIT           ; mm7=Y(1357)=YO
+
+    movq        mm1, mm0                ; mm1=mm0=(R-Y)(L/H)
+    movq        mm3, mm2                ; mm3=mm2=(G-Y)(L/H)
+    movq        mm5, mm4                ; mm5=mm4=(B-Y)(L/H)
+
+    paddw       mm0, mm6                ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
+    paddw       mm1, mm7                ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
+    packuswb    mm0, mm0                ; mm0=(R0 R2 R4 R6 ** ** ** **)
+    packuswb    mm1, mm1                ; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+    paddw       mm2, mm6                ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
+    paddw       mm3, mm7                ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
+    packuswb    mm2, mm2                ; mm2=(G0 G2 G4 G6 ** ** ** **)
+    packuswb    mm3, mm3                ; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+    paddw       mm4, mm6                ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
+    paddw       mm5, mm7                ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
+    packuswb    mm4, mm4                ; mm4=(B0 B2 B4 B6 ** ** ** **)
+    packuswb    mm5, mm5                ; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+    ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
+    punpcklbw   mmE, mmB                ; mmE=(20 01 22 03 24 05 26 07)
+    punpcklbw   mmD, mmF                ; mmD=(11 21 13 23 15 25 17 27)
+
+    movq        mmG, mmA
+    movq        mmH, mmA
+    punpcklwd   mmA, mmE                ; mmA=(00 10 20 01 02 12 22 03)
+    punpckhwd   mmG, mmE                ; mmG=(04 14 24 05 06 16 26 07)
+
+    psrlq       mmH, 2*BYTE_BIT         ; mmH=(02 12 04 14 06 16 -- --)
+    psrlq       mmE, 2*BYTE_BIT         ; mmE=(22 03 24 05 26 07 -- --)
+
+    movq        mmC, mmD
+    movq        mmB, mmD
+    punpcklwd   mmD, mmH                ; mmD=(11 21 02 12 13 23 04 14)
+    punpckhwd   mmC, mmH                ; mmC=(15 25 06 16 17 27 -- --)
+
+    psrlq       mmB, 2*BYTE_BIT         ; mmB=(13 23 15 25 17 27 -- --)
+
+    movq        mmF, mmE
+    punpcklwd   mmE, mmB                ; mmE=(22 03 13 23 24 05 15 25)
+    punpckhwd   mmF, mmB                ; mmF=(26 07 17 27 -- -- -- --)
+
+    punpckldq   mmA, mmD                ; mmA=(00 10 20 01 11 21 02 12)
+    punpckldq   mmE, mmG                ; mmE=(22 03 13 23 04 14 24 05)
+    punpckldq   mmC, mmF                ; mmC=(15 25 06 16 26 07 17 27)
+
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st16
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+    sub         ecx, byte SIZEOF_MMWORD
+    jz          near .endcolumn
+
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
+    add         esi, byte SIZEOF_MMWORD                ; inptr0
+    dec         al                                     ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         ebx, byte SIZEOF_MMWORD                ; inptr1
+    add         edx, byte SIZEOF_MMWORD                ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st16:
+    lea         ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
+    cmp         ecx, byte 2*SIZEOF_MMWORD
+    jb          short .column_st8
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
+    movq        mmA, mmC
+    sub         ecx, byte 2*SIZEOF_MMWORD
+    add         edi, byte 2*SIZEOF_MMWORD
+    jmp         short .column_st4
+.column_st8:
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st4
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        mmA, mmE
+    sub         ecx, byte SIZEOF_MMWORD
+    add         edi, byte SIZEOF_MMWORD
+.column_st4:
+    movd        eax, mmA
+    cmp         ecx, byte SIZEOF_DWORD
+    jb          short .column_st2
+    mov         dword [edi+0*SIZEOF_DWORD], eax
+    psrlq       mmA, DWORD_BIT
+    movd        eax, mmA
+    sub         ecx, byte SIZEOF_DWORD
+    add         edi, byte SIZEOF_DWORD
+.column_st2:
+    cmp         ecx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [edi+0*SIZEOF_WORD], ax
+    shr         eax, WORD_BIT
+    sub         ecx, byte SIZEOF_WORD
+    add         edi, byte SIZEOF_WORD
+.column_st1:
+    cmp         ecx, byte SIZEOF_BYTE
+    jb          short .endcolumn
+    mov         byte [edi+0*SIZEOF_BYTE], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    pcmpeqb     mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
+    pcmpeqb     mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+    pxor        mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
+    pxor        mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+    ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
+    punpcklbw   mmE, mmG                ; mmE=(20 30 22 32 24 34 26 36)
+    punpcklbw   mmB, mmD                ; mmB=(01 11 03 13 05 15 07 17)
+    punpcklbw   mmF, mmH                ; mmF=(21 31 23 33 25 35 27 37)
+
+    movq        mmC, mmA
+    punpcklwd   mmA, mmE                ; mmA=(00 10 20 30 02 12 22 32)
+    punpckhwd   mmC, mmE                ; mmC=(04 14 24 34 06 16 26 36)
+    movq        mmG, mmB
+    punpcklwd   mmB, mmF                ; mmB=(01 11 21 31 03 13 23 33)
+    punpckhwd   mmG, mmF                ; mmG=(05 15 25 35 07 17 27 37)
+
+    movq        mmD, mmA
+    punpckldq   mmA, mmB                ; mmA=(00 10 20 30 01 11 21 31)
+    punpckhdq   mmD, mmB                ; mmD=(02 12 22 32 03 13 23 33)
+    movq        mmH, mmC
+    punpckldq   mmC, mmG                ; mmC=(04 14 24 34 05 15 25 35)
+    punpckhdq   mmH, mmG                ; mmH=(06 16 26 36 07 17 27 37)
+
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st16
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
+    movq        MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+    sub         ecx, byte SIZEOF_MMWORD
+    jz          short .endcolumn
+
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
+    add         esi, byte SIZEOF_MMWORD                ; inptr0
+    dec         al                                     ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         ebx, byte SIZEOF_MMWORD                ; inptr1
+    add         edx, byte SIZEOF_MMWORD                ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st16:
+    cmp         ecx, byte SIZEOF_MMWORD/2
+    jb          short .column_st8
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
+    movq        mmA, mmC
+    movq        mmD, mmH
+    sub         ecx, byte SIZEOF_MMWORD/2
+    add         edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+    cmp         ecx, byte SIZEOF_MMWORD/4
+    jb          short .column_st4
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        mmA, mmD
+    sub         ecx, byte SIZEOF_MMWORD/4
+    add         edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+    cmp         ecx, byte SIZEOF_MMWORD/8
+    jb          short .endcolumn
+    movd        dword [edi+0*SIZEOF_DWORD], mmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf,
+;                                JDIMENSION in_row_group_ctr,
+;                                JSAMPARRAY output_buf);
+;
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_mmx)
+
+EXTN(jsimd_h2v2_merged_upsample_mmx):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         eax, JDIMENSION [output_width(ebp)]
+
+    mov         edi, JSAMPIMAGE [input_buf(ebp)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(ebp)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    mov         edi, JSAMPARRAY [output_buf(ebp)]
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]
+
+    push        edx                     ; inptr2
+    push        ebx                     ; inptr1
+    push        esi                     ; inptr00
+    mov         ebx, esp
+
+    push        edi                     ; output_buf (outptr0)
+    push        ecx                     ; in_row_group_ctr
+    push        ebx                     ; input_buf
+    push        eax                     ; output_width
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+    add         esi, byte SIZEOF_JSAMPROW  ; inptr01
+    add         edi, byte SIZEOF_JSAMPROW  ; outptr1
+    mov         POINTER [ebx+0*SIZEOF_POINTER], esi
+    mov         POINTER [ebx-1*SIZEOF_POINTER], edi
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+    add         esp, byte 7*SIZEOF_DWORD
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-sse2.asm
new file mode 100644
index 0000000000..c113dc4d27
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-sse2.asm
@@ -0,0 +1,517 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        3
+%define gotptr        wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2)
+
+EXTN(jsimd_h2v1_merged_upsample_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [output_width(eax)]  ; col
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         edi, JSAMPIMAGE [input_buf(eax)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(eax)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    mov         edi, JSAMPARRAY [output_buf(eax)]
+    mov         esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]  ; inptr0
+    mov         ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]  ; inptr1
+    mov         edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]  ; inptr2
+    mov         edi, JSAMPROW [edi]                      ; outptr
+
+    pop         ecx                     ; col
+
+    alignx      16, 7
+.columnloop:
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    movdqa      xmm6, XMMWORD [ebx]     ; xmm6=Cb(0123456789ABCDEF)
+    movdqa      xmm7, XMMWORD [edx]     ; xmm7=Cr(0123456789ABCDEF)
+
+    pxor        xmm1, xmm1              ; xmm1=(all 0's)
+    pcmpeqw     xmm3, xmm3
+    psllw       xmm3, 7                 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    movdqa      xmm4, xmm6
+    punpckhbw   xmm6, xmm1              ; xmm6=Cb(89ABCDEF)=CbH
+    punpcklbw   xmm4, xmm1              ; xmm4=Cb(01234567)=CbL
+    movdqa      xmm0, xmm7
+    punpckhbw   xmm7, xmm1              ; xmm7=Cr(89ABCDEF)=CrH
+    punpcklbw   xmm0, xmm1              ; xmm0=Cr(01234567)=CrL
+
+    paddw       xmm6, xmm3
+    paddw       xmm4, xmm3
+    paddw       xmm7, xmm3
+    paddw       xmm0, xmm3
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    movdqa      xmm5, xmm6              ; xmm5=CbH
+    movdqa      xmm2, xmm4              ; xmm2=CbL
+    paddw       xmm6, xmm6              ; xmm6=2*CbH
+    paddw       xmm4, xmm4              ; xmm4=2*CbL
+    movdqa      xmm1, xmm7              ; xmm1=CrH
+    movdqa      xmm3, xmm0              ; xmm3=CrL
+    paddw       xmm7, xmm7              ; xmm7=2*CrH
+    paddw       xmm0, xmm0              ; xmm0=2*CrL
+
+    pmulhw      xmm6, [GOTOFF(eax,PW_MF0228)]  ; xmm6=(2*CbH * -FIX(0.22800))
+    pmulhw      xmm4, [GOTOFF(eax,PW_MF0228)]  ; xmm4=(2*CbL * -FIX(0.22800))
+    pmulhw      xmm7, [GOTOFF(eax,PW_F0402)]   ; xmm7=(2*CrH * FIX(0.40200))
+    pmulhw      xmm0, [GOTOFF(eax,PW_F0402)]   ; xmm0=(2*CrL * FIX(0.40200))
+
+    paddw       xmm6, [GOTOFF(eax,PW_ONE)]
+    paddw       xmm4, [GOTOFF(eax,PW_ONE)]
+    psraw       xmm6, 1                 ; xmm6=(CbH * -FIX(0.22800))
+    psraw       xmm4, 1                 ; xmm4=(CbL * -FIX(0.22800))
+    paddw       xmm7, [GOTOFF(eax,PW_ONE)]
+    paddw       xmm0, [GOTOFF(eax,PW_ONE)]
+    psraw       xmm7, 1                 ; xmm7=(CrH * FIX(0.40200))
+    psraw       xmm0, 1                 ; xmm0=(CrL * FIX(0.40200))
+
+    paddw       xmm6, xmm5
+    paddw       xmm4, xmm2
+    paddw       xmm6, xmm5              ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+    paddw       xmm4, xmm2              ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+    paddw       xmm7, xmm1              ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+    paddw       xmm0, xmm3              ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+    movdqa      XMMWORD [wk(0)], xmm6   ; wk(0)=(B-Y)H
+    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=(R-Y)H
+
+    movdqa      xmm6, xmm5
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm5, xmm1
+    punpckhwd   xmm6, xmm1
+    pmaddwd     xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     xmm6, [GOTOFF(eax,PW_MF0344_F0285)]
+    punpcklwd   xmm2, xmm3
+    punpckhwd   xmm7, xmm3
+    pmaddwd     xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     xmm7, [GOTOFF(eax,PW_MF0344_F0285)]
+
+    paddd       xmm5, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       xmm6, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       xmm5, SCALEBITS
+    psrad       xmm6, SCALEBITS
+    paddd       xmm2, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       xmm7, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       xmm2, SCALEBITS
+    psrad       xmm7, SCALEBITS
+
+    packssdw    xmm5, xmm6              ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+    packssdw    xmm2, xmm7              ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+    psubw       xmm5, xmm1              ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+    psubw       xmm2, xmm3              ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+    movdqa      XMMWORD [wk(2)], xmm5   ; wk(2)=(G-Y)H
+
+    mov         al, 2                   ; Yctr
+    jmp         short .Yloop_1st
+    alignx      16, 7
+
+.Yloop_2nd:
+    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(R-Y)H
+    movdqa      xmm2, XMMWORD [wk(2)]   ; xmm2=(G-Y)H
+    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(B-Y)H
+    alignx      16, 7
+
+.Yloop_1st:
+    movdqa      xmm7, XMMWORD [esi]     ; xmm7=Y(0123456789ABCDEF)
+
+    pcmpeqw     xmm6, xmm6
+    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+    pand        xmm6, xmm7              ; xmm6=Y(02468ACE)=YE
+    psrlw       xmm7, BYTE_BIT          ; xmm7=Y(13579BDF)=YO
+
+    movdqa      xmm1, xmm0              ; xmm1=xmm0=(R-Y)(L/H)
+    movdqa      xmm3, xmm2              ; xmm3=xmm2=(G-Y)(L/H)
+    movdqa      xmm5, xmm4              ; xmm5=xmm4=(B-Y)(L/H)
+
+    paddw       xmm0, xmm6              ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+    paddw       xmm1, xmm7              ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+    packuswb    xmm0, xmm0              ; xmm0=R(02468ACE********)
+    packuswb    xmm1, xmm1              ; xmm1=R(13579BDF********)
+
+    paddw       xmm2, xmm6              ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+    paddw       xmm3, xmm7              ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+    packuswb    xmm2, xmm2              ; xmm2=G(02468ACE********)
+    packuswb    xmm3, xmm3              ; xmm3=G(13579BDF********)
+
+    paddw       xmm4, xmm6              ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+    paddw       xmm5, xmm7              ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+    packuswb    xmm4, xmm4              ; xmm4=B(02468ACE********)
+    packuswb    xmm5, xmm5              ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+    punpcklbw   xmmA, xmmC        ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmB        ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+    punpcklbw   xmmD, xmmF        ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+    movdqa      xmmG, xmmA
+    movdqa      xmmH, xmmA
+    punpcklwd   xmmA, xmmE        ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+    punpckhwd   xmmG, xmmE        ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+    psrldq      xmmH, 2           ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+    psrldq      xmmE, 2           ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+    movdqa      xmmC, xmmD
+    movdqa      xmmB, xmmD
+    punpcklwd   xmmD, xmmH        ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+    punpckhwd   xmmC, xmmH        ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+    psrldq      xmmB, 2           ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+    movdqa      xmmF, xmmE
+    punpcklwd   xmmE, xmmB        ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+    punpckhwd   xmmF, xmmB        ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+    pshufd      xmmH, xmmA, 0x4E  ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+    movdqa      xmmB, xmmE
+    punpckldq   xmmA, xmmD        ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+    punpckldq   xmmE, xmmH        ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+    punpckhdq   xmmD, xmmB        ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+    pshufd      xmmH, xmmG, 0x4E  ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+    movdqa      xmmB, xmmF
+    punpckldq   xmmG, xmmC        ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+    punpckldq   xmmF, xmmH        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+    punpckhdq   xmmC, xmmB        ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+    punpcklqdq  xmmA, xmmE        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    punpcklqdq  xmmD, xmmG        ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    punpcklqdq  xmmF, xmmC        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        edi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_XMMWORD
+    jz          near .endcolumn
+
+    add         esi, byte SIZEOF_XMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         ebx, byte SIZEOF_XMMWORD  ; inptr1
+    add         edx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st32:
+    lea         ecx, [ecx+ecx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         ecx, byte 2*SIZEOF_XMMWORD
+    jb          short .column_st16
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmF
+    sub         ecx, byte 2*SIZEOF_XMMWORD
+    jmp         short .column_st15
+.column_st16:
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         ecx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    movq        XMM_MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_MMWORD
+    sub         ecx, byte SIZEOF_MMWORD
+    psrldq      xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    movd        XMM_DWORD [edi], xmmA
+    add         edi, byte SIZEOF_DWORD
+    sub         ecx, byte SIZEOF_DWORD
+    psrldq      xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of eax to the output when it has enough
+    ; space.
+    movd        eax, xmmA
+    cmp         ecx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [edi], ax
+    add         edi, byte SIZEOF_WORD
+    sub         ecx, byte SIZEOF_WORD
+    shr         eax, 16
+.column_st1:
+    ; Store the lower 1 byte of eax to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .endcolumn
+    mov         byte [edi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%else
+    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%endif
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+    punpcklbw   xmmA, xmmC  ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+    punpcklbw   xmmB, xmmD  ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+    punpcklbw   xmmF, xmmH  ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+    movdqa      xmmC, xmmA
+    punpcklwd   xmmA, xmmE  ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+    punpckhwd   xmmC, xmmE  ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+    movdqa      xmmG, xmmB
+    punpcklwd   xmmB, xmmF  ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+    punpckhwd   xmmG, xmmF  ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpckldq   xmmA, xmmB  ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    punpckhdq   xmmD, xmmB  ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    movdqa      xmmH, xmmC
+    punpckldq   xmmC, xmmG  ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    punpckhdq   xmmH, xmmG  ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        edi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+    movntdq     XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+    movdqu      XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_XMMWORD
+    jz          near .endcolumn
+
+    add         esi, byte SIZEOF_XMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         ebx, byte SIZEOF_XMMWORD  ; inptr1
+    add         edx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st32:
+    cmp         ecx, byte SIZEOF_XMMWORD/2
+    jb          short .column_st16
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmC
+    movdqa      xmmD, xmmH
+    sub         ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+    cmp         ecx, byte SIZEOF_XMMWORD/4
+    jb          short .column_st15
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+    ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_XMMWORD/8
+    jb          short .column_st7
+    movq        XMM_MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_XMMWORD/8*4
+    sub         ecx, byte SIZEOF_XMMWORD/8
+    psrldq      xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+    ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .endcolumn
+    movd        XMM_DWORD [edi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+    sfence                              ; flush the write buffer
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2)
+
+EXTN(jsimd_h2v2_merged_upsample_sse2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         eax, POINTER [output_width(ebp)]
+
+    mov         edi, JSAMPIMAGE [input_buf(ebp)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(ebp)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    mov         edi, JSAMPARRAY [output_buf(ebp)]
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]
+
+    push        edx                     ; inptr2
+    push        ebx                     ; inptr1
+    push        esi                     ; inptr00
+    mov         ebx, esp
+
+    push        edi                     ; output_buf (outptr0)
+    push        ecx                     ; in_row_group_ctr
+    push        ebx                     ; input_buf
+    push        eax                     ; output_width
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+    add         esi, byte SIZEOF_JSAMPROW  ; inptr01
+    add         edi, byte SIZEOF_JSAMPROW  ; outptr1
+    mov         POINTER [ebx+0*SIZEOF_POINTER], esi
+    mov         POINTER [ebx-1*SIZEOF_POINTER], edi
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+    add         esp, byte 7*SIZEOF_DWORD
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-avx2.asm
new file mode 100644
index 0000000000..a800c35e08
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-avx2.asm
@@ -0,0 +1,760 @@
+;
+; jdsample.asm - upsampling (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fancy_upsample_avx2)
+
+EXTN(jconst_fancy_upsample_avx2):
+
+PW_ONE   times 16 dw 1
+PW_TWO   times 16 dw 2
+PW_THREE times 16 dw 3
+PW_SEVEN times 16 dw 7
+PW_EIGHT times 16 dw 8
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v1_fancy_upsample_avx2):
+    push        ebp
+    mov         ebp, esp
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
+    test        eax, eax
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; colctr
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+
+    test        eax, SIZEOF_YMMWORD-1
+    jz          short .skip
+    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+.skip:
+    vpxor       ymm0, ymm0, ymm0                ; ymm0=(all 0's)
+    vpcmpeqb    xmm7, xmm7, xmm7
+    vpsrldq     xmm7, xmm7, (SIZEOF_XMMWORD-1)  ; (ff -- -- -- ... -- --) LSB is ff
+    vpand       ymm7, ymm7, YMMWORD [esi+0*SIZEOF_YMMWORD]
+
+    add         eax, byte SIZEOF_YMMWORD-1
+    and         eax, byte -SIZEOF_YMMWORD
+    cmp         eax, byte SIZEOF_YMMWORD
+    ja          short .columnloop
+    alignx      16, 7
+
+.columnloop_last:
+    vpcmpeqb    xmm6, xmm6, xmm6
+    vpslldq     xmm6, xmm6, (SIZEOF_XMMWORD-1)
+    vperm2i128  ymm6, ymm6, ymm6, 1             ; (---- ---- ... ---- ---- ff) MSB is ff
+    vpand       ymm6, ymm6, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    jmp         short .upsample
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymm6, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    vperm2i128  ymm6, ymm0, ymm6, 0x20
+    vpslldq     ymm6, ymm6, 15
+
+.upsample:
+    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; ymm1=( 0  1  2 ... 29 30 31)
+
+    vperm2i128  ymm2, ymm0, ymm1, 0x20
+    vpalignr    ymm2, ymm1, ymm2, 15            ; ymm2=(--  0  1 ... 28 29 30)
+    vperm2i128  ymm4, ymm0, ymm1, 0x03
+    vpalignr    ymm3, ymm4, ymm1, 1             ; ymm3=( 1  2  3 ... 30 31 --)
+
+    vpor        ymm2, ymm2, ymm7                ; ymm2=(-1  0  1 ... 28 29 30)
+    vpor        ymm3, ymm3, ymm6                ; ymm3=( 1  2  3 ... 30 31 32)
+
+    vpsrldq     ymm7, ymm4, (SIZEOF_XMMWORD-1)  ; ymm7=(31 -- -- ... -- -- --)
+
+    vpunpckhbw  ymm4, ymm1, ymm0                ; ymm4=( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm5, ymm1, ymm0                ; ymm5=( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm1, ymm5, ymm4, 0x20          ; ymm1=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm4, ymm5, ymm4, 0x31          ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm5, ymm2, ymm0                ; ymm5=( 7  8  9 10 11 12 13 14 23 24 25 26 27 28 29 30)
+    vpunpcklbw  ymm6, ymm2, ymm0                ; ymm6=(-1  0  1  2  3  4  5  6 15 16 17 18 19 20 21 22)
+    vperm2i128  ymm2, ymm6, ymm5, 0x20          ; ymm2=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+    vperm2i128  ymm5, ymm6, ymm5, 0x31          ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vpunpckhbw  ymm6, ymm3, ymm0                ; ymm6=( 1  2  3  4  5  6  7  8 17 18 19 20 21 22 23 24)
+    vpunpcklbw  ymm0, ymm3, ymm0                ; ymm0=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
+    vperm2i128  ymm3, ymm0, ymm6, 0x20          ; ymm3=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
+    vperm2i128  ymm6, ymm0, ymm6, 0x31          ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+    vpxor       ymm0, ymm0, ymm0                ; ymm0=(all 0's)
+
+    vpmullw     ymm1, ymm1, [GOTOFF(ebx,PW_THREE)]
+    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+    vpaddw      ymm2, ymm2, [GOTOFF(ebx,PW_ONE)]
+    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_ONE)]
+    vpaddw      ymm3, ymm3, [GOTOFF(ebx,PW_TWO)]
+    vpaddw      ymm6, ymm6, [GOTOFF(ebx,PW_TWO)]
+
+    vpaddw      ymm2, ymm2, ymm1
+    vpaddw      ymm5, ymm5, ymm4
+    vpsrlw      ymm2, ymm2, 2                   ; ymm2=OutLE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
+    vpsrlw      ymm5, ymm5, 2                   ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+    vpaddw      ymm3, ymm3, ymm1
+    vpaddw      ymm6, ymm6, ymm4
+    vpsrlw      ymm3, ymm3, 2                   ; ymm3=OutLO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
+    vpsrlw      ymm6, ymm6, 2                   ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+    vpsllw      ymm3, ymm3, BYTE_BIT
+    vpsllw      ymm6, ymm6, BYTE_BIT
+    vpor        ymm2, ymm2, ymm3                ; ymm2=OutL=( 0  1  2 ... 29 30 31)
+    vpor        ymm5, ymm5, ymm6                ; ymm5=OutH=(32 33 34 ... 61 62 63)
+
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm5
+
+    sub         eax, byte SIZEOF_YMMWORD
+    add         esi, byte 1*SIZEOF_YMMWORD  ; inptr
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
+    cmp         eax, byte SIZEOF_YMMWORD
+    ja          near .columnloop
+    test        eax, eax
+    jnz         near .columnloop_last
+
+    pop         esi
+    pop         edi
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         ecx                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+                                        ; ymmword wk[WK_NUM]
+%define WK_NUM        4
+%define gotptr        wk(0) - SIZEOF_POINTER  ; void *gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v2_fancy_upsample_avx2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         edx, eax                ; edx = original ebp
+    mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
+    test        eax, eax
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(edx)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(edx)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(edx)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; colctr
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
+
+    test        eax, SIZEOF_YMMWORD-1
+    jz          short .skip
+    push        edx
+    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+    pop         edx
+.skip:
+    ; -- process the first column block
+
+    vmovdqu     ymm0, YMMWORD [ebx+0*SIZEOF_YMMWORD]  ; ymm0=row[ 0][0]
+    vmovdqu     ymm1, YMMWORD [ecx+0*SIZEOF_YMMWORD]  ; ymm1=row[-1][0]
+    vmovdqu     ymm2, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; ymm2=row[+1][0]
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    vpxor       ymm3, ymm3, ymm3        ; ymm3=(all 0's)
+
+    vpunpckhbw  ymm4, ymm0, ymm3        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm5, ymm0, ymm3        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm5, ymm1, ymm3        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm6, ymm1, ymm3        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm6, ymm2, ymm3        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm3, ymm2, ymm3        ; ymm3=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm2, ymm3, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm6, ymm3, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpmullw     ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
+    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+
+    vpcmpeqb    xmm7, xmm7, xmm7
+    vpsrldq     xmm7, xmm7, (SIZEOF_XMMWORD-2)  ; (ffff ---- ---- ... ---- ----) LSB is ffff
+
+    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vmovdqu     YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1  ; temporarily save
+    vmovdqu     YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5  ; the intermediate data
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm6
+
+    vpand       ymm1, ymm1, ymm7        ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vpand       ymm2, ymm2, ymm7        ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+
+    vmovdqa     YMMWORD [wk(0)], ymm1
+    vmovdqa     YMMWORD [wk(1)], ymm2
+
+    poppic      ebx
+
+    add         eax, byte SIZEOF_YMMWORD-1
+    and         eax, byte -SIZEOF_YMMWORD
+    cmp         eax, byte SIZEOF_YMMWORD
+    ja          short .columnloop
+    alignx      16, 7
+
+.columnloop_last:
+    ; -- process the last column block
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    vpcmpeqb    xmm1, xmm1, xmm1
+    vpslldq     xmm1, xmm1, (SIZEOF_XMMWORD-2)
+    vperm2i128  ymm1, ymm1, ymm1, 1             ; (---- ---- ... ---- ---- ffff) MSB is ffff
+
+    vpand       ymm2, ymm1, YMMWORD [edi+1*SIZEOF_YMMWORD]
+    vpand       ymm1, ymm1, YMMWORD [edx+1*SIZEOF_YMMWORD]
+
+    vmovdqa     YMMWORD [wk(2)], ymm1          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+    vmovdqa     YMMWORD [wk(3)], ymm2          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+
+    jmp         near .upsample
+    alignx      16, 7
+
+.columnloop:
+    ; -- process the next column block
+
+    vmovdqu     ymm0, YMMWORD [ebx+1*SIZEOF_YMMWORD]  ; ymm0=row[ 0][1]
+    vmovdqu     ymm1, YMMWORD [ecx+1*SIZEOF_YMMWORD]  ; ymm1=row[-1][1]
+    vmovdqu     ymm2, YMMWORD [esi+1*SIZEOF_YMMWORD]  ; ymm2=row[+1][1]
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    vpxor       ymm3, ymm3, ymm3        ; ymm3=(all 0's)
+
+    vpunpckhbw  ymm4, ymm0, ymm3        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm5, ymm0, ymm3        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm5, ymm1, ymm3        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm6, ymm1, ymm3        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm6, ymm2, ymm3        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm7, ymm2, ymm3        ; ymm7=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm2, ymm7, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm6, ymm7, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpmullw     ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
+    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+
+    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vmovdqu     YMMWORD [edx+2*SIZEOF_YMMWORD], ymm1  ; temporarily save
+    vmovdqu     YMMWORD [edx+3*SIZEOF_YMMWORD], ymm5  ; the intermediate data
+    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymm2
+    vmovdqu     YMMWORD [edi+3*SIZEOF_YMMWORD], ymm6
+
+    vperm2i128  ymm1, ymm3, ymm1, 0x20
+    vpslldq     ymm1, ymm1, 14          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
+    vperm2i128  ymm2, ymm3, ymm2, 0x20
+    vpslldq     ymm2, ymm2, 14          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
+
+    vmovdqa     YMMWORD [wk(2)], ymm1
+    vmovdqa     YMMWORD [wk(3)], ymm2
+
+.upsample:
+    ; -- process the upper row
+
+    vmovdqu     ymm7, YMMWORD [edx+0*SIZEOF_YMMWORD]  ; ymm7=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vmovdqu     ymm3, YMMWORD [edx+1*SIZEOF_YMMWORD]  ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
+
+    vperm2i128  ymm0, ymm1, ymm7, 0x03
+    vpalignr    ymm0, ymm0, ymm7, 2     ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
+    vperm2i128  ymm4, ymm1, ymm3, 0x20
+    vpslldq     ymm4, ymm4, 14          ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+    vperm2i128  ymm5, ymm1, ymm7, 0x03
+    vpsrldq     ymm5, ymm5, 14          ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm6, ymm1, ymm3, 0x20
+    vpalignr    ymm6, ymm3, ymm6, 14    ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vpor        ymm0, ymm0, ymm4        ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
+    vpor        ymm5, ymm5, ymm6        ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vperm2i128  ymm2, ymm1, ymm3, 0x03
+    vpalignr    ymm2, ymm2, ymm3, 2     ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+    vperm2i128  ymm4, ymm1, ymm3, 0x03
+    vpsrldq     ymm4, ymm4, 14          ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm1, ymm1, ymm7, 0x20
+    vpalignr    ymm1, ymm7, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+
+    vpor        ymm1, ymm1, YMMWORD [wk(0)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+    vpor        ymm2, ymm2, YMMWORD [wk(2)]  ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+    vmovdqa     YMMWORD [wk(0)], ymm4
+
+    vpmullw     ymm7, ymm7, [GOTOFF(ebx,PW_THREE)]
+    vpmullw     ymm3, ymm3, [GOTOFF(ebx,PW_THREE)]
+    vpaddw      ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
+    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_EIGHT)]
+    vpaddw      ymm0, ymm0, [GOTOFF(ebx,PW_SEVEN)]
+    vpaddw      ymm2, [GOTOFF(ebx,PW_SEVEN)]
+
+    vpaddw      ymm1, ymm1, ymm7
+    vpaddw      ymm5, ymm5, ymm3
+    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out0LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
+    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm2, ymm2, ymm3
+    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out0LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
+    vpsrlw      ymm2, ymm2, 4           ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+    vpsllw      ymm0, ymm0, BYTE_BIT
+    vpsllw      ymm2, ymm2, BYTE_BIT
+    vpor        ymm1, ymm1, ymm0        ; ymm1=Out0L=( 0  1  2 ... 29 30 31)
+    vpor        ymm5, ymm5, ymm2        ; ymm5=Out0H=(32 33 34 ... 61 62 63)
+
+    vmovdqu     YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1
+    vmovdqu     YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5
+
+    ; -- process the lower row
+
+    vmovdqu     ymm6, YMMWORD [edi+0*SIZEOF_YMMWORD]  ; ymm6=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vmovdqu     ymm4, YMMWORD [edi+1*SIZEOF_YMMWORD]  ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
+
+    vperm2i128  ymm7, ymm1, ymm6, 0x03
+    vpalignr    ymm7, ymm7, ymm6, 2     ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
+    vperm2i128  ymm3, ymm1, ymm4, 0x20
+    vpslldq     ymm3, ymm3, 14          ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+    vperm2i128  ymm0, ymm1, ymm6, 0x03
+    vpsrldq     ymm0, ymm0, 14          ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm2, ymm1, ymm4, 0x20
+    vpalignr    ymm2, ymm4, ymm2, 14    ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vpor        ymm7, ymm7, ymm3        ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
+    vpor        ymm0, ymm0, ymm2        ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vperm2i128  ymm5, ymm1, ymm4, 0x03
+    vpalignr    ymm5, ymm5, ymm4, 2     ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+    vperm2i128  ymm3, ymm1, ymm4, 0x03
+    vpsrldq     ymm3, ymm3, 14          ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm1, ymm1, ymm6, 0x20
+    vpalignr    ymm1, ymm6, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+
+    vpor        ymm1, ymm1, YMMWORD [wk(1)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+    vpor        ymm5, ymm5, YMMWORD [wk(3)]  ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+    vmovdqa     YMMWORD [wk(1)], ymm3
+
+    vpmullw     ymm6, ymm6, [GOTOFF(ebx,PW_THREE)]
+    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+    vpaddw      ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
+    vpaddw      ymm0, ymm0, [GOTOFF(ebx,PW_EIGHT)]
+    vpaddw      ymm7, ymm7, [GOTOFF(ebx,PW_SEVEN)]
+    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_SEVEN)]
+
+    vpaddw      ymm1, ymm1, ymm6
+    vpaddw      ymm0, ymm0, ymm4
+    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out1LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
+    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+    vpaddw      ymm7, ymm7, ymm6
+    vpaddw      ymm5, ymm5, ymm4
+    vpsrlw      ymm7, ymm7, 4           ; ymm7=Out1LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
+    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+    vpsllw      ymm7, ymm7, BYTE_BIT
+    vpsllw      ymm5, ymm5, BYTE_BIT
+    vpor        ymm1, ymm1, ymm7        ; ymm1=Out1L=( 0  1  2 ... 29 30 31)
+    vpor        ymm0, ymm0, ymm5        ; ymm0=Out1H=(32 33 34 ... 61 62 63)
+
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm1
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm0
+
+    poppic      ebx
+
+    sub         eax, byte SIZEOF_YMMWORD
+    add         ecx, byte 1*SIZEOF_YMMWORD  ; inptr1(above)
+    add         ebx, byte 1*SIZEOF_YMMWORD  ; inptr0
+    add         esi, byte 1*SIZEOF_YMMWORD  ; inptr1(below)
+    add         edx, byte 2*SIZEOF_YMMWORD  ; outptr0
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr1
+    cmp         eax, byte SIZEOF_YMMWORD
+    ja          near .columnloop
+    test        eax, eax
+    jnz         near .columnloop_last
+
+    pop         esi
+    pop         edi
+    pop         ecx
+    pop         eax
+
+    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         ecx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define output_width(b)     (b) + 12    ; JDIMENSION output_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)
+
+EXTN(jsimd_h2v1_upsample_avx2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         edx, JDIMENSION [output_width(ebp)]
+    add         edx, byte (SIZEOF_YMMWORD-1)
+    and         edx, -SIZEOF_YMMWORD
+    jz          short .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          short .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+    mov         eax, edx                ; colctr
+    alignx      16, 7
+.columnloop:
+
+    cmp         eax, byte SIZEOF_YMMWORD
+    ja          near .above_16
+
+    vmovdqu     xmm0, XMMWORD [esi+0*SIZEOF_YMMWORD]
+    vpunpckhbw  xmm1, xmm0, xmm0
+    vpunpcklbw  xmm0, xmm0, xmm0
+
+    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+    vmovdqu     XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+    jmp         short .nextrow
+
+.above_16:
+    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+
+    vpermq      ymm0, ymm0, 0xd8
+    vpunpckhbw  ymm1, ymm0, ymm0
+    vpunpcklbw  ymm0, ymm0, ymm0
+
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1
+
+    sub         eax, byte 2*SIZEOF_YMMWORD
+    jz          short .nextrow
+
+    add         esi, byte SIZEOF_YMMWORD    ; inptr
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
+    jmp         short .columnloop
+    alignx      16, 7
+
+.nextrow:
+    pop         esi
+    pop         edi
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         ecx                        ; rowctr
+    jg          short .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define output_width(b)     (b) + 12    ; JDIMENSION output_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)
+
+EXTN(jsimd_h2v2_upsample_avx2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         edx, JDIMENSION [output_width(ebp)]
+    add         edx, byte (SIZEOF_YMMWORD-1)
+    and         edx, -SIZEOF_YMMWORD
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]                    ; inptr
+    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
+    mov         eax, edx                               ; colctr
+    alignx      16, 7
+.columnloop:
+
+    cmp         eax, byte SIZEOF_YMMWORD
+    ja          short .above_16
+
+    vmovdqu     xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    vpunpckhbw  xmm1, xmm0, xmm0
+    vpunpcklbw  xmm0, xmm0, xmm0
+
+    vmovdqu     XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
+    vmovdqu     XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
+    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+    vmovdqu     XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+    jmp         near .nextrow
+
+.above_16:
+    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+
+    vpermq      ymm0, ymm0, 0xd8
+    vpunpckhbw  ymm1, ymm0, ymm0
+    vpunpcklbw  ymm0, ymm0, ymm0
+
+    vmovdqu     YMMWORD [ebx+0*SIZEOF_YMMWORD], ymm0
+    vmovdqu     YMMWORD [ebx+1*SIZEOF_YMMWORD], ymm1
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1
+
+    sub         eax, byte 2*SIZEOF_YMMWORD
+    jz          short .nextrow
+
+    add         esi, byte SIZEOF_YMMWORD  ; inptr
+    add         ebx, 2*SIZEOF_YMMWORD     ; outptr0
+    add         edi, 2*SIZEOF_YMMWORD     ; outptr1
+    jmp         short .columnloop
+    alignx      16, 7
+
+.nextrow:
+    pop         esi
+    pop         edi
+
+    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         ecx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-mmx.asm
new file mode 100644
index 0000000000..12c49f0eab
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-mmx.asm
@@ -0,0 +1,731 @@
+;
+; jdsample.asm - upsampling (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fancy_upsample_mmx)
+
+EXTN(jconst_fancy_upsample_mmx):
+
+PW_ONE   times 4 dw 1
+PW_TWO   times 4 dw 2
+PW_THREE times 4 dw 3
+PW_SEVEN times 4 dw 7
+PW_EIGHT times 4 dw 8
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_mmx(int max_v_samp_factor,
+;                               JDIMENSION downsampled_width,
+;                               JSAMPARRAY input_data,
+;                               JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_mmx)
+
+EXTN(jsimd_h2v1_fancy_upsample_mmx):
+    push        ebp
+    mov         ebp, esp
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
+    test        eax, eax
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; colctr
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+
+    test        eax, SIZEOF_MMWORD-1
+    jz          short .skip
+    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+.skip:
+    pxor        mm0, mm0                ; mm0=(all 0's)
+    pcmpeqb     mm7, mm7
+    psrlq       mm7, (SIZEOF_MMWORD-1)*BYTE_BIT
+    pand        mm7,  MMWORD [esi+0*SIZEOF_MMWORD]
+
+    add         eax, byte SIZEOF_MMWORD-1
+    and         eax, byte -SIZEOF_MMWORD
+    cmp         eax, byte SIZEOF_MMWORD
+    ja          short .columnloop
+    alignx      16, 7
+
+.columnloop_last:
+    pcmpeqb     mm6, mm6
+    psllq       mm6, (SIZEOF_MMWORD-1)*BYTE_BIT
+    pand        mm6, MMWORD [esi+0*SIZEOF_MMWORD]
+    jmp         short .upsample
+    alignx      16, 7
+
+.columnloop:
+    movq        mm6, MMWORD [esi+1*SIZEOF_MMWORD]
+    psllq       mm6, (SIZEOF_MMWORD-1)*BYTE_BIT
+
+.upsample:
+    movq        mm1, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mm2, mm1
+    movq        mm3, mm1                ; mm1=( 0 1 2 3 4 5 6 7)
+    psllq       mm2, BYTE_BIT           ; mm2=( - 0 1 2 3 4 5 6)
+    psrlq       mm3, BYTE_BIT           ; mm3=( 1 2 3 4 5 6 7 -)
+
+    por         mm2, mm7                ; mm2=(-1 0 1 2 3 4 5 6)
+    por         mm3, mm6                ; mm3=( 1 2 3 4 5 6 7 8)
+
+    movq        mm7, mm1
+    psrlq       mm7, (SIZEOF_MMWORD-1)*BYTE_BIT  ; mm7=( 7 - - - - - - -)
+
+    movq        mm4, mm1
+    punpcklbw   mm1, mm0                ; mm1=( 0 1 2 3)
+    punpckhbw   mm4, mm0                ; mm4=( 4 5 6 7)
+    movq        mm5, mm2
+    punpcklbw   mm2, mm0                ; mm2=(-1 0 1 2)
+    punpckhbw   mm5, mm0                ; mm5=( 3 4 5 6)
+    movq        mm6, mm3
+    punpcklbw   mm3, mm0                ; mm3=( 1 2 3 4)
+    punpckhbw   mm6, mm0                ; mm6=( 5 6 7 8)
+
+    pmullw      mm1, [GOTOFF(ebx,PW_THREE)]
+    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
+    paddw       mm2, [GOTOFF(ebx,PW_ONE)]
+    paddw       mm5, [GOTOFF(ebx,PW_ONE)]
+    paddw       mm3, [GOTOFF(ebx,PW_TWO)]
+    paddw       mm6, [GOTOFF(ebx,PW_TWO)]
+
+    paddw       mm2, mm1
+    paddw       mm5, mm4
+    psrlw       mm2, 2                  ; mm2=OutLE=( 0  2  4  6)
+    psrlw       mm5, 2                  ; mm5=OutHE=( 8 10 12 14)
+    paddw       mm3, mm1
+    paddw       mm6, mm4
+    psrlw       mm3, 2                  ; mm3=OutLO=( 1  3  5  7)
+    psrlw       mm6, 2                  ; mm6=OutHO=( 9 11 13 15)
+
+    psllw       mm3, BYTE_BIT
+    psllw       mm6, BYTE_BIT
+    por         mm2, mm3                ; mm2=OutL=( 0  1  2  3  4  5  6  7)
+    por         mm5, mm6                ; mm5=OutH=( 8  9 10 11 12 13 14 15)
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm2
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm5
+
+    sub         eax, byte SIZEOF_MMWORD
+    add         esi, byte 1*SIZEOF_MMWORD  ; inptr
+    add         edi, byte 2*SIZEOF_MMWORD  ; outptr
+    cmp         eax, byte SIZEOF_MMWORD
+    ja          near .columnloop
+    test        eax, eax
+    jnz         near .columnloop_last
+
+    pop         esi
+    pop         edi
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         ecx                        ; rowctr
+    jg          near .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_mmx(int max_v_samp_factor,
+;                               JDIMENSION downsampled_width,
+;                               JSAMPARRAY input_data,
+;                               JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM        4
+%define gotptr        wk(0) - SIZEOF_POINTER  ; void *gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_mmx)
+
+EXTN(jsimd_h2v2_fancy_upsample_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         edx, eax                ; edx = original ebp
+    mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
+    test        eax, eax
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(edx)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(edx)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(edx)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; colctr
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
+
+    test        eax, SIZEOF_MMWORD-1
+    jz          short .skip
+    push        edx
+    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+    pop         edx
+.skip:
+    ; -- process the first column block
+
+    movq        mm0, MMWORD [ebx+0*SIZEOF_MMWORD]  ; mm0=row[ 0][0]
+    movq        mm1, MMWORD [ecx+0*SIZEOF_MMWORD]  ; mm1=row[-1][0]
+    movq        mm2, MMWORD [esi+0*SIZEOF_MMWORD]  ; mm2=row[+1][0]
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    pxor        mm3, mm3                ; mm3=(all 0's)
+    movq        mm4, mm0
+    punpcklbw   mm0, mm3                ; mm0=row[ 0][0]( 0 1 2 3)
+    punpckhbw   mm4, mm3                ; mm4=row[ 0][0]( 4 5 6 7)
+    movq        mm5, mm1
+    punpcklbw   mm1, mm3                ; mm1=row[-1][0]( 0 1 2 3)
+    punpckhbw   mm5, mm3                ; mm5=row[-1][0]( 4 5 6 7)
+    movq        mm6, mm2
+    punpcklbw   mm2, mm3                ; mm2=row[+1][0]( 0 1 2 3)
+    punpckhbw   mm6, mm3                ; mm6=row[+1][0]( 4 5 6 7)
+
+    pmullw      mm0, [GOTOFF(ebx,PW_THREE)]
+    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
+
+    pcmpeqb     mm7, mm7
+    psrlq       mm7, (SIZEOF_MMWORD-2)*BYTE_BIT
+
+    paddw       mm1, mm0                ; mm1=Int0L=( 0 1 2 3)
+    paddw       mm5, mm4                ; mm5=Int0H=( 4 5 6 7)
+    paddw       mm2, mm0                ; mm2=Int1L=( 0 1 2 3)
+    paddw       mm6, mm4                ; mm6=Int1H=( 4 5 6 7)
+
+    movq        MMWORD [edx+0*SIZEOF_MMWORD], mm1  ; temporarily save
+    movq        MMWORD [edx+1*SIZEOF_MMWORD], mm5  ; the intermediate data
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm2
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm6
+
+    pand        mm1, mm7                ; mm1=( 0 - - -)
+    pand        mm2, mm7                ; mm2=( 0 - - -)
+
+    movq        MMWORD [wk(0)], mm1
+    movq        MMWORD [wk(1)], mm2
+
+    poppic      ebx
+
+    add         eax, byte SIZEOF_MMWORD-1
+    and         eax, byte -SIZEOF_MMWORD
+    cmp         eax, byte SIZEOF_MMWORD
+    ja          short .columnloop
+    alignx      16, 7
+
+.columnloop_last:
+    ; -- process the last column block
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    pcmpeqb     mm1, mm1
+    psllq       mm1, (SIZEOF_MMWORD-2)*BYTE_BIT
+    movq        mm2, mm1
+
+    pand        mm1, MMWORD [edx+1*SIZEOF_MMWORD]  ; mm1=( - - - 7)
+    pand        mm2, MMWORD [edi+1*SIZEOF_MMWORD]  ; mm2=( - - - 7)
+
+    movq        MMWORD [wk(2)], mm1
+    movq        MMWORD [wk(3)], mm2
+
+    jmp         short .upsample
+    alignx      16, 7
+
+.columnloop:
+    ; -- process the next column block
+
+    movq        mm0, MMWORD [ebx+1*SIZEOF_MMWORD]  ; mm0=row[ 0][1]
+    movq        mm1, MMWORD [ecx+1*SIZEOF_MMWORD]  ; mm1=row[-1][1]
+    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]  ; mm2=row[+1][1]
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    pxor        mm3, mm3                ; mm3=(all 0's)
+    movq        mm4, mm0
+    punpcklbw   mm0, mm3                ; mm0=row[ 0][1]( 0 1 2 3)
+    punpckhbw   mm4, mm3                ; mm4=row[ 0][1]( 4 5 6 7)
+    movq        mm5, mm1
+    punpcklbw   mm1, mm3                ; mm1=row[-1][1]( 0 1 2 3)
+    punpckhbw   mm5, mm3                ; mm5=row[-1][1]( 4 5 6 7)
+    movq        mm6, mm2
+    punpcklbw   mm2, mm3                ; mm2=row[+1][1]( 0 1 2 3)
+    punpckhbw   mm6, mm3                ; mm6=row[+1][1]( 4 5 6 7)
+
+    pmullw      mm0, [GOTOFF(ebx,PW_THREE)]
+    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
+
+    paddw       mm1, mm0                ; mm1=Int0L=( 0 1 2 3)
+    paddw       mm5, mm4                ; mm5=Int0H=( 4 5 6 7)
+    paddw       mm2, mm0                ; mm2=Int1L=( 0 1 2 3)
+    paddw       mm6, mm4                ; mm6=Int1H=( 4 5 6 7)
+
+    movq        MMWORD [edx+2*SIZEOF_MMWORD], mm1  ; temporarily save
+    movq        MMWORD [edx+3*SIZEOF_MMWORD], mm5  ; the intermediate data
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
+    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm6
+
+    psllq       mm1, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm1=( - - - 0)
+    psllq       mm2, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm2=( - - - 0)
+
+    movq        MMWORD [wk(2)], mm1
+    movq        MMWORD [wk(3)], mm2
+
+.upsample:
+    ; -- process the upper row
+
+    movq        mm7, MMWORD [edx+0*SIZEOF_MMWORD]  ; mm7=Int0L=( 0 1 2 3)
+    movq        mm3, MMWORD [edx+1*SIZEOF_MMWORD]  ; mm3=Int0H=( 4 5 6 7)
+
+    movq        mm0, mm7
+    movq        mm4, mm3
+    psrlq       mm0, 2*BYTE_BIT                  ; mm0=( 1 2 3 -)
+    psllq       mm4, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( - - - 4)
+    movq        mm5, mm7
+    movq        mm6, mm3
+    psrlq       mm5, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm5=( 3 - - -)
+    psllq       mm6, 2*BYTE_BIT                  ; mm6=( - 4 5 6)
+
+    por         mm0, mm4                         ; mm0=( 1 2 3 4)
+    por         mm5, mm6                         ; mm5=( 3 4 5 6)
+
+    movq        mm1, mm7
+    movq        mm2, mm3
+    psllq       mm1, 2*BYTE_BIT                  ; mm1=( - 0 1 2)
+    psrlq       mm2, 2*BYTE_BIT                  ; mm2=( 5 6 7 -)
+    movq        mm4, mm3
+    psrlq       mm4, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( 7 - - -)
+
+    por         mm1, MMWORD [wk(0)]              ; mm1=(-1 0 1 2)
+    por         mm2, MMWORD [wk(2)]              ; mm2=( 5 6 7 8)
+
+    movq        MMWORD [wk(0)], mm4
+
+    pmullw      mm7, [GOTOFF(ebx,PW_THREE)]
+    pmullw      mm3, [GOTOFF(ebx,PW_THREE)]
+    paddw       mm1, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       mm5, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       mm0, [GOTOFF(ebx,PW_SEVEN)]
+    paddw       mm2, [GOTOFF(ebx,PW_SEVEN)]
+
+    paddw       mm1, mm7
+    paddw       mm5, mm3
+    psrlw       mm1, 4                  ; mm1=Out0LE=( 0  2  4  6)
+    psrlw       mm5, 4                  ; mm5=Out0HE=( 8 10 12 14)
+    paddw       mm0, mm7
+    paddw       mm2, mm3
+    psrlw       mm0, 4                  ; mm0=Out0LO=( 1  3  5  7)
+    psrlw       mm2, 4                  ; mm2=Out0HO=( 9 11 13 15)
+
+    psllw       mm0, BYTE_BIT
+    psllw       mm2, BYTE_BIT
+    por         mm1, mm0                ; mm1=Out0L=( 0  1  2  3  4  5  6  7)
+    por         mm5, mm2                ; mm5=Out0H=( 8  9 10 11 12 13 14 15)
+
+    movq        MMWORD [edx+0*SIZEOF_MMWORD], mm1
+    movq        MMWORD [edx+1*SIZEOF_MMWORD], mm5
+
+    ; -- process the lower row
+
+    movq        mm6, MMWORD [edi+0*SIZEOF_MMWORD]  ; mm6=Int1L=( 0 1 2 3)
+    movq        mm4, MMWORD [edi+1*SIZEOF_MMWORD]  ; mm4=Int1H=( 4 5 6 7)
+
+    movq        mm7, mm6
+    movq        mm3, mm4
+    psrlq       mm7, 2*BYTE_BIT                  ; mm7=( 1 2 3 -)
+    psllq       mm3, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( - - - 4)
+    movq        mm0, mm6
+    movq        mm2, mm4
+    psrlq       mm0, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm0=( 3 - - -)
+    psllq       mm2, 2*BYTE_BIT                  ; mm2=( - 4 5 6)
+
+    por         mm7, mm3                         ; mm7=( 1 2 3 4)
+    por         mm0, mm2                         ; mm0=( 3 4 5 6)
+
+    movq        mm1, mm6
+    movq        mm5, mm4
+    psllq       mm1, 2*BYTE_BIT                  ; mm1=( - 0 1 2)
+    psrlq       mm5, 2*BYTE_BIT                  ; mm5=( 5 6 7 -)
+    movq        mm3, mm4
+    psrlq       mm3, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( 7 - - -)
+
+    por         mm1, MMWORD [wk(1)]              ; mm1=(-1 0 1 2)
+    por         mm5, MMWORD [wk(3)]              ; mm5=( 5 6 7 8)
+
+    movq        MMWORD [wk(1)], mm3
+
+    pmullw      mm6, [GOTOFF(ebx,PW_THREE)]
+    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
+    paddw       mm1, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       mm0, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       mm7, [GOTOFF(ebx,PW_SEVEN)]
+    paddw       mm5, [GOTOFF(ebx,PW_SEVEN)]
+
+    paddw       mm1, mm6
+    paddw       mm0, mm4
+    psrlw       mm1, 4                  ; mm1=Out1LE=( 0  2  4  6)
+    psrlw       mm0, 4                  ; mm0=Out1HE=( 8 10 12 14)
+    paddw       mm7, mm6
+    paddw       mm5, mm4
+    psrlw       mm7, 4                  ; mm7=Out1LO=( 1  3  5  7)
+    psrlw       mm5, 4                  ; mm5=Out1HO=( 9 11 13 15)
+
+    psllw       mm7, BYTE_BIT
+    psllw       mm5, BYTE_BIT
+    por         mm1, mm7                ; mm1=Out1L=( 0  1  2  3  4  5  6  7)
+    por         mm0, mm5                ; mm0=Out1H=( 8  9 10 11 12 13 14 15)
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm1
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm0
+
+    poppic      ebx
+
+    sub         eax, byte SIZEOF_MMWORD
+    add         ecx, byte 1*SIZEOF_MMWORD  ; inptr1(above)
+    add         ebx, byte 1*SIZEOF_MMWORD  ; inptr0
+    add         esi, byte 1*SIZEOF_MMWORD  ; inptr1(below)
+    add         edx, byte 2*SIZEOF_MMWORD  ; outptr0
+    add         edi, byte 2*SIZEOF_MMWORD  ; outptr1
+    cmp         eax, byte SIZEOF_MMWORD
+    ja          near .columnloop
+    test        eax, eax
+    jnz         near .columnloop_last
+
+    pop         esi
+    pop         edi
+    pop         ecx
+    pop         eax
+
+    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         ecx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
+;                         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define output_width(b)     (b) + 12    ; JDIMENSION output_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_upsample_mmx)
+
+EXTN(jsimd_h2v1_upsample_mmx):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         edx, JDIMENSION [output_width(ebp)]
+    add         edx, byte (2*SIZEOF_MMWORD)-1
+    and         edx, byte -(2*SIZEOF_MMWORD)
+    jz          short .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          short .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+    mov         eax, edx                ; colctr
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+
+    movq        mm1, mm0
+    punpcklbw   mm0, mm0
+    punpckhbw   mm1, mm1
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm1
+
+    sub         eax, byte 2*SIZEOF_MMWORD
+    jz          short .nextrow
+
+    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]
+
+    movq        mm3, mm2
+    punpcklbw   mm2, mm2
+    punpckhbw   mm3, mm3
+
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
+    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm3
+
+    sub         eax, byte 2*SIZEOF_MMWORD
+    jz          short .nextrow
+
+    add         esi, byte 2*SIZEOF_MMWORD  ; inptr
+    add         edi, byte 4*SIZEOF_MMWORD  ; outptr
+    jmp         short .columnloop
+    alignx      16, 7
+
+.nextrow:
+    pop         esi
+    pop         edi
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         ecx                        ; rowctr
+    jg          short .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
+;                         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define output_width(b)     (b) + 12    ; JDIMENSION output_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_upsample_mmx)
+
+EXTN(jsimd_h2v2_upsample_mmx):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         edx, JDIMENSION [output_width(ebp)]
+    add         edx, byte (2*SIZEOF_MMWORD)-1
+    and         edx, byte -(2*SIZEOF_MMWORD)
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          short .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]                    ; inptr
+    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
+    mov         eax, edx                               ; colctr
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+
+    movq        mm1, mm0
+    punpcklbw   mm0, mm0
+    punpckhbw   mm1, mm1
+
+    movq        MMWORD [ebx+0*SIZEOF_MMWORD], mm0
+    movq        MMWORD [ebx+1*SIZEOF_MMWORD], mm1
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm1
+
+    sub         eax, byte 2*SIZEOF_MMWORD
+    jz          short .nextrow
+
+    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]
+
+    movq        mm3, mm2
+    punpcklbw   mm2, mm2
+    punpckhbw   mm3, mm3
+
+    movq        MMWORD [ebx+2*SIZEOF_MMWORD], mm2
+    movq        MMWORD [ebx+3*SIZEOF_MMWORD], mm3
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
+    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm3
+
+    sub         eax, byte 2*SIZEOF_MMWORD
+    jz          short .nextrow
+
+    add         esi, byte 2*SIZEOF_MMWORD  ; inptr
+    add         ebx, byte 4*SIZEOF_MMWORD  ; outptr0
+    add         edi, byte 4*SIZEOF_MMWORD  ; outptr1
+    jmp         short .columnloop
+    alignx      16, 7
+
+.nextrow:
+    pop         esi
+    pop         edi
+
+    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         ecx, byte 2                  ; rowctr
+    jg          short .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-sse2.asm
new file mode 100644
index 0000000000..4e28d2f4b8
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-sse2.asm
@@ -0,0 +1,724 @@
+;
+; jdsample.asm - upsampling (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fancy_upsample_sse2)
+
+EXTN(jconst_fancy_upsample_sse2):
+
+PW_ONE   times 8 dw 1
+PW_TWO   times 8 dw 2
+PW_THREE times 8 dw 3
+PW_SEVEN times 8 dw 7
+PW_EIGHT times 8 dw 8
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v1_fancy_upsample_sse2):
+    push        ebp
+    mov         ebp, esp
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
+    test        eax, eax
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; colctr
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+
+    test        eax, SIZEOF_XMMWORD-1
+    jz          short .skip
+    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+.skip:
+    pxor        xmm0, xmm0              ; xmm0=(all 0's)
+    pcmpeqb     xmm7, xmm7
+    psrldq      xmm7, (SIZEOF_XMMWORD-1)
+    pand        xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]
+
+    add         eax, byte SIZEOF_XMMWORD-1
+    and         eax, byte -SIZEOF_XMMWORD
+    cmp         eax, byte SIZEOF_XMMWORD
+    ja          short .columnloop
+    alignx      16, 7
+
+.columnloop_last:
+    pcmpeqb     xmm6, xmm6
+    pslldq      xmm6, (SIZEOF_XMMWORD-1)
+    pand        xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    jmp         short .upsample
+    alignx      16, 7
+
+.columnloop:
+    movdqa      xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    pslldq      xmm6, (SIZEOF_XMMWORD-1)
+
+.upsample:
+    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqa      xmm2, xmm1
+    movdqa      xmm3, xmm1                ; xmm1=( 0  1  2 ... 13 14 15)
+    pslldq      xmm2, 1                   ; xmm2=(--  0  1 ... 12 13 14)
+    psrldq      xmm3, 1                   ; xmm3=( 1  2  3 ... 14 15 --)
+
+    por         xmm2, xmm7                ; xmm2=(-1  0  1 ... 12 13 14)
+    por         xmm3, xmm6                ; xmm3=( 1  2  3 ... 14 15 16)
+
+    movdqa      xmm7, xmm1
+    psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; xmm7=(15 -- -- ... -- -- --)
+
+    movdqa      xmm4, xmm1
+    punpcklbw   xmm1, xmm0                ; xmm1=( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm4, xmm0                ; xmm4=( 8  9 10 11 12 13 14 15)
+    movdqa      xmm5, xmm2
+    punpcklbw   xmm2, xmm0                ; xmm2=(-1  0  1  2  3  4  5  6)
+    punpckhbw   xmm5, xmm0                ; xmm5=( 7  8  9 10 11 12 13 14)
+    movdqa      xmm6, xmm3
+    punpcklbw   xmm3, xmm0                ; xmm3=( 1  2  3  4  5  6  7  8)
+    punpckhbw   xmm6, xmm0                ; xmm6=( 9 10 11 12 13 14 15 16)
+
+    pmullw      xmm1, [GOTOFF(ebx,PW_THREE)]
+    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]
+    paddw       xmm2, [GOTOFF(ebx,PW_ONE)]
+    paddw       xmm5, [GOTOFF(ebx,PW_ONE)]
+    paddw       xmm3, [GOTOFF(ebx,PW_TWO)]
+    paddw       xmm6, [GOTOFF(ebx,PW_TWO)]
+
+    paddw       xmm2, xmm1
+    paddw       xmm5, xmm4
+    psrlw       xmm2, 2                 ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
+    psrlw       xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+    paddw       xmm3, xmm1
+    paddw       xmm6, xmm4
+    psrlw       xmm3, 2                 ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
+    psrlw       xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+    psllw       xmm3, BYTE_BIT
+    psllw       xmm6, BYTE_BIT
+    por         xmm2, xmm3              ; xmm2=OutL=( 0  1  2 ... 13 14 15)
+    por         xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
+
+    sub         eax, byte SIZEOF_XMMWORD
+    add         esi, byte 1*SIZEOF_XMMWORD  ; inptr
+    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
+    cmp         eax, byte SIZEOF_XMMWORD
+    ja          near .columnloop
+    test        eax, eax
+    jnz         near .columnloop_last
+
+    pop         esi
+    pop         edi
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         ecx                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        4
+%define gotptr        wk(0) - SIZEOF_POINTER  ; void *gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v2_fancy_upsample_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         edx, eax                ; edx = original ebp
+    mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
+    test        eax, eax
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(edx)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(edx)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(edx)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; colctr
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
+
+    test        eax, SIZEOF_XMMWORD-1
+    jz          short .skip
+    push        edx
+    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+    pop         edx
+.skip:
+    ; -- process the first column block
+
+    movdqa      xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD]  ; xmm0=row[ 0][0]
+    movdqa      xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD]  ; xmm1=row[-1][0]
+    movdqa      xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; xmm2=row[+1][0]
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    pxor        xmm3, xmm3              ; xmm3=(all 0's)
+    movdqa      xmm4, xmm0
+    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm5, xmm1
+    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm6, xmm2
+    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+    pmullw      xmm0, [GOTOFF(ebx,PW_THREE)]
+    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]
+
+    pcmpeqb     xmm7, xmm7
+    psrldq      xmm7, (SIZEOF_XMMWORD-2)
+
+    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+    movdqa      XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1  ; temporarily save
+    movdqa      XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5  ; the intermediate data
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
+
+    pand        xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
+    pand        xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)
+
+    movdqa      XMMWORD [wk(0)], xmm1
+    movdqa      XMMWORD [wk(1)], xmm2
+
+    poppic      ebx
+
+    add         eax, byte SIZEOF_XMMWORD-1
+    and         eax, byte -SIZEOF_XMMWORD
+    cmp         eax, byte SIZEOF_XMMWORD
+    ja          short .columnloop
+    alignx      16, 7
+
+.columnloop_last:
+    ; -- process the last column block
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    pcmpeqb     xmm1, xmm1
+    pslldq      xmm1, (SIZEOF_XMMWORD-2)
+    movdqa      xmm2, xmm1
+
+    pand        xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
+    pand        xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
+
+    movdqa      XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
+    movdqa      XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)
+
+    jmp         near .upsample
+    alignx      16, 7
+
+.columnloop:
+    ; -- process the next column block
+
+    movdqa      xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD]  ; xmm0=row[ 0][1]
+    movdqa      xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD]  ; xmm1=row[-1][1]
+    movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]  ; xmm2=row[+1][1]
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    pxor        xmm3, xmm3              ; xmm3=(all 0's)
+    movdqa      xmm4, xmm0
+    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm5, xmm1
+    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm6, xmm2
+    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+    pmullw      xmm0, [GOTOFF(ebx,PW_THREE)]
+    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]
+
+    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+    movdqa      XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1  ; temporarily save
+    movdqa      XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5  ; the intermediate data
+    movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
+
+    pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; xmm1=(-- -- -- -- -- -- --  0)
+    pslldq      xmm2, (SIZEOF_XMMWORD-2)  ; xmm2=(-- -- -- -- -- -- --  0)
+
+    movdqa      XMMWORD [wk(2)], xmm1
+    movdqa      XMMWORD [wk(3)], xmm2
+
+.upsample:
+    ; -- process the upper row
+
+    movdqa      xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
+    movdqa      xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm0, xmm7                ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
+    movdqa      xmm4, xmm3                ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
+    psrldq      xmm0, 2                   ; xmm0=( 1  2  3  4  5  6  7 --)
+    pslldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(-- -- -- -- -- -- --  8)
+    movdqa      xmm5, xmm7
+    movdqa      xmm6, xmm3
+    psrldq      xmm5, (SIZEOF_XMMWORD-2)  ; xmm5=( 7 -- -- -- -- -- -- --)
+    pslldq      xmm6, 2                   ; xmm6=(--  8  9 10 11 12 13 14)
+
+    por         xmm0, xmm4                ; xmm0=( 1  2  3  4  5  6  7  8)
+    por         xmm5, xmm6                ; xmm5=( 7  8  9 10 11 12 13 14)
+
+    movdqa      xmm1, xmm7
+    movdqa      xmm2, xmm3
+    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
+    psrldq      xmm2, 2                   ; xmm2=( 9 10 11 12 13 14 15 --)
+    movdqa      xmm4, xmm3
+    psrldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(15 -- -- -- -- -- -- --)
+
+    por         xmm1, XMMWORD [wk(0)]     ; xmm1=(-1  0  1  2  3  4  5  6)
+    por         xmm2, XMMWORD [wk(2)]     ; xmm2=( 9 10 11 12 13 14 15 16)
+
+    movdqa      XMMWORD [wk(0)], xmm4
+
+    pmullw      xmm7, [GOTOFF(ebx,PW_THREE)]
+    pmullw      xmm3, [GOTOFF(ebx,PW_THREE)]
+    paddw       xmm1, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       xmm5, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       xmm0, [GOTOFF(ebx,PW_SEVEN)]
+    paddw       xmm2, [GOTOFF(ebx,PW_SEVEN)]
+
+    paddw       xmm1, xmm7
+    paddw       xmm5, xmm3
+    psrlw       xmm1, 4                 ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
+    psrlw       xmm5, 4                 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+    paddw       xmm0, xmm7
+    paddw       xmm2, xmm3
+    psrlw       xmm0, 4                 ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
+    psrlw       xmm2, 4                 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+    psllw       xmm0, BYTE_BIT
+    psllw       xmm2, BYTE_BIT
+    por         xmm1, xmm0              ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
+    por         xmm5, xmm2              ; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+    movdqa      XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
+    movdqa      XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
+
+    ; -- process the lower row
+
+    movdqa      xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
+    movdqa      xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm7, xmm6                ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
+    movdqa      xmm3, xmm4                ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
+    psrldq      xmm7, 2                   ; xmm7=( 1  2  3  4  5  6  7 --)
+    pslldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(-- -- -- -- -- -- --  8)
+    movdqa      xmm0, xmm6
+    movdqa      xmm2, xmm4
+    psrldq      xmm0, (SIZEOF_XMMWORD-2)  ; xmm0=( 7 -- -- -- -- -- -- --)
+    pslldq      xmm2, 2                   ; xmm2=(--  8  9 10 11 12 13 14)
+
+    por         xmm7, xmm3                ; xmm7=( 1  2  3  4  5  6  7  8)
+    por         xmm0, xmm2                ; xmm0=( 7  8  9 10 11 12 13 14)
+
+    movdqa      xmm1, xmm6
+    movdqa      xmm5, xmm4
+    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
+    psrldq      xmm5, 2                   ; xmm5=( 9 10 11 12 13 14 15 --)
+    movdqa      xmm3, xmm4
+    psrldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(15 -- -- -- -- -- -- --)
+
+    por         xmm1, XMMWORD [wk(1)]     ; xmm1=(-1  0  1  2  3  4  5  6)
+    por         xmm5, XMMWORD [wk(3)]     ; xmm5=( 9 10 11 12 13 14 15 16)
+
+    movdqa      XMMWORD [wk(1)], xmm3
+
+    pmullw      xmm6, [GOTOFF(ebx,PW_THREE)]
+    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]
+    paddw       xmm1, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       xmm0, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       xmm7, [GOTOFF(ebx,PW_SEVEN)]
+    paddw       xmm5, [GOTOFF(ebx,PW_SEVEN)]
+
+    paddw       xmm1, xmm6
+    paddw       xmm0, xmm4
+    psrlw       xmm1, 4                 ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
+    psrlw       xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+    paddw       xmm7, xmm6
+    paddw       xmm5, xmm4
+    psrlw       xmm7, 4                 ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
+    psrlw       xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+    psllw       xmm7, BYTE_BIT
+    psllw       xmm5, BYTE_BIT
+    por         xmm1, xmm7              ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
+    por         xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
+    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
+
+    poppic      ebx
+
+    sub         eax, byte SIZEOF_XMMWORD
+    add         ecx, byte 1*SIZEOF_XMMWORD  ; inptr1(above)
+    add         ebx, byte 1*SIZEOF_XMMWORD  ; inptr0
+    add         esi, byte 1*SIZEOF_XMMWORD  ; inptr1(below)
+    add         edx, byte 2*SIZEOF_XMMWORD  ; outptr0
+    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr1
+    cmp         eax, byte SIZEOF_XMMWORD
+    ja          near .columnloop
+    test        eax, eax
+    jnz         near .columnloop_last
+
+    pop         esi
+    pop         edi
+    pop         ecx
+    pop         eax
+
+    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         ecx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define output_width(b)     (b) + 12    ; JDIMENSION output_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)
+
+EXTN(jsimd_h2v1_upsample_sse2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         edx, JDIMENSION [output_width(ebp)]
+    add         edx, byte (2*SIZEOF_XMMWORD)-1
+    and         edx, byte -(2*SIZEOF_XMMWORD)
+    jz          short .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          short .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+    mov         eax, edx                ; colctr
+    alignx      16, 7
+.columnloop:
+
+    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+
+    movdqa      xmm1, xmm0
+    punpcklbw   xmm0, xmm0
+    punpckhbw   xmm1, xmm1
+
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+    sub         eax, byte 2*SIZEOF_XMMWORD
+    jz          short .nextrow
+
+    movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, xmm2
+    punpckhbw   xmm3, xmm3
+
+    movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+    sub         eax, byte 2*SIZEOF_XMMWORD
+    jz          short .nextrow
+
+    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr
+    add         edi, byte 4*SIZEOF_XMMWORD  ; outptr
+    jmp         short .columnloop
+    alignx      16, 7
+
+.nextrow:
+    pop         esi
+    pop         edi
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         ecx                        ; rowctr
+    jg          short .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define output_width(b)     (b) + 12    ; JDIMENSION output_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)
+
+EXTN(jsimd_h2v2_upsample_sse2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         edx, JDIMENSION [output_width(ebp)]
+    add         edx, byte (2*SIZEOF_XMMWORD)-1
+    and         edx, byte -(2*SIZEOF_XMMWORD)
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]                    ; inptr
+    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
+    mov         eax, edx                               ; colctr
+    alignx      16, 7
+.columnloop:
+
+    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+
+    movdqa      xmm1, xmm0
+    punpcklbw   xmm0, xmm0
+    punpckhbw   xmm1, xmm1
+
+    movdqa      XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
+    movdqa      XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+    sub         eax, byte 2*SIZEOF_XMMWORD
+    jz          short .nextrow
+
+    movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, xmm2
+    punpckhbw   xmm3, xmm3
+
+    movdqa      XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
+    movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+    sub         eax, byte 2*SIZEOF_XMMWORD
+    jz          short .nextrow
+
+    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr
+    add         ebx, byte 4*SIZEOF_XMMWORD  ; outptr0
+    add         edi, byte 4*SIZEOF_XMMWORD  ; outptr1
+    jmp         short .columnloop
+    alignx      16, 7
+
+.nextrow:
+    pop         esi
+    pop         edi
+
+    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         ecx, byte 2                  ; rowctr
+    jg          short .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctflt-3dn.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctflt-3dn.asm
new file mode 100644
index 0000000000..322ab16325
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctflt-3dn.asm
@@ -0,0 +1,318 @@
+;
+; jfdctflt.asm - floating-point FDCT (3DNow!)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_float_3dnow)
+
+EXTN(jconst_fdct_float_3dnow):
+
+PD_0_382 times 2 dd 0.382683432365089771728460
+PD_0_707 times 2 dd 0.707106781186547524400844
+PD_0_541 times 2 dd 0.541196100146196984399723
+PD_1_306 times 2 dd 1.306562964876376527856643
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_3dnow(FAST_FLOAT *data)
+;
+
+%define data(b)       (b) + 8           ; FAST_FLOAT *data
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM        2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_float_3dnow)
+
+EXTN(jsimd_fdct_float_3dnow):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(eax)]  ; (FAST_FLOAT *)
+    mov         ecx, DCTSIZE/2
+    alignx      16, 7
+.rowloop:
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+    ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
+
+    movq        mm4, mm0                ; transpose coefficients
+    punpckldq   mm0, mm1                ; mm0=(00 10)=data0
+    punpckhdq   mm4, mm1                ; mm4=(01 11)=data1
+    movq        mm5, mm2                ; transpose coefficients
+    punpckldq   mm2, mm3                ; mm2=(06 16)=data6
+    punpckhdq   mm5, mm3                ; mm5=(07 17)=data7
+
+    movq        mm6, mm4
+    movq        mm7, mm0
+    pfsub       mm4, mm2                ; mm4=data1-data6=tmp6
+    pfsub       mm0, mm5                ; mm0=data0-data7=tmp7
+    pfadd       mm6, mm2                ; mm6=data1+data6=tmp1
+    pfadd       mm7, mm5                ; mm7=data0+data7=tmp0
+
+    movq        mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+
+    ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=tmp6
+    movq        MMWORD [wk(1)], mm0     ; wk(1)=tmp7
+
+    movq        mm4, mm1                ; transpose coefficients
+    punpckldq   mm1, mm3                ; mm1=(02 12)=data2
+    punpckhdq   mm4, mm3                ; mm4=(03 13)=data3
+    movq        mm0, mm2                ; transpose coefficients
+    punpckldq   mm2, mm5                ; mm2=(04 14)=data4
+    punpckhdq   mm0, mm5                ; mm0=(05 15)=data5
+
+    movq        mm3, mm4
+    movq        mm5, mm1
+    pfadd       mm4, mm2                ; mm4=data3+data4=tmp3
+    pfadd       mm1, mm0                ; mm1=data2+data5=tmp2
+    pfsub       mm3, mm2                ; mm3=data3-data4=tmp4
+    pfsub       mm5, mm0                ; mm5=data2-data5=tmp5
+
+    ; -- Even part
+
+    movq        mm2, mm7
+    movq        mm0, mm6
+    pfsub       mm7, mm4                ; mm7=tmp13
+    pfsub       mm6, mm1                ; mm6=tmp12
+    pfadd       mm2, mm4                ; mm2=tmp10
+    pfadd       mm0, mm1                ; mm0=tmp11
+
+    pfadd       mm6, mm7
+    pfmul       mm6, [GOTOFF(ebx,PD_0_707)]  ; mm6=z1
+
+    movq        mm4, mm2
+    movq        mm1, mm7
+    pfsub       mm2, mm0                ; mm2=data4
+    pfsub       mm7, mm6                ; mm7=data6
+    pfadd       mm4, mm0                ; mm4=data0
+    pfadd       mm1, mm6                ; mm1=data2
+
+    movq        MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
+    movq        MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
+    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
+    movq        MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1
+
+    ; -- Odd part
+
+    movq        mm0, MMWORD [wk(0)]     ; mm0=tmp6
+    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp7
+
+    pfadd       mm3, mm5                ; mm3=tmp10
+    pfadd       mm5, mm0                ; mm5=tmp11
+    pfadd       mm0, mm6                ; mm0=tmp12, mm6=tmp7
+
+    pfmul       mm5, [GOTOFF(ebx,PD_0_707)]  ; mm5=z3
+
+    movq        mm2, mm3                     ; mm2=tmp10
+    pfsub       mm3, mm0
+    pfmul       mm3, [GOTOFF(ebx,PD_0_382)]  ; mm3=z5
+    pfmul       mm2, [GOTOFF(ebx,PD_0_541)]  ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+    pfmul       mm0, [GOTOFF(ebx,PD_1_306)]  ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+    pfadd       mm2, mm3                     ; mm2=z2
+    pfadd       mm0, mm3                     ; mm0=z4
+
+    movq        mm7, mm6
+    pfsub       mm6, mm5                ; mm6=z13
+    pfadd       mm7, mm5                ; mm7=z11
+
+    movq        mm4, mm6
+    movq        mm1, mm7
+    pfsub       mm6, mm2                ; mm6=data3
+    pfsub       mm7, mm0                ; mm7=data7
+    pfadd       mm4, mm2                ; mm4=data5
+    pfadd       mm1, mm0                ; mm1=data1
+
+    movq        MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
+    movq        MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
+    movq        MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
+    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+    add         edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+    dec         ecx
+    jnz         near .rowloop
+
+    ; ---- Pass 2: process columns.
+
+    mov         edx, POINTER [data(eax)]  ; (FAST_FLOAT *)
+    mov         ecx, DCTSIZE/2
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+    ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
+
+    movq        mm4, mm0                ; transpose coefficients
+    punpckldq   mm0, mm1                ; mm0=(00 01)=data0
+    punpckhdq   mm4, mm1                ; mm4=(10 11)=data1
+    movq        mm5, mm2                ; transpose coefficients
+    punpckldq   mm2, mm3                ; mm2=(60 61)=data6
+    punpckhdq   mm5, mm3                ; mm5=(70 71)=data7
+
+    movq        mm6, mm4
+    movq        mm7, mm0
+    pfsub       mm4, mm2                ; mm4=data1-data6=tmp6
+    pfsub       mm0, mm5                ; mm0=data0-data7=tmp7
+    pfadd       mm6, mm2                ; mm6=data1+data6=tmp1
+    pfadd       mm7, mm5                ; mm7=data0+data7=tmp0
+
+    movq        mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+    ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=tmp6
+    movq        MMWORD [wk(1)], mm0     ; wk(1)=tmp7
+
+    movq        mm4, mm1                ; transpose coefficients
+    punpckldq   mm1, mm3                ; mm1=(20 21)=data2
+    punpckhdq   mm4, mm3                ; mm4=(30 31)=data3
+    movq        mm0, mm2                ; transpose coefficients
+    punpckldq   mm2, mm5                ; mm2=(40 41)=data4
+    punpckhdq   mm0, mm5                ; mm0=(50 51)=data5
+
+    movq        mm3, mm4
+    movq        mm5, mm1
+    pfadd       mm4, mm2                ; mm4=data3+data4=tmp3
+    pfadd       mm1, mm0                ; mm1=data2+data5=tmp2
+    pfsub       mm3, mm2                ; mm3=data3-data4=tmp4
+    pfsub       mm5, mm0                ; mm5=data2-data5=tmp5
+
+    ; -- Even part
+
+    movq        mm2, mm7
+    movq        mm0, mm6
+    pfsub       mm7, mm4                ; mm7=tmp13
+    pfsub       mm6, mm1                ; mm6=tmp12
+    pfadd       mm2, mm4                ; mm2=tmp10
+    pfadd       mm0, mm1                ; mm0=tmp11
+
+    pfadd       mm6, mm7
+    pfmul       mm6, [GOTOFF(ebx,PD_0_707)]  ; mm6=z1
+
+    movq        mm4, mm2
+    movq        mm1, mm7
+    pfsub       mm2, mm0                ; mm2=data4
+    pfsub       mm7, mm6                ; mm7=data6
+    pfadd       mm4, mm0                ; mm4=data0
+    pfadd       mm1, mm6                ; mm1=data2
+
+    movq        MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
+    movq        MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
+    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
+    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+    ; -- Odd part
+
+    movq        mm0, MMWORD [wk(0)]     ; mm0=tmp6
+    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp7
+
+    pfadd       mm3, mm5                ; mm3=tmp10
+    pfadd       mm5, mm0                ; mm5=tmp11
+    pfadd       mm0, mm6                ; mm0=tmp12, mm6=tmp7
+
+    pfmul       mm5, [GOTOFF(ebx,PD_0_707)]  ; mm5=z3
+
+    movq        mm2, mm3                     ; mm2=tmp10
+    pfsub       mm3, mm0
+    pfmul       mm3, [GOTOFF(ebx,PD_0_382)]  ; mm3=z5
+    pfmul       mm2, [GOTOFF(ebx,PD_0_541)]  ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+    pfmul       mm0, [GOTOFF(ebx,PD_1_306)]  ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+    pfadd       mm2, mm3                     ; mm2=z2
+    pfadd       mm0, mm3                     ; mm0=z4
+
+    movq        mm7, mm6
+    pfsub       mm6, mm5                ; mm6=z13
+    pfadd       mm7, mm5                ; mm7=z11
+
+    movq        mm4, mm6
+    movq        mm1, mm7
+    pfsub       mm6, mm2                ; mm6=data3
+    pfsub       mm7, mm0                ; mm7=data7
+    pfadd       mm4, mm2                ; mm4=data5
+    pfadd       mm1, mm0                ; mm1=data1
+
+    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
+    movq        MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
+    movq        MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
+    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+    add         edx, byte 2*SIZEOF_FAST_FLOAT
+    dec         ecx
+    jnz         near .columnloop
+
+    femms                               ; empty MMX/3DNow! state
+
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctflt-sse.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctflt-sse.asm
new file mode 100644
index 0000000000..86952c6499
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctflt-sse.asm
@@ -0,0 +1,369 @@
+;
+; jfdctflt.asm - floating-point FDCT (SSE)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro  unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+    shufps      %1, %2, 0x44
+%endmacro
+
+%macro  unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+    shufps      %1, %2, 0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_float_sse)
+
+EXTN(jconst_fdct_float_sse):
+
+PD_0_382 times 4 dd 0.382683432365089771728460
+PD_0_707 times 4 dd 0.707106781186547524400844
+PD_0_541 times 4 dd 0.541196100146196984399723
+PD_1_306 times 4 dd 1.306562964876376527856643
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_sse(FAST_FLOAT *data)
+;
+
+%define data(b)       (b) + 8           ; FAST_FLOAT *data
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_float_sse)
+
+EXTN(jsimd_fdct_float_sse):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(eax)]  ; (FAST_FLOAT *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.rowloop:
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+    ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+    movaps      xmm4, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm1              ; xmm0=(20 30 21 31)
+    unpckhps    xmm4, xmm1              ; xmm4=(22 32 23 33)
+    movaps      xmm5, xmm2              ; transpose coefficients(phase 1)
+    unpcklps    xmm2, xmm3              ; xmm2=(24 34 25 35)
+    unpckhps    xmm5, xmm3              ; xmm5=(26 36 27 37)
+
+    movaps      xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+    ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+    movaps      XMMWORD [wk(0)], xmm4   ; wk(0)=(22 32 23 33)
+    movaps      XMMWORD [wk(1)], xmm2   ; wk(1)=(24 34 25 35)
+
+    movaps      xmm4, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
+    unpckhps    xmm4, xmm7              ; xmm4=(02 12 03 13)
+    movaps      xmm2, xmm1              ; transpose coefficients(phase 1)
+    unpcklps    xmm1, xmm3              ; xmm1=(04 14 05 15)
+    unpckhps    xmm2, xmm3              ; xmm2=(06 16 07 17)
+
+    movaps      xmm7, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm0              ; xmm6=(00 10 20 30)=data0
+    unpckhps2   xmm7, xmm0              ; xmm7=(01 11 21 31)=data1
+    movaps      xmm3, xmm2              ; transpose coefficients(phase 2)
+    unpcklps2   xmm2, xmm5              ; xmm2=(06 16 26 36)=data6
+    unpckhps2   xmm3, xmm5              ; xmm3=(07 17 27 37)=data7
+
+    movaps      xmm0, xmm7
+    movaps      xmm5, xmm6
+    subps       xmm7, xmm2              ; xmm7=data1-data6=tmp6
+    subps       xmm6, xmm3              ; xmm6=data0-data7=tmp7
+    addps       xmm0, xmm2              ; xmm0=data1+data6=tmp1
+    addps       xmm5, xmm3              ; xmm5=data0+data7=tmp0
+
+    movaps      xmm2, XMMWORD [wk(0)]   ; xmm2=(22 32 23 33)
+    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=(24 34 25 35)
+    movaps      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
+    movaps      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+    movaps      xmm7, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(02 12 22 32)=data2
+    unpckhps2   xmm7, xmm2              ; xmm7=(03 13 23 33)=data3
+    movaps      xmm6, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm3              ; xmm1=(04 14 24 34)=data4
+    unpckhps2   xmm6, xmm3              ; xmm6=(05 15 25 35)=data5
+
+    movaps      xmm2, xmm7
+    movaps      xmm3, xmm4
+    addps       xmm7, xmm1              ; xmm7=data3+data4=tmp3
+    addps       xmm4, xmm6              ; xmm4=data2+data5=tmp2
+    subps       xmm2, xmm1              ; xmm2=data3-data4=tmp4
+    subps       xmm3, xmm6              ; xmm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movaps      xmm1, xmm5
+    movaps      xmm6, xmm0
+    subps       xmm5, xmm7              ; xmm5=tmp13
+    subps       xmm0, xmm4              ; xmm0=tmp12
+    addps       xmm1, xmm7              ; xmm1=tmp10
+    addps       xmm6, xmm4              ; xmm6=tmp11
+
+    addps       xmm0, xmm5
+    mulps       xmm0, [GOTOFF(ebx,PD_0_707)]  ; xmm0=z1
+
+    movaps      xmm7, xmm1
+    movaps      xmm4, xmm5
+    subps       xmm1, xmm6              ; xmm1=data4
+    subps       xmm5, xmm0              ; xmm5=data6
+    addps       xmm7, xmm6              ; xmm7=data0
+    addps       xmm4, xmm0              ; xmm4=data2
+
+    movaps      XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+    ; -- Odd part
+
+    movaps      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
+    movaps      xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
+
+    addps       xmm2, xmm3              ; xmm2=tmp10
+    addps       xmm3, xmm6              ; xmm3=tmp11
+    addps       xmm6, xmm0              ; xmm6=tmp12, xmm0=tmp7
+
+    mulps       xmm3, [GOTOFF(ebx,PD_0_707)]  ; xmm3=z3
+
+    movaps      xmm1, xmm2                    ; xmm1=tmp10
+    subps       xmm2, xmm6
+    mulps       xmm2, [GOTOFF(ebx,PD_0_382)]  ; xmm2=z5
+    mulps       xmm1, [GOTOFF(ebx,PD_0_541)]  ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+    mulps       xmm6, [GOTOFF(ebx,PD_1_306)]  ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+    addps       xmm1, xmm2                    ; xmm1=z2
+    addps       xmm6, xmm2                    ; xmm6=z4
+
+    movaps      xmm5, xmm0
+    subps       xmm0, xmm3              ; xmm0=z13
+    addps       xmm5, xmm3              ; xmm5=z11
+
+    movaps      xmm7, xmm0
+    movaps      xmm4, xmm5
+    subps       xmm0, xmm1              ; xmm0=data3
+    subps       xmm5, xmm6              ; xmm5=data7
+    addps       xmm7, xmm1              ; xmm7=data5
+    addps       xmm4, xmm6              ; xmm4=data1
+
+    movaps      XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+    add         edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
+    dec         ecx
+    jnz         near .rowloop
+
+    ; ---- Pass 2: process columns.
+
+    mov         edx, POINTER [data(eax)]  ; (FAST_FLOAT *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.columnloop:
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+    ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+    movaps      xmm4, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm1              ; xmm0=(02 03 12 13)
+    unpckhps    xmm4, xmm1              ; xmm4=(22 23 32 33)
+    movaps      xmm5, xmm2              ; transpose coefficients(phase 1)
+    unpcklps    xmm2, xmm3              ; xmm2=(42 43 52 53)
+    unpckhps    xmm5, xmm3              ; xmm5=(62 63 72 73)
+
+    movaps      xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+    ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+    movaps      XMMWORD [wk(0)], xmm4   ; wk(0)=(22 23 32 33)
+    movaps      XMMWORD [wk(1)], xmm2   ; wk(1)=(42 43 52 53)
+
+    movaps      xmm4, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 01 10 11)
+    unpckhps    xmm4, xmm7              ; xmm4=(20 21 30 31)
+    movaps      xmm2, xmm1              ; transpose coefficients(phase 1)
+    unpcklps    xmm1, xmm3              ; xmm1=(40 41 50 51)
+    unpckhps    xmm2, xmm3              ; xmm2=(60 61 70 71)
+
+    movaps      xmm7, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm0              ; xmm6=(00 01 02 03)=data0
+    unpckhps2   xmm7, xmm0              ; xmm7=(10 11 12 13)=data1
+    movaps      xmm3, xmm2              ; transpose coefficients(phase 2)
+    unpcklps2   xmm2, xmm5              ; xmm2=(60 61 62 63)=data6
+    unpckhps2   xmm3, xmm5              ; xmm3=(70 71 72 73)=data7
+
+    movaps      xmm0, xmm7
+    movaps      xmm5, xmm6
+    subps       xmm7, xmm2              ; xmm7=data1-data6=tmp6
+    subps       xmm6, xmm3              ; xmm6=data0-data7=tmp7
+    addps       xmm0, xmm2              ; xmm0=data1+data6=tmp1
+    addps       xmm5, xmm3              ; xmm5=data0+data7=tmp0
+
+    movaps      xmm2, XMMWORD [wk(0)]   ; xmm2=(22 23 32 33)
+    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53)
+    movaps      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
+    movaps      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+    movaps      xmm7, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(20 21 22 23)=data2
+    unpckhps2   xmm7, xmm2              ; xmm7=(30 31 32 33)=data3
+    movaps      xmm6, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm3              ; xmm1=(40 41 42 43)=data4
+    unpckhps2   xmm6, xmm3              ; xmm6=(50 51 52 53)=data5
+
+    movaps      xmm2, xmm7
+    movaps      xmm3, xmm4
+    addps       xmm7, xmm1              ; xmm7=data3+data4=tmp3
+    addps       xmm4, xmm6              ; xmm4=data2+data5=tmp2
+    subps       xmm2, xmm1              ; xmm2=data3-data4=tmp4
+    subps       xmm3, xmm6              ; xmm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movaps      xmm1, xmm5
+    movaps      xmm6, xmm0
+    subps       xmm5, xmm7              ; xmm5=tmp13
+    subps       xmm0, xmm4              ; xmm0=tmp12
+    addps       xmm1, xmm7              ; xmm1=tmp10
+    addps       xmm6, xmm4              ; xmm6=tmp11
+
+    addps       xmm0, xmm5
+    mulps       xmm0, [GOTOFF(ebx,PD_0_707)]  ; xmm0=z1
+
+    movaps      xmm7, xmm1
+    movaps      xmm4, xmm5
+    subps       xmm1, xmm6              ; xmm1=data4
+    subps       xmm5, xmm0              ; xmm5=data6
+    addps       xmm7, xmm6              ; xmm7=data0
+    addps       xmm4, xmm0              ; xmm4=data2
+
+    movaps      XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+    ; -- Odd part
+
+    movaps      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
+    movaps      xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
+
+    addps       xmm2, xmm3              ; xmm2=tmp10
+    addps       xmm3, xmm6              ; xmm3=tmp11
+    addps       xmm6, xmm0              ; xmm6=tmp12, xmm0=tmp7
+
+    mulps       xmm3, [GOTOFF(ebx,PD_0_707)]  ; xmm3=z3
+
+    movaps      xmm1, xmm2                    ; xmm1=tmp10
+    subps       xmm2, xmm6
+    mulps       xmm2, [GOTOFF(ebx,PD_0_382)]  ; xmm2=z5
+    mulps       xmm1, [GOTOFF(ebx,PD_0_541)]  ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+    mulps       xmm6, [GOTOFF(ebx,PD_1_306)]  ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+    addps       xmm1, xmm2                    ; xmm1=z2
+    addps       xmm6, xmm2                    ; xmm6=z4
+
+    movaps      xmm5, xmm0
+    subps       xmm0, xmm3              ; xmm0=z13
+    addps       xmm5, xmm3              ; xmm5=z11
+
+    movaps      xmm7, xmm0
+    movaps      xmm4, xmm5
+    subps       xmm0, xmm1              ; xmm0=data3
+    subps       xmm5, xmm6              ; xmm5=data7
+    addps       xmm7, xmm1              ; xmm7=data5
+    addps       xmm4, xmm6              ; xmm4=data1
+
+    movaps      XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+    add         edx, byte 4*SIZEOF_FAST_FLOAT
+    dec         ecx
+    jnz         near .columnloop
+
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctfst-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctfst-mmx.asm
new file mode 100644
index 0000000000..80645a50d7
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctfst-mmx.asm
@@ -0,0 +1,395 @@
+;
+; jfdctfst.asm - fast integer FDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  8  ; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382 equ  98  ; FIX(0.382683433)
+F_0_541 equ 139  ; FIX(0.541196100)
+F_0_707 equ 181  ; FIX(0.707106781)
+F_1_306 equ 334  ; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS)  ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS)  ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS)  ; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS  2
+%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_ifast_mmx)
+
+EXTN(jconst_fdct_ifast_mmx):
+
+PW_F0707 times 4 dw F_0_707 << CONST_SHIFT
+PW_F0382 times 4 dw F_0_382 << CONST_SHIFT
+PW_F0541 times 4 dw F_0_541 << CONST_SHIFT
+PW_F1306 times 4 dw F_1_306 << CONST_SHIFT
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_mmx(DCTELEM *data)
+;
+
+%define data(b)       (b) + 8           ; DCTELEM *data
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM        2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_ifast_mmx)
+
+EXTN(jsimd_fdct_ifast_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.rowloop:
+
+    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+    movq        mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+    ; mm0=(20 21 22 23), mm2=(24 25 26 27)
+    ; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+    movq        mm4, mm0                ; transpose coefficients(phase 1)
+    punpcklwd   mm0, mm1                ; mm0=(20 30 21 31)
+    punpckhwd   mm4, mm1                ; mm4=(22 32 23 33)
+    movq        mm5, mm2                ; transpose coefficients(phase 1)
+    punpcklwd   mm2, mm3                ; mm2=(24 34 25 35)
+    punpckhwd   mm5, mm3                ; mm5=(26 36 27 37)
+
+    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+    ; mm6=(00 01 02 03), mm1=(04 05 06 07)
+    ; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 32 23 33)
+    movq        MMWORD [wk(1)], mm2     ; wk(1)=(24 34 25 35)
+
+    movq        mm4, mm6                ; transpose coefficients(phase 1)
+    punpcklwd   mm6, mm7                ; mm6=(00 10 01 11)
+    punpckhwd   mm4, mm7                ; mm4=(02 12 03 13)
+    movq        mm2, mm1                ; transpose coefficients(phase 1)
+    punpcklwd   mm1, mm3                ; mm1=(04 14 05 15)
+    punpckhwd   mm2, mm3                ; mm2=(06 16 07 17)
+
+    movq        mm7, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm0                ; mm6=(00 10 20 30)=data0
+    punpckhdq   mm7, mm0                ; mm7=(01 11 21 31)=data1
+    movq        mm3, mm2                ; transpose coefficients(phase 2)
+    punpckldq   mm2, mm5                ; mm2=(06 16 26 36)=data6
+    punpckhdq   mm3, mm5                ; mm3=(07 17 27 37)=data7
+
+    movq        mm0, mm7
+    movq        mm5, mm6
+    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
+    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
+    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
+    paddw       mm5, mm3                ; mm5=data0+data7=tmp0
+
+    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 32 23 33)
+    movq        mm3, MMWORD [wk(1)]     ; mm3=(24 34 25 35)
+    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
+    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7
+
+    movq        mm7, mm4                ; transpose coefficients(phase 2)
+    punpckldq   mm4, mm2                ; mm4=(02 12 22 32)=data2
+    punpckhdq   mm7, mm2                ; mm7=(03 13 23 33)=data3
+    movq        mm6, mm1                ; transpose coefficients(phase 2)
+    punpckldq   mm1, mm3                ; mm1=(04 14 24 34)=data4
+    punpckhdq   mm6, mm3                ; mm6=(05 15 25 35)=data5
+
+    movq        mm2, mm7
+    movq        mm3, mm4
+    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
+    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
+    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
+    psubw       mm3, mm6                ; mm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movq        mm1, mm5
+    movq        mm6, mm0
+    psubw       mm5, mm7                ; mm5=tmp13
+    psubw       mm0, mm4                ; mm0=tmp12
+    paddw       mm1, mm7                ; mm1=tmp10
+    paddw       mm6, mm4                ; mm6=tmp11
+
+    paddw       mm0, mm5
+    psllw       mm0, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm0, [GOTOFF(ebx,PW_F0707)]  ; mm0=z1
+
+    movq        mm7, mm1
+    movq        mm4, mm5
+    psubw       mm1, mm6                ; mm1=data4
+    psubw       mm5, mm0                ; mm5=data6
+    paddw       mm7, mm6                ; mm7=data0
+    paddw       mm4, mm0                ; mm4=data2
+
+    movq        MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1
+    movq        MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5
+    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
+    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+
+    ; -- Odd part
+
+    movq        mm6, MMWORD [wk(0)]     ; mm6=tmp6
+    movq        mm0, MMWORD [wk(1)]     ; mm0=tmp7
+
+    paddw       mm2, mm3                ; mm2=tmp10
+    paddw       mm3, mm6                ; mm3=tmp11
+    paddw       mm6, mm0                ; mm6=tmp12, mm0=tmp7
+
+    psllw       mm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       mm6, PRE_MULTIPLY_SCALE_BITS
+
+    psllw       mm3, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm3, [GOTOFF(ebx,PW_F0707)]  ; mm3=z3
+
+    movq        mm1, mm2                     ; mm1=tmp10
+    psubw       mm2, mm6
+    pmulhw      mm2, [GOTOFF(ebx,PW_F0382)]  ; mm2=z5
+    pmulhw      mm1, [GOTOFF(ebx,PW_F0541)]  ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+    pmulhw      mm6, [GOTOFF(ebx,PW_F1306)]  ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+    paddw       mm1, mm2                     ; mm1=z2
+    paddw       mm6, mm2                     ; mm6=z4
+
+    movq        mm5, mm0
+    psubw       mm0, mm3                ; mm0=z13
+    paddw       mm5, mm3                ; mm5=z11
+
+    movq        mm7, mm0
+    movq        mm4, mm5
+    psubw       mm0, mm1                ; mm0=data3
+    psubw       mm5, mm6                ; mm5=data7
+    paddw       mm7, mm1                ; mm7=data5
+    paddw       mm4, mm6                ; mm4=data1
+
+    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
+    movq        MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5
+    movq        MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7
+    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
+
+    add         edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
+    dec         ecx
+    jnz         near .rowloop
+
+    ; ---- Pass 2: process columns.
+
+    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+    movq        mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+    ; mm0=(02 12 22 32), mm2=(42 52 62 72)
+    ; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+    movq        mm4, mm0                ; transpose coefficients(phase 1)
+    punpcklwd   mm0, mm1                ; mm0=(02 03 12 13)
+    punpckhwd   mm4, mm1                ; mm4=(22 23 32 33)
+    movq        mm5, mm2                ; transpose coefficients(phase 1)
+    punpcklwd   mm2, mm3                ; mm2=(42 43 52 53)
+    punpckhwd   mm5, mm3                ; mm5=(62 63 72 73)
+
+    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+    ; mm6=(00 10 20 30), mm1=(40 50 60 70)
+    ; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 23 32 33)
+    movq        MMWORD [wk(1)], mm2     ; wk(1)=(42 43 52 53)
+
+    movq        mm4, mm6                ; transpose coefficients(phase 1)
+    punpcklwd   mm6, mm7                ; mm6=(00 01 10 11)
+    punpckhwd   mm4, mm7                ; mm4=(20 21 30 31)
+    movq        mm2, mm1                ; transpose coefficients(phase 1)
+    punpcklwd   mm1, mm3                ; mm1=(40 41 50 51)
+    punpckhwd   mm2, mm3                ; mm2=(60 61 70 71)
+
+    movq        mm7, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm0                ; mm6=(00 01 02 03)=data0
+    punpckhdq   mm7, mm0                ; mm7=(10 11 12 13)=data1
+    movq        mm3, mm2                ; transpose coefficients(phase 2)
+    punpckldq   mm2, mm5                ; mm2=(60 61 62 63)=data6
+    punpckhdq   mm3, mm5                ; mm3=(70 71 72 73)=data7
+
+    movq        mm0, mm7
+    movq        mm5, mm6
+    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
+    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
+    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
+    paddw       mm5, mm3                ; mm5=data0+data7=tmp0
+
+    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 23 32 33)
+    movq        mm3, MMWORD [wk(1)]     ; mm3=(42 43 52 53)
+    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
+    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7
+
+    movq        mm7, mm4                ; transpose coefficients(phase 2)
+    punpckldq   mm4, mm2                ; mm4=(20 21 22 23)=data2
+    punpckhdq   mm7, mm2                ; mm7=(30 31 32 33)=data3
+    movq        mm6, mm1                ; transpose coefficients(phase 2)
+    punpckldq   mm1, mm3                ; mm1=(40 41 42 43)=data4
+    punpckhdq   mm6, mm3                ; mm6=(50 51 52 53)=data5
+
+    movq        mm2, mm7
+    movq        mm3, mm4
+    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
+    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
+    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
+    psubw       mm3, mm6                ; mm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movq        mm1, mm5
+    movq        mm6, mm0
+    psubw       mm5, mm7                ; mm5=tmp13
+    psubw       mm0, mm4                ; mm0=tmp12
+    paddw       mm1, mm7                ; mm1=tmp10
+    paddw       mm6, mm4                ; mm6=tmp11
+
+    paddw       mm0, mm5
+    psllw       mm0, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm0, [GOTOFF(ebx,PW_F0707)]  ; mm0=z1
+
+    movq        mm7, mm1
+    movq        mm4, mm5
+    psubw       mm1, mm6                ; mm1=data4
+    psubw       mm5, mm0                ; mm5=data6
+    paddw       mm7, mm6                ; mm7=data0
+    paddw       mm4, mm0                ; mm4=data2
+
+    movq        MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1
+    movq        MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5
+    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
+    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+
+    ; -- Odd part
+
+    movq        mm6, MMWORD [wk(0)]     ; mm6=tmp6
+    movq        mm0, MMWORD [wk(1)]     ; mm0=tmp7
+
+    paddw       mm2, mm3                ; mm2=tmp10
+    paddw       mm3, mm6                ; mm3=tmp11
+    paddw       mm6, mm0                ; mm6=tmp12, mm0=tmp7
+
+    psllw       mm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       mm6, PRE_MULTIPLY_SCALE_BITS
+
+    psllw       mm3, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm3, [GOTOFF(ebx,PW_F0707)]  ; mm3=z3
+
+    movq        mm1, mm2                     ; mm1=tmp10
+    psubw       mm2, mm6
+    pmulhw      mm2, [GOTOFF(ebx,PW_F0382)]  ; mm2=z5
+    pmulhw      mm1, [GOTOFF(ebx,PW_F0541)]  ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+    pmulhw      mm6, [GOTOFF(ebx,PW_F1306)]  ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+    paddw       mm1, mm2                     ; mm1=z2
+    paddw       mm6, mm2                     ; mm6=z4
+
+    movq        mm5, mm0
+    psubw       mm0, mm3                ; mm0=z13
+    paddw       mm5, mm3                ; mm5=z11
+
+    movq        mm7, mm0
+    movq        mm4, mm5
+    psubw       mm0, mm1                ; mm0=data3
+    psubw       mm5, mm6                ; mm5=data7
+    paddw       mm7, mm1                ; mm7=data5
+    paddw       mm4, mm6                ; mm4=data1
+
+    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
+    movq        MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5
+    movq        MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7
+    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
+
+    add         edx, byte 4*SIZEOF_DCTELEM
+    dec         ecx
+    jnz         near .columnloop
+
+    emms                                ; empty MMX state
+
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctfst-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctfst-sse2.asm
new file mode 100644
index 0000000000..446fa7a68f
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctfst-sse2.asm
@@ -0,0 +1,403 @@
+;
+; jfdctfst.asm - fast integer FDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  8  ; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382 equ  98  ; FIX(0.382683433)
+F_0_541 equ 139  ; FIX(0.541196100)
+F_0_707 equ 181  ; FIX(0.707106781)
+F_1_306 equ 334  ; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS)  ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS)  ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS)  ; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS  2
+%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_ifast_sse2)
+
+EXTN(jconst_fdct_ifast_sse2):
+
+PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
+PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
+PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
+PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_sse2(DCTELEM *data)
+;
+
+%define data(b)       (b) + 8           ; DCTELEM *data
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_ifast_sse2)
+
+EXTN(jsimd_fdct_ifast_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+    ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+    ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm4, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm1              ; xmm0=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm4, xmm1              ; xmm4=(04 14 05 15 06 16 07 17)
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm3              ; xmm2=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm5, xmm3              ; xmm5=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+    ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+    ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
+    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm2, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm7              ; xmm6=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm2, xmm7              ; xmm2=(44 54 45 55 46 56 47 57)
+    movdqa      xmm5, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm3              ; xmm1=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm5, xmm3              ; xmm5=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm7, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm1              ; xmm6=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm7, xmm1              ; xmm7=(42 52 62 72 43 53 63 73)
+    movdqa      xmm3, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm5              ; xmm2=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm3, xmm5              ; xmm3=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=(42 52 62 72 43 53 63 73)
+    movdqa      XMMWORD [wk(1)], xmm2   ; wk(1)=(44 54 64 74 45 55 65 75)
+
+    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm1              ; xmm0=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm7, xmm1              ; xmm7=(02 12 22 32 03 13 23 33)
+    movdqa      xmm2, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm5              ; xmm4=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm2, xmm5              ; xmm2=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm1, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm6              ; xmm0=(00 10 20 30 40 50 60 70)=data0
+    punpckhqdq  xmm1, xmm6              ; xmm1=(01 11 21 31 41 51 61 71)=data1
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm3              ; xmm2=(06 16 26 36 46 56 66 76)=data6
+    punpckhqdq  xmm5, xmm3              ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+    movdqa      xmm6, xmm1
+    movdqa      xmm3, xmm0
+    psubw       xmm1, xmm2              ; xmm1=data1-data6=tmp6
+    psubw       xmm0, xmm5              ; xmm0=data0-data7=tmp7
+    paddw       xmm6, xmm2              ; xmm6=data1+data6=tmp1
+    paddw       xmm3, xmm5              ; xmm3=data0+data7=tmp0
+
+    movdqa      xmm2, XMMWORD [wk(0)]   ; xmm2=(42 52 62 72 43 53 63 73)
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(44 54 64 74 45 55 65 75)
+    movdqa      XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
+
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm2              ; xmm7=(02 12 22 32 42 52 62 72)=data2
+    punpckhqdq  xmm1, xmm2              ; xmm1=(03 13 23 33 43 53 63 73)=data3
+    movdqa      xmm0, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm5              ; xmm4=(04 14 24 34 44 54 64 74)=data4
+    punpckhqdq  xmm0, xmm5              ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+    movdqa      xmm2, xmm1
+    movdqa      xmm5, xmm7
+    paddw       xmm1, xmm4              ; xmm1=data3+data4=tmp3
+    paddw       xmm7, xmm0              ; xmm7=data2+data5=tmp2
+    psubw       xmm2, xmm4              ; xmm2=data3-data4=tmp4
+    psubw       xmm5, xmm0              ; xmm5=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm0, xmm6
+    psubw       xmm3, xmm1              ; xmm3=tmp13
+    psubw       xmm6, xmm7              ; xmm6=tmp12
+    paddw       xmm4, xmm1              ; xmm4=tmp10
+    paddw       xmm0, xmm7              ; xmm0=tmp11
+
+    paddw       xmm6, xmm3
+    psllw       xmm6, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm6, [GOTOFF(ebx,PW_F0707)]  ; xmm6=z1
+
+    movdqa      xmm1, xmm4
+    movdqa      xmm7, xmm3
+    psubw       xmm4, xmm0              ; xmm4=data4
+    psubw       xmm3, xmm6              ; xmm3=data6
+    paddw       xmm1, xmm0              ; xmm1=data0
+    paddw       xmm7, xmm6              ; xmm7=data2
+
+    movdqa      xmm0, XMMWORD [wk(0)]   ; xmm0=tmp6
+    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=tmp7
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=data4
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=data6
+
+    ; -- Odd part
+
+    paddw       xmm2, xmm5              ; xmm2=tmp10
+    paddw       xmm5, xmm0              ; xmm5=tmp11
+    paddw       xmm0, xmm6              ; xmm0=tmp12, xmm6=tmp7
+
+    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
+
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm5, [GOTOFF(ebx,PW_F0707)]  ; xmm5=z3
+
+    movdqa      xmm4, xmm2                    ; xmm4=tmp10
+    psubw       xmm2, xmm0
+    pmulhw      xmm2, [GOTOFF(ebx,PW_F0382)]  ; xmm2=z5
+    pmulhw      xmm4, [GOTOFF(ebx,PW_F0541)]  ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+    pmulhw      xmm0, [GOTOFF(ebx,PW_F1306)]  ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+    paddw       xmm4, xmm2                    ; xmm4=z2
+    paddw       xmm0, xmm2                    ; xmm0=z4
+
+    movdqa      xmm3, xmm6
+    psubw       xmm6, xmm5              ; xmm6=z13
+    paddw       xmm3, xmm5              ; xmm3=z11
+
+    movdqa      xmm2, xmm6
+    movdqa      xmm5, xmm3
+    psubw       xmm6, xmm4              ; xmm6=data3
+    psubw       xmm3, xmm0              ; xmm3=data7
+    paddw       xmm2, xmm4              ; xmm2=data5
+    paddw       xmm5, xmm0              ; xmm5=data1
+
+    ; ---- Pass 2: process columns.
+
+;   mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+
+    ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+    ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm4, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm5              ; xmm1=(00 01 10 11 20 21 30 31)
+    punpckhwd   xmm4, xmm5              ; xmm4=(40 41 50 51 60 61 70 71)
+    movdqa      xmm0, xmm7              ; transpose coefficients(phase 1)
+    punpcklwd   xmm7, xmm6              ; xmm7=(02 03 12 13 22 23 32 33)
+    punpckhwd   xmm0, xmm6              ; xmm0=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=col4
+    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=col6
+
+    ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+    ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=(02 03 12 13 22 23 32 33)
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm7, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm2              ; xmm5=(04 05 14 15 24 25 34 35)
+    punpckhwd   xmm7, xmm2              ; xmm7=(44 45 54 55 64 65 74 75)
+    movdqa      xmm0, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm3              ; xmm6=(06 07 16 17 26 27 36 37)
+    punpckhwd   xmm0, xmm3              ; xmm0=(46 47 56 57 66 67 76 77)
+
+    movdqa      xmm2, xmm5              ; transpose coefficients(phase 2)
+    punpckldq   xmm5, xmm6              ; xmm5=(04 05 06 07 14 15 16 17)
+    punpckhdq   xmm2, xmm6              ; xmm2=(24 25 26 27 34 35 36 37)
+    movdqa      xmm3, xmm7              ; transpose coefficients(phase 2)
+    punpckldq   xmm7, xmm0              ; xmm7=(44 45 46 47 54 55 56 57)
+    punpckhdq   xmm3, xmm0              ; xmm3=(64 65 66 67 74 75 76 77)
+
+    movdqa      xmm6, XMMWORD [wk(0)]   ; xmm6=(02 03 12 13 22 23 32 33)
+    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(42 43 52 53 62 63 72 73)
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(24 25 26 27 34 35 36 37)
+    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=(44 45 46 47 54 55 56 57)
+
+    movdqa      xmm2, xmm1              ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm6              ; xmm1=(00 01 02 03 10 11 12 13)
+    punpckhdq   xmm2, xmm6              ; xmm2=(20 21 22 23 30 31 32 33)
+    movdqa      xmm7, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm0              ; xmm4=(40 41 42 43 50 51 52 53)
+    punpckhdq   xmm7, xmm0              ; xmm7=(60 61 62 63 70 71 72 73)
+
+    movdqa      xmm6, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm5              ; xmm1=(00 01 02 03 04 05 06 07)=data0
+    punpckhqdq  xmm6, xmm5              ; xmm6=(10 11 12 13 14 15 16 17)=data1
+    movdqa      xmm0, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm3              ; xmm7=(60 61 62 63 64 65 66 67)=data6
+    punpckhqdq  xmm0, xmm3              ; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+    movdqa      xmm5, xmm6
+    movdqa      xmm3, xmm1
+    psubw       xmm6, xmm7              ; xmm6=data1-data6=tmp6
+    psubw       xmm1, xmm0              ; xmm1=data0-data7=tmp7
+    paddw       xmm5, xmm7              ; xmm5=data1+data6=tmp1
+    paddw       xmm3, xmm0              ; xmm3=data0+data7=tmp0
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=(24 25 26 27 34 35 36 37)
+    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(44 45 46 47 54 55 56 57)
+    movdqa      XMMWORD [wk(0)], xmm6   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=tmp7
+
+    movdqa      xmm6, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm7              ; xmm2=(20 21 22 23 24 25 26 27)=data2
+    punpckhqdq  xmm6, xmm7              ; xmm6=(30 31 32 33 34 35 36 37)=data3
+    movdqa      xmm1, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm0              ; xmm4=(40 41 42 43 44 45 46 47)=data4
+    punpckhqdq  xmm1, xmm0              ; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+    movdqa      xmm7, xmm6
+    movdqa      xmm0, xmm2
+    paddw       xmm6, xmm4              ; xmm6=data3+data4=tmp3
+    paddw       xmm2, xmm1              ; xmm2=data2+data5=tmp2
+    psubw       xmm7, xmm4              ; xmm7=data3-data4=tmp4
+    psubw       xmm0, xmm1              ; xmm0=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm1, xmm5
+    psubw       xmm3, xmm6              ; xmm3=tmp13
+    psubw       xmm5, xmm2              ; xmm5=tmp12
+    paddw       xmm4, xmm6              ; xmm4=tmp10
+    paddw       xmm1, xmm2              ; xmm1=tmp11
+
+    paddw       xmm5, xmm3
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm5, [GOTOFF(ebx,PW_F0707)]  ; xmm5=z1
+
+    movdqa      xmm6, xmm4
+    movdqa      xmm2, xmm3
+    psubw       xmm4, xmm1              ; xmm4=data4
+    psubw       xmm3, xmm5              ; xmm3=data6
+    paddw       xmm6, xmm1              ; xmm6=data0
+    paddw       xmm2, xmm5              ; xmm2=data2
+
+    movdqa      XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
+    movdqa      XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
+    movdqa      XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
+    movdqa      XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2
+
+    ; -- Odd part
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=tmp6
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
+
+    paddw       xmm7, xmm0              ; xmm7=tmp10
+    paddw       xmm0, xmm1              ; xmm0=tmp11
+    paddw       xmm1, xmm5              ; xmm1=tmp12, xmm5=tmp7
+
+    psllw       xmm7, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm1, PRE_MULTIPLY_SCALE_BITS
+
+    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm0, [GOTOFF(ebx,PW_F0707)]  ; xmm0=z3
+
+    movdqa      xmm4, xmm7                    ; xmm4=tmp10
+    psubw       xmm7, xmm1
+    pmulhw      xmm7, [GOTOFF(ebx,PW_F0382)]  ; xmm7=z5
+    pmulhw      xmm4, [GOTOFF(ebx,PW_F0541)]  ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+    pmulhw      xmm1, [GOTOFF(ebx,PW_F1306)]  ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+    paddw       xmm4, xmm7                    ; xmm4=z2
+    paddw       xmm1, xmm7                    ; xmm1=z4
+
+    movdqa      xmm3, xmm5
+    psubw       xmm5, xmm0              ; xmm5=z13
+    paddw       xmm3, xmm0              ; xmm3=z11
+
+    movdqa      xmm6, xmm5
+    movdqa      xmm2, xmm3
+    psubw       xmm5, xmm4              ; xmm5=data3
+    psubw       xmm3, xmm1              ; xmm3=data7
+    paddw       xmm6, xmm4              ; xmm6=data5
+    paddw       xmm2, xmm1              ; xmm2=data1
+
+    movdqa      XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
+    movdqa      XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
+    movdqa      XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
+    movdqa      XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2
+
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-avx2.asm
new file mode 100644
index 0000000000..23cf733135
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-avx2.asm
@@ -0,0 +1,331 @@
+;
+; jfdctint.asm - accurate integer FDCT (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+
+%macro dotranspose 8
+    ; %1=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
+    ; %2=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
+    ; %3=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
+    ; %4=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
+
+    vpunpcklwd  %5, %1, %2
+    vpunpckhwd  %6, %1, %2
+    vpunpcklwd  %7, %3, %4
+    vpunpckhwd  %8, %3, %4
+    ; transpose coefficients(phase 1)
+    ; %5=(00 10 01 11 02 12 03 13  40 50 41 51 42 52 43 53)
+    ; %6=(04 14 05 15 06 16 07 17  44 54 45 55 46 56 47 57)
+    ; %7=(20 30 21 31 22 32 23 33  60 70 61 71 62 72 63 73)
+    ; %8=(24 34 25 35 26 36 27 37  64 74 65 75 66 76 67 77)
+
+    vpunpckldq  %1, %5, %7
+    vpunpckhdq  %2, %5, %7
+    vpunpckldq  %3, %6, %8
+    vpunpckhdq  %4, %6, %8
+    ; transpose coefficients(phase 2)
+    ; %1=(00 10 20 30 01 11 21 31  40 50 60 70 41 51 61 71)
+    ; %2=(02 12 22 32 03 13 23 33  42 52 62 72 43 53 63 73)
+    ; %3=(04 14 24 34 05 15 25 35  44 54 64 74 45 55 65 75)
+    ; %4=(06 16 26 36 07 17 27 37  46 56 66 76 47 57 67 77)
+
+    vpermq      %1, %1, 0x8D
+    vpermq      %2, %2, 0x8D
+    vpermq      %3, %3, 0xD8
+    vpermq      %4, %4, 0xD8
+    ; transpose coefficients(phase 3)
+    ; %1=(01 11 21 31 41 51 61 71  00 10 20 30 40 50 60 70)
+    ; %2=(03 13 23 33 43 53 63 73  02 12 22 32 42 52 62 72)
+    ; %3=(04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75)
+    ; %4=(06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77)
+%endmacro
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+; %9:    Pass (1 or 2)
+
+%macro dodct 9
+    vpsubw      %5, %1, %4              ; %5=data1_0-data6_7=tmp6_7
+    vpaddw      %6, %1, %4              ; %6=data1_0+data6_7=tmp1_0
+    vpaddw      %7, %2, %3              ; %7=data3_2+data4_5=tmp3_2
+    vpsubw      %8, %2, %3              ; %8=data3_2-data4_5=tmp4_5
+
+    ; -- Even part
+
+    vperm2i128  %6, %6, %6, 0x01        ; %6=tmp0_1
+    vpaddw      %1, %6, %7              ; %1=tmp0_1+tmp3_2=tmp10_11
+    vpsubw      %6, %6, %7              ; %6=tmp0_1-tmp3_2=tmp13_12
+
+    vperm2i128  %7, %1, %1, 0x01        ; %7=tmp11_10
+    vpsignw     %1, %1, [GOTOFF(ebx, PW_1_NEG1)]  ; %1=tmp10_neg11
+    vpaddw      %7, %7, %1              ; %7=(tmp10+tmp11)_(tmp10-tmp11)
+%if %9 == 1
+    vpsllw      %1, %7, PASS1_BITS      ; %1=data0_4
+%else
+    vpaddw      %7, %7, [GOTOFF(ebx, PW_DESCALE_P2X)]
+    vpsraw      %1, %7, PASS1_BITS      ; %1=data0_4
+%endif
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    vperm2i128  %7, %6, %6, 0x01        ; %7=tmp12_13
+    vpunpcklwd  %2, %6, %7
+    vpunpckhwd  %6, %6, %7
+    vpmaddwd    %2, %2, [GOTOFF(ebx, PW_F130_F054_MF130_F054)]  ; %2=data2_6L
+    vpmaddwd    %6, %6, [GOTOFF(ebx, PW_F130_F054_MF130_F054)]  ; %6=data2_6H
+
+    vpaddd      %2, %2, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+    vpaddd      %6, %6, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+    vpsrad      %2, %2, DESCALE_P %+ %9
+    vpsrad      %6, %6, DESCALE_P %+ %9
+
+    vpackssdw   %3, %2, %6              ; %6=data2_6
+
+    ; -- Odd part
+
+    vpaddw      %7, %8, %5              ; %7=tmp4_5+tmp6_7=z3_4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    vperm2i128  %2, %7, %7, 0x01        ; %2=z4_3
+    vpunpcklwd  %6, %7, %2
+    vpunpckhwd  %7, %7, %2
+    vpmaddwd    %6, %6, [GOTOFF(ebx, PW_MF078_F117_F078_F117)]  ; %6=z3_4L
+    vpmaddwd    %7, %7, [GOTOFF(ebx, PW_MF078_F117_F078_F117)]  ; %7=z3_4H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    vperm2i128  %4, %5, %5, 0x01        ; %4=tmp7_6
+    vpunpcklwd  %2, %8, %4
+    vpunpckhwd  %4, %8, %4
+    vpmaddwd    %2, %2, [GOTOFF(ebx, PW_MF060_MF089_MF050_MF256)]  ; %2=tmp4_5L
+    vpmaddwd    %4, %4, [GOTOFF(ebx, PW_MF060_MF089_MF050_MF256)]  ; %4=tmp4_5H
+
+    vpaddd      %2, %2, %6              ; %2=data7_5L
+    vpaddd      %4, %4, %7              ; %4=data7_5H
+
+    vpaddd      %2, %2, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+    vpaddd      %4, %4, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+    vpsrad      %2, %2, DESCALE_P %+ %9
+    vpsrad      %4, %4, DESCALE_P %+ %9
+
+    vpackssdw   %4, %2, %4              ; %4=data7_5
+
+    vperm2i128  %2, %8, %8, 0x01        ; %2=tmp5_4
+    vpunpcklwd  %8, %5, %2
+    vpunpckhwd  %5, %5, %2
+    vpmaddwd    %8, %8, [GOTOFF(ebx, PW_F050_MF256_F060_MF089)]  ; %8=tmp6_7L
+    vpmaddwd    %5, %5, [GOTOFF(ebx, PW_F050_MF256_F060_MF089)]  ; %5=tmp6_7H
+
+    vpaddd      %8, %8, %6              ; %8=data3_1L
+    vpaddd      %5, %5, %7              ; %5=data3_1H
+
+    vpaddd      %8, %8, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+    vpaddd      %5, %5, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+    vpsrad      %8, %8, DESCALE_P %+ %9
+    vpsrad      %5, %5, DESCALE_P %+ %9
+
+    vpackssdw   %2, %8, %5              ; %2=data3_1
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_islow_avx2)
+
+EXTN(jconst_fdct_islow_avx2):
+
+PW_F130_F054_MF130_F054    times 4  dw  (F_0_541 + F_0_765),  F_0_541
+                           times 4  dw  (F_0_541 - F_1_847),  F_0_541
+PW_MF078_F117_F078_F117    times 4  dw  (F_1_175 - F_1_961),  F_1_175
+                           times 4  dw  (F_1_175 - F_0_390),  F_1_175
+PW_MF060_MF089_MF050_MF256 times 4  dw  (F_0_298 - F_0_899), -F_0_899
+                           times 4  dw  (F_2_053 - F_2_562), -F_2_562
+PW_F050_MF256_F060_MF089   times 4  dw  (F_3_072 - F_2_562), -F_2_562
+                           times 4  dw  (F_1_501 - F_0_899), -F_0_899
+PD_DESCALE_P1              times 8  dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2              times 8  dd  1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X             times 16 dw  1 << (PASS1_BITS - 1)
+PW_1_NEG1                  times 8  dw  1
+                           times 8  dw -1
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_avx2(DCTELEM *data)
+;
+
+%define data(b)       (b) + 8           ; DCTELEM *data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_islow_avx2)
+
+EXTN(jsimd_fdct_islow_avx2):
+    push        ebp
+    mov         ebp, esp
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(ebp)]  ; (DCTELEM *)
+
+    vmovdqu     ymm4, YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    vmovdqu     ymm5, YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    vmovdqu     ymm6, YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+    vmovdqu     ymm7, YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+    ; ymm4=(00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17)
+    ; ymm5=(20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37)
+    ; ymm6=(40 41 42 43 44 45 46 47  50 51 52 53 54 55 56 57)
+    ; ymm7=(60 61 62 63 64 65 66 67  70 71 72 73 74 75 76 77)
+
+    vperm2i128  ymm0, ymm4, ymm6, 0x20
+    vperm2i128  ymm1, ymm4, ymm6, 0x31
+    vperm2i128  ymm2, ymm5, ymm7, 0x20
+    vperm2i128  ymm3, ymm5, ymm7, 0x31
+    ; ymm0=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
+    ; ymm1=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
+    ; ymm2=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
+    ; ymm3=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
+
+    dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+
+    dodct       ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1
+    ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5
+
+    ; ---- Pass 2: process columns.
+
+    vperm2i128  ymm4, ymm1, ymm3, 0x20  ; ymm4=data3_7
+    vperm2i128  ymm1, ymm1, ymm3, 0x31  ; ymm1=data1_5
+
+    dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+
+    dodct       ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2
+    ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5
+
+    vperm2i128 ymm3, ymm0, ymm1, 0x30   ; ymm3=data0_1
+    vperm2i128 ymm5, ymm2, ymm1, 0x20   ; ymm5=data2_3
+    vperm2i128 ymm6, ymm0, ymm4, 0x31   ; ymm6=data4_5
+    vperm2i128 ymm7, ymm2, ymm4, 0x21   ; ymm7=data6_7
+
+    vmovdqu     YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], ymm3
+    vmovdqu     YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], ymm5
+    vmovdqu     YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], ymm6
+    vmovdqu     YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], ymm7
+
+    vzeroupper
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-mmx.asm
new file mode 100644
index 0000000000..34a43b9e5e
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-mmx.asm
@@ -0,0 +1,620 @@
+;
+; jfdctint.asm - accurate integer FDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_islow_mmx)
+
+EXTN(jconst_fdct_islow_mmx):
+
+PW_F130_F054   times 2 dw  (F_0_541 + F_0_765),  F_0_541
+PW_F054_MF130  times 2 dw  F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117  times 2 dw  (F_1_175 - F_1_961),  F_1_175
+PW_F117_F078   times 2 dw  F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 2 dw  (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060  times 2 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 2 dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050  times 2 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1  times 2 dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2  times 2 dd  1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X times 4 dw  1 << (PASS1_BITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_mmx(DCTELEM *data)
+;
+
+%define data(b)       (b) + 8           ; DCTELEM *data
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM        2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_islow_mmx)
+
+EXTN(jsimd_fdct_islow_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.rowloop:
+
+    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+    movq        mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+    ; mm0=(20 21 22 23), mm2=(24 25 26 27)
+    ; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+    movq        mm4, mm0                ; transpose coefficients(phase 1)
+    punpcklwd   mm0, mm1                ; mm0=(20 30 21 31)
+    punpckhwd   mm4, mm1                ; mm4=(22 32 23 33)
+    movq        mm5, mm2                ; transpose coefficients(phase 1)
+    punpcklwd   mm2, mm3                ; mm2=(24 34 25 35)
+    punpckhwd   mm5, mm3                ; mm5=(26 36 27 37)
+
+    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+    ; mm6=(00 01 02 03), mm1=(04 05 06 07)
+    ; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 32 23 33)
+    movq        MMWORD [wk(1)], mm2     ; wk(1)=(24 34 25 35)
+
+    movq        mm4, mm6                ; transpose coefficients(phase 1)
+    punpcklwd   mm6, mm7                ; mm6=(00 10 01 11)
+    punpckhwd   mm4, mm7                ; mm4=(02 12 03 13)
+    movq        mm2, mm1                ; transpose coefficients(phase 1)
+    punpcklwd   mm1, mm3                ; mm1=(04 14 05 15)
+    punpckhwd   mm2, mm3                ; mm2=(06 16 07 17)
+
+    movq        mm7, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm0                ; mm6=(00 10 20 30)=data0
+    punpckhdq   mm7, mm0                ; mm7=(01 11 21 31)=data1
+    movq        mm3, mm2                ; transpose coefficients(phase 2)
+    punpckldq   mm2, mm5                ; mm2=(06 16 26 36)=data6
+    punpckhdq   mm3, mm5                ; mm3=(07 17 27 37)=data7
+
+    movq        mm0, mm7
+    movq        mm5, mm6
+    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
+    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
+    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
+    paddw       mm5, mm3                ; mm5=data0+data7=tmp0
+
+    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 32 23 33)
+    movq        mm3, MMWORD [wk(1)]     ; mm3=(24 34 25 35)
+    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
+    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7
+
+    movq        mm7, mm4                ; transpose coefficients(phase 2)
+    punpckldq   mm4, mm2                ; mm4=(02 12 22 32)=data2
+    punpckhdq   mm7, mm2                ; mm7=(03 13 23 33)=data3
+    movq        mm6, mm1                ; transpose coefficients(phase 2)
+    punpckldq   mm1, mm3                ; mm1=(04 14 24 34)=data4
+    punpckhdq   mm6, mm3                ; mm6=(05 15 25 35)=data5
+
+    movq        mm2, mm7
+    movq        mm3, mm4
+    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
+    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
+    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
+    psubw       mm3, mm6                ; mm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movq        mm1, mm5
+    movq        mm6, mm0
+    paddw       mm5, mm7                ; mm5=tmp10
+    paddw       mm0, mm4                ; mm0=tmp11
+    psubw       mm1, mm7                ; mm1=tmp13
+    psubw       mm6, mm4                ; mm6=tmp12
+
+    movq        mm7, mm5
+    paddw       mm5, mm0                ; mm5=tmp10+tmp11
+    psubw       mm7, mm0                ; mm7=tmp10-tmp11
+
+    psllw       mm5, PASS1_BITS         ; mm5=data0
+    psllw       mm7, PASS1_BITS         ; mm7=data4
+
+    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
+    movq        MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    movq        mm4, mm1                ; mm1=tmp13
+    movq        mm0, mm1
+    punpcklwd   mm4, mm6                ; mm6=tmp12
+    punpckhwd   mm0, mm6
+    movq        mm1, mm4
+    movq        mm6, mm0
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=data2L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F130_F054)]   ; mm0=data2H
+    pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=data6L
+    pmaddwd     mm6, [GOTOFF(ebx,PW_F054_MF130)]  ; mm6=data6H
+
+    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       mm0, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       mm4, DESCALE_P1
+    psrad       mm0, DESCALE_P1
+    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       mm6, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       mm1, DESCALE_P1
+    psrad       mm6, DESCALE_P1
+
+    packssdw    mm4, mm0                ; mm4=data2
+    packssdw    mm1, mm6                ; mm1=data6
+
+    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+    movq        MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1
+
+    ; -- Odd part
+
+    movq        mm5, MMWORD [wk(0)]     ; mm5=tmp6
+    movq        mm7, MMWORD [wk(1)]     ; mm7=tmp7
+
+    movq        mm0, mm2                ; mm2=tmp4
+    movq        mm6, mm3                ; mm3=tmp5
+    paddw       mm0, mm5                ; mm0=z3
+    paddw       mm6, mm7                ; mm6=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movq        mm4, mm0
+    movq        mm1, mm0
+    punpcklwd   mm4, mm6
+    punpckhwd   mm1, mm6
+    movq        mm0, mm4
+    movq        mm6, mm1
+    pmaddwd     mm4, [GOTOFF(ebx,PW_MF078_F117)]  ; mm4=z3L
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF078_F117)]  ; mm1=z3H
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F117_F078)]   ; mm0=z4L
+    pmaddwd     mm6, [GOTOFF(ebx,PW_F117_F078)]   ; mm6=z4H
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=z3L
+    movq        MMWORD [wk(1)], mm1     ; wk(1)=z3H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    movq        mm4, mm2
+    movq        mm1, mm2
+    punpcklwd   mm4, mm7
+    punpckhwd   mm1, mm7
+    movq        mm2, mm4
+    movq        mm7, mm1
+    pmaddwd     mm4, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm4=tmp4L
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm1=tmp4H
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF089_F060)]   ; mm2=tmp7L
+    pmaddwd     mm7, [GOTOFF(ebx,PW_MF089_F060)]   ; mm7=tmp7H
+
+    paddd       mm4, MMWORD [wk(0)]     ; mm4=data7L
+    paddd       mm1, MMWORD [wk(1)]     ; mm1=data7H
+    paddd       mm2, mm0                ; mm2=data1L
+    paddd       mm7, mm6                ; mm7=data1H
+
+    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       mm4, DESCALE_P1
+    psrad       mm1, DESCALE_P1
+    paddd       mm2, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       mm2, DESCALE_P1
+    psrad       mm7, DESCALE_P1
+
+    packssdw    mm4, mm1                ; mm4=data7
+    packssdw    mm2, mm7                ; mm2=data1
+
+    movq        MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
+    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+    movq        mm1, mm3
+    movq        mm7, mm3
+    punpcklwd   mm1, mm5
+    punpckhwd   mm7, mm5
+    movq        mm3, mm1
+    movq        mm5, mm7
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm1=tmp5L
+    pmaddwd     mm7, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm7=tmp5H
+    pmaddwd     mm3, [GOTOFF(ebx,PW_MF256_F050)]   ; mm3=tmp6L
+    pmaddwd     mm5, [GOTOFF(ebx,PW_MF256_F050)]   ; mm5=tmp6H
+
+    paddd       mm1, mm0                ; mm1=data5L
+    paddd       mm7, mm6                ; mm7=data5H
+    paddd       mm3, MMWORD [wk(0)]     ; mm3=data3L
+    paddd       mm5, MMWORD [wk(1)]     ; mm5=data3H
+
+    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       mm1, DESCALE_P1
+    psrad       mm7, DESCALE_P1
+    paddd       mm3, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       mm5, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       mm3, DESCALE_P1
+    psrad       mm5, DESCALE_P1
+
+    packssdw    mm1, mm7                ; mm1=data5
+    packssdw    mm3, mm5                ; mm3=data3
+
+    movq        MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
+    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+    add         edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
+    dec         ecx
+    jnz         near .rowloop
+
+    ; ---- Pass 2: process columns.
+
+    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+    movq        mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+    ; mm0=(02 12 22 32), mm2=(42 52 62 72)
+    ; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+    movq        mm4, mm0                ; transpose coefficients(phase 1)
+    punpcklwd   mm0, mm1                ; mm0=(02 03 12 13)
+    punpckhwd   mm4, mm1                ; mm4=(22 23 32 33)
+    movq        mm5, mm2                ; transpose coefficients(phase 1)
+    punpcklwd   mm2, mm3                ; mm2=(42 43 52 53)
+    punpckhwd   mm5, mm3                ; mm5=(62 63 72 73)
+
+    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+    ; mm6=(00 10 20 30), mm1=(40 50 60 70)
+    ; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 23 32 33)
+    movq        MMWORD [wk(1)], mm2     ; wk(1)=(42 43 52 53)
+
+    movq        mm4, mm6                ; transpose coefficients(phase 1)
+    punpcklwd   mm6, mm7                ; mm6=(00 01 10 11)
+    punpckhwd   mm4, mm7                ; mm4=(20 21 30 31)
+    movq        mm2, mm1                ; transpose coefficients(phase 1)
+    punpcklwd   mm1, mm3                ; mm1=(40 41 50 51)
+    punpckhwd   mm2, mm3                ; mm2=(60 61 70 71)
+
+    movq        mm7, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm0                ; mm6=(00 01 02 03)=data0
+    punpckhdq   mm7, mm0                ; mm7=(10 11 12 13)=data1
+    movq        mm3, mm2                ; transpose coefficients(phase 2)
+    punpckldq   mm2, mm5                ; mm2=(60 61 62 63)=data6
+    punpckhdq   mm3, mm5                ; mm3=(70 71 72 73)=data7
+
+    movq        mm0, mm7
+    movq        mm5, mm6
+    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
+    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
+    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
+    paddw       mm5, mm3                ; mm5=data0+data7=tmp0
+
+    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 23 32 33)
+    movq        mm3, MMWORD [wk(1)]     ; mm3=(42 43 52 53)
+    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
+    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7
+
+    movq        mm7, mm4                ; transpose coefficients(phase 2)
+    punpckldq   mm4, mm2                ; mm4=(20 21 22 23)=data2
+    punpckhdq   mm7, mm2                ; mm7=(30 31 32 33)=data3
+    movq        mm6, mm1                ; transpose coefficients(phase 2)
+    punpckldq   mm1, mm3                ; mm1=(40 41 42 43)=data4
+    punpckhdq   mm6, mm3                ; mm6=(50 51 52 53)=data5
+
+    movq        mm2, mm7
+    movq        mm3, mm4
+    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
+    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
+    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
+    psubw       mm3, mm6                ; mm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movq        mm1, mm5
+    movq        mm6, mm0
+    paddw       mm5, mm7                ; mm5=tmp10
+    paddw       mm0, mm4                ; mm0=tmp11
+    psubw       mm1, mm7                ; mm1=tmp13
+    psubw       mm6, mm4                ; mm6=tmp12
+
+    movq        mm7, mm5
+    paddw       mm5, mm0                ; mm5=tmp10+tmp11
+    psubw       mm7, mm0                ; mm7=tmp10-tmp11
+
+    paddw       mm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
+    paddw       mm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
+    psraw       mm5, PASS1_BITS         ; mm5=data0
+    psraw       mm7, PASS1_BITS         ; mm7=data4
+
+    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
+    movq        MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    movq        mm4, mm1                ; mm1=tmp13
+    movq        mm0, mm1
+    punpcklwd   mm4, mm6                ; mm6=tmp12
+    punpckhwd   mm0, mm6
+    movq        mm1, mm4
+    movq        mm6, mm0
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=data2L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F130_F054)]   ; mm0=data2H
+    pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=data6L
+    pmaddwd     mm6, [GOTOFF(ebx,PW_F054_MF130)]  ; mm6=data6H
+
+    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       mm0, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       mm4, DESCALE_P2
+    psrad       mm0, DESCALE_P2
+    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       mm6, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       mm1, DESCALE_P2
+    psrad       mm6, DESCALE_P2
+
+    packssdw    mm4, mm0                ; mm4=data2
+    packssdw    mm1, mm6                ; mm1=data6
+
+    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+    movq        MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1
+
+    ; -- Odd part
+
+    movq        mm5, MMWORD [wk(0)]     ; mm5=tmp6
+    movq        mm7, MMWORD [wk(1)]     ; mm7=tmp7
+
+    movq        mm0, mm2                ; mm2=tmp4
+    movq        mm6, mm3                ; mm3=tmp5
+    paddw       mm0, mm5                ; mm0=z3
+    paddw       mm6, mm7                ; mm6=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movq        mm4, mm0
+    movq        mm1, mm0
+    punpcklwd   mm4, mm6
+    punpckhwd   mm1, mm6
+    movq        mm0, mm4
+    movq        mm6, mm1
+    pmaddwd     mm4, [GOTOFF(ebx,PW_MF078_F117)]  ; mm4=z3L
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF078_F117)]  ; mm1=z3H
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F117_F078)]   ; mm0=z4L
+    pmaddwd     mm6, [GOTOFF(ebx,PW_F117_F078)]   ; mm6=z4H
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=z3L
+    movq        MMWORD [wk(1)], mm1     ; wk(1)=z3H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    movq        mm4, mm2
+    movq        mm1, mm2
+    punpcklwd   mm4, mm7
+    punpckhwd   mm1, mm7
+    movq        mm2, mm4
+    movq        mm7, mm1
+    pmaddwd     mm4, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm4=tmp4L
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm1=tmp4H
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF089_F060)]   ; mm2=tmp7L
+    pmaddwd     mm7, [GOTOFF(ebx,PW_MF089_F060)]   ; mm7=tmp7H
+
+    paddd       mm4, MMWORD [wk(0)]     ; mm4=data7L
+    paddd       mm1, MMWORD [wk(1)]     ; mm1=data7H
+    paddd       mm2, mm0                ; mm2=data1L
+    paddd       mm7, mm6                ; mm7=data1H
+
+    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       mm4, DESCALE_P2
+    psrad       mm1, DESCALE_P2
+    paddd       mm2, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       mm2, DESCALE_P2
+    psrad       mm7, DESCALE_P2
+
+    packssdw    mm4, mm1                ; mm4=data7
+    packssdw    mm2, mm7                ; mm2=data1
+
+    movq        MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
+    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+    movq        mm1, mm3
+    movq        mm7, mm3
+    punpcklwd   mm1, mm5
+    punpckhwd   mm7, mm5
+    movq        mm3, mm1
+    movq        mm5, mm7
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm1=tmp5L
+    pmaddwd     mm7, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm7=tmp5H
+    pmaddwd     mm3, [GOTOFF(ebx,PW_MF256_F050)]   ; mm3=tmp6L
+    pmaddwd     mm5, [GOTOFF(ebx,PW_MF256_F050)]   ; mm5=tmp6H
+
+    paddd       mm1, mm0                ; mm1=data5L
+    paddd       mm7, mm6                ; mm7=data5H
+    paddd       mm3, MMWORD [wk(0)]     ; mm3=data3L
+    paddd       mm5, MMWORD [wk(1)]     ; mm5=data3H
+
+    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       mm1, DESCALE_P2
+    psrad       mm7, DESCALE_P2
+    paddd       mm3, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       mm5, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       mm3, DESCALE_P2
+    psrad       mm5, DESCALE_P2
+
+    packssdw    mm1, mm7                ; mm1=data5
+    packssdw    mm3, mm5                ; mm3=data3
+
+    movq        MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
+    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+    add         edx, byte 4*SIZEOF_DCTELEM
+    dec         ecx
+    jnz         near .columnloop
+
+    emms                                ; empty MMX state
+
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-sse2.asm
new file mode 100644
index 0000000000..6f8e18cb9d
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-sse2.asm
@@ -0,0 +1,633 @@
+;
+; jfdctint.asm - accurate integer FDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_islow_sse2)
+
+EXTN(jconst_fdct_islow_sse2):
+
+PW_F130_F054   times 4 dw  (F_0_541 + F_0_765),  F_0_541
+PW_F054_MF130  times 4 dw  F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117  times 4 dw  (F_1_175 - F_1_961),  F_1_175
+PW_F117_F078   times 4 dw  F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 4 dw  (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060  times 4 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 4 dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050  times 4 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1  times 4 dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2  times 4 dd  1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X times 8 dw  1 << (PASS1_BITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_sse2(DCTELEM *data)
+;
+
+%define data(b)       (b) + 8           ; DCTELEM *data
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        6
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_islow_sse2)
+
+EXTN(jsimd_fdct_islow_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+    ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+    ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm4, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm1              ; xmm0=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm4, xmm1              ; xmm4=(04 14 05 15 06 16 07 17)
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm3              ; xmm2=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm5, xmm3              ; xmm5=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+    ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+    ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
+    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm2, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm7              ; xmm6=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm2, xmm7              ; xmm2=(44 54 45 55 46 56 47 57)
+    movdqa      xmm5, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm3              ; xmm1=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm5, xmm3              ; xmm5=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm7, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm1              ; xmm6=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm7, xmm1              ; xmm7=(42 52 62 72 43 53 63 73)
+    movdqa      xmm3, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm5              ; xmm2=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm3, xmm5              ; xmm3=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
+    movdqa      XMMWORD [wk(2)], xmm7   ; wk(2)=(42 52 62 72 43 53 63 73)
+    movdqa      XMMWORD [wk(3)], xmm2   ; wk(3)=(44 54 64 74 45 55 65 75)
+
+    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm1              ; xmm0=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm7, xmm1              ; xmm7=(02 12 22 32 03 13 23 33)
+    movdqa      xmm2, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm5              ; xmm4=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm2, xmm5              ; xmm2=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm1, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm6              ; xmm0=(00 10 20 30 40 50 60 70)=data0
+    punpckhqdq  xmm1, xmm6              ; xmm1=(01 11 21 31 41 51 61 71)=data1
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm3              ; xmm2=(06 16 26 36 46 56 66 76)=data6
+    punpckhqdq  xmm5, xmm3              ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+    movdqa      xmm6, xmm1
+    movdqa      xmm3, xmm0
+    psubw       xmm1, xmm2              ; xmm1=data1-data6=tmp6
+    psubw       xmm0, xmm5              ; xmm0=data0-data7=tmp7
+    paddw       xmm6, xmm2              ; xmm6=data1+data6=tmp1
+    paddw       xmm3, xmm5              ; xmm3=data0+data7=tmp0
+
+    movdqa      xmm2, XMMWORD [wk(2)]   ; xmm2=(42 52 62 72 43 53 63 73)
+    movdqa      xmm5, XMMWORD [wk(3)]   ; xmm5=(44 54 64 74 45 55 65 75)
+    movdqa      XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
+
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm2              ; xmm7=(02 12 22 32 42 52 62 72)=data2
+    punpckhqdq  xmm1, xmm2              ; xmm1=(03 13 23 33 43 53 63 73)=data3
+    movdqa      xmm0, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm5              ; xmm4=(04 14 24 34 44 54 64 74)=data4
+    punpckhqdq  xmm0, xmm5              ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+    movdqa      xmm2, xmm1
+    movdqa      xmm5, xmm7
+    paddw       xmm1, xmm4              ; xmm1=data3+data4=tmp3
+    paddw       xmm7, xmm0              ; xmm7=data2+data5=tmp2
+    psubw       xmm2, xmm4              ; xmm2=data3-data4=tmp4
+    psubw       xmm5, xmm0              ; xmm5=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm0, xmm6
+    paddw       xmm3, xmm1              ; xmm3=tmp10
+    paddw       xmm6, xmm7              ; xmm6=tmp11
+    psubw       xmm4, xmm1              ; xmm4=tmp13
+    psubw       xmm0, xmm7              ; xmm0=tmp12
+
+    movdqa      xmm1, xmm3
+    paddw       xmm3, xmm6              ; xmm3=tmp10+tmp11
+    psubw       xmm1, xmm6              ; xmm1=tmp10-tmp11
+
+    psllw       xmm3, PASS1_BITS        ; xmm3=data0
+    psllw       xmm1, PASS1_BITS        ; xmm1=data4
+
+    movdqa      XMMWORD [wk(2)], xmm3   ; wk(2)=data0
+    movdqa      XMMWORD [wk(3)], xmm1   ; wk(3)=data4
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    movdqa      xmm7, xmm4              ; xmm4=tmp13
+    movdqa      xmm6, xmm4
+    punpcklwd   xmm7, xmm0              ; xmm0=tmp12
+    punpckhwd   xmm6, xmm0
+    movdqa      xmm4, xmm7
+    movdqa      xmm0, xmm6
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_F130_F054)]   ; xmm7=data2L
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_F130_F054)]   ; xmm6=data2H
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm4=data6L
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm0=data6H
+
+    paddd       xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       xmm6, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       xmm7, DESCALE_P1
+    psrad       xmm6, DESCALE_P1
+    paddd       xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       xmm0, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       xmm4, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+
+    packssdw    xmm7, xmm6              ; xmm7=data2
+    packssdw    xmm4, xmm0              ; xmm4=data6
+
+    movdqa      XMMWORD [wk(4)], xmm7   ; wk(4)=data2
+    movdqa      XMMWORD [wk(5)], xmm4   ; wk(5)=data6
+
+    ; -- Odd part
+
+    movdqa      xmm3, XMMWORD [wk(0)]   ; xmm3=tmp6
+    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp7
+
+    movdqa      xmm6, xmm2              ; xmm2=tmp4
+    movdqa      xmm0, xmm5              ; xmm5=tmp5
+    paddw       xmm6, xmm3              ; xmm6=z3
+    paddw       xmm0, xmm1              ; xmm0=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm7, xmm6
+    movdqa      xmm4, xmm6
+    punpcklwd   xmm7, xmm0
+    punpckhwd   xmm4, xmm0
+    movdqa      xmm6, xmm7
+    movdqa      xmm0, xmm4
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm7=z3L
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm4=z3H
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_F117_F078)]   ; xmm6=z4L
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_F117_F078)]   ; xmm0=z4H
+
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=z3L
+    movdqa      XMMWORD [wk(1)], xmm4   ; wk(1)=z3H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    movdqa      xmm7, xmm2
+    movdqa      xmm4, xmm2
+    punpcklwd   xmm7, xmm1
+    punpckhwd   xmm4, xmm1
+    movdqa      xmm2, xmm7
+    movdqa      xmm1, xmm4
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm7=tmp4L
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm4=tmp4H
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm2=tmp7L
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm1=tmp7H
+
+    paddd       xmm7, XMMWORD [wk(0)]   ; xmm7=data7L
+    paddd       xmm4, XMMWORD [wk(1)]   ; xmm4=data7H
+    paddd       xmm2, xmm6              ; xmm2=data1L
+    paddd       xmm1, xmm0              ; xmm1=data1H
+
+    paddd       xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       xmm7, DESCALE_P1
+    psrad       xmm4, DESCALE_P1
+    paddd       xmm2, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       xmm2, DESCALE_P1
+    psrad       xmm1, DESCALE_P1
+
+    packssdw    xmm7, xmm4              ; xmm7=data7
+    packssdw    xmm2, xmm1              ; xmm2=data1
+
+    movdqa      xmm4, xmm5
+    movdqa      xmm1, xmm5
+    punpcklwd   xmm4, xmm3
+    punpckhwd   xmm1, xmm3
+    movdqa      xmm5, xmm4
+    movdqa      xmm3, xmm1
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm4=tmp5L
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm1=tmp5H
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm5=tmp6L
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm3=tmp6H
+
+    paddd       xmm4, xmm6              ; xmm4=data5L
+    paddd       xmm1, xmm0              ; xmm1=data5H
+    paddd       xmm5, XMMWORD [wk(0)]   ; xmm5=data3L
+    paddd       xmm3, XMMWORD [wk(1)]   ; xmm3=data3H
+
+    paddd       xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       xmm4, DESCALE_P1
+    psrad       xmm1, DESCALE_P1
+    paddd       xmm5, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       xmm3, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       xmm5, DESCALE_P1
+    psrad       xmm3, DESCALE_P1
+
+    packssdw    xmm4, xmm1              ; xmm4=data5
+    packssdw    xmm5, xmm3              ; xmm5=data3
+
+    ; ---- Pass 2: process columns.
+
+;   mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+
+    movdqa      xmm6, XMMWORD [wk(2)]   ; xmm6=col0
+    movdqa      xmm0, XMMWORD [wk(4)]   ; xmm0=col2
+
+    ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+    ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm1, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm2              ; xmm6=(00 01 10 11 20 21 30 31)
+    punpckhwd   xmm1, xmm2              ; xmm1=(40 41 50 51 60 61 70 71)
+    movdqa      xmm3, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm5              ; xmm0=(02 03 12 13 22 23 32 33)
+    punpckhwd   xmm3, xmm5              ; xmm3=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm2, XMMWORD [wk(3)]   ; xmm2=col4
+    movdqa      xmm5, XMMWORD [wk(5)]   ; xmm5=col6
+
+    ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+    ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=(02 03 12 13 22 23 32 33)
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm0, xmm2              ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm4              ; xmm2=(04 05 14 15 24 25 34 35)
+    punpckhwd   xmm0, xmm4              ; xmm0=(44 45 54 55 64 65 74 75)
+    movdqa      xmm3, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm7              ; xmm5=(06 07 16 17 26 27 36 37)
+    punpckhwd   xmm3, xmm7              ; xmm3=(46 47 56 57 66 67 76 77)
+
+    movdqa      xmm4, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm5              ; xmm2=(04 05 06 07 14 15 16 17)
+    punpckhdq   xmm4, xmm5              ; xmm4=(24 25 26 27 34 35 36 37)
+    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm3              ; xmm0=(44 45 46 47 54 55 56 57)
+    punpckhdq   xmm7, xmm3              ; xmm7=(64 65 66 67 74 75 76 77)
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=(02 03 12 13 22 23 32 33)
+    movdqa      xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53 62 63 72 73)
+    movdqa      XMMWORD [wk(2)], xmm4   ; wk(2)=(24 25 26 27 34 35 36 37)
+    movdqa      XMMWORD [wk(3)], xmm0   ; wk(3)=(44 45 46 47 54 55 56 57)
+
+    movdqa      xmm4, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm5              ; xmm6=(00 01 02 03 10 11 12 13)
+    punpckhdq   xmm4, xmm5              ; xmm4=(20 21 22 23 30 31 32 33)
+    movdqa      xmm0, xmm1              ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm3              ; xmm1=(40 41 42 43 50 51 52 53)
+    punpckhdq   xmm0, xmm3              ; xmm0=(60 61 62 63 70 71 72 73)
+
+    movdqa      xmm5, xmm6              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm6, xmm2              ; xmm6=(00 01 02 03 04 05 06 07)=data0
+    punpckhqdq  xmm5, xmm2              ; xmm5=(10 11 12 13 14 15 16 17)=data1
+    movdqa      xmm3, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm7              ; xmm0=(60 61 62 63 64 65 66 67)=data6
+    punpckhqdq  xmm3, xmm7              ; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+    movdqa      xmm2, xmm5
+    movdqa      xmm7, xmm6
+    psubw       xmm5, xmm0              ; xmm5=data1-data6=tmp6
+    psubw       xmm6, xmm3              ; xmm6=data0-data7=tmp7
+    paddw       xmm2, xmm0              ; xmm2=data1+data6=tmp1
+    paddw       xmm7, xmm3              ; xmm7=data0+data7=tmp0
+
+    movdqa      xmm0, XMMWORD [wk(2)]   ; xmm0=(24 25 26 27 34 35 36 37)
+    movdqa      xmm3, XMMWORD [wk(3)]   ; xmm3=(44 45 46 47 54 55 56 57)
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+    movdqa      xmm5, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm0              ; xmm4=(20 21 22 23 24 25 26 27)=data2
+    punpckhqdq  xmm5, xmm0              ; xmm5=(30 31 32 33 34 35 36 37)=data3
+    movdqa      xmm6, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm3              ; xmm1=(40 41 42 43 44 45 46 47)=data4
+    punpckhqdq  xmm6, xmm3              ; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+    movdqa      xmm0, xmm5
+    movdqa      xmm3, xmm4
+    paddw       xmm5, xmm1              ; xmm5=data3+data4=tmp3
+    paddw       xmm4, xmm6              ; xmm4=data2+data5=tmp2
+    psubw       xmm0, xmm1              ; xmm0=data3-data4=tmp4
+    psubw       xmm3, xmm6              ; xmm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm1, xmm7
+    movdqa      xmm6, xmm2
+    paddw       xmm7, xmm5              ; xmm7=tmp10
+    paddw       xmm2, xmm4              ; xmm2=tmp11
+    psubw       xmm1, xmm5              ; xmm1=tmp13
+    psubw       xmm6, xmm4              ; xmm6=tmp12
+
+    movdqa      xmm5, xmm7
+    paddw       xmm7, xmm2              ; xmm7=tmp10+tmp11
+    psubw       xmm5, xmm2              ; xmm5=tmp10-tmp11
+
+    paddw       xmm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
+    paddw       xmm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
+    psraw       xmm7, PASS1_BITS        ; xmm7=data0
+    psraw       xmm5, PASS1_BITS        ; xmm5=data4
+
+    movdqa      XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
+    movdqa      XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    movdqa      xmm4, xmm1              ; xmm1=tmp13
+    movdqa      xmm2, xmm1
+    punpcklwd   xmm4, xmm6              ; xmm6=tmp12
+    punpckhwd   xmm2, xmm6
+    movdqa      xmm1, xmm4
+    movdqa      xmm6, xmm2
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_F130_F054)]   ; xmm4=data2L
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_F130_F054)]   ; xmm2=data2H
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm1=data6L
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm6=data6H
+
+    paddd       xmm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       xmm2, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm2, DESCALE_P2
+    paddd       xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       xmm6, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       xmm1, DESCALE_P2
+    psrad       xmm6, DESCALE_P2
+
+    packssdw    xmm4, xmm2              ; xmm4=data2
+    packssdw    xmm1, xmm6              ; xmm1=data6
+
+    movdqa      XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
+    movdqa      XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
+
+    ; -- Odd part
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp6
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
+
+    movdqa      xmm2, xmm0              ; xmm0=tmp4
+    movdqa      xmm6, xmm3              ; xmm3=tmp5
+    paddw       xmm2, xmm7              ; xmm2=z3
+    paddw       xmm6, xmm5              ; xmm6=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm1, xmm2
+    punpcklwd   xmm4, xmm6
+    punpckhwd   xmm1, xmm6
+    movdqa      xmm2, xmm4
+    movdqa      xmm6, xmm1
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm4=z3L
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm1=z3H
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_F117_F078)]   ; xmm2=z4L
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_F117_F078)]   ; xmm6=z4H
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=z3L
+    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=z3H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    movdqa      xmm4, xmm0
+    movdqa      xmm1, xmm0
+    punpcklwd   xmm4, xmm5
+    punpckhwd   xmm1, xmm5
+    movdqa      xmm0, xmm4
+    movdqa      xmm5, xmm1
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm4=tmp4L
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm1=tmp4H
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm0=tmp7L
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm5=tmp7H
+
+    paddd       xmm4,  XMMWORD [wk(0)]  ; xmm4=data7L
+    paddd       xmm1,  XMMWORD [wk(1)]  ; xmm1=data7H
+    paddd       xmm0, xmm2              ; xmm0=data1L
+    paddd       xmm5, xmm6              ; xmm5=data1H
+
+    paddd       xmm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm1, DESCALE_P2
+    paddd       xmm0, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       xmm0, DESCALE_P2
+    psrad       xmm5, DESCALE_P2
+
+    packssdw    xmm4, xmm1              ; xmm4=data7
+    packssdw    xmm0, xmm5              ; xmm0=data1
+
+    movdqa      XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
+    movdqa      XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
+
+    movdqa      xmm1, xmm3
+    movdqa      xmm5, xmm3
+    punpcklwd   xmm1, xmm7
+    punpckhwd   xmm5, xmm7
+    movdqa      xmm3, xmm1
+    movdqa      xmm7, xmm5
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm1=tmp5L
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm5=tmp5H
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm3=tmp6L
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm7=tmp6H
+
+    paddd       xmm1, xmm2              ; xmm1=data5L
+    paddd       xmm5, xmm6              ; xmm5=data5H
+    paddd       xmm3, XMMWORD [wk(0)]   ; xmm3=data3L
+    paddd       xmm7, XMMWORD [wk(1)]   ; xmm7=data3H
+
+    paddd       xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       xmm1, DESCALE_P2
+    psrad       xmm5, DESCALE_P2
+    paddd       xmm3, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       xmm7, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       xmm3, DESCALE_P2
+    psrad       xmm7, DESCALE_P2
+
+    packssdw    xmm1, xmm5              ; xmm1=data5
+    packssdw    xmm3, xmm7              ; xmm3=data3
+
+    movdqa      XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
+    movdqa      XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
+
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-3dn.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-3dn.asm
new file mode 100644
index 0000000000..87951910d8
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-3dn.asm
@@ -0,0 +1,451 @@
+;
+; jidctflt.asm - floating-point IDCT (3DNow! & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_float_3dnow)
+
+EXTN(jconst_idct_float_3dnow):
+
+PD_1_414        times 2 dd 1.414213562373095048801689
+PD_1_847        times 2 dd 1.847759065022573512256366
+PD_1_082        times 2 dd 1.082392200292393968799446
+PD_2_613        times 2 dd 2.613125929752753055713286
+PD_RNDINT_MAGIC times 2 dd 100663296.0  ; (float)(0x00C00000 << 3)
+PB_CENTERJSAMP  times 8 db CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_3dnow(void *dct_table, JCOEFPTR coef_block,
+;                        JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         2
+%define workspace      wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+                                        ; FAST_FLOAT workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_float_3dnow)
+
+EXTN(jsimd_idct_float_3dnow):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [workspace]
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+    lea         edi, [workspace]                 ; FAST_FLOAT *wsptr
+    mov         ecx, DCTSIZE/2                   ; ctr
+    alignx      16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         short .columnDCT
+
+    pushpic     ebx                     ; save GOT address
+    mov         ebx, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    or          ebx, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    or          ebx, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    or          eax, ebx
+    poppic      ebx                     ; restore GOT address
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movd        mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+    punpcklwd   mm0, mm0
+    psrad       mm0, (DWORD_BIT-WORD_BIT)
+    pi2fd       mm0, mm0
+
+    pfmul       mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movq        mm1, mm0
+    punpckldq   mm0, mm0
+    punpckhdq   mm1, mm1
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
+    movq        MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
+    movq        MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
+    movq        MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
+    movq        MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+    jmp         near .nextcolumn
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movd        mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movd        mm1, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movd        mm2, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movd        mm3, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+    punpcklwd   mm0, mm0
+    punpcklwd   mm1, mm1
+    psrad       mm0, (DWORD_BIT-WORD_BIT)
+    psrad       mm1, (DWORD_BIT-WORD_BIT)
+    pi2fd       mm0, mm0
+    pi2fd       mm1, mm1
+
+    pfmul       mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    pfmul       mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    punpcklwd   mm2, mm2
+    punpcklwd   mm3, mm3
+    psrad       mm2, (DWORD_BIT-WORD_BIT)
+    psrad       mm3, (DWORD_BIT-WORD_BIT)
+    pi2fd       mm2, mm2
+    pi2fd       mm3, mm3
+
+    pfmul       mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    pfmul       mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movq        mm4, mm0
+    movq        mm5, mm1
+    pfsub       mm0, mm2                ; mm0=tmp11
+    pfsub       mm1, mm3
+    pfadd       mm4, mm2                ; mm4=tmp10
+    pfadd       mm5, mm3                ; mm5=tmp13
+
+    pfmul       mm1, [GOTOFF(ebx,PD_1_414)]
+    pfsub       mm1, mm5                ; mm1=tmp12
+
+    movq        mm6, mm4
+    movq        mm7, mm0
+    pfsub       mm4, mm5                ; mm4=tmp3
+    pfsub       mm0, mm1                ; mm0=tmp2
+    pfadd       mm6, mm5                ; mm6=tmp0
+    pfadd       mm7, mm1                ; mm7=tmp1
+
+    movq        MMWORD [wk(1)], mm4     ; tmp3
+    movq        MMWORD [wk(0)], mm0     ; tmp2
+
+    ; -- Odd part
+
+    movd        mm2, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movd        mm3, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movd        mm5, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movd        mm1, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+    punpcklwd   mm2, mm2
+    punpcklwd   mm3, mm3
+    psrad       mm2, (DWORD_BIT-WORD_BIT)
+    psrad       mm3, (DWORD_BIT-WORD_BIT)
+    pi2fd       mm2, mm2
+    pi2fd       mm3, mm3
+
+    pfmul       mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    pfmul       mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    punpcklwd   mm5, mm5
+    punpcklwd   mm1, mm1
+    psrad       mm5, (DWORD_BIT-WORD_BIT)
+    psrad       mm1, (DWORD_BIT-WORD_BIT)
+    pi2fd       mm5, mm5
+    pi2fd       mm1, mm1
+
+    pfmul       mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    pfmul       mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movq        mm4, mm2
+    movq        mm0, mm5
+    pfadd       mm2, mm1                ; mm2=z11
+    pfadd       mm5, mm3                ; mm5=z13
+    pfsub       mm4, mm1                ; mm4=z12
+    pfsub       mm0, mm3                ; mm0=z10
+
+    movq        mm1, mm2
+    pfsub       mm2, mm5
+    pfadd       mm1, mm5                ; mm1=tmp7
+
+    pfmul       mm2, [GOTOFF(ebx,PD_1_414)]  ; mm2=tmp11
+
+    movq        mm3, mm0
+    pfadd       mm0, mm4
+    pfmul       mm0, [GOTOFF(ebx,PD_1_847)]  ; mm0=z5
+    pfmul       mm3, [GOTOFF(ebx,PD_2_613)]  ; mm3=(z10 * 2.613125930)
+    pfmul       mm4, [GOTOFF(ebx,PD_1_082)]  ; mm4=(z12 * 1.082392200)
+    pfsubr      mm3, mm0                     ; mm3=tmp12
+    pfsub       mm4, mm0                     ; mm4=tmp10
+
+    ; -- Final output stage
+
+    pfsub       mm3, mm1                ; mm3=tmp6
+    movq        mm5, mm6
+    movq        mm0, mm7
+    pfadd       mm6, mm1                ; mm6=data0=(00 01)
+    pfadd       mm7, mm3                ; mm7=data1=(10 11)
+    pfsub       mm5, mm1                ; mm5=data7=(70 71)
+    pfsub       mm0, mm3                ; mm0=data6=(60 61)
+    pfsub       mm2, mm3                ; mm2=tmp5
+
+    movq        mm1, mm6                ; transpose coefficients
+    punpckldq   mm6, mm7                ; mm6=(00 10)
+    punpckhdq   mm1, mm7                ; mm1=(01 11)
+    movq        mm3, mm0                ; transpose coefficients
+    punpckldq   mm0, mm5                ; mm0=(60 70)
+    punpckhdq   mm3, mm5                ; mm3=(61 71)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
+    movq        MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+    movq        MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3
+
+    movq        mm7, MMWORD [wk(0)]     ; mm7=tmp2
+    movq        mm5, MMWORD [wk(1)]     ; mm5=tmp3
+
+    pfadd       mm4, mm2                ; mm4=tmp4
+    movq        mm6, mm7
+    movq        mm1, mm5
+    pfadd       mm7, mm2                ; mm7=data2=(20 21)
+    pfadd       mm5, mm4                ; mm5=data4=(40 41)
+    pfsub       mm6, mm2                ; mm6=data5=(50 51)
+    pfsub       mm1, mm4                ; mm1=data3=(30 31)
+
+    movq        mm0, mm7                ; transpose coefficients
+    punpckldq   mm7, mm1                ; mm7=(20 30)
+    punpckhdq   mm0, mm1                ; mm0=(21 31)
+    movq        mm3, mm5                ; transpose coefficients
+    punpckldq   mm5, mm6                ; mm5=(40 50)
+    punpckhdq   mm3, mm6                ; mm3=(41 51)
+
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
+    movq        MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+    movq        MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3
+
+.nextcolumn:
+    add         esi, byte 2*SIZEOF_JCOEF               ; coef_block
+    add         edx, byte 2*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
+    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
+    dec         ecx                                    ; ctr
+    jnz         near .columnloop
+
+    ; -- Prefetch the next coefficient block
+
+    prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+    prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+    prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+    prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    lea         esi, [workspace]                   ; FAST_FLOAT *wsptr
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+    mov         ecx, DCTSIZE/2                     ; ctr
+    alignx      16, 7
+.rowloop:
+
+    ; -- Even part
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+    movq        mm4, mm0
+    movq        mm5, mm1
+    pfsub       mm0, mm2                ; mm0=tmp11
+    pfsub       mm1, mm3
+    pfadd       mm4, mm2                ; mm4=tmp10
+    pfadd       mm5, mm3                ; mm5=tmp13
+
+    pfmul       mm1, [GOTOFF(ebx,PD_1_414)]
+    pfsub       mm1, mm5                ; mm1=tmp12
+
+    movq        mm6, mm4
+    movq        mm7, mm0
+    pfsub       mm4, mm5                ; mm4=tmp3
+    pfsub       mm0, mm1                ; mm0=tmp2
+    pfadd       mm6, mm5                ; mm6=tmp0
+    pfadd       mm7, mm1                ; mm7=tmp1
+
+    movq        MMWORD [wk(1)], mm4     ; tmp3
+    movq        MMWORD [wk(0)], mm0     ; tmp2
+
+    ; -- Odd part
+
+    movq        mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+    movq        mm4, mm2
+    movq        mm0, mm5
+    pfadd       mm2, mm1                ; mm2=z11
+    pfadd       mm5, mm3                ; mm5=z13
+    pfsub       mm4, mm1                ; mm4=z12
+    pfsub       mm0, mm3                ; mm0=z10
+
+    movq        mm1, mm2
+    pfsub       mm2, mm5
+    pfadd       mm1, mm5                ; mm1=tmp7
+
+    pfmul       mm2, [GOTOFF(ebx,PD_1_414)]  ; mm2=tmp11
+
+    movq        mm3, mm0
+    pfadd       mm0, mm4
+    pfmul       mm0, [GOTOFF(ebx,PD_1_847)]  ; mm0=z5
+    pfmul       mm3, [GOTOFF(ebx,PD_2_613)]  ; mm3=(z10 * 2.613125930)
+    pfmul       mm4, [GOTOFF(ebx,PD_1_082)]  ; mm4=(z12 * 1.082392200)
+    pfsubr      mm3, mm0                     ; mm3=tmp12
+    pfsub       mm4, mm0                     ; mm4=tmp10
+
+    ; -- Final output stage
+
+    pfsub       mm3, mm1                ; mm3=tmp6
+    movq        mm5, mm6
+    movq        mm0, mm7
+    pfadd       mm6, mm1                ; mm6=data0=(00 10)
+    pfadd       mm7, mm3                ; mm7=data1=(01 11)
+    pfsub       mm5, mm1                ; mm5=data7=(07 17)
+    pfsub       mm0, mm3                ; mm0=data6=(06 16)
+    pfsub       mm2, mm3                ; mm2=tmp5
+
+    movq        mm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; mm1=[PD_RNDINT_MAGIC]
+    pcmpeqd     mm3, mm3
+    psrld       mm3, WORD_BIT           ; mm3={0xFFFF 0x0000 0xFFFF 0x0000}
+
+    pfadd       mm6, mm1                ; mm6=roundint(data0/8)=(00 ** 10 **)
+    pfadd       mm7, mm1                ; mm7=roundint(data1/8)=(01 ** 11 **)
+    pfadd       mm0, mm1                ; mm0=roundint(data6/8)=(06 ** 16 **)
+    pfadd       mm5, mm1                ; mm5=roundint(data7/8)=(07 ** 17 **)
+
+    pand        mm6, mm3                ; mm6=(00 -- 10 --)
+    pslld       mm7, WORD_BIT           ; mm7=(-- 01 -- 11)
+    pand        mm0, mm3                ; mm0=(06 -- 16 --)
+    pslld       mm5, WORD_BIT           ; mm5=(-- 07 -- 17)
+    por         mm6, mm7                ; mm6=(00 01 10 11)
+    por         mm0, mm5                ; mm0=(06 07 16 17)
+
+    movq        mm1, MMWORD [wk(0)]     ; mm1=tmp2
+    movq        mm3, MMWORD [wk(1)]     ; mm3=tmp3
+
+    pfadd       mm4, mm2                ; mm4=tmp4
+    movq        mm7, mm1
+    movq        mm5, mm3
+    pfadd       mm1, mm2                ; mm1=data2=(02 12)
+    pfadd       mm3, mm4                ; mm3=data4=(04 14)
+    pfsub       mm7, mm2                ; mm7=data5=(05 15)
+    pfsub       mm5, mm4                ; mm5=data3=(03 13)
+
+    movq        mm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; mm2=[PD_RNDINT_MAGIC]
+    pcmpeqd     mm4, mm4
+    psrld       mm4, WORD_BIT           ; mm4={0xFFFF 0x0000 0xFFFF 0x0000}
+
+    pfadd       mm3, mm2                ; mm3=roundint(data4/8)=(04 ** 14 **)
+    pfadd       mm7, mm2                ; mm7=roundint(data5/8)=(05 ** 15 **)
+    pfadd       mm1, mm2                ; mm1=roundint(data2/8)=(02 ** 12 **)
+    pfadd       mm5, mm2                ; mm5=roundint(data3/8)=(03 ** 13 **)
+
+    pand        mm3, mm4                ; mm3=(04 -- 14 --)
+    pslld       mm7, WORD_BIT           ; mm7=(-- 05 -- 15)
+    pand        mm1, mm4                ; mm1=(02 -- 12 --)
+    pslld       mm5, WORD_BIT           ; mm5=(-- 03 -- 13)
+    por         mm3, mm7                ; mm3=(04 05 14 15)
+    por         mm1, mm5                ; mm1=(02 03 12 13)
+
+    movq        mm2, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm2=[PB_CENTERJSAMP]
+
+    packsswb    mm6, mm3                ; mm6=(00 01 10 11 04 05 14 15)
+    packsswb    mm1, mm0                ; mm1=(02 03 12 13 06 07 16 17)
+    paddb       mm6, mm2
+    paddb       mm1, mm2
+
+    movq        mm4, mm6                ; transpose coefficients(phase 2)
+    punpcklwd   mm6, mm1                ; mm6=(00 01 02 03 10 11 12 13)
+    punpckhwd   mm4, mm1                ; mm4=(04 05 06 07 14 15 16 17)
+
+    movq        mm7, mm6                ; transpose coefficients(phase 3)
+    punpckldq   mm6, mm4                ; mm6=(00 01 02 03 04 05 06 07)
+    punpckhdq   mm7, mm4                ; mm7=(10 11 12 13 14 15 16 17)
+
+    pushpic     ebx                     ; save GOT address
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+
+    poppic      ebx                     ; restore GOT address
+
+    add         esi, byte 2*SIZEOF_FAST_FLOAT  ; wsptr
+    add         edi, byte 2*SIZEOF_JSAMPROW
+    dec         ecx                            ; ctr
+    jnz         near .rowloop
+
+    femms                               ; empty MMX/3DNow! state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-sse.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-sse.asm
new file mode 100644
index 0000000000..b27ecfdf46
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-sse.asm
@@ -0,0 +1,571 @@
+;
+; jidctflt.asm - floating-point IDCT (SSE & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+    shufps      %1, %2, 0x44
+%endmacro
+
+%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+    shufps      %1, %2, 0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_float_sse)
+
+EXTN(jconst_idct_float_sse):
+
+PD_1_414       times 4 dd  1.414213562373095048801689
+PD_1_847       times 4 dd  1.847759065022573512256366
+PD_1_082       times 4 dd  1.082392200292393968799446
+PD_M2_613      times 4 dd -2.613125929752753055713286
+PD_0_125       times 4 dd  0.125        ; 1/8
+PB_CENTERJSAMP times 8 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse(void *dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+%define workspace      wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+                                        ; FAST_FLOAT workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_float_sse)
+
+EXTN(jsimd_idct_float_sse):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [workspace]
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+    lea         edi, [workspace]                 ; FAST_FLOAT *wsptr
+    mov         ecx, DCTSIZE/4                   ; ctr
+    alignx      16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         mm1, mm0
+    packsswb    mm1, mm1
+    movd        eax, mm1
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+    punpckhwd   mm1, mm0                   ; mm1=(** 02 ** 03)
+    punpcklwd   mm0, mm0                   ; mm0=(00 00 01 01)
+    psrad       mm1, (DWORD_BIT-WORD_BIT)  ; mm1=in0H=(02 03)
+    psrad       mm0, (DWORD_BIT-WORD_BIT)  ; mm0=in0L=(00 01)
+    cvtpi2ps    xmm3, mm1                  ; xmm3=(02 03 ** **)
+    cvtpi2ps    xmm0, mm0                  ; xmm0=(00 01 ** **)
+    movlhps     xmm0, xmm3                 ; xmm0=in0=(00 01 02 03)
+
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm1, xmm0
+    movaps      xmm2, xmm0
+    movaps      xmm3, xmm0
+
+    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
+    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
+    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
+    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+    jmp         near .nextcolumn
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+    punpckhwd   mm4, mm0                ; mm4=(** 02 ** 03)
+    punpcklwd   mm0, mm0                ; mm0=(00 00 01 01)
+    punpckhwd   mm5, mm1                ; mm5=(** 22 ** 23)
+    punpcklwd   mm1, mm1                ; mm1=(20 20 21 21)
+
+    psrad       mm4, (DWORD_BIT-WORD_BIT)  ; mm4=in0H=(02 03)
+    psrad       mm0, (DWORD_BIT-WORD_BIT)  ; mm0=in0L=(00 01)
+    cvtpi2ps    xmm4, mm4                  ; xmm4=(02 03 ** **)
+    cvtpi2ps    xmm0, mm0                  ; xmm0=(00 01 ** **)
+    psrad       mm5, (DWORD_BIT-WORD_BIT)  ; mm5=in2H=(22 23)
+    psrad       mm1, (DWORD_BIT-WORD_BIT)  ; mm1=in2L=(20 21)
+    cvtpi2ps    xmm5, mm5                  ; xmm5=(22 23 ** **)
+    cvtpi2ps    xmm1, mm1                  ; xmm1=(20 21 ** **)
+
+    punpckhwd   mm6, mm2                ; mm6=(** 42 ** 43)
+    punpcklwd   mm2, mm2                ; mm2=(40 40 41 41)
+    punpckhwd   mm7, mm3                ; mm7=(** 62 ** 63)
+    punpcklwd   mm3, mm3                ; mm3=(60 60 61 61)
+
+    psrad       mm6, (DWORD_BIT-WORD_BIT)  ; mm6=in4H=(42 43)
+    psrad       mm2, (DWORD_BIT-WORD_BIT)  ; mm2=in4L=(40 41)
+    cvtpi2ps    xmm6, mm6                  ; xmm6=(42 43 ** **)
+    cvtpi2ps    xmm2, mm2                  ; xmm2=(40 41 ** **)
+    psrad       mm7, (DWORD_BIT-WORD_BIT)  ; mm7=in6H=(62 63)
+    psrad       mm3, (DWORD_BIT-WORD_BIT)  ; mm3=in6L=(60 61)
+    cvtpi2ps    xmm7, mm7                  ; xmm7=(62 63 ** **)
+    cvtpi2ps    xmm3, mm3                  ; xmm3=(60 61 ** **)
+
+    movlhps     xmm0, xmm4              ; xmm0=in0=(00 01 02 03)
+    movlhps     xmm1, xmm5              ; xmm1=in2=(20 21 22 23)
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movlhps     xmm2, xmm6              ; xmm2=in4=(40 41 42 43)
+    movlhps     xmm3, xmm7              ; xmm3=in6=(60 61 62 63)
+    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm4, xmm0
+    movaps      xmm5, xmm1
+    subps       xmm0, xmm2              ; xmm0=tmp11
+    subps       xmm1, xmm3
+    addps       xmm4, xmm2              ; xmm4=tmp10
+    addps       xmm5, xmm3              ; xmm5=tmp13
+
+    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
+    subps       xmm1, xmm5              ; xmm1=tmp12
+
+    movaps      xmm6, xmm4
+    movaps      xmm7, xmm0
+    subps       xmm4, xmm5              ; xmm4=tmp3
+    subps       xmm0, xmm1              ; xmm0=tmp2
+    addps       xmm6, xmm5              ; xmm6=tmp0
+    addps       xmm7, xmm1              ; xmm7=tmp1
+
+    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
+    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
+
+    ; -- Odd part
+
+    movq        mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movq        mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+    punpckhwd   mm6, mm4                ; mm6=(** 12 ** 13)
+    punpcklwd   mm4, mm4                ; mm4=(10 10 11 11)
+    punpckhwd   mm2, mm0                ; mm2=(** 32 ** 33)
+    punpcklwd   mm0, mm0                ; mm0=(30 30 31 31)
+
+    psrad       mm6, (DWORD_BIT-WORD_BIT)  ; mm6=in1H=(12 13)
+    psrad       mm4, (DWORD_BIT-WORD_BIT)  ; mm4=in1L=(10 11)
+    cvtpi2ps    xmm4, mm6                  ; xmm4=(12 13 ** **)
+    cvtpi2ps    xmm2, mm4                  ; xmm2=(10 11 ** **)
+    psrad       mm2, (DWORD_BIT-WORD_BIT)  ; mm2=in3H=(32 33)
+    psrad       mm0, (DWORD_BIT-WORD_BIT)  ; mm0=in3L=(30 31)
+    cvtpi2ps    xmm0, mm2                  ; xmm0=(32 33 ** **)
+    cvtpi2ps    xmm3, mm0                  ; xmm3=(30 31 ** **)
+
+    punpckhwd   mm7, mm5                ; mm7=(** 52 ** 53)
+    punpcklwd   mm5, mm5                ; mm5=(50 50 51 51)
+    punpckhwd   mm3, mm1                ; mm3=(** 72 ** 73)
+    punpcklwd   mm1, mm1                ; mm1=(70 70 71 71)
+
+    movlhps     xmm2, xmm4              ; xmm2=in1=(10 11 12 13)
+    movlhps     xmm3, xmm0              ; xmm3=in3=(30 31 32 33)
+
+    psrad       mm7, (DWORD_BIT-WORD_BIT)  ; mm7=in5H=(52 53)
+    psrad       mm5, (DWORD_BIT-WORD_BIT)  ; mm5=in5L=(50 51)
+    cvtpi2ps    xmm4, mm7                  ; xmm4=(52 53 ** **)
+    cvtpi2ps    xmm5, mm5                  ; xmm5=(50 51 ** **)
+    psrad       mm3, (DWORD_BIT-WORD_BIT)  ; mm3=in7H=(72 73)
+    psrad       mm1, (DWORD_BIT-WORD_BIT)  ; mm1=in7L=(70 71)
+    cvtpi2ps    xmm0, mm3                  ; xmm0=(72 73 ** **)
+    cvtpi2ps    xmm1, mm1                  ; xmm1=(70 71 ** **)
+
+    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movlhps     xmm5, xmm4              ; xmm5=in5=(50 51 52 53)
+    movlhps     xmm1, xmm0              ; xmm1=in7=(70 71 72 73)
+    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm4, xmm2
+    movaps      xmm0, xmm5
+    addps       xmm2, xmm1              ; xmm2=z11
+    addps       xmm5, xmm3              ; xmm5=z13
+    subps       xmm4, xmm1              ; xmm4=z12
+    subps       xmm0, xmm3              ; xmm0=z10
+
+    movaps      xmm1, xmm2
+    subps       xmm2, xmm5
+    addps       xmm1, xmm5              ; xmm1=tmp7
+
+    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11
+
+    movaps      xmm3, xmm0
+    addps       xmm0, xmm4
+    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
+    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
+    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
+    addps       xmm3, xmm0                     ; xmm3=tmp12
+    subps       xmm4, xmm0                     ; xmm4=tmp10
+
+    ; -- Final output stage
+
+    subps       xmm3, xmm1              ; xmm3=tmp6
+    movaps      xmm5, xmm6
+    movaps      xmm0, xmm7
+    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
+    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
+    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
+    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
+    subps       xmm2, xmm3              ; xmm2=tmp5
+
+    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
+    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
+    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
+    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)
+
+    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
+
+    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
+    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
+
+    addps       xmm4, xmm2              ; xmm4=tmp4
+    movaps      xmm0, xmm7
+    movaps      xmm3, xmm5
+    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
+    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
+    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
+    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)
+
+    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
+    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
+    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
+    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
+    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
+    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)
+
+    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
+    unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
+    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
+    unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)
+
+    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
+    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
+    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+
+    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
+    unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
+    unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
+    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
+    unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)
+
+    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+    add         esi, byte 4*SIZEOF_JCOEF               ; coef_block
+    add         edx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
+    add         edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
+    dec         ecx                                    ; ctr
+    jnz         near .columnloop
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    lea         esi, [workspace]                   ; FAST_FLOAT *wsptr
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+    mov         ecx, DCTSIZE/4                     ; ctr
+    alignx      16, 7
+.rowloop:
+
+    ; -- Even part
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+    movaps      xmm4, xmm0
+    movaps      xmm5, xmm1
+    subps       xmm0, xmm2              ; xmm0=tmp11
+    subps       xmm1, xmm3
+    addps       xmm4, xmm2              ; xmm4=tmp10
+    addps       xmm5, xmm3              ; xmm5=tmp13
+
+    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
+    subps       xmm1, xmm5              ; xmm1=tmp12
+
+    movaps      xmm6, xmm4
+    movaps      xmm7, xmm0
+    subps       xmm4, xmm5              ; xmm4=tmp3
+    subps       xmm0, xmm1              ; xmm0=tmp2
+    addps       xmm6, xmm5              ; xmm6=tmp0
+    addps       xmm7, xmm1              ; xmm7=tmp1
+
+    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
+    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
+
+    ; -- Odd part
+
+    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+    movaps      xmm4, xmm2
+    movaps      xmm0, xmm5
+    addps       xmm2, xmm1              ; xmm2=z11
+    addps       xmm5, xmm3              ; xmm5=z13
+    subps       xmm4, xmm1              ; xmm4=z12
+    subps       xmm0, xmm3              ; xmm0=z10
+
+    movaps      xmm1, xmm2
+    subps       xmm2, xmm5
+    addps       xmm1, xmm5              ; xmm1=tmp7
+
+    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11
+
+    movaps      xmm3, xmm0
+    addps       xmm0, xmm4
+    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
+    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
+    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
+    addps       xmm3, xmm0                     ; xmm3=tmp12
+    subps       xmm4, xmm0                     ; xmm4=tmp10
+
+    ; -- Final output stage
+
+    subps       xmm3, xmm1              ; xmm3=tmp6
+    movaps      xmm5, xmm6
+    movaps      xmm0, xmm7
+    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
+    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
+    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
+    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
+    subps       xmm2, xmm3              ; xmm2=tmp5
+
+    movaps      xmm1, [GOTOFF(ebx,PD_0_125)]  ; xmm1=[PD_0_125]
+
+    mulps       xmm6, xmm1              ; descale(1/8)
+    mulps       xmm7, xmm1              ; descale(1/8)
+    mulps       xmm5, xmm1              ; descale(1/8)
+    mulps       xmm0, xmm1              ; descale(1/8)
+
+    movhlps     xmm3, xmm6
+    movhlps     xmm1, xmm7
+    cvtps2pi    mm0, xmm6               ; round to int32, mm0=data0L=(00 10)
+    cvtps2pi    mm1, xmm7               ; round to int32, mm1=data1L=(01 11)
+    cvtps2pi    mm2, xmm3               ; round to int32, mm2=data0H=(20 30)
+    cvtps2pi    mm3, xmm1               ; round to int32, mm3=data1H=(21 31)
+    packssdw    mm0, mm2                ; mm0=data0=(00 10 20 30)
+    packssdw    mm1, mm3                ; mm1=data1=(01 11 21 31)
+
+    movhlps     xmm6, xmm5
+    movhlps     xmm7, xmm0
+    cvtps2pi    mm4, xmm5               ; round to int32, mm4=data7L=(07 17)
+    cvtps2pi    mm5, xmm0               ; round to int32, mm5=data6L=(06 16)
+    cvtps2pi    mm6, xmm6               ; round to int32, mm6=data7H=(27 37)
+    cvtps2pi    mm7, xmm7               ; round to int32, mm7=data6H=(26 36)
+    packssdw    mm4, mm6                ; mm4=data7=(07 17 27 37)
+    packssdw    mm5, mm7                ; mm5=data6=(06 16 26 36)
+
+    packsswb    mm0, mm5                ; mm0=(00 10 20 30 06 16 26 36)
+    packsswb    mm1, mm4                ; mm1=(01 11 21 31 07 17 27 37)
+
+    movaps      xmm3, XMMWORD [wk(0)]   ; xmm3=tmp2
+    movaps      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3
+
+    movaps      xmm6, [GOTOFF(ebx,PD_0_125)]  ; xmm6=[PD_0_125]
+
+    addps       xmm4, xmm2              ; xmm4=tmp4
+    movaps      xmm5, xmm3
+    movaps      xmm0, xmm1
+    addps       xmm3, xmm2              ; xmm3=data2=(02 12 22 32)
+    addps       xmm1, xmm4              ; xmm1=data4=(04 14 24 34)
+    subps       xmm5, xmm2              ; xmm5=data5=(05 15 25 35)
+    subps       xmm0, xmm4              ; xmm0=data3=(03 13 23 33)
+
+    mulps       xmm3, xmm6              ; descale(1/8)
+    mulps       xmm1, xmm6              ; descale(1/8)
+    mulps       xmm5, xmm6              ; descale(1/8)
+    mulps       xmm0, xmm6              ; descale(1/8)
+
+    movhlps     xmm7, xmm3
+    movhlps     xmm2, xmm1
+    cvtps2pi    mm2, xmm3               ; round to int32, mm2=data2L=(02 12)
+    cvtps2pi    mm3, xmm1               ; round to int32, mm3=data4L=(04 14)
+    cvtps2pi    mm6, xmm7               ; round to int32, mm6=data2H=(22 32)
+    cvtps2pi    mm7, xmm2               ; round to int32, mm7=data4H=(24 34)
+    packssdw    mm2, mm6                ; mm2=data2=(02 12 22 32)
+    packssdw    mm3, mm7                ; mm3=data4=(04 14 24 34)
+
+    movhlps     xmm4, xmm5
+    movhlps     xmm6, xmm0
+    cvtps2pi    mm5, xmm5               ; round to int32, mm5=data5L=(05 15)
+    cvtps2pi    mm4, xmm0               ; round to int32, mm4=data3L=(03 13)
+    cvtps2pi    mm6, xmm4               ; round to int32, mm6=data5H=(25 35)
+    cvtps2pi    mm7, xmm6               ; round to int32, mm7=data3H=(23 33)
+    packssdw    mm5, mm6                ; mm5=data5=(05 15 25 35)
+    packssdw    mm4, mm7                ; mm4=data3=(03 13 23 33)
+
+    movq        mm6, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm6=[PB_CENTERJSAMP]
+
+    packsswb    mm2, mm3                ; mm2=(02 12 22 32 04 14 24 34)
+    packsswb    mm4, mm5                ; mm4=(03 13 23 33 05 15 25 35)
+
+    paddb       mm0, mm6
+    paddb       mm1, mm6
+    paddb       mm2, mm6
+    paddb       mm4, mm6
+
+    movq        mm7, mm0                ; transpose coefficients(phase 1)
+    punpcklbw   mm0, mm1                ; mm0=(00 01 10 11 20 21 30 31)
+    punpckhbw   mm7, mm1                ; mm7=(06 07 16 17 26 27 36 37)
+    movq        mm3, mm2                ; transpose coefficients(phase 1)
+    punpcklbw   mm2, mm4                ; mm2=(02 03 12 13 22 23 32 33)
+    punpckhbw   mm3, mm4                ; mm3=(04 05 14 15 24 25 34 35)
+
+    movq        mm5, mm0                ; transpose coefficients(phase 2)
+    punpcklwd   mm0, mm2                ; mm0=(00 01 02 03 10 11 12 13)
+    punpckhwd   mm5, mm2                ; mm5=(20 21 22 23 30 31 32 33)
+    movq        mm6, mm3                ; transpose coefficients(phase 2)
+    punpcklwd   mm3, mm7                ; mm3=(04 05 06 07 14 15 16 17)
+    punpckhwd   mm6, mm7                ; mm6=(24 25 26 27 34 35 36 37)
+
+    movq        mm1, mm0                ; transpose coefficients(phase 3)
+    punpckldq   mm0, mm3                ; mm0=(00 01 02 03 04 05 06 07)
+    punpckhdq   mm1, mm3                ; mm1=(10 11 12 13 14 15 16 17)
+    movq        mm4, mm5                ; transpose coefficients(phase 3)
+    punpckldq   mm5, mm6                ; mm5=(20 21 22 23 24 25 26 27)
+    punpckhdq   mm4, mm6                ; mm4=(30 31 32 33 34 35 36 37)
+
+    pushpic     ebx                     ; save GOT address
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
+    mov         edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
+
+    poppic      ebx                     ; restore GOT address
+
+    add         esi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
+    add         edi, byte 4*SIZEOF_JSAMPROW
+    dec         ecx                            ; ctr
+    jnz         near .rowloop
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-sse2.asm
new file mode 100644
index 0000000000..c646eaef76
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-sse2.asm
@@ -0,0 +1,497 @@
+;
+; jidctflt.asm - floating-point IDCT (SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+    shufps      %1, %2, 0x44
+%endmacro
+
+%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+    shufps      %1, %2, 0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_float_sse2)
+
+EXTN(jconst_idct_float_sse2):
+
+PD_1_414        times 4  dd  1.414213562373095048801689
+PD_1_847        times 4  dd  1.847759065022573512256366
+PD_1_082        times 4  dd  1.082392200292393968799446
+PD_M2_613       times 4  dd -2.613125929752753055713286
+PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3)
+PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+%define workspace      wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+                                        ; FAST_FLOAT workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_float_sse2)
+
+EXTN(jsimd_idct_float_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [workspace]
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+    lea         edi, [workspace]                 ; FAST_FLOAT *wsptr
+    mov         ecx, DCTSIZE/4                   ; ctr
+    alignx      16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, xmm2
+    por         xmm3, xmm4
+    por         xmm5, xmm6
+    por         xmm1, xmm3
+    por         xmm5, xmm7
+    por         xmm1, xmm5
+    packsswb    xmm1, xmm1
+    movd        eax, xmm1
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
+    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
+    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
+
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm1, xmm0
+    movaps      xmm2, xmm0
+    movaps      xmm3, xmm0
+
+    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
+    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
+    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
+    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+    jmp         near .nextcolumn
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
+    punpcklwd   xmm1, xmm1                  ; xmm1=(20 20 21 21 22 22 23 23)
+    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
+    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in2=(20 21 22 23)
+    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
+    cvtdq2ps    xmm1, xmm1                  ; xmm1=in2=(20 21 22 23)
+
+    punpcklwd   xmm2, xmm2                  ; xmm2=(40 40 41 41 42 42 43 43)
+    punpcklwd   xmm3, xmm3                  ; xmm3=(60 60 61 61 62 62 63 63)
+    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in4=(40 41 42 43)
+    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in6=(60 61 62 63)
+    cvtdq2ps    xmm2, xmm2                  ; xmm2=in4=(40 41 42 43)
+    cvtdq2ps    xmm3, xmm3                  ; xmm3=in6=(60 61 62 63)
+
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm4, xmm0
+    movaps      xmm5, xmm1
+    subps       xmm0, xmm2              ; xmm0=tmp11
+    subps       xmm1, xmm3
+    addps       xmm4, xmm2              ; xmm4=tmp10
+    addps       xmm5, xmm3              ; xmm5=tmp13
+
+    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
+    subps       xmm1, xmm5              ; xmm1=tmp12
+
+    movaps      xmm6, xmm4
+    movaps      xmm7, xmm0
+    subps       xmm4, xmm5              ; xmm4=tmp3
+    subps       xmm0, xmm1              ; xmm0=tmp2
+    addps       xmm6, xmm5              ; xmm6=tmp0
+    addps       xmm7, xmm1              ; xmm7=tmp1
+
+    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
+    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
+
+    ; -- Odd part
+
+    movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+    punpcklwd   xmm2, xmm2                  ; xmm2=(10 10 11 11 12 12 13 13)
+    punpcklwd   xmm3, xmm3                  ; xmm3=(30 30 31 31 32 32 33 33)
+    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in1=(10 11 12 13)
+    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in3=(30 31 32 33)
+    cvtdq2ps    xmm2, xmm2                  ; xmm2=in1=(10 11 12 13)
+    cvtdq2ps    xmm3, xmm3                  ; xmm3=in3=(30 31 32 33)
+
+    punpcklwd   xmm5, xmm5                  ; xmm5=(50 50 51 51 52 52 53 53)
+    punpcklwd   xmm1, xmm1                  ; xmm1=(70 70 71 71 72 72 73 73)
+    psrad       xmm5, (DWORD_BIT-WORD_BIT)  ; xmm5=in5=(50 51 52 53)
+    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in7=(70 71 72 73)
+    cvtdq2ps    xmm5, xmm5                  ; xmm5=in5=(50 51 52 53)
+    cvtdq2ps    xmm1, xmm1                  ; xmm1=in7=(70 71 72 73)
+
+    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm4, xmm2
+    movaps      xmm0, xmm5
+    addps       xmm2, xmm1              ; xmm2=z11
+    addps       xmm5, xmm3              ; xmm5=z13
+    subps       xmm4, xmm1              ; xmm4=z12
+    subps       xmm0, xmm3              ; xmm0=z10
+
+    movaps      xmm1, xmm2
+    subps       xmm2, xmm5
+    addps       xmm1, xmm5              ; xmm1=tmp7
+
+    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11
+
+    movaps      xmm3, xmm0
+    addps       xmm0, xmm4
+    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
+    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
+    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
+    addps       xmm3, xmm0                     ; xmm3=tmp12
+    subps       xmm4, xmm0                     ; xmm4=tmp10
+
+    ; -- Final output stage
+
+    subps       xmm3, xmm1              ; xmm3=tmp6
+    movaps      xmm5, xmm6
+    movaps      xmm0, xmm7
+    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
+    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
+    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
+    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
+    subps       xmm2, xmm3              ; xmm2=tmp5
+
+    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
+    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
+    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
+    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)
+
+    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
+
+    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
+    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
+
+    addps       xmm4, xmm2              ; xmm4=tmp4
+    movaps      xmm0, xmm7
+    movaps      xmm3, xmm5
+    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
+    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
+    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
+    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)
+
+    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
+    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
+    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
+    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
+    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
+    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)
+
+    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
+    unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
+    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
+    unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)
+
+    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
+    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
+    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+
+    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
+    unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
+    unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
+    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
+    unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)
+
+    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+    add         esi, byte 4*SIZEOF_JCOEF               ; coef_block
+    add         edx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
+    add         edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
+    dec         ecx                                    ; ctr
+    jnz         near .columnloop
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    lea         esi, [workspace]                   ; FAST_FLOAT *wsptr
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+    mov         ecx, DCTSIZE/4                     ; ctr
+    alignx      16, 7
+.rowloop:
+
+    ; -- Even part
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+    movaps      xmm4, xmm0
+    movaps      xmm5, xmm1
+    subps       xmm0, xmm2              ; xmm0=tmp11
+    subps       xmm1, xmm3
+    addps       xmm4, xmm2              ; xmm4=tmp10
+    addps       xmm5, xmm3              ; xmm5=tmp13
+
+    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
+    subps       xmm1, xmm5              ; xmm1=tmp12
+
+    movaps      xmm6, xmm4
+    movaps      xmm7, xmm0
+    subps       xmm4, xmm5              ; xmm4=tmp3
+    subps       xmm0, xmm1              ; xmm0=tmp2
+    addps       xmm6, xmm5              ; xmm6=tmp0
+    addps       xmm7, xmm1              ; xmm7=tmp1
+
+    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
+    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
+
+    ; -- Odd part
+
+    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+    movaps      xmm4, xmm2
+    movaps      xmm0, xmm5
+    addps       xmm2, xmm1              ; xmm2=z11
+    addps       xmm5, xmm3              ; xmm5=z13
+    subps       xmm4, xmm1              ; xmm4=z12
+    subps       xmm0, xmm3              ; xmm0=z10
+
+    movaps      xmm1, xmm2
+    subps       xmm2, xmm5
+    addps       xmm1, xmm5              ; xmm1=tmp7
+
+    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11
+
+    movaps      xmm3, xmm0
+    addps       xmm0, xmm4
+    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
+    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
+    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
+    addps       xmm3, xmm0                     ; xmm3=tmp12
+    subps       xmm4, xmm0                     ; xmm4=tmp10
+
+    ; -- Final output stage
+
+    subps       xmm3, xmm1              ; xmm3=tmp6
+    movaps      xmm5, xmm6
+    movaps      xmm0, xmm7
+    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
+    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
+    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
+    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
+    subps       xmm2, xmm3              ; xmm2=tmp5
+
+    movaps      xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; xmm1=[PD_RNDINT_MAGIC]
+    pcmpeqd     xmm3, xmm3
+    psrld       xmm3, WORD_BIT          ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+    addps       xmm6, xmm1              ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+    addps       xmm7, xmm1              ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+    addps       xmm0, xmm1              ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+    addps       xmm5, xmm1              ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+    pand        xmm6, xmm3              ; xmm6=(00 -- 10 -- 20 -- 30 --)
+    pslld       xmm7, WORD_BIT          ; xmm7=(-- 01 -- 11 -- 21 -- 31)
+    pand        xmm0, xmm3              ; xmm0=(06 -- 16 -- 26 -- 36 --)
+    pslld       xmm5, WORD_BIT          ; xmm5=(-- 07 -- 17 -- 27 -- 37)
+    por         xmm6, xmm7              ; xmm6=(00 01 10 11 20 21 30 31)
+    por         xmm0, xmm5              ; xmm0=(06 07 16 17 26 27 36 37)
+
+    movaps      xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
+    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3
+
+    addps       xmm4, xmm2              ; xmm4=tmp4
+    movaps      xmm7, xmm1
+    movaps      xmm5, xmm3
+    addps       xmm1, xmm2              ; xmm1=data2=(02 12 22 32)
+    addps       xmm3, xmm4              ; xmm3=data4=(04 14 24 34)
+    subps       xmm7, xmm2              ; xmm7=data5=(05 15 25 35)
+    subps       xmm5, xmm4              ; xmm5=data3=(03 13 23 33)
+
+    movaps      xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; xmm2=[PD_RNDINT_MAGIC]
+    pcmpeqd     xmm4, xmm4
+    psrld       xmm4, WORD_BIT          ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+    addps       xmm3, xmm2              ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+    addps       xmm7, xmm2              ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+    addps       xmm1, xmm2              ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+    addps       xmm5, xmm2              ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+    pand        xmm3, xmm4              ; xmm3=(04 -- 14 -- 24 -- 34 --)
+    pslld       xmm7, WORD_BIT          ; xmm7=(-- 05 -- 15 -- 25 -- 35)
+    pand        xmm1, xmm4              ; xmm1=(02 -- 12 -- 22 -- 32 --)
+    pslld       xmm5, WORD_BIT          ; xmm5=(-- 03 -- 13 -- 23 -- 33)
+    por         xmm3, xmm7              ; xmm3=(04 05 14 15 24 25 34 35)
+    por         xmm1, xmm5              ; xmm1=(02 03 12 13 22 23 32 33)
+
+    movdqa      xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; xmm2=[PB_CENTERJSAMP]
+
+    packsswb    xmm6, xmm3        ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+    packsswb    xmm1, xmm0        ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+    paddb       xmm6, xmm2
+    paddb       xmm1, xmm2
+
+    movdqa      xmm4, xmm6        ; transpose coefficients(phase 2)
+    punpcklwd   xmm6, xmm1        ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+    punpckhwd   xmm4, xmm1        ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+    movdqa      xmm7, xmm6        ; transpose coefficients(phase 3)
+    punpckldq   xmm6, xmm4        ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    punpckhdq   xmm7, xmm4        ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+    pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+    pushpic     ebx                     ; save GOT address
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
+    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
+    movq        XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
+
+    poppic      ebx                     ; restore GOT address
+
+    add         esi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
+    add         edi, byte 4*SIZEOF_JSAMPROW
+    dec         ecx                            ; ctr
+    jnz         near .rowloop
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctfst-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctfst-mmx.asm
new file mode 100644
index 0000000000..24622d4369
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctfst-mmx.asm
@@ -0,0 +1,499 @@
+;
+; jidctfst.asm - fast integer IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  8  ; 14 is also OK.
+%define PASS1_BITS  2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ 277              ; FIX(1.082392200)
+F_1_414 equ 362              ; FIX(1.414213562)
+F_1_847 equ 473              ; FIX(1.847759065)
+F_2_613 equ 669              ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256)  ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS)  ; FIX(1.082392200)
+F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS)  ; FIX(1.414213562)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS)  ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS))       ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS  2
+%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_ifast_mmx)
+
+EXTN(jconst_idct_ifast_mmx):
+
+PW_F1414       times 4 dw  F_1_414 << CONST_SHIFT
+PW_F1847       times 4 dw  F_1_847 << CONST_SHIFT
+PW_MF1613      times 4 dw -F_1_613 << CONST_SHIFT
+PW_F1082       times 4 dw  F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP times 8 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_mmx(void *dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; jpeg_component_info *compptr
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         2
+%define workspace      wk(0) - DCTSIZE2 * SIZEOF_JCOEF
+                                        ; JCOEF workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_ifast_mmx)
+
+EXTN(jsimd_idct_ifast_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [workspace]
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+    lea         edi, [workspace]                 ; JCOEF *wsptr
+    mov         ecx, DCTSIZE/4                   ; ctr
+    alignx      16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         short .columnDCT
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         mm1, mm0
+    packsswb    mm1, mm1
+    movd        eax, mm1
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movq        mm2, mm0                ; mm0=in0=(00 01 02 03)
+    punpcklwd   mm0, mm0                ; mm0=(00 00 01 01)
+    punpckhwd   mm2, mm2                ; mm2=(02 02 03 03)
+
+    movq        mm1, mm0
+    punpckldq   mm0, mm0                ; mm0=(00 00 00 00)
+    punpckhdq   mm1, mm1                ; mm1=(01 01 01 01)
+    movq        mm3, mm2
+    punpckldq   mm2, mm2                ; mm2=(02 02 02 02)
+    punpckhdq   mm3, mm3                ; mm3=(03 03 03 03)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+    jmp         near .nextcolumn
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movq        mm4, mm0
+    movq        mm5, mm1
+    psubw       mm0, mm2                ; mm0=tmp11
+    psubw       mm1, mm3
+    paddw       mm4, mm2                ; mm4=tmp10
+    paddw       mm5, mm3                ; mm5=tmp13
+
+    psllw       mm1, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm1, [GOTOFF(ebx,PW_F1414)]
+    psubw       mm1, mm5                ; mm1=tmp12
+
+    movq        mm6, mm4
+    movq        mm7, mm0
+    psubw       mm4, mm5                ; mm4=tmp3
+    psubw       mm0, mm1                ; mm0=tmp2
+    paddw       mm6, mm5                ; mm6=tmp0
+    paddw       mm7, mm1                ; mm7=tmp1
+
+    movq        MMWORD [wk(1)], mm4     ; wk(1)=tmp3
+    movq        MMWORD [wk(0)], mm0     ; wk(0)=tmp2
+
+    ; -- Odd part
+
+    movq        mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    movq        mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movq        mm4, mm2
+    movq        mm0, mm5
+    psubw       mm2, mm1                ; mm2=z12
+    psubw       mm5, mm3                ; mm5=z10
+    paddw       mm4, mm1                ; mm4=z11
+    paddw       mm0, mm3                ; mm0=z13
+
+    movq        mm1, mm5                ; mm1=z10(unscaled)
+    psllw       mm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       mm5, PRE_MULTIPLY_SCALE_BITS
+
+    movq        mm3, mm4
+    psubw       mm4, mm0
+    paddw       mm3, mm0                ; mm3=tmp7
+
+    psllw       mm4, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm4, [GOTOFF(ebx,PW_F1414)]  ; mm4=tmp11
+
+    ; To avoid overflow...
+    ;
+    ; (Original)
+    ; tmp12 = -2.613125930 * z10 + z5;
+    ;
+    ; (This implementation)
+    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+    ;       = -1.613125930 * z10 - z10 + z5;
+
+    movq        mm0, mm5
+    paddw       mm5, mm2
+    pmulhw      mm5, [GOTOFF(ebx,PW_F1847)]   ; mm5=z5
+    pmulhw      mm0, [GOTOFF(ebx,PW_MF1613)]
+    pmulhw      mm2, [GOTOFF(ebx,PW_F1082)]
+    psubw       mm0, mm1
+    psubw       mm2, mm5                ; mm2=tmp10
+    paddw       mm0, mm5                ; mm0=tmp12
+
+    ; -- Final output stage
+
+    psubw       mm0, mm3                ; mm0=tmp6
+    movq        mm1, mm6
+    movq        mm5, mm7
+    paddw       mm6, mm3                ; mm6=data0=(00 01 02 03)
+    paddw       mm7, mm0                ; mm7=data1=(10 11 12 13)
+    psubw       mm1, mm3                ; mm1=data7=(70 71 72 73)
+    psubw       mm5, mm0                ; mm5=data6=(60 61 62 63)
+    psubw       mm4, mm0                ; mm4=tmp5
+
+    movq        mm3, mm6                ; transpose coefficients(phase 1)
+    punpcklwd   mm6, mm7                ; mm6=(00 10 01 11)
+    punpckhwd   mm3, mm7                ; mm3=(02 12 03 13)
+    movq        mm0, mm5                ; transpose coefficients(phase 1)
+    punpcklwd   mm5, mm1                ; mm5=(60 70 61 71)
+    punpckhwd   mm0, mm1                ; mm0=(62 72 63 73)
+
+    movq        mm7, MMWORD [wk(0)]     ; mm7=tmp2
+    movq        mm1, MMWORD [wk(1)]     ; mm1=tmp3
+
+    movq        MMWORD [wk(0)], mm5     ; wk(0)=(60 70 61 71)
+    movq        MMWORD [wk(1)], mm0     ; wk(1)=(62 72 63 73)
+
+    paddw       mm2, mm4                ; mm2=tmp4
+    movq        mm5, mm7
+    movq        mm0, mm1
+    paddw       mm7, mm4                ; mm7=data2=(20 21 22 23)
+    paddw       mm1, mm2                ; mm1=data4=(40 41 42 43)
+    psubw       mm5, mm4                ; mm5=data5=(50 51 52 53)
+    psubw       mm0, mm2                ; mm0=data3=(30 31 32 33)
+
+    movq        mm4, mm7                ; transpose coefficients(phase 1)
+    punpcklwd   mm7, mm0                ; mm7=(20 30 21 31)
+    punpckhwd   mm4, mm0                ; mm4=(22 32 23 33)
+    movq        mm2, mm1                ; transpose coefficients(phase 1)
+    punpcklwd   mm1, mm5                ; mm1=(40 50 41 51)
+    punpckhwd   mm2, mm5                ; mm2=(42 52 43 53)
+
+    movq        mm0, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm7                ; mm6=(00 10 20 30)
+    punpckhdq   mm0, mm7                ; mm0=(01 11 21 31)
+    movq        mm5, mm3                ; transpose coefficients(phase 2)
+    punpckldq   mm3, mm4                ; mm3=(02 12 22 32)
+    punpckhdq   mm5, mm4                ; mm5=(03 13 23 33)
+
+    movq        mm7, MMWORD [wk(0)]     ; mm7=(60 70 61 71)
+    movq        mm4, MMWORD [wk(1)]     ; mm4=(62 72 63 73)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+    movq        mm6, mm1                ; transpose coefficients(phase 2)
+    punpckldq   mm1, mm7                ; mm1=(40 50 60 70)
+    punpckhdq   mm6, mm7                ; mm6=(41 51 61 71)
+    movq        mm0, mm2                ; transpose coefficients(phase 2)
+    punpckldq   mm2, mm4                ; mm2=(42 52 62 72)
+    punpckhdq   mm0, mm4                ; mm0=(43 53 63 73)
+
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
+    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
+
+.nextcolumn:
+    add         esi, byte 4*SIZEOF_JCOEF            ; coef_block
+    add         edx, byte 4*SIZEOF_IFAST_MULT_TYPE  ; quantptr
+    add         edi, byte 4*DCTSIZE*SIZEOF_JCOEF    ; wsptr
+    dec         ecx                                 ; ctr
+    jnz         near .columnloop
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    lea         esi, [workspace]                   ; JCOEF *wsptr
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+    mov         ecx, DCTSIZE/4                     ; ctr
+    alignx      16, 7
+.rowloop:
+
+    ; -- Even part
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+    movq        mm4, mm0
+    movq        mm5, mm1
+    psubw       mm0, mm2                ; mm0=tmp11
+    psubw       mm1, mm3
+    paddw       mm4, mm2                ; mm4=tmp10
+    paddw       mm5, mm3                ; mm5=tmp13
+
+    psllw       mm1, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm1, [GOTOFF(ebx,PW_F1414)]
+    psubw       mm1, mm5                ; mm1=tmp12
+
+    movq        mm6, mm4
+    movq        mm7, mm0
+    psubw       mm4, mm5                ; mm4=tmp3
+    psubw       mm0, mm1                ; mm0=tmp2
+    paddw       mm6, mm5                ; mm6=tmp0
+    paddw       mm7, mm1                ; mm7=tmp1
+
+    movq        MMWORD [wk(1)], mm4     ; wk(1)=tmp3
+    movq        MMWORD [wk(0)], mm0     ; wk(0)=tmp2
+
+    ; -- Odd part
+
+    movq        mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movq        mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+    movq        mm4, mm2
+    movq        mm0, mm5
+    psubw       mm2, mm1                ; mm2=z12
+    psubw       mm5, mm3                ; mm5=z10
+    paddw       mm4, mm1                ; mm4=z11
+    paddw       mm0, mm3                ; mm0=z13
+
+    movq        mm1, mm5                ; mm1=z10(unscaled)
+    psllw       mm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       mm5, PRE_MULTIPLY_SCALE_BITS
+
+    movq        mm3, mm4
+    psubw       mm4, mm0
+    paddw       mm3, mm0                ; mm3=tmp7
+
+    psllw       mm4, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm4, [GOTOFF(ebx,PW_F1414)]  ; mm4=tmp11
+
+    ; To avoid overflow...
+    ;
+    ; (Original)
+    ; tmp12 = -2.613125930 * z10 + z5;
+    ;
+    ; (This implementation)
+    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+    ;       = -1.613125930 * z10 - z10 + z5;
+
+    movq        mm0, mm5
+    paddw       mm5, mm2
+    pmulhw      mm5, [GOTOFF(ebx,PW_F1847)]   ; mm5=z5
+    pmulhw      mm0, [GOTOFF(ebx,PW_MF1613)]
+    pmulhw      mm2, [GOTOFF(ebx,PW_F1082)]
+    psubw       mm0, mm1
+    psubw       mm2, mm5                ; mm2=tmp10
+    paddw       mm0, mm5                ; mm0=tmp12
+
+    ; -- Final output stage
+
+    psubw       mm0, mm3                ; mm0=tmp6
+    movq        mm1, mm6
+    movq        mm5, mm7
+    paddw       mm6, mm3                ; mm6=data0=(00 10 20 30)
+    paddw       mm7, mm0                ; mm7=data1=(01 11 21 31)
+    psraw       mm6, (PASS1_BITS+3)     ; descale
+    psraw       mm7, (PASS1_BITS+3)     ; descale
+    psubw       mm1, mm3                ; mm1=data7=(07 17 27 37)
+    psubw       mm5, mm0                ; mm5=data6=(06 16 26 36)
+    psraw       mm1, (PASS1_BITS+3)     ; descale
+    psraw       mm5, (PASS1_BITS+3)     ; descale
+    psubw       mm4, mm0                ; mm4=tmp5
+
+    packsswb    mm6, mm5                ; mm6=(00 10 20 30 06 16 26 36)
+    packsswb    mm7, mm1                ; mm7=(01 11 21 31 07 17 27 37)
+
+    movq        mm3, MMWORD [wk(0)]     ; mm3=tmp2
+    movq        mm0, MMWORD [wk(1)]     ; mm0=tmp3
+
+    paddw       mm2, mm4                ; mm2=tmp4
+    movq        mm5, mm3
+    movq        mm1, mm0
+    paddw       mm3, mm4                ; mm3=data2=(02 12 22 32)
+    paddw       mm0, mm2                ; mm0=data4=(04 14 24 34)
+    psraw       mm3, (PASS1_BITS+3)     ; descale
+    psraw       mm0, (PASS1_BITS+3)     ; descale
+    psubw       mm5, mm4                ; mm5=data5=(05 15 25 35)
+    psubw       mm1, mm2                ; mm1=data3=(03 13 23 33)
+    psraw       mm5, (PASS1_BITS+3)     ; descale
+    psraw       mm1, (PASS1_BITS+3)     ; descale
+
+    movq        mm4, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm4=[PB_CENTERJSAMP]
+
+    packsswb    mm3, mm0                ; mm3=(02 12 22 32 04 14 24 34)
+    packsswb    mm1, mm5                ; mm1=(03 13 23 33 05 15 25 35)
+
+    paddb       mm6, mm4
+    paddb       mm7, mm4
+    paddb       mm3, mm4
+    paddb       mm1, mm4
+
+    movq        mm2, mm6                ; transpose coefficients(phase 1)
+    punpcklbw   mm6, mm7                ; mm6=(00 01 10 11 20 21 30 31)
+    punpckhbw   mm2, mm7                ; mm2=(06 07 16 17 26 27 36 37)
+    movq        mm0, mm3                ; transpose coefficients(phase 1)
+    punpcklbw   mm3, mm1                ; mm3=(02 03 12 13 22 23 32 33)
+    punpckhbw   mm0, mm1                ; mm0=(04 05 14 15 24 25 34 35)
+
+    movq        mm5, mm6                ; transpose coefficients(phase 2)
+    punpcklwd   mm6, mm3                ; mm6=(00 01 02 03 10 11 12 13)
+    punpckhwd   mm5, mm3                ; mm5=(20 21 22 23 30 31 32 33)
+    movq        mm4, mm0                ; transpose coefficients(phase 2)
+    punpcklwd   mm0, mm2                ; mm0=(04 05 06 07 14 15 16 17)
+    punpckhwd   mm4, mm2                ; mm4=(24 25 26 27 34 35 36 37)
+
+    movq        mm7, mm6                ; transpose coefficients(phase 3)
+    punpckldq   mm6, mm0                ; mm6=(00 01 02 03 04 05 06 07)
+    punpckhdq   mm7, mm0                ; mm7=(10 11 12 13 14 15 16 17)
+    movq        mm1, mm5                ; transpose coefficients(phase 3)
+    punpckldq   mm5, mm4                ; mm5=(20 21 22 23 24 25 26 27)
+    punpckhdq   mm1, mm4                ; mm1=(30 31 32 33 34 35 36 37)
+
+    pushpic     ebx                     ; save GOT address
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+    mov         edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
+
+    poppic      ebx                     ; restore GOT address
+
+    add         esi, byte 4*SIZEOF_JCOEF     ; wsptr
+    add         edi, byte 4*SIZEOF_JSAMPROW
+    dec         ecx                          ; ctr
+    jnz         near .rowloop
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctfst-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctfst-sse2.asm
new file mode 100644
index 0000000000..19704ffa48
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctfst-sse2.asm
@@ -0,0 +1,501 @@
+;
+; jidctfst.asm - fast integer IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  8  ; 14 is also OK.
+%define PASS1_BITS  2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ 277              ; FIX(1.082392200)
+F_1_414 equ 362              ; FIX(1.414213562)
+F_1_847 equ 473              ; FIX(1.847759065)
+F_2_613 equ 669              ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256)  ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS)  ; FIX(1.082392200)
+F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS)  ; FIX(1.414213562)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS)  ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS))       ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS  2
+%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_ifast_sse2)
+
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414       times 8  dw  F_1_414 << CONST_SHIFT
+PW_F1847       times 8  dw  F_1_847 << CONST_SHIFT
+PW_MF1613      times 8  dw -F_1_613 << CONST_SHIFT
+PW_F1082       times 8  dw  F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP times 16 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_sse2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; jpeg_component_info *compptr
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_ifast_sse2)
+
+EXTN(jsimd_idct_ifast_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, xmm0
+    packsswb    xmm1, xmm1
+    packsswb    xmm1, xmm1
+    movd        eax, xmm1
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movdqa      xmm7, xmm0              ; xmm0=in0=(00 01 02 03 04 05 06 07)
+    punpcklwd   xmm0, xmm0              ; xmm0=(00 00 01 01 02 02 03 03)
+    punpckhwd   xmm7, xmm7              ; xmm7=(04 04 05 05 06 06 07 07)
+
+    pshufd      xmm6, xmm0, 0x00        ; xmm6=col0=(00 00 00 00 00 00 00 00)
+    pshufd      xmm2, xmm0, 0x55        ; xmm2=col1=(01 01 01 01 01 01 01 01)
+    pshufd      xmm5, xmm0, 0xAA        ; xmm5=col2=(02 02 02 02 02 02 02 02)
+    pshufd      xmm0, xmm0, 0xFF        ; xmm0=col3=(03 03 03 03 03 03 03 03)
+    pshufd      xmm1, xmm7, 0x00        ; xmm1=col4=(04 04 04 04 04 04 04 04)
+    pshufd      xmm4, xmm7, 0x55        ; xmm4=col5=(05 05 05 05 05 05 05 05)
+    pshufd      xmm3, xmm7, 0xAA        ; xmm3=col6=(06 06 06 06 06 06 06 06)
+    pshufd      xmm7, xmm7, 0xFF        ; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=col1
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=col3
+    jmp         near .column_end
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movdqa      xmm4, xmm0
+    movdqa      xmm5, xmm1
+    psubw       xmm0, xmm2              ; xmm0=tmp11
+    psubw       xmm1, xmm3
+    paddw       xmm4, xmm2              ; xmm4=tmp10
+    paddw       xmm5, xmm3              ; xmm5=tmp13
+
+    psllw       xmm1, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm1, [GOTOFF(ebx,PW_F1414)]
+    psubw       xmm1, xmm5              ; xmm1=tmp12
+
+    movdqa      xmm6, xmm4
+    movdqa      xmm7, xmm0
+    psubw       xmm4, xmm5              ; xmm4=tmp3
+    psubw       xmm0, xmm1              ; xmm0=tmp2
+    paddw       xmm6, xmm5              ; xmm6=tmp0
+    paddw       xmm7, xmm1              ; xmm7=tmp1
+
+    movdqa      XMMWORD [wk(1)], xmm4   ; wk(1)=tmp3
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=tmp2
+
+    ; -- Odd part
+
+    movdqa      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    movdqa      xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm0, xmm5
+    psubw       xmm2, xmm1              ; xmm2=z12
+    psubw       xmm5, xmm3              ; xmm5=z10
+    paddw       xmm4, xmm1              ; xmm4=z11
+    paddw       xmm0, xmm3              ; xmm0=z13
+
+    movdqa      xmm1, xmm5              ; xmm1=z10(unscaled)
+    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+
+    movdqa      xmm3, xmm4
+    psubw       xmm4, xmm0
+    paddw       xmm3, xmm0              ; xmm3=tmp7
+
+    psllw       xmm4, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm4, [GOTOFF(ebx,PW_F1414)]  ; xmm4=tmp11
+
+    ; To avoid overflow...
+    ;
+    ; (Original)
+    ; tmp12 = -2.613125930 * z10 + z5;
+    ;
+    ; (This implementation)
+    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+    ;       = -1.613125930 * z10 - z10 + z5;
+
+    movdqa      xmm0, xmm5
+    paddw       xmm5, xmm2
+    pmulhw      xmm5, [GOTOFF(ebx,PW_F1847)]   ; xmm5=z5
+    pmulhw      xmm0, [GOTOFF(ebx,PW_MF1613)]
+    pmulhw      xmm2, [GOTOFF(ebx,PW_F1082)]
+    psubw       xmm0, xmm1
+    psubw       xmm2, xmm5              ; xmm2=tmp10
+    paddw       xmm0, xmm5              ; xmm0=tmp12
+
+    ; -- Final output stage
+
+    psubw       xmm0, xmm3              ; xmm0=tmp6
+    movdqa      xmm1, xmm6
+    movdqa      xmm5, xmm7
+    paddw       xmm6, xmm3              ; xmm6=data0=(00 01 02 03 04 05 06 07)
+    paddw       xmm7, xmm0              ; xmm7=data1=(10 11 12 13 14 15 16 17)
+    psubw       xmm1, xmm3              ; xmm1=data7=(70 71 72 73 74 75 76 77)
+    psubw       xmm5, xmm0              ; xmm5=data6=(60 61 62 63 64 65 66 67)
+    psubw       xmm4, xmm0              ; xmm4=tmp5
+
+    movdqa      xmm3, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm7              ; xmm6=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm3, xmm7              ; xmm3=(04 14 05 15 06 16 07 17)
+    movdqa      xmm0, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm1              ; xmm5=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm0, xmm1              ; xmm0=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3
+
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(60 70 61 71 62 72 63 73)
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(64 74 65 75 66 76 67 77)
+
+    paddw       xmm2, xmm4              ; xmm2=tmp4
+    movdqa      xmm5, xmm7
+    movdqa      xmm0, xmm1
+    paddw       xmm7, xmm4              ; xmm7=data2=(20 21 22 23 24 25 26 27)
+    paddw       xmm1, xmm2              ; xmm1=data4=(40 41 42 43 44 45 46 47)
+    psubw       xmm5, xmm4              ; xmm5=data5=(50 51 52 53 54 55 56 57)
+    psubw       xmm0, xmm2              ; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm4, xmm7              ; transpose coefficients(phase 1)
+    punpcklwd   xmm7, xmm0              ; xmm7=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm4, xmm0              ; xmm4=(24 34 25 35 26 36 27 37)
+    movdqa      xmm2, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm5              ; xmm1=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm2, xmm5              ; xmm2=(44 54 45 55 46 56 47 57)
+
+    movdqa      xmm0, xmm3              ; transpose coefficients(phase 2)
+    punpckldq   xmm3, xmm4              ; xmm3=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm0, xmm4              ; xmm0=(06 16 26 36 07 17 27 37)
+    movdqa      xmm5, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm7              ; xmm6=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm5, xmm7              ; xmm5=(02 12 22 32 03 13 23 33)
+
+    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(60 70 61 71 62 72 63 73)
+    movdqa      xmm7, XMMWORD [wk(1)]   ; xmm7=(64 74 65 75 66 76 67 77)
+
+    movdqa      XMMWORD [wk(0)], xmm3   ; wk(0)=(04 14 24 34 05 15 25 35)
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm3, xmm1              ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm4              ; xmm1=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm3, xmm4              ; xmm3=(42 52 62 72 43 53 63 73)
+    movdqa      xmm0, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm7              ; xmm2=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm0, xmm7              ; xmm0=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm4, xmm6              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm6, xmm1              ; xmm6=col0=(00 10 20 30 40 50 60 70)
+    punpckhqdq  xmm4, xmm1              ; xmm4=col1=(01 11 21 31 41 51 61 71)
+    movdqa      xmm7, xmm5              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm5, xmm3              ; xmm5=col2=(02 12 22 32 42 52 62 72)
+    punpckhqdq  xmm7, xmm3              ; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(04 14 24 34 05 15 25 35)
+    movdqa      xmm3, XMMWORD [wk(1)]   ; xmm3=(06 16 26 36 07 17 27 37)
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=col1
+    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=col3
+
+    movdqa      xmm4, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm2              ; xmm1=col4=(04 14 24 34 44 54 64 74)
+    punpckhqdq  xmm4, xmm2              ; xmm4=col5=(05 15 25 35 45 55 65 75)
+    movdqa      xmm7, xmm3              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm3, xmm0              ; xmm3=col6=(06 16 26 36 46 56 66 76)
+    punpckhqdq  xmm7, xmm0              ; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+
+    ; -- Even part
+
+    ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+    movdqa      xmm2, xmm6
+    movdqa      xmm0, xmm5
+    psubw       xmm6, xmm1              ; xmm6=tmp11
+    psubw       xmm5, xmm3
+    paddw       xmm2, xmm1              ; xmm2=tmp10
+    paddw       xmm0, xmm3              ; xmm0=tmp13
+
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm5, [GOTOFF(ebx,PW_F1414)]
+    psubw       xmm5, xmm0              ; xmm5=tmp12
+
+    movdqa      xmm1, xmm2
+    movdqa      xmm3, xmm6
+    psubw       xmm2, xmm0              ; xmm2=tmp3
+    psubw       xmm6, xmm5              ; xmm6=tmp2
+    paddw       xmm1, xmm0              ; xmm1=tmp0
+    paddw       xmm3, xmm5              ; xmm3=tmp1
+
+    movdqa      xmm0, XMMWORD [wk(0)]   ; xmm0=col1
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=col3
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=tmp3
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp2
+
+    ; -- Odd part
+
+    ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+    movdqa      xmm2, xmm0
+    movdqa      xmm6, xmm4
+    psubw       xmm0, xmm7              ; xmm0=z12
+    psubw       xmm4, xmm5              ; xmm4=z10
+    paddw       xmm2, xmm7              ; xmm2=z11
+    paddw       xmm6, xmm5              ; xmm6=z13
+
+    movdqa      xmm7, xmm4              ; xmm7=z10(unscaled)
+    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm4, PRE_MULTIPLY_SCALE_BITS
+
+    movdqa      xmm5, xmm2
+    psubw       xmm2, xmm6
+    paddw       xmm5, xmm6              ; xmm5=tmp7
+
+    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm2, [GOTOFF(ebx,PW_F1414)]  ; xmm2=tmp11
+
+    ; To avoid overflow...
+    ;
+    ; (Original)
+    ; tmp12 = -2.613125930 * z10 + z5;
+    ;
+    ; (This implementation)
+    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+    ;       = -1.613125930 * z10 - z10 + z5;
+
+    movdqa      xmm6, xmm4
+    paddw       xmm4, xmm0
+    pmulhw      xmm4, [GOTOFF(ebx,PW_F1847)]   ; xmm4=z5
+    pmulhw      xmm6, [GOTOFF(ebx,PW_MF1613)]
+    pmulhw      xmm0, [GOTOFF(ebx,PW_F1082)]
+    psubw       xmm6, xmm7
+    psubw       xmm0, xmm4              ; xmm0=tmp10
+    paddw       xmm6, xmm4              ; xmm6=tmp12
+
+    ; -- Final output stage
+
+    psubw       xmm6, xmm5              ; xmm6=tmp6
+    movdqa      xmm7, xmm1
+    movdqa      xmm4, xmm3
+    paddw       xmm1, xmm5              ; xmm1=data0=(00 10 20 30 40 50 60 70)
+    paddw       xmm3, xmm6              ; xmm3=data1=(01 11 21 31 41 51 61 71)
+    psraw       xmm1, (PASS1_BITS+3)    ; descale
+    psraw       xmm3, (PASS1_BITS+3)    ; descale
+    psubw       xmm7, xmm5              ; xmm7=data7=(07 17 27 37 47 57 67 77)
+    psubw       xmm4, xmm6              ; xmm4=data6=(06 16 26 36 46 56 66 76)
+    psraw       xmm7, (PASS1_BITS+3)    ; descale
+    psraw       xmm4, (PASS1_BITS+3)    ; descale
+    psubw       xmm2, xmm6              ; xmm2=tmp5
+
+    packsswb    xmm1, xmm4        ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    packsswb    xmm3, xmm7        ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp2
+    movdqa      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp3
+
+    paddw       xmm0, xmm2              ; xmm0=tmp4
+    movdqa      xmm4, xmm5
+    movdqa      xmm7, xmm6
+    paddw       xmm5, xmm2              ; xmm5=data2=(02 12 22 32 42 52 62 72)
+    paddw       xmm6, xmm0              ; xmm6=data4=(04 14 24 34 44 54 64 74)
+    psraw       xmm5, (PASS1_BITS+3)    ; descale
+    psraw       xmm6, (PASS1_BITS+3)    ; descale
+    psubw       xmm4, xmm2              ; xmm4=data5=(05 15 25 35 45 55 65 75)
+    psubw       xmm7, xmm0              ; xmm7=data3=(03 13 23 33 43 53 63 73)
+    psraw       xmm4, (PASS1_BITS+3)    ; descale
+    psraw       xmm7, (PASS1_BITS+3)    ; descale
+
+    movdqa      xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; xmm2=[PB_CENTERJSAMP]
+
+    packsswb    xmm5, xmm6        ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+    packsswb    xmm7, xmm4        ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+    paddb       xmm1, xmm2
+    paddb       xmm3, xmm2
+    paddb       xmm5, xmm2
+    paddb       xmm7, xmm2
+
+    movdqa      xmm0, xmm1        ; transpose coefficients(phase 1)
+    punpcklbw   xmm1, xmm3        ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+    punpckhbw   xmm0, xmm3        ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+    movdqa      xmm6, xmm5        ; transpose coefficients(phase 1)
+    punpcklbw   xmm5, xmm7        ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+    punpckhbw   xmm6, xmm7        ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+    movdqa      xmm4, xmm1        ; transpose coefficients(phase 2)
+    punpcklwd   xmm1, xmm5        ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+    punpckhwd   xmm4, xmm5        ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+    movdqa      xmm2, xmm6        ; transpose coefficients(phase 2)
+    punpcklwd   xmm6, xmm0        ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+    punpckhwd   xmm2, xmm0        ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+    movdqa      xmm3, xmm1        ; transpose coefficients(phase 3)
+    punpckldq   xmm1, xmm6        ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    punpckhdq   xmm3, xmm6        ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+    movdqa      xmm7, xmm4        ; transpose coefficients(phase 3)
+    punpckldq   xmm4, xmm2        ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+    punpckhdq   xmm7, xmm2        ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+    pshufd      xmm5, xmm1, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    pshufd      xmm0, xmm3, 0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+    pshufd      xmm6, xmm4, 0x4E  ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+    pshufd      xmm2, xmm7, 0x4E  ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+    mov         edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
+
+    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
+    mov         edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-avx2.asm
new file mode 100644
index 0000000000..199c7df3b6
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-avx2.asm
@@ -0,0 +1,453 @@
+;
+; jidctint.asm - accurate integer IDCT (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit inverse matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+
+%macro dotranspose 8
+    ; %5=(00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71)
+    ; %6=(03 13 23 33 43 53 63 73  02 12 22 32 42 52 62 72)
+    ; %7=(04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75)
+    ; %8=(07 17 27 37 47 57 67 77  06 16 26 36 46 56 66 76)
+
+    vpermq      %5, %1, 0xD8
+    vpermq      %6, %2, 0x72
+    vpermq      %7, %3, 0xD8
+    vpermq      %8, %4, 0x72
+    ; transpose coefficients(phase 1)
+    ; %5=(00 10 20 30 01 11 21 31  40 50 60 70 41 51 61 71)
+    ; %6=(02 12 22 32 03 13 23 33  42 52 62 72 43 53 63 73)
+    ; %7=(04 14 24 34 05 15 25 35  44 54 64 74 45 55 65 75)
+    ; %8=(06 16 26 36 07 17 27 37  46 56 66 76 47 57 67 77)
+
+    vpunpcklwd  %1, %5, %6
+    vpunpckhwd  %2, %5, %6
+    vpunpcklwd  %3, %7, %8
+    vpunpckhwd  %4, %7, %8
+    ; transpose coefficients(phase 2)
+    ; %1=(00 02 10 12 20 22 30 32  40 42 50 52 60 62 70 72)
+    ; %2=(01 03 11 13 21 23 31 33  41 43 51 53 61 63 71 73)
+    ; %3=(04 06 14 16 24 26 34 36  44 46 54 56 64 66 74 76)
+    ; %4=(05 07 15 17 25 27 35 37  45 47 55 57 65 67 75 77)
+
+    vpunpcklwd  %5, %1, %2
+    vpunpcklwd  %6, %3, %4
+    vpunpckhwd  %7, %1, %2
+    vpunpckhwd  %8, %3, %4
+    ; transpose coefficients(phase 3)
+    ; %5=(00 01 02 03 10 11 12 13  40 41 42 43 50 51 52 53)
+    ; %6=(04 05 06 07 14 15 16 17  44 45 46 47 54 55 56 57)
+    ; %7=(20 21 22 23 30 31 32 33  60 61 62 63 70 71 72 73)
+    ; %8=(24 25 26 27 34 35 36 37  64 65 66 67 74 75 76 77)
+
+    vpunpcklqdq %1, %5, %6
+    vpunpckhqdq %2, %5, %6
+    vpunpcklqdq %3, %7, %8
+    vpunpckhqdq %4, %7, %8
+    ; transpose coefficients(phase 4)
+    ; %1=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
+    ; %2=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
+    ; %3=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
+    ; %4=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
+%endmacro
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit accurate integer inverse DCT using AVX2 instructions
+; %1-%4:  Input/output registers
+; %5-%12: Temp registers
+; %9:     Pass (1 or 2)
+
+%macro dodct 13
+    ; -- Even part
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    vperm2i128  %6, %3, %3, 0x01        ; %6=in6_2
+    vpunpcklwd  %5, %3, %6              ; %5=in26_62L
+    vpunpckhwd  %6, %3, %6              ; %6=in26_62H
+    vpmaddwd    %5, %5, [GOTOFF(ebx,PW_F130_F054_MF130_F054)]  ; %5=tmp3_2L
+    vpmaddwd    %6, %6, [GOTOFF(ebx,PW_F130_F054_MF130_F054)]  ; %6=tmp3_2H
+
+    vperm2i128  %7, %1, %1, 0x01        ; %7=in4_0
+    vpsignw     %1, %1, [GOTOFF(ebx,PW_1_NEG1)]
+    vpaddw      %7, %7, %1              ; %7=(in0+in4)_(in0-in4)
+
+    vpxor       %1, %1, %1
+    vpunpcklwd  %8, %1, %7              ; %8=tmp0_1L
+    vpunpckhwd  %1, %1, %7              ; %1=tmp0_1H
+    vpsrad      %8, %8, (16-CONST_BITS)  ; vpsrad %8,16 & vpslld %8,CONST_BITS
+    vpsrad      %1, %1, (16-CONST_BITS)  ; vpsrad %1,16 & vpslld %1,CONST_BITS
+
+    vpsubd      %3, %8, %5
+    vmovdqu     %11, %3                 ; %11=tmp0_1L-tmp3_2L=tmp13_12L
+    vpaddd      %3, %8, %5
+    vmovdqu     %9, %3                  ; %9=tmp0_1L+tmp3_2L=tmp10_11L
+    vpsubd      %3, %1, %6
+    vmovdqu     %12, %3                 ; %12=tmp0_1H-tmp3_2H=tmp13_12H
+    vpaddd      %3, %1, %6
+    vmovdqu     %10, %3                 ; %10=tmp0_1H+tmp3_2H=tmp10_11H
+
+    ; -- Odd part
+
+    vpaddw      %1, %4, %2              ; %1=in7_5+in3_1=z3_4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    vperm2i128  %8, %1, %1, 0x01        ; %8=z4_3
+    vpunpcklwd  %7, %1, %8              ; %7=z34_43L
+    vpunpckhwd  %8, %1, %8              ; %8=z34_43H
+    vpmaddwd    %7, %7, [GOTOFF(ebx,PW_MF078_F117_F078_F117)]  ; %7=z3_4L
+    vpmaddwd    %8, %8, [GOTOFF(ebx,PW_MF078_F117_F078_F117)]  ; %8=z3_4H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    vperm2i128  %2, %2, %2, 0x01        ; %2=in1_3
+    vpunpcklwd  %3, %4, %2              ; %3=in71_53L
+    vpunpckhwd  %4, %4, %2              ; %4=in71_53H
+
+    vpmaddwd    %5, %3, [GOTOFF(ebx,PW_MF060_MF089_MF050_MF256)]  ; %5=tmp0_1L
+    vpmaddwd    %6, %4, [GOTOFF(ebx,PW_MF060_MF089_MF050_MF256)]  ; %6=tmp0_1H
+    vpaddd      %5, %5, %7              ; %5=tmp0_1L+z3_4L=tmp0_1L
+    vpaddd      %6, %6, %8              ; %6=tmp0_1H+z3_4H=tmp0_1H
+
+    vpmaddwd    %3, %3, [GOTOFF(ebx,PW_MF089_F060_MF256_F050)]  ; %3=tmp3_2L
+    vpmaddwd    %4, %4, [GOTOFF(ebx,PW_MF089_F060_MF256_F050)]  ; %4=tmp3_2H
+    vperm2i128  %7, %7, %7, 0x01        ; %7=z4_3L
+    vperm2i128  %8, %8, %8, 0x01        ; %8=z4_3H
+    vpaddd      %7, %3, %7              ; %7=tmp3_2L+z4_3L=tmp3_2L
+    vpaddd      %8, %4, %8              ; %8=tmp3_2H+z4_3H=tmp3_2H
+
+    ; -- Final output stage
+
+    vmovdqu     %3, %9
+    vmovdqu     %4, %10
+
+    vpaddd      %1, %3, %7              ; %1=tmp10_11L+tmp3_2L=data0_1L
+    vpaddd      %2, %4, %8              ; %2=tmp10_11H+tmp3_2H=data0_1H
+    vpaddd      %1, %1, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpaddd      %2, %2, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpsrad      %1, %1, DESCALE_P %+ %13
+    vpsrad      %2, %2, DESCALE_P %+ %13
+    vpackssdw   %1, %1, %2              ; %1=data0_1
+
+    vpsubd      %3, %3, %7              ; %3=tmp10_11L-tmp3_2L=data7_6L
+    vpsubd      %4, %4, %8              ; %4=tmp10_11H-tmp3_2H=data7_6H
+    vpaddd      %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpaddd      %4, %4, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpsrad      %3, %3, DESCALE_P %+ %13
+    vpsrad      %4, %4, DESCALE_P %+ %13
+    vpackssdw   %4, %3, %4              ; %4=data7_6
+
+    vmovdqu     %7, %11
+    vmovdqu     %8, %12
+
+    vpaddd      %2, %7, %5              ; %7=tmp13_12L+tmp0_1L=data3_2L
+    vpaddd      %3, %8, %6              ; %8=tmp13_12H+tmp0_1H=data3_2H
+    vpaddd      %2, %2, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpaddd      %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpsrad      %2, %2, DESCALE_P %+ %13
+    vpsrad      %3, %3, DESCALE_P %+ %13
+    vpackssdw   %2, %2, %3              ; %2=data3_2
+
+    vpsubd      %3, %7, %5              ; %7=tmp13_12L-tmp0_1L=data4_5L
+    vpsubd      %6, %8, %6              ; %8=tmp13_12H-tmp0_1H=data4_5H
+    vpaddd      %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpaddd      %6, %6, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpsrad      %3, %3, DESCALE_P %+ %13
+    vpsrad      %6, %6, DESCALE_P %+ %13
+    vpackssdw   %3, %3, %6              ; %3=data4_5
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_islow_avx2)
+
+EXTN(jconst_idct_islow_avx2):
+
+PW_F130_F054_MF130_F054    times 4  dw  (F_0_541 + F_0_765),  F_0_541
+                           times 4  dw  (F_0_541 - F_1_847),  F_0_541
+PW_MF078_F117_F078_F117    times 4  dw  (F_1_175 - F_1_961),  F_1_175
+                           times 4  dw  (F_1_175 - F_0_390),  F_1_175
+PW_MF060_MF089_MF050_MF256 times 4  dw  (F_0_298 - F_0_899), -F_0_899
+                           times 4  dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF089_F060_MF256_F050   times 4  dw -F_0_899, (F_1_501 - F_0_899)
+                           times 4  dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1              times 8  dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2              times 8  dd  1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP             times 32 db  CENTERJSAMPLE
+PW_1_NEG1                  times 8  dw  1
+                           times 8  dw -1
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_avx2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; jpeg_component_info *compptr
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+                                        ; ymmword wk[WK_NUM]
+%define WK_NUM         4
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_islow_avx2)
+
+EXTN(jsimd_idct_islow_avx2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    vpor        xmm0, xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    vpor        xmm1, xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    vpor        xmm0, xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    vpor        xmm1, xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    vpor        xmm0, xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    vpor        xmm1, xmm1, xmm0
+    vpacksswb   xmm1, xmm1, xmm1
+    vpacksswb   xmm1, xmm1, xmm1
+    movd        eax, xmm1
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    vpmullw     xmm5, xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    vpsllw      xmm5, xmm5, PASS1_BITS
+
+    vpunpcklwd  xmm4, xmm5, xmm5        ; xmm4=(00 00 01 01 02 02 03 03)
+    vpunpckhwd  xmm5, xmm5, xmm5        ; xmm5=(04 04 05 05 06 06 07 07)
+    vinserti128 ymm4, ymm4, xmm5, 1
+
+    vpshufd     ymm0, ymm4, 0x00        ; ymm0=col0_4=(00 00 00 00 00 00 00 00  04 04 04 04 04 04 04 04)
+    vpshufd     ymm1, ymm4, 0x55        ; ymm1=col1_5=(01 01 01 01 01 01 01 01  05 05 05 05 05 05 05 05)
+    vpshufd     ymm2, ymm4, 0xAA        ; ymm2=col2_6=(02 02 02 02 02 02 02 02  06 06 06 06 06 06 06 06)
+    vpshufd     ymm3, ymm4, 0xFF        ; ymm3=col3_7=(03 03 03 03 03 03 03 03  07 07 07 07 07 07 07 07)
+
+    jmp         near .column_end
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    vmovdqu     ymm4, YMMWORD [YMMBLOCK(0,0,esi,SIZEOF_JCOEF)]  ; ymm4=in0_1
+    vmovdqu     ymm5, YMMWORD [YMMBLOCK(2,0,esi,SIZEOF_JCOEF)]  ; ymm5=in2_3
+    vmovdqu     ymm6, YMMWORD [YMMBLOCK(4,0,esi,SIZEOF_JCOEF)]  ; ymm6=in4_5
+    vmovdqu     ymm7, YMMWORD [YMMBLOCK(6,0,esi,SIZEOF_JCOEF)]  ; ymm7=in6_7
+    vpmullw     ymm4, ymm4, YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    vpmullw     ymm5, ymm5, YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    vpmullw     ymm6, ymm6, YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    vpmullw     ymm7, ymm7, YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    vperm2i128  ymm0, ymm4, ymm6, 0x20  ; ymm0=in0_4
+    vperm2i128  ymm1, ymm5, ymm4, 0x31  ; ymm1=in3_1
+    vperm2i128  ymm2, ymm5, ymm7, 0x20  ; ymm2=in2_6
+    vperm2i128  ymm3, ymm7, ymm6, 0x31  ; ymm3=in7_5
+
+    dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 1
+    ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6
+
+    dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+    ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7
+
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows.
+
+    mov         eax, [original_ebp]
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+
+    vperm2i128  ymm4, ymm3, ymm1, 0x31  ; ymm3=in7_5
+    vperm2i128  ymm1, ymm3, ymm1, 0x20  ; ymm1=in3_1
+
+    dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 2
+    ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6
+
+    dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+    ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7
+
+    vpacksswb   ymm0, ymm0, ymm1        ; ymm0=data01_45
+    vpacksswb   ymm1, ymm2, ymm4        ; ymm1=data23_67
+    vpaddb      ymm0, ymm0, [GOTOFF(ebx,PB_CENTERJSAMP)]
+    vpaddb      ymm1, ymm1, [GOTOFF(ebx,PB_CENTERJSAMP)]
+
+    vextracti128 xmm6, ymm1, 1          ; xmm3=data67
+    vextracti128 xmm4, ymm0, 1          ; xmm2=data45
+    vextracti128 xmm2, ymm1, 0          ; xmm1=data23
+    vextracti128 xmm0, ymm0, 0          ; xmm0=data01
+
+    vpshufd     xmm1, xmm0, 0x4E  ; xmm1=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    vpshufd     xmm3, xmm2, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+    vpshufd     xmm5, xmm4, 0x4E  ; xmm5=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+    vpshufd     xmm7, xmm6, 0x4E  ; xmm7=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+    vzeroupper
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm0
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
+
+    mov         edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+    mov         edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         esi, JSAMPROW [edi+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
+
+    mov         edx, JSAMPROW [edi+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-mmx.asm
new file mode 100644
index 0000000000..f15c8d34bc
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-mmx.asm
@@ -0,0 +1,851 @@
+;
+; jidctint.asm - accurate integer IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_islow_mmx)
+
+EXTN(jconst_idct_islow_mmx):
+
+PW_F130_F054   times 2 dw  (F_0_541 + F_0_765),  F_0_541
+PW_F054_MF130  times 2 dw  F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117  times 2 dw  (F_1_175 - F_1_961),  F_1_175
+PW_F117_F078   times 2 dw  F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 2 dw  (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060  times 2 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 2 dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050  times 2 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1  times 2 dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2  times 2 dd  1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP times 8 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_mmx(void *dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; jpeg_component_info *compptr
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         12
+%define workspace      wk(0) - DCTSIZE2 * SIZEOF_JCOEF
+                                        ; JCOEF workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_islow_mmx)
+
+EXTN(jsimd_idct_islow_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [workspace]
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+    lea         edi, [workspace]                 ; JCOEF *wsptr
+    mov         ecx, DCTSIZE/4                   ; ctr
+    alignx      16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         short .columnDCT
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         mm1, mm0
+    packsswb    mm1, mm1
+    movd        eax, mm1
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    psllw       mm0, PASS1_BITS
+
+    movq        mm2, mm0                ; mm0=in0=(00 01 02 03)
+    punpcklwd   mm0, mm0                ; mm0=(00 00 01 01)
+    punpckhwd   mm2, mm2                ; mm2=(02 02 03 03)
+
+    movq        mm1, mm0
+    punpckldq   mm0, mm0                ; mm0=(00 00 00 00)
+    punpckhdq   mm1, mm1                ; mm1=(01 01 01 01)
+    movq        mm3, mm2
+    punpckldq   mm2, mm2                ; mm2=(02 02 02 02)
+    punpckhdq   mm3, mm3                ; mm3=(03 03 03 03)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+    jmp         near .nextcolumn
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    movq        mm4, mm1                ; mm1=in2=z2
+    movq        mm5, mm1
+    punpcklwd   mm4, mm3                ; mm3=in6=z3
+    punpckhwd   mm5, mm3
+    movq        mm1, mm4
+    movq        mm3, mm5
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=tmp3L
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F130_F054)]   ; mm5=tmp3H
+    pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=tmp2L
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F054_MF130)]  ; mm3=tmp2H
+
+    movq        mm6, mm0
+    paddw       mm0, mm2                ; mm0=in0+in4
+    psubw       mm6, mm2                ; mm6=in0-in4
+
+    pxor        mm7, mm7
+    pxor        mm2, mm2
+    punpcklwd   mm7, mm0                ; mm7=tmp0L
+    punpckhwd   mm2, mm0                ; mm2=tmp0H
+    psrad       mm7, (16-CONST_BITS)    ; psrad mm7,16 & pslld mm7,CONST_BITS
+    psrad       mm2, (16-CONST_BITS)    ; psrad mm2,16 & pslld mm2,CONST_BITS
+
+    movq        mm0, mm7
+    paddd       mm7, mm4                ; mm7=tmp10L
+    psubd       mm0, mm4                ; mm0=tmp13L
+    movq        mm4, mm2
+    paddd       mm2, mm5                ; mm2=tmp10H
+    psubd       mm4, mm5                ; mm4=tmp13H
+
+    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp10L
+    movq        MMWORD [wk(1)], mm2     ; wk(1)=tmp10H
+    movq        MMWORD [wk(2)], mm0     ; wk(2)=tmp13L
+    movq        MMWORD [wk(3)], mm4     ; wk(3)=tmp13H
+
+    pxor        mm5, mm5
+    pxor        mm7, mm7
+    punpcklwd   mm5, mm6                ; mm5=tmp1L
+    punpckhwd   mm7, mm6                ; mm7=tmp1H
+    psrad       mm5, (16-CONST_BITS)    ; psrad mm5,16 & pslld mm5,CONST_BITS
+    psrad       mm7, (16-CONST_BITS)    ; psrad mm7,16 & pslld mm7,CONST_BITS
+
+    movq        mm2, mm5
+    paddd       mm5, mm1                ; mm5=tmp11L
+    psubd       mm2, mm1                ; mm2=tmp12L
+    movq        mm0, mm7
+    paddd       mm7, mm3                ; mm7=tmp11H
+    psubd       mm0, mm3                ; mm0=tmp12H
+
+    movq        MMWORD [wk(4)], mm5     ; wk(4)=tmp11L
+    movq        MMWORD [wk(5)], mm7     ; wk(5)=tmp11H
+    movq        MMWORD [wk(6)], mm2     ; wk(6)=tmp12L
+    movq        MMWORD [wk(7)], mm0     ; wk(7)=tmp12H
+
+    ; -- Odd part
+
+    movq        mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movq        mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movq        mm5, mm6
+    movq        mm7, mm4
+    paddw       mm5, mm3                ; mm5=z3
+    paddw       mm7, mm1                ; mm7=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movq        mm2, mm5
+    movq        mm0, mm5
+    punpcklwd   mm2, mm7
+    punpckhwd   mm0, mm7
+    movq        mm5, mm2
+    movq        mm7, mm0
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF078_F117)]  ; mm2=z3L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_MF078_F117)]  ; mm0=z3H
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F117_F078)]   ; mm5=z4L
+    pmaddwd     mm7, [GOTOFF(ebx,PW_F117_F078)]   ; mm7=z4H
+
+    movq        MMWORD [wk(10)], mm2    ; wk(10)=z3L
+    movq        MMWORD [wk(11)], mm0    ; wk(11)=z3H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    movq        mm2, mm3
+    movq        mm0, mm3
+    punpcklwd   mm2, mm4
+    punpckhwd   mm0, mm4
+    movq        mm3, mm2
+    movq        mm4, mm0
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm2=tmp0L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm0=tmp0H
+    pmaddwd     mm3, [GOTOFF(ebx,PW_MF089_F060)]   ; mm3=tmp3L
+    pmaddwd     mm4, [GOTOFF(ebx,PW_MF089_F060)]   ; mm4=tmp3H
+
+    paddd       mm2, MMWORD [wk(10)]    ; mm2=tmp0L
+    paddd       mm0, MMWORD [wk(11)]    ; mm0=tmp0H
+    paddd       mm3, mm5                ; mm3=tmp3L
+    paddd       mm4, mm7                ; mm4=tmp3H
+
+    movq        MMWORD [wk(8)], mm2     ; wk(8)=tmp0L
+    movq        MMWORD [wk(9)], mm0     ; wk(9)=tmp0H
+
+    movq        mm2, mm1
+    movq        mm0, mm1
+    punpcklwd   mm2, mm6
+    punpckhwd   mm0, mm6
+    movq        mm1, mm2
+    movq        mm6, mm0
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm2=tmp1L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm0=tmp1H
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF256_F050)]   ; mm1=tmp2L
+    pmaddwd     mm6, [GOTOFF(ebx,PW_MF256_F050)]   ; mm6=tmp2H
+
+    paddd       mm2, mm5                ; mm2=tmp1L
+    paddd       mm0, mm7                ; mm0=tmp1H
+    paddd       mm1, MMWORD [wk(10)]    ; mm1=tmp2L
+    paddd       mm6, MMWORD [wk(11)]    ; mm6=tmp2H
+
+    movq        MMWORD [wk(10)], mm2    ; wk(10)=tmp1L
+    movq        MMWORD [wk(11)], mm0    ; wk(11)=tmp1H
+
+    ; -- Final output stage
+
+    movq        mm5, MMWORD [wk(0)]     ; mm5=tmp10L
+    movq        mm7, MMWORD [wk(1)]     ; mm7=tmp10H
+
+    movq        mm2, mm5
+    movq        mm0, mm7
+    paddd       mm5, mm3                ; mm5=data0L
+    paddd       mm7, mm4                ; mm7=data0H
+    psubd       mm2, mm3                ; mm2=data7L
+    psubd       mm0, mm4                ; mm0=data7H
+
+    movq        mm3, [GOTOFF(ebx,PD_DESCALE_P1)]  ; mm3=[PD_DESCALE_P1]
+
+    paddd       mm5, mm3
+    paddd       mm7, mm3
+    psrad       mm5, DESCALE_P1
+    psrad       mm7, DESCALE_P1
+    paddd       mm2, mm3
+    paddd       mm0, mm3
+    psrad       mm2, DESCALE_P1
+    psrad       mm0, DESCALE_P1
+
+    packssdw    mm5, mm7                ; mm5=data0=(00 01 02 03)
+    packssdw    mm2, mm0                ; mm2=data7=(70 71 72 73)
+
+    movq        mm4, MMWORD [wk(4)]     ; mm4=tmp11L
+    movq        mm3, MMWORD [wk(5)]     ; mm3=tmp11H
+
+    movq        mm7, mm4
+    movq        mm0, mm3
+    paddd       mm4, mm1                ; mm4=data1L
+    paddd       mm3, mm6                ; mm3=data1H
+    psubd       mm7, mm1                ; mm7=data6L
+    psubd       mm0, mm6                ; mm0=data6H
+
+    movq        mm1, [GOTOFF(ebx,PD_DESCALE_P1)]  ; mm1=[PD_DESCALE_P1]
+
+    paddd       mm4, mm1
+    paddd       mm3, mm1
+    psrad       mm4, DESCALE_P1
+    psrad       mm3, DESCALE_P1
+    paddd       mm7, mm1
+    paddd       mm0, mm1
+    psrad       mm7, DESCALE_P1
+    psrad       mm0, DESCALE_P1
+
+    packssdw    mm4, mm3                ; mm4=data1=(10 11 12 13)
+    packssdw    mm7, mm0                ; mm7=data6=(60 61 62 63)
+
+    movq        mm6, mm5                ; transpose coefficients(phase 1)
+    punpcklwd   mm5, mm4                ; mm5=(00 10 01 11)
+    punpckhwd   mm6, mm4                ; mm6=(02 12 03 13)
+    movq        mm1, mm7                ; transpose coefficients(phase 1)
+    punpcklwd   mm7, mm2                ; mm7=(60 70 61 71)
+    punpckhwd   mm1, mm2                ; mm1=(62 72 63 73)
+
+    movq        mm3, MMWORD [wk(6)]     ; mm3=tmp12L
+    movq        mm0, MMWORD [wk(7)]     ; mm0=tmp12H
+    movq        mm4, MMWORD [wk(10)]    ; mm4=tmp1L
+    movq        mm2, MMWORD [wk(11)]    ; mm2=tmp1H
+
+    movq        MMWORD [wk(0)], mm5     ; wk(0)=(00 10 01 11)
+    movq        MMWORD [wk(1)], mm6     ; wk(1)=(02 12 03 13)
+    movq        MMWORD [wk(4)], mm7     ; wk(4)=(60 70 61 71)
+    movq        MMWORD [wk(5)], mm1     ; wk(5)=(62 72 63 73)
+
+    movq        mm5, mm3
+    movq        mm6, mm0
+    paddd       mm3, mm4                ; mm3=data2L
+    paddd       mm0, mm2                ; mm0=data2H
+    psubd       mm5, mm4                ; mm5=data5L
+    psubd       mm6, mm2                ; mm6=data5H
+
+    movq        mm7, [GOTOFF(ebx,PD_DESCALE_P1)]  ; mm7=[PD_DESCALE_P1]
+
+    paddd       mm3, mm7
+    paddd       mm0, mm7
+    psrad       mm3, DESCALE_P1
+    psrad       mm0, DESCALE_P1
+    paddd       mm5, mm7
+    paddd       mm6, mm7
+    psrad       mm5, DESCALE_P1
+    psrad       mm6, DESCALE_P1
+
+    packssdw    mm3, mm0                ; mm3=data2=(20 21 22 23)
+    packssdw    mm5, mm6                ; mm5=data5=(50 51 52 53)
+
+    movq        mm1, MMWORD [wk(2)]     ; mm1=tmp13L
+    movq        mm4, MMWORD [wk(3)]     ; mm4=tmp13H
+    movq        mm2, MMWORD [wk(8)]     ; mm2=tmp0L
+    movq        mm7, MMWORD [wk(9)]     ; mm7=tmp0H
+
+    movq        mm0, mm1
+    movq        mm6, mm4
+    paddd       mm1, mm2                ; mm1=data3L
+    paddd       mm4, mm7                ; mm4=data3H
+    psubd       mm0, mm2                ; mm0=data4L
+    psubd       mm6, mm7                ; mm6=data4H
+
+    movq        mm2, [GOTOFF(ebx,PD_DESCALE_P1)]  ; mm2=[PD_DESCALE_P1]
+
+    paddd       mm1, mm2
+    paddd       mm4, mm2
+    psrad       mm1, DESCALE_P1
+    psrad       mm4, DESCALE_P1
+    paddd       mm0, mm2
+    paddd       mm6, mm2
+    psrad       mm0, DESCALE_P1
+    psrad       mm6, DESCALE_P1
+
+    packssdw    mm1, mm4                ; mm1=data3=(30 31 32 33)
+    packssdw    mm0, mm6                ; mm0=data4=(40 41 42 43)
+
+    movq        mm7, MMWORD [wk(0)]     ; mm7=(00 10 01 11)
+    movq        mm2, MMWORD [wk(1)]     ; mm2=(02 12 03 13)
+
+    movq        mm4, mm3                ; transpose coefficients(phase 1)
+    punpcklwd   mm3, mm1                ; mm3=(20 30 21 31)
+    punpckhwd   mm4, mm1                ; mm4=(22 32 23 33)
+    movq        mm6, mm0                ; transpose coefficients(phase 1)
+    punpcklwd   mm0, mm5                ; mm0=(40 50 41 51)
+    punpckhwd   mm6, mm5                ; mm6=(42 52 43 53)
+
+    movq        mm1, mm7                ; transpose coefficients(phase 2)
+    punpckldq   mm7, mm3                ; mm7=(00 10 20 30)
+    punpckhdq   mm1, mm3                ; mm1=(01 11 21 31)
+    movq        mm5, mm2                ; transpose coefficients(phase 2)
+    punpckldq   mm2, mm4                ; mm2=(02 12 22 32)
+    punpckhdq   mm5, mm4                ; mm5=(03 13 23 33)
+
+    movq        mm3, MMWORD [wk(4)]     ; mm3=(60 70 61 71)
+    movq        mm4, MMWORD [wk(5)]     ; mm4=(62 72 63 73)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+    movq        mm7, mm0                ; transpose coefficients(phase 2)
+    punpckldq   mm0, mm3                ; mm0=(40 50 60 70)
+    punpckhdq   mm7, mm3                ; mm7=(41 51 61 71)
+    movq        mm1, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm4                ; mm6=(42 52 62 72)
+    punpckhdq   mm1, mm4                ; mm1=(43 53 63 73)
+
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7
+    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6
+    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1
+
+.nextcolumn:
+    add         esi, byte 4*SIZEOF_JCOEF            ; coef_block
+    add         edx, byte 4*SIZEOF_ISLOW_MULT_TYPE  ; quantptr
+    add         edi, byte 4*DCTSIZE*SIZEOF_JCOEF    ; wsptr
+    dec         ecx                                 ; ctr
+    jnz         near .columnloop
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    lea         esi, [workspace]                   ; JCOEF *wsptr
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+    mov         ecx, DCTSIZE/4                     ; ctr
+    alignx      16, 7
+.rowloop:
+
+    ; -- Even part
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    movq        mm4, mm1                ; mm1=in2=z2
+    movq        mm5, mm1
+    punpcklwd   mm4, mm3                ; mm3=in6=z3
+    punpckhwd   mm5, mm3
+    movq        mm1, mm4
+    movq        mm3, mm5
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=tmp3L
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F130_F054)]   ; mm5=tmp3H
+    pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=tmp2L
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F054_MF130)]  ; mm3=tmp2H
+
+    movq        mm6, mm0
+    paddw       mm0, mm2                ; mm0=in0+in4
+    psubw       mm6, mm2                ; mm6=in0-in4
+
+    pxor        mm7, mm7
+    pxor        mm2, mm2
+    punpcklwd   mm7, mm0                ; mm7=tmp0L
+    punpckhwd   mm2, mm0                ; mm2=tmp0H
+    psrad       mm7, (16-CONST_BITS)    ; psrad mm7,16 & pslld mm7,CONST_BITS
+    psrad       mm2, (16-CONST_BITS)    ; psrad mm2,16 & pslld mm2,CONST_BITS
+
+    movq        mm0, mm7
+    paddd       mm7, mm4                ; mm7=tmp10L
+    psubd       mm0, mm4                ; mm0=tmp13L
+    movq        mm4, mm2
+    paddd       mm2, mm5                ; mm2=tmp10H
+    psubd       mm4, mm5                ; mm4=tmp13H
+
+    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp10L
+    movq        MMWORD [wk(1)], mm2     ; wk(1)=tmp10H
+    movq        MMWORD [wk(2)], mm0     ; wk(2)=tmp13L
+    movq        MMWORD [wk(3)], mm4     ; wk(3)=tmp13H
+
+    pxor        mm5, mm5
+    pxor        mm7, mm7
+    punpcklwd   mm5, mm6                ; mm5=tmp1L
+    punpckhwd   mm7, mm6                ; mm7=tmp1H
+    psrad       mm5, (16-CONST_BITS)    ; psrad mm5,16 & pslld mm5,CONST_BITS
+    psrad       mm7, (16-CONST_BITS)    ; psrad mm7,16 & pslld mm7,CONST_BITS
+
+    movq        mm2, mm5
+    paddd       mm5, mm1                ; mm5=tmp11L
+    psubd       mm2, mm1                ; mm2=tmp12L
+    movq        mm0, mm7
+    paddd       mm7, mm3                ; mm7=tmp11H
+    psubd       mm0, mm3                ; mm0=tmp12H
+
+    movq        MMWORD [wk(4)], mm5     ; wk(4)=tmp11L
+    movq        MMWORD [wk(5)], mm7     ; wk(5)=tmp11H
+    movq        MMWORD [wk(6)], mm2     ; wk(6)=tmp12L
+    movq        MMWORD [wk(7)], mm0     ; wk(7)=tmp12H
+
+    ; -- Odd part
+
+    movq        mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+    movq        mm5, mm6
+    movq        mm7, mm4
+    paddw       mm5, mm3                ; mm5=z3
+    paddw       mm7, mm1                ; mm7=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movq        mm2, mm5
+    movq        mm0, mm5
+    punpcklwd   mm2, mm7
+    punpckhwd   mm0, mm7
+    movq        mm5, mm2
+    movq        mm7, mm0
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF078_F117)]  ; mm2=z3L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_MF078_F117)]  ; mm0=z3H
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F117_F078)]   ; mm5=z4L
+    pmaddwd     mm7, [GOTOFF(ebx,PW_F117_F078)]   ; mm7=z4H
+
+    movq        MMWORD [wk(10)], mm2    ; wk(10)=z3L
+    movq        MMWORD [wk(11)], mm0    ; wk(11)=z3H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    movq        mm2, mm3
+    movq        mm0, mm3
+    punpcklwd   mm2, mm4
+    punpckhwd   mm0, mm4
+    movq        mm3, mm2
+    movq        mm4, mm0
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm2=tmp0L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm0=tmp0H
+    pmaddwd     mm3, [GOTOFF(ebx,PW_MF089_F060)]   ; mm3=tmp3L
+    pmaddwd     mm4, [GOTOFF(ebx,PW_MF089_F060)]   ; mm4=tmp3H
+
+    paddd       mm2, MMWORD [wk(10)]    ; mm2=tmp0L
+    paddd       mm0, MMWORD [wk(11)]    ; mm0=tmp0H
+    paddd       mm3, mm5                ; mm3=tmp3L
+    paddd       mm4, mm7                ; mm4=tmp3H
+
+    movq        MMWORD [wk(8)], mm2     ; wk(8)=tmp0L
+    movq        MMWORD [wk(9)], mm0     ; wk(9)=tmp0H
+
+    movq        mm2, mm1
+    movq        mm0, mm1
+    punpcklwd   mm2, mm6
+    punpckhwd   mm0, mm6
+    movq        mm1, mm2
+    movq        mm6, mm0
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm2=tmp1L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm0=tmp1H
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF256_F050)]   ; mm1=tmp2L
+    pmaddwd     mm6, [GOTOFF(ebx,PW_MF256_F050)]   ; mm6=tmp2H
+
+    paddd       mm2, mm5                ; mm2=tmp1L
+    paddd       mm0, mm7                ; mm0=tmp1H
+    paddd       mm1, MMWORD [wk(10)]    ; mm1=tmp2L
+    paddd       mm6, MMWORD [wk(11)]    ; mm6=tmp2H
+
+    movq        MMWORD [wk(10)], mm2    ; wk(10)=tmp1L
+    movq        MMWORD [wk(11)], mm0    ; wk(11)=tmp1H
+
+    ; -- Final output stage
+
+    movq        mm5, MMWORD [wk(0)]     ; mm5=tmp10L
+    movq        mm7, MMWORD [wk(1)]     ; mm7=tmp10H
+
+    movq        mm2, mm5
+    movq        mm0, mm7
+    paddd       mm5, mm3                ; mm5=data0L
+    paddd       mm7, mm4                ; mm7=data0H
+    psubd       mm2, mm3                ; mm2=data7L
+    psubd       mm0, mm4                ; mm0=data7H
+
+    movq        mm3, [GOTOFF(ebx,PD_DESCALE_P2)]  ; mm3=[PD_DESCALE_P2]
+
+    paddd       mm5, mm3
+    paddd       mm7, mm3
+    psrad       mm5, DESCALE_P2
+    psrad       mm7, DESCALE_P2
+    paddd       mm2, mm3
+    paddd       mm0, mm3
+    psrad       mm2, DESCALE_P2
+    psrad       mm0, DESCALE_P2
+
+    packssdw    mm5, mm7                ; mm5=data0=(00 10 20 30)
+    packssdw    mm2, mm0                ; mm2=data7=(07 17 27 37)
+
+    movq        mm4, MMWORD [wk(4)]     ; mm4=tmp11L
+    movq        mm3, MMWORD [wk(5)]     ; mm3=tmp11H
+
+    movq        mm7, mm4
+    movq        mm0, mm3
+    paddd       mm4, mm1                ; mm4=data1L
+    paddd       mm3, mm6                ; mm3=data1H
+    psubd       mm7, mm1                ; mm7=data6L
+    psubd       mm0, mm6                ; mm0=data6H
+
+    movq        mm1, [GOTOFF(ebx,PD_DESCALE_P2)]  ; mm1=[PD_DESCALE_P2]
+
+    paddd       mm4, mm1
+    paddd       mm3, mm1
+    psrad       mm4, DESCALE_P2
+    psrad       mm3, DESCALE_P2
+    paddd       mm7, mm1
+    paddd       mm0, mm1
+    psrad       mm7, DESCALE_P2
+    psrad       mm0, DESCALE_P2
+
+    packssdw    mm4, mm3                ; mm4=data1=(01 11 21 31)
+    packssdw    mm7, mm0                ; mm7=data6=(06 16 26 36)
+
+    packsswb    mm5, mm7                ; mm5=(00 10 20 30 06 16 26 36)
+    packsswb    mm4, mm2                ; mm4=(01 11 21 31 07 17 27 37)
+
+    movq        mm6, MMWORD [wk(6)]     ; mm6=tmp12L
+    movq        mm1, MMWORD [wk(7)]     ; mm1=tmp12H
+    movq        mm3, MMWORD [wk(10)]    ; mm3=tmp1L
+    movq        mm0, MMWORD [wk(11)]    ; mm0=tmp1H
+
+    movq        MMWORD [wk(0)], mm5     ; wk(0)=(00 10 20 30 06 16 26 36)
+    movq        MMWORD [wk(1)], mm4     ; wk(1)=(01 11 21 31 07 17 27 37)
+
+    movq        mm7, mm6
+    movq        mm2, mm1
+    paddd       mm6, mm3                ; mm6=data2L
+    paddd       mm1, mm0                ; mm1=data2H
+    psubd       mm7, mm3                ; mm7=data5L
+    psubd       mm2, mm0                ; mm2=data5H
+
+    movq        mm5, [GOTOFF(ebx,PD_DESCALE_P2)]  ; mm5=[PD_DESCALE_P2]
+
+    paddd       mm6, mm5
+    paddd       mm1, mm5
+    psrad       mm6, DESCALE_P2
+    psrad       mm1, DESCALE_P2
+    paddd       mm7, mm5
+    paddd       mm2, mm5
+    psrad       mm7, DESCALE_P2
+    psrad       mm2, DESCALE_P2
+
+    packssdw    mm6, mm1                ; mm6=data2=(02 12 22 32)
+    packssdw    mm7, mm2                ; mm7=data5=(05 15 25 35)
+
+    movq        mm4, MMWORD [wk(2)]     ; mm4=tmp13L
+    movq        mm3, MMWORD [wk(3)]     ; mm3=tmp13H
+    movq        mm0, MMWORD [wk(8)]     ; mm0=tmp0L
+    movq        mm5, MMWORD [wk(9)]     ; mm5=tmp0H
+
+    movq        mm1, mm4
+    movq        mm2, mm3
+    paddd       mm4, mm0                ; mm4=data3L
+    paddd       mm3, mm5                ; mm3=data3H
+    psubd       mm1, mm0                ; mm1=data4L
+    psubd       mm2, mm5                ; mm2=data4H
+
+    movq        mm0, [GOTOFF(ebx,PD_DESCALE_P2)]  ; mm0=[PD_DESCALE_P2]
+
+    paddd       mm4, mm0
+    paddd       mm3, mm0
+    psrad       mm4, DESCALE_P2
+    psrad       mm3, DESCALE_P2
+    paddd       mm1, mm0
+    paddd       mm2, mm0
+    psrad       mm1, DESCALE_P2
+    psrad       mm2, DESCALE_P2
+
+    movq        mm5, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm5=[PB_CENTERJSAMP]
+
+    packssdw    mm4, mm3                ; mm4=data3=(03 13 23 33)
+    packssdw    mm1, mm2                ; mm1=data4=(04 14 24 34)
+
+    movq        mm0, MMWORD [wk(0)]     ; mm0=(00 10 20 30 06 16 26 36)
+    movq        mm3, MMWORD [wk(1)]     ; mm3=(01 11 21 31 07 17 27 37)
+
+    packsswb    mm6, mm1                ; mm6=(02 12 22 32 04 14 24 34)
+    packsswb    mm4, mm7                ; mm4=(03 13 23 33 05 15 25 35)
+
+    paddb       mm0, mm5
+    paddb       mm3, mm5
+    paddb       mm6, mm5
+    paddb       mm4, mm5
+
+    movq        mm2, mm0                ; transpose coefficients(phase 1)
+    punpcklbw   mm0, mm3                ; mm0=(00 01 10 11 20 21 30 31)
+    punpckhbw   mm2, mm3                ; mm2=(06 07 16 17 26 27 36 37)
+    movq        mm1, mm6                ; transpose coefficients(phase 1)
+    punpcklbw   mm6, mm4                ; mm6=(02 03 12 13 22 23 32 33)
+    punpckhbw   mm1, mm4                ; mm1=(04 05 14 15 24 25 34 35)
+
+    movq        mm7, mm0                ; transpose coefficients(phase 2)
+    punpcklwd   mm0, mm6                ; mm0=(00 01 02 03 10 11 12 13)
+    punpckhwd   mm7, mm6                ; mm7=(20 21 22 23 30 31 32 33)
+    movq        mm5, mm1                ; transpose coefficients(phase 2)
+    punpcklwd   mm1, mm2                ; mm1=(04 05 06 07 14 15 16 17)
+    punpckhwd   mm5, mm2                ; mm5=(24 25 26 27 34 35 36 37)
+
+    movq        mm3, mm0                ; transpose coefficients(phase 3)
+    punpckldq   mm0, mm1                ; mm0=(00 01 02 03 04 05 06 07)
+    punpckhdq   mm3, mm1                ; mm3=(10 11 12 13 14 15 16 17)
+    movq        mm4, mm7                ; transpose coefficients(phase 3)
+    punpckldq   mm7, mm5                ; mm7=(20 21 22 23 24 25 26 27)
+    punpckhdq   mm4, mm5                ; mm4=(30 31 32 33 34 35 36 37)
+
+    pushpic     ebx                     ; save GOT address
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3
+    mov         edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
+
+    poppic      ebx                     ; restore GOT address
+
+    add         esi, byte 4*SIZEOF_JCOEF     ; wsptr
+    add         edi, byte 4*SIZEOF_JSAMPROW
+    dec         ecx                          ; ctr
+    jnz         near .rowloop
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-sse2.asm
new file mode 100644
index 0000000000..43e320189b
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-sse2.asm
@@ -0,0 +1,858 @@
+;
+; jidctint.asm - accurate integer IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_islow_sse2)
+
+EXTN(jconst_idct_islow_sse2):
+
+PW_F130_F054   times 4  dw  (F_0_541 + F_0_765),  F_0_541
+PW_F054_MF130  times 4  dw  F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117  times 4  dw  (F_1_175 - F_1_961),  F_1_175
+PW_F117_F078   times 4  dw  F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 4  dw  (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060  times 4  dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 4  dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050  times 4  dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1  times 4  dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2  times 4  dd  1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP times 16 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_sse2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; jpeg_component_info *compptr
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         12
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_islow_sse2)
+
+EXTN(jsimd_idct_islow_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, xmm0
+    packsswb    xmm1, xmm1
+    packsswb    xmm1, xmm1
+    movd        eax, xmm1
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    psllw       xmm5, PASS1_BITS
+
+    movdqa      xmm4, xmm5              ; xmm5=in0=(00 01 02 03 04 05 06 07)
+    punpcklwd   xmm5, xmm5              ; xmm5=(00 00 01 01 02 02 03 03)
+    punpckhwd   xmm4, xmm4              ; xmm4=(04 04 05 05 06 06 07 07)
+
+    pshufd      xmm7, xmm5, 0x00        ; xmm7=col0=(00 00 00 00 00 00 00 00)
+    pshufd      xmm6, xmm5, 0x55        ; xmm6=col1=(01 01 01 01 01 01 01 01)
+    pshufd      xmm1, xmm5, 0xAA        ; xmm1=col2=(02 02 02 02 02 02 02 02)
+    pshufd      xmm5, xmm5, 0xFF        ; xmm5=col3=(03 03 03 03 03 03 03 03)
+    pshufd      xmm0, xmm4, 0x00        ; xmm0=col4=(04 04 04 04 04 04 04 04)
+    pshufd      xmm3, xmm4, 0x55        ; xmm3=col5=(05 05 05 05 05 05 05 05)
+    pshufd      xmm2, xmm4, 0xAA        ; xmm2=col6=(06 06 06 06 06 06 06 06)
+    pshufd      xmm4, xmm4, 0xFF        ; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+    movdqa      XMMWORD [wk(8)], xmm6   ; wk(8)=col1
+    movdqa      XMMWORD [wk(9)], xmm5   ; wk(9)=col3
+    movdqa      XMMWORD [wk(10)], xmm3  ; wk(10)=col5
+    movdqa      XMMWORD [wk(11)], xmm4  ; wk(11)=col7
+    jmp         near .column_end
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    movdqa      xmm4, xmm1              ; xmm1=in2=z2
+    movdqa      xmm5, xmm1
+    punpcklwd   xmm4, xmm3              ; xmm3=in6=z3
+    punpckhwd   xmm5, xmm3
+    movdqa      xmm1, xmm4
+    movdqa      xmm3, xmm5
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_F130_F054)]   ; xmm4=tmp3L
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F130_F054)]   ; xmm5=tmp3H
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm1=tmp2L
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm3=tmp2H
+
+    movdqa      xmm6, xmm0
+    paddw       xmm0, xmm2              ; xmm0=in0+in4
+    psubw       xmm6, xmm2              ; xmm6=in0-in4
+
+    pxor        xmm7, xmm7
+    pxor        xmm2, xmm2
+    punpcklwd   xmm7, xmm0              ; xmm7=tmp0L
+    punpckhwd   xmm2, xmm0              ; xmm2=tmp0H
+    psrad       xmm7, (16-CONST_BITS)   ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+    psrad       xmm2, (16-CONST_BITS)   ; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+    movdqa      xmm0, xmm7
+    paddd       xmm7, xmm4              ; xmm7=tmp10L
+    psubd       xmm0, xmm4              ; xmm0=tmp13L
+    movdqa      xmm4, xmm2
+    paddd       xmm2, xmm5              ; xmm2=tmp10H
+    psubd       xmm4, xmm5              ; xmm4=tmp13H
+
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp10L
+    movdqa      XMMWORD [wk(1)], xmm2   ; wk(1)=tmp10H
+    movdqa      XMMWORD [wk(2)], xmm0   ; wk(2)=tmp13L
+    movdqa      XMMWORD [wk(3)], xmm4   ; wk(3)=tmp13H
+
+    pxor        xmm5, xmm5
+    pxor        xmm7, xmm7
+    punpcklwd   xmm5, xmm6              ; xmm5=tmp1L
+    punpckhwd   xmm7, xmm6              ; xmm7=tmp1H
+    psrad       xmm5, (16-CONST_BITS)   ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+    psrad       xmm7, (16-CONST_BITS)   ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+    movdqa      xmm2, xmm5
+    paddd       xmm5, xmm1              ; xmm5=tmp11L
+    psubd       xmm2, xmm1              ; xmm2=tmp12L
+    movdqa      xmm0, xmm7
+    paddd       xmm7, xmm3              ; xmm7=tmp11H
+    psubd       xmm0, xmm3              ; xmm0=tmp12H
+
+    movdqa      XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
+    movdqa      XMMWORD [wk(5)], xmm7   ; wk(5)=tmp11H
+    movdqa      XMMWORD [wk(6)], xmm2   ; wk(6)=tmp12L
+    movdqa      XMMWORD [wk(7)], xmm0   ; wk(7)=tmp12H
+
+    ; -- Odd part
+
+    movdqa      xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movdqa      xmm5, xmm6
+    movdqa      xmm7, xmm4
+    paddw       xmm5, xmm3              ; xmm5=z3
+    paddw       xmm7, xmm1              ; xmm7=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm2, xmm5
+    movdqa      xmm0, xmm5
+    punpcklwd   xmm2, xmm7
+    punpckhwd   xmm0, xmm7
+    movdqa      xmm5, xmm2
+    movdqa      xmm7, xmm0
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm2=z3L
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm0=z3H
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F117_F078)]   ; xmm5=z4L
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_F117_F078)]   ; xmm7=z4H
+
+    movdqa      XMMWORD [wk(10)], xmm2  ; wk(10)=z3L
+    movdqa      XMMWORD [wk(11)], xmm0  ; wk(11)=z3H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    movdqa      xmm2, xmm3
+    movdqa      xmm0, xmm3
+    punpcklwd   xmm2, xmm4
+    punpckhwd   xmm0, xmm4
+    movdqa      xmm3, xmm2
+    movdqa      xmm4, xmm0
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm2=tmp0L
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm0=tmp0H
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm3=tmp3L
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm4=tmp3H
+
+    paddd       xmm2, XMMWORD [wk(10)]  ; xmm2=tmp0L
+    paddd       xmm0, XMMWORD [wk(11)]  ; xmm0=tmp0H
+    paddd       xmm3, xmm5              ; xmm3=tmp3L
+    paddd       xmm4, xmm7              ; xmm4=tmp3H
+
+    movdqa      XMMWORD [wk(8)], xmm2   ; wk(8)=tmp0L
+    movdqa      XMMWORD [wk(9)], xmm0   ; wk(9)=tmp0H
+
+    movdqa      xmm2, xmm1
+    movdqa      xmm0, xmm1
+    punpcklwd   xmm2, xmm6
+    punpckhwd   xmm0, xmm6
+    movdqa      xmm1, xmm2
+    movdqa      xmm6, xmm0
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm2=tmp1L
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm0=tmp1H
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm1=tmp2L
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm6=tmp2H
+
+    paddd       xmm2, xmm5              ; xmm2=tmp1L
+    paddd       xmm0, xmm7              ; xmm0=tmp1H
+    paddd       xmm1, XMMWORD [wk(10)]  ; xmm1=tmp2L
+    paddd       xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
+
+    movdqa      XMMWORD [wk(10)], xmm2  ; wk(10)=tmp1L
+    movdqa      XMMWORD [wk(11)], xmm0  ; wk(11)=tmp1H
+
+    ; -- Final output stage
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
+    movdqa      xmm7, XMMWORD [wk(1)]   ; xmm7=tmp10H
+
+    movdqa      xmm2, xmm5
+    movdqa      xmm0, xmm7
+    paddd       xmm5, xmm3              ; xmm5=data0L
+    paddd       xmm7, xmm4              ; xmm7=data0H
+    psubd       xmm2, xmm3              ; xmm2=data7L
+    psubd       xmm0, xmm4              ; xmm0=data7H
+
+    movdqa      xmm3, [GOTOFF(ebx,PD_DESCALE_P1)]  ; xmm3=[PD_DESCALE_P1]
+
+    paddd       xmm5, xmm3
+    paddd       xmm7, xmm3
+    psrad       xmm5, DESCALE_P1
+    psrad       xmm7, DESCALE_P1
+    paddd       xmm2, xmm3
+    paddd       xmm0, xmm3
+    psrad       xmm2, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+
+    packssdw    xmm5, xmm7              ; xmm5=data0=(00 01 02 03 04 05 06 07)
+    packssdw    xmm2, xmm0              ; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+    movdqa      xmm4, XMMWORD [wk(4)]   ; xmm4=tmp11L
+    movdqa      xmm3, XMMWORD [wk(5)]   ; xmm3=tmp11H
+
+    movdqa      xmm7, xmm4
+    movdqa      xmm0, xmm3
+    paddd       xmm4, xmm1              ; xmm4=data1L
+    paddd       xmm3, xmm6              ; xmm3=data1H
+    psubd       xmm7, xmm1              ; xmm7=data6L
+    psubd       xmm0, xmm6              ; xmm0=data6H
+
+    movdqa      xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]  ; xmm1=[PD_DESCALE_P1]
+
+    paddd       xmm4, xmm1
+    paddd       xmm3, xmm1
+    psrad       xmm4, DESCALE_P1
+    psrad       xmm3, DESCALE_P1
+    paddd       xmm7, xmm1
+    paddd       xmm0, xmm1
+    psrad       xmm7, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+
+    packssdw    xmm4, xmm3              ; xmm4=data1=(10 11 12 13 14 15 16 17)
+    packssdw    xmm7, xmm0              ; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+    movdqa      xmm6, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm4              ; xmm5=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm6, xmm4              ; xmm6=(04 14 05 15 06 16 07 17)
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 1)
+    punpcklwd   xmm7, xmm2              ; xmm7=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm1, xmm2              ; xmm1=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm3, XMMWORD [wk(6)]   ; xmm3=tmp12L
+    movdqa      xmm0, XMMWORD [wk(7)]   ; xmm0=tmp12H
+    movdqa      xmm4, XMMWORD [wk(10)]  ; xmm4=tmp1L
+    movdqa      xmm2, XMMWORD [wk(11)]  ; xmm2=tmp1H
+
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 01 11 02 12 03 13)
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=(04 14 05 15 06 16 07 17)
+    movdqa      XMMWORD [wk(4)], xmm7   ; wk(4)=(60 70 61 71 62 72 63 73)
+    movdqa      XMMWORD [wk(5)], xmm1   ; wk(5)=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm5, xmm3
+    movdqa      xmm6, xmm0
+    paddd       xmm3, xmm4              ; xmm3=data2L
+    paddd       xmm0, xmm2              ; xmm0=data2H
+    psubd       xmm5, xmm4              ; xmm5=data5L
+    psubd       xmm6, xmm2              ; xmm6=data5H
+
+    movdqa      xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]  ; xmm7=[PD_DESCALE_P1]
+
+    paddd       xmm3, xmm7
+    paddd       xmm0, xmm7
+    psrad       xmm3, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+    paddd       xmm5, xmm7
+    paddd       xmm6, xmm7
+    psrad       xmm5, DESCALE_P1
+    psrad       xmm6, DESCALE_P1
+
+    packssdw    xmm3, xmm0              ; xmm3=data2=(20 21 22 23 24 25 26 27)
+    packssdw    xmm5, xmm6              ; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+    movdqa      xmm1, XMMWORD [wk(2)]   ; xmm1=tmp13L
+    movdqa      xmm4, XMMWORD [wk(3)]   ; xmm4=tmp13H
+    movdqa      xmm2, XMMWORD [wk(8)]   ; xmm2=tmp0L
+    movdqa      xmm7, XMMWORD [wk(9)]   ; xmm7=tmp0H
+
+    movdqa      xmm0, xmm1
+    movdqa      xmm6, xmm4
+    paddd       xmm1, xmm2              ; xmm1=data3L
+    paddd       xmm4, xmm7              ; xmm4=data3H
+    psubd       xmm0, xmm2              ; xmm0=data4L
+    psubd       xmm6, xmm7              ; xmm6=data4H
+
+    movdqa      xmm2, [GOTOFF(ebx,PD_DESCALE_P1)]  ; xmm2=[PD_DESCALE_P1]
+
+    paddd       xmm1, xmm2
+    paddd       xmm4, xmm2
+    psrad       xmm1, DESCALE_P1
+    psrad       xmm4, DESCALE_P1
+    paddd       xmm0, xmm2
+    paddd       xmm6, xmm2
+    psrad       xmm0, DESCALE_P1
+    psrad       xmm6, DESCALE_P1
+
+    packssdw    xmm1, xmm4              ; xmm1=data3=(30 31 32 33 34 35 36 37)
+    packssdw    xmm0, xmm6              ; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=(00 10 01 11 02 12 03 13)
+    movdqa      xmm2, XMMWORD [wk(1)]   ; xmm2=(04 14 05 15 06 16 07 17)
+
+    movdqa      xmm4, xmm3              ; transpose coefficients(phase 1)
+    punpcklwd   xmm3, xmm1              ; xmm3=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm4, xmm1              ; xmm4=(24 34 25 35 26 36 27 37)
+    movdqa      xmm6, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm5              ; xmm0=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm6, xmm5              ; xmm6=(44 54 45 55 46 56 47 57)
+
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 2)
+    punpckldq   xmm7, xmm3              ; xmm7=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm1, xmm3              ; xmm1=(02 12 22 32 03 13 23 33)
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm4              ; xmm2=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm5, xmm4              ; xmm5=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm3, XMMWORD [wk(4)]   ; xmm3=(60 70 61 71 62 72 63 73)
+    movdqa      xmm4, XMMWORD [wk(5)]   ; xmm4=(64 74 65 75 66 76 67 77)
+
+    movdqa      XMMWORD [wk(6)], xmm2   ; wk(6)=(04 14 24 34 05 15 25 35)
+    movdqa      XMMWORD [wk(7)], xmm5   ; wk(7)=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm2, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm3              ; xmm0=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm2, xmm3              ; xmm2=(42 52 62 72 43 53 63 73)
+    movdqa      xmm5, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm4              ; xmm6=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm5, xmm4              ; xmm5=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm3, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm0              ; xmm7=col0=(00 10 20 30 40 50 60 70)
+    punpckhqdq  xmm3, xmm0              ; xmm3=col1=(01 11 21 31 41 51 61 71)
+    movdqa      xmm4, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm2              ; xmm1=col2=(02 12 22 32 42 52 62 72)
+    punpckhqdq  xmm4, xmm2              ; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm0, XMMWORD [wk(6)]   ; xmm0=(04 14 24 34 05 15 25 35)
+    movdqa      xmm2, XMMWORD [wk(7)]   ; xmm2=(06 16 26 36 07 17 27 37)
+
+    movdqa      XMMWORD [wk(8)], xmm3   ; wk(8)=col1
+    movdqa      XMMWORD [wk(9)], xmm4   ; wk(9)=col3
+
+    movdqa      xmm3, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm6              ; xmm0=col4=(04 14 24 34 44 54 64 74)
+    punpckhqdq  xmm3, xmm6              ; xmm3=col5=(05 15 25 35 45 55 65 75)
+    movdqa      xmm4, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm5              ; xmm2=col6=(06 16 26 36 46 56 66 76)
+    punpckhqdq  xmm4, xmm5              ; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+    movdqa      XMMWORD [wk(10)], xmm3  ; wk(10)=col5
+    movdqa      XMMWORD [wk(11)], xmm4  ; wk(11)=col7
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+
+    ; -- Even part
+
+    ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    movdqa      xmm6, xmm1              ; xmm1=in2=z2
+    movdqa      xmm5, xmm1
+    punpcklwd   xmm6, xmm2              ; xmm2=in6=z3
+    punpckhwd   xmm5, xmm2
+    movdqa      xmm1, xmm6
+    movdqa      xmm2, xmm5
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_F130_F054)]   ; xmm6=tmp3L
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F130_F054)]   ; xmm5=tmp3H
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm1=tmp2L
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm2=tmp2H
+
+    movdqa      xmm3, xmm7
+    paddw       xmm7, xmm0              ; xmm7=in0+in4
+    psubw       xmm3, xmm0              ; xmm3=in0-in4
+
+    pxor        xmm4, xmm4
+    pxor        xmm0, xmm0
+    punpcklwd   xmm4, xmm7              ; xmm4=tmp0L
+    punpckhwd   xmm0, xmm7              ; xmm0=tmp0H
+    psrad       xmm4, (16-CONST_BITS)   ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+    psrad       xmm0, (16-CONST_BITS)   ; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+    movdqa      xmm7, xmm4
+    paddd       xmm4, xmm6              ; xmm4=tmp10L
+    psubd       xmm7, xmm6              ; xmm7=tmp13L
+    movdqa      xmm6, xmm0
+    paddd       xmm0, xmm5              ; xmm0=tmp10H
+    psubd       xmm6, xmm5              ; xmm6=tmp13H
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=tmp10L
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp10H
+    movdqa      XMMWORD [wk(2)], xmm7   ; wk(2)=tmp13L
+    movdqa      XMMWORD [wk(3)], xmm6   ; wk(3)=tmp13H
+
+    pxor        xmm5, xmm5
+    pxor        xmm4, xmm4
+    punpcklwd   xmm5, xmm3              ; xmm5=tmp1L
+    punpckhwd   xmm4, xmm3              ; xmm4=tmp1H
+    psrad       xmm5, (16-CONST_BITS)   ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+    psrad       xmm4, (16-CONST_BITS)   ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+    movdqa      xmm0, xmm5
+    paddd       xmm5, xmm1              ; xmm5=tmp11L
+    psubd       xmm0, xmm1              ; xmm0=tmp12L
+    movdqa      xmm7, xmm4
+    paddd       xmm4, xmm2              ; xmm4=tmp11H
+    psubd       xmm7, xmm2              ; xmm7=tmp12H
+
+    movdqa      XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
+    movdqa      XMMWORD [wk(5)], xmm4   ; wk(5)=tmp11H
+    movdqa      XMMWORD [wk(6)], xmm0   ; wk(6)=tmp12L
+    movdqa      XMMWORD [wk(7)], xmm7   ; wk(7)=tmp12H
+
+    ; -- Odd part
+
+    movdqa      xmm6, XMMWORD [wk(9)]   ; xmm6=col3
+    movdqa      xmm3, XMMWORD [wk(8)]   ; xmm3=col1
+    movdqa      xmm1, XMMWORD [wk(11)]  ; xmm1=col7
+    movdqa      xmm2, XMMWORD [wk(10)]  ; xmm2=col5
+
+    movdqa      xmm5, xmm6
+    movdqa      xmm4, xmm3
+    paddw       xmm5, xmm1              ; xmm5=z3
+    paddw       xmm4, xmm2              ; xmm4=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm0, xmm5
+    movdqa      xmm7, xmm5
+    punpcklwd   xmm0, xmm4
+    punpckhwd   xmm7, xmm4
+    movdqa      xmm5, xmm0
+    movdqa      xmm4, xmm7
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm0=z3L
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm7=z3H
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F117_F078)]   ; xmm5=z4L
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_F117_F078)]   ; xmm4=z4H
+
+    movdqa      XMMWORD [wk(10)], xmm0  ; wk(10)=z3L
+    movdqa      XMMWORD [wk(11)], xmm7  ; wk(11)=z3H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    movdqa      xmm0, xmm1
+    movdqa      xmm7, xmm1
+    punpcklwd   xmm0, xmm3
+    punpckhwd   xmm7, xmm3
+    movdqa      xmm1, xmm0
+    movdqa      xmm3, xmm7
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm0=tmp0L
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm7=tmp0H
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm1=tmp3L
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm3=tmp3H
+
+    paddd       xmm0, XMMWORD [wk(10)]  ; xmm0=tmp0L
+    paddd       xmm7, XMMWORD [wk(11)]  ; xmm7=tmp0H
+    paddd       xmm1, xmm5              ; xmm1=tmp3L
+    paddd       xmm3, xmm4              ; xmm3=tmp3H
+
+    movdqa      XMMWORD [wk(8)], xmm0   ; wk(8)=tmp0L
+    movdqa      XMMWORD [wk(9)], xmm7   ; wk(9)=tmp0H
+
+    movdqa      xmm0, xmm2
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm0, xmm6
+    punpckhwd   xmm7, xmm6
+    movdqa      xmm2, xmm0
+    movdqa      xmm6, xmm7
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm0=tmp1L
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm7=tmp1H
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm2=tmp2L
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm6=tmp2H
+
+    paddd       xmm0, xmm5              ; xmm0=tmp1L
+    paddd       xmm7, xmm4              ; xmm7=tmp1H
+    paddd       xmm2, XMMWORD [wk(10)]  ; xmm2=tmp2L
+    paddd       xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
+
+    movdqa      XMMWORD [wk(10)], xmm0  ; wk(10)=tmp1L
+    movdqa      XMMWORD [wk(11)], xmm7  ; wk(11)=tmp1H
+
+    ; -- Final output stage
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
+    movdqa      xmm4, XMMWORD [wk(1)]   ; xmm4=tmp10H
+
+    movdqa      xmm0, xmm5
+    movdqa      xmm7, xmm4
+    paddd       xmm5, xmm1              ; xmm5=data0L
+    paddd       xmm4, xmm3              ; xmm4=data0H
+    psubd       xmm0, xmm1              ; xmm0=data7L
+    psubd       xmm7, xmm3              ; xmm7=data7H
+
+    movdqa      xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]  ; xmm1=[PD_DESCALE_P2]
+
+    paddd       xmm5, xmm1
+    paddd       xmm4, xmm1
+    psrad       xmm5, DESCALE_P2
+    psrad       xmm4, DESCALE_P2
+    paddd       xmm0, xmm1
+    paddd       xmm7, xmm1
+    psrad       xmm0, DESCALE_P2
+    psrad       xmm7, DESCALE_P2
+
+    packssdw    xmm5, xmm4              ; xmm5=data0=(00 10 20 30 40 50 60 70)
+    packssdw    xmm0, xmm7              ; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+    movdqa      xmm3, XMMWORD [wk(4)]   ; xmm3=tmp11L
+    movdqa      xmm1, XMMWORD [wk(5)]   ; xmm1=tmp11H
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm7, xmm1
+    paddd       xmm3, xmm2              ; xmm3=data1L
+    paddd       xmm1, xmm6              ; xmm1=data1H
+    psubd       xmm4, xmm2              ; xmm4=data6L
+    psubd       xmm7, xmm6              ; xmm7=data6H
+
+    movdqa      xmm2, [GOTOFF(ebx,PD_DESCALE_P2)]  ; xmm2=[PD_DESCALE_P2]
+
+    paddd       xmm3, xmm2
+    paddd       xmm1, xmm2
+    psrad       xmm3, DESCALE_P2
+    psrad       xmm1, DESCALE_P2
+    paddd       xmm4, xmm2
+    paddd       xmm7, xmm2
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm7, DESCALE_P2
+
+    packssdw    xmm3, xmm1              ; xmm3=data1=(01 11 21 31 41 51 61 71)
+    packssdw    xmm4, xmm7              ; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+    packsswb    xmm5, xmm4              ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    packsswb    xmm3, xmm0              ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    movdqa      xmm6, XMMWORD [wk(6)]   ; xmm6=tmp12L
+    movdqa      xmm2, XMMWORD [wk(7)]   ; xmm2=tmp12H
+    movdqa      xmm1, XMMWORD [wk(10)]  ; xmm1=tmp1L
+    movdqa      xmm7, XMMWORD [wk(11)]  ; xmm7=tmp1H
+
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    movdqa      xmm4, xmm6
+    movdqa      xmm0, xmm2
+    paddd       xmm6, xmm1              ; xmm6=data2L
+    paddd       xmm2, xmm7              ; xmm2=data2H
+    psubd       xmm4, xmm1              ; xmm4=data5L
+    psubd       xmm0, xmm7              ; xmm0=data5H
+
+    movdqa      xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]  ; xmm5=[PD_DESCALE_P2]
+
+    paddd       xmm6, xmm5
+    paddd       xmm2, xmm5
+    psrad       xmm6, DESCALE_P2
+    psrad       xmm2, DESCALE_P2
+    paddd       xmm4, xmm5
+    paddd       xmm0, xmm5
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm0, DESCALE_P2
+
+    packssdw    xmm6, xmm2              ; xmm6=data2=(02 12 22 32 42 52 62 72)
+    packssdw    xmm4, xmm0              ; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+    movdqa      xmm3, XMMWORD [wk(2)]   ; xmm3=tmp13L
+    movdqa      xmm1, XMMWORD [wk(3)]   ; xmm1=tmp13H
+    movdqa      xmm7, XMMWORD [wk(8)]   ; xmm7=tmp0L
+    movdqa      xmm5, XMMWORD [wk(9)]   ; xmm5=tmp0H
+
+    movdqa      xmm2, xmm3
+    movdqa      xmm0, xmm1
+    paddd       xmm3, xmm7              ; xmm3=data3L
+    paddd       xmm1, xmm5              ; xmm1=data3H
+    psubd       xmm2, xmm7              ; xmm2=data4L
+    psubd       xmm0, xmm5              ; xmm0=data4H
+
+    movdqa      xmm7, [GOTOFF(ebx,PD_DESCALE_P2)]  ; xmm7=[PD_DESCALE_P2]
+
+    paddd       xmm3, xmm7
+    paddd       xmm1, xmm7
+    psrad       xmm3, DESCALE_P2
+    psrad       xmm1, DESCALE_P2
+    paddd       xmm2, xmm7
+    paddd       xmm0, xmm7
+    psrad       xmm2, DESCALE_P2
+    psrad       xmm0, DESCALE_P2
+
+    movdqa      xmm5, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; xmm5=[PB_CENTERJSAMP]
+
+    packssdw    xmm3, xmm1             ; xmm3=data3=(03 13 23 33 43 53 63 73)
+    packssdw    xmm2, xmm0             ; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+    movdqa      xmm7, XMMWORD [wk(0)]  ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    movdqa      xmm1, XMMWORD [wk(1)]  ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    packsswb    xmm6, xmm2             ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+    packsswb    xmm3, xmm4             ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+    paddb       xmm7, xmm5
+    paddb       xmm1, xmm5
+    paddb       xmm6, xmm5
+    paddb       xmm3, xmm5
+
+    movdqa      xmm0, xmm7        ; transpose coefficients(phase 1)
+    punpcklbw   xmm7, xmm1        ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+    punpckhbw   xmm0, xmm1        ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+    movdqa      xmm2, xmm6        ; transpose coefficients(phase 1)
+    punpcklbw   xmm6, xmm3        ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+    punpckhbw   xmm2, xmm3        ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+    movdqa      xmm4, xmm7        ; transpose coefficients(phase 2)
+    punpcklwd   xmm7, xmm6        ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+    punpckhwd   xmm4, xmm6        ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+    movdqa      xmm5, xmm2        ; transpose coefficients(phase 2)
+    punpcklwd   xmm2, xmm0        ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+    punpckhwd   xmm5, xmm0        ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+    movdqa      xmm1, xmm7        ; transpose coefficients(phase 3)
+    punpckldq   xmm7, xmm2        ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    punpckhdq   xmm1, xmm2        ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+    movdqa      xmm3, xmm4        ; transpose coefficients(phase 3)
+    punpckldq   xmm4, xmm5        ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+    punpckhdq   xmm3, xmm5        ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+    pshufd      xmm6, xmm7, 0x4E  ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    pshufd      xmm0, xmm1, 0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+    pshufd      xmm2, xmm4, 0x4E  ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+    pshufd      xmm5, xmm3, 0x4E  ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
+    mov         edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
+    mov         edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctred-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctred-mmx.asm
new file mode 100644
index 0000000000..e2307e1cb6
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctred-mmx.asm
@@ -0,0 +1,704 @@
+;
+; jidctred.asm - reduced-size IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS    13
+%define PASS1_BITS    2
+
+%define DESCALE_P1_4  (CONST_BITS - PASS1_BITS + 1)
+%define DESCALE_P2_4  (CONST_BITS + PASS1_BITS + 3 + 1)
+%define DESCALE_P1_2  (CONST_BITS - PASS1_BITS + 2)
+%define DESCALE_P2_2  (CONST_BITS + PASS1_BITS + 3 + 2)
+
+%if CONST_BITS == 13
+F_0_211 equ  1730  ; FIX(0.211164243)
+F_0_509 equ  4176  ; FIX(0.509795579)
+F_0_601 equ  4926  ; FIX(0.601344887)
+F_0_720 equ  5906  ; FIX(0.720959822)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_850 equ  6967  ; FIX(0.850430095)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_061 equ  8697  ; FIX(1.061594337)
+F_1_272 equ 10426  ; FIX(1.272758580)
+F_1_451 equ 11893  ; FIX(1.451774981)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_2_172 equ 17799  ; FIX(2.172734803)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_624 equ 29692  ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS)  ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS)  ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS)  ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS)  ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS)  ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS)  ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS)  ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS)  ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS)  ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS)  ; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_red_mmx)
+
+EXTN(jconst_idct_red_mmx):
+
+PW_F184_MF076   times 2 dw  F_1_847, -F_0_765
+PW_F256_F089    times 2 dw  F_2_562,  F_0_899
+PW_F106_MF217   times 2 dw  F_1_061, -F_2_172
+PW_MF060_MF050  times 2 dw -F_0_601, -F_0_509
+PW_F145_MF021   times 2 dw  F_1_451, -F_0_211
+PW_F362_MF127   times 2 dw  F_3_624, -F_1_272
+PW_F085_MF072   times 2 dw  F_0_850, -F_0_720
+PD_DESCALE_P1_4 times 2 dd  1 << (DESCALE_P1_4 - 1)
+PD_DESCALE_P2_4 times 2 dd  1 << (DESCALE_P2_4 - 1)
+PD_DESCALE_P1_2 times 2 dd  1 << (DESCALE_P1_2 - 1)
+PD_DESCALE_P2_2 times 2 dd  1 << (DESCALE_P2_2 - 1)
+PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_mmx(void *dct_table, JCOEFPTR coef_block,
+;                    JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         2
+%define workspace      wk(0) - DCTSIZE2 * SIZEOF_JCOEF
+                                        ; JCOEF workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_4x4_mmx)
+
+EXTN(jsimd_idct_4x4_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [workspace]
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+    lea         edi, [workspace]                 ; JCOEF *wsptr
+    mov         ecx, DCTSIZE/4                   ; ctr
+    alignx      16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         short .columnDCT
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         mm0, mm1
+    packsswb    mm0, mm0
+    movd        eax, mm0
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    psllw       mm0, PASS1_BITS
+
+    movq        mm2, mm0                ; mm0=in0=(00 01 02 03)
+    punpcklwd   mm0, mm0                ; mm0=(00 00 01 01)
+    punpckhwd   mm2, mm2                ; mm2=(02 02 03 03)
+
+    movq        mm1, mm0
+    punpckldq   mm0, mm0                ; mm0=(00 00 00 00)
+    punpckhdq   mm1, mm1                ; mm1=(01 01 01 01)
+    movq        mm3, mm2
+    punpckldq   mm2, mm2                ; mm2=(02 02 02 02)
+    punpckhdq   mm3, mm3                ; mm3=(03 03 03 03)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+    jmp         near .nextcolumn
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Odd part
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movq        mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movq        mm4, mm0
+    movq        mm5, mm0
+    punpcklwd   mm4, mm1
+    punpckhwd   mm5, mm1
+    movq        mm0, mm4
+    movq        mm1, mm5
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F256_F089)]   ; mm4=(tmp2L)
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F256_F089)]   ; mm5=(tmp2H)
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F106_MF217)]  ; mm0=(tmp0L)
+    pmaddwd     mm1, [GOTOFF(ebx,PW_F106_MF217)]  ; mm1=(tmp0H)
+
+    movq        mm6, mm2
+    movq        mm7, mm2
+    punpcklwd   mm6, mm3
+    punpckhwd   mm7, mm3
+    movq        mm2, mm6
+    movq        mm3, mm7
+    pmaddwd     mm6, [GOTOFF(ebx,PW_MF060_MF050)]  ; mm6=(tmp2L)
+    pmaddwd     mm7, [GOTOFF(ebx,PW_MF060_MF050)]  ; mm7=(tmp2H)
+    pmaddwd     mm2, [GOTOFF(ebx,PW_F145_MF021)]   ; mm2=(tmp0L)
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F145_MF021)]   ; mm3=(tmp0H)
+
+    paddd       mm6, mm4                ; mm6=tmp2L
+    paddd       mm7, mm5                ; mm7=tmp2H
+    paddd       mm2, mm0                ; mm2=tmp0L
+    paddd       mm3, mm1                ; mm3=tmp0H
+
+    movq        MMWORD [wk(0)], mm2     ; wk(0)=tmp0L
+    movq        MMWORD [wk(1)], mm3     ; wk(1)=tmp0H
+
+    ; -- Even part
+
+    movq        mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    pxor        mm1, mm1
+    pxor        mm2, mm2
+    punpcklwd   mm1, mm4                ; mm1=tmp0L
+    punpckhwd   mm2, mm4                ; mm2=tmp0H
+    psrad       mm1, (16-CONST_BITS-1)  ; psrad mm1,16 & pslld mm1,CONST_BITS+1
+    psrad       mm2, (16-CONST_BITS-1)  ; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+    movq        mm3, mm5                ; mm5=in2=z2
+    punpcklwd   mm5, mm0                ; mm0=in6=z3
+    punpckhwd   mm3, mm0
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F184_MF076)]  ; mm5=tmp2L
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F184_MF076)]  ; mm3=tmp2H
+
+    movq        mm4, mm1
+    movq        mm0, mm2
+    paddd       mm1, mm5                ; mm1=tmp10L
+    paddd       mm2, mm3                ; mm2=tmp10H
+    psubd       mm4, mm5                ; mm4=tmp12L
+    psubd       mm0, mm3                ; mm0=tmp12H
+
+    ; -- Final output stage
+
+    movq        mm5, mm1
+    movq        mm3, mm2
+    paddd       mm1, mm6                ; mm1=data0L
+    paddd       mm2, mm7                ; mm2=data0H
+    psubd       mm5, mm6                ; mm5=data3L
+    psubd       mm3, mm7                ; mm3=data3H
+
+    movq        mm6, [GOTOFF(ebx,PD_DESCALE_P1_4)]  ; mm6=[PD_DESCALE_P1_4]
+
+    paddd       mm1, mm6
+    paddd       mm2, mm6
+    psrad       mm1, DESCALE_P1_4
+    psrad       mm2, DESCALE_P1_4
+    paddd       mm5, mm6
+    paddd       mm3, mm6
+    psrad       mm5, DESCALE_P1_4
+    psrad       mm3, DESCALE_P1_4
+
+    packssdw    mm1, mm2                ; mm1=data0=(00 01 02 03)
+    packssdw    mm5, mm3                ; mm5=data3=(30 31 32 33)
+
+    movq        mm7, MMWORD [wk(0)]     ; mm7=tmp0L
+    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp0H
+
+    movq        mm2, mm4
+    movq        mm3, mm0
+    paddd       mm4, mm7                ; mm4=data1L
+    paddd       mm0, mm6                ; mm0=data1H
+    psubd       mm2, mm7                ; mm2=data2L
+    psubd       mm3, mm6                ; mm3=data2H
+
+    movq        mm7, [GOTOFF(ebx,PD_DESCALE_P1_4)]  ; mm7=[PD_DESCALE_P1_4]
+
+    paddd       mm4, mm7
+    paddd       mm0, mm7
+    psrad       mm4, DESCALE_P1_4
+    psrad       mm0, DESCALE_P1_4
+    paddd       mm2, mm7
+    paddd       mm3, mm7
+    psrad       mm2, DESCALE_P1_4
+    psrad       mm3, DESCALE_P1_4
+
+    packssdw    mm4, mm0                ; mm4=data1=(10 11 12 13)
+    packssdw    mm2, mm3                ; mm2=data2=(20 21 22 23)
+
+    movq        mm6, mm1                ; transpose coefficients(phase 1)
+    punpcklwd   mm1, mm4                ; mm1=(00 10 01 11)
+    punpckhwd   mm6, mm4                ; mm6=(02 12 03 13)
+    movq        mm7, mm2                ; transpose coefficients(phase 1)
+    punpcklwd   mm2, mm5                ; mm2=(20 30 21 31)
+    punpckhwd   mm7, mm5                ; mm7=(22 32 23 33)
+
+    movq        mm0, mm1                ; transpose coefficients(phase 2)
+    punpckldq   mm1, mm2                ; mm1=(00 10 20 30)
+    punpckhdq   mm0, mm2                ; mm0=(01 11 21 31)
+    movq        mm3, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm7                ; mm6=(02 12 22 32)
+    punpckhdq   mm3, mm7                ; mm3=(03 13 23 33)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+
+.nextcolumn:
+    add         esi, byte 4*SIZEOF_JCOEF            ; coef_block
+    add         edx, byte 4*SIZEOF_ISLOW_MULT_TYPE  ; quantptr
+    add         edi, byte 4*DCTSIZE*SIZEOF_JCOEF    ; wsptr
+    dec         ecx                                 ; ctr
+    jnz         near .columnloop
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    lea         esi, [workspace]                   ; JCOEF *wsptr
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+
+    ; -- Odd part
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movq        mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+    movq        mm4, mm0
+    movq        mm5, mm0
+    punpcklwd   mm4, mm1
+    punpckhwd   mm5, mm1
+    movq        mm0, mm4
+    movq        mm1, mm5
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F256_F089)]   ; mm4=(tmp2L)
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F256_F089)]   ; mm5=(tmp2H)
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F106_MF217)]  ; mm0=(tmp0L)
+    pmaddwd     mm1, [GOTOFF(ebx,PW_F106_MF217)]  ; mm1=(tmp0H)
+
+    movq        mm6, mm2
+    movq        mm7, mm2
+    punpcklwd   mm6, mm3
+    punpckhwd   mm7, mm3
+    movq        mm2, mm6
+    movq        mm3, mm7
+    pmaddwd     mm6, [GOTOFF(ebx,PW_MF060_MF050)]  ; mm6=(tmp2L)
+    pmaddwd     mm7, [GOTOFF(ebx,PW_MF060_MF050)]  ; mm7=(tmp2H)
+    pmaddwd     mm2, [GOTOFF(ebx,PW_F145_MF021)]   ; mm2=(tmp0L)
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F145_MF021)]   ; mm3=(tmp0H)
+
+    paddd       mm6, mm4                ; mm6=tmp2L
+    paddd       mm7, mm5                ; mm7=tmp2H
+    paddd       mm2, mm0                ; mm2=tmp0L
+    paddd       mm3, mm1                ; mm3=tmp0H
+
+    movq        MMWORD [wk(0)], mm2     ; wk(0)=tmp0L
+    movq        MMWORD [wk(1)], mm3     ; wk(1)=tmp0H
+
+    ; -- Even part
+
+    movq        mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+    pxor        mm1, mm1
+    pxor        mm2, mm2
+    punpcklwd   mm1, mm4                ; mm1=tmp0L
+    punpckhwd   mm2, mm4                ; mm2=tmp0H
+    psrad       mm1, (16-CONST_BITS-1)  ; psrad mm1,16 & pslld mm1,CONST_BITS+1
+    psrad       mm2, (16-CONST_BITS-1)  ; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+    movq        mm3, mm5                ; mm5=in2=z2
+    punpcklwd   mm5, mm0                ; mm0=in6=z3
+    punpckhwd   mm3, mm0
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F184_MF076)]  ; mm5=tmp2L
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F184_MF076)]  ; mm3=tmp2H
+
+    movq        mm4, mm1
+    movq        mm0, mm2
+    paddd       mm1, mm5                ; mm1=tmp10L
+    paddd       mm2, mm3                ; mm2=tmp10H
+    psubd       mm4, mm5                ; mm4=tmp12L
+    psubd       mm0, mm3                ; mm0=tmp12H
+
+    ; -- Final output stage
+
+    movq        mm5, mm1
+    movq        mm3, mm2
+    paddd       mm1, mm6                ; mm1=data0L
+    paddd       mm2, mm7                ; mm2=data0H
+    psubd       mm5, mm6                ; mm5=data3L
+    psubd       mm3, mm7                ; mm3=data3H
+
+    movq        mm6, [GOTOFF(ebx,PD_DESCALE_P2_4)]  ; mm6=[PD_DESCALE_P2_4]
+
+    paddd       mm1, mm6
+    paddd       mm2, mm6
+    psrad       mm1, DESCALE_P2_4
+    psrad       mm2, DESCALE_P2_4
+    paddd       mm5, mm6
+    paddd       mm3, mm6
+    psrad       mm5, DESCALE_P2_4
+    psrad       mm3, DESCALE_P2_4
+
+    packssdw    mm1, mm2                ; mm1=data0=(00 10 20 30)
+    packssdw    mm5, mm3                ; mm5=data3=(03 13 23 33)
+
+    movq        mm7, MMWORD [wk(0)]     ; mm7=tmp0L
+    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp0H
+
+    movq        mm2, mm4
+    movq        mm3, mm0
+    paddd       mm4, mm7                ; mm4=data1L
+    paddd       mm0, mm6                ; mm0=data1H
+    psubd       mm2, mm7                ; mm2=data2L
+    psubd       mm3, mm6                ; mm3=data2H
+
+    movq        mm7, [GOTOFF(ebx,PD_DESCALE_P2_4)]  ; mm7=[PD_DESCALE_P2_4]
+
+    paddd       mm4, mm7
+    paddd       mm0, mm7
+    psrad       mm4, DESCALE_P2_4
+    psrad       mm0, DESCALE_P2_4
+    paddd       mm2, mm7
+    paddd       mm3, mm7
+    psrad       mm2, DESCALE_P2_4
+    psrad       mm3, DESCALE_P2_4
+
+    packssdw    mm4, mm0                ; mm4=data1=(01 11 21 31)
+    packssdw    mm2, mm3                ; mm2=data2=(02 12 22 32)
+
+    movq        mm6, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm6=[PB_CENTERJSAMP]
+
+    packsswb    mm1, mm2                ; mm1=(00 10 20 30 02 12 22 32)
+    packsswb    mm4, mm5                ; mm4=(01 11 21 31 03 13 23 33)
+    paddb       mm1, mm6
+    paddb       mm4, mm6
+
+    movq        mm7, mm1                ; transpose coefficients(phase 1)
+    punpcklbw   mm1, mm4                ; mm1=(00 01 10 11 20 21 30 31)
+    punpckhbw   mm7, mm4                ; mm7=(02 03 12 13 22 23 32 33)
+
+    movq        mm0, mm1                ; transpose coefficients(phase 2)
+    punpcklwd   mm1, mm7                ; mm1=(00 01 02 03 10 11 12 13)
+    punpckhwd   mm0, mm7                ; mm0=(20 21 22 23 30 31 32 33)
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    movd        dword [edx+eax*SIZEOF_JSAMPLE], mm1
+    movd        dword [esi+eax*SIZEOF_JSAMPLE], mm0
+
+    psrlq       mm1, 4*BYTE_BIT
+    psrlq       mm0, 4*BYTE_BIT
+
+    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movd        dword [edx+eax*SIZEOF_JSAMPLE], mm1
+    movd        dword [esi+eax*SIZEOF_JSAMPLE], mm0
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_mmx(void *dct_table, JCOEFPTR coef_block,
+;                    JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_2x2_mmx)
+
+EXTN(jsimd_idct_2x2_mmx):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input.
+
+    mov         edx, POINTER [dct_table(ebp)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(ebp)]  ; inptr
+
+    ; | input:                  | result:        |
+    ; | 00 01 ** 03 ** 05 ** 07 |                |
+    ; | 10 11 ** 13 ** 15 ** 17 |                |
+    ; | ** ** ** ** ** ** ** ** |                |
+    ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+    ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+    ; | 50 51 ** 53 ** 55 ** 57 |                |
+    ; | ** ** ** ** ** ** ** ** |                |
+    ; | 70 71 ** 73 ** 75 ** 77 |                |
+
+    ; -- Odd part
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movq        mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
+    ; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
+
+    pcmpeqd     mm7, mm7
+    pslld       mm7, WORD_BIT           ; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
+
+    movq        mm4, mm0                ; mm4=(10 11 ** 13)
+    movq        mm5, mm2                ; mm5=(50 51 ** 53)
+    punpcklwd   mm4, mm1                ; mm4=(10 30 11 31)
+    punpcklwd   mm5, mm3                ; mm5=(50 70 51 71)
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F362_MF127)]
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F085_MF072)]
+
+    psrld       mm0, WORD_BIT           ; mm0=(11 -- 13 --)
+    pand        mm1, mm7                ; mm1=(-- 31 -- 33)
+    psrld       mm2, WORD_BIT           ; mm2=(51 -- 53 --)
+    pand        mm3, mm7                ; mm3=(-- 71 -- 73)
+    por         mm0, mm1                ; mm0=(11 31 13 33)
+    por         mm2, mm3                ; mm2=(51 71 53 73)
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F362_MF127)]
+    pmaddwd     mm2, [GOTOFF(ebx,PW_F085_MF072)]
+
+    paddd       mm4, mm5                ; mm4=tmp0[col0 col1]
+
+    movq        mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
+    pmullw      mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movq        mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
+    movq        mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
+    pmullw      mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
+    ; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
+
+    psrld       mm6, WORD_BIT           ; mm6=(15 -- 17 --)
+    pand        mm1, mm7                ; mm1=(-- 35 -- 37)
+    psrld       mm3, WORD_BIT           ; mm3=(55 -- 57 --)
+    pand        mm5, mm7                ; mm5=(-- 75 -- 77)
+    por         mm6, mm1                ; mm6=(15 35 17 37)
+    por         mm3, mm5                ; mm3=(55 75 57 77)
+    pmaddwd     mm6, [GOTOFF(ebx,PW_F362_MF127)]
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F085_MF072)]
+
+    paddd       mm0, mm2                ; mm0=tmp0[col1 col3]
+    paddd       mm6, mm3                ; mm6=tmp0[col5 col7]
+
+    ; -- Even part
+
+    movq        mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
+    pmullw      mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
+
+    movq        mm2, mm1                      ; mm2=(00 01 ** 03)
+    pslld       mm1, WORD_BIT                 ; mm1=(-- 00 -- **)
+    psrad       mm1, (WORD_BIT-CONST_BITS-2)  ; mm1=tmp10[col0 ****]
+
+    pand        mm2, mm7                      ; mm2=(-- 01 -- 03)
+    pand        mm5, mm7                      ; mm5=(-- 05 -- 07)
+    psrad       mm2, (WORD_BIT-CONST_BITS-2)  ; mm2=tmp10[col1 col3]
+    psrad       mm5, (WORD_BIT-CONST_BITS-2)  ; mm5=tmp10[col5 col7]
+
+    ; -- Final output stage
+
+    movq        mm3, mm1
+    paddd       mm1, mm4                ; mm1=data0[col0 ****]=(A0 **)
+    psubd       mm3, mm4                ; mm3=data1[col0 ****]=(B0 **)
+    punpckldq   mm1, mm3                ; mm1=(A0 B0)
+
+    movq        mm7, [GOTOFF(ebx,PD_DESCALE_P1_2)]  ; mm7=[PD_DESCALE_P1_2]
+
+    movq        mm4, mm2
+    movq        mm3, mm5
+    paddd       mm2, mm0                ; mm2=data0[col1 col3]=(A1 A3)
+    paddd       mm5, mm6                ; mm5=data0[col5 col7]=(A5 A7)
+    psubd       mm4, mm0                ; mm4=data1[col1 col3]=(B1 B3)
+    psubd       mm3, mm6                ; mm3=data1[col5 col7]=(B5 B7)
+
+    paddd       mm1, mm7
+    psrad       mm1, DESCALE_P1_2
+
+    paddd       mm2, mm7
+    paddd       mm5, mm7
+    psrad       mm2, DESCALE_P1_2
+    psrad       mm5, DESCALE_P1_2
+    paddd       mm4, mm7
+    paddd       mm3, mm7
+    psrad       mm4, DESCALE_P1_2
+    psrad       mm3, DESCALE_P1_2
+
+    ; ---- Pass 2: process rows, store into output array.
+
+    mov         edi, JSAMPARRAY [output_buf(ebp)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(ebp)]
+
+    ; | input:| result:|
+    ; | A0 B0 |        |
+    ; | A1 B1 | C0 C1  |
+    ; | A3 B3 | D0 D1  |
+    ; | A5 B5 |        |
+    ; | A7 B7 |        |
+
+    ; -- Odd part
+
+    packssdw    mm2, mm4                ; mm2=(A1 A3 B1 B3)
+    packssdw    mm5, mm3                ; mm5=(A5 A7 B5 B7)
+    pmaddwd     mm2, [GOTOFF(ebx,PW_F362_MF127)]
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F085_MF072)]
+
+    paddd       mm2, mm5                ; mm2=tmp0[row0 row1]
+
+    ; -- Even part
+
+    pslld       mm1, (CONST_BITS+2)     ; mm1=tmp10[row0 row1]
+
+    ; -- Final output stage
+
+    movq        mm0, [GOTOFF(ebx,PD_DESCALE_P2_2)]  ; mm0=[PD_DESCALE_P2_2]
+
+    movq        mm6, mm1
+    paddd       mm1, mm2                ; mm1=data0[row0 row1]=(C0 C1)
+    psubd       mm6, mm2                ; mm6=data1[row0 row1]=(D0 D1)
+
+    paddd       mm1, mm0
+    paddd       mm6, mm0
+    psrad       mm1, DESCALE_P2_2
+    psrad       mm6, DESCALE_P2_2
+
+    movq        mm7, mm1                ; transpose coefficients
+    punpckldq   mm1, mm6                ; mm1=(C0 D0)
+    punpckhdq   mm7, mm6                ; mm7=(C1 D1)
+
+    packssdw    mm1, mm7                ; mm1=(C0 D0 C1 D1)
+    packsswb    mm1, mm1                ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
+    paddb       mm1, [GOTOFF(ebx,PB_CENTERJSAMP)]
+
+    movd        ecx, mm1
+    movd        ebx, mm1                ; ebx=(C0 D0 C1 D1)
+    shr         ecx, 2*BYTE_BIT         ; ecx=(C1 D1 -- --)
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    mov         word [edx+eax*SIZEOF_JSAMPLE], bx
+    mov         word [esi+eax*SIZEOF_JSAMPLE], cx
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctred-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctred-sse2.asm
new file mode 100644
index 0000000000..6e56494e97
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctred-sse2.asm
@@ -0,0 +1,592 @@
+;
+; jidctred.asm - reduced-size IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS    13
+%define PASS1_BITS    2
+
+%define DESCALE_P1_4  (CONST_BITS - PASS1_BITS + 1)
+%define DESCALE_P2_4  (CONST_BITS + PASS1_BITS + 3 + 1)
+%define DESCALE_P1_2  (CONST_BITS - PASS1_BITS + 2)
+%define DESCALE_P2_2  (CONST_BITS + PASS1_BITS + 3 + 2)
+
+%if CONST_BITS == 13
+F_0_211 equ  1730  ; FIX(0.211164243)
+F_0_509 equ  4176  ; FIX(0.509795579)
+F_0_601 equ  4926  ; FIX(0.601344887)
+F_0_720 equ  5906  ; FIX(0.720959822)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_850 equ  6967  ; FIX(0.850430095)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_061 equ  8697  ; FIX(1.061594337)
+F_1_272 equ 10426  ; FIX(1.272758580)
+F_1_451 equ 11893  ; FIX(1.451774981)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_2_172 equ 17799  ; FIX(2.172734803)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_624 equ 29692  ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS)  ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS)  ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS)  ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS)  ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS)  ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS)  ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS)  ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS)  ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS)  ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS)  ; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_red_sse2)
+
+EXTN(jconst_idct_red_sse2):
+
+PW_F184_MF076   times 4  dw  F_1_847, -F_0_765
+PW_F256_F089    times 4  dw  F_2_562,  F_0_899
+PW_F106_MF217   times 4  dw  F_1_061, -F_2_172
+PW_MF060_MF050  times 4  dw -F_0_601, -F_0_509
+PW_F145_MF021   times 4  dw  F_1_451, -F_0_211
+PW_F362_MF127   times 4  dw  F_3_624, -F_1_272
+PW_F085_MF072   times 4  dw  F_0_850, -F_0_720
+PD_DESCALE_P1_4 times 4  dd  1 << (DESCALE_P1_4 - 1)
+PD_DESCALE_P2_4 times 4  dd  1 << (DESCALE_P2_4 - 1)
+PD_DESCALE_P1_2 times 4  dd  1 << (DESCALE_P1_2 - 1)
+PD_DESCALE_P2_2 times 4  dd  1 << (DESCALE_P2_2 - 1)
+PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_sse2(void *dct_table, JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_4x4_sse2)
+
+EXTN(jsimd_idct_4x4_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         short .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, xmm1
+    packsswb    xmm0, xmm0
+    packsswb    xmm0, xmm0
+    movd        eax, xmm0
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    psllw       xmm0, PASS1_BITS
+
+    movdqa      xmm3, xmm0        ; xmm0=in0=(00 01 02 03 04 05 06 07)
+    punpcklwd   xmm0, xmm0        ; xmm0=(00 00 01 01 02 02 03 03)
+    punpckhwd   xmm3, xmm3        ; xmm3=(04 04 05 05 06 06 07 07)
+
+    pshufd      xmm1, xmm0, 0x50  ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+    pshufd      xmm0, xmm0, 0xFA  ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+    pshufd      xmm6, xmm3, 0x50  ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+    pshufd      xmm3, xmm3, 0xFA  ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+    jmp         near .column_end
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Odd part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movdqa      xmm4, xmm0
+    movdqa      xmm5, xmm0
+    punpcklwd   xmm4, xmm1
+    punpckhwd   xmm5, xmm1
+    movdqa      xmm0, xmm4
+    movdqa      xmm1, xmm5
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_F256_F089)]   ; xmm4=(tmp2L)
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F256_F089)]   ; xmm5=(tmp2H)
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_F106_MF217)]  ; xmm0=(tmp0L)
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_F106_MF217)]  ; xmm1=(tmp0H)
+
+    movdqa      xmm6, xmm2
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm6, xmm3
+    punpckhwd   xmm7, xmm3
+    movdqa      xmm2, xmm6
+    movdqa      xmm3, xmm7
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_MF060_MF050)]  ; xmm6=(tmp2L)
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF060_MF050)]  ; xmm7=(tmp2H)
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_F145_MF021)]   ; xmm2=(tmp0L)
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_F145_MF021)]   ; xmm3=(tmp0H)
+
+    paddd       xmm6, xmm4              ; xmm6=tmp2L
+    paddd       xmm7, xmm5              ; xmm7=tmp2H
+    paddd       xmm2, xmm0              ; xmm2=tmp0L
+    paddd       xmm3, xmm1              ; xmm3=tmp0H
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=tmp0L
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=tmp0H
+
+    ; -- Even part
+
+    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    pxor        xmm1, xmm1
+    pxor        xmm2, xmm2
+    punpcklwd   xmm1, xmm4               ; xmm1=tmp0L
+    punpckhwd   xmm2, xmm4               ; xmm2=tmp0H
+    psrad       xmm1, (16-CONST_BITS-1)  ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+    psrad       xmm2, (16-CONST_BITS-1)  ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+    movdqa      xmm3, xmm5              ; xmm5=in2=z2
+    punpcklwd   xmm5, xmm0              ; xmm0=in6=z3
+    punpckhwd   xmm3, xmm0
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F184_MF076)]  ; xmm5=tmp2L
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_F184_MF076)]  ; xmm3=tmp2H
+
+    movdqa      xmm4, xmm1
+    movdqa      xmm0, xmm2
+    paddd       xmm1, xmm5              ; xmm1=tmp10L
+    paddd       xmm2, xmm3              ; xmm2=tmp10H
+    psubd       xmm4, xmm5              ; xmm4=tmp12L
+    psubd       xmm0, xmm3              ; xmm0=tmp12H
+
+    ; -- Final output stage
+
+    movdqa      xmm5, xmm1
+    movdqa      xmm3, xmm2
+    paddd       xmm1, xmm6              ; xmm1=data0L
+    paddd       xmm2, xmm7              ; xmm2=data0H
+    psubd       xmm5, xmm6              ; xmm5=data3L
+    psubd       xmm3, xmm7              ; xmm3=data3H
+
+    movdqa      xmm6, [GOTOFF(ebx,PD_DESCALE_P1_4)]  ; xmm6=[PD_DESCALE_P1_4]
+
+    paddd       xmm1, xmm6
+    paddd       xmm2, xmm6
+    psrad       xmm1, DESCALE_P1_4
+    psrad       xmm2, DESCALE_P1_4
+    paddd       xmm5, xmm6
+    paddd       xmm3, xmm6
+    psrad       xmm5, DESCALE_P1_4
+    psrad       xmm3, DESCALE_P1_4
+
+    packssdw    xmm1, xmm2              ; xmm1=data0=(00 01 02 03 04 05 06 07)
+    packssdw    xmm5, xmm3              ; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp0L
+    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=tmp0H
+
+    movdqa      xmm2, xmm4
+    movdqa      xmm3, xmm0
+    paddd       xmm4, xmm7              ; xmm4=data1L
+    paddd       xmm0, xmm6              ; xmm0=data1H
+    psubd       xmm2, xmm7              ; xmm2=data2L
+    psubd       xmm3, xmm6              ; xmm3=data2H
+
+    movdqa      xmm7, [GOTOFF(ebx,PD_DESCALE_P1_4)]  ; xmm7=[PD_DESCALE_P1_4]
+
+    paddd       xmm4, xmm7
+    paddd       xmm0, xmm7
+    psrad       xmm4, DESCALE_P1_4
+    psrad       xmm0, DESCALE_P1_4
+    paddd       xmm2, xmm7
+    paddd       xmm3, xmm7
+    psrad       xmm2, DESCALE_P1_4
+    psrad       xmm3, DESCALE_P1_4
+
+    packssdw    xmm4, xmm0        ; xmm4=data1=(10 11 12 13 14 15 16 17)
+    packssdw    xmm2, xmm3        ; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+    movdqa      xmm6, xmm1        ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm4        ; xmm1=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm6, xmm4        ; xmm6=(04 14 05 15 06 16 07 17)
+    movdqa      xmm7, xmm2        ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm5        ; xmm2=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm7, xmm5        ; xmm7=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm0, xmm1        ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm2        ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm0, xmm2        ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+    movdqa      xmm3, xmm6        ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm7        ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm3, xmm7        ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows, store into output array.
+
+    mov         eax, [original_ebp]
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+
+    ; -- Even part
+
+    pxor        xmm4, xmm4
+    punpcklwd   xmm4, xmm1               ; xmm4=tmp0
+    psrad       xmm4, (16-CONST_BITS-1)  ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+    ; -- Odd part
+
+    punpckhwd   xmm1, xmm0
+    punpckhwd   xmm6, xmm3
+    movdqa      xmm5, xmm1
+    movdqa      xmm2, xmm6
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_F256_F089)]    ; xmm1=(tmp2)
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_MF060_MF050)]  ; xmm6=(tmp2)
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F106_MF217)]   ; xmm5=(tmp0)
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_F145_MF021)]   ; xmm2=(tmp0)
+
+    paddd       xmm6, xmm1              ; xmm6=tmp2
+    paddd       xmm2, xmm5              ; xmm2=tmp0
+
+    ; -- Even part
+
+    punpcklwd   xmm0, xmm3
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_F184_MF076)]  ; xmm0=tmp2
+
+    movdqa      xmm7, xmm4
+    paddd       xmm4, xmm0              ; xmm4=tmp10
+    psubd       xmm7, xmm0              ; xmm7=tmp12
+
+    ; -- Final output stage
+
+    movdqa      xmm1, [GOTOFF(ebx,PD_DESCALE_P2_4)]  ; xmm1=[PD_DESCALE_P2_4]
+
+    movdqa      xmm5, xmm4
+    movdqa      xmm3, xmm7
+    paddd       xmm4, xmm6              ; xmm4=data0=(00 10 20 30)
+    paddd       xmm7, xmm2              ; xmm7=data1=(01 11 21 31)
+    psubd       xmm5, xmm6              ; xmm5=data3=(03 13 23 33)
+    psubd       xmm3, xmm2              ; xmm3=data2=(02 12 22 32)
+
+    paddd       xmm4, xmm1
+    paddd       xmm7, xmm1
+    psrad       xmm4, DESCALE_P2_4
+    psrad       xmm7, DESCALE_P2_4
+    paddd       xmm5, xmm1
+    paddd       xmm3, xmm1
+    psrad       xmm5, DESCALE_P2_4
+    psrad       xmm3, DESCALE_P2_4
+
+    packssdw    xmm4, xmm3              ; xmm4=(00 10 20 30 02 12 22 32)
+    packssdw    xmm7, xmm5              ; xmm7=(01 11 21 31 03 13 23 33)
+
+    movdqa      xmm0, xmm4              ; transpose coefficients(phase 1)
+    punpcklwd   xmm4, xmm7              ; xmm4=(00 01 10 11 20 21 30 31)
+    punpckhwd   xmm0, xmm7              ; xmm0=(02 03 12 13 22 23 32 33)
+
+    movdqa      xmm6, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm0              ; xmm4=(00 01 02 03 10 11 12 13)
+    punpckhdq   xmm6, xmm0              ; xmm6=(20 21 22 23 30 31 32 33)
+
+    packsswb    xmm4, xmm6              ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+    paddb       xmm4, [GOTOFF(ebx,PB_CENTERJSAMP)]
+
+    pshufd      xmm2, xmm4, 0x39        ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+    pshufd      xmm1, xmm4, 0x4E        ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+    pshufd      xmm3, xmm4, 0x93        ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    movd        XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+    movd        XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
+    mov         edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movd        XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
+    movd        XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_sse2(void *dct_table, JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_2x2_sse2)
+
+EXTN(jsimd_idct_2x2_sse2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input.
+
+    mov         edx, POINTER [dct_table(ebp)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(ebp)]  ; inptr
+
+    ; | input:                  | result:        |
+    ; | 00 01 ** 03 ** 05 ** 07 |                |
+    ; | 10 11 ** 13 ** 15 ** 17 |                |
+    ; | ** ** ** ** ** ** ** ** |                |
+    ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+    ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+    ; | 50 51 ** 53 ** 55 ** 57 |                |
+    ; | ** ** ** ** ** ** ** ** |                |
+    ; | 70 71 ** 73 ** 75 ** 77 |                |
+
+    ; -- Odd part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+    ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+    pcmpeqd     xmm7, xmm7
+    pslld       xmm7, WORD_BIT          ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+    movdqa      xmm4, xmm0              ; xmm4=(10 11 ** 13 ** 15 ** 17)
+    movdqa      xmm5, xmm2              ; xmm5=(50 51 ** 53 ** 55 ** 57)
+    punpcklwd   xmm4, xmm1              ; xmm4=(10 30 11 31 ** ** 13 33)
+    punpcklwd   xmm5, xmm3              ; xmm5=(50 70 51 71 ** ** 53 73)
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_F362_MF127)]
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F085_MF072)]
+
+    psrld       xmm0, WORD_BIT          ; xmm0=(11 -- 13 -- 15 -- 17 --)
+    pand        xmm1, xmm7              ; xmm1=(-- 31 -- 33 -- 35 -- 37)
+    psrld       xmm2, WORD_BIT          ; xmm2=(51 -- 53 -- 55 -- 57 --)
+    pand        xmm3, xmm7              ; xmm3=(-- 71 -- 73 -- 75 -- 77)
+    por         xmm0, xmm1              ; xmm0=(11 31 13 33 15 35 17 37)
+    por         xmm2, xmm3              ; xmm2=(51 71 53 73 55 75 57 77)
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_F362_MF127)]
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_F085_MF072)]
+
+    paddd       xmm4, xmm5              ; xmm4=tmp0[col0 col1 **** col3]
+    paddd       xmm0, xmm2              ; xmm0=tmp0[col1 col3 col5 col7]
+
+    ; -- Even part
+
+    movdqa      xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+    movdqa      xmm1, xmm6              ; xmm1=(00 01 ** 03 ** 05 ** 07)
+    pslld       xmm6, WORD_BIT          ; xmm6=(-- 00 -- ** -- ** -- **)
+    pand        xmm1, xmm7              ; xmm1=(-- 01 -- 03 -- 05 -- 07)
+    psrad       xmm6, (WORD_BIT-CONST_BITS-2)  ; xmm6=tmp10[col0 **** **** ****]
+    psrad       xmm1, (WORD_BIT-CONST_BITS-2)  ; xmm1=tmp10[col1 col3 col5 col7]
+
+    ; -- Final output stage
+
+    movdqa      xmm3, xmm6
+    movdqa      xmm5, xmm1
+    paddd       xmm6, xmm4      ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+    paddd       xmm1, xmm0      ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+    psubd       xmm3, xmm4      ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+    psubd       xmm5, xmm0      ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+    movdqa      xmm2, [GOTOFF(ebx,PD_DESCALE_P1_2)]  ; xmm2=[PD_DESCALE_P1_2]
+
+    punpckldq   xmm6, xmm3              ; xmm6=(A0 B0 ** **)
+
+    movdqa      xmm7, xmm1
+    punpcklqdq  xmm1, xmm5              ; xmm1=(A1 A3 B1 B3)
+    punpckhqdq  xmm7, xmm5              ; xmm7=(A5 A7 B5 B7)
+
+    paddd       xmm6, xmm2
+    psrad       xmm6, DESCALE_P1_2
+
+    paddd       xmm1, xmm2
+    paddd       xmm7, xmm2
+    psrad       xmm1, DESCALE_P1_2
+    psrad       xmm7, DESCALE_P1_2
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows, store into output array.
+
+    mov         edi, JSAMPARRAY [output_buf(ebp)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(ebp)]
+
+    ; | input:| result:|
+    ; | A0 B0 |        |
+    ; | A1 B1 | C0 C1  |
+    ; | A3 B3 | D0 D1  |
+    ; | A5 B5 |        |
+    ; | A7 B7 |        |
+
+    ; -- Odd part
+
+    packssdw    xmm1, xmm1              ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+    packssdw    xmm7, xmm7              ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_F362_MF127)]
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_F085_MF072)]
+
+    paddd       xmm1, xmm7              ; xmm1=tmp0[row0 row1 row0 row1]
+
+    ; -- Even part
+
+    pslld       xmm6, (CONST_BITS+2)    ; xmm6=tmp10[row0 row1 **** ****]
+
+    ; -- Final output stage
+
+    movdqa      xmm4, xmm6
+    paddd       xmm6, xmm1     ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+    psubd       xmm4, xmm1     ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+    punpckldq   xmm6, xmm4     ; xmm6=(C0 D0 C1 D1)
+
+    paddd       xmm6, [GOTOFF(ebx,PD_DESCALE_P2_2)]
+    psrad       xmm6, DESCALE_P2_2
+
+    packssdw    xmm6, xmm6              ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+    packsswb    xmm6, xmm6              ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+    paddb       xmm6, [GOTOFF(ebx,PB_CENTERJSAMP)]
+
+    pextrw      ebx, xmm6, 0x00         ; ebx=(C0 D0 -- --)
+    pextrw      ecx, xmm6, 0x01         ; ecx=(C1 D1 -- --)
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    mov         word [edx+eax*SIZEOF_JSAMPLE], bx
+    mov         word [esi+eax*SIZEOF_JSAMPLE], cx
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jquant-3dn.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jquant-3dn.asm
new file mode 100644
index 0000000000..5cb60caa94
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jquant-3dn.asm
@@ -0,0 +1,230 @@
+;
+; jquant.asm - sample data conversion and quantization (3DNow! & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_3dnow(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                            FAST_FLOAT *workspace);
+;
+
+%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
+%define start_col    ebp + 12           ; JDIMENSION start_col
+%define workspace    ebp + 16           ; FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_float_3dnow)
+
+EXTN(jsimd_convsamp_float_3dnow):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    pcmpeqw     mm7, mm7
+    psllw       mm7, 7
+    packsswb    mm7, mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]
+    mov         edi, POINTER [workspace]       ; (DCTELEM *)
+    mov         ecx, DCTSIZE/2
+    alignx      16, 7
+.convloop:
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    psubb       mm0, mm7                ; mm0=(01234567)
+    psubb       mm1, mm7                ; mm1=(89ABCDEF)
+
+    punpcklbw   mm2, mm0                ; mm2=(*0*1*2*3)
+    punpckhbw   mm0, mm0                ; mm0=(*4*5*6*7)
+    punpcklbw   mm3, mm1                ; mm3=(*8*9*A*B)
+    punpckhbw   mm1, mm1                ; mm1=(*C*D*E*F)
+
+    punpcklwd   mm4, mm2                ; mm4=(***0***1)
+    punpckhwd   mm2, mm2                ; mm2=(***2***3)
+    punpcklwd   mm5, mm0                ; mm5=(***4***5)
+    punpckhwd   mm0, mm0                ; mm0=(***6***7)
+
+    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(01)
+    psrad       mm2, (DWORD_BIT-BYTE_BIT)  ; mm2=(23)
+    pi2fd       mm4, mm4
+    pi2fd       mm2, mm2
+    psrad       mm5, (DWORD_BIT-BYTE_BIT)  ; mm5=(45)
+    psrad       mm0, (DWORD_BIT-BYTE_BIT)  ; mm0=(67)
+    pi2fd       mm5, mm5
+    pi2fd       mm0, mm0
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
+    movq        MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+    movq        MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+
+    punpcklwd   mm6, mm3                ; mm6=(***8***9)
+    punpckhwd   mm3, mm3                ; mm3=(***A***B)
+    punpcklwd   mm4, mm1                ; mm4=(***C***D)
+    punpckhwd   mm1, mm1                ; mm1=(***E***F)
+
+    psrad       mm6, (DWORD_BIT-BYTE_BIT)  ; mm6=(89)
+    psrad       mm3, (DWORD_BIT-BYTE_BIT)  ; mm3=(AB)
+    pi2fd       mm6, mm6
+    pi2fd       mm3, mm3
+    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(CD)
+    psrad       mm1, (DWORD_BIT-BYTE_BIT)  ; mm1=(EF)
+    pi2fd       mm4, mm4
+    pi2fd       mm1, mm1
+
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
+    movq        MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
+    movq        MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+
+    add         esi, byte 2*SIZEOF_JSAMPROW
+    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+    dec         ecx
+    jnz         near .convloop
+
+    femms                               ; empty MMX/3DNow! state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_3dnow(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+;                            FAST_FLOAT *workspace);
+;
+
+%define coef_block  ebp + 8             ; JCOEFPTR coef_block
+%define divisors    ebp + 12            ; FAST_FLOAT *divisors
+%define workspace   ebp + 16            ; FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_float_3dnow)
+
+EXTN(jsimd_quantize_float_3dnow):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         eax, 0x4B400000         ; (float)0x00C00000 (rndint_magic)
+    movd        mm7, eax
+    punpckldq   mm7, mm7                ; mm7={12582912.0F 12582912.0F}
+
+    mov         esi, POINTER [workspace]
+    mov         edx, POINTER [divisors]
+    mov         edi, JCOEFPTR [coef_block]
+    mov         eax, DCTSIZE2/16
+    alignx      16, 7
+.quantloop:
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+    pfmul       mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    pfmul       mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
+    pfmul       mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+    pfmul       mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+
+    pfadd       mm0, mm7                ; mm0=(00 ** 01 **)
+    pfadd       mm1, mm7                ; mm1=(02 ** 03 **)
+    pfadd       mm2, mm7                ; mm0=(04 ** 05 **)
+    pfadd       mm3, mm7                ; mm1=(06 ** 07 **)
+
+    movq        mm4, mm0
+    punpcklwd   mm0, mm1                ; mm0=(00 02 ** **)
+    punpckhwd   mm4, mm1                ; mm4=(01 03 ** **)
+    movq        mm5, mm2
+    punpcklwd   mm2, mm3                ; mm2=(04 06 ** **)
+    punpckhwd   mm5, mm3                ; mm5=(05 07 ** **)
+
+    punpcklwd   mm0, mm4                ; mm0=(00 01 02 03)
+    punpcklwd   mm2, mm5                ; mm2=(04 05 06 07)
+
+    movq        mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+    pfmul       mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    pfmul       mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
+    pfmul       mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+    pfmul       mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+    pfadd       mm6, mm7                ; mm0=(10 ** 11 **)
+    pfadd       mm1, mm7                ; mm4=(12 ** 13 **)
+    pfadd       mm3, mm7                ; mm0=(14 ** 15 **)
+    pfadd       mm4, mm7                ; mm4=(16 ** 17 **)
+
+    movq        mm5, mm6
+    punpcklwd   mm6, mm1                ; mm6=(10 12 ** **)
+    punpckhwd   mm5, mm1                ; mm5=(11 13 ** **)
+    movq        mm1, mm3
+    punpcklwd   mm3, mm4                ; mm3=(14 16 ** **)
+    punpckhwd   mm1, mm4                ; mm1=(15 17 ** **)
+
+    punpcklwd   mm6, mm5                ; mm6=(10 11 12 13)
+    punpcklwd   mm3, mm1                ; mm3=(14 15 16 17)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+    add         esi, byte 16*SIZEOF_FAST_FLOAT
+    add         edx, byte 16*SIZEOF_FAST_FLOAT
+    add         edi, byte 16*SIZEOF_JCOEF
+    dec         eax
+    jnz         near .quantloop
+
+    femms                               ; empty MMX/3DNow! state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jquant-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jquant-mmx.asm
new file mode 100644
index 0000000000..61305c625d
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jquant-mmx.asm
@@ -0,0 +1,276 @@
+;
+; jquant.asm - sample data conversion and quantization (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_mmx(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                    DCTELEM *workspace);
+;
+
+%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
+%define start_col    ebp + 12           ; JDIMENSION start_col
+%define workspace    ebp + 16           ; DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_mmx)
+
+EXTN(jsimd_convsamp_mmx):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    pxor        mm6, mm6                ; mm6=(all 0's)
+    pcmpeqw     mm7, mm7
+    psllw       mm7, 7                  ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+
+    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]
+    mov         edi, POINTER [workspace]       ; (DCTELEM *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.convloop:
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]   ; mm0=(01234567)
+    movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]   ; mm1=(89ABCDEF)
+
+    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE]   ; mm2=(GHIJKLMN)
+    movq        mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE]   ; mm3=(OPQRSTUV)
+
+    movq        mm4, mm0
+    punpcklbw   mm0, mm6                ; mm0=(0123)
+    punpckhbw   mm4, mm6                ; mm4=(4567)
+    movq        mm5, mm1
+    punpcklbw   mm1, mm6                ; mm1=(89AB)
+    punpckhbw   mm5, mm6                ; mm5=(CDEF)
+
+    paddw       mm0, mm7
+    paddw       mm4, mm7
+    paddw       mm1, mm7
+    paddw       mm5, mm7
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
+
+    movq        mm0, mm2
+    punpcklbw   mm2, mm6                ; mm2=(GHIJ)
+    punpckhbw   mm0, mm6                ; mm0=(KLMN)
+    movq        mm4, mm3
+    punpcklbw   mm3, mm6                ; mm3=(OPQR)
+    punpckhbw   mm4, mm6                ; mm4=(STUV)
+
+    paddw       mm2, mm7
+    paddw       mm0, mm7
+    paddw       mm3, mm7
+    paddw       mm4, mm7
+
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
+    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
+    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
+
+    add         esi, byte 4*SIZEOF_JSAMPROW
+    add         edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+    dec         ecx
+    jnz         short .convloop
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_mmx(JCOEFPTR coef_block, DCTELEM *divisors,
+;                    DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m, n, b) \
+  MMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+  MMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+  MMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SHIFT(m, n, b) \
+  MMBLOCK(DCTSIZE * 3 + (m), (n), (b), SIZEOF_DCTELEM)
+
+%define coef_block  ebp + 8             ; JCOEFPTR coef_block
+%define divisors    ebp + 12            ; DCTELEM *divisors
+%define workspace   ebp + 16            ; DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_mmx)
+
+EXTN(jsimd_quantize_mmx):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, POINTER [workspace]
+    mov         edx, POINTER [divisors]
+    mov         edi, JCOEFPTR [coef_block]
+    mov         ah, 2
+    alignx      16, 7
+.quantloop1:
+    mov         al, DCTSIZE2/8/2
+    alignx      16, 7
+.quantloop2:
+    movq        mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
+
+    movq        mm0, mm2
+    movq        mm1, mm3
+
+    psraw       mm2, (WORD_BIT-1)       ; -1 if value < 0, 0 otherwise
+    psraw       mm3, (WORD_BIT-1)
+
+    pxor        mm0, mm2                ; val = -val
+    pxor        mm1, mm3
+    psubw       mm0, mm2
+    psubw       mm1, mm3
+
+    ;
+    ; MMX is an annoyingly crappy instruction set. It has two
+    ; misfeatures that are causing problems here:
+    ;
+    ; - All multiplications are signed.
+    ;
+    ; - The second operand for the shifts is not treated as packed.
+    ;
+    ;
+    ; We work around the first problem by implementing this algorithm:
+    ;
+    ; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
+    ; {
+    ;   enum { SHORT_BIT = 16 };
+    ;   signed short sx = (signed short)x;
+    ;   signed short sy = (signed short)y;
+    ;   signed long sz;
+    ;
+    ;   sz = (long)sx * (long)sy;    /* signed multiply */
+    ;
+    ;   if (sx < 0) sz += (long)sy << SHORT_BIT;
+    ;   if (sy < 0) sz += (long)sx << SHORT_BIT;
+    ;
+    ;   return (unsigned long)sz;
+    ; }
+    ;
+    ; (note that a negative sx adds _sy_ and vice versa)
+    ;
+    ; For the second problem, we replace the shift by a multiplication.
+    ; Unfortunately that means we have to deal with the signed issue again.
+    ;
+
+    paddw       mm0, MMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
+    paddw       mm1, MMWORD [CORRECTION(0,1,edx)]
+
+    movq        mm4, mm0                ; store current value for later
+    movq        mm5, mm1
+    pmulhw      mm0, MMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
+    pmulhw      mm1, MMWORD [RECIPROCAL(0,1,edx)]
+    paddw       mm0, mm4  ; reciprocal is always negative (MSB=1),
+    paddw       mm1, mm5  ; so we always need to add the initial value
+                          ; (input value is never negative as we
+                          ; inverted it at the start of this routine)
+
+    ; here it gets a bit tricky as both scale
+    ; and mm0/mm1 can be negative
+    movq        mm6, MMWORD [SCALE(0,0,edx)]  ; scale
+    movq        mm7, MMWORD [SCALE(0,1,edx)]
+    movq        mm4, mm0
+    movq        mm5, mm1
+    pmulhw      mm0, mm6
+    pmulhw      mm1, mm7
+
+    psraw       mm6, (WORD_BIT-1)       ; determine if scale is negative
+    psraw       mm7, (WORD_BIT-1)
+
+    pand        mm6, mm4                ; and add input if it is
+    pand        mm7, mm5
+    paddw       mm0, mm6
+    paddw       mm1, mm7
+
+    psraw       mm4, (WORD_BIT-1)       ; then check if negative input
+    psraw       mm5, (WORD_BIT-1)
+
+    pand        mm4, MMWORD [SCALE(0,0,edx)]  ; and add scale if it is
+    pand        mm5, MMWORD [SCALE(0,1,edx)]
+    paddw       mm0, mm4
+    paddw       mm1, mm5
+
+    pxor        mm0, mm2                ; val = -val
+    pxor        mm1, mm3
+    psubw       mm0, mm2
+    psubw       mm1, mm3
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
+
+    add         esi, byte 8*SIZEOF_DCTELEM
+    add         edx, byte 8*SIZEOF_DCTELEM
+    add         edi, byte 8*SIZEOF_JCOEF
+    dec         al
+    jnz         near .quantloop2
+    dec         ah
+    jnz         near .quantloop1        ; to avoid branch misprediction
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jquant-sse.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jquant-sse.asm
new file mode 100644
index 0000000000..218adc976f
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jquant-sse.asm
@@ -0,0 +1,208 @@
+;
+; jquant.asm - sample data conversion and quantization (SSE & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                          FAST_FLOAT *workspace);
+;
+
+%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
+%define start_col    ebp + 12           ; JDIMENSION start_col
+%define workspace    ebp + 16           ; FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_float_sse)
+
+EXTN(jsimd_convsamp_float_sse):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    pcmpeqw     mm7, mm7
+    psllw       mm7, 7
+    packsswb    mm7, mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]
+    mov         edi, POINTER [workspace]       ; (DCTELEM *)
+    mov         ecx, DCTSIZE/2
+    alignx      16, 7
+.convloop:
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    psubb       mm0, mm7                ; mm0=(01234567)
+    psubb       mm1, mm7                ; mm1=(89ABCDEF)
+
+    punpcklbw   mm2, mm0                ; mm2=(*0*1*2*3)
+    punpckhbw   mm0, mm0                ; mm0=(*4*5*6*7)
+    punpcklbw   mm3, mm1                ; mm3=(*8*9*A*B)
+    punpckhbw   mm1, mm1                ; mm1=(*C*D*E*F)
+
+    punpcklwd   mm4, mm2                ; mm4=(***0***1)
+    punpckhwd   mm2, mm2                ; mm2=(***2***3)
+    punpcklwd   mm5, mm0                ; mm5=(***4***5)
+    punpckhwd   mm0, mm0                ; mm0=(***6***7)
+
+    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(01)
+    psrad       mm2, (DWORD_BIT-BYTE_BIT)  ; mm2=(23)
+    cvtpi2ps    xmm0, mm4                  ; xmm0=(01**)
+    cvtpi2ps    xmm1, mm2                  ; xmm1=(23**)
+    psrad       mm5, (DWORD_BIT-BYTE_BIT)  ; mm5=(45)
+    psrad       mm0, (DWORD_BIT-BYTE_BIT)  ; mm0=(67)
+    cvtpi2ps    xmm2, mm5                  ; xmm2=(45**)
+    cvtpi2ps    xmm3, mm0                  ; xmm3=(67**)
+
+    punpcklwd   mm6, mm3                ; mm6=(***8***9)
+    punpckhwd   mm3, mm3                ; mm3=(***A***B)
+    punpcklwd   mm4, mm1                ; mm4=(***C***D)
+    punpckhwd   mm1, mm1                ; mm1=(***E***F)
+
+    psrad       mm6, (DWORD_BIT-BYTE_BIT)  ; mm6=(89)
+    psrad       mm3, (DWORD_BIT-BYTE_BIT)  ; mm3=(AB)
+    cvtpi2ps    xmm4, mm6                  ; xmm4=(89**)
+    cvtpi2ps    xmm5, mm3                  ; xmm5=(AB**)
+    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(CD)
+    psrad       mm1, (DWORD_BIT-BYTE_BIT)  ; mm1=(EF)
+    cvtpi2ps    xmm6, mm4                  ; xmm6=(CD**)
+    cvtpi2ps    xmm7, mm1                  ; xmm7=(EF**)
+
+    movlhps     xmm0, xmm1              ; xmm0=(0123)
+    movlhps     xmm2, xmm3              ; xmm2=(4567)
+    movlhps     xmm4, xmm5              ; xmm4=(89AB)
+    movlhps     xmm6, xmm7              ; xmm6=(CDEF)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
+    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+
+    add         esi, byte 2*SIZEOF_JSAMPROW
+    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+    dec         ecx
+    jnz         near .convloop
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+;                          FAST_FLOAT *workspace);
+;
+
+%define coef_block  ebp + 8             ; JCOEFPTR coef_block
+%define divisors    ebp + 12            ; FAST_FLOAT *divisors
+%define workspace   ebp + 16            ; FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_float_sse)
+
+EXTN(jsimd_quantize_float_sse):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, POINTER [workspace]
+    mov         edx, POINTER [divisors]
+    mov         edi, JCOEFPTR [coef_block]
+    mov         eax, DCTSIZE2/16
+    alignx      16, 7
+.quantloop:
+    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+    movhlps     xmm4, xmm0
+    movhlps     xmm5, xmm1
+
+    cvtps2pi    mm0, xmm0
+    cvtps2pi    mm1, xmm1
+    cvtps2pi    mm4, xmm4
+    cvtps2pi    mm5, xmm5
+
+    movhlps     xmm6, xmm2
+    movhlps     xmm7, xmm3
+
+    cvtps2pi    mm2, xmm2
+    cvtps2pi    mm3, xmm3
+    cvtps2pi    mm6, xmm6
+    cvtps2pi    mm7, xmm7
+
+    packssdw    mm0, mm4
+    packssdw    mm1, mm5
+    packssdw    mm2, mm6
+    packssdw    mm3, mm7
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+    add         esi, byte 16*SIZEOF_FAST_FLOAT
+    add         edx, byte 16*SIZEOF_FAST_FLOAT
+    add         edi, byte 16*SIZEOF_JCOEF
+    dec         eax
+    jnz         short .quantloop
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jquantf-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jquantf-sse2.asm
new file mode 100644
index 0000000000..a881ab50f9
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jquantf-sse2.asm
@@ -0,0 +1,168 @@
+;
+; jquantf.asm - sample data conversion and quantization (SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                           FAST_FLOAT *workspace);
+;
+
+%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
+%define start_col    ebp + 12           ; JDIMENSION start_col
+%define workspace    ebp + 16           ; FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)
+
+EXTN(jsimd_convsamp_float_sse2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    pcmpeqw     xmm7, xmm7
+    psllw       xmm7, 7
+    packsswb    xmm7, xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]
+    mov         edi, POINTER [workspace]       ; (DCTELEM *)
+    mov         ecx, DCTSIZE/2
+    alignx      16, 7
+.convloop:
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    psubb       xmm0, xmm7              ; xmm0=(01234567)
+    psubb       xmm1, xmm7              ; xmm1=(89ABCDEF)
+
+    punpcklbw   xmm0, xmm0              ; xmm0=(*0*1*2*3*4*5*6*7)
+    punpcklbw   xmm1, xmm1              ; xmm1=(*8*9*A*B*C*D*E*F)
+
+    punpcklwd   xmm2, xmm0              ; xmm2=(***0***1***2***3)
+    punpckhwd   xmm0, xmm0              ; xmm0=(***4***5***6***7)
+    punpcklwd   xmm3, xmm1              ; xmm3=(***8***9***A***B)
+    punpckhwd   xmm1, xmm1              ; xmm1=(***C***D***E***F)
+
+    psrad       xmm2, (DWORD_BIT-BYTE_BIT)  ; xmm2=(0123)
+    psrad       xmm0, (DWORD_BIT-BYTE_BIT)  ; xmm0=(4567)
+    cvtdq2ps    xmm2, xmm2                  ; xmm2=(0123)
+    cvtdq2ps    xmm0, xmm0                  ; xmm0=(4567)
+    psrad       xmm3, (DWORD_BIT-BYTE_BIT)  ; xmm3=(89AB)
+    psrad       xmm1, (DWORD_BIT-BYTE_BIT)  ; xmm1=(CDEF)
+    cvtdq2ps    xmm3, xmm3                  ; xmm3=(89AB)
+    cvtdq2ps    xmm1, xmm1                  ; xmm1=(CDEF)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+
+    add         esi, byte 2*SIZEOF_JSAMPROW
+    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+    dec         ecx
+    jnz         short .convloop
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+;                           FAST_FLOAT *workspace);
+;
+
+%define coef_block  ebp + 8             ; JCOEFPTR coef_block
+%define divisors    ebp + 12            ; FAST_FLOAT *divisors
+%define workspace   ebp + 16            ; FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_float_sse2)
+
+EXTN(jsimd_quantize_float_sse2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, POINTER [workspace]
+    mov         edx, POINTER [divisors]
+    mov         edi, JCOEFPTR [coef_block]
+    mov         eax, DCTSIZE2/16
+    alignx      16, 7
+.quantloop:
+    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+    cvtps2dq    xmm0, xmm0
+    cvtps2dq    xmm1, xmm1
+    cvtps2dq    xmm2, xmm2
+    cvtps2dq    xmm3, xmm3
+
+    packssdw    xmm0, xmm1
+    packssdw    xmm2, xmm3
+
+    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
+    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
+
+    add         esi, byte 16*SIZEOF_FAST_FLOAT
+    add         edx, byte 16*SIZEOF_FAST_FLOAT
+    add         edi, byte 16*SIZEOF_JCOEF
+    dec         eax
+    jnz         short .quantloop
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jquanti-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jquanti-avx2.asm
new file mode 100644
index 0000000000..5ed6bec246
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jquanti-avx2.asm
@@ -0,0 +1,188 @@
+;
+; jquanti.asm - sample data conversion and quantization (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2018, D. R. Commander.
+; Copyright (C) 2016, Matthieu Darbois.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                     DCTELEM *workspace);
+;
+
+%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
+%define start_col    ebp + 12           ; JDIMENSION start_col
+%define workspace    ebp + 16           ; DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_avx2)
+
+EXTN(jsimd_convsamp_avx2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]
+    mov         edi, POINTER [workspace]       ; (DCTELEM *)
+
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    mov         ebx, JSAMPROW [esi+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm4, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        xmm5, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    mov         ebx, JSAMPROW [esi+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm6, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        xmm7, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    vinserti128 ymm0, ymm0, xmm1, 1
+    vinserti128 ymm2, ymm2, xmm3, 1
+    vinserti128 ymm4, ymm4, xmm5, 1
+    vinserti128 ymm6, ymm6, xmm7, 1
+
+    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
+    vpunpcklbw  ymm0, ymm0, ymm1
+    vpunpcklbw  ymm2, ymm2, ymm1
+    vpunpcklbw  ymm4, ymm4, ymm1
+    vpunpcklbw  ymm6, ymm6, ymm1
+
+    vpcmpeqw    ymm7, ymm7, ymm7
+    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm2, ymm2, ymm7
+    vpaddw      ymm4, ymm4, ymm7
+    vpaddw      ymm6, ymm6, ymm7
+
+    vmovdqu     YMMWORD [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
+    vmovdqu     YMMWORD [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm2
+    vmovdqu     YMMWORD [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm4
+    vmovdqu     YMMWORD [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm6
+
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
+;                     DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m, n, b) \
+  YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+  YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+  YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+%define coef_block  ebp + 8             ; JCOEFPTR coef_block
+%define divisors    ebp + 12            ; DCTELEM *divisors
+%define workspace   ebp + 16            ; DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_avx2)
+
+EXTN(jsimd_quantize_avx2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, POINTER [workspace]
+    mov         edx, POINTER [divisors]
+    mov         edi, JCOEFPTR [coef_block]
+
+    vmovdqu     ymm4, [YMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+    vmovdqu     ymm5, [YMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+    vmovdqu     ymm6, [YMMBLOCK(4,0,esi,SIZEOF_DCTELEM)]
+    vmovdqu     ymm7, [YMMBLOCK(6,0,esi,SIZEOF_DCTELEM)]
+    vpabsw      ymm0, ymm4
+    vpabsw      ymm1, ymm5
+    vpabsw      ymm2, ymm6
+    vpabsw      ymm3, ymm7
+
+    vpaddw      ymm0, YMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
+    vpaddw      ymm1, YMMWORD [CORRECTION(2,0,edx)]
+    vpaddw      ymm2, YMMWORD [CORRECTION(4,0,edx)]
+    vpaddw      ymm3, YMMWORD [CORRECTION(6,0,edx)]
+    vpmulhuw    ymm0, YMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
+    vpmulhuw    ymm1, YMMWORD [RECIPROCAL(2,0,edx)]
+    vpmulhuw    ymm2, YMMWORD [RECIPROCAL(4,0,edx)]
+    vpmulhuw    ymm3, YMMWORD [RECIPROCAL(6,0,edx)]
+    vpmulhuw    ymm0, YMMWORD [SCALE(0,0,edx)]       ; scale
+    vpmulhuw    ymm1, YMMWORD [SCALE(2,0,edx)]
+    vpmulhuw    ymm2, YMMWORD [SCALE(4,0,edx)]
+    vpmulhuw    ymm3, YMMWORD [SCALE(6,0,edx)]
+
+    vpsignw     ymm0, ymm0, ymm4
+    vpsignw     ymm1, ymm1, ymm5
+    vpsignw     ymm2, ymm2, ymm6
+    vpsignw     ymm3, ymm3, ymm7
+
+    vmovdqu     [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
+    vmovdqu     [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1
+    vmovdqu     [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2
+    vmovdqu     [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3
+
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jquanti-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jquanti-sse2.asm
new file mode 100644
index 0000000000..0a509408aa
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jquanti-sse2.asm
@@ -0,0 +1,201 @@
+;
+; jquanti.asm - sample data conversion and quantization (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                     DCTELEM *workspace);
+;
+
+%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
+%define start_col    ebp + 12           ; JDIMENSION start_col
+%define workspace    ebp + 16           ; DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_sse2)
+
+EXTN(jsimd_convsamp_sse2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    pxor        xmm6, xmm6              ; xmm6=(all 0's)
+    pcmpeqw     xmm7, xmm7
+    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]
+    mov         edi, POINTER [workspace]       ; (DCTELEM *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.convloop:
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm0=(01234567)
+    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm1=(89ABCDEF)
+
+    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm2=(GHIJKLMN)
+    movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm3=(OPQRSTUV)
+
+    punpcklbw   xmm0, xmm6              ; xmm0=(01234567)
+    punpcklbw   xmm1, xmm6              ; xmm1=(89ABCDEF)
+    paddw       xmm0, xmm7
+    paddw       xmm1, xmm7
+    punpcklbw   xmm2, xmm6              ; xmm2=(GHIJKLMN)
+    punpcklbw   xmm3, xmm6              ; xmm3=(OPQRSTUV)
+    paddw       xmm2, xmm7
+    paddw       xmm3, xmm7
+
+    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+    movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+    movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+    add         esi, byte 4*SIZEOF_JSAMPROW
+    add         edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+    dec         ecx
+    jnz         short .convloop
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
+;                     DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m, n, b) \
+  XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+  XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+  XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+%define coef_block  ebp + 8             ; JCOEFPTR coef_block
+%define divisors    ebp + 12            ; DCTELEM *divisors
+%define workspace   ebp + 16            ; DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_sse2)
+
+EXTN(jsimd_quantize_sse2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, POINTER [workspace]
+    mov         edx, POINTER [divisors]
+    mov         edi, JCOEFPTR [coef_block]
+    mov         eax, DCTSIZE2/32
+    alignx      16, 7
+.quantloop:
+    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+    movdqa      xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
+    movdqa      xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+    movdqa      xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
+    movdqa      xmm0, xmm4
+    movdqa      xmm1, xmm5
+    movdqa      xmm2, xmm6
+    movdqa      xmm3, xmm7
+    psraw       xmm4, (WORD_BIT-1)
+    psraw       xmm5, (WORD_BIT-1)
+    psraw       xmm6, (WORD_BIT-1)
+    psraw       xmm7, (WORD_BIT-1)
+    pxor        xmm0, xmm4
+    pxor        xmm1, xmm5
+    pxor        xmm2, xmm6
+    pxor        xmm3, xmm7
+    psubw       xmm0, xmm4              ; if (xmm0 < 0) xmm0 = -xmm0;
+    psubw       xmm1, xmm5              ; if (xmm1 < 0) xmm1 = -xmm1;
+    psubw       xmm2, xmm6              ; if (xmm2 < 0) xmm2 = -xmm2;
+    psubw       xmm3, xmm7              ; if (xmm3 < 0) xmm3 = -xmm3;
+
+    paddw       xmm0, XMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
+    paddw       xmm1, XMMWORD [CORRECTION(1,0,edx)]
+    paddw       xmm2, XMMWORD [CORRECTION(2,0,edx)]
+    paddw       xmm3, XMMWORD [CORRECTION(3,0,edx)]
+    pmulhuw     xmm0, XMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
+    pmulhuw     xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
+    pmulhuw     xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
+    pmulhuw     xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
+    pmulhuw     xmm0, XMMWORD [SCALE(0,0,edx)]       ; scale
+    pmulhuw     xmm1, XMMWORD [SCALE(1,0,edx)]
+    pmulhuw     xmm2, XMMWORD [SCALE(2,0,edx)]
+    pmulhuw     xmm3, XMMWORD [SCALE(3,0,edx)]
+
+    pxor        xmm0, xmm4
+    pxor        xmm1, xmm5
+    pxor        xmm2, xmm6
+    pxor        xmm3, xmm7
+    psubw       xmm0, xmm4
+    psubw       xmm1, xmm5
+    psubw       xmm2, xmm6
+    psubw       xmm3, xmm7
+    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+    movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+    movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+    add         esi, byte 32*SIZEOF_DCTELEM
+    add         edx, byte 32*SIZEOF_DCTELEM
+    add         edi, byte 32*SIZEOF_JCOEF
+    dec         eax
+    jnz         near .quantloop
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jsimd.c b/3rdparty/libjpeg-turbo/src/simd/i386/jsimd.c
new file mode 100644
index 0000000000..80bc821ff4
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jsimd.c
@@ -0,0 +1,1246 @@
+/*
+ * jsimd_i386.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 32-bit x86 architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "jconfigint.h"
+
+/*
+ * In the PIC cases, we have no guarantee that constants will keep
+ * their alignment. This macro allows us to verify it at runtime.
+ */
+#define IS_ALIGNED(ptr, order)  (((unsigned)ptr & ((1 << order) - 1)) == 0)
+
+#define IS_ALIGNED_SSE(ptr)  (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
+#define IS_ALIGNED_AVX(ptr)  (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */
+
+static unsigned int simd_support = (unsigned int)(~0);
+static unsigned int simd_huffman = 1;
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char env[2] = { 0 };
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = jpeg_simd_cpu_support();
+
+#ifndef NO_GETENV
+  /* Force different settings through environment variables */
+  if (!GETENV_S(env, 2, "JSIMD_FORCEMMX") && !strcmp(env, "1"))
+    simd_support &= JSIMD_MMX;
+  if (!GETENV_S(env, 2, "JSIMD_FORCE3DNOW") && !strcmp(env, "1"))
+    simd_support &= JSIMD_3DNOW | JSIMD_MMX;
+  if (!GETENV_S(env, 2, "JSIMD_FORCESSE") && !strcmp(env, "1"))
+    simd_support &= JSIMD_SSE | JSIMD_MMX;
+  if (!GETENV_S(env, 2, "JSIMD_FORCESSE2") && !strcmp(env, "1"))
+    simd_support &= JSIMD_SSE2;
+  if (!GETENV_S(env, 2, "JSIMD_FORCEAVX2") && !strcmp(env, "1"))
+    simd_support &= JSIMD_AVX2;
+  if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
+    simd_support = 0;
+  if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
+    simd_huffman = 0;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_rgb_ycc_convert_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_rgb_gray_convert_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_ycc_rgb_convert_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_extrgb_ycc_convert_avx2;
+    sse2fct = jsimd_extrgb_ycc_convert_sse2;
+    mmxfct = jsimd_extrgb_ycc_convert_mmx;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_extrgbx_ycc_convert_avx2;
+    sse2fct = jsimd_extrgbx_ycc_convert_sse2;
+    mmxfct = jsimd_extrgbx_ycc_convert_mmx;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_extbgr_ycc_convert_avx2;
+    sse2fct = jsimd_extbgr_ycc_convert_sse2;
+    mmxfct = jsimd_extbgr_ycc_convert_mmx;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_extbgrx_ycc_convert_avx2;
+    sse2fct = jsimd_extbgrx_ycc_convert_sse2;
+    mmxfct = jsimd_extbgrx_ycc_convert_mmx;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_extxbgr_ycc_convert_avx2;
+    sse2fct = jsimd_extxbgr_ycc_convert_sse2;
+    mmxfct = jsimd_extxbgr_ycc_convert_mmx;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_extxrgb_ycc_convert_avx2;
+    sse2fct = jsimd_extxrgb_ycc_convert_sse2;
+    mmxfct = jsimd_extxrgb_ycc_convert_mmx;
+    break;
+  default:
+    avx2fct = jsimd_rgb_ycc_convert_avx2;
+    sse2fct = jsimd_rgb_ycc_convert_sse2;
+    mmxfct = jsimd_rgb_ycc_convert_mmx;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else if (simd_support & JSIMD_SSE2)
+    sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else
+    mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_extrgb_gray_convert_avx2;
+    sse2fct = jsimd_extrgb_gray_convert_sse2;
+    mmxfct = jsimd_extrgb_gray_convert_mmx;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_extrgbx_gray_convert_avx2;
+    sse2fct = jsimd_extrgbx_gray_convert_sse2;
+    mmxfct = jsimd_extrgbx_gray_convert_mmx;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_extbgr_gray_convert_avx2;
+    sse2fct = jsimd_extbgr_gray_convert_sse2;
+    mmxfct = jsimd_extbgr_gray_convert_mmx;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_extbgrx_gray_convert_avx2;
+    sse2fct = jsimd_extbgrx_gray_convert_sse2;
+    mmxfct = jsimd_extbgrx_gray_convert_mmx;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_extxbgr_gray_convert_avx2;
+    sse2fct = jsimd_extxbgr_gray_convert_sse2;
+    mmxfct = jsimd_extxbgr_gray_convert_mmx;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_extxrgb_gray_convert_avx2;
+    sse2fct = jsimd_extxrgb_gray_convert_sse2;
+    mmxfct = jsimd_extxrgb_gray_convert_mmx;
+    break;
+  default:
+    avx2fct = jsimd_rgb_gray_convert_avx2;
+    sse2fct = jsimd_rgb_gray_convert_sse2;
+    mmxfct = jsimd_rgb_gray_convert_mmx;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else if (simd_support & JSIMD_SSE2)
+    sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else
+    mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+  void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_ycc_extrgb_convert_avx2;
+    sse2fct = jsimd_ycc_extrgb_convert_sse2;
+    mmxfct = jsimd_ycc_extrgb_convert_mmx;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_ycc_extrgbx_convert_avx2;
+    sse2fct = jsimd_ycc_extrgbx_convert_sse2;
+    mmxfct = jsimd_ycc_extrgbx_convert_mmx;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_ycc_extbgr_convert_avx2;
+    sse2fct = jsimd_ycc_extbgr_convert_sse2;
+    mmxfct = jsimd_ycc_extbgr_convert_mmx;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_ycc_extbgrx_convert_avx2;
+    sse2fct = jsimd_ycc_extbgrx_convert_sse2;
+    mmxfct = jsimd_ycc_extbgrx_convert_mmx;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_ycc_extxbgr_convert_avx2;
+    sse2fct = jsimd_ycc_extxbgr_convert_sse2;
+    mmxfct = jsimd_ycc_extxbgr_convert_mmx;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_ycc_extxrgb_convert_avx2;
+    sse2fct = jsimd_ycc_extxrgb_convert_sse2;
+    mmxfct = jsimd_ycc_extxrgb_convert_mmx;
+    break;
+  default:
+    avx2fct = jsimd_ycc_rgb_convert_avx2;
+    sse2fct = jsimd_ycc_rgb_convert_sse2;
+    mmxfct = jsimd_ycc_rgb_convert_mmx;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+  else if (simd_support & JSIMD_SSE2)
+    sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+  else
+    mmxfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+  else
+    jsimd_h2v2_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
+                              compptr->v_samp_factor, compptr->width_in_blocks,
+                              input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+  else
+    jsimd_h2v1_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
+                              compptr->v_samp_factor, compptr->width_in_blocks,
+                              input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+  else
+    jsimd_h2v2_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width,
+                            input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+  else
+    jsimd_h2v1_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width,
+                            input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else
+    jsimd_h2v2_fancy_upsample_mmx(cinfo->max_v_samp_factor,
+                                  compptr->downsampled_width, input_data,
+                                  output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else
+    jsimd_h2v1_fancy_upsample_mmx(cinfo->max_v_samp_factor,
+                                  compptr->downsampled_width, input_data,
+                                  output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extrgb_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extrgb_merged_upsample_mmx;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_h2v2_extrgbx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extrgbx_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extrgbx_merged_upsample_mmx;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_h2v2_extbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extbgr_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extbgr_merged_upsample_mmx;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_h2v2_extbgrx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extbgrx_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extbgrx_merged_upsample_mmx;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_h2v2_extxbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extxbgr_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extxbgr_merged_upsample_mmx;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_h2v2_extxrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extxrgb_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extxrgb_merged_upsample_mmx;
+    break;
+  default:
+    avx2fct = jsimd_h2v2_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_merged_upsample_mmx;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else if (simd_support & JSIMD_SSE2)
+    sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else
+    mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extrgb_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extrgb_merged_upsample_mmx;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_h2v1_extrgbx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extrgbx_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extrgbx_merged_upsample_mmx;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_h2v1_extbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extbgr_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extbgr_merged_upsample_mmx;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_h2v1_extbgrx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extbgrx_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extbgrx_merged_upsample_mmx;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_h2v1_extxbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extxbgr_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extxbgr_merged_upsample_mmx;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_h2v1_extxrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extxrgb_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extxrgb_merged_upsample_mmx;
+    break;
+  default:
+    avx2fct = jsimd_h2v1_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_merged_upsample_mmx;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else if (simd_support & JSIMD_SSE2)
+    sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else
+    mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_SSE)
+    return 1;
+  if (simd_support & JSIMD_3DNOW)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_convsamp_avx2(sample_data, start_col, workspace);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_convsamp_sse2(sample_data, start_col, workspace);
+  else
+    jsimd_convsamp_mmx(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+  if (simd_support & JSIMD_SSE2)
+    jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
+  else if (simd_support & JSIMD_SSE)
+    jsimd_convsamp_float_sse(sample_data, start_col, workspace);
+  else
+    jsimd_convsamp_float_3dnow(sample_data, start_col, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_fdct_islow_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
+    return 1;
+  if (simd_support & JSIMD_3DNOW)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_fdct_islow_avx2(data);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_fdct_islow_sse2(data);
+  else
+    jsimd_fdct_islow_mmx(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+    jsimd_fdct_ifast_sse2(data);
+  else
+    jsimd_fdct_ifast_mmx(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
+    jsimd_fdct_float_sse(data);
+  else if (simd_support & JSIMD_3DNOW)
+    jsimd_fdct_float_3dnow(data);
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_SSE)
+    return 1;
+  if (simd_support & JSIMD_3DNOW)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_quantize_avx2(coef_block, divisors, workspace);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_quantize_sse2(coef_block, divisors, workspace);
+  else
+    jsimd_quantize_mmx(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+  if (simd_support & JSIMD_SSE2)
+    jsimd_quantize_float_sse2(coef_block, divisors, workspace);
+  else if (simd_support & JSIMD_SSE)
+    jsimd_quantize_float_sse(coef_block, divisors, workspace);
+  else
+    jsimd_quantize_float_3dnow(coef_block, divisors, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+  else
+    jsimd_idct_2x2_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+  else
+    jsimd_idct_4x4_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_idct_islow_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+  if (sizeof(FLOAT_MULT_TYPE) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
+    return 1;
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
+    return 1;
+  if (simd_support & JSIMD_3DNOW)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+  else
+    jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf,
+                         output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+    jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+  else
+    jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf,
+                         output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
+    jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+  else if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
+    jsimd_idct_float_sse(compptr->dct_table, coef_block, output_buf,
+                         output_col);
+  else
+    jsimd_idct_float_3dnow(compptr->dct_table, coef_block, output_buf,
+                           output_col);
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && simd_huffman &&
+      IS_ALIGNED_SSE(jconst_huff_encode_one_block))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
+                                          dctbl, actbl);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (SIZEOF_SIZE_T != 4)
+    return 0;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, JCOEF *values, size_t *zerobits)
+{
+  jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start,
+                                         Sl, Al, values, zerobits);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (SIZEOF_SIZE_T != 4)
+    return 0;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, JCOEF *absvalues, size_t *bits)
+{
+  return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
+                                                 jpeg_natural_order_start,
+                                                 Sl, Al, absvalues, bits);
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jsimdcpu.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jsimdcpu.asm
new file mode 100644
index 0000000000..ddcafa9e21
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jsimdcpu.asm
@@ -0,0 +1,135 @@
+;
+; jsimdcpu.asm - SIMD instruction support check
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Check if the CPU supports SIMD instructions
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_cpu_support(void)
+;
+
+    align       32
+    GLOBAL_FUNCTION(jpeg_simd_cpu_support)
+
+EXTN(jpeg_simd_cpu_support):
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+    push        edi
+
+    xor         edi, edi                ; simd support flag
+
+    pushfd
+    pop         eax
+    mov         edx, eax
+    xor         eax, 1<<21              ; flip ID bit in EFLAGS
+    push        eax
+    popfd
+    pushfd
+    pop         eax
+    xor         eax, edx
+    jz          near .return            ; CPUID is not supported
+
+    ; Check whether CPUID leaf 07H is supported
+    ; (leaf 07H is used to check for AVX2 instruction support)
+    xor         eax, eax
+    cpuid
+    test        eax, eax
+    jz          near .return
+    cmp         eax, 7
+    jl          short .no_avx2          ; Maximum leaf < 07H
+
+    ; Check for AVX2 instruction support
+    mov         eax, 7
+    xor         ecx, ecx
+    cpuid
+    mov         eax, ebx
+    test        eax, 1<<5               ; bit5:AVX2
+    jz          short .no_avx2
+
+    ; Check for AVX2 O/S support
+    mov         eax, 1
+    xor         ecx, ecx
+    cpuid
+    test        ecx, 1<<27
+    jz          short .no_avx2          ; O/S does not support XSAVE
+    test        ecx, 1<<28
+    jz          short .no_avx2          ; CPU does not support AVX2
+
+    xor         ecx, ecx
+    xgetbv
+    and         eax, 6
+    cmp         eax, 6                  ; O/S does not manage XMM/YMM state
+                                        ; using XSAVE
+    jnz         short .no_avx2
+
+    or          edi, JSIMD_AVX2
+.no_avx2:
+
+    ; Check CPUID leaf 01H for MMX, SSE, and SSE2 support
+    xor         eax, eax
+    inc         eax
+    cpuid
+    mov         eax, edx                ; eax = Standard feature flags
+
+    ; Check for MMX instruction support
+    test        eax, 1<<23              ; bit23:MMX
+    jz          short .no_mmx
+    or          edi, byte JSIMD_MMX
+.no_mmx:
+    test        eax, 1<<25              ; bit25:SSE
+    jz          short .no_sse
+    or          edi, byte JSIMD_SSE
+.no_sse:
+    test        eax, 1<<26              ; bit26:SSE2
+    jz          short .no_sse2
+    or          edi, byte JSIMD_SSE2
+.no_sse2:
+
+    ; Check for 3DNow! instruction support
+    mov         eax, 0x80000000
+    cpuid
+    cmp         eax, 0x80000000
+    jbe         short .return
+
+    mov         eax, 0x80000001
+    cpuid
+    mov         eax, edx                ; eax = Extended feature flags
+
+    test        eax, 1<<31              ; bit31:3DNow!(vendor independent)
+    jz          short .no_3dnow
+    or          edi, byte JSIMD_3DNOW
+.no_3dnow:
+
+.return:
+    mov         eax, edi
+
+    pop         edi
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/jsimd.h b/3rdparty/libjpeg-turbo/src/simd/jsimd.h
new file mode 100644
index 0000000000..64747c6360
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/jsimd.h
@@ -0,0 +1,1258 @@
+/*
+ * simd/jsimd.h
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2011, 2014-2016, 2018, 2020, D. R. Commander.
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ * Copyright (C) 2014, Linaro Limited.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * Copyright (C) 2020, Arm Limited.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ */
+
+/* Bitmask for supported acceleration methods */
+
+#define JSIMD_NONE     0x00
+#define JSIMD_MMX      0x01
+#define JSIMD_3DNOW    0x02
+#define JSIMD_SSE      0x04
+#define JSIMD_SSE2     0x08
+#define JSIMD_NEON     0x10
+#define JSIMD_DSPR2    0x20
+#define JSIMD_ALTIVEC  0x40
+#define JSIMD_AVX2     0x80
+#define JSIMD_MMI      0x100
+
+/* SIMD Ext: retrieve SIMD/CPU information */
+EXTERN(unsigned int) jpeg_simd_cpu_support(void);
+
+/* RGB & extended RGB --> YCC Colorspace Conversion */
+EXTERN(void) jsimd_rgb_ycc_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+extern const int jconst_rgb_ycc_convert_sse2[];
+EXTERN(void) jsimd_rgb_ycc_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+extern const int jconst_rgb_ycc_convert_avx2[];
+EXTERN(void) jsimd_rgb_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_ycc_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+#ifndef NEON_INTRINSICS
+
+EXTERN(void) jsimd_extrgb_ycc_convert_neon_slowld3
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_neon_slowld3
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+#endif
+
+EXTERN(void) jsimd_rgb_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_ycc_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+/* RGB & extended RGB --> Grayscale Colorspace Conversion */
+EXTERN(void) jsimd_rgb_gray_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_mmx
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+extern const int jconst_rgb_gray_convert_sse2[];
+EXTERN(void) jsimd_rgb_gray_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_sse2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+extern const int jconst_rgb_gray_convert_avx2[];
+EXTERN(void) jsimd_rgb_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_gray_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_altivec
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+/* YCC --> RGB & extended RGB Colorspace Conversion */
+EXTERN(void) jsimd_ycc_rgb_convert_mmx
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_mmx
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_mmx
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_mmx
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_mmx
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_mmx
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_mmx
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+
+extern const int jconst_ycc_rgb_convert_sse2[];
+EXTERN(void) jsimd_ycc_rgb_convert_sse2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_sse2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_sse2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_sse2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_sse2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_sse2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_sse2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+
+extern const int jconst_ycc_rgb_convert_avx2[];
+EXTERN(void) jsimd_ycc_rgb_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+
+EXTERN(void) jsimd_ycc_rgb_convert_neon
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_neon
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_neon
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_neon
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_neon
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_neon
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_neon
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_rgb565_convert_neon
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+
+#ifndef NEON_INTRINSICS
+
+EXTERN(void) jsimd_ycc_extrgb_convert_neon_slowst3
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_neon_slowst3
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+
+#endif
+
+EXTERN(void) jsimd_ycc_rgb_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+
+EXTERN(void) jsimd_ycc_rgb_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+
+EXTERN(void) jsimd_ycc_rgb_convert_altivec
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_altivec
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_altivec
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_altivec
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_altivec
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_altivec
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_altivec
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+
+/* NULL Colorspace Conversion */
+EXTERN(void) jsimd_c_null_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows, int num_components);
+
+/* h2v1 Downsampling */
+EXTERN(void) jsimd_h2v1_downsample_mmx
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample_sse2
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample_avx2
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample_neon
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample_dspr2
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample_altivec
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+/* h2v2 Downsampling */
+EXTERN(void) jsimd_h2v2_downsample_mmx
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_sse2
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_avx2
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_neon
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_dspr2
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_mmi
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_altivec
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+/* h2v2 Smooth Downsampling */
+EXTERN(void) jsimd_h2v2_smooth_downsample_dspr2
+  (JSAMPARRAY input_data, JSAMPARRAY output_data, JDIMENSION v_samp_factor,
+   int max_v_samp_factor, int smoothing_factor, JDIMENSION width_in_blocks,
+   JDIMENSION image_width);
+
+
+/* Upsampling */
+EXTERN(void) jsimd_h2v1_upsample_mmx
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_mmx
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_upsample_sse2
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_sse2
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_upsample_avx2
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_avx2
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_upsample_neon
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_neon
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_upsample_dspr2
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_dspr2
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_int_upsample_dspr2
+  (UINT8 h_expand, UINT8 v_expand, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr, JDIMENSION output_width,
+   int max_v_samp_factor);
+
+EXTERN(void) jsimd_h2v1_upsample_altivec
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_altivec
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+/* Fancy Upsampling */
+EXTERN(void) jsimd_h2v1_fancy_upsample_mmx
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_mmx
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+extern const int jconst_fancy_upsample_sse2[];
+EXTERN(void) jsimd_h2v1_fancy_upsample_sse2
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_sse2
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+extern const int jconst_fancy_upsample_avx2[];
+EXTERN(void) jsimd_h2v1_fancy_upsample_avx2
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_avx2
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_fancy_upsample_neon
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_neon
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h1v2_fancy_upsample_neon
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_fancy_upsample_dspr2
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_dspr2
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_fancy_upsample_mmi
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_mmi
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_fancy_upsample_altivec
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_altivec
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+/* Merged Upsampling */
+EXTERN(void) jsimd_h2v1_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmx
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+extern const int jconst_merged_upsample_sse2[];
+EXTERN(void) jsimd_h2v1_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_sse2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+extern const int jconst_merged_upsample_avx2[];
+EXTERN(void) jsimd_h2v1_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v1_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v1_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+
+EXTERN(void) jsimd_h2v1_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v1_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_altivec
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+/* Sample Conversion */
+EXTERN(void) jsimd_convsamp_mmx
+  (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+EXTERN(void) jsimd_convsamp_sse2
+  (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+EXTERN(void) jsimd_convsamp_avx2
+  (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+EXTERN(void) jsimd_convsamp_neon
+  (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+EXTERN(void) jsimd_convsamp_dspr2
+  (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+EXTERN(void) jsimd_convsamp_altivec
+  (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+/* Floating Point Sample Conversion */
+EXTERN(void) jsimd_convsamp_float_3dnow
+  (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+
+EXTERN(void) jsimd_convsamp_float_sse
+  (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+
+EXTERN(void) jsimd_convsamp_float_sse2
+  (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+
+EXTERN(void) jsimd_convsamp_float_dspr2
+  (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+
+/* Accurate Integer Forward DCT */
+EXTERN(void) jsimd_fdct_islow_mmx(DCTELEM *data);
+
+extern const int jconst_fdct_islow_sse2[];
+EXTERN(void) jsimd_fdct_islow_sse2(DCTELEM *data);
+
+extern const int jconst_fdct_islow_avx2[];
+EXTERN(void) jsimd_fdct_islow_avx2(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_islow_neon(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_islow_dspr2(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_islow_mmi(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_islow_altivec(DCTELEM *data);
+
+/* Fast Integer Forward DCT */
+EXTERN(void) jsimd_fdct_ifast_mmx(DCTELEM *data);
+
+extern const int jconst_fdct_ifast_sse2[];
+EXTERN(void) jsimd_fdct_ifast_sse2(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_ifast_neon(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_ifast_dspr2(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_ifast_mmi(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_ifast_altivec(DCTELEM *data);
+
+/* Floating Point Forward DCT */
+EXTERN(void) jsimd_fdct_float_3dnow(FAST_FLOAT *data);
+
+extern const int jconst_fdct_float_sse[];
+EXTERN(void) jsimd_fdct_float_sse(FAST_FLOAT *data);
+
+/* Quantization */
+EXTERN(void) jsimd_quantize_mmx
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_sse2
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_avx2
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_neon
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_dspr2
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_mmi
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_altivec
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+/* Floating Point Quantization */
+EXTERN(void) jsimd_quantize_float_3dnow
+  (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+
+EXTERN(void) jsimd_quantize_float_sse
+  (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+
+EXTERN(void) jsimd_quantize_float_sse2
+  (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+
+EXTERN(void) jsimd_quantize_float_dspr2
+  (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+
+/* Scaled Inverse DCT */
+EXTERN(void) jsimd_idct_2x2_mmx
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4_mmx
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+extern const int jconst_idct_red_sse2[];
+EXTERN(void) jsimd_idct_2x2_sse2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4_sse2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_2x2_neon
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4_neon
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_2x2_dspr2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4_dspr2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col, int *workspace);
+EXTERN(void) jsimd_idct_6x6_dspr2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+EXTERN(void) jsimd_idct_12x12_pass1_dspr2
+  (JCOEFPTR coef_block, void *dct_table, int *workspace);
+EXTERN(void) jsimd_idct_12x12_pass2_dspr2
+  (int *workspace, int *output);
+
+/* Accurate Integer Inverse DCT */
+EXTERN(void) jsimd_idct_islow_mmx
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+extern const int jconst_idct_islow_sse2[];
+EXTERN(void) jsimd_idct_islow_sse2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+extern const int jconst_idct_islow_avx2[];
+EXTERN(void) jsimd_idct_islow_avx2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_islow_neon
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_islow_dspr2
+  (void *dct_table, JCOEFPTR coef_block, int *output_buf, JSAMPLE *output_col);
+
+EXTERN(void) jsimd_idct_islow_mmi
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_islow_altivec
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+/* Fast Integer Inverse DCT */
+EXTERN(void) jsimd_idct_ifast_mmx
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+extern const int jconst_idct_ifast_sse2[];
+EXTERN(void) jsimd_idct_ifast_sse2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_ifast_neon
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_ifast_cols_dspr2
+  (JCOEF *inptr, IFAST_MULT_TYPE *quantptr, DCTELEM *wsptr,
+   const int *idct_coefs);
+EXTERN(void) jsimd_idct_ifast_rows_dspr2
+  (DCTELEM *wsptr, JSAMPARRAY output_buf, JDIMENSION output_col,
+   const int *idct_coefs);
+
+EXTERN(void) jsimd_idct_ifast_mmi
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_ifast_altivec
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+/* Floating Point Inverse DCT */
+EXTERN(void) jsimd_idct_float_3dnow
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+extern const int jconst_idct_float_sse[];
+EXTERN(void) jsimd_idct_float_sse
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+extern const int jconst_idct_float_sse2[];
+EXTERN(void) jsimd_idct_float_sse2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+/* Huffman coding */
+extern const int jconst_huff_encode_one_block[];
+EXTERN(JOCTET *) jsimd_huff_encode_one_block_sse2
+  (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+   c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon
+  (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+   c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+#ifndef NEON_INTRINSICS
+
+EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon_slowtbl
+  (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+   c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+#endif
+
+/* Progressive Huffman encoding */
+EXTERN(void) jsimd_encode_mcu_AC_first_prepare_sse2
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *values, size_t *zerobits);
+
+EXTERN(void) jsimd_encode_mcu_AC_first_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *values, size_t *zerobits);
+
+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *absvalues, size_t *bits);
+
+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *absvalues, size_t *bits);
diff --git a/3rdparty/libjpeg-turbo/src/simd/mips/jsimd.c b/3rdparty/libjpeg-turbo/src/simd/mips/jsimd.c
new file mode 100644
index 0000000000..d2546eed32
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/mips/jsimd.c
@@ -0,0 +1,1147 @@
+/*
+ * jsimd_mips.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, 2020, D. R. Commander.
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * MIPS architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+static unsigned int simd_support = ~0;
+
+#if !(defined(__mips_dsp) && (__mips_dsp_rev >= 2)) && defined(__linux__)
+
+LOCAL(void)
+parse_proc_cpuinfo(const char *search_string)
+{
+  const char *file_name = "/proc/cpuinfo";
+  char cpuinfo_line[256];
+  FILE *f = NULL;
+
+  simd_support = 0;
+
+  if ((f = fopen(file_name, "r")) != NULL) {
+    while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) {
+      if (strstr(cpuinfo_line, search_string) != NULL) {
+        fclose(f);
+        simd_support |= JSIMD_DSPR2;
+        return;
+      }
+    }
+    fclose(f);
+  }
+  /* Did not find string in the proc file, or not Linux ELF. */
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char *env = NULL;
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+  simd_support |= JSIMD_DSPR2;
+#elif defined(__linux__)
+  /* We still have a chance to use MIPS DSPR2 regardless of globally used
+   * -mdspr2 options passed to gcc by performing runtime detection via
+   * /proc/cpuinfo parsing on linux */
+  parse_proc_cpuinfo("MIPS 74K");
+#endif
+
+#ifndef NO_GETENV
+  /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCEDSPR2");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = JSIMD_DSPR2;
+  env = getenv("JSIMD_FORCENONE");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = 0;
+#endif
+}
+
+static const int mips_idct_ifast_coefs[4] = {
+  0x45404540,           /* FIX( 1.082392200 / 2) =  17734 = 0x4546 */
+  0x5A805A80,           /* FIX( 1.414213562 / 2) =  23170 = 0x5A82 */
+  0x76407640,           /* FIX( 1.847759065 / 2) =  30274 = 0x7642 */
+  0xAC60AC60            /* FIX(-2.613125930 / 4) = -21407 = 0xAC61 */
+};
+
+/* The following struct is borrowed from jdsample.c */
+typedef void (*upsample1_ptr) (j_decompress_ptr cinfo,
+                               jpeg_component_info *compptr,
+                               JSAMPARRAY input_data,
+                               JSAMPARRAY *output_data_ptr);
+typedef struct {
+  struct jpeg_upsampler pub;
+  JSAMPARRAY color_buf[MAX_COMPONENTS];
+  upsample1_ptr methods[MAX_COMPONENTS];
+  int next_row_out;
+  JDIMENSION rows_to_go;
+  int rowgroup_height[MAX_COMPONENTS];
+  UINT8 h_expand[MAX_COMPONENTS];
+  UINT8 v_expand[MAX_COMPONENTS];
+} my_upsampler;
+
+typedef my_upsampler *my_upsample_ptr;
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_c_can_null_convert(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*dspr2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    dspr2fct = jsimd_extrgb_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    dspr2fct = jsimd_extrgbx_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    dspr2fct = jsimd_extbgr_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    dspr2fct = jsimd_extbgrx_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    dspr2fct = jsimd_extxbgr_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    dspr2fct = jsimd_extxrgb_ycc_convert_dspr2;
+    break;
+  default:
+    dspr2fct = jsimd_extrgb_ycc_convert_dspr2;
+    break;
+  }
+
+  dspr2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*dspr2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    dspr2fct = jsimd_extrgb_gray_convert_dspr2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    dspr2fct = jsimd_extrgbx_gray_convert_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    dspr2fct = jsimd_extbgr_gray_convert_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    dspr2fct = jsimd_extbgrx_gray_convert_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    dspr2fct = jsimd_extxbgr_gray_convert_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    dspr2fct = jsimd_extxrgb_gray_convert_dspr2;
+    break;
+  default:
+    dspr2fct = jsimd_extrgb_gray_convert_dspr2;
+    break;
+  }
+
+  dspr2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    dspr2fct = jsimd_ycc_extrgb_convert_dspr2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    dspr2fct = jsimd_ycc_extrgbx_convert_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    dspr2fct = jsimd_ycc_extbgr_convert_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    dspr2fct = jsimd_ycc_extbgrx_convert_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    dspr2fct = jsimd_ycc_extxbgr_convert_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    dspr2fct = jsimd_ycc_extxrgb_convert_dspr2;
+    break;
+  default:
+    dspr2fct = jsimd_ycc_extrgb_convert_dspr2;
+    break;
+  }
+
+  dspr2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+}
+
+GLOBAL(void)
+jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                     JSAMPIMAGE output_buf, JDIMENSION output_row,
+                     int num_rows)
+{
+  jsimd_c_null_convert_dspr2(cinfo->image_width, input_buf, output_buf,
+                             output_row, num_rows, cinfo->num_components);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* FIXME: jsimd_h2v2_downsample_dspr2() fails some of the TJBench tiling
+   * regression tests, probably because the DSPr2 SIMD implementation predates
+   * those tests. */
+#if 0
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_smooth_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* FIXME: jsimd_h2v1_downsample_dspr2() fails some of the TJBench tiling
+   * regression tests, probably because the DSPr2 SIMD implementation predates
+   * those tests. */
+#if 0
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_dspr2(cinfo->image_width, cinfo->max_v_samp_factor,
+                              compptr->v_samp_factor, compptr->width_in_blocks,
+                              input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
+                             jpeg_component_info *compptr,
+                             JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_smooth_downsample_dspr2(input_data, output_data,
+                                     compptr->v_samp_factor,
+                                     cinfo->max_v_samp_factor,
+                                     cinfo->smoothing_factor,
+                                     compptr->width_in_blocks,
+                                     cinfo->image_width);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v1_downsample_dspr2(cinfo->image_width, cinfo->max_v_samp_factor,
+                              compptr->v_samp_factor, compptr->width_in_blocks,
+                              input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+#if defined(__MIPSEL__)
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_int_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_upsample_dspr2(cinfo->max_v_samp_factor, cinfo->output_width,
+                            input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_upsample_dspr2(cinfo->max_v_samp_factor, cinfo->output_width,
+                            input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                   JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
+
+  jsimd_int_upsample_dspr2(upsample->h_expand[compptr->component_index],
+                           upsample->v_expand[compptr->component_index],
+                           input_data, output_data_ptr, cinfo->output_width,
+                           cinfo->max_v_samp_factor);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+#if defined(__MIPSEL__)
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+#if defined(__MIPSEL__)
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_fancy_upsample_dspr2(cinfo->max_v_samp_factor,
+                                  compptr->downsampled_width, input_data,
+                                  output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_dspr2(cinfo->max_v_samp_factor,
+                                  compptr->downsampled_width, input_data,
+                                  output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, JSAMPLE *);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    dspr2fct = jsimd_h2v2_extrgb_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    dspr2fct = jsimd_h2v2_extrgbx_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    dspr2fct = jsimd_h2v2_extbgr_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    dspr2fct = jsimd_h2v2_extbgrx_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    dspr2fct = jsimd_h2v2_extxbgr_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    dspr2fct = jsimd_h2v2_extxrgb_merged_upsample_dspr2;
+    break;
+  default:
+    dspr2fct = jsimd_h2v2_extrgb_merged_upsample_dspr2;
+    break;
+  }
+
+  dspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
+           cinfo->sample_range_limit);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, JSAMPLE *);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    dspr2fct = jsimd_h2v1_extrgb_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    dspr2fct = jsimd_h2v1_extrgbx_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    dspr2fct = jsimd_h2v1_extbgr_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    dspr2fct = jsimd_h2v1_extbgrx_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    dspr2fct = jsimd_h2v1_extxbgr_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    dspr2fct = jsimd_h2v1_extxrgb_merged_upsample_dspr2;
+    break;
+  default:
+    dspr2fct = jsimd_h2v1_extrgb_merged_upsample_dspr2;
+    break;
+  }
+
+  dspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
+           cinfo->sample_range_limit);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+#if defined(__MIPSEL__)
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+#ifndef __mips_soft_float
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  jsimd_convsamp_dspr2(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+#ifndef __mips_soft_float
+  jsimd_convsamp_float_dspr2(sample_data, start_col, workspace);
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+#if defined(__MIPSEL__)
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+#if defined(__MIPSEL__)
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  jsimd_fdct_islow_dspr2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_dspr2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+#ifndef __mips_soft_float
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  jsimd_quantize_dspr2(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+#ifndef __mips_soft_float
+  jsimd_quantize_float_dspr2(coef_block, divisors, workspace);
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+#if defined(__MIPSEL__)
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_6x6(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_12x12(void)
+{
+  init_simd();
+
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_2x2_dspr2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  int workspace[DCTSIZE * 4]; /* buffers data between passes */
+
+  jsimd_idct_4x4_dspr2(compptr->dct_table, coef_block, output_buf, output_col,
+                       workspace);
+}
+
+GLOBAL(void)
+jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_6x6_dspr2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  int workspace[96];
+  int output[12] = {
+    (int)(output_buf[0] + output_col),
+    (int)(output_buf[1] + output_col),
+    (int)(output_buf[2] + output_col),
+    (int)(output_buf[3] + output_col),
+    (int)(output_buf[4] + output_col),
+    (int)(output_buf[5] + output_col),
+    (int)(output_buf[6] + output_col),
+    (int)(output_buf[7] + output_col),
+    (int)(output_buf[8] + output_col),
+    (int)(output_buf[9] + output_col),
+    (int)(output_buf[10] + output_col),
+    (int)(output_buf[11] + output_col)
+  };
+
+  jsimd_idct_12x12_pass1_dspr2(coef_block, compptr->dct_table, workspace);
+  jsimd_idct_12x12_pass2_dspr2(workspace, output);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+#if defined(__MIPSEL__)
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  int output[8] = {
+    (int)(output_buf[0] + output_col),
+    (int)(output_buf[1] + output_col),
+    (int)(output_buf[2] + output_col),
+    (int)(output_buf[3] + output_col),
+    (int)(output_buf[4] + output_col),
+    (int)(output_buf[5] + output_col),
+    (int)(output_buf[6] + output_col),
+    (int)(output_buf[7] + output_col)
+  };
+
+  jsimd_idct_islow_dspr2(coef_block, compptr->dct_table, output,
+                         IDCT_range_limit(cinfo));
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  JCOEFPTR inptr;
+  IFAST_MULT_TYPE *quantptr;
+  DCTELEM workspace[DCTSIZE2];  /* buffers data between passes */
+
+  /* Pass 1: process columns from input, store into work array. */
+
+  inptr = coef_block;
+  quantptr = (IFAST_MULT_TYPE *)compptr->dct_table;
+
+  jsimd_idct_ifast_cols_dspr2(inptr, quantptr, workspace,
+                              mips_idct_ifast_coefs);
+
+  /* Pass 2: process rows from work array, store into output array. */
+  /* Note that we must descale the results by a factor of 8 == 2**3, */
+  /* and also undo the PASS1_BITS scaling. */
+
+  jsimd_idct_ifast_rows_dspr2(workspace, output_buf, output_col,
+                              mips_idct_ifast_coefs);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return NULL;
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, JCOEF *values, size_t *zerobits)
+{
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, JCOEF *absvalues, size_t *bits)
+{
+  return 0;
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/mips/jsimd_dspr2.S b/3rdparty/libjpeg-turbo/src/simd/mips/jsimd_dspr2.S
new file mode 100644
index 0000000000..c99288a8d1
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/mips/jsimd_dspr2.S
@@ -0,0 +1,4543 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ *                          All Rights Reserved.
+ * Authors:  Teodora Novkovic <teodora.novkovic@imgtec.com>
+ *           Darko Laus       <darko.laus@imgtec.com>
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#include "jsimd_dspr2_asm.h"
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_c_null_convert_dspr2)
+/*
+ * a0     = cinfo->image_width
+ * a1     = input_buf
+ * a2     = output_buf
+ * a3     = output_row
+ * 16(sp) = num_rows
+ * 20(sp) = cinfo->num_components
+ *
+ * Null conversion for compression
+ */
+    SAVE_REGS_ON_STACK 8, s0, s1
+
+    lw          t9, 24(sp)      /* t9 = num_rows */
+    lw          s0, 28(sp)      /* s0 = cinfo->num_components */
+    andi        t0, a0, 3       /* t0 = cinfo->image_width & 3 */
+    beqz        t0, 4f          /* no residual */
+     nop
+0:
+    addiu       t9, t9, -1
+    bltz        t9, 7f
+     li         t1, 0
+1:
+    sll         t3, t1, 2
+    lwx         t5, t3(a2)      /* t5 = outptr = output_buf[ci] */
+    lw          t2, 0(a1)       /* t2 = inptr = *input_buf */
+    sll         t4, a3, 2
+    lwx         t5, t4(t5)      /* t5 = outptr = output_buf[ci][output_row] */
+    addu        t2, t2, t1
+    addu        s1, t5, a0
+    addu        t6, t5, t0
+2:
+    lbu         t3, 0(t2)
+    addiu       t5, t5, 1
+    sb          t3, -1(t5)
+    bne         t6, t5, 2b
+     addu       t2, t2, s0
+3:
+    lbu         t3, 0(t2)
+    addu        t4, t2, s0
+    addu        t7, t4, s0
+    addu        t8, t7, s0
+    addu        t2, t8, s0
+    lbu         t4, 0(t4)
+    lbu         t7, 0(t7)
+    lbu         t8, 0(t8)
+    addiu       t5, t5, 4
+    sb          t3, -4(t5)
+    sb          t4, -3(t5)
+    sb          t7, -2(t5)
+    bne         s1, t5, 3b
+     sb         t8, -1(t5)
+    addiu       t1, t1, 1
+    bne         t1, s0, 1b
+     nop
+    addiu       a1, a1, 4
+    bgez        t9, 0b
+     addiu      a3, a3, 1
+    b           7f
+     nop
+4:
+    addiu       t9, t9, -1
+    bltz        t9, 7f
+     li         t1, 0
+5:
+    sll         t3, t1, 2
+    lwx         t5, t3(a2)      /* t5 = outptr = output_buf[ci] */
+    lw          t2, 0(a1)       /* t2 = inptr = *input_buf */
+    sll         t4, a3, 2
+    lwx         t5, t4(t5)      /* t5 = outptr = output_buf[ci][output_row] */
+    addu        t2, t2, t1
+    addu        s1, t5, a0
+    addu        t6, t5, t0
+6:
+    lbu         t3, 0(t2)
+    addu        t4, t2, s0
+    addu        t7, t4, s0
+    addu        t8, t7, s0
+    addu        t2, t8, s0
+    lbu         t4, 0(t4)
+    lbu         t7, 0(t7)
+    lbu         t8, 0(t8)
+    addiu       t5, t5, 4
+    sb          t3, -4(t5)
+    sb          t4, -3(t5)
+    sb          t7, -2(t5)
+    bne         s1, t5, 6b
+     sb         t8, -1(t5)
+    addiu       t1, t1, 1
+    bne         t1, s0, 5b
+     nop
+    addiu       a1, a1, 4
+    bgez        t9, 4b
+     addiu      a3, a3, 1
+7:
+    RESTORE_REGS_FROM_STACK 8, s0, s1
+
+    j           ra
+     nop
+
+END(jsimd_c_null_convert_dspr2)
+
+
+/*****************************************************************************/
+/*
+ * jsimd_extrgb_ycc_convert_dspr2
+ * jsimd_extbgr_ycc_convert_dspr2
+ * jsimd_extrgbx_ycc_convert_dspr2
+ * jsimd_extbgrx_ycc_convert_dspr2
+ * jsimd_extxbgr_ycc_convert_dspr2
+ * jsimd_extxrgb_ycc_convert_dspr2
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+.macro GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2  colorid, pixel_size, \
+                                             r_offs, g_offs, b_offs
+
+.macro DO_RGB_TO_YCC  r, g, b, inptr
+    lbu         \r, \r_offs(\inptr)
+    lbu         \g, \g_offs(\inptr)
+    lbu         \b, \b_offs(\inptr)
+    addiu       \inptr, \pixel_size
+.endm
+
+LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2)
+/*
+ * a0     = cinfo->image_width
+ * a1     = input_buf
+ * a2     = output_buf
+ * a3     = output_row
+ * 16(sp) = num_rows
+ */
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw          t7, 48(sp)      /* t7 = num_rows */
+    li          s0, 0x4c8b      /* FIX(0.29900) */
+    li          s1, 0x9646      /* FIX(0.58700) */
+    li          s2, 0x1d2f      /* FIX(0.11400) */
+    li          s3, 0xffffd4cd  /* -FIX(0.16874) */
+    li          s4, 0xffffab33  /* -FIX(0.33126) */
+    li          s5, 0x8000      /* FIX(0.50000) */
+    li          s6, 0xffff94d1  /* -FIX(0.41869) */
+    li          s7, 0xffffeb2f  /* -FIX(0.08131) */
+    li          t8, 0x807fff    /* CBCR_OFFSET + ONE_HALF-1 */
+
+0:
+    addiu       t7, -1          /* --num_rows */
+    lw          t6, 0(a1)       /* t6 = input_buf[0] */
+    lw          t0, 0(a2)
+    lw          t1, 4(a2)
+    lw          t2, 8(a2)
+    sll         t3, a3, 2
+    lwx         t0, t3(t0)      /* t0 = output_buf[0][output_row] */
+    lwx         t1, t3(t1)      /* t1 = output_buf[1][output_row] */
+    lwx         t2, t3(t2)      /* t2 = output_buf[2][output_row] */
+
+    addu        t9, t2, a0      /* t9 = end address */
+    addiu       a3, 1
+
+1:
+    DO_RGB_TO_YCC t3, t4, t5, t6
+
+    mtlo        s5, $ac0
+    mtlo        t8, $ac1
+    mtlo        t8, $ac2
+    maddu       $ac0, s2, t5
+    maddu       $ac1, s5, t5
+    maddu       $ac2, s5, t3
+    maddu       $ac0, s0, t3
+    maddu       $ac1, s3, t3
+    maddu       $ac2, s6, t4
+    maddu       $ac0, s1, t4
+    maddu       $ac1, s4, t4
+    maddu       $ac2, s7, t5
+    extr.w      t3, $ac0, 16
+    extr.w      t4, $ac1, 16
+    extr.w      t5, $ac2, 16
+    sb          t3, 0(t0)
+    sb          t4, 0(t1)
+    sb          t5, 0(t2)
+    addiu       t0, 1
+    addiu       t2, 1
+    bne         t2, t9, 1b
+     addiu      t1, 1
+    bgtz        t7, 0b
+     addiu      a1, 4
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+END(jsimd_\colorid\()_ycc_convert_dspr2)
+
+.purgem DO_RGB_TO_YCC
+
+.endm
+
+/*-------------------------------------id -- pix R  G  B */
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgb,  3, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgr,  3, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
+
+
+/*****************************************************************************/
+/*
+ * jsimd_ycc_extrgb_convert_dspr2
+ * jsimd_ycc_extbgr_convert_dspr2
+ * jsimd_ycc_extrgbx_convert_dspr2
+ * jsimd_ycc_extbgrx_convert_dspr2
+ * jsimd_ycc_extxbgr_convert_dspr2
+ * jsimd_ycc_extxrgb_convert_dspr2
+ *
+ * Colorspace conversion YCbCr -> RGB
+ */
+
+.macro GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2  colorid, pixel_size, \
+                                             r_offs, g_offs, b_offs, a_offs
+
+.macro STORE_YCC_TO_RGB  scratch0 scratch1 scratch2 outptr
+    sb          \scratch0, \r_offs(\outptr)
+    sb          \scratch1, \g_offs(\outptr)
+    sb          \scratch2, \b_offs(\outptr)
+.if (\pixel_size == 4)
+    li          t0, 0xFF
+    sb          t0, \a_offs(\outptr)
+.endif
+    addiu       \outptr, \pixel_size
+.endm
+
+LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2)
+/*
+ * a0     = cinfo->image_width
+ * a1     = input_buf
+ * a2     = input_row
+ * a3     = output_buf
+ * 16(sp) = num_rows
+ */
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw          s1, 48(sp)
+    li          t3, 0x8000
+    li          t4, 0x166e9     /* FIX(1.40200) */
+    li          t5, 0x1c5a2     /* FIX(1.77200) */
+    li          t6, 0xffff492e  /* -FIX(0.71414) */
+    li          t7, 0xffffa7e6  /* -FIX(0.34414) */
+    repl.ph     t8, 128
+
+0:
+    lw          s0, 0(a3)
+    lw          t0, 0(a1)
+    lw          t1, 4(a1)
+    lw          t2, 8(a1)
+    sll         s5, a2, 2
+    addiu       s1, -1
+    lwx         s2, s5(t0)
+    lwx         s3, s5(t1)
+    lwx         s4, s5(t2)
+    addu        t9, s2, a0
+    addiu       a2, 1
+
+1:
+    lbu         s7, 0(s4)       /* cr */
+    lbu         s6, 0(s3)       /* cb */
+    lbu         s5, 0(s2)       /* y */
+    addiu       s2, 1
+    addiu       s4, 1
+    addiu       s7, -128
+    addiu       s6, -128
+    mul         t2, t7, s6
+    mul         t0, t6, s7      /* Crgtab[cr] */
+    sll         s7, 15
+    mulq_rs.w   t1, t4, s7      /* Crrtab[cr] */
+    sll         s6, 15
+    addu        t2, t3          /* Cbgtab[cb] */
+    addu        t2, t0
+
+    mulq_rs.w   t0, t5, s6      /* Cbbtab[cb] */
+    sra         t2, 16
+    addu        t1, s5
+    addu        t2, s5          /* add y */
+    ins         t2, t1, 16, 16
+    subu.ph     t2, t2, t8
+    addu        t0, s5
+    shll_s.ph   t2, t2, 8
+    subu        t0, 128
+    shra.ph     t2, t2, 8
+    shll_s.w    t0, t0, 24
+    addu.ph     t2, t2, t8      /* clip & store */
+    sra         t0, t0, 24
+    sra         t1, t2, 16
+    addiu       t0, 128
+
+    STORE_YCC_TO_RGB t1, t2, t0, s0
+
+    bne         s2, t9, 1b
+     addiu      s3, 1
+    bgtz        s1, 0b
+     addiu      a3, 4
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+END(jsimd_ycc_\colorid\()_convert_dspr2)
+
+.purgem STORE_YCC_TO_RGB
+
+.endm
+
+/*-------------------------------------id -- pix R  G  B  A */
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgb,  3, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgr,  3, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1, 0
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3, 0
+
+
+/*****************************************************************************/
+/*
+ * jsimd_extrgb_gray_convert_dspr2
+ * jsimd_extbgr_gray_convert_dspr2
+ * jsimd_extrgbx_gray_convert_dspr2
+ * jsimd_extbgrx_gray_convert_dspr2
+ * jsimd_extxbgr_gray_convert_dspr2
+ * jsimd_extxrgb_gray_convert_dspr2
+ *
+ * Colorspace conversion RGB -> GRAY
+ */
+
+.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2  colorid, pixel_size, \
+                                              r_offs, g_offs, b_offs
+
+.macro DO_RGB_TO_GRAY  r, g, b, inptr
+    lbu         \r, \r_offs(\inptr)
+    lbu         \g, \g_offs(\inptr)
+    lbu         \b, \b_offs(\inptr)
+    addiu       \inptr, \pixel_size
+.endm
+
+LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2)
+/*
+ * a0     = cinfo->image_width
+ * a1     = input_buf
+ * a2     = output_buf
+ * a3     = output_row
+ * 16(sp) = num_rows
+ */
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    li          s0, 0x4c8b      /* s0 = FIX(0.29900) */
+    li          s1, 0x9646      /* s1 = FIX(0.58700) */
+    li          s2, 0x1d2f      /* s2 = FIX(0.11400) */
+    li          s7, 0x8000      /* s7 = FIX(0.50000) */
+    lw          s6, 48(sp)
+    andi        t7, a0, 3
+
+0:
+    addiu       s6, -1          /* s6 = num_rows */
+    lw          t0, 0(a1)
+    lw          t1, 0(a2)
+    sll         t3, a3, 2
+    lwx         t1, t3(t1)
+    addiu       a3, 1
+    addu        t9, t1, a0
+    subu        t8, t9, t7
+    beq         t1, t8, 2f
+     nop
+
+1:
+    DO_RGB_TO_GRAY t3, t4, t5, t0
+    DO_RGB_TO_GRAY s3, s4, s5, t0
+
+    mtlo        s7, $ac0
+    maddu       $ac0, s2, t5
+    maddu       $ac0, s1, t4
+    maddu       $ac0, s0, t3
+    mtlo        s7, $ac1
+    maddu       $ac1, s2, s5
+    maddu       $ac1, s1, s4
+    maddu       $ac1, s0, s3
+    extr.w      t6, $ac0, 16
+
+    DO_RGB_TO_GRAY t3, t4, t5, t0
+    DO_RGB_TO_GRAY s3, s4, s5, t0
+
+    mtlo        s7, $ac0
+    maddu       $ac0, s2, t5
+    maddu       $ac0, s1, t4
+    extr.w      t2, $ac1, 16
+    maddu       $ac0, s0, t3
+    mtlo        s7, $ac1
+    maddu       $ac1, s2, s5
+    maddu       $ac1, s1, s4
+    maddu       $ac1, s0, s3
+    extr.w      t5, $ac0, 16
+    sb          t6, 0(t1)
+    sb          t2, 1(t1)
+    extr.w      t3, $ac1, 16
+    addiu       t1, 4
+    sb          t5, -2(t1)
+    sb          t3, -1(t1)
+    bne         t1, t8, 1b
+     nop
+
+2:
+    beqz        t7, 4f
+     nop
+
+3:
+    DO_RGB_TO_GRAY t3, t4, t5, t0
+
+    mtlo        s7, $ac0
+    maddu       $ac0, s2, t5
+    maddu       $ac0, s1, t4
+    maddu       $ac0, s0, t3
+    extr.w      t6, $ac0, 16
+    sb          t6, 0(t1)
+    addiu       t1, 1
+    bne         t1, t9, 3b
+     nop
+
+4:
+    bgtz        s6, 0b
+     addiu      a1, 4
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+END(jsimd_\colorid\()_gray_convert_dspr2)
+
+.purgem DO_RGB_TO_GRAY
+
+.endm
+
+/*-------------------------------------id --  pix R  G  B */
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgb,  3, 0, 1, 2
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgr,  3, 2, 1, 0
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
+
+
+/*****************************************************************************/
+/*
+ * jsimd_h2v2_merged_upsample_dspr2
+ * jsimd_h2v2_extrgb_merged_upsample_dspr2
+ * jsimd_h2v2_extrgbx_merged_upsample_dspr2
+ * jsimd_h2v2_extbgr_merged_upsample_dspr2
+ * jsimd_h2v2_extbgrx_merged_upsample_dspr2
+ * jsimd_h2v2_extxbgr_merged_upsample_dspr2
+ * jsimd_h2v2_extxrgb_merged_upsample_dspr2
+ *
+ * Merged h2v2 upsample routines
+ */
+.macro GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2  colorid, pixel_size, \
+                                            r1_offs, g1_offs, \
+                                            b1_offs, a1_offs, \
+                                            r2_offs, g2_offs, \
+                                            b2_offs, a2_offs
+
+.macro STORE_H2V2_2_PIXELS  scratch0 scratch1 scratch2 scratch3 scratch4 \
+                            scratch5 outptr
+    sb          \scratch0, \r1_offs(\outptr)
+    sb          \scratch1, \g1_offs(\outptr)
+    sb          \scratch2, \b1_offs(\outptr)
+    sb          \scratch3, \r2_offs(\outptr)
+    sb          \scratch4, \g2_offs(\outptr)
+    sb          \scratch5, \b2_offs(\outptr)
+.if (\pixel_size == 8)
+    li          \scratch0, 0xFF
+    sb          \scratch0, \a1_offs(\outptr)
+    sb          \scratch0, \a2_offs(\outptr)
+.endif
+    addiu       \outptr, \pixel_size
+.endm
+
+.macro STORE_H2V2_1_PIXEL  scratch0 scratch1 scratch2 outptr
+    sb          \scratch0, \r1_offs(\outptr)
+    sb          \scratch1, \g1_offs(\outptr)
+    sb          \scratch2, \b1_offs(\outptr)
+
+.if (\pixel_size == 8)
+    li          t0, 0xFF
+    sb          t0, \a1_offs(\outptr)
+.endif
+.endm
+
+LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
+/*
+ * a0     = cinfo->output_width
+ * a1     = input_buf
+ * a2     = in_row_group_ctr
+ * a3     = output_buf
+ * 16(sp) = cinfo->sample_range_limit
+ */
+    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+    lw          t9, 56(sp)      /* cinfo->sample_range_limit */
+    lw          v0, 0(a1)
+    lw          v1, 4(a1)
+    lw          t0, 8(a1)
+    sll         t1, a2, 3
+    addiu       t2, t1, 4
+    sll         t3, a2, 2
+    lw          t4, 0(a3)       /* t4 = output_buf[0] */
+    lwx         t1, t1(v0)      /* t1 = input_buf[0][in_row_group_ctr*2] */
+    lwx         t2, t2(v0)      /* t2 = input_buf[0][in_row_group_ctr*2 + 1] */
+    lwx         t5, t3(v1)      /* t5 = input_buf[1][in_row_group_ctr] */
+    lwx         t6, t3(t0)      /* t6 = input_buf[2][in_row_group_ctr] */
+    lw          t7, 4(a3)       /* t7 = output_buf[1] */
+    li          s1, 0xe6ea
+    addiu       t8, s1, 0x7fff    /* t8 = 0x166e9 [FIX(1.40200)] */
+    addiu       s0, t8, 0x5eb9    /* s0 = 0x1c5a2 [FIX(1.77200)] */
+    addiu       s1, zero, 0xa7e6  /* s4 = 0xffffa7e6 [-FIX(0.34414)] */
+    xori        s2, s1, 0xeec8    /* s3 = 0xffff492e [-FIX(0.71414)] */
+    srl         t3, a0, 1
+    blez        t3, 2f
+     addu       t0, t5, t3      /* t0 = end address */
+ 1:
+    lbu         t3, 0(t5)
+    lbu         s3, 0(t6)
+    addiu       t5, t5, 1
+    addiu       t3, t3, -128    /* (cb - 128) */
+    addiu       s3, s3, -128    /* (cr - 128) */
+    mult        $ac1, s1, t3
+    madd        $ac1, s2, s3
+    sll         s3, s3, 15
+    sll         t3, t3, 15
+    mulq_rs.w   s4, t8, s3      /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */
+    extr_r.w    s5, $ac1, 16
+    mulq_rs.w   s6, s0, t3      /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */
+    lbu         v0, 0(t1)
+    addiu       t6, t6, 1
+    addiu       t1, t1, 2
+    addu        t3, v0, s4      /* y+cred */
+    addu        s3, v0, s5      /* y+cgreen */
+    addu        v1, v0, s6      /* y+cblue */
+    addu        t3, t9, t3      /* y+cred */
+    addu        s3, t9, s3      /* y+cgreen */
+    addu        v1, t9, v1      /* y+cblue */
+    lbu         AT, 0(t3)
+    lbu         s7, 0(s3)
+    lbu         ra, 0(v1)
+    lbu         v0, -1(t1)
+    addu        t3, v0, s4      /* y+cred */
+    addu        s3, v0, s5      /* y+cgreen */
+    addu        v1, v0, s6      /* y+cblue */
+    addu        t3, t9, t3      /* y+cred */
+    addu        s3, t9, s3      /* y+cgreen */
+    addu        v1, t9, v1      /* y+cblue */
+    lbu         t3, 0(t3)
+    lbu         s3, 0(s3)
+    lbu         v1, 0(v1)
+    lbu         v0, 0(t2)
+
+    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4
+
+    addu        t3, v0, s4      /* y+cred */
+    addu        s3, v0, s5      /* y+cgreen */
+    addu        v1, v0, s6      /* y+cblue */
+    addu        t3, t9, t3      /* y+cred */
+    addu        s3, t9, s3      /* y+cgreen */
+    addu        v1, t9, v1      /* y+cblue */
+    lbu         AT, 0(t3)
+    lbu         s7, 0(s3)
+    lbu         ra, 0(v1)
+    lbu         v0, 1(t2)
+    addiu       t2, t2, 2
+    addu        t3, v0, s4      /* y+cred */
+    addu        s3, v0, s5      /* y+cgreen */
+    addu        v1, v0, s6      /* y+cblue */
+    addu        t3, t9, t3      /* y+cred */
+    addu        s3, t9, s3      /* y+cgreen */
+    addu        v1, t9, v1      /* y+cblue */
+    lbu         t3, 0(t3)
+    lbu         s3, 0(s3)
+    lbu         v1, 0(v1)
+
+    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7
+
+    bne         t0, t5, 1b
+     nop
+2:
+    andi        t0, a0, 1
+    beqz        t0, 4f
+     lbu        t3, 0(t5)
+    lbu         s3, 0(t6)
+    addiu       t3, t3, -128    /* (cb - 128) */
+    addiu       s3, s3, -128    /* (cr - 128) */
+    mult        $ac1, s1, t3
+    madd        $ac1, s2, s3
+    sll         s3, s3, 15
+    sll         t3, t3, 15
+    lbu         v0, 0(t1)
+    extr_r.w    s5, $ac1, 16
+    mulq_rs.w   s4, t8, s3      /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */
+    mulq_rs.w   s6, s0, t3      /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */
+    addu        t3, v0, s4      /* y+cred */
+    addu        s3, v0, s5      /* y+cgreen */
+    addu        v1, v0, s6      /* y+cblue */
+    addu        t3, t9, t3      /* y+cred */
+    addu        s3, t9, s3      /* y+cgreen */
+    addu        v1, t9, v1      /* y+cblue */
+    lbu         t3, 0(t3)
+    lbu         s3, 0(s3)
+    lbu         v1, 0(v1)
+    lbu         v0, 0(t2)
+
+    STORE_H2V2_1_PIXEL t3, s3, v1, t4
+
+    addu        t3, v0, s4      /* y+cred */
+    addu        s3, v0, s5      /* y+cgreen */
+    addu        v1, v0, s6      /* y+cblue */
+    addu        t3, t9, t3      /* y+cred */
+    addu        s3, t9, s3      /* y+cgreen */
+    addu        v1, t9, v1      /* y+cblue */
+    lbu         t3, 0(t3)
+    lbu         s3, 0(s3)
+    lbu         v1, 0(v1)
+
+    STORE_H2V2_1_PIXEL t3, s3, v1, t7
+4:
+    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+    j           ra
+     nop
+
+END(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
+
+.purgem STORE_H2V2_1_PIXEL
+.purgem STORE_H2V2_2_PIXELS
+.endm
+
+/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
+
+
+/*****************************************************************************/
+/*
+ * jsimd_h2v1_merged_upsample_dspr2
+ * jsimd_h2v1_extrgb_merged_upsample_dspr2
+ * jsimd_h2v1_extrgbx_merged_upsample_dspr2
+ * jsimd_h2v1_extbgr_merged_upsample_dspr2
+ * jsimd_h2v1_extbgrx_merged_upsample_dspr2
+ * jsimd_h2v1_extxbgr_merged_upsample_dspr2
+ * jsimd_h2v1_extxrgb_merged_upsample_dspr2
+ *
+ * Merged h2v1 upsample routines
+ */
+
+.macro GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2  colorid, pixel_size, \
+                                            r1_offs, g1_offs, \
+                                            b1_offs, a1_offs, \
+                                            r2_offs, g2_offs, \
+                                            b2_offs, a2_offs
+
+.macro STORE_H2V1_2_PIXELS  scratch0 scratch1 scratch2 scratch3 scratch4 \
+                            scratch5 outptr
+    sb          \scratch0, \r1_offs(\outptr)
+    sb          \scratch1, \g1_offs(\outptr)
+    sb          \scratch2, \b1_offs(\outptr)
+    sb          \scratch3, \r2_offs(\outptr)
+    sb          \scratch4, \g2_offs(\outptr)
+    sb          \scratch5, \b2_offs(\outptr)
+.if (\pixel_size == 8)
+    li          t0, 0xFF
+    sb          t0, \a1_offs(\outptr)
+    sb          t0, \a2_offs(\outptr)
+.endif
+    addiu       \outptr, \pixel_size
+.endm
+
+.macro STORE_H2V1_1_PIXEL  scratch0 scratch1 scratch2 outptr
+    sb          \scratch0, \r1_offs(\outptr)
+    sb          \scratch1, \g1_offs(\outptr)
+    sb          \scratch2, \b1_offs(\outptr)
+.if (\pixel_size == 8)
+    li          t0, 0xFF
+    sb          t0, \a1_offs(\outptr)
+.endif
+.endm
+
+LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
+/*
+ * a0     = cinfo->output_width
+ * a1     = input_buf
+ * a2     = in_row_group_ctr
+ * a3     = output_buf
+ * 16(sp) = range_limit
+ */
+    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+    li          t0, 0xe6ea
+    lw          t1, 0(a1)         /* t1 = input_buf[0] */
+    lw          t2, 4(a1)         /* t2 = input_buf[1] */
+    lw          t3, 8(a1)         /* t3 = input_buf[2] */
+    lw          t8, 56(sp)        /* t8 = range_limit */
+    addiu       s1, t0, 0x7fff    /* s1 = 0x166e9 [FIX(1.40200)] */
+    addiu       s2, s1, 0x5eb9    /* s2 = 0x1c5a2 [FIX(1.77200)] */
+    addiu       s0, t0, 0x9916    /* s0 = 0x8000 */
+    addiu       s4, zero, 0xa7e6  /* s4 = 0xffffa7e6 [-FIX(0.34414)] */
+    xori        s3, s4, 0xeec8    /* s3 = 0xffff492e [-FIX(0.71414)] */
+    srl         t0, a0, 1
+    sll         t4, a2, 2
+    lwx         s5, t4(t1)      /* s5 = inptr0 */
+    lwx         s6, t4(t2)      /* s6 = inptr1 */
+    lwx         s7, t4(t3)      /* s7 = inptr2 */
+    lw          t7, 0(a3)       /* t7 = outptr */
+    blez        t0, 2f
+     addu       t9, s6, t0      /* t9 = end address */
+1:
+    lbu         t2, 0(s6)       /* t2 = cb */
+    lbu         t0, 0(s7)       /* t0 = cr */
+    lbu         t1, 0(s5)       /* t1 = y */
+    addiu       t2, t2, -128    /* t2 = cb - 128 */
+    addiu       t0, t0, -128    /* t0 = cr - 128 */
+    mult        $ac1, s4, t2
+    madd        $ac1, s3, t0
+    sll         t0, t0, 15
+    sll         t2, t2, 15
+    mulq_rs.w   t0, s1, t0      /* t0 = (C1*cr + ONE_HALF)>> SCALEBITS */
+    extr_r.w    t5, $ac1, 16
+    mulq_rs.w   t6, s2, t2      /* t6 = (C2*cb + ONE_HALF)>> SCALEBITS */
+    addiu       s7, s7, 1
+    addiu       s6, s6, 1
+    addu        t2, t1, t0      /* t2 = y + cred */
+    addu        t3, t1, t5      /* t3 = y + cgreen */
+    addu        t4, t1, t6      /* t4 = y + cblue */
+    addu        t2, t8, t2
+    addu        t3, t8, t3
+    addu        t4, t8, t4
+    lbu         t1, 1(s5)
+    lbu         v0, 0(t2)
+    lbu         v1, 0(t3)
+    lbu         ra, 0(t4)
+    addu        t2, t1, t0
+    addu        t3, t1, t5
+    addu        t4, t1, t6
+    addu        t2, t8, t2
+    addu        t3, t8, t3
+    addu        t4, t8, t4
+    lbu         t2, 0(t2)
+    lbu         t3, 0(t3)
+    lbu         t4, 0(t4)
+
+    STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7
+
+    bne         t9, s6, 1b
+     addiu      s5, s5, 2
+2:
+    andi        t0, a0, 1
+    beqz        t0, 4f
+     nop
+3:
+    lbu         t2, 0(s6)
+    lbu         t0, 0(s7)
+    lbu         t1, 0(s5)
+    addiu       t2, t2, -128    /* (cb - 128) */
+    addiu       t0, t0, -128    /* (cr - 128) */
+    mul         t3, s4, t2
+    mul         t4, s3, t0
+    sll         t0, t0, 15
+    sll         t2, t2, 15
+    mulq_rs.w   t0, s1, t0      /* (C1*cr + ONE_HALF)>> SCALEBITS */
+    mulq_rs.w   t6, s2, t2      /* (C2*cb + ONE_HALF)>> SCALEBITS */
+    addu        t3, t3, s0
+    addu        t3, t4, t3
+    sra         t5, t3, 16      /* (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS */
+    addu        t2, t1, t0      /* y + cred */
+    addu        t3, t1, t5      /* y + cgreen */
+    addu        t4, t1, t6      /* y + cblue */
+    addu        t2, t8, t2
+    addu        t3, t8, t3
+    addu        t4, t8, t4
+    lbu         t2, 0(t2)
+    lbu         t3, 0(t3)
+    lbu         t4, 0(t4)
+
+    STORE_H2V1_1_PIXEL t2, t3, t4, t7
+4:
+    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+    j           ra
+     nop
+
+END(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
+
+.purgem STORE_H2V1_1_PIXEL
+.purgem STORE_H2V1_2_PIXELS
+.endm
+
+/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
+
+
+/*****************************************************************************/
+/*
+ * jsimd_h2v2_fancy_upsample_dspr2
+ *
+ * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+ */
+LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2)
+/*
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = downsampled_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
+    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
+
+    li            s4, 0
+    lw            s2, 0(a3)       /* s2 = *output_data_ptr */
+0:
+    li            t9, 2
+    lw            s1, -4(a2)      /* s1 = inptr1 */
+
+1:
+    lw            s0, 0(a2)       /* s0 = inptr0 */
+    lwx           s3, s4(s2)
+    addiu         s5, a1, -2      /* s5 = downsampled_width - 2 */
+    srl           t4, s5, 1
+    sll           t4, t4, 1
+    lbu           t0, 0(s0)
+    lbu           t1, 1(s0)
+    lbu           t2, 0(s1)
+    lbu           t3, 1(s1)
+    addiu         s0, 2
+    addiu         s1, 2
+    addu          t8, s0, t4      /* t8 = end address */
+    andi          s5, s5, 1       /* s5 = residual */
+    sll           t4, t0, 1
+    sll           t6, t1, 1
+    addu          t0, t0, t4      /* t0 = (*inptr0++) * 3 */
+    addu          t1, t1, t6      /* t1 = (*inptr0++) * 3 */
+    addu          t7, t0, t2      /* t7 = thiscolsum */
+    addu          t6, t1, t3      /* t5 = nextcolsum */
+    sll           t0, t7, 2       /* t0 = thiscolsum * 4 */
+    subu          t1, t0, t7      /* t1 = thiscolsum * 3 */
+    shra_r.w      t0, t0, 4
+    addiu         t1, 7
+    addu          t1, t1, t6
+    srl           t1, t1, 4
+    sb            t0, 0(s3)
+    sb            t1, 1(s3)
+    beq           t8, s0, 22f     /* skip to final iteration if width == 3 */
+     addiu        s3, 2
+2:
+    lh            t0, 0(s0)       /* t0 = A3|A2 */
+    lh            t2, 0(s1)       /* t2 = B3|B2 */
+    addiu         s0, 2
+    addiu         s1, 2
+    preceu.ph.qbr t0, t0          /* t0 = 0|A3|0|A2 */
+    preceu.ph.qbr t2, t2          /* t2 = 0|B3|0|B2 */
+    shll.ph       t1, t0, 1
+    sll           t3, t6, 1
+    addu.ph       t0, t1, t0      /* t0 = A3*3|A2*3 */
+    addu          t3, t3, t6      /* t3 = this * 3 */
+    addu.ph       t0, t0, t2      /* t0 = next2|next1 */
+    addu          t1, t3, t7
+    andi          t7, t0, 0xFFFF  /* t7 = next1 */
+    sll           t2, t7, 1
+    addu          t2, t7, t2      /* t2 = next1*3 */
+    addu          t4, t2, t6
+    srl           t6, t0, 16      /* t6 = next2 */
+    shra_r.w      t1, t1, 4       /* t1 = (this*3 + last + 8) >> 4 */
+    addu          t0, t3, t7
+    addiu         t0, 7
+    srl           t0, t0, 4       /* t0 = (this*3 + next1 + 7) >> 4 */
+    shra_r.w      t4, t4, 4       /* t3 = (next1*3 + this + 8) >> 4 */
+    addu          t2, t2, t6
+    addiu         t2, 7
+    srl           t2, t2, 4       /* t2 = (next1*3 + next2 + 7) >> 4 */
+    sb            t1, 0(s3)
+    sb            t0, 1(s3)
+    sb            t4, 2(s3)
+    sb            t2, 3(s3)
+    bne           t8, s0, 2b
+     addiu        s3, 4
+22:
+    beqz          s5, 4f
+     addu         t8, s0, s5
+3:
+    lbu           t0, 0(s0)
+    lbu           t2, 0(s1)
+    addiu         s0, 1
+    addiu         s1, 1
+    sll           t3, t6, 1
+    sll           t1, t0, 1
+    addu          t1, t0, t1      /* t1 = inptr0 * 3 */
+    addu          t3, t3, t6      /* t3 = thiscolsum * 3 */
+    addu          t5, t1, t2
+    addu          t1, t3, t7
+    shra_r.w      t1, t1, 4
+    addu          t0, t3, t5
+    addiu         t0, 7
+    srl           t0, t0, 4
+    sb            t1, 0(s3)
+    sb            t0, 1(s3)
+    addiu         s3, 2
+    move          t7, t6
+    bne           t8, s0, 3b
+     move         t6, t5
+4:
+    sll           t0, t6, 2       /* t0 = thiscolsum * 4 */
+    subu          t1, t0, t6      /* t1 = thiscolsum * 3 */
+    addu          t1, t1, t7
+    addiu         s4, 4
+    shra_r.w      t1, t1, 4
+    addiu         t0, 7
+    srl           t0, t0, 4
+    sb            t1, 0(s3)
+    sb            t0, 1(s3)
+    addiu         t9, -1
+    addiu         s3, 2
+    bnez          t9, 1b
+     lw           s1, 4(a2)
+    srl           t0, s4, 2
+    subu          t0, a0, t0
+    bgtz          t0, 0b
+     addiu        a2, 4
+
+    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
+
+    j             ra
+     nop
+END(jsimd_h2v2_fancy_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2)
+/*
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = downsampled_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
+    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+    .set at
+
+    beqz          a0, 3f
+     sll          t0, a0, 2
+    lw            s1, 0(a3)
+    li            s3, 0x10001
+    addu          s0, s1, t0
+0:
+    addiu         t8, a1, -2
+    srl           t9, t8, 2
+    lw            t7, 0(a2)
+    lw            s2, 0(s1)
+    lbu           t0, 0(t7)
+    lbu           t1, 1(t7)       /* t1 = inptr[1] */
+    sll           t2, t0, 1
+    addu          t2, t2, t0      /* t2 = invalue*3 */
+    addu          t2, t2, t1
+    shra_r.w      t2, t2, 2
+    sb            t0, 0(s2)
+    sb            t2, 1(s2)
+    beqz          t9, 11f
+     addiu        s2, 2
+1:
+    ulw           t0, 0(t7)       /* t0 = |P3|P2|P1|P0| */
+    ulw           t1, 1(t7)
+    ulh           t2, 4(t7)       /* t2 = |0|0|P5|P4| */
+    preceu.ph.qbl t3, t0          /* t3 = |0|P3|0|P2| */
+    preceu.ph.qbr t0, t0          /* t0 = |0|P1|0|P0| */
+    preceu.ph.qbr t2, t2          /* t2 = |0|P5|0|P4| */
+    preceu.ph.qbl t4, t1          /* t4 = |0|P4|0|P3| */
+    preceu.ph.qbr t1, t1          /* t1 = |0|P2|0|P1| */
+    shll.ph       t5, t4, 1
+    shll.ph       t6, t1, 1
+    addu.ph       t5, t5, t4      /* t5 = |P4*3|P3*3| */
+    addu.ph       t6, t6, t1      /* t6 = |P2*3|P1*3| */
+    addu.ph       t4, t3, s3
+    addu.ph       t0, t0, s3
+    addu.ph       t4, t4, t5
+    addu.ph       t0, t0, t6
+    shrl.ph       t4, t4, 2       /* t4 = |0|P3|0|P2| */
+    shrl.ph       t0, t0, 2       /* t0 = |0|P1|0|P0| */
+    addu.ph       t2, t2, t5
+    addu.ph       t3, t3, t6
+    shra_r.ph     t2, t2, 2       /* t2 = |0|P5|0|P4| */
+    shra_r.ph     t3, t3, 2       /* t3 = |0|P3|0|P2| */
+    shll.ph       t2, t2, 8
+    shll.ph       t3, t3, 8
+    or            t2, t4, t2
+    or            t3, t3, t0
+    addiu         t9, -1
+    usw           t3, 0(s2)
+    usw           t2, 4(s2)
+    addiu         s2, 8
+    bgtz          t9, 1b
+     addiu        t7, 4
+11:
+    andi          t8, 3
+    beqz          t8, 22f
+     addiu        t7, 1
+
+2:
+    lbu           t0, 0(t7)
+    addiu         t7, 1
+    sll           t1, t0, 1
+    addu          t2, t0, t1      /* t2 = invalue */
+    lbu           t3, -2(t7)
+    lbu           t4, 0(t7)
+    addiu         t3, 1
+    addiu         t4, 2
+    addu          t3, t3, t2
+    addu          t4, t4, t2
+    srl           t3, 2
+    srl           t4, 2
+    sb            t3, 0(s2)
+    sb            t4, 1(s2)
+    addiu         t8, -1
+    bgtz          t8, 2b
+     addiu        s2, 2
+
+22:
+    lbu           t0, 0(t7)
+    lbu           t2, -1(t7)
+    sll           t1, t0, 1
+    addu          t1, t1, t0      /* t1 = invalue * 3 */
+    addu          t1, t1, t2
+    addiu         t1, 1
+    srl           t1, t1, 2
+    sb            t1, 0(s2)
+    sb            t0, 1(s2)
+    addiu         s1, 4
+    bne           s1, s0, 0b
+     addiu        a2, 4
+3:
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+    j             ra
+     nop
+END(jsimd_h2v1_fancy_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v1_downsample_dspr2)
+/*
+ * a0     = cinfo->image_width
+ * a1     = cinfo->max_v_samp_factor
+ * a2     = compptr->v_samp_factor
+ * a3     = compptr->width_in_blocks
+ * 16(sp) = input_data
+ * 20(sp) = output_data
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4
+
+    beqz        a2, 7f
+     lw         s1, 44(sp)      /* s1 = output_data */
+    lw          s0, 40(sp)      /* s0 = input_data */
+    srl         s2, a0, 2
+    andi        t9, a0, 2
+    srl         t7, t9, 1
+    addu        s2, t7, s2
+    sll         t0, a3, 3       /* t0 = width_in_blocks*DCT */
+    srl         t7, t0, 1
+    subu        s2, t7, s2
+0:
+    andi        t6, a0, 1       /* t6 = temp_index */
+    addiu       t6, -1
+    lw          t4, 0(s1)       /* t4 = outptr */
+    lw          t5, 0(s0)       /* t5 = inptr0 */
+    li          s3, 0           /* s3 = bias */
+    srl         t7, a0, 1       /* t7 = image_width1 */
+    srl         s4, t7, 2
+    andi        t8, t7, 3
+1:
+    ulhu        t0, 0(t5)
+    ulhu        t1, 2(t5)
+    ulhu        t2, 4(t5)
+    ulhu        t3, 6(t5)
+    raddu.w.qb  t0, t0
+    raddu.w.qb  t1, t1
+    raddu.w.qb  t2, t2
+    raddu.w.qb  t3, t3
+    shra.ph     t0, t0, 1
+    shra_r.ph   t1, t1, 1
+    shra.ph     t2, t2, 1
+    shra_r.ph   t3, t3, 1
+    sb          t0, 0(t4)
+    sb          t1, 1(t4)
+    sb          t2, 2(t4)
+    sb          t3, 3(t4)
+    addiu       s4, -1
+    addiu       t4, 4
+    bgtz        s4, 1b
+     addiu      t5, 8
+    beqz        t8, 3f
+     addu       s4, t4, t8
+2:
+    ulhu        t0, 0(t5)
+    raddu.w.qb  t0, t0
+    addqh.w     t0, t0, s3
+    xori        s3, s3, 1
+    sb          t0, 0(t4)
+    addiu       t4, 1
+    bne         t4, s4, 2b
+     addiu      t5, 2
+3:
+    lbux        t1, t6(t5)
+    sll         t1, 1
+    addqh.w     t2, t1, s3      /* t2 = pixval1 */
+    xori        s3, s3, 1
+    addqh.w     t3, t1, s3      /* t3 = pixval2 */
+    blez        s2, 5f
+     append     t3, t2,  8
+    addu        t5, t4, s2      /* t5 = loop_end2 */
+4:
+    ush         t3, 0(t4)
+    addiu       s2, -1
+    bgtz        s2, 4b
+     addiu      t4,  2
+5:
+    beqz        t9, 6f
+     nop
+    sb          t2, 0(t4)
+6:
+    addiu       s1, 4
+    addiu       a2, -1
+    bnez        a2, 0b
+     addiu      s0, 4
+7:
+    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4
+
+    j           ra
+    nop
+END(jsimd_h2v1_downsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v2_downsample_dspr2)
+/*
+ * a0     = cinfo->image_width
+ * a1     = cinfo->max_v_samp_factor
+ * a2     = compptr->v_samp_factor
+ * a3     = compptr->width_in_blocks
+ * 16(sp) = input_data
+ * 20(sp) = output_data
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    beqz        a2, 8f
+     lw         s1, 52(sp)      /* s1 = output_data */
+    lw          s0, 48(sp)      /* s0 = input_data */
+
+    andi        t6, a0, 1       /* t6 = temp_index */
+    addiu       t6, -1
+    srl         t7, a0, 1       /* t7 = image_width1 */
+    srl         s4, t7, 2
+    andi        t8, t7, 3
+    andi        t9, a0, 2
+    srl         s2, a0, 2
+    srl         t7, t9, 1
+    addu        s2, t7, s2
+    sll         t0, a3, 3       /* s2 = width_in_blocks*DCT */
+    srl         t7, t0, 1
+    subu        s2, t7, s2
+0:
+    lw          t4, 0(s1)       /* t4 = outptr */
+    lw          t5, 0(s0)       /* t5 = inptr0 */
+    lw          s7, 4(s0)       /* s7 = inptr1 */
+    li          s6, 1           /* s6 = bias */
+2:
+    ulw         t0, 0(t5)       /* t0 = |P3|P2|P1|P0| */
+    ulw         t1, 0(s7)       /* t1 = |Q3|Q2|Q1|Q0| */
+    ulw         t2, 4(t5)
+    ulw         t3, 4(s7)
+    precrq.ph.w t7, t0, t1      /* t2 = |P3|P2|Q3|Q2| */
+    ins         t0, t1, 16, 16  /* t0 = |Q1|Q0|P1|P0| */
+    raddu.w.qb  t1, t7
+    raddu.w.qb  t0, t0
+    shra_r.w    t1, t1, 2
+    addiu       t0, 1
+    srl         t0, 2
+    precrq.ph.w t7, t2, t3
+    ins         t2, t3, 16, 16
+    raddu.w.qb  t7, t7
+    raddu.w.qb  t2, t2
+    shra_r.w    t7, t7, 2
+    addiu       t2, 1
+    srl         t2, 2
+    sb          t0, 0(t4)
+    sb          t1, 1(t4)
+    sb          t2, 2(t4)
+    sb          t7, 3(t4)
+    addiu       t4, 4
+    addiu       t5, 8
+    addiu       s4, s4, -1
+    bgtz        s4, 2b
+     addiu      s7, 8
+    beqz        t8, 4f
+     addu       t8, t4, t8
+3:
+    ulhu        t0, 0(t5)
+    ulhu        t1, 0(s7)
+    ins         t0, t1, 16, 16
+    raddu.w.qb  t0, t0
+    addu        t0, t0, s6
+    srl         t0, 2
+    xori        s6, s6, 3
+    sb          t0, 0(t4)
+    addiu       t5, 2
+    addiu       t4, 1
+    bne         t8, t4, 3b
+     addiu      s7, 2
+4:
+    lbux        t1, t6(t5)
+    sll         t1, 1
+    lbux        t0, t6(s7)
+    sll         t0, 1
+    addu        t1, t1, t0
+    addu        t3, t1, s6
+    srl         t0, t3, 2       /* t2 = pixval1 */
+    xori        s6, s6, 3
+    addu        t2, t1, s6
+    srl         t1, t2, 2       /* t3 = pixval2 */
+    blez        s2, 6f
+     append     t1, t0, 8
+5:
+    ush         t1, 0(t4)
+    addiu       s2, -1
+    bgtz        s2, 5b
+     addiu      t4, 2
+6:
+    beqz        t9, 7f
+     nop
+    sb          t0, 0(t4)
+7:
+    addiu       s1, 4
+    addiu       a2, -1
+    bnez        a2, 0b
+     addiu      s0, 8
+8:
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+END(jsimd_h2v2_downsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2)
+/*
+ * a0     = input_data
+ * a1     = output_data
+ * a2     = compptr->v_samp_factor
+ * a3     = cinfo->max_v_samp_factor
+ * 16(sp) = cinfo->smoothing_factor
+ * 20(sp) = compptr->width_in_blocks
+ * 24(sp) = cinfo->image_width
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw          s7, 52(sp)      /* compptr->width_in_blocks */
+    lw          s0, 56(sp)      /* cinfo->image_width */
+    lw          s6, 48(sp)      /* cinfo->smoothing_factor */
+    sll         s7, 3           /* output_cols = width_in_blocks * DCTSIZE */
+    sll         v0, s7, 1
+    subu        v0, v0, s0
+    blez        v0, 2f
+    move        v1, zero
+    addiu       t0, a3, 2       /* t0 = cinfo->max_v_samp_factor + 2 */
+0:
+    addiu       t1, a0, -4
+    sll         t2, v1, 2
+    lwx         t1, t2(t1)
+    move        t3, v0
+    addu        t1, t1, s0
+    lbu         t2, -1(t1)
+1:
+    addiu       t3, t3, -1
+    sb          t2, 0(t1)
+    bgtz        t3, 1b
+    addiu       t1, t1, 1
+    addiu       v1, v1, 1
+    bne         v1, t0, 0b
+    nop
+2:
+    li          v0, 80
+    mul         v0, s6, v0
+    li          v1, 16384
+    move        t4, zero
+    move        t5, zero
+    subu        t6, v1, v0      /* t6 = 16384 - tmp_smoot_f * 80 */
+    sll         t7, s6, 4       /* t7 = tmp_smoot_f * 16 */
+3:
+/* Special case for first column: pretend column -1 is same as column 0 */
+    sll         v0, t4, 2
+    lwx         t8, v0(a1)      /*  outptr = output_data[outrow] */
+    sll         v1, t5, 2
+    addiu       t9, v1, 4
+    addiu       s0, v1, -4
+    addiu       s1, v1, 8
+    lwx         s2, v1(a0)      /* inptr0 = input_data[inrow] */
+    lwx         t9, t9(a0)      /* inptr1 = input_data[inrow+1] */
+    lwx         s0, s0(a0)      /* above_ptr = input_data[inrow-1] */
+    lwx         s1, s1(a0)      /* below_ptr = input_data[inrow+2] */
+    lh          v0, 0(s2)
+    lh          v1, 0(t9)
+    lh          t0, 0(s0)
+    lh          t1, 0(s1)
+    ins         v0, v1, 16, 16
+    ins         t0, t1, 16, 16
+    raddu.w.qb  t2, v0
+    raddu.w.qb  s3, t0
+    lbu         v0, 0(s2)
+    lbu         v1, 2(s2)
+    lbu         t0, 0(t9)
+    lbu         t1, 2(t9)
+    addu        v0, v0, v1
+    mult        $ac1, t2, t6
+    addu        t0, t0, t1
+    lbu         t2, 2(s0)
+    addu        t0, t0, v0
+    lbu         t3, 2(s1)
+    addu        s3, t0, s3
+    lbu         v0, 0(s0)
+    lbu         t0, 0(s1)
+    sll         s3, s3, 1
+    addu        v0, v0, t2
+    addu        t0, t0, t3
+    addu        t0, t0, v0
+    addu        s3, t0, s3
+    madd        $ac1, s3, t7
+    extr_r.w    v0, $ac1, 16
+    addiu       t8, t8, 1
+    addiu       s2, s2, 2
+    addiu       t9, t9, 2
+    addiu       s0, s0, 2
+    addiu       s1, s1, 2
+    sb          v0, -1(t8)
+    addiu       s4, s7, -2
+    and         s4, s4, 3
+    addu        s5, s4, t8      /* end address */
+4:
+    lh          v0, 0(s2)
+    lh          v1, 0(t9)
+    lh          t0, 0(s0)
+    lh          t1, 0(s1)
+    ins         v0, v1, 16, 16
+    ins         t0, t1, 16, 16
+    raddu.w.qb  t2, v0
+    raddu.w.qb  s3, t0
+    lbu         v0, -1(s2)
+    lbu         v1, 2(s2)
+    lbu         t0, -1(t9)
+    lbu         t1, 2(t9)
+    addu        v0, v0, v1
+    mult        $ac1, t2, t6
+    addu        t0, t0, t1
+    lbu         t2, 2(s0)
+    addu        t0, t0, v0
+    lbu         t3, 2(s1)
+    addu        s3, t0, s3
+    lbu         v0, -1(s0)
+    lbu         t0, -1(s1)
+    sll         s3, s3, 1
+    addu        v0, v0, t2
+    addu        t0, t0, t3
+    addu        t0, t0, v0
+    addu        s3, t0, s3
+    madd        $ac1, s3, t7
+    extr_r.w    t2, $ac1, 16
+    addiu       t8, t8, 1
+    addiu       s2, s2, 2
+    addiu       t9, t9, 2
+    addiu       s0, s0, 2
+    sb          t2, -1(t8)
+    bne         s5, t8, 4b
+    addiu       s1, s1, 2
+    addiu       s5, s7, -2
+    subu        s5, s5, s4
+    addu        s5, s5, t8      /* end address */
+5:
+    lh          v0, 0(s2)
+    lh          v1, 0(t9)
+    lh          t0, 0(s0)
+    lh          t1, 0(s1)
+    ins         v0, v1, 16, 16
+    ins         t0, t1, 16, 16
+    raddu.w.qb  t2, v0
+    raddu.w.qb  s3, t0
+    lbu         v0, -1(s2)
+    lbu         v1, 2(s2)
+    lbu         t0, -1(t9)
+    lbu         t1, 2(t9)
+    addu        v0, v0, v1
+    mult        $ac1, t2, t6
+    addu        t0, t0, t1
+    lbu         t2, 2(s0)
+    addu        t0, t0, v0
+    lbu         t3, 2(s1)
+    addu        s3, t0, s3
+    lbu         v0, -1(s0)
+    lbu         t0, -1(s1)
+    sll         s3, s3, 1
+    addu        v0, v0, t2
+    addu        t0, t0, t3
+    lh          v1, 2(t9)
+    addu        t0, t0, v0
+    lh          v0, 2(s2)
+    addu        s3, t0, s3
+    lh          t0, 2(s0)
+    lh          t1, 2(s1)
+    madd        $ac1, s3, t7
+    extr_r.w    t2, $ac1, 16
+    ins         t0, t1, 16, 16
+    ins         v0, v1, 16, 16
+    raddu.w.qb  s3, t0
+    lbu         v1, 4(s2)
+    lbu         t0, 1(t9)
+    lbu         t1, 4(t9)
+    sb          t2, 0(t8)
+    raddu.w.qb  t3, v0
+    lbu         v0, 1(s2)
+    addu        t0, t0, t1
+    mult        $ac1, t3, t6
+    addu        v0, v0, v1
+    lbu         t2, 4(s0)
+    addu        t0, t0, v0
+    lbu         v0, 1(s0)
+    addu        s3, t0, s3
+    lbu         t0, 1(s1)
+    lbu         t3, 4(s1)
+    addu        v0, v0, t2
+    sll         s3, s3, 1
+    addu        t0, t0, t3
+    lh          v1, 4(t9)
+    addu        t0, t0, v0
+    lh          v0, 4(s2)
+    addu        s3, t0, s3
+    lh          t0, 4(s0)
+    lh          t1, 4(s1)
+    madd        $ac1, s3, t7
+    extr_r.w    t2, $ac1, 16
+    ins         t0, t1, 16, 16
+    ins         v0, v1, 16, 16
+    raddu.w.qb  s3, t0
+    lbu         v1, 6(s2)
+    lbu         t0, 3(t9)
+    lbu         t1, 6(t9)
+    sb          t2, 1(t8)
+    raddu.w.qb  t3, v0
+    lbu         v0, 3(s2)
+    addu        t0, t0, t1
+    mult        $ac1, t3, t6
+    addu        v0, v0, v1
+    lbu         t2, 6(s0)
+    addu        t0, t0, v0
+    lbu         v0, 3(s0)
+    addu        s3, t0, s3
+    lbu         t0, 3(s1)
+    lbu         t3, 6(s1)
+    addu        v0, v0, t2
+    sll         s3, s3, 1
+    addu        t0, t0, t3
+    lh          v1, 6(t9)
+    addu        t0, t0, v0
+    lh          v0, 6(s2)
+    addu        s3, t0, s3
+    lh          t0, 6(s0)
+    lh          t1, 6(s1)
+    madd        $ac1, s3, t7
+    extr_r.w    t3, $ac1, 16
+    ins         t0, t1, 16, 16
+    ins         v0, v1, 16, 16
+    raddu.w.qb  s3, t0
+    lbu         v1, 8(s2)
+    lbu         t0, 5(t9)
+    lbu         t1, 8(t9)
+    sb          t3, 2(t8)
+    raddu.w.qb  t2, v0
+    lbu         v0, 5(s2)
+    addu        t0, t0, t1
+    mult        $ac1, t2, t6
+    addu        v0, v0, v1
+    lbu         t2, 8(s0)
+    addu        t0, t0, v0
+    lbu         v0, 5(s0)
+    addu        s3, t0, s3
+    lbu         t0, 5(s1)
+    lbu         t3, 8(s1)
+    addu        v0, v0, t2
+    sll         s3, s3, 1
+    addu        t0, t0, t3
+    addiu       t8, t8, 4
+    addu        t0, t0, v0
+    addiu       s2, s2, 8
+    addu        s3, t0, s3
+    addiu       t9, t9, 8
+    madd        $ac1, s3, t7
+    extr_r.w    t1, $ac1, 16
+    addiu       s0, s0, 8
+    addiu       s1, s1, 8
+    bne         s5, t8, 5b
+    sb          t1, -1(t8)
+/* Special case for last column */
+    lh          v0, 0(s2)
+    lh          v1, 0(t9)
+    lh          t0, 0(s0)
+    lh          t1, 0(s1)
+    ins         v0, v1, 16, 16
+    ins         t0, t1, 16, 16
+    raddu.w.qb  t2, v0
+    raddu.w.qb  s3, t0
+    lbu         v0, -1(s2)
+    lbu         v1, 1(s2)
+    lbu         t0, -1(t9)
+    lbu         t1, 1(t9)
+    addu        v0, v0, v1
+    mult        $ac1, t2, t6
+    addu        t0, t0, t1
+    lbu         t2, 1(s0)
+    addu        t0, t0, v0
+    lbu         t3, 1(s1)
+    addu        s3, t0, s3
+    lbu         v0, -1(s0)
+    lbu         t0, -1(s1)
+    sll         s3, s3, 1
+    addu        v0, v0, t2
+    addu        t0, t0, t3
+    addu        t0, t0, v0
+    addu        s3, t0, s3
+    madd        $ac1, s3, t7
+    extr_r.w    t0, $ac1, 16
+    addiu       t5, t5, 2
+    sb          t0, 0(t8)
+    addiu       t4, t4, 1
+    bne         t4, a2, 3b
+    addiu       t5, t5, 2
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+
+END(jsimd_h2v2_smooth_downsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_int_upsample_dspr2)
+/*
+ * a0     = upsample->h_expand[compptr->component_index]
+ * a1     = upsample->v_expand[compptr->component_index]
+ * a2     = input_data
+ * a3     = output_data_ptr
+ * 16(sp) = cinfo->output_width
+ * 20(sp) = cinfo->max_v_samp_factor
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+    lw          s0, 0(a3)       /* s0 = output_data */
+    lw          s1, 32(sp)      /* s1 = cinfo->output_width */
+    lw          s2, 36(sp)      /* s2 = cinfo->max_v_samp_factor */
+    li          t6, 0           /* t6 = inrow */
+    beqz        s2, 10f
+     li         s3, 0           /* s3 = outrow */
+0:
+    addu        t0, a2, t6
+    addu        t7, s0, s3
+    lw          t3, 0(t0)       /* t3 = inptr */
+    lw          t8, 0(t7)       /* t8 = outptr */
+    beqz        s1, 4f
+     addu       t5, t8, s1      /* t5 = outend */
+1:
+    lb          t2, 0(t3)       /* t2 = invalue = *inptr++ */
+    addiu       t3, 1
+    beqz        a0, 3f
+     move       t0, a0          /* t0 = h_expand */
+2:
+    sb          t2, 0(t8)
+    addiu       t0, -1
+    bgtz        t0, 2b
+     addiu      t8, 1
+3:
+    bgt         t5, t8, 1b
+     nop
+4:
+    addiu       t9, a1, -1      /* t9 = v_expand - 1 */
+    blez        t9, 9f
+     nop
+5:
+    lw          t3, 0(s0)
+    lw          t4, 4(s0)
+    subu        t0, s1, 0xF
+    blez        t0, 7f
+     addu       t5, t3, s1      /* t5 = end address */
+    andi        t7, s1, 0xF     /* t7 = residual */
+    subu        t8, t5, t7
+6:
+    ulw         t0, 0(t3)
+    ulw         t1, 4(t3)
+    ulw         t2, 8(t3)
+    usw         t0, 0(t4)
+    ulw         t0, 12(t3)
+    usw         t1, 4(t4)
+    usw         t2, 8(t4)
+    usw         t0, 12(t4)
+    addiu       t3, 16
+    bne         t3, t8, 6b
+     addiu      t4, 16
+    beqz        t7, 8f
+     nop
+7:
+    lbu         t0, 0(t3)
+    sb          t0, 0(t4)
+    addiu       t3, 1
+    bne         t3, t5, 7b
+     addiu      t4, 1
+8:
+    addiu       t9, -1
+    bgtz        t9, 5b
+     addiu      s0, 8
+9:
+    addu        s3, s3, a1
+    bne         s3, s2, 0b
+     addiu      t6, 1
+10:
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+    j           ra
+     nop
+END(jsimd_int_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v1_upsample_dspr2)
+/*
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = cinfo->output_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
+    lw          t7, 0(a3)       /* t7 = output_data */
+    andi        t8, a1, 0xf     /* t8 = residual */
+    sll         t0, a0, 2
+    blez        a0, 4f
+     addu       t9, t7, t0      /* t9 = output_data end address */
+0:
+    lw          t5, 0(t7)       /* t5 = outptr */
+    lw          t6, 0(a2)       /* t6 = inptr */
+    addu        t3, t5, a1      /* t3 = outptr + output_width (end address) */
+    subu        t3, t8          /* t3 = end address - residual */
+    beq         t5, t3, 2f
+     move       t4, t8
+1:
+    ulw         t0, 0(t6)       /* t0 = |P3|P2|P1|P0| */
+    ulw         t2, 4(t6)       /* t2 = |P7|P6|P5|P4| */
+    srl         t1, t0, 16      /* t1 = |X|X|P3|P2| */
+    ins         t0, t0, 16, 16  /* t0 = |P1|P0|P1|P0| */
+    ins         t1, t1, 16, 16  /* t1 = |P3|P2|P3|P2| */
+    ins         t0, t0, 8, 16   /* t0 = |P1|P1|P0|P0| */
+    ins         t1, t1, 8, 16   /* t1 = |P3|P3|P2|P2| */
+    usw         t0, 0(t5)
+    usw         t1, 4(t5)
+    srl         t0, t2, 16      /* t0 = |X|X|P7|P6| */
+    ins         t2, t2, 16, 16  /* t2 = |P5|P4|P5|P4| */
+    ins         t0, t0, 16, 16  /* t0 = |P7|P6|P7|P6| */
+    ins         t2, t2, 8, 16   /* t2 = |P5|P5|P4|P4| */
+    ins         t0, t0, 8, 16   /* t0 = |P7|P7|P6|P6| */
+    usw         t2, 8(t5)
+    usw         t0, 12(t5)
+    addiu       t5, 16
+    bne         t5, t3, 1b
+     addiu      t6, 8
+    beqz        t8, 3f
+     move       t4, t8
+2:
+    lbu         t1, 0(t6)
+    sb          t1, 0(t5)
+    sb          t1, 1(t5)
+    addiu       t4, -2
+    addiu       t6, 1
+    bgtz        t4, 2b
+     addiu      t5, 2
+3:
+    addiu       t7, 4
+    bne         t9, t7, 0b
+     addiu      a2, 4
+4:
+    j           ra
+     nop
+END(jsimd_h2v1_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v2_upsample_dspr2)
+/*
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = cinfo->output_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
+    lw          t7, 0(a3)
+    blez        a0, 7f
+     andi       t9, a1, 0xf     /* t9 = residual */
+0:
+    lw          t6, 0(a2)       /* t6 = inptr */
+    lw          t5, 0(t7)       /* t5 = outptr */
+    addu        t8, t5, a1      /* t8 = outptr end address */
+    subu        t8, t9          /* t8 = end address - residual */
+    beq         t5, t8, 2f
+     move       t4, t9
+1:
+    ulw         t0, 0(t6)
+    srl         t1, t0, 16
+    ins         t0, t0, 16, 16
+    ins         t0, t0, 8, 16
+    ins         t1, t1, 16, 16
+    ins         t1, t1, 8, 16
+    ulw         t2, 4(t6)
+    usw         t0, 0(t5)
+    usw         t1, 4(t5)
+    srl         t3, t2, 16
+    ins         t2, t2, 16, 16
+    ins         t2, t2, 8, 16
+    ins         t3, t3, 16, 16
+    ins         t3, t3, 8, 16
+    usw         t2, 8(t5)
+    usw         t3, 12(t5)
+    addiu       t5, 16
+    bne         t5, t8, 1b
+     addiu      t6, 8
+    beqz        t9, 3f
+     move       t4, t9
+2:
+    lbu         t0, 0(t6)
+    sb          t0, 0(t5)
+    sb          t0, 1(t5)
+    addiu       t4, -2
+    addiu       t6, 1
+    bgtz        t4, 2b
+     addiu      t5, 2
+3:
+    lw          t6, 0(t7)       /* t6 = outptr[0] */
+    lw          t5, 4(t7)       /* t5 = outptr[1] */
+    addu        t4, t6, a1      /* t4 = new end address */
+    beq         a1, t9, 5f
+     subu       t8, t4, t9
+4:
+    ulw         t0, 0(t6)
+    ulw         t1, 4(t6)
+    ulw         t2, 8(t6)
+    usw         t0, 0(t5)
+    ulw         t0, 12(t6)
+    usw         t1, 4(t5)
+    usw         t2, 8(t5)
+    usw         t0, 12(t5)
+    addiu       t6, 16
+    bne         t6, t8, 4b
+     addiu      t5, 16
+    beqz        t9, 6f
+     nop
+5:
+    lbu         t0, 0(t6)
+    sb          t0, 0(t5)
+    addiu       t6, 1
+    bne         t6, t4, 5b
+     addiu      t5, 1
+6:
+    addiu       t7, 8
+    addiu       a0, -2
+    bgtz        a0, 0b
+     addiu      a2, 4
+7:
+    j           ra
+     nop
+END(jsimd_h2v2_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_islow_dspr2)
+/*
+ * a0 = coef_block
+ * a1 = compptr->dcttable
+ * a2 = output
+ * a3 = range_limit
+ */
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    addiu       sp, sp, -256
+    move        v0, sp
+    addiu       v1, zero, 8     /* v1 = DCTSIZE = 8 */
+1:
+    lh          s4, 32(a0)      /* s4 = inptr[16] */
+    lh          s5, 64(a0)      /* s5 = inptr[32] */
+    lh          s6, 96(a0)      /* s6 = inptr[48] */
+    lh          t1, 112(a0)     /* t1 = inptr[56] */
+    lh          t7, 16(a0)      /* t7 = inptr[8] */
+    lh          t5, 80(a0)      /* t5 = inptr[40] */
+    lh          t3, 48(a0)      /* t3 = inptr[24] */
+    or          s4, s4, t1
+    or          s4, s4, t3
+    or          s4, s4, t5
+    or          s4, s4, t7
+    or          s4, s4, s5
+    or          s4, s4, s6
+    bnez        s4, 2f
+     addiu      v1, v1, -1
+    lh          s5, 0(a1)       /* quantptr[DCTSIZE*0] */
+    lh          s6, 0(a0)       /* inptr[DCTSIZE*0] */
+    mul         s5, s5, s6      /* DEQUANTIZE(inptr[0], quantptr[0]) */
+    sll         s5, s5, 2
+    sw          s5, 0(v0)
+    sw          s5, 32(v0)
+    sw          s5, 64(v0)
+    sw          s5, 96(v0)
+    sw          s5, 128(v0)
+    sw          s5, 160(v0)
+    sw          s5, 192(v0)
+    b           3f
+     sw         s5, 224(v0)
+2:
+    lh          t0, 112(a1)
+    lh          t2, 48(a1)
+    lh          t4, 80(a1)
+    lh          t6, 16(a1)
+    mul         t0, t0, t1      /* DEQUANTIZE(inptr[DCTSIZE*7],
+                                              quantptr[DCTSIZE*7]) */
+    mul         t1, t2, t3      /* DEQUANTIZE(inptr[DCTSIZE*3],
+                                              quantptr[DCTSIZE*3]) */
+    mul         t2, t4, t5      /* DEQUANTIZE(inptr[DCTSIZE*5],
+                                              quantptr[DCTSIZE*5]) */
+    mul         t3, t6, t7      /* DEQUANTIZE(inptr[DCTSIZE*1],
+                                              quantptr[DCTSIZE*1]) */
+    lh          t4, 32(a1)
+    lh          t5, 32(a0)
+    lh          t6, 96(a1)
+    lh          t7, 96(a0)
+    addu        s0, t0, t1       /* z3 = tmp0 + tmp2 */
+    addu        s1, t1, t2       /* z2 = tmp1 + tmp2 */
+    addu        s2, t2, t3       /* z4 = tmp1 + tmp3 */
+    addu        s3, s0, s2       /* z3 + z4 */
+    addiu       t9, zero, 9633   /* FIX_1_175875602 */
+    mul         s3, s3, t9       /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    addu        t8, t0, t3       /* z1 = tmp0 + tmp3 */
+    addiu       t9, zero, 2446   /* FIX_0_298631336 */
+    mul         t0, t0, t9       /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    addiu       t9, zero, 16819  /* FIX_2_053119869 */
+    mul         t2, t2, t9       /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    addiu       t9, zero, 25172  /* FIX_3_072711026 */
+    mul         t1, t1, t9       /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    addiu       t9, zero, 12299  /* FIX_1_501321110 */
+    mul         t3, t3, t9       /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    addiu       t9, zero, 16069  /* FIX_1_961570560 */
+    mul         s0, s0, t9       /* -z3 = MULTIPLY(z3, FIX_1_961570560) */
+    addiu       t9, zero, 3196   /* FIX_0_390180644 */
+    mul         s2, s2, t9       /* -z4 = MULTIPLY(z4, FIX_0_390180644) */
+    addiu       t9, zero, 7373   /* FIX_0_899976223 */
+    mul         t8, t8, t9       /* -z1 = MULTIPLY(z1, FIX_0_899976223) */
+    addiu       t9, zero, 20995  /* FIX_2_562915447 */
+    mul         s1, s1, t9       /* -z2 = MULTIPLY(z2, FIX_2_562915447) */
+    subu        s0, s3, s0       /* z3 += z5 */
+    addu        t0, t0, s0       /* tmp0 += z3 */
+    addu        t1, t1, s0       /* tmp2 += z3 */
+    subu        s2, s3, s2       /* z4 += z5 */
+    addu        t2, t2, s2       /* tmp1 += z4 */
+    addu        t3, t3, s2       /* tmp3 += z4 */
+    subu        t0, t0, t8       /* tmp0 += z1 */
+    subu        t1, t1, s1       /* tmp2 += z2 */
+    subu        t2, t2, s1       /* tmp1 += z2 */
+    subu        t3, t3, t8       /* tmp3 += z1 */
+    mul         s0, t4, t5       /* DEQUANTIZE(inptr[DCTSIZE*2],
+                                               quantptr[DCTSIZE*2]) */
+    addiu       t9, zero, 6270   /* FIX_0_765366865 */
+    mul         s1, t6, t7       /* DEQUANTIZE(inptr[DCTSIZE*6],
+                                               quantptr[DCTSIZE*6]) */
+    lh          t4, 0(a1)
+    lh          t5, 0(a0)
+    lh          t6, 64(a1)
+    lh          t7, 64(a0)
+    mul         s2, t9, s0       /* MULTIPLY(z2, FIX_0_765366865) */
+    mul         t5, t4, t5       /* DEQUANTIZE(inptr[DCTSIZE*0],
+                                               quantptr[DCTSIZE*0]) */
+    mul         t6, t6, t7       /* DEQUANTIZE(inptr[DCTSIZE*4],
+                                               quantptr[DCTSIZE*4]) */
+    addiu       t9, zero, 4433   /* FIX_0_541196100 */
+    addu        s3, s0, s1       /* z2 + z3 */
+    mul         s3, s3, t9       /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */
+    addiu       t9, zero, 15137  /* FIX_1_847759065 */
+    mul         t8, s1, t9       /* MULTIPLY(z3, FIX_1_847759065) */
+    addu        t4, t5, t6
+    subu        t5, t5, t6
+    sll         t4, t4, 13      /* tmp0 = (z2 + z3) << CONST_BITS */
+    sll         t5, t5, 13      /* tmp1 = (z2 - z3) << CONST_BITS */
+    addu        t7, s3, s2      /* tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) */
+    subu        t6, s3, t8      /* tmp2 =
+                                     z1 + MULTIPLY(z3, -FIX_1_847759065) */
+    addu        s0, t4, t7
+    subu        s1, t4, t7
+    addu        s2, t5, t6
+    subu        s3, t5, t6
+    addu        t4, s0, t3
+    subu        s0, s0, t3
+    addu        t3, s2, t1
+    subu        s2, s2, t1
+    addu        t1, s3, t2
+    subu        s3, s3, t2
+    addu        t2, s1, t0
+    subu        s1, s1, t0
+    shra_r.w    t4, t4, 11
+    shra_r.w    t3, t3, 11
+    shra_r.w    t1, t1, 11
+    shra_r.w    t2, t2, 11
+    shra_r.w    s1, s1, 11
+    shra_r.w    s3, s3, 11
+    shra_r.w    s2, s2, 11
+    shra_r.w    s0, s0, 11
+    sw          t4, 0(v0)
+    sw          t3, 32(v0)
+    sw          t1, 64(v0)
+    sw          t2, 96(v0)
+    sw          s1, 128(v0)
+    sw          s3, 160(v0)
+    sw          s2, 192(v0)
+    sw          s0, 224(v0)
+3:
+    addiu       a1, a1, 2
+    addiu       a0, a0, 2
+    bgtz        v1, 1b
+     addiu      v0, v0, 4
+    move        v0, sp
+    addiu       v1, zero, 8
+4:
+    lw          t0, 8(v0)       /* z2 = (JLONG)wsptr[2] */
+    lw          t1, 24(v0)      /* z3 = (JLONG)wsptr[6] */
+    lw          t2, 0(v0)       /* (JLONG)wsptr[0] */
+    lw          t3, 16(v0)      /* (JLONG)wsptr[4] */
+    lw          s4, 4(v0)       /* (JLONG)wsptr[1] */
+    lw          s5, 12(v0)      /* (JLONG)wsptr[3] */
+    lw          s6, 20(v0)      /* (JLONG)wsptr[5] */
+    lw          s7, 28(v0)      /* (JLONG)wsptr[7] */
+    or          s4, s4, t0
+    or          s4, s4, t1
+    or          s4, s4, t3
+    or          s4, s4, s7
+    or          s4, s4, s5
+    or          s4, s4, s6
+    bnez        s4, 5f
+     addiu      v1, v1, -1
+    shra_r.w    s5, t2, 5
+    andi        s5, s5, 0x3ff
+    lbux        s5, s5(a3)
+    lw          s1, 0(a2)
+    replv.qb    s5, s5
+    usw         s5, 0(s1)
+    usw         s5, 4(s1)
+    b           6f
+     nop
+5:
+    addu        t4, t0, t1       /* z2 + z3 */
+    addiu       t8, zero, 4433   /* FIX_0_541196100 */
+    mul         t5, t4, t8       /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */
+    addiu       t8, zero, 15137  /* FIX_1_847759065 */
+    mul         t1, t1, t8       /* MULTIPLY(z3, FIX_1_847759065) */
+    addiu       t8, zero, 6270   /* FIX_0_765366865 */
+    mul         t0, t0, t8       /* MULTIPLY(z2, FIX_0_765366865) */
+    addu        t4, t2, t3       /* (JLONG)wsptr[0] + (JLONG)wsptr[4] */
+    subu        t2, t2, t3       /* (JLONG)wsptr[0] - (JLONG)wsptr[4] */
+    sll         t4, t4, 13       /* tmp0 =
+                                      (wsptr[0] + wsptr[4]) << CONST_BITS */
+    sll         t2, t2, 13       /* tmp1 =
+                                      (wsptr[0] - wsptr[4]) << CONST_BITS */
+    subu        t1, t5, t1       /* tmp2 =
+                                      z1 + MULTIPLY(z3, -FIX_1_847759065) */
+    subu        t3, t2, t1       /* tmp12 = tmp1 - tmp2 */
+    addu        t2, t2, t1       /* tmp11 = tmp1 + tmp2 */
+    addu        t5, t5, t0       /* tmp3 =
+                                      z1 + MULTIPLY(z2, FIX_0_765366865) */
+    subu        t1, t4, t5       /* tmp13 = tmp0 - tmp3 */
+    addu        t0, t4, t5       /* tmp10 = tmp0 + tmp3 */
+    lw          t4, 28(v0)       /* tmp0 = (JLONG)wsptr[7] */
+    lw          t6, 12(v0)       /* tmp2 = (JLONG)wsptr[3] */
+    lw          t5, 20(v0)       /* tmp1 = (JLONG)wsptr[5] */
+    lw          t7, 4(v0)        /* tmp3 = (JLONG)wsptr[1] */
+    addu        s0, t4, t6       /* z3 = tmp0 + tmp2 */
+    addiu       t8, zero, 9633   /* FIX_1_175875602 */
+    addu        s1, t5, t7       /* z4 = tmp1 + tmp3 */
+    addu        s2, s0, s1       /* z3 + z4 */
+    mul         s2, s2, t8       /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    addu        s3, t4, t7       /* z1 = tmp0 + tmp3 */
+    addu        t9, t5, t6       /* z2 = tmp1 + tmp2 */
+    addiu       t8, zero, 16069  /* FIX_1_961570560 */
+    mul         s0, s0, t8       /* -z3 = MULTIPLY(z3, FIX_1_961570560) */
+    addiu       t8, zero, 3196   /* FIX_0_390180644 */
+    mul         s1, s1, t8       /* -z4 = MULTIPLY(z4, FIX_0_390180644) */
+    addiu       t8, zero, 2446   /* FIX_0_298631336 */
+    mul         t4, t4, t8       /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    addiu       t8, zero, 7373   /* FIX_0_899976223 */
+    mul         s3, s3, t8       /* -z1 = MULTIPLY(z1, FIX_0_899976223) */
+    addiu       t8, zero, 16819  /* FIX_2_053119869 */
+    mul         t5, t5, t8       /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    addiu       t8, zero, 20995  /* FIX_2_562915447 */
+    mul         t9, t9, t8       /* -z2 = MULTIPLY(z2, FIX_2_562915447) */
+    addiu       t8, zero, 25172  /* FIX_3_072711026 */
+    mul         t6, t6, t8       /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    addiu       t8, zero, 12299  /* FIX_1_501321110 */
+    mul         t7, t7, t8       /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    subu        s0, s2, s0       /* z3 += z5 */
+    subu        s1, s2, s1       /* z4 += z5 */
+    addu        t4, t4, s0
+    subu        t4, t4, s3      /* tmp0 */
+    addu        t5, t5, s1
+    subu        t5, t5, t9      /* tmp1 */
+    addu        t6, t6, s0
+    subu        t6, t6, t9      /* tmp2 */
+    addu        t7, t7, s1
+    subu        t7, t7, s3      /* tmp3 */
+    addu        s0, t0, t7
+    subu        t0, t0, t7
+    addu        t7, t2, t6
+    subu        t2, t2, t6
+    addu        t6, t3, t5
+    subu        t3, t3, t5
+    addu        t5, t1, t4
+    subu        t1, t1, t4
+    shra_r.w    s0, s0, 18
+    shra_r.w    t7, t7, 18
+    shra_r.w    t6, t6, 18
+    shra_r.w    t5, t5, 18
+    shra_r.w    t1, t1, 18
+    shra_r.w    t3, t3, 18
+    shra_r.w    t2, t2, 18
+    shra_r.w    t0, t0, 18
+    andi        s0, s0, 0x3ff
+    andi        t7, t7, 0x3ff
+    andi        t6, t6, 0x3ff
+    andi        t5, t5, 0x3ff
+    andi        t1, t1, 0x3ff
+    andi        t3, t3, 0x3ff
+    andi        t2, t2, 0x3ff
+    andi        t0, t0, 0x3ff
+    lw          s1, 0(a2)
+    lbux        s0, s0(a3)
+    lbux        t7, t7(a3)
+    lbux        t6, t6(a3)
+    lbux        t5, t5(a3)
+    lbux        t1, t1(a3)
+    lbux        t3, t3(a3)
+    lbux        t2, t2(a3)
+    lbux        t0, t0(a3)
+    sb          s0, 0(s1)
+    sb          t7, 1(s1)
+    sb          t6, 2(s1)
+    sb          t5, 3(s1)
+    sb          t1, 4(s1)
+    sb          t3, 5(s1)
+    sb          t2, 6(s1)
+    sb          t0, 7(s1)
+6:
+    addiu       v0, v0, 32
+    bgtz        v1, 4b
+     addiu      a2, a2, 4
+    addiu       sp, sp, 256
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+
+END(jsimd_idct_islow_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2)
+/*
+ * a0 = inptr
+ * a1 = quantptr
+ * a2 = wsptr
+ * a3 = mips_idct_ifast_coefs
+ */
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    addiu         t9, a0, 16      /* end address */
+    or            AT, a3, zero
+
+0:
+    lw            s0, 0(a1)       /* quantptr[DCTSIZE*0] */
+    lw            t0, 0(a0)       /* inptr[DCTSIZE*0] */
+    lw            t1, 16(a0)      /* inptr[DCTSIZE*1] */
+    muleq_s.w.phl v0, t0, s0      /* tmp0 ... */
+    lw            t2, 32(a0)      /* inptr[DCTSIZE*2] */
+    lw            t3, 48(a0)      /* inptr[DCTSIZE*3] */
+    lw            t4, 64(a0)      /* inptr[DCTSIZE*4] */
+    lw            t5, 80(a0)      /* inptr[DCTSIZE*5] */
+    muleq_s.w.phr t0, t0, s0      /* ... tmp0 ... */
+    lw            t6, 96(a0)      /* inptr[DCTSIZE*6] */
+    lw            t7, 112(a0)     /* inptr[DCTSIZE*7] */
+    or            s4, t1, t2
+    or            s5, t3, t4
+    bnez          s4, 1f
+     ins          t0, v0, 16, 16  /* ... tmp0 */
+    bnez          s5, 1f
+     or           s6, t5, t6
+    or            s6, s6, t7
+    bnez          s6, 1f
+     sw           t0, 0(a2)       /* wsptr[DCTSIZE*0] */
+    sw            t0, 16(a2)      /* wsptr[DCTSIZE*1] */
+    sw            t0, 32(a2)      /* wsptr[DCTSIZE*2] */
+    sw            t0, 48(a2)      /* wsptr[DCTSIZE*3] */
+    sw            t0, 64(a2)      /* wsptr[DCTSIZE*4] */
+    sw            t0, 80(a2)      /* wsptr[DCTSIZE*5] */
+    sw            t0, 96(a2)      /* wsptr[DCTSIZE*6] */
+    sw            t0, 112(a2)     /* wsptr[DCTSIZE*7] */
+    addiu         a0, a0, 4
+    b             2f
+     addiu        a1, a1, 4
+
+1:
+    lw            s1, 32(a1)      /* quantptr[DCTSIZE*2] */
+    lw            s2, 64(a1)      /* quantptr[DCTSIZE*4] */
+    muleq_s.w.phl v0, t2, s1      /* tmp1 ... */
+    muleq_s.w.phr t2, t2, s1      /* ... tmp1 ... */
+    lw            s0, 16(a1)      /* quantptr[DCTSIZE*1] */
+    lw            s1, 48(a1)      /* quantptr[DCTSIZE*3] */
+    lw            s3, 96(a1)      /* quantptr[DCTSIZE*6] */
+    muleq_s.w.phl v1, t4, s2      /* tmp2 ... */
+    muleq_s.w.phr t4, t4, s2      /* ... tmp2 ... */
+    lw            s2, 80(a1)      /* quantptr[DCTSIZE*5] */
+    lw            t8, 4(AT)       /* FIX(1.414213562) */
+    ins           t2, v0, 16, 16  /* ... tmp1 */
+    muleq_s.w.phl v0, t6, s3      /* tmp3 ... */
+    muleq_s.w.phr t6, t6, s3      /* ... tmp3 ... */
+    ins           t4, v1, 16, 16  /* ... tmp2 */
+    addq.ph       s4, t0, t4      /* tmp10 */
+    subq.ph       s5, t0, t4      /* tmp11 */
+    ins           t6, v0, 16, 16  /* ... tmp3 */
+    subq.ph       s6, t2, t6      /* tmp12 ... */
+    addq.ph       s7, t2, t6      /* tmp13 */
+    mulq_s.ph     s6, s6, t8      /* ... tmp12 ... */
+    addq.ph       t0, s4, s7      /* tmp0 */
+    subq.ph       t6, s4, s7      /* tmp3 */
+    muleq_s.w.phl v0, t1, s0      /* tmp4 ... */
+    muleq_s.w.phr t1, t1, s0      /* ... tmp4 ... */
+    shll_s.ph     s6, s6, 1       /* x2 */
+    lw            s3, 112(a1)     /* quantptr[DCTSIZE*7] */
+    subq.ph       s6, s6, s7      /* ... tmp12 */
+    muleq_s.w.phl v1, t7, s3      /* tmp7 ... */
+    muleq_s.w.phr t7, t7, s3      /* ... tmp7 ... */
+    ins           t1, v0, 16, 16  /* ... tmp4 */
+    addq.ph       t2, s5, s6      /* tmp1 */
+    subq.ph       t4, s5, s6      /* tmp2 */
+    muleq_s.w.phl v0, t5, s2      /* tmp6 ... */
+    muleq_s.w.phr t5, t5, s2      /* ... tmp6 ... */
+    ins           t7, v1, 16, 16  /* ... tmp7 */
+    addq.ph       s5, t1, t7      /* z11 */
+    subq.ph       s6, t1, t7      /* z12 */
+    muleq_s.w.phl v1, t3, s1      /* tmp5 ... */
+    muleq_s.w.phr t3, t3, s1      /* ... tmp5 ... */
+    ins           t5, v0, 16, 16  /* ... tmp6 */
+    ins           t3, v1, 16, 16  /* ... tmp5 */
+    addq.ph       s7, t5, t3      /* z13 */
+    subq.ph       v0, t5, t3      /* z10 */
+    addq.ph       t7, s5, s7      /* tmp7 */
+    subq.ph       s5, s5, s7      /* tmp11 ... */
+    addq.ph       v1, v0, s6      /* z5 ... */
+    mulq_s.ph     s5, s5, t8      /* ... tmp11 */
+    lw            t8, 8(AT)       /* FIX(1.847759065) */
+    lw            s4, 0(AT)       /* FIX(1.082392200) */
+    addq.ph       s0, t0, t7
+    subq.ph       s1, t0, t7
+    mulq_s.ph     v1, v1, t8      /* ... z5 */
+    shll_s.ph     s5, s5, 1       /* x2 */
+    lw            t8, 12(AT)      /* FIX(-2.613125930) */
+    sw            s0, 0(a2)       /* wsptr[DCTSIZE*0] */
+    shll_s.ph     v0, v0, 1       /* x4 */
+    mulq_s.ph     v0, v0, t8      /* tmp12 ... */
+    mulq_s.ph     s4, s6, s4      /* tmp10 ... */
+    shll_s.ph     v1, v1, 1       /* x2 */
+    addiu         a0, a0, 4
+    addiu         a1, a1, 4
+    sw            s1, 112(a2)     /* wsptr[DCTSIZE*7] */
+    shll_s.ph     s6, v0, 1       /* x4 */
+    shll_s.ph     s4, s4, 1       /* x2 */
+    addq.ph       s6, s6, v1      /* ... tmp12 */
+    subq.ph       t5, s6, t7      /* tmp6 */
+    subq.ph       s4, s4, v1      /* ... tmp10 */
+    subq.ph       t3, s5, t5      /* tmp5 */
+    addq.ph       s2, t2, t5
+    addq.ph       t1, s4, t3      /* tmp4 */
+    subq.ph       s3, t2, t5
+    sw            s2, 16(a2)      /* wsptr[DCTSIZE*1] */
+    sw            s3, 96(a2)      /* wsptr[DCTSIZE*6] */
+    addq.ph       v0, t4, t3
+    subq.ph       v1, t4, t3
+    sw            v0, 32(a2)      /* wsptr[DCTSIZE*2] */
+    sw            v1, 80(a2)      /* wsptr[DCTSIZE*5] */
+    addq.ph       v0, t6, t1
+    subq.ph       v1, t6, t1
+    sw            v0, 64(a2)      /* wsptr[DCTSIZE*4] */
+    sw            v1, 48(a2)      /* wsptr[DCTSIZE*3] */
+
+2:
+    bne           a0, t9, 0b
+     addiu        a2, a2, 4
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j             ra
+     nop
+
+END(jsimd_idct_ifast_cols_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2)
+/*
+ * a0 = wsptr
+ * a1 = output_buf
+ * a2 = output_col
+ * a3 = mips_idct_ifast_coefs
+ */
+    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
+
+    addiu         t9, a0, 128     /* end address */
+    lui           s8, 0x8080
+    ori           s8, s8, 0x8080
+
+0:
+    lw            AT, 36(sp)      /* restore $a3 (mips_idct_ifast_coefs) */
+    lw            t0, 0(a0)       /* wsptr[DCTSIZE*0+0/1]  b a */
+    lw            s0, 16(a0)      /* wsptr[DCTSIZE*1+0/1]  B A */
+    lw            t2, 4(a0)       /* wsptr[DCTSIZE*0+2/3]  d c */
+    lw            s2, 20(a0)      /* wsptr[DCTSIZE*1+2/3]  D C */
+    lw            t4, 8(a0)       /* wsptr[DCTSIZE*0+4/5]  f e */
+    lw            s4, 24(a0)      /* wsptr[DCTSIZE*1+4/5]  F E */
+    lw            t6, 12(a0)      /* wsptr[DCTSIZE*0+6/7]  h g */
+    lw            s6, 28(a0)      /* wsptr[DCTSIZE*1+6/7]  H G */
+    precrq.ph.w   t1, s0, t0      /* B b */
+    ins           t0, s0, 16, 16  /* A a */
+    bnez          t1, 1f
+     or           s0, t2, s2
+    bnez          s0, 1f
+     or           s0, t4, s4
+    bnez          s0, 1f
+     or           s0, t6, s6
+    bnez          s0, 1f
+     shll_s.ph    s0, t0, 2       /* A a */
+    lw            a3, 0(a1)
+    lw            AT, 4(a1)
+    precrq.ph.w   t0, s0, s0      /* A A */
+    ins           s0, s0, 16, 16  /* a a */
+    addu          a3, a3, a2
+    addu          AT, AT, a2
+    precrq.qb.ph  t0, t0, t0      /* A A A A */
+    precrq.qb.ph  s0, s0, s0      /* a a a a */
+    addu.qb       s0, s0, s8
+    addu.qb       t0, t0, s8
+    sw            s0, 0(a3)
+    sw            s0, 4(a3)
+    sw            t0, 0(AT)
+    sw            t0, 4(AT)
+    addiu         a0, a0, 32
+    bne           a0, t9, 0b
+     addiu        a1, a1, 8
+    b             2f
+     nop
+
+1:
+    precrq.ph.w   t3, s2, t2
+    ins           t2, s2, 16, 16
+    precrq.ph.w   t5, s4, t4
+    ins           t4, s4, 16, 16
+    precrq.ph.w   t7, s6, t6
+    ins           t6, s6, 16, 16
+    lw            t8, 4(AT)       /* FIX(1.414213562) */
+    addq.ph       s4, t0, t4      /* tmp10 */
+    subq.ph       s5, t0, t4      /* tmp11 */
+    subq.ph       s6, t2, t6      /* tmp12 ... */
+    addq.ph       s7, t2, t6      /* tmp13 */
+    mulq_s.ph     s6, s6, t8      /* ... tmp12 ... */
+    addq.ph       t0, s4, s7      /* tmp0 */
+    subq.ph       t6, s4, s7      /* tmp3 */
+    shll_s.ph     s6, s6, 1       /* x2 */
+    subq.ph       s6, s6, s7      /* ... tmp12 */
+    addq.ph       t2, s5, s6      /* tmp1 */
+    subq.ph       t4, s5, s6      /* tmp2 */
+    addq.ph       s5, t1, t7      /* z11 */
+    subq.ph       s6, t1, t7      /* z12 */
+    addq.ph       s7, t5, t3      /* z13 */
+    subq.ph       v0, t5, t3      /* z10 */
+    addq.ph       t7, s5, s7      /* tmp7 */
+    subq.ph       s5, s5, s7      /* tmp11 ... */
+    addq.ph       v1, v0, s6      /* z5 ... */
+    mulq_s.ph     s5, s5, t8      /* ... tmp11 */
+    lw            t8, 8(AT)       /* FIX(1.847759065) */
+    lw            s4, 0(AT)       /* FIX(1.082392200) */
+    addq.ph       s0, t0, t7      /* tmp0 + tmp7 */
+    subq.ph       s7, t0, t7      /* tmp0 - tmp7 */
+    mulq_s.ph     v1, v1, t8      /* ... z5 */
+    lw            a3, 0(a1)
+    lw            t8, 12(AT)      /* FIX(-2.613125930) */
+    shll_s.ph     s5, s5, 1       /* x2 */
+    addu          a3, a3, a2
+    shll_s.ph     v0, v0, 1       /* x4 */
+    mulq_s.ph     v0, v0, t8      /* tmp12 ... */
+    mulq_s.ph     s4, s6, s4      /* tmp10 ... */
+    shll_s.ph     v1, v1, 1       /* x2 */
+    addiu         a0, a0, 32
+    addiu         a1, a1, 8
+    shll_s.ph     s6, v0, 1       /* x4 */
+    shll_s.ph     s4, s4, 1       /* x2 */
+    addq.ph       s6, s6, v1      /* ... tmp12 */
+    shll_s.ph     s0, s0, 2
+    subq.ph       t5, s6, t7      /* tmp6 */
+    subq.ph       s4, s4, v1      /* ... tmp10 */
+    subq.ph       t3, s5, t5      /* tmp5 */
+    shll_s.ph     s7, s7, 2
+    addq.ph       t1, s4, t3      /* tmp4 */
+    addq.ph       s1, t2, t5      /* tmp1 + tmp6 */
+    subq.ph       s6, t2, t5      /* tmp1 - tmp6 */
+    addq.ph       s2, t4, t3      /* tmp2 + tmp5 */
+    subq.ph       s5, t4, t3      /* tmp2 - tmp5 */
+    addq.ph       s4, t6, t1      /* tmp3 + tmp4 */
+    subq.ph       s3, t6, t1      /* tmp3 - tmp4 */
+    shll_s.ph     s1, s1, 2
+    shll_s.ph     s2, s2, 2
+    shll_s.ph     s3, s3, 2
+    shll_s.ph     s4, s4, 2
+    shll_s.ph     s5, s5, 2
+    shll_s.ph     s6, s6, 2
+    precrq.ph.w   t0, s1, s0      /* B A */
+    ins           s0, s1, 16, 16  /* b a */
+    precrq.ph.w   t2, s3, s2      /* D C */
+    ins           s2, s3, 16, 16  /* d c */
+    precrq.ph.w   t4, s5, s4      /* F E */
+    ins           s4, s5, 16, 16  /* f e */
+    precrq.ph.w   t6, s7, s6      /* H G */
+    ins           s6, s7, 16, 16  /* h g */
+    precrq.qb.ph  t0, t2, t0      /* D C B A */
+    precrq.qb.ph  s0, s2, s0      /* d c b a */
+    precrq.qb.ph  t4, t6, t4      /* H G F E */
+    precrq.qb.ph  s4, s6, s4      /* h g f e */
+    addu.qb       s0, s0, s8
+    addu.qb       s4, s4, s8
+    sw            s0, 0(a3)       /* outptr[0/1/2/3]       d c b a */
+    sw            s4, 4(a3)       /* outptr[4/5/6/7]       h g f e */
+    lw            a3, -4(a1)
+    addu.qb       t0, t0, s8
+    addu          a3, a3, a2
+    addu.qb       t4, t4, s8
+    sw            t0, 0(a3)       /* outptr[0/1/2/3]       D C B A */
+    bne           a0, t9, 0b
+     sw           t4, 4(a3)       /* outptr[4/5/6/7]       H G F E */
+
+2:
+
+    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
+
+    j             ra
+     nop
+
+END(jsimd_idct_ifast_rows_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_fdct_islow_dspr2)
+/*
+ * a0 = data
+ */
+    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+    lui         t0, 6437
+    ori         t0, 2260
+    lui         t1, 9633
+    ori         t1, 11363
+    lui         t2, 0xd39e
+    ori         t2, 0xe6dc
+    lui         t3, 0xf72d
+    ori         t3, 9633
+    lui         t4, 2261
+    ori         t4, 9633
+    lui         t5, 0xd39e
+    ori         t5, 6437
+    lui         t6, 9633
+    ori         t6, 0xd39d
+    lui         t7, 0xe6dc
+    ori         t7, 2260
+    lui         t8, 4433
+    ori         t8, 10703
+    lui         t9, 0xd630
+    ori         t9, 4433
+    li          s8, 8
+    move        a1, a0
+1:
+    lw          s0, 0(a1)       /* tmp0 = 1|0 */
+    lw          s1, 4(a1)       /* tmp1 = 3|2 */
+    lw          s2, 8(a1)       /* tmp2 = 5|4 */
+    lw          s3, 12(a1)      /* tmp3 = 7|6 */
+    packrl.ph   s1, s1, s1      /* tmp1 = 2|3 */
+    packrl.ph   s3, s3, s3      /* tmp3 = 6|7 */
+    subq.ph     s7, s1, s2      /* tmp7 = 2-5|3-4 = t5|t4 */
+    subq.ph     s5, s0, s3      /* tmp5 = 1-6|0-7 = t6|t7 */
+    mult        $0, $0          /* ac0  = 0 */
+    dpa.w.ph    $ac0, s7, t0    /* ac0 += t5*  6437 + t4*  2260 */
+    dpa.w.ph    $ac0, s5, t1    /* ac0 += t6*  9633 + t7* 11363 */
+    mult        $ac1, $0, $0    /* ac1  = 0 */
+    dpa.w.ph    $ac1, s7, t2    /* ac1 += t5*-11362 + t4* -6436 */
+    dpa.w.ph    $ac1, s5, t3    /* ac1 += t6* -2259 + t7*  9633 */
+    mult        $ac2, $0, $0    /* ac2  = 0 */
+    dpa.w.ph    $ac2, s7, t4    /* ac2 += t5*  2261 + t4*  9633 */
+    dpa.w.ph    $ac2, s5, t5    /* ac2 += t6*-11362 + t7*  6437 */
+    mult        $ac3, $0, $0    /* ac3  = 0 */
+    dpa.w.ph    $ac3, s7, t6    /* ac3 += t5*  9633 + t4*-11363 */
+    dpa.w.ph    $ac3, s5, t7    /* ac3 += t6* -6436 + t7*  2260 */
+    addq.ph     s6, s1, s2      /* tmp6 = 2+5|3+4 = t2|t3 */
+    addq.ph     s4, s0, s3      /* tmp4 = 1+6|0+7 = t1|t0 */
+    extr_r.w    s0, $ac0, 11    /* tmp0 = (ac0 + 1024) >> 11 */
+    extr_r.w    s1, $ac1, 11    /* tmp1 = (ac1 + 1024) >> 11 */
+    extr_r.w    s2, $ac2, 11    /* tmp2 = (ac2 + 1024) >> 11 */
+    extr_r.w    s3, $ac3, 11    /* tmp3 = (ac3 + 1024) >> 11 */
+    addq.ph     s5, s4, s6      /* tmp5 = t1+t2|t0+t3 = t11|t10 */
+    subq.ph     s7, s4, s6      /* tmp7 = t1-t2|t0-t3 = t12|t13 */
+    sh          s0, 2(a1)
+    sh          s1, 6(a1)
+    sh          s2, 10(a1)
+    sh          s3, 14(a1)
+    mult        $0, $0          /* ac0  = 0 */
+    dpa.w.ph    $ac0, s7, t8    /* ac0 += t12*  4433 + t13* 10703 */
+    mult        $ac1, $0, $0    /* ac1  = 0 */
+    dpa.w.ph    $ac1, s7, t9    /* ac1 += t12*-10704 + t13*  4433 */
+    sra         s4, s5, 16      /* tmp4 = t11 */
+    addiu       a1, a1, 16
+    addiu       s8, s8, -1
+    extr_r.w    s0, $ac0, 11    /* tmp0 = (ac0 + 1024) >> 11 */
+    extr_r.w    s1, $ac1, 11    /* tmp1 = (ac1 + 1024) >> 11 */
+    addu        s2, s5, s4      /* tmp2 = t10 + t11 */
+    subu        s3, s5, s4      /* tmp3 = t10 - t11 */
+    sll         s2, s2, 2       /* tmp2 = (t10 + t11) << 2 */
+    sll         s3, s3, 2       /* tmp3 = (t10 - t11) << 2 */
+    sh          s2, -16(a1)
+    sh          s3, -8(a1)
+    sh          s0, -12(a1)
+    bgtz        s8, 1b
+     sh         s1, -4(a1)
+    li          t0, 2260
+    li          t1, 11363
+    li          t2, 9633
+    li          t3, 6436
+    li          t4, 6437
+    li          t5, 2261
+    li          t6, 11362
+    li          t7, 2259
+    li          t8, 4433
+    li          t9, 10703
+    li          a1, 10704
+    li          s8, 8
+
+2:
+    lh          a2, 0(a0)       /* 0 */
+    lh          a3, 16(a0)      /* 8 */
+    lh          v0, 32(a0)      /* 16 */
+    lh          v1, 48(a0)      /* 24 */
+    lh          s4, 64(a0)      /* 32 */
+    lh          s5, 80(a0)      /* 40 */
+    lh          s6, 96(a0)      /* 48 */
+    lh          s7, 112(a0)     /* 56 */
+    addu        s2, v0, s5      /* tmp2 = 16 + 40 */
+    subu        s5, v0, s5      /* tmp5 = 16 - 40 */
+    addu        s3, v1, s4      /* tmp3 = 24 + 32 */
+    subu        s4, v1, s4      /* tmp4 = 24 - 32 */
+    addu        s0, a2, s7      /* tmp0 =  0 + 56 */
+    subu        s7, a2, s7      /* tmp7 =  0 - 56 */
+    addu        s1, a3, s6      /* tmp1 =  8 + 48 */
+    subu        s6, a3, s6      /* tmp6 =  8 - 48 */
+    addu        a2, s0, s3      /* tmp10 = tmp0 + tmp3 */
+    subu        v1, s0, s3      /* tmp13 = tmp0 - tmp3 */
+    addu        a3, s1, s2      /* tmp11 = tmp1 + tmp2 */
+    subu        v0, s1, s2      /* tmp12 = tmp1 - tmp2 */
+    mult        s7, t1          /* ac0  = tmp7 * c1 */
+    madd        s4, t0          /* ac0 += tmp4 * c0 */
+    madd        s5, t4          /* ac0 += tmp5 * c4 */
+    madd        s6, t2          /* ac0 += tmp6 * c2 */
+    mult        $ac1, s7, t2    /* ac1  = tmp7 * c2 */
+    msub        $ac1, s4, t3    /* ac1 -= tmp4 * c3 */
+    msub        $ac1, s5, t6    /* ac1 -= tmp5 * c6 */
+    msub        $ac1, s6, t7    /* ac1 -= tmp6 * c7 */
+    mult        $ac2, s7, t4    /* ac2  = tmp7 * c4 */
+    madd        $ac2, s4, t2    /* ac2 += tmp4 * c2 */
+    madd        $ac2, s5, t5    /* ac2 += tmp5 * c5 */
+    msub        $ac2, s6, t6    /* ac2 -= tmp6 * c6 */
+    mult        $ac3, s7, t0    /* ac3  = tmp7 * c0 */
+    msub        $ac3, s4, t1    /* ac3 -= tmp4 * c1 */
+    madd        $ac3, s5, t2    /* ac3 += tmp5 * c2 */
+    msub        $ac3, s6, t3    /* ac3 -= tmp6 * c3 */
+    extr_r.w    s0, $ac0, 15    /* tmp0 = (ac0 + 16384) >> 15 */
+    extr_r.w    s1, $ac1, 15    /* tmp1 = (ac1 + 16384) >> 15 */
+    extr_r.w    s2, $ac2, 15    /* tmp2 = (ac2 + 16384) >> 15 */
+    extr_r.w    s3, $ac3, 15    /* tmp3 = (ac3 + 16384) >> 15 */
+    addiu       s8, s8, -1
+    addu        s4, a2, a3      /* tmp4 = tmp10 + tmp11 */
+    subu        s5, a2, a3      /* tmp5 = tmp10 - tmp11 */
+    sh          s0, 16(a0)
+    sh          s1, 48(a0)
+    sh          s2, 80(a0)
+    sh          s3, 112(a0)
+    mult        v0, t8          /* ac0  = tmp12 * c8 */
+    madd        v1, t9          /* ac0 += tmp13 * c9 */
+    mult        $ac1, v1, t8    /* ac1  = tmp13 * c8 */
+    msub        $ac1, v0, a1    /* ac1 -= tmp12 * c10 */
+    addiu       a0, a0, 2
+    extr_r.w    s6, $ac0, 15    /* tmp6 = (ac0 + 16384) >> 15 */
+    extr_r.w    s7, $ac1, 15    /* tmp7 = (ac1 + 16384) >> 15 */
+    shra_r.w    s4, s4, 2       /* tmp4 = (tmp4 + 2) >> 2 */
+    shra_r.w    s5, s5, 2       /* tmp5 = (tmp5 + 2) >> 2 */
+    sh          s4, -2(a0)
+    sh          s5, 62(a0)
+    sh          s6, 30(a0)
+    bgtz        s8, 2b
+     sh         s7, 94(a0)
+
+    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+    jr          ra
+     nop
+
+END(jsimd_fdct_islow_dspr2)
+
+
+/**************************************************************************/
+LEAF_DSPR2(jsimd_fdct_ifast_dspr2)
+/*
+ * a0 = data
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 8, s0, s1
+
+    li          a1, 0x014e014e  /* FIX_1_306562965 (334 << 16) |
+                                                   (334 & 0xffff) */
+    li          a2, 0x008b008b  /* FIX_0_541196100 (139 << 16) |
+                                                   (139 & 0xffff) */
+    li          a3, 0x00620062  /* FIX_0_382683433 (98 << 16) |
+                                                   (98 & 0xffff) */
+    li          s1, 0x00b500b5  /* FIX_0_707106781 (181 << 16) |
+                                                   (181 & 0xffff) */
+
+    move        v0, a0
+    addiu       v1, v0, 128     /* end address */
+
+0:
+    lw          t0, 0(v0)       /* tmp0 = 1|0 */
+    lw          t1, 4(v0)       /* tmp1 = 3|2 */
+    lw          t2, 8(v0)       /* tmp2 = 5|4 */
+    lw          t3, 12(v0)      /* tmp3 = 7|6 */
+    packrl.ph   t1, t1, t1      /* tmp1 = 2|3 */
+    packrl.ph   t3, t3, t3      /* tmp3 = 6|7 */
+    subq.ph     t7, t1, t2      /* tmp7 = 2-5|3-4 = t5|t4 */
+    subq.ph     t5, t0, t3      /* tmp5 = 1-6|0-7 = t6|t7 */
+    addq.ph     t6, t1, t2      /* tmp6 = 2+5|3+4 = t2|t3 */
+    addq.ph     t4, t0, t3      /* tmp4 = 1+6|0+7 = t1|t0 */
+    addq.ph     t8, t4, t6      /* tmp5 = t1+t2|t0+t3 = t11|t10 */
+    subq.ph     t9, t4, t6      /* tmp7 = t1-t2|t0-t3 = t12|t13 */
+    sra         t4, t8, 16      /* tmp4 = t11 */
+    mult        $0, $0          /* ac0  = 0 */
+    dpa.w.ph    $ac0, t9, s1
+    mult        $ac1, $0, $0    /* ac1  = 0 */
+    dpa.w.ph    $ac1, t7, a3    /* ac1 += t4*98 + t5*98 */
+    dpsx.w.ph   $ac1, t5, a3    /* ac1 += t6*98 + t7*98 */
+    mult        $ac2, $0, $0    /* ac2  = 0 */
+    dpa.w.ph    $ac2, t7, a2    /* ac2 += t4*139 + t5*139 */
+    mult        $ac3, $0, $0    /* ac3  = 0 */
+    dpa.w.ph    $ac3, t5, a1    /* ac3 += t6*334 + t7*334 */
+    precrq.ph.w t0, t5, t7      /* t0 = t5|t6 */
+    addq.ph     t2, t8, t4      /* tmp2 = t10 + t11 */
+    subq.ph     t3, t8, t4      /* tmp3 = t10 - t11 */
+    extr.w      t4, $ac0, 8
+    mult        $0, $0          /* ac0  = 0 */
+    dpa.w.ph    $ac0, t0, s1    /* ac0 += t5*181 + t6*181 */
+    extr.w      t0, $ac1, 8     /* t0 = z5 */
+    extr.w      t1, $ac2, 8     /* t1 = MULTIPLY(tmp10, 139) */
+    extr.w      t7, $ac3, 8     /* t2 = MULTIPLY(tmp12, 334) */
+    extr.w      t8, $ac0, 8     /* t8 = z3 = MULTIPLY(tmp11, 181) */
+    add         t6, t1, t0      /* t6 = z2 */
+    add         t7, t7, t0      /* t7 = z4 */
+    subq.ph     t0, t5, t8      /* t0 = z13 = tmp7 - z3 */
+    addq.ph     t8, t5, t8      /* t9 = z11 = tmp7 + z3 */
+    addq.ph     t1, t0, t6      /* t1 = z13 + z2 */
+    subq.ph     t6, t0, t6      /* t6 = z13 - z2 */
+    addq.ph     t0, t8, t7      /* t0 = z11 + z4 */
+    subq.ph     t7, t8, t7      /* t7 = z11 - z4 */
+    addq.ph     t5, t4, t9
+    subq.ph     t4, t9, t4
+    sh          t2, 0(v0)
+    sh          t5, 4(v0)
+    sh          t3, 8(v0)
+    sh          t4, 12(v0)
+    sh          t1, 10(v0)
+    sh          t6, 6(v0)
+    sh          t0, 2(v0)
+    sh          t7, 14(v0)
+    addiu       v0, 16
+    bne         v1, v0, 0b
+     nop
+    move        v0, a0
+    addiu       v1, v0, 16
+
+1:
+    lh          t0, 0(v0)       /* 0 */
+    lh          t1, 16(v0)      /* 8 */
+    lh          t2, 32(v0)      /* 16 */
+    lh          t3, 48(v0)      /* 24 */
+    lh          t4, 64(v0)      /* 32 */
+    lh          t5, 80(v0)      /* 40 */
+    lh          t6, 96(v0)      /* 48 */
+    lh          t7, 112(v0)     /* 56 */
+    add         t8, t0, t7      /* t8 = tmp0 */
+    sub         t7, t0, t7      /* t7 = tmp7 */
+    add         t0, t1, t6      /* t0 = tmp1 */
+    sub         t1, t1, t6      /* t1 = tmp6 */
+    add         t6, t2, t5      /* t6 = tmp2 */
+    sub         t5, t2, t5      /* t5 = tmp5 */
+    add         t2, t3, t4      /* t2 = tmp3 */
+    sub         t3, t3, t4      /* t3 = tmp4 */
+    add         t4, t8, t2      /* t4 = tmp10 = tmp0 + tmp3 */
+    sub         t8, t8, t2      /* t8 = tmp13 = tmp0 - tmp3 */
+    sub         s0, t0, t6      /* s0 = tmp12 = tmp1 - tmp2 */
+    ins         t8, s0, 16, 16  /* t8 = tmp12|tmp13 */
+    add         t2, t0, t6      /* t2 = tmp11 = tmp1 + tmp2 */
+    mult        $0, $0          /* ac0  = 0 */
+    dpa.w.ph    $ac0, t8, s1    /* ac0 += t12*181 + t13*181 */
+    add         s0, t4, t2      /* t8 = tmp10+tmp11 */
+    sub         t4, t4, t2      /* t4 = tmp10-tmp11 */
+    sh          s0, 0(v0)
+    sh          t4, 64(v0)
+    extr.w      t2, $ac0, 8     /* z1 = MULTIPLY(tmp12+tmp13,
+                                                 FIX_0_707106781) */
+    addq.ph     t4, t8, t2      /* t9 = tmp13 + z1 */
+    subq.ph     t8, t8, t2      /* t2 = tmp13 - z1 */
+    sh          t4, 32(v0)
+    sh          t8, 96(v0)
+    add         t3, t3, t5      /* t3 = tmp10 = tmp4 + tmp5 */
+    add         t0, t5, t1      /* t0 = tmp11 = tmp5 + tmp6 */
+    add         t1, t1, t7      /* t1 = tmp12 = tmp6 + tmp7 */
+    andi        t4, a1, 0xffff
+    mul         s0, t1, t4
+    sra         s0, s0, 8       /* s0 = z4 =
+                                     MULTIPLY(tmp12, FIX_1_306562965) */
+    ins         t1, t3, 16, 16  /* t1 = tmp10|tmp12 */
+    mult        $0, $0          /* ac0  = 0 */
+    mulsa.w.ph  $ac0, t1, a3    /* ac0 += t10*98 - t12*98 */
+    extr.w      t8, $ac0, 8     /* z5 = MULTIPLY(tmp10-tmp12,
+                                                 FIX_0_382683433) */
+    add         t2, t7, t8      /* t2 = tmp7 + z5 */
+    sub         t7, t7, t8      /* t7 = tmp7 - z5 */
+    andi        t4, a2, 0xffff
+    mul         t8, t3, t4
+    sra         t8, t8, 8       /* t8 = z2 =
+                                     MULTIPLY(tmp10, FIX_0_541196100) */
+    andi        t4, s1, 0xffff
+    mul         t6, t0, t4
+    sra         t6, t6, 8       /* t6 = z3 =
+                                     MULTIPLY(tmp11, FIX_0_707106781) */
+    add         t0, t6, t8      /* t0 = z3 + z2 */
+    sub         t1, t6, t8      /* t1 = z3 - z2 */
+    add         t3, t6, s0      /* t3 = z3 + z4 */
+    sub         t4, t6, s0      /* t4 = z3 - z4 */
+    sub         t5, t2, t1      /* t5 = dataptr[5] */
+    sub         t6, t7, t0      /* t6 = dataptr[3] */
+    add         t3, t2, t3      /* t3 = dataptr[1] */
+    add         t4, t7, t4      /* t4 = dataptr[7] */
+    sh          t5, 80(v0)
+    sh          t6, 48(v0)
+    sh          t3, 16(v0)
+    sh          t4, 112(v0)
+    addiu       v0, 2
+    bne         v0, v1, 1b
+     nop
+
+    RESTORE_REGS_FROM_STACK 8, s0, s1
+
+    j           ra
+     nop
+END(jsimd_fdct_ifast_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_quantize_dspr2)
+/*
+ * a0 = coef_block
+ * a1 = divisors
+ * a2 = workspace
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 16, s0, s1, s2
+
+    addiu       v0, a2, 124     /* v0 = workspace_end */
+    lh          t0, 0(a2)
+    lh          t1, 0(a1)
+    lh          t2, 128(a1)
+    sra         t3, t0, 15
+    sll         t3, t3, 1
+    addiu       t3, t3, 1
+    mul         t0, t0, t3
+    lh          t4, 384(a1)
+    lh          t5, 130(a1)
+    lh          t6, 2(a2)
+    lh          t7, 2(a1)
+    lh          t8, 386(a1)
+
+1:
+    andi        t1, 0xffff
+    add         t9, t0, t2
+    andi        t9, 0xffff
+    mul         v1, t9, t1
+    sra         s0, t6, 15
+    sll         s0, s0, 1
+    addiu       s0, s0, 1
+    addiu       t9, t4, 16
+    srav        v1, v1, t9
+    mul         v1, v1, t3
+    mul         t6, t6, s0
+    andi        t7, 0xffff
+    addiu       a2, a2, 4
+    addiu       a1, a1, 4
+    add         s1, t6, t5
+    andi        s1, 0xffff
+    sh          v1, 0(a0)
+
+    mul         s2, s1, t7
+    addiu       s1, t8, 16
+    srav        s2, s2, s1
+    mul         s2, s2, s0
+    lh          t0, 0(a2)
+    lh          t1, 0(a1)
+    sra         t3, t0, 15
+    sll         t3, t3, 1
+    addiu       t3, t3, 1
+    mul         t0, t0, t3
+    lh          t2, 128(a1)
+    lh          t4, 384(a1)
+    lh          t5, 130(a1)
+    lh          t8, 386(a1)
+    lh          t6, 2(a2)
+    lh          t7, 2(a1)
+    sh          s2, 2(a0)
+    lh          t0, 0(a2)
+    sra         t3, t0, 15
+    sll         t3, t3, 1
+    addiu       t3, t3, 1
+    mul         t0, t0, t3
+    bne         a2, v0, 1b
+     addiu      a0, a0, 4
+
+    andi        t1, 0xffff
+    add         t9, t0, t2
+    andi        t9, 0xffff
+    mul         v1, t9, t1
+    sra         s0, t6, 15
+    sll         s0, s0, 1
+    addiu       s0, s0, 1
+    addiu       t9, t4, 16
+    srav        v1, v1, t9
+    mul         v1, v1, t3
+    mul         t6, t6, s0
+    andi        t7, 0xffff
+    sh          v1, 0(a0)
+    add         s1, t6, t5
+    andi        s1, 0xffff
+    mul         s2, s1, t7
+    addiu       s1, t8, 16
+    addiu       a2, a2, 4
+    addiu       a1, a1, 4
+    srav        s2, s2, s1
+    mul         s2, s2, s0
+    sh          s2, 2(a0)
+
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2
+
+    j           ra
+     nop
+
+END(jsimd_quantize_dspr2)
+
+
+#ifndef __mips_soft_float
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_quantize_float_dspr2)
+/*
+ * a0 = coef_block
+ * a1 = divisors
+ * a2 = workspace
+ */
+    .set at
+
+    li          t1, 0x46800100  /* integer representation 16384.5 */
+    mtc1        t1, f0
+    li          t0, 63
+0:
+    lwc1        f2, 0(a2)
+    lwc1        f10, 0(a1)
+    lwc1        f4, 4(a2)
+    lwc1        f12, 4(a1)
+    lwc1        f6, 8(a2)
+    lwc1        f14, 8(a1)
+    lwc1        f8, 12(a2)
+    lwc1        f16, 12(a1)
+    madd.s      f2, f0, f2, f10
+    madd.s      f4, f0, f4, f12
+    madd.s      f6, f0, f6, f14
+    madd.s      f8, f0, f8, f16
+    lwc1        f10, 16(a1)
+    lwc1        f12, 20(a1)
+    trunc.w.s   f2, f2
+    trunc.w.s   f4, f4
+    trunc.w.s   f6, f6
+    trunc.w.s   f8, f8
+    lwc1        f14, 24(a1)
+    lwc1        f16, 28(a1)
+    mfc1        t1, f2
+    mfc1        t2, f4
+    mfc1        t3, f6
+    mfc1        t4, f8
+    lwc1        f2, 16(a2)
+    lwc1        f4, 20(a2)
+    lwc1        f6, 24(a2)
+    lwc1        f8, 28(a2)
+    madd.s      f2, f0, f2, f10
+    madd.s      f4, f0, f4, f12
+    madd.s      f6, f0, f6, f14
+    madd.s      f8, f0, f8, f16
+    addiu       t1, t1, -16384
+    addiu       t2, t2, -16384
+    addiu       t3, t3, -16384
+    addiu       t4, t4, -16384
+    trunc.w.s   f2, f2
+    trunc.w.s   f4, f4
+    trunc.w.s   f6, f6
+    trunc.w.s   f8, f8
+    sh          t1, 0(a0)
+    sh          t2, 2(a0)
+    sh          t3, 4(a0)
+    sh          t4, 6(a0)
+    mfc1        t1, f2
+    mfc1        t2, f4
+    mfc1        t3, f6
+    mfc1        t4, f8
+    addiu       t0, t0, -8
+    addiu       a2, a2, 32
+    addiu       a1, a1, 32
+    addiu       t1, t1, -16384
+    addiu       t2, t2, -16384
+    addiu       t3, t3, -16384
+    addiu       t4, t4, -16384
+    sh          t1, 8(a0)
+    sh          t2, 10(a0)
+    sh          t3, 12(a0)
+    sh          t4, 14(a0)
+    bgez        t0, 0b
+     addiu      a0, a0, 16
+
+    j           ra
+     nop
+
+END(jsimd_quantize_float_dspr2)
+
+#endif
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_2x2_dspr2)
+/*
+ * a0 = compptr->dct_table
+ * a1 = coef_block
+ * a2 = output_buf
+ * a3 = output_col
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
+
+    addiu       sp, sp, -40
+    move        v0, sp
+    addiu       s2, zero, 29692
+    addiu       s3, zero, -10426
+    addiu       s4, zero, 6967
+    addiu       s5, zero, -5906
+    lh          t0, 0(a1)       /* t0 = inptr[DCTSIZE*0] */
+    lh          t5, 0(a0)       /* t5 = quantptr[DCTSIZE*0] */
+    lh          t1, 48(a1)      /* t1 = inptr[DCTSIZE*3] */
+    lh          t6, 48(a0)      /* t6 = quantptr[DCTSIZE*3] */
+    mul         t4, t5, t0
+    lh          t0, 16(a1)      /* t0 = inptr[DCTSIZE*1] */
+    lh          t5, 16(a0)      /* t5 = quantptr[DCTSIZE*1] */
+    mul         t6, t6, t1
+    mul         t5, t5, t0
+    lh          t2, 80(a1)      /* t2 = inptr[DCTSIZE*5] */
+    lh          t7, 80(a0)      /* t7 = quantptr[DCTSIZE*5] */
+    lh          t3, 112(a1)     /* t3 = inptr[DCTSIZE*7] */
+    lh          t8, 112(a0)     /* t8 = quantptr[DCTSIZE*7] */
+    mul         t7, t7, t2
+    mult        zero, zero
+    mul         t8, t8, t3
+    li          s0, 0x73FCD746  /* s0 = (29692 << 16) | (-10426 & 0xffff) */
+    li          s1, 0x1B37E8EE  /* s1 = (6967 << 16) | (-5906 & 0xffff) */
+    ins         t6, t5, 16, 16  /* t6 = t5|t6 */
+    sll         t4, t4, 15
+    dpa.w.ph    $ac0, t6, s0
+    lh          t1, 2(a1)
+    lh          t6, 2(a0)
+    ins         t8, t7, 16, 16  /* t8 = t7|t8 */
+    dpa.w.ph    $ac0, t8, s1
+    mflo        t0, $ac0
+    mul         t5, t6, t1
+    lh          t1, 18(a1)
+    lh          t6, 18(a0)
+    lh          t2, 50(a1)
+    lh          t7, 50(a0)
+    mul         t6, t6, t1
+    subu        t8, t4, t0
+    mul         t7, t7, t2
+    addu        t0, t4, t0
+    shra_r.w    t0, t0, 13
+    lh          t1, 82(a1)
+    lh          t2, 82(a0)
+    lh          t3, 114(a1)
+    lh          t4, 114(a0)
+    shra_r.w    t8, t8, 13
+    mul         t1, t1, t2
+    mul         t3, t3, t4
+    sw          t0, 0(v0)
+    sw          t8, 20(v0)
+    sll         t4, t5, 15
+    ins         t7, t6, 16, 16
+    mult        zero, zero
+    dpa.w.ph    $ac0, t7, s0
+    ins         t3, t1, 16, 16
+    lh          t1, 6(a1)
+    lh          t6, 6(a0)
+    dpa.w.ph    $ac0, t3, s1
+    mflo        t0, $ac0
+    mul         t5, t6, t1
+    lh          t1, 22(a1)
+    lh          t6, 22(a0)
+    lh          t2, 54(a1)
+    lh          t7, 54(a0)
+    mul         t6, t6, t1
+    subu        t8, t4, t0
+    mul         t7, t7, t2
+    addu        t0, t4, t0
+    shra_r.w    t0, t0, 13
+    lh          t1, 86(a1)
+    lh          t2, 86(a0)
+    lh          t3, 118(a1)
+    lh          t4, 118(a0)
+    shra_r.w    t8, t8, 13
+    mul         t1, t1, t2
+    mul         t3, t3, t4
+    sw          t0, 4(v0)
+    sw          t8, 24(v0)
+    sll         t4, t5, 15
+    ins         t7, t6, 16, 16
+    mult        zero, zero
+    dpa.w.ph    $ac0, t7, s0
+    ins         t3, t1, 16, 16
+    lh          t1, 10(a1)
+    lh          t6, 10(a0)
+    dpa.w.ph    $ac0, t3, s1
+    mflo        t0, $ac0
+    mul         t5, t6, t1
+    lh          t1, 26(a1)
+    lh          t6, 26(a0)
+    lh          t2, 58(a1)
+    lh          t7, 58(a0)
+    mul         t6, t6, t1
+    subu        t8, t4, t0
+    mul         t7, t7, t2
+    addu        t0, t4, t0
+    shra_r.w    t0, t0, 13
+    lh          t1, 90(a1)
+    lh          t2, 90(a0)
+    lh          t3, 122(a1)
+    lh          t4, 122(a0)
+    shra_r.w    t8, t8, 13
+    mul         t1, t1, t2
+    mul         t3, t3, t4
+    sw          t0, 8(v0)
+    sw          t8, 28(v0)
+    sll         t4, t5, 15
+    ins         t7, t6, 16, 16
+    mult        zero, zero
+    dpa.w.ph    $ac0, t7, s0
+    ins         t3, t1, 16, 16
+    lh          t1, 14(a1)
+    lh          t6, 14(a0)
+    dpa.w.ph    $ac0, t3, s1
+    mflo        t0, $ac0
+    mul         t5, t6, t1
+    lh          t1, 30(a1)
+    lh          t6, 30(a0)
+    lh          t2, 62(a1)
+    lh          t7, 62(a0)
+    mul         t6, t6, t1
+    subu        t8, t4, t0
+    mul         t7, t7, t2
+    addu        t0, t4, t0
+    shra_r.w    t0, t0, 13
+    lh          t1, 94(a1)
+    lh          t2, 94(a0)
+    lh          t3, 126(a1)
+    lh          t4, 126(a0)
+    shra_r.w    t8, t8, 13
+    mul         t1, t1, t2
+    mul         t3, t3, t4
+    sw          t0, 12(v0)
+    sw          t8, 32(v0)
+    sll         t4, t5, 15
+    ins         t7, t6, 16, 16
+    mult        zero, zero
+    dpa.w.ph    $ac0, t7, s0
+    ins         t3, t1, 16, 16
+    dpa.w.ph    $ac0, t3, s1
+    mflo        t0, $ac0
+    lw          t9, 0(a2)
+    lw          t3, 0(v0)
+    lw          t7, 4(v0)
+    lw          t1, 8(v0)
+    addu        t9, t9, a3
+    sll         t3, t3, 15
+    subu        t8, t4, t0
+    addu        t0, t4, t0
+    shra_r.w    t0, t0, 13
+    shra_r.w    t8, t8, 13
+    sw          t0, 16(v0)
+    sw          t8, 36(v0)
+    lw          t5, 12(v0)
+    lw          t6, 16(v0)
+    mult        t7, s2
+    madd        t1, s3
+    madd        t5, s4
+    madd        t6, s5
+    lw          t5, 24(v0)
+    lw          t7, 28(v0)
+    mflo        t0, $ac0
+    lw          t8, 32(v0)
+    lw          t2, 36(v0)
+    mult        $ac1, t5, s2
+    madd        $ac1, t7, s3
+    madd        $ac1, t8, s4
+    madd        $ac1, t2, s5
+    addu        t1, t3, t0
+    subu        t6, t3, t0
+    shra_r.w    t1, t1, 20
+    shra_r.w    t6, t6, 20
+    mflo        t4, $ac1
+    shll_s.w    t1, t1, 24
+    shll_s.w    t6, t6, 24
+    sra         t1, t1, 24
+    sra         t6, t6, 24
+    addiu       t1, t1, 128
+    addiu       t6, t6, 128
+    lw          t0, 20(v0)
+    sb          t1, 0(t9)
+    sb          t6, 1(t9)
+    sll         t0, t0, 15
+    lw          t9, 4(a2)
+    addu        t1, t0, t4
+    subu        t6, t0, t4
+    addu        t9, t9, a3
+    shra_r.w    t1, t1, 20
+    shra_r.w    t6, t6, 20
+    shll_s.w    t1, t1, 24
+    shll_s.w    t6, t6, 24
+    sra         t1, t1, 24
+    sra         t6, t6, 24
+    addiu       t1, t1, 128
+    addiu       t6, t6, 128
+    sb          t1, 0(t9)
+    sb          t6, 1(t9)
+    addiu       sp, sp, 40
+
+    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
+
+    j           ra
+     nop
+
+END(jsimd_idct_2x2_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_4x4_dspr2)
+/*
+ * a0     = compptr->dct_table
+ * a1     = coef_block
+ * a2     = output_buf
+ * a3     = output_col
+ * 16(sp) = workspace[DCTSIZE*4]  (buffers data between passes)
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw          v1, 48(sp)
+    move        t0, a1
+    move        t1, v1
+    li          t9, 4
+    li          s0, 0x2e75f93e
+    li          s1, 0x21f9ba79
+    li          s2, 0xecc2efb0
+    li          s3, 0x52031ccd
+
+0:
+    lh          s6, 32(t0)      /* inptr[DCTSIZE*2] */
+    lh          t6, 32(a0)      /* quantptr[DCTSIZE*2] */
+    lh          s7, 96(t0)      /* inptr[DCTSIZE*6] */
+    lh          t7, 96(a0)      /* quantptr[DCTSIZE*6] */
+    mul         t6, s6, t6      /* z2 = (inptr[DCTSIZE*2] *
+                                         quantptr[DCTSIZE*2]) */
+    lh          s4, 0(t0)       /* inptr[DCTSIZE*0] */
+    mul         t7, s7, t7      /* z3 = (inptr[DCTSIZE*6] *
+                                         quantptr[DCTSIZE*6]) */
+    lh          s5, 0(a0)       /* quantptr[0] */
+    li          s6, 15137
+    li          s7, 6270
+    mul         t2, s4, s5      /* tmp0 = (inptr[0] * quantptr[0]) */
+    mul         t6, s6, t6      /* z2 = (inptr[DCTSIZE*2] *
+                                         quantptr[DCTSIZE*2]) */
+    lh          t5, 112(t0)     /* inptr[DCTSIZE*7] */
+    mul         t7, s7, t7      /* z3 = (inptr[DCTSIZE*6] *
+                                         quantptr[DCTSIZE*6]) */
+    lh          s4, 112(a0)     /* quantptr[DCTSIZE*7] */
+    lh          v0, 80(t0)      /* inptr[DCTSIZE*5] */
+    lh          s5, 80(a0)      /* quantptr[DCTSIZE*5] */
+    lh          s6, 48(a0)      /* quantptr[DCTSIZE*3] */
+    sll         t2, t2, 14      /* tmp0 <<= (CONST_BITS+1) */
+    lh          s7, 16(a0)      /* quantptr[DCTSIZE*1] */
+    lh          t8, 16(t0)      /* inptr[DCTSIZE*1] */
+    subu        t6, t6, t7      /* tmp2 =
+                                     MULTIPLY(z2, t5) - MULTIPLY(z3, t6) */
+    lh          t7, 48(t0)      /* inptr[DCTSIZE*3] */
+    mul         t5, s4, t5      /* z1 = (inptr[DCTSIZE*7] *
+                                         quantptr[DCTSIZE*7]) */
+    mul         v0, s5, v0      /* z2 = (inptr[DCTSIZE*5] *
+                                         quantptr[DCTSIZE*5]) */
+    mul         t7, s6, t7      /* z3 = (inptr[DCTSIZE*3] *
+                                         quantptr[DCTSIZE*3]) */
+    mul         t8, s7, t8      /* z4 = (inptr[DCTSIZE*1] *
+                                         quantptr[DCTSIZE*1]) */
+    addu        t3, t2, t6      /* tmp10 = tmp0 + z2 */
+    subu        t4, t2, t6      /* tmp10 = tmp0 - z2 */
+    mult        $ac0, zero, zero
+    mult        $ac1, zero, zero
+    ins         t5, v0, 16, 16
+    ins         t7, t8, 16, 16
+    addiu       t9, t9, -1
+    dpa.w.ph    $ac0, t5, s0
+    dpa.w.ph    $ac0, t7, s1
+    dpa.w.ph    $ac1, t5, s2
+    dpa.w.ph    $ac1, t7, s3
+    mflo        s4, $ac0
+    mflo        s5, $ac1
+    addiu       a0, a0, 2
+    addiu       t1, t1, 4
+    addiu       t0, t0, 2
+    addu        t6, t4, s4
+    subu        t5, t4, s4
+    addu        s6, t3, s5
+    subu        s7, t3, s5
+    shra_r.w    t6, t6, 12      /* DESCALE(tmp12 + temp1, 12) */
+    shra_r.w    t5, t5, 12      /* DESCALE(tmp12 - temp1, 12) */
+    shra_r.w    s6, s6, 12      /* DESCALE(tmp10 + temp2, 12) */
+    shra_r.w    s7, s7, 12      /* DESCALE(tmp10 - temp2, 12) */
+    sw          t6, 28(t1)
+    sw          t5, 60(t1)
+    sw          s6, -4(t1)
+    bgtz        t9, 0b
+     sw         s7, 92(t1)
+    /* second loop three pass */
+    li          t9, 3
+1:
+    lh          s6, 34(t0)      /* inptr[DCTSIZE*2] */
+    lh          t6, 34(a0)      /* quantptr[DCTSIZE*2] */
+    lh          s7, 98(t0)      /* inptr[DCTSIZE*6] */
+    lh          t7, 98(a0)      /* quantptr[DCTSIZE*6] */
+    mul         t6, s6, t6      /* z2 = (inptr[DCTSIZE*2] *
+                                         quantptr[DCTSIZE*2]) */
+    lh          s4, 2(t0)       /* inptr[DCTSIZE*0] */
+    mul         t7, s7, t7      /* z3 = (inptr[DCTSIZE*6] *
+                                         quantptr[DCTSIZE*6]) */
+    lh          s5, 2(a0)       /* quantptr[DCTSIZE*0] */
+    li          s6, 15137
+    li          s7, 6270
+    mul         t2, s4, s5      /* tmp0 = (inptr[0] * quantptr[0]) */
+    mul         v0, s6, t6      /* z2 = (inptr[DCTSIZE*2] *
+                                         quantptr[DCTSIZE*2]) */
+    lh          t5, 114(t0)     /* inptr[DCTSIZE*7] */
+    mul         t7, s7, t7      /* z3 = (inptr[DCTSIZE*6] *
+                                         quantptr[DCTSIZE*6]) */
+    lh          s4, 114(a0)     /* quantptr[DCTSIZE*7] */
+    lh          s5, 82(a0)      /* quantptr[DCTSIZE*5] */
+    lh          t6, 82(t0)      /* inptr[DCTSIZE*5] */
+    sll         t2, t2, 14      /* tmp0 <<= (CONST_BITS+1) */
+    lh          s6, 50(a0)      /* quantptr[DCTSIZE*3] */
+    lh          t8, 18(t0)      /* inptr[DCTSIZE*1] */
+    subu        v0, v0, t7      /* tmp2 =
+                                     MULTIPLY(z2, t5) - MULTIPLY(z3, t6) */
+    lh          t7, 50(t0)      /* inptr[DCTSIZE*3] */
+    lh          s7, 18(a0)      /* quantptr[DCTSIZE*1] */
+    mul         t5, s4, t5      /* z1 = (inptr[DCTSIZE*7] *
+                                         quantptr[DCTSIZE*7]) */
+    mul         t6, s5, t6      /* z2 = (inptr[DCTSIZE*5] *
+                                         quantptr[DCTSIZE*5]) */
+    mul         t7, s6, t7      /* z3 = (inptr[DCTSIZE*3] *
+                                         quantptr[DCTSIZE*3]) */
+    mul         t8, s7, t8      /* z4 = (inptr[DCTSIZE*1] *
+                                         quantptr[DCTSIZE*1]) */
+    addu        t3, t2, v0      /* tmp10 = tmp0 + z2 */
+    subu        t4, t2, v0      /* tmp10 = tmp0 - z2 */
+    mult        $ac0, zero, zero
+    mult        $ac1, zero, zero
+    ins         t5, t6, 16, 16
+    ins         t7, t8, 16, 16
+    dpa.w.ph    $ac0, t5, s0
+    dpa.w.ph    $ac0, t7, s1
+    dpa.w.ph    $ac1, t5, s2
+    dpa.w.ph    $ac1, t7, s3
+    mflo        t5, $ac0
+    mflo        t6, $ac1
+    addiu       t9, t9, -1
+    addiu       t0, t0, 2
+    addiu       a0, a0, 2
+    addiu       t1, t1, 4
+    addu        s5, t4, t5
+    subu        s4, t4, t5
+    addu        s6, t3, t6
+    subu        s7, t3, t6
+    shra_r.w    s5, s5, 12      /* DESCALE(tmp12 + temp1, 12) */
+    shra_r.w    s4, s4, 12      /* DESCALE(tmp12 - temp1, 12) */
+    shra_r.w    s6, s6, 12      /* DESCALE(tmp10 + temp2, 12) */
+    shra_r.w    s7, s7, 12      /* DESCALE(tmp10 - temp2, 12) */
+    sw          s5, 32(t1)
+    sw          s4, 64(t1)
+    sw          s6, 0(t1)
+    bgtz        t9, 1b
+     sw         s7, 96(t1)
+    move        t1, v1
+    li          s4, 15137
+    lw          s6, 8(t1)       /* wsptr[2] */
+    li          s5, 6270
+    lw          s7, 24(t1)      /* wsptr[6] */
+    mul         s4, s4, s6      /* MULTIPLY((JLONG)wsptr[2],
+                                            FIX_1_847759065) */
+    lw          t2, 0(t1)       /* wsptr[0] */
+    mul         s5, s5, s7      /* MULTIPLY((JLONG)wsptr[6],
+                                            -FIX_0_765366865) */
+    lh          t5, 28(t1)      /* wsptr[7] */
+    lh          t6, 20(t1)      /* wsptr[5] */
+    lh          t7, 12(t1)      /* wsptr[3] */
+    lh          t8, 4(t1)       /* wsptr[1] */
+    ins         t5, t6, 16, 16
+    ins         t7, t8, 16, 16
+    mult        $ac0, zero, zero
+    dpa.w.ph    $ac0, t5, s0
+    dpa.w.ph    $ac0, t7, s1
+    mult        $ac1, zero, zero
+    dpa.w.ph    $ac1, t5, s2
+    dpa.w.ph    $ac1, t7, s3
+    sll         t2, t2, 14      /* tmp0 =
+                                     ((JLONG)wsptr[0]) << (CONST_BITS+1) */
+    mflo        s6, $ac0
+    /* MULTIPLY(wsptr[2], FIX_1_847759065) +
+       MULTIPLY(wsptr[6], -FIX_0_765366865) */
+    subu        s4, s4, s5
+    addu        t3, t2, s4      /* tmp10 = tmp0 + z2 */
+    mflo        s7, $ac1
+    subu        t4, t2, s4      /* tmp10 = tmp0 - z2 */
+    addu        t7, t4, s6
+    subu        t8, t4, s6
+    addu        t5, t3, s7
+    subu        t6, t3, s7
+    shra_r.w    t5, t5, 19      /* DESCALE(tmp10 + temp2, 19) */
+    shra_r.w    t6, t6, 19      /* DESCALE(tmp10 - temp2, 19) */
+    shra_r.w    t7, t7, 19      /* DESCALE(tmp12 + temp1, 19) */
+    shra_r.w    t8, t8, 19      /* DESCALE(tmp12 - temp1, 19) */
+    sll         s4, t9, 2
+    lw          v0, 0(a2)       /* output_buf[ctr] */
+    shll_s.w    t5, t5, 24
+    shll_s.w    t6, t6, 24
+    shll_s.w    t7, t7, 24
+    shll_s.w    t8, t8, 24
+    sra         t5, t5, 24
+    sra         t6, t6, 24
+    sra         t7, t7, 24
+    sra         t8, t8, 24
+    addu        v0, v0, a3      /* outptr = output_buf[ctr] + output_col */
+    addiu       t5, t5, 128
+    addiu       t6, t6, 128
+    addiu       t7, t7, 128
+    addiu       t8, t8, 128
+    sb          t5, 0(v0)
+    sb          t7, 1(v0)
+    sb          t8, 2(v0)
+    sb          t6, 3(v0)
+    /* 2 */
+    li          s4, 15137
+    lw          s6, 40(t1)      /* wsptr[2] */
+    li          s5, 6270
+    lw          s7, 56(t1)      /* wsptr[6] */
+    mul         s4, s4, s6      /* MULTIPLY((JLONG)wsptr[2],
+                                            FIX_1_847759065) */
+    lw          t2, 32(t1)      /* wsptr[0] */
+    mul         s5, s5, s7      /* MULTIPLY((JLONG)wsptr[6],
+                                            -FIX_0_765366865) */
+    lh          t5, 60(t1)      /* wsptr[7] */
+    lh          t6, 52(t1)      /* wsptr[5] */
+    lh          t7, 44(t1)      /* wsptr[3] */
+    lh          t8, 36(t1)      /* wsptr[1] */
+    ins         t5, t6, 16, 16
+    ins         t7, t8, 16, 16
+    mult        $ac0, zero, zero
+    dpa.w.ph    $ac0, t5, s0
+    dpa.w.ph    $ac0, t7, s1
+    mult        $ac1, zero, zero
+    dpa.w.ph    $ac1, t5, s2
+    dpa.w.ph    $ac1, t7, s3
+    sll         t2, t2, 14      /* tmp0 =
+                                     ((JLONG)wsptr[0]) << (CONST_BITS+1) */
+    mflo        s6, $ac0
+    /* MULTIPLY(wsptr[2], FIX_1_847759065) +
+       MULTIPLY(wsptr[6], -FIX_0_765366865) */
+    subu        s4, s4, s5
+    addu        t3, t2, s4      /* tmp10 = tmp0 + z2 */
+    mflo        s7, $ac1
+    subu        t4, t2, s4      /* tmp10 = tmp0 - z2 */
+    addu        t7, t4, s6
+    subu        t8, t4, s6
+    addu        t5, t3, s7
+    subu        t6, t3, s7
+    shra_r.w    t5, t5, 19      /* DESCALE(tmp10 + temp2,
+                                           CONST_BITS-PASS1_BITS+1) */
+    shra_r.w    t6, t6, 19      /* DESCALE(tmp10 - temp2,
+                                           CONST_BITS-PASS1_BITS+1) */
+    shra_r.w    t7, t7, 19      /* DESCALE(tmp12 + temp1,
+                                           CONST_BITS-PASS1_BITS+1) */
+    shra_r.w    t8, t8, 19      /* DESCALE(tmp12 - temp1,
+                                           CONST_BITS-PASS1_BITS+1) */
+    sll         s4, t9, 2
+    lw          v0, 4(a2)       /* output_buf[ctr] */
+    shll_s.w    t5, t5, 24
+    shll_s.w    t6, t6, 24
+    shll_s.w    t7, t7, 24
+    shll_s.w    t8, t8, 24
+    sra         t5, t5, 24
+    sra         t6, t6, 24
+    sra         t7, t7, 24
+    sra         t8, t8, 24
+    addu        v0, v0, a3      /* outptr = output_buf[ctr] + output_col */
+    addiu       t5, t5, 128
+    addiu       t6, t6, 128
+    addiu       t7, t7, 128
+    addiu       t8, t8, 128
+    sb          t5, 0(v0)
+    sb          t7, 1(v0)
+    sb          t8, 2(v0)
+    sb          t6, 3(v0)
+    /* 3 */
+    li          s4, 15137
+    lw          s6, 72(t1)      /* wsptr[2] */
+    li          s5, 6270
+    lw          s7, 88(t1)      /* wsptr[6] */
+    mul         s4, s4, s6      /* MULTIPLY((JLONG)wsptr[2],
+                                            FIX_1_847759065) */
+    lw          t2, 64(t1)      /* wsptr[0] */
+    mul         s5, s5, s7      /* MULTIPLY((JLONG)wsptr[6],
+                                            -FIX_0_765366865) */
+    lh          t5, 92(t1)      /* wsptr[7] */
+    lh          t6, 84(t1)      /* wsptr[5] */
+    lh          t7, 76(t1)      /* wsptr[3] */
+    lh          t8, 68(t1)      /* wsptr[1] */
+    ins         t5, t6, 16, 16
+    ins         t7, t8, 16, 16
+    mult        $ac0, zero, zero
+    dpa.w.ph    $ac0, t5, s0
+    dpa.w.ph    $ac0, t7, s1
+    mult        $ac1, zero, zero
+    dpa.w.ph    $ac1, t5, s2
+    dpa.w.ph    $ac1, t7, s3
+    sll         t2, t2, 14      /* tmp0 =
+                                     ((JLONG)wsptr[0]) << (CONST_BITS+1) */
+    mflo        s6, $ac0
+    /* MULTIPLY(wsptr[2], FIX_1_847759065) +
+       MULTIPLY(wsptr[6], -FIX_0_765366865) */
+    subu        s4, s4, s5
+    addu        t3, t2, s4      /* tmp10 = tmp0 + z2 */
+    mflo        s7, $ac1
+    subu        t4, t2, s4      /* tmp10 = tmp0 - z2 */
+    addu        t7, t4, s6
+    subu        t8, t4, s6
+    addu        t5, t3, s7
+    subu        t6, t3, s7
+    shra_r.w    t5, t5, 19      /* DESCALE(tmp10 + temp2, 19) */
+    shra_r.w    t6, t6, 19      /* DESCALE(tmp10 - temp2, 19) */
+    shra_r.w    t7, t7, 19      /* DESCALE(tmp12 + temp1, 19) */
+    shra_r.w    t8, t8, 19      /* DESCALE(tmp12 - temp1, 19) */
+    sll         s4, t9, 2
+    lw          v0, 8(a2)       /* output_buf[ctr] */
+    shll_s.w    t5, t5, 24
+    shll_s.w    t6, t6, 24
+    shll_s.w    t7, t7, 24
+    shll_s.w    t8, t8, 24
+    sra         t5, t5, 24
+    sra         t6, t6, 24
+    sra         t7, t7, 24
+    sra         t8, t8, 24
+    addu        v0, v0, a3      /* outptr = output_buf[ctr] + output_col */
+    addiu       t5, t5, 128
+    addiu       t6, t6, 128
+    addiu       t7, t7, 128
+    addiu       t8, t8, 128
+    sb          t5, 0(v0)
+    sb          t7, 1(v0)
+    sb          t8, 2(v0)
+    sb          t6, 3(v0)
+    li          s4, 15137
+    lw          s6, 104(t1)     /* wsptr[2] */
+    li          s5, 6270
+    lw          s7, 120(t1)     /* wsptr[6] */
+    mul         s4, s4, s6      /* MULTIPLY((JLONG)wsptr[2],
+                                            FIX_1_847759065) */
+    lw          t2, 96(t1)      /* wsptr[0] */
+    mul         s5, s5, s7      /* MULTIPLY((JLONG)wsptr[6],
+                                            -FIX_0_765366865) */
+    lh          t5, 124(t1)     /* wsptr[7] */
+    lh          t6, 116(t1)     /* wsptr[5] */
+    lh          t7, 108(t1)     /* wsptr[3] */
+    lh          t8, 100(t1)     /* wsptr[1] */
+    ins         t5, t6, 16, 16
+    ins         t7, t8, 16, 16
+    mult        $ac0, zero, zero
+    dpa.w.ph    $ac0, t5, s0
+    dpa.w.ph    $ac0, t7, s1
+    mult        $ac1, zero, zero
+    dpa.w.ph    $ac1, t5, s2
+    dpa.w.ph    $ac1, t7, s3
+    sll         t2, t2, 14      /* tmp0 =
+                                     ((JLONG)wsptr[0]) << (CONST_BITS+1) */
+    mflo        s6, $ac0
+    /* MULTIPLY(wsptr[2], FIX_1_847759065) +
+       MULTIPLY(wsptr[6], -FIX_0_765366865) */
+    subu        s4, s4, s5
+    addu        t3, t2, s4      /* tmp10 = tmp0 + z2; */
+    mflo        s7, $ac1
+    subu        t4, t2, s4      /* tmp10 = tmp0 - z2; */
+    addu        t7, t4, s6
+    subu        t8, t4, s6
+    addu        t5, t3, s7
+    subu        t6, t3, s7
+    shra_r.w    t5, t5, 19      /* DESCALE(tmp10 + temp2, 19) */
+    shra_r.w    t6, t6, 19      /* DESCALE(tmp10 - temp2, 19) */
+    shra_r.w    t7, t7, 19      /* DESCALE(tmp12 + temp1, 19) */
+    shra_r.w    t8, t8, 19      /* DESCALE(tmp12 - temp1, 19) */
+    sll         s4, t9, 2
+    lw          v0, 12(a2)      /* output_buf[ctr] */
+    shll_s.w    t5, t5, 24
+    shll_s.w    t6, t6, 24
+    shll_s.w    t7, t7, 24
+    shll_s.w    t8, t8, 24
+    sra         t5, t5, 24
+    sra         t6, t6, 24
+    sra         t7, t7, 24
+    sra         t8, t8, 24
+    addu        v0, v0, a3      /* outptr = output_buf[ctr] + output_col */
+    addiu       t5, t5, 128
+    addiu       t6, t6, 128
+    addiu       t7, t7, 128
+    addiu       t8, t8, 128
+    sb          t5, 0(v0)
+    sb          t7, 1(v0)
+    sb          t8, 2(v0)
+    sb          t6, 3(v0)
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+END(jsimd_idct_4x4_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_6x6_dspr2)
+/*
+ * a0 = compptr->dct_table
+ * a1 = coef_block
+ * a2 = output_buf
+ * a3 = output_col
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    addiu       sp, sp, -144
+    move        v0, sp
+    addiu       v1, v0, 24
+    addiu       t9, zero, 5793
+    addiu       s0, zero, 10033
+    addiu       s1, zero, 2998
+
+1:
+    lh          s2, 0(a0)       /* q0 = quantptr[ 0] */
+    lh          s3, 32(a0)      /* q1 = quantptr[16] */
+    lh          s4, 64(a0)      /* q2 = quantptr[32] */
+    lh          t2, 64(a1)      /* tmp2 = inptr[32] */
+    lh          t1, 32(a1)      /* tmp1 = inptr[16] */
+    lh          t0, 0(a1)       /* tmp0 = inptr[ 0] */
+    mul         t2, t2, s4      /* tmp2 = tmp2 * q2 */
+    mul         t1, t1, s3      /* tmp1 = tmp1 * q1 */
+    mul         t0, t0, s2      /* tmp0 = tmp0 * q0 */
+    lh          t6, 16(a1)      /* z1 = inptr[ 8] */
+    lh          t8, 80(a1)      /* z3 = inptr[40] */
+    lh          t7, 48(a1)      /* z2 = inptr[24] */
+    lh          s2, 16(a0)      /* q0 = quantptr[ 8] */
+    lh          s4, 80(a0)      /* q2 = quantptr[40] */
+    lh          s3, 48(a0)      /* q1 = quantptr[24] */
+    mul         t2, t2, t9      /* tmp2 = tmp2 * 5793 */
+    mul         t1, t1, s0      /* tmp1 = tmp1 * 10033 */
+    sll         t0, t0, 13      /* tmp0 = tmp0 << 13 */
+    mul         t6, t6, s2      /* z1 = z1 * q0 */
+    mul         t8, t8, s4      /* z3 = z3 * q2 */
+    mul         t7, t7, s3      /* z2 = z2 * q1 */
+    addu        t3, t0, t2      /* tmp10 = tmp0 + tmp2 */
+    sll         t2, t2, 1       /* tmp2 = tmp2 << 2 */
+    subu        t4, t0, t2      /* tmp11 = tmp0 - tmp2; */
+    subu        t5, t3, t1      /* tmp12 = tmp10 - tmp1 */
+    addu        t3, t3, t1      /* tmp10 = tmp10 + tmp1 */
+    addu        t1, t6, t8      /* tmp1 = z1 + z3 */
+    mul         t1, t1, s1      /* tmp1 = tmp1 * 2998 */
+    shra_r.w    t4, t4, 11      /* tmp11 = (tmp11 + 1024) >> 11 */
+    subu        t2, t6, t8      /* tmp2 = z1 - z3 */
+    subu        t2, t2, t7      /* tmp2 = tmp2 - z2 */
+    sll         t2, t2, 2       /* tmp2 = tmp2 << 2 */
+    addu        t0, t6, t7      /* tmp0 = z1 + z2 */
+    sll         t0, t0, 13      /* tmp0 = tmp0 << 13 */
+    subu        s2, t8, t7      /* q0 = z3 - z2 */
+    sll         s2, s2, 13      /* q0 = q0 << 13 */
+    addu        t0, t0, t1      /* tmp0 = tmp0 + tmp1 */
+    addu        t1, s2, t1      /* tmp1 = q0 + tmp1 */
+    addu        s2, t4, t2      /* q0 = tmp11 + tmp2 */
+    subu        s3, t4, t2      /* q1 = tmp11 - tmp2 */
+    addu        t6, t3, t0      /* z1 = tmp10 + tmp0 */
+    subu        t7, t3, t0      /* z2 = tmp10 - tmp0 */
+    addu        t4, t5, t1      /* tmp11 = tmp12 + tmp1 */
+    subu        t5, t5, t1      /* tmp12 = tmp12 - tmp1 */
+    shra_r.w    t6, t6, 11      /* z1 = (z1 + 1024) >> 11 */
+    shra_r.w    t7, t7, 11      /* z2 = (z2 + 1024) >> 11 */
+    shra_r.w    t4, t4, 11      /* tmp11 = (tmp11 + 1024) >> 11 */
+    shra_r.w    t5, t5, 11      /* tmp12 = (tmp12 + 1024) >> 11 */
+    sw          s2, 24(v0)
+    sw          s3, 96(v0)
+    sw          t6, 0(v0)
+    sw          t7, 120(v0)
+    sw          t4, 48(v0)
+    sw          t5, 72(v0)
+    addiu       v0, v0, 4
+    addiu       a1, a1, 2
+    bne         v0, v1, 1b
+     addiu      a0, a0, 2
+
+    /* Pass 2: process 6 rows from work array, store into output array. */
+    move        v0, sp
+    addiu       v1, v0, 144
+
+2:
+    lw          t0, 0(v0)
+    lw          t2, 16(v0)
+    lw          s5, 0(a2)
+    addiu       t0, t0, 16
+    sll         t0, t0, 13
+    mul         t3, t2, t9
+    lw          t6, 4(v0)
+    lw          t8, 20(v0)
+    lw          t7, 12(v0)
+    addu        s5, s5, a3
+    addu        s6, t6, t8
+    mul         s6, s6, s1
+    addu        t1, t0, t3
+    subu        t4, t0, t3
+    subu        t4, t4, t3
+    lw          t3, 8(v0)
+    mul         t0, t3, s0
+    addu        s7, t6, t7
+    sll         s7, s7, 13
+    addu        s7, s6, s7
+    subu        t2, t8, t7
+    sll         t2, t2, 13
+    addu        t2, s6, t2
+    subu        s6, t6, t7
+    subu        s6, s6, t8
+    sll         s6, s6, 13
+    addu        t3, t1, t0
+    subu        t5, t1, t0
+    addu        t6, t3, s7
+    subu        t3, t3, s7
+    addu        t7, t4, s6
+    subu        t4, t4, s6
+    addu        t8, t5, t2
+    subu        t5, t5, t2
+    shll_s.w    t6, t6, 6
+    shll_s.w    t3, t3, 6
+    shll_s.w    t7, t7, 6
+    shll_s.w    t4, t4, 6
+    shll_s.w    t8, t8, 6
+    shll_s.w    t5, t5, 6
+    sra         t6, t6, 24
+    addiu       t6, t6, 128
+    sra         t3, t3, 24
+    addiu       t3, t3, 128
+    sb          t6, 0(s5)
+    sra         t7, t7, 24
+    addiu       t7, t7, 128
+    sb          t3, 5(s5)
+    sra         t4, t4, 24
+    addiu       t4, t4, 128
+    sb          t7, 1(s5)
+    sra         t8, t8, 24
+    addiu       t8, t8, 128
+    sb          t4, 4(s5)
+    addiu       v0, v0, 24
+    sra         t5, t5, 24
+    addiu       t5, t5, 128
+    sb          t8, 2(s5)
+    addiu       a2, a2,  4
+    bne         v0, v1, 2b
+     sb         t5, 3(s5)
+
+    addiu       sp, sp, 144
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+
+END(jsimd_idct_6x6_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2)
+/*
+ * a0 = compptr->dct_table
+ * a1 = coef_block
+ * a2 = workspace
+ */
+    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+    li          a3, 8
+
+1:
+    /* odd part */
+    lh          t0, 48(a1)
+    lh          t1, 48(a0)
+    lh          t2, 16(a1)
+    lh          t3, 16(a0)
+    lh          t4, 80(a1)
+    lh          t5, 80(a0)
+    lh          t6, 112(a1)
+    lh          t7, 112(a0)
+    mul         t0, t0, t1      /* z2 */
+    mul         t1, t2, t3      /* z1 */
+    mul         t2, t4, t5      /* z3 */
+    mul         t3, t6, t7      /* z4 */
+    li          t4, 10703       /* FIX(1.306562965) */
+    li          t5, 4433        /* FIX_0_541196100 */
+    li          t6, 7053        /* FIX(0.860918669) */
+    mul         t4, t0, t4      /* tmp11 */
+    mul         t5, t0, t5      /* -tmp14 */
+    addu        t7, t1, t2      /* tmp10 */
+    addu        t8, t7, t3      /* tmp10 + z4 */
+    mul         t6, t6, t8      /* tmp15 */
+    li          t8, 2139        /* FIX(0.261052384) */
+    mul         t8, t7, t8      /* MULTIPLY(tmp10, FIX(0.261052384)) */
+    li          t7, 2295        /* FIX(0.280143716) */
+    mul         t7, t1, t7      /* MULTIPLY(z1, FIX(0.280143716)) */
+    addu        t9, t2, t3      /* z3 + z4 */
+    li          s0, 8565        /* FIX(1.045510580) */
+    mul         t9, t9, s0      /* -tmp13 */
+    li          s0, 12112       /* FIX(1.478575242) */
+    mul         s0, t2, s0      /* MULTIPLY(z3, FIX(1.478575242) */
+    li          s1, 12998       /* FIX(1.586706681) */
+    mul         s1, t3, s1      /* MULTIPLY(z4, FIX(1.586706681)) */
+    li          s2, 5540        /* FIX(0.676326758) */
+    mul         s2, t1, s2      /* MULTIPLY(z1, FIX(0.676326758)) */
+    li          s3, 16244       /* FIX(1.982889723) */
+    mul         s3, t3, s3      /* MULTIPLY(z4, FIX(1.982889723)) */
+    subu        t1, t1, t3      /* z1-=z4 */
+    subu        t0, t0, t2      /* z2-=z3 */
+    addu        t2, t0, t1      /* z1+z2 */
+    li          t3, 4433        /* FIX_0_541196100 */
+    mul         t2, t2, t3      /* z3 */
+    li          t3, 6270        /* FIX_0_765366865 */
+    mul         t1, t1, t3      /* MULTIPLY(z1, FIX_0_765366865) */
+    li          t3, 15137       /* FIX_0_765366865 */
+    mul         t0, t0, t3      /* MULTIPLY(z2, FIX_1_847759065) */
+    addu        t8, t6, t8      /* tmp12 */
+    addu        t3, t8, t4      /* tmp12 + tmp11 */
+    addu        t3, t3, t7      /* tmp10 */
+    subu        t8, t8, t9      /* tmp12 + tmp13 */
+    addu        s0, t5, s0
+    subu        t8, t8, s0      /* tmp12 */
+    subu        t9, t6, t9
+    subu        s1, s1, t4
+    addu        t9, t9, s1      /* tmp13 */
+    subu        t6, t6, t5
+    subu        t6, t6, s2
+    subu        t6, t6, s3      /* tmp15 */
+    /* even part start */
+    lh          t4, 64(a1)
+    lh          t5, 64(a0)
+    lh          t7, 32(a1)
+    lh          s0, 32(a0)
+    lh          s1, 0(a1)
+    lh          s2, 0(a0)
+    lh          s3, 96(a1)
+    lh          v0, 96(a0)
+    mul         t4, t4, t5      /* DEQUANTIZE(inptr[DCTSIZE*4],
+                                              quantptr[DCTSIZE*4]) */
+    mul         t5, t7, s0      /* DEQUANTIZE(inptr[DCTSIZE*2],
+                                              quantptr[DCTSIZE*2]) */
+    mul         t7, s1, s2      /* DEQUANTIZE(inptr[DCTSIZE*0],
+                                              quantptr[DCTSIZE*0]) */
+    mul         s0, s3, v0      /* DEQUANTIZE(inptr[DCTSIZE*6],
+                                              quantptr[DCTSIZE*6]) */
+    /* odd part end */
+    addu        t1, t2, t1      /* tmp11 */
+    subu        t0, t2, t0      /* tmp14 */
+    /* update counter and pointers */
+    addiu       a3, a3, -1
+    addiu       a0, a0, 2
+    addiu       a1, a1, 2
+    /* even part rest */
+    li          s1, 10033
+    li          s2, 11190
+    mul         t4, t4, s1      /* z4 */
+    mul         s1, t5, s2      /* z4 */
+    sll         t5, t5, 13      /* z1 */
+    sll         t7, t7, 13
+    addiu       t7, t7, 1024    /* z3 */
+    sll         s0, s0, 13      /* z2 */
+    addu        s2, t7, t4      /* tmp10 */
+    subu        t4, t7, t4      /* tmp11 */
+    subu        s3, t5, s0      /* tmp12 */
+    addu        t2, t7, s3      /* tmp21 */
+    subu        s3, t7, s3      /* tmp24 */
+    addu        t7, s1, s0      /* tmp12 */
+    addu        v0, s2, t7      /* tmp20 */
+    subu        s2, s2, t7      /* tmp25 */
+    subu        s1, s1, t5      /* z4 - z1 */
+    subu        s1, s1, s0      /* tmp12 */
+    addu        s0, t4, s1      /* tmp22 */
+    subu        t4, t4, s1      /* tmp23 */
+    /* final output stage */
+    addu        t5, v0, t3
+    subu        v0, v0, t3
+    addu        t3, t2, t1
+    subu        t2, t2, t1
+    addu        t1, s0, t8
+    subu        s0, s0, t8
+    addu        t8, t4, t9
+    subu        t4, t4, t9
+    addu        t9, s3, t0
+    subu        s3, s3, t0
+    addu        t0, s2, t6
+    subu        s2, s2, t6
+    sra         t5, t5, 11
+    sra         t3, t3, 11
+    sra         t1, t1, 11
+    sra         t8, t8, 11
+    sra         t9, t9, 11
+    sra         t0, t0, 11
+    sra         s2, s2, 11
+    sra         s3, s3, 11
+    sra         t4, t4, 11
+    sra         s0, s0, 11
+    sra         t2, t2, 11
+    sra         v0, v0, 11
+    sw          t5, 0(a2)
+    sw          t3, 32(a2)
+    sw          t1, 64(a2)
+    sw          t8, 96(a2)
+    sw          t9, 128(a2)
+    sw          t0, 160(a2)
+    sw          s2, 192(a2)
+    sw          s3, 224(a2)
+    sw          t4, 256(a2)
+    sw          s0, 288(a2)
+    sw          t2, 320(a2)
+    sw          v0, 352(a2)
+    bgtz        a3, 1b
+     addiu      a2, a2, 4
+
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+    j           ra
+     nop
+
+END(jsimd_idct_12x12_pass1_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_12x12_pass2_dspr2)
+/*
+ * a0 = workspace
+ * a1 = output
+ */
+    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+    li          a3, 12
+
+1:
+    /* Odd part */
+    lw          t0, 12(a0)
+    lw          t1, 4(a0)
+    lw          t2, 20(a0)
+    lw          t3, 28(a0)
+    li          t4, 10703       /* FIX(1.306562965) */
+    li          t5, 4433        /* FIX_0_541196100 */
+    mul         t4, t0, t4      /* tmp11 */
+    mul         t5, t0, t5      /* -tmp14 */
+    addu        t6, t1, t2      /* tmp10 */
+    li          t7, 2139        /* FIX(0.261052384) */
+    mul         t7, t6, t7      /* MULTIPLY(tmp10, FIX(0.261052384)) */
+    addu        t6, t6, t3      /* tmp10 + z4 */
+    li          t8, 7053        /* FIX(0.860918669) */
+    mul         t6, t6, t8      /* tmp15 */
+    li          t8, 2295        /* FIX(0.280143716) */
+    mul         t8, t1, t8      /* MULTIPLY(z1, FIX(0.280143716)) */
+    addu        t9, t2, t3      /* z3 + z4 */
+    li          s0, 8565        /* FIX(1.045510580) */
+    mul         t9, t9, s0      /* -tmp13 */
+    li          s0, 12112       /* FIX(1.478575242) */
+    mul         s0, t2, s0      /* MULTIPLY(z3, FIX(1.478575242)) */
+    li          s1, 12998       /* FIX(1.586706681) */
+    mul         s1, t3, s1      /* MULTIPLY(z4, FIX(1.586706681)) */
+    li          s2, 5540        /* FIX(0.676326758) */
+    mul         s2, t1, s2      /* MULTIPLY(z1, FIX(0.676326758)) */
+    li          s3, 16244       /* FIX(1.982889723) */
+    mul         s3, t3, s3      /* MULTIPLY(z4, FIX(1.982889723)) */
+    subu        t1, t1, t3      /* z1 -= z4 */
+    subu        t0, t0, t2      /* z2 -= z3 */
+    addu        t2, t1, t0      /* z1 + z2 */
+    li          t3, 4433        /* FIX_0_541196100 */
+    mul         t2, t2, t3      /* z3 */
+    li          t3, 6270        /* FIX_0_765366865 */
+    mul         t1, t1, t3      /* MULTIPLY(z1, FIX_0_765366865) */
+    li          t3, 15137       /* FIX_1_847759065 */
+    mul         t0, t0, t3      /* MULTIPLY(z2, FIX_1_847759065) */
+    addu        t3, t6, t7      /* tmp12 */
+    addu        t7, t3, t4
+    addu        t7, t7, t8      /* tmp10 */
+    subu        t3, t3, t9
+    subu        t3, t3, t5
+    subu        t3, t3, s0      /* tmp12 */
+    subu        t9, t6, t9
+    subu        t9, t9, t4
+    addu        t9, t9, s1      /* tmp13 */
+    subu        t6, t6, t5
+    subu        t6, t6, s2
+    subu        t6, t6, s3      /* tmp15 */
+    addu        t1, t2, t1      /* tmp11 */
+    subu        t0, t2, t0      /* tmp14 */
+    /* even part */
+    lw          t2, 16(a0)      /* z4 */
+    lw          t4, 8(a0)       /* z1 */
+    lw          t5, 0(a0)       /* z3 */
+    lw          t8, 24(a0)      /* z2 */
+    li          s0, 10033       /* FIX(1.224744871) */
+    li          s1, 11190       /* FIX(1.366025404) */
+    mul         t2, t2, s0      /* z4 */
+    mul         s0, t4, s1      /* z4 */
+    addiu       t5, t5, 0x10
+    sll         t5, t5, 13      /* z3 */
+    sll         t4, t4, 13      /* z1 */
+    sll         t8, t8, 13      /* z2 */
+    subu        s1, t4, t8      /* tmp12 */
+    addu        s2, t5, t2      /* tmp10 */
+    subu        t2, t5, t2      /* tmp11 */
+    addu        s3, t5, s1      /* tmp21 */
+    subu        s1, t5, s1      /* tmp24 */
+    addu        t5, s0, t8      /* tmp12 */
+    addu        v0, s2, t5      /* tmp20 */
+    subu        t5, s2, t5      /* tmp25 */
+    subu        t4, s0, t4
+    subu        t4, t4, t8      /* tmp12 */
+    addu        t8, t2, t4      /* tmp22 */
+    subu        t2, t2, t4      /* tmp23 */
+    /* increment counter and pointers */
+    addiu       a3, a3, -1
+    addiu       a0, a0, 32
+    /* Final stage */
+    addu        t4, v0, t7
+    subu        v0, v0, t7
+    addu        t7, s3, t1
+    subu        s3, s3, t1
+    addu        t1, t8, t3
+    subu        t8, t8, t3
+    addu        t3, t2, t9
+    subu        t2, t2, t9
+    addu        t9, s1, t0
+    subu        s1, s1, t0
+    addu        t0, t5, t6
+    subu        t5, t5, t6
+    sll         t4, t4, 4
+    sll         t7, t7, 4
+    sll         t1, t1, 4
+    sll         t3, t3, 4
+    sll         t9, t9, 4
+    sll         t0, t0, 4
+    sll         t5, t5, 4
+    sll         s1, s1, 4
+    sll         t2, t2, 4
+    sll         t8, t8, 4
+    sll         s3, s3, 4
+    sll         v0, v0, 4
+    shll_s.w    t4, t4, 2
+    shll_s.w    t7, t7, 2
+    shll_s.w    t1, t1, 2
+    shll_s.w    t3, t3, 2
+    shll_s.w    t9, t9, 2
+    shll_s.w    t0, t0, 2
+    shll_s.w    t5, t5, 2
+    shll_s.w    s1, s1, 2
+    shll_s.w    t2, t2, 2
+    shll_s.w    t8, t8, 2
+    shll_s.w    s3, s3, 2
+    shll_s.w    v0, v0, 2
+    srl         t4, t4, 24
+    srl         t7, t7, 24
+    srl         t1, t1, 24
+    srl         t3, t3, 24
+    srl         t9, t9, 24
+    srl         t0, t0, 24
+    srl         t5, t5, 24
+    srl         s1, s1, 24
+    srl         t2, t2, 24
+    srl         t8, t8, 24
+    srl         s3, s3, 24
+    srl         v0, v0, 24
+    lw          t6, 0(a1)
+    addiu       t4, t4, 0x80
+    addiu       t7, t7, 0x80
+    addiu       t1, t1, 0x80
+    addiu       t3, t3, 0x80
+    addiu       t9, t9, 0x80
+    addiu       t0, t0, 0x80
+    addiu       t5, t5, 0x80
+    addiu       s1, s1, 0x80
+    addiu       t2, t2, 0x80
+    addiu       t8, t8, 0x80
+    addiu       s3, s3, 0x80
+    addiu       v0, v0, 0x80
+    sb          t4, 0(t6)
+    sb          t7, 1(t6)
+    sb          t1, 2(t6)
+    sb          t3, 3(t6)
+    sb          t9, 4(t6)
+    sb          t0, 5(t6)
+    sb          t5, 6(t6)
+    sb          s1, 7(t6)
+    sb          t2, 8(t6)
+    sb          t8, 9(t6)
+    sb          s3, 10(t6)
+    sb          v0, 11(t6)
+    bgtz        a3, 1b
+     addiu      a1, a1, 4
+
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+    jr          ra
+     nop
+
+END(jsimd_idct_12x12_pass2_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_convsamp_dspr2)
+/*
+ * a0 = sample_data
+ * a1 = start_col
+ * a2 = workspace
+ */
+    lw            t0, 0(a0)
+    li            t7, 0xff80ff80
+    addu          t0, t0, a1
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    lw            t0, 4(a0)
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 0(a2)
+    usw           t4, 4(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 8(a2)
+    usw           t6, 12(a2)
+
+    lw            t0, 8(a0)
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 16(a2)
+    usw           t4, 20(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 24(a2)
+    usw           t6, 28(a2)
+
+    lw            t0, 12(a0)
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 32(a2)
+    usw           t4, 36(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 40(a2)
+    usw           t6, 44(a2)
+
+    lw            t0, 16(a0)
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 48(a2)
+    usw           t4, 52(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 56(a2)
+    usw           t6, 60(a2)
+
+    lw            t0, 20(a0)
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 64(a2)
+    usw           t4, 68(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 72(a2)
+    usw           t6, 76(a2)
+
+    lw            t0, 24(a0)
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 80(a2)
+    usw           t4, 84(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 88(a2)
+    usw           t6, 92(a2)
+
+    lw            t0, 28(a0)
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 96(a2)
+    usw           t4, 100(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 104(a2)
+    usw           t6, 108(a2)
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 112(a2)
+    usw           t4, 116(a2)
+    usw           t5, 120(a2)
+    usw           t6, 124(a2)
+
+    j             ra
+     nop
+
+END(jsimd_convsamp_dspr2)
+
+
+#ifndef __mips_soft_float
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_convsamp_float_dspr2)
+/*
+ * a0 = sample_data
+ * a1 = start_col
+ * a2 = workspace
+ */
+    .set at
+
+    lw          t0, 0(a0)
+    addu        t0, t0, a1
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 4(a0)
+    swc1        f2, 0(a2)
+    swc1        f4, 4(a2)
+    swc1        f6, 8(a2)
+    addu        t0, t0, a1
+    swc1        f8, 12(a2)
+    swc1        f10, 16(a2)
+    swc1        f12, 20(a2)
+    swc1        f14, 24(a2)
+    swc1        f16, 28(a2)
+    /* elemr 1 */
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 8(a0)
+    swc1        f2, 32(a2)
+    swc1        f4, 36(a2)
+    swc1        f6, 40(a2)
+    addu        t0, t0, a1
+    swc1        f8, 44(a2)
+    swc1        f10, 48(a2)
+    swc1        f12, 52(a2)
+    swc1        f14, 56(a2)
+    swc1        f16, 60(a2)
+    /* elemr 2 */
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 12(a0)
+    swc1        f2, 64(a2)
+    swc1        f4, 68(a2)
+    swc1        f6, 72(a2)
+    addu        t0, t0, a1
+    swc1        f8, 76(a2)
+    swc1        f10, 80(a2)
+    swc1        f12, 84(a2)
+    swc1        f14, 88(a2)
+    swc1        f16, 92(a2)
+    /*  elemr 3 */
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 16(a0)
+    swc1        f2, 96(a2)
+    swc1        f4, 100(a2)
+    swc1        f6, 104(a2)
+    addu        t0, t0, a1
+    swc1        f8, 108(a2)
+    swc1        f10, 112(a2)
+    swc1        f12, 116(a2)
+    swc1        f14, 120(a2)
+    swc1        f16, 124(a2)
+    /* elemr 4 */
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 20(a0)
+    swc1        f2, 128(a2)
+    swc1        f4, 132(a2)
+    swc1        f6, 136(a2)
+    addu        t0, t0, a1
+    swc1        f8, 140(a2)
+    swc1        f10, 144(a2)
+    swc1        f12, 148(a2)
+    swc1        f14, 152(a2)
+    swc1        f16, 156(a2)
+    /* elemr 5 */
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 24(a0)
+    swc1        f2, 160(a2)
+    swc1        f4, 164(a2)
+    swc1        f6, 168(a2)
+    addu        t0, t0, a1
+    swc1        f8, 172(a2)
+    swc1        f10, 176(a2)
+    swc1        f12, 180(a2)
+    swc1        f14, 184(a2)
+    swc1        f16, 188(a2)
+    /* elemr 6 */
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 28(a0)
+    swc1        f2, 192(a2)
+    swc1        f4, 196(a2)
+    swc1        f6, 200(a2)
+    addu        t0, t0, a1
+    swc1        f8, 204(a2)
+    swc1        f10, 208(a2)
+    swc1        f12, 212(a2)
+    swc1        f14, 216(a2)
+    swc1        f16, 220(a2)
+    /* elemr 7 */
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    swc1        f2, 224(a2)
+    swc1        f4, 228(a2)
+    swc1        f6, 232(a2)
+    swc1        f8, 236(a2)
+    swc1        f10, 240(a2)
+    swc1        f12, 244(a2)
+    swc1        f14, 248(a2)
+    swc1        f16, 252(a2)
+
+    j           ra
+     nop
+
+END(jsimd_convsamp_float_dspr2)
+
+#endif
+
+/*****************************************************************************/
diff --git a/3rdparty/libjpeg-turbo/src/simd/mips/jsimd_dspr2_asm.h b/3rdparty/libjpeg-turbo/src/simd/mips/jsimd_dspr2_asm.h
new file mode 100644
index 0000000000..12cfda486c
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/mips/jsimd_dspr2_asm.h
@@ -0,0 +1,292 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013, MIPS Technologies, Inc., California.
+ * Copyright (C) 2018, Matthieu Darbois.
+ * All Rights Reserved.
+ * Authors:  Teodora Novkovic (teodora.novkovic@imgtec.com)
+ *           Darko Laus       (darko.laus@imgtec.com)
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define zero  $0
+#define AT    $1
+#define v0    $2
+#define v1    $3
+#define a0    $4
+#define a1    $5
+#define a2    $6
+#define a3    $7
+#define t0    $8
+#define t1    $9
+#define t2    $10
+#define t3    $11
+#define t4    $12
+#define t5    $13
+#define t6    $14
+#define t7    $15
+#define s0    $16
+#define s1    $17
+#define s2    $18
+#define s3    $19
+#define s4    $20
+#define s5    $21
+#define s6    $22
+#define s7    $23
+#define t8    $24
+#define t9    $25
+#define k0    $26
+#define k1    $27
+#define gp    $28
+#define sp    $29
+#define fp    $30
+#define s8    $30
+#define ra    $31
+
+#define f0    $f0
+#define f1    $f1
+#define f2    $f2
+#define f3    $f3
+#define f4    $f4
+#define f5    $f5
+#define f6    $f6
+#define f7    $f7
+#define f8    $f8
+#define f9    $f9
+#define f10   $f10
+#define f11   $f11
+#define f12   $f12
+#define f13   $f13
+#define f14   $f14
+#define f15   $f15
+#define f16   $f16
+#define f17   $f17
+#define f18   $f18
+#define f19   $f19
+#define f20   $f20
+#define f21   $f21
+#define f22   $f22
+#define f23   $f23
+#define f24   $f24
+#define f25   $f25
+#define f26   $f26
+#define f27   $f27
+#define f28   $f28
+#define f29   $f29
+#define f30   $f30
+#define f31   $f31
+
+#ifdef __ELF__
+#define HIDDEN_SYMBOL(symbol)  .hidden symbol;
+#else
+#define HIDDEN_SYMBOL(symbol)
+#endif
+
+/*
+ * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2
+ */
+#define LEAF_MIPS32R2(symbol) \
+    .globl      symbol; \
+    HIDDEN_SYMBOL(symbol) \
+    .align      2; \
+    .type       symbol, @function; \
+    .ent        symbol, 0; \
+symbol: \
+    .frame      sp, 0, ra; \
+    .set        push; \
+    .set        arch = mips32r2; \
+    .set        noreorder; \
+    .set        noat;
+
+/*
+ * LEAF_DSPR2 - declare leaf routine for MIPS DSPr2
+ */
+#define LEAF_DSPR2(symbol) \
+LEAF_MIPS32R2(symbol) \
+    .set        dspr2;
+
+/*
+ * END - mark end of function
+ */
+#define END(function) \
+    .set        pop; \
+    .end        function; \
+    .size       function, .-function
+
+/*
+ * Checks if stack offset is big enough for storing/restoring regs_num
+ * number of register to/from stack. Stack offset must be greater than
+ * or equal to the number of bytes needed for storing registers (regs_num*4).
+ * Since MIPS ABI allows usage of first 16 bytes of stack frame (this is
+ * preserved for input arguments of the functions, already stored in a0-a3),
+ * stack size can be further optimized by utilizing this space.
+ */
+.macro CHECK_STACK_OFFSET regs_num, stack_offset
+.if \stack_offset < \regs_num * 4 - 16
+.error "Stack offset too small."
+.endif
+.endm
+
+/*
+ * Saves set of registers on stack. Maximum number of registers that
+ * can be saved on stack is limitted to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is number of bytes that are added to stack pointer (sp)
+ * before registers are pushed in order to provide enough space on stack
+ * (offset must be multiple of 4, and must be big enough, as described by
+ * CHECK_STACK_OFFSET macro). This macro is intended to be used in
+ * combination with RESTORE_REGS_FROM_STACK macro. Example:
+ *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
+ *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro SAVE_REGS_ON_STACK  stack_offset = 0, r1, \
+                           r2  = 0, r3  = 0, r4  = 0, \
+                           r5  = 0, r6  = 0, r7  = 0, \
+                           r8  = 0, r9  = 0, r10 = 0, \
+                           r11 = 0, r12 = 0, r13 = 0, \
+                           r14 = 0
+.if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
+    .error "Stack offset must be pozitive and multiple of 4."
+.endif
+.if \stack_offset != 0
+    addiu       sp, sp, -\stack_offset
+.endif
+    sw          \r1, 0(sp)
+.if \r2 != 0
+    sw          \r2, 4(sp)
+.endif
+.if \r3 != 0
+    sw          \r3, 8(sp)
+.endif
+.if \r4 != 0
+    sw          \r4, 12(sp)
+.endif
+.if \r5 != 0
+    CHECK_STACK_OFFSET 5, \stack_offset
+    sw          \r5, 16(sp)
+.endif
+.if \r6 != 0
+    CHECK_STACK_OFFSET 6, \stack_offset
+    sw          \r6, 20(sp)
+.endif
+.if \r7 != 0
+    CHECK_STACK_OFFSET 7, \stack_offset
+    sw          \r7, 24(sp)
+.endif
+.if \r8 != 0
+    CHECK_STACK_OFFSET 8, \stack_offset
+    sw          \r8, 28(sp)
+.endif
+.if \r9 != 0
+    CHECK_STACK_OFFSET 9, \stack_offset
+    sw          \r9, 32(sp)
+.endif
+.if \r10 != 0
+    CHECK_STACK_OFFSET 10, \stack_offset
+    sw          \r10, 36(sp)
+.endif
+.if \r11 != 0
+    CHECK_STACK_OFFSET 11, \stack_offset
+    sw          \r11, 40(sp)
+.endif
+.if \r12 != 0
+    CHECK_STACK_OFFSET 12, \stack_offset
+    sw          \r12, 44(sp)
+.endif
+.if \r13 != 0
+    CHECK_STACK_OFFSET 13, \stack_offset
+    sw          \r13, 48(sp)
+.endif
+.if \r14 != 0
+    CHECK_STACK_OFFSET 14, \stack_offset
+    sw          \r14, 52(sp)
+.endif
+.endm
+
+/*
+ * Restores set of registers from stack. Maximum number of registers that
+ * can be restored from stack is limitted to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is number of bytes that are added to stack pointer (sp)
+ * after registers are restored (offset must be multiple of 4, and must
+ * be big enough, as described by CHECK_STACK_OFFSET macro). This macro is
+ * intended to be used in combination with RESTORE_REGS_FROM_STACK macro.
+ * Example:
+ *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
+ *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro RESTORE_REGS_FROM_STACK  stack_offset = 0, r1, \
+                                r2  = 0, r3  = 0, r4  = 0, \
+                                r5  = 0, r6  = 0, r7  = 0, \
+                                r8  = 0, r9  = 0, r10 = 0, \
+                                r11 = 0, r12 = 0, r13 = 0, \
+                                r14 = 0
+.if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
+    .error "Stack offset must be pozitive and multiple of 4."
+.endif
+    lw          \r1, 0(sp)
+.if \r2 != 0
+    lw          \r2, 4(sp)
+.endif
+.if \r3 != 0
+    lw          \r3, 8(sp)
+.endif
+.if \r4 != 0
+    lw          \r4, 12(sp)
+.endif
+.if \r5 != 0
+    CHECK_STACK_OFFSET 5, \stack_offset
+    lw          \r5, 16(sp)
+.endif
+.if \r6 != 0
+    CHECK_STACK_OFFSET 6, \stack_offset
+    lw          \r6, 20(sp)
+.endif
+.if \r7 != 0
+    CHECK_STACK_OFFSET 7, \stack_offset
+    lw          \r7, 24(sp)
+.endif
+.if \r8 != 0
+    CHECK_STACK_OFFSET 8, \stack_offset
+    lw          \r8, 28(sp)
+.endif
+.if \r9 != 0
+    CHECK_STACK_OFFSET 9, \stack_offset
+    lw          \r9, 32(sp)
+.endif
+.if \r10 != 0
+    CHECK_STACK_OFFSET 10, \stack_offset
+    lw          \r10, 36(sp)
+.endif
+.if \r11 != 0
+    CHECK_STACK_OFFSET 11, \stack_offset
+    lw          \r11, 40(sp)
+.endif
+.if \r12 != 0
+    CHECK_STACK_OFFSET 12, \stack_offset
+    lw          \r12, 44(sp)
+.endif
+.if \r13 != 0
+    CHECK_STACK_OFFSET 13, \stack_offset
+    lw          \r13, 48(sp)
+.endif
+.if \r14 != 0
+    CHECK_STACK_OFFSET 14, \stack_offset
+    lw          \r14, 52(sp)
+.endif
+.if \stack_offset != 0
+    addiu       sp, sp, \stack_offset
+.endif
+.endm
diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jccolext-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jccolext-mmi.c
new file mode 100644
index 0000000000..558eb2ab10
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jccolext-mmi.c
@@ -0,0 +1,455 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2014-2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           ZhangLixia  <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA  re
+#define mmB  ro
+#elif RGB_GREEN == 0
+#define mmA  ge
+#define mmB  go
+#elif RGB_BLUE == 0
+#define mmA  be
+#define mmB  bo
+#else
+#define mmA  xe
+#define mmB  xo
+#endif
+
+#if RGB_RED == 1
+#define mmC  re
+#define mmD  ro
+#elif RGB_GREEN == 1
+#define mmC  ge
+#define mmD  go
+#elif RGB_BLUE == 1
+#define mmC  be
+#define mmD  bo
+#else
+#define mmC  xe
+#define mmD  xo
+#endif
+
+#if RGB_RED == 2
+#define mmE  re
+#define mmF  ro
+#elif RGB_GREEN == 2
+#define mmE  ge
+#define mmF  go
+#elif RGB_BLUE == 2
+#define mmE  be
+#define mmF  bo
+#else
+#define mmE  xe
+#define mmF  xo
+#endif
+
+#if RGB_RED == 3
+#define mmG  re
+#define mmH  ro
+#elif RGB_GREEN == 3
+#define mmG  ge
+#define mmH  go
+#elif RGB_BLUE == 3
+#define mmG  be
+#define mmH  bo
+#else
+#define mmG  xe
+#define mmH  xo
+#endif
+
+
+void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
+                               JSAMPIMAGE output_buf, JDIMENSION output_row,
+                               int num_rows)
+{
+  JSAMPROW inptr, outptr0, outptr1, outptr2;
+  int num_cols, col;
+  __m64 re, ro, ge, go, be, bo, xe;
+#if RGB_PIXELSIZE == 4
+  __m64 xo;
+#endif
+  __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
+  __m64 ble, halfble, bhe, halfbhe, blo, halfblo, bho, halfbho;
+  __m64 rle, halfrle, rhe, halfrhe, rlo, halfrlo, rho, halfrho;
+  __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
+  __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
+  __m64 cble, cbhe, cbe, cblo, cbho, cbo, cb;
+  __m64 crle, crhe, cre, crlo, crho, cro, cr;
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+
+    for (num_cols = image_width; num_cols > 0; num_cols -= 8,
+         outptr0 += 8, outptr1 += 8, outptr2 += 8) {
+
+#if RGB_PIXELSIZE == 3
+
+      if (num_cols < 8) {
+        col = num_cols * 3;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 1\r\n"
+            "move     $9, %3\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 1f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 1\r\n"
+            "xor      $12, $12, $12\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lbu      $12, 0($13)\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 2f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 2\r\n"
+            "xor      $11, $11, $11\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lhu      $11, 0($13)\r\n"
+            "sll      $12, $12, 16\r\n"
+            "or       $12, $12, $11\r\n"
+
+            "2:       \r\n"
+            "dmtc1    $12, %0\r\n"
+            "li       $8, 4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 3f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 4\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lwu      $14, 0($13)\r\n"
+            "dmtc1    $14, %1\r\n"
+            "dsll32   $12, $12, 0\r\n"
+            "or       $12, $12, $14\r\n"
+            "dmtc1    $12, %0\r\n"
+
+            "3:       \r\n"
+            "li       $8, 8\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 4f\r\n"
+            "nop      \r\n"
+            "mov.s    %1, %0\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "li       $9, 8\r\n"
+            "j        5f\r\n"
+            "nop      \r\n"
+
+            "4:       \r\n"
+            "li       $8, 16\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 5f\r\n"
+            "nop      \r\n"
+            "mov.s    %2, %0\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "ldc1     %1, 8(%5)\r\n"
+
+            "5:       \r\n"
+            "nop      \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmG), "=f" (mmF)
+            : "r" (col), "r" (num_rows), "r" (inptr)
+            : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
+              "$14", "memory"
+           );
+      } else {
+        if (!(((long)inptr) & 7)) {
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmG = _mm_load_si64((__m64 *)&inptr[8]);
+          mmF = _mm_load_si64((__m64 *)&inptr[16]);
+        } else {
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
+        }
+        inptr += RGB_PIXELSIZE * 8;
+      }
+      mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+      mmA = _mm_unpackhi_pi8(mmA, mmG);
+      mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
+
+      mmD = _mm_unpacklo_pi8(mmD, mmF);
+      mmG = _mm_unpackhi_pi8(mmG, mmF);
+
+      mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+      mmA = _mm_unpackhi_pi8(mmA, mmD);
+      mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
+
+      mmE = _mm_unpacklo_pi8(mmE, mmG);
+      mmD = _mm_unpackhi_pi8(mmD, mmG);
+      mmC = _mm_loadhi_pi8_f(mmA);
+      mmA = _mm_loadlo_pi8_f(mmA);
+
+      mmB = _mm_loadhi_pi8_f(mmE);
+      mmE = _mm_loadlo_pi8_f(mmE);
+
+      mmF = _mm_loadhi_pi8_f(mmD);
+      mmD = _mm_loadlo_pi8_f(mmD);
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+      if (num_cols < 8) {
+        col = num_cols;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 1\r\n"
+            "move     $9, %4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 1f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 1\r\n"
+            PTR_SLL   "$11, $9, 2\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $11\r\n"
+            "lwc1     %0, 0($13)\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 2f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 2\r\n"
+            PTR_SLL   "$11, $9, 2\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $11\r\n"
+            "mov.s    %1, %0\r\n"
+            "ldc1     %0, 0($13)\r\n"
+
+            "2:       \r\n"
+            "li       $8, 4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 3f\r\n"
+            "nop      \r\n"
+            "mov.s    %2, %0\r\n"
+            "mov.s    %3, %1\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "ldc1     %1, 8(%5)\r\n"
+
+            "3:       \r\n"
+            "nop      \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
+            : "r" (col), "r" (inptr)
+            : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
+           );
+      } else {
+        if (!(((long)inptr) & 7)) {
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmF = _mm_load_si64((__m64 *)&inptr[8]);
+          mmD = _mm_load_si64((__m64 *)&inptr[16]);
+          mmC = _mm_load_si64((__m64 *)&inptr[24]);
+        } else {
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
+          mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
+        }
+        inptr += RGB_PIXELSIZE * 8;
+      }
+      mmB = _mm_unpackhi_pi8(mmA, mmF);
+      mmA = _mm_unpacklo_pi8(mmA, mmF);
+
+      mmG = _mm_unpackhi_pi8(mmD, mmC);
+      mmD = _mm_unpacklo_pi8(mmD, mmC);
+
+      mmE = _mm_unpackhi_pi16(mmA, mmD);
+      mmA = _mm_unpacklo_pi16(mmA, mmD);
+
+      mmH = _mm_unpackhi_pi16(mmB, mmG);
+      mmB = _mm_unpacklo_pi16(mmB, mmG);
+
+      mmC = _mm_loadhi_pi8_f(mmA);
+      mmA = _mm_loadlo_pi8_f(mmA);
+
+      mmD = _mm_loadhi_pi8_f(mmB);
+      mmB = _mm_loadlo_pi8_f(mmB);
+
+      mmG = _mm_loadhi_pi8_f(mmE);
+      mmE = _mm_loadlo_pi8_f(mmE);
+
+      mmF = _mm_unpacklo_pi8(mmH, mmH);
+      mmH = _mm_unpackhi_pi8(mmH, mmH);
+      mmF = _mm_srli_pi16(mmF, BYTE_BIT);
+      mmH = _mm_srli_pi16(mmH, BYTE_BIT);
+
+#endif
+
+      /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
+       * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
+       *
+       * (Original)
+       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+       *
+       * (This implementation)
+       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+       */
+
+      rglo = _mm_unpacklo_pi16(ro, go);
+      rgho = _mm_unpackhi_pi16(ro, go);
+      ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);
+      yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
+      cblo = _mm_madd_pi16(rglo, PW_MF016_MF033);
+      cbho = _mm_madd_pi16(rgho, PW_MF016_MF033);
+
+      blo = _mm_loadlo_pi16_f(bo);
+      bho = _mm_loadhi_pi16_f(bo);
+      halfblo = _mm_srli_pi32(blo, 1);
+      halfbho = _mm_srli_pi32(bho, 1);
+
+      cblo = _mm_add_pi32(cblo, halfblo);
+      cbho = _mm_add_pi32(cbho, halfbho);
+      cblo = _mm_add_pi32(cblo, PD_ONEHALFM1_CJ);
+      cbho = _mm_add_pi32(cbho, PD_ONEHALFM1_CJ);
+      cblo = _mm_srli_pi32(cblo, SCALEBITS);
+      cbho = _mm_srli_pi32(cbho, SCALEBITS);
+      cbo = _mm_packs_pi32(cblo, cbho);
+
+      rgle = _mm_unpacklo_pi16(re, ge);
+      rghe = _mm_unpackhi_pi16(re, ge);
+      yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
+      yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
+      cble = _mm_madd_pi16(rgle, PW_MF016_MF033);
+      cbhe = _mm_madd_pi16(rghe, PW_MF016_MF033);
+
+      ble = _mm_loadlo_pi16_f(be);
+      bhe = _mm_loadhi_pi16_f(be);
+      halfble = _mm_srli_pi32(ble, 1);
+      halfbhe = _mm_srli_pi32(bhe, 1);
+
+      cble = _mm_add_pi32(cble, halfble);
+      cbhe = _mm_add_pi32(cbhe, halfbhe);
+      cble = _mm_add_pi32(cble, PD_ONEHALFM1_CJ);
+      cbhe = _mm_add_pi32(cbhe, PD_ONEHALFM1_CJ);
+      cble = _mm_srli_pi32(cble, SCALEBITS);
+      cbhe = _mm_srli_pi32(cbhe, SCALEBITS);
+      cbe = _mm_packs_pi32(cble, cbhe);
+
+      cbo = _mm_slli_pi16(cbo, BYTE_BIT);
+      cb = _mm_or_si64(cbe, cbo);
+
+      bglo = _mm_unpacklo_pi16(bo, go);
+      bgho = _mm_unpackhi_pi16(bo, go);
+      ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);
+      yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
+      crlo = _mm_madd_pi16(bglo, PW_MF008_MF041);
+      crho = _mm_madd_pi16(bgho, PW_MF008_MF041);
+
+      ylo = _mm_add_pi32(ylo_bg, ylo_rg);
+      yho = _mm_add_pi32(yho_bg, yho_rg);
+      ylo = _mm_add_pi32(ylo, PD_ONEHALF);
+      yho = _mm_add_pi32(yho, PD_ONEHALF);
+      ylo = _mm_srli_pi32(ylo, SCALEBITS);
+      yho = _mm_srli_pi32(yho, SCALEBITS);
+      yo = _mm_packs_pi32(ylo, yho);
+
+      rlo = _mm_loadlo_pi16_f(ro);
+      rho = _mm_loadhi_pi16_f(ro);
+      halfrlo = _mm_srli_pi32(rlo, 1);
+      halfrho = _mm_srli_pi32(rho, 1);
+
+      crlo = _mm_add_pi32(crlo, halfrlo);
+      crho = _mm_add_pi32(crho, halfrho);
+      crlo = _mm_add_pi32(crlo, PD_ONEHALFM1_CJ);
+      crho = _mm_add_pi32(crho, PD_ONEHALFM1_CJ);
+      crlo = _mm_srli_pi32(crlo, SCALEBITS);
+      crho = _mm_srli_pi32(crho, SCALEBITS);
+      cro = _mm_packs_pi32(crlo, crho);
+
+      bgle = _mm_unpacklo_pi16(be, ge);
+      bghe = _mm_unpackhi_pi16(be, ge);
+      yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
+      yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
+      crle = _mm_madd_pi16(bgle, PW_MF008_MF041);
+      crhe = _mm_madd_pi16(bghe, PW_MF008_MF041);
+
+      yle = _mm_add_pi32(yle_bg, yle_rg);
+      yhe = _mm_add_pi32(yhe_bg, yhe_rg);
+      yle = _mm_add_pi32(yle, PD_ONEHALF);
+      yhe = _mm_add_pi32(yhe, PD_ONEHALF);
+      yle = _mm_srli_pi32(yle, SCALEBITS);
+      yhe = _mm_srli_pi32(yhe, SCALEBITS);
+      ye = _mm_packs_pi32(yle, yhe);
+
+      yo = _mm_slli_pi16(yo, BYTE_BIT);
+      y = _mm_or_si64(ye, yo);
+
+      rle = _mm_loadlo_pi16_f(re);
+      rhe = _mm_loadhi_pi16_f(re);
+      halfrle = _mm_srli_pi32(rle, 1);
+      halfrhe = _mm_srli_pi32(rhe, 1);
+
+      crle = _mm_add_pi32(crle, halfrle);
+      crhe = _mm_add_pi32(crhe, halfrhe);
+      crle = _mm_add_pi32(crle, PD_ONEHALFM1_CJ);
+      crhe = _mm_add_pi32(crhe, PD_ONEHALFM1_CJ);
+      crle = _mm_srli_pi32(crle, SCALEBITS);
+      crhe = _mm_srli_pi32(crhe, SCALEBITS);
+      cre = _mm_packs_pi32(crle, crhe);
+
+      cro = _mm_slli_pi16(cro, BYTE_BIT);
+      cr = _mm_or_si64(cre, cro);
+
+      _mm_store_si64((__m64 *)&outptr0[0], y);
+      _mm_store_si64((__m64 *)&outptr1[0], cb);
+      _mm_store_si64((__m64 *)&outptr2[0], cr);
+    }
+  }
+}
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jccolor-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jccolor-mmi.c
new file mode 100644
index 0000000000..93ef5c79f7
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jccolor-mmi.c
@@ -0,0 +1,148 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2014, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> YCC CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_081  ((short)5329)                /* FIX(0.08131) */
+#define F_0_114  ((short)7471)                /* FIX(0.11400) */
+#define F_0_168  ((short)11059)               /* FIX(0.16874) */
+#define F_0_250  ((short)16384)               /* FIX(0.25000) */
+#define F_0_299  ((short)19595)               /* FIX(0.29900) */
+#define F_0_331  ((short)21709)               /* FIX(0.33126) */
+#define F_0_418  ((short)27439)               /* FIX(0.41869) */
+#define F_0_587  ((short)38470)               /* FIX(0.58700) */
+#define F_0_337  ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */
+
+enum const_index {
+  index_PD_ONEHALF,
+  index_PW_F0299_F0337,
+  index_PW_F0114_F0250,
+  index_PW_MF016_MF033,
+  index_PW_MF008_MF041,
+  index_PD_ONEHALFM1_CJ
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1))),
+  _uint64_set_pi16(F_0_337, F_0_299, F_0_337, F_0_299),
+  _uint64_set_pi16(F_0_250, F_0_114, F_0_250, F_0_114),
+  _uint64_set_pi16(-F_0_331, -F_0_168, -F_0_331, -F_0_168),
+  _uint64_set_pi16(-F_0_418, -F_0_081, -F_0_418, -F_0_081),
+  _uint64_set_pi32(((1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS)),
+                   ((1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS)))
+};
+
+#define get_const_value(index)  (*(__m64 *)&const_value[index])
+
+#define PD_ONEHALF       get_const_value(index_PD_ONEHALF)
+#define PW_F0299_F0337   get_const_value(index_PW_F0299_F0337)
+#define PW_F0114_F0250   get_const_value(index_PW_F0114_F0250)
+#define PW_MF016_MF033   get_const_value(index_PW_MF016_MF033)
+#define PW_MF008_MF041   get_const_value(index_PW_MF008_MF041)
+#define PD_ONEHALFM1_CJ  get_const_value(index_PD_ONEHALFM1_CJ)
+
+
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi  jsimd_extrgb_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi  jsimd_extrgbx_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi  jsimd_extbgr_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi  jsimd_extbgrx_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi  jsimd_extxbgr_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi  jsimd_extxrgb_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jcgray-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jcgray-mmi.c
new file mode 100644
index 0000000000..9c7b833f2e
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jcgray-mmi.c
@@ -0,0 +1,132 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2014, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> GRAYSCALE CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_114  ((short)7471)                /* FIX(0.11400) */
+#define F_0_250  ((short)16384)               /* FIX(0.25000) */
+#define F_0_299  ((short)19595)               /* FIX(0.29900) */
+#define F_0_587  ((short)38470)               /* FIX(0.58700) */
+#define F_0_337  ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */
+
+enum const_index {
+  index_PD_ONEHALF,
+  index_PW_F0299_F0337,
+  index_PW_F0114_F0250
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1))),
+  _uint64_set_pi16(F_0_337, F_0_299, F_0_337, F_0_299),
+  _uint64_set_pi16(F_0_250, F_0_114, F_0_250, F_0_114)
+};
+
+#define get_const_value(index)  (*(__m64 *)&const_value[index])
+
+#define PD_ONEHALF       get_const_value(index_PD_ONEHALF)
+#define PW_F0299_F0337   get_const_value(index_PW_F0299_F0337)
+#define PW_F0114_F0250   get_const_value(index_PW_F0114_F0250)
+
+
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extrgb_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extrgbx_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extbgr_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extbgrx_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extxbgr_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extxrgb_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jcgryext-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jcgryext-mmi.c
new file mode 100644
index 0000000000..08a83d6699
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jcgryext-mmi.c
@@ -0,0 +1,374 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2014-2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA  re
+#define mmB  ro
+#elif RGB_GREEN == 0
+#define mmA  ge
+#define mmB  go
+#elif RGB_BLUE == 0
+#define mmA  be
+#define mmB  bo
+#else
+#define mmA  xe
+#define mmB  xo
+#endif
+
+#if RGB_RED == 1
+#define mmC  re
+#define mmD  ro
+#elif RGB_GREEN == 1
+#define mmC  ge
+#define mmD  go
+#elif RGB_BLUE == 1
+#define mmC  be
+#define mmD  bo
+#else
+#define mmC  xe
+#define mmD  xo
+#endif
+
+#if RGB_RED == 2
+#define mmE  re
+#define mmF  ro
+#elif RGB_GREEN == 2
+#define mmE  ge
+#define mmF  go
+#elif RGB_BLUE == 2
+#define mmE  be
+#define mmF  bo
+#else
+#define mmE  xe
+#define mmF  xo
+#endif
+
+#if RGB_RED == 3
+#define mmG  re
+#define mmH  ro
+#elif RGB_GREEN == 3
+#define mmG  ge
+#define mmH  go
+#elif RGB_BLUE == 3
+#define mmG  be
+#define mmH  bo
+#else
+#define mmG  xe
+#define mmH  xo
+#endif
+
+
+void jsimd_rgb_gray_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
+                                JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                int num_rows)
+{
+  JSAMPROW inptr, outptr;
+  int num_cols, col;
+  __m64 re, ro, ge, go, be, bo, xe;
+#if RGB_PIXELSIZE == 4
+  __m64 xo;
+#endif
+  __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
+  __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
+  __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr = output_buf[0][output_row];
+    output_row++;
+
+    for (num_cols = image_width; num_cols > 0; num_cols -= 8,
+         outptr += 8) {
+
+#if RGB_PIXELSIZE == 3
+
+      if (num_cols < 8) {
+        col = num_cols * 3;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 1\r\n"
+            "move     $9, %3\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 1f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 1\r\n"
+            "xor      $12, $12, $12\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lbu      $12, 0($13)\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 2f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 2\r\n"
+            "xor      $11, $11, $11\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lhu      $11, 0($13)\r\n"
+            "sll      $12, $12, 16\r\n"
+            "or       $12, $12, $11\r\n"
+
+            "2:       \r\n"
+            "dmtc1    $12, %0\r\n"
+            "li       $8, 4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 3f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 4\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lwu      $14, 0($13)\r\n"
+            "dmtc1    $14, %1\r\n"
+            "dsll32   $12, $12, 0\r\n"
+            "or       $12, $12, $14\r\n"
+            "dmtc1    $12, %0\r\n"
+
+            "3:       \r\n"
+            "li       $8, 8\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 4f\r\n"
+            "nop      \r\n"
+            "mov.s    %1, %0\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "li       $9, 8\r\n"
+            "j        5f\r\n"
+            "nop      \r\n"
+
+            "4:       \r\n"
+            "li       $8, 16\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 5f\r\n"
+            "nop      \r\n"
+            "mov.s    %2, %0\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "ldc1     %1, 8(%5)\r\n"
+
+            "5:       \r\n"
+            "nop      \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmG), "=f" (mmF)
+            : "r" (col), "r" (num_rows), "r" (inptr)
+            : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
+              "$14", "memory"
+           );
+      } else {
+        if (!(((long)inptr) & 7)) {
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmG = _mm_load_si64((__m64 *)&inptr[8]);
+          mmF = _mm_load_si64((__m64 *)&inptr[16]);
+        } else {
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
+        }
+        inptr += RGB_PIXELSIZE * 8;
+      }
+      mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+      mmA = _mm_unpackhi_pi8(mmA, mmG);
+      mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
+
+      mmD = _mm_unpacklo_pi8(mmD, mmF);
+      mmG = _mm_unpackhi_pi8(mmG, mmF);
+
+      mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+      mmA = _mm_unpackhi_pi8(mmA, mmD);
+      mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
+
+      mmE = _mm_unpacklo_pi8(mmE, mmG);
+      mmD = _mm_unpackhi_pi8(mmD, mmG);
+      mmC = _mm_loadhi_pi8_f(mmA);
+      mmA = _mm_loadlo_pi8_f(mmA);
+
+      mmB = _mm_loadhi_pi8_f(mmE);
+      mmE = _mm_loadlo_pi8_f(mmE);
+
+      mmF = _mm_loadhi_pi8_f(mmD);
+      mmD = _mm_loadlo_pi8_f(mmD);
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+      if (num_cols < 8) {
+        col = num_cols;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 1\r\n"
+            "move     $9, %4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 1f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 1\r\n"
+            PTR_SLL   "$11, $9, 2\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $11\r\n"
+            "lwc1     %0, 0($13)\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 2f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 2\r\n"
+            PTR_SLL   "$11, $9, 2\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $11\r\n"
+            "mov.s    %1, %0\r\n"
+            "ldc1     %0, 0($13)\r\n"
+
+            "2:       \r\n"
+            "li       $8, 4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 3f\r\n"
+            "nop      \r\n"
+            "mov.s    %2, %0\r\n"
+            "mov.s    %3, %1\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "ldc1     %1, 8(%5)\r\n"
+
+            "3:       \r\n"
+            "nop      \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
+            : "r" (col), "r" (inptr)
+            : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
+           );
+      } else {
+        if (!(((long)inptr) & 7)) {
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmF = _mm_load_si64((__m64 *)&inptr[8]);
+          mmD = _mm_load_si64((__m64 *)&inptr[16]);
+          mmC = _mm_load_si64((__m64 *)&inptr[24]);
+        } else {
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
+          mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
+        }
+        inptr += RGB_PIXELSIZE * 8;
+      }
+      mmB = _mm_unpackhi_pi8(mmA, mmF);
+      mmA = _mm_unpacklo_pi8(mmA, mmF);
+
+      mmG = _mm_unpackhi_pi8(mmD, mmC);
+      mmD = _mm_unpacklo_pi8(mmD, mmC);
+
+      mmE = _mm_unpackhi_pi16(mmA, mmD);
+      mmA = _mm_unpacklo_pi16(mmA, mmD);
+
+      mmH = _mm_unpackhi_pi16(mmB, mmG);
+      mmB = _mm_unpacklo_pi16(mmB, mmG);
+
+      mmC = _mm_loadhi_pi8_f(mmA);
+      mmA = _mm_loadlo_pi8_f(mmA);
+
+      mmD = _mm_loadhi_pi8_f(mmB);
+      mmB = _mm_loadlo_pi8_f(mmB);
+
+      mmG = _mm_loadhi_pi8_f(mmE);
+      mmE = _mm_loadlo_pi8_f(mmE);
+
+      mmF = _mm_unpacklo_pi8(mmH, mmH);
+      mmH = _mm_unpackhi_pi8(mmH, mmH);
+      mmF = _mm_srli_pi16(mmF, BYTE_BIT);
+      mmH = _mm_srli_pi16(mmH, BYTE_BIT);
+
+#endif
+
+      /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
+       * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
+       *
+       * (Original)
+       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+       *
+       * (This implementation)
+       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+       */
+
+      rglo = _mm_unpacklo_pi16(ro, go);
+      rgho = _mm_unpackhi_pi16(ro, go);
+      ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);
+      yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
+
+      rgle = _mm_unpacklo_pi16(re, ge);
+      rghe = _mm_unpackhi_pi16(re, ge);
+      yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
+      yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
+
+      bglo = _mm_unpacklo_pi16(bo, go);
+      bgho = _mm_unpackhi_pi16(bo, go);
+      ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);
+      yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
+
+      ylo = _mm_add_pi32(ylo_bg, ylo_rg);
+      yho = _mm_add_pi32(yho_bg, yho_rg);
+      ylo = _mm_add_pi32(ylo, PD_ONEHALF);
+      yho = _mm_add_pi32(yho, PD_ONEHALF);
+      ylo = _mm_srli_pi32(ylo, SCALEBITS);
+      yho = _mm_srli_pi32(yho, SCALEBITS);
+      yo = _mm_packs_pi32(ylo, yho);
+
+      bgle = _mm_unpacklo_pi16(be, ge);
+      bghe = _mm_unpackhi_pi16(be, ge);
+      yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
+      yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
+
+      yle = _mm_add_pi32(yle_bg, yle_rg);
+      yhe = _mm_add_pi32(yhe_bg, yhe_rg);
+      yle = _mm_add_pi32(yle, PD_ONEHALF);
+      yhe = _mm_add_pi32(yhe, PD_ONEHALF);
+      yle = _mm_srli_pi32(yle, SCALEBITS);
+      yhe = _mm_srli_pi32(yhe, SCALEBITS);
+      ye = _mm_packs_pi32(yle, yhe);
+
+      yo = _mm_slli_pi16(yo, BYTE_BIT);
+      y = _mm_or_si64(ye, yo);
+
+      _mm_store_si64((__m64 *)&outptr[0], y);
+    }
+  }
+}
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jcsample-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jcsample-mmi.c
new file mode 100644
index 0000000000..0354dac087
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jcsample-mmi.c
@@ -0,0 +1,98 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, 2018-2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA DOWNSAMPLING */
+
+#include "jsimd_mmi.h"
+#include "jcsample.h"
+
+
+void jsimd_h2v2_downsample_mmi(JDIMENSION image_width, int max_v_samp_factor,
+                               JDIMENSION v_samp_factor,
+                               JDIMENSION width_in_blocks,
+                               JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  int inrow, outrow, outcol;
+  JDIMENSION output_cols = width_in_blocks * DCTSIZE;
+  JSAMPROW inptr0, inptr1, outptr;
+  __m64 bias, mask = 0.0, thisavg, nextavg, avg;
+  __m64 this0o, this0e, this0, this0sum, next0o, next0e, next0, next0sum;
+  __m64 this1o, this1e, this1, this1sum, next1o, next1e, next1, next1sum;
+
+  expand_right_edge(input_data, max_v_samp_factor, image_width,
+                    output_cols * 2);
+
+  bias = _mm_set1_pi32((1 << 17) + 1);   /* 0x00020001 (32-bit bias pattern) */
+                                         /* bias={1, 2, 1, 2} (16-bit) */
+  mask = _mm_cmpeq_pi16(mask, mask);
+  mask = _mm_srli_pi16(mask, BYTE_BIT);  /* {0xFF 0x00 0xFF 0x00 ..} */
+
+  for (inrow = 0, outrow = 0; outrow < v_samp_factor;
+       inrow += 2, outrow++) {
+
+    inptr0 = input_data[inrow];
+    inptr1 = input_data[inrow + 1];
+    outptr = output_data[outrow];
+
+    for (outcol = output_cols; outcol > 0;
+         outcol -= 8, inptr0 += 16, inptr1 += 16, outptr += 8) {
+
+      this0 = _mm_load_si64((__m64 *)&inptr0[0]);
+      this1 = _mm_load_si64((__m64 *)&inptr1[0]);
+      next0 = _mm_load_si64((__m64 *)&inptr0[8]);
+      next1 = _mm_load_si64((__m64 *)&inptr1[8]);
+
+      this0o = _mm_and_si64(this0, mask);
+      this0e = _mm_srli_pi16(this0, BYTE_BIT);
+      this1o = _mm_and_si64(this1, mask);
+      this1e = _mm_srli_pi16(this1, BYTE_BIT);
+      this0sum = _mm_add_pi16(this0o, this0e);
+      this1sum = _mm_add_pi16(this1o, this1e);
+
+      next0o = _mm_and_si64(next0, mask);
+      next0e = _mm_srli_pi16(next0, BYTE_BIT);
+      next1o = _mm_and_si64(next1, mask);
+      next1e = _mm_srli_pi16(next1, BYTE_BIT);
+      next0sum = _mm_add_pi16(next0o, next0e);
+      next1sum = _mm_add_pi16(next1o, next1e);
+
+      thisavg = _mm_add_pi16(this0sum, this1sum);
+      nextavg = _mm_add_pi16(next0sum, next1sum);
+      thisavg = _mm_add_pi16(thisavg, bias);
+      nextavg = _mm_add_pi16(nextavg, bias);
+      thisavg = _mm_srli_pi16(thisavg, 2);
+      nextavg = _mm_srli_pi16(nextavg, 2);
+
+      avg = _mm_packs_pu16(thisavg, nextavg);
+
+      _mm_store_si64((__m64 *)&outptr[0], avg);
+    }
+  }
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jcsample.h b/3rdparty/libjpeg-turbo/src/simd/mips64/jcsample.h
new file mode 100644
index 0000000000..bd07fcc4ed
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jcsample.h
@@ -0,0 +1,28 @@
+/*
+ * jcsample.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+LOCAL(void)
+expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
+                  JDIMENSION output_cols)
+{
+  register JSAMPROW ptr;
+  register JSAMPLE pixval;
+  register int count;
+  int row;
+  int numcols = (int)(output_cols - input_cols);
+
+  if (numcols > 0) {
+    for (row = 0; row < num_rows; row++) {
+      ptr = image_data[row] + input_cols;
+      pixval = ptr[-1];
+      for (count = numcols; count > 0; count--)
+        *ptr++ = pixval;
+    }
+  }
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jdcolext-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jdcolext-mmi.c
new file mode 100644
index 0000000000..3b5b2f2030
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jdcolext-mmi.c
@@ -0,0 +1,415 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA  re
+#define mmB  ro
+#elif RGB_GREEN == 0
+#define mmA  ge
+#define mmB  go
+#elif RGB_BLUE == 0
+#define mmA  be
+#define mmB  bo
+#else
+#define mmA  xe
+#define mmB  xo
+#endif
+
+#if RGB_RED == 1
+#define mmC  re
+#define mmD  ro
+#elif RGB_GREEN == 1
+#define mmC  ge
+#define mmD  go
+#elif RGB_BLUE == 1
+#define mmC  be
+#define mmD  bo
+#else
+#define mmC  xe
+#define mmD  xo
+#endif
+
+#if RGB_RED == 2
+#define mmE  re
+#define mmF  ro
+#elif RGB_GREEN == 2
+#define mmE  ge
+#define mmF  go
+#elif RGB_BLUE == 2
+#define mmE  be
+#define mmF  bo
+#else
+#define mmE  xe
+#define mmF  xo
+#endif
+
+#if RGB_RED == 3
+#define mmG  re
+#define mmH  ro
+#elif RGB_GREEN == 3
+#define mmG  ge
+#define mmH  go
+#elif RGB_BLUE == 3
+#define mmG  be
+#define mmH  bo
+#else
+#define mmG  xe
+#define mmH  xo
+#endif
+
+
+void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
+                               JDIMENSION input_row, JSAMPARRAY output_buf,
+                               int num_rows)
+{
+  JSAMPROW outptr, inptr0, inptr1, inptr2;
+  int num_cols, col;
+  __m64 ye, yo, y, cbe, cbe2, cbo, cbo2, cb, cre, cre2, cro, cro2, cr;
+  __m64 re, ro, gle, ghe, ge, glo, gho, go, be, bo, xe = 0.0, xo = 0.0;
+  __m64 decenter, mask;
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+
+    for (num_cols = out_width; num_cols > 0; num_cols -= 8,
+         inptr0 += 8, inptr1 += 8, inptr2 += 8) {
+
+      cb = _mm_load_si64((__m64 *)inptr1);
+      cr = _mm_load_si64((__m64 *)inptr2);
+      y = _mm_load_si64((__m64 *)inptr0);
+
+      mask = decenter = 0.0;
+      mask = _mm_cmpeq_pi16(mask, mask);
+      decenter = _mm_cmpeq_pi16(decenter, decenter);
+      mask = _mm_srli_pi16(mask, BYTE_BIT);   /* {0xFF 0x00 0xFF 0x00 ..} */
+      decenter = _mm_slli_pi16(decenter, 7);  /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+      cbe = _mm_and_si64(mask, cb);           /* Cb(0246) */
+      cbo = _mm_srli_pi16(cb, BYTE_BIT);      /* Cb(1357) */
+      cre = _mm_and_si64(mask, cr);           /* Cr(0246) */
+      cro = _mm_srli_pi16(cr, BYTE_BIT);      /* Cr(1357) */
+      cbe = _mm_add_pi16(cbe, decenter);
+      cbo = _mm_add_pi16(cbo, decenter);
+      cre = _mm_add_pi16(cre, decenter);
+      cro = _mm_add_pi16(cro, decenter);
+
+      /* (Original)
+       * R = Y                + 1.40200 * Cr
+       * G = Y - 0.34414 * Cb - 0.71414 * Cr
+       * B = Y + 1.77200 * Cb
+       *
+       * (This implementation)
+       * R = Y                + 0.40200 * Cr + Cr
+       * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+       * B = Y - 0.22800 * Cb + Cb + Cb
+       */
+
+      cbe2 = _mm_add_pi16(cbe, cbe);          /* 2*CbE */
+      cbo2 = _mm_add_pi16(cbo, cbo);          /* 2*CbO */
+      cre2 = _mm_add_pi16(cre, cre);          /* 2*CrE */
+      cro2 = _mm_add_pi16(cro, cro);          /* 2*CrO */
+
+      be = _mm_mulhi_pi16(cbe2, PW_MF0228);   /* (2*CbE * -FIX(0.22800) */
+      bo = _mm_mulhi_pi16(cbo2, PW_MF0228);   /* (2*CbO * -FIX(0.22800) */
+      re = _mm_mulhi_pi16(cre2, PW_F0402);    /* (2*CrE * FIX(0.40200)) */
+      ro = _mm_mulhi_pi16(cro2, PW_F0402);    /* (2*CrO * FIX(0.40200)) */
+
+      be = _mm_add_pi16(be, PW_ONE);
+      bo = _mm_add_pi16(bo, PW_ONE);
+      be = _mm_srai_pi16(be, 1);              /* (CbE * -FIX(0.22800)) */
+      bo = _mm_srai_pi16(bo, 1);              /* (CbO * -FIX(0.22800)) */
+      re = _mm_add_pi16(re, PW_ONE);
+      ro = _mm_add_pi16(ro, PW_ONE);
+      re = _mm_srai_pi16(re, 1);              /* (CrE * FIX(0.40200)) */
+      ro = _mm_srai_pi16(ro, 1);              /* (CrO * FIX(0.40200)) */
+
+      be = _mm_add_pi16(be, cbe);
+      bo = _mm_add_pi16(bo, cbo);
+      be = _mm_add_pi16(be, cbe);             /* (CbE * FIX(1.77200))=(B-Y)E */
+      bo = _mm_add_pi16(bo, cbo);             /* (CbO * FIX(1.77200))=(B-Y)O */
+      re = _mm_add_pi16(re, cre);             /* (CrE * FIX(1.40200))=(R-Y)E */
+      ro = _mm_add_pi16(ro, cro);             /* (CrO * FIX(1.40200))=(R-Y)O */
+
+      gle = _mm_unpacklo_pi16(cbe, cre);
+      ghe = _mm_unpackhi_pi16(cbe, cre);
+      gle = _mm_madd_pi16(gle, PW_MF0344_F0285);
+      ghe = _mm_madd_pi16(ghe, PW_MF0344_F0285);
+      glo = _mm_unpacklo_pi16(cbo, cro);
+      gho = _mm_unpackhi_pi16(cbo, cro);
+      glo = _mm_madd_pi16(glo, PW_MF0344_F0285);
+      gho = _mm_madd_pi16(gho, PW_MF0344_F0285);
+
+      gle = _mm_add_pi32(gle, PD_ONEHALF);
+      ghe = _mm_add_pi32(ghe, PD_ONEHALF);
+      gle = _mm_srai_pi32(gle, SCALEBITS);
+      ghe = _mm_srai_pi32(ghe, SCALEBITS);
+      glo = _mm_add_pi32(glo, PD_ONEHALF);
+      gho = _mm_add_pi32(gho, PD_ONEHALF);
+      glo = _mm_srai_pi32(glo, SCALEBITS);
+      gho = _mm_srai_pi32(gho, SCALEBITS);
+
+      ge = _mm_packs_pi32(gle, ghe);       /* CbE*-FIX(0.344)+CrE*FIX(0.285) */
+      go = _mm_packs_pi32(glo, gho);       /* CbO*-FIX(0.344)+CrO*FIX(0.285) */
+      ge = _mm_sub_pi16(ge, cre);  /* CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */
+      go = _mm_sub_pi16(go, cro);  /* CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */
+
+      ye = _mm_and_si64(mask, y);             /* Y(0246) */
+      yo = _mm_srli_pi16(y, BYTE_BIT);        /* Y(1357) */
+
+      re = _mm_add_pi16(re, ye);              /* ((R-Y)E+YE)=(R0 R2 R4 R6) */
+      ro = _mm_add_pi16(ro, yo);              /* ((R-Y)O+YO)=(R1 R3 R5 R7) */
+      re = _mm_packs_pu16(re, re);            /* (R0 R2 R4 R6 ** ** ** **) */
+      ro = _mm_packs_pu16(ro, ro);            /* (R1 R3 R5 R7 ** ** ** **) */
+
+      ge = _mm_add_pi16(ge, ye);              /* ((G-Y)E+YE)=(G0 G2 G4 G6) */
+      go = _mm_add_pi16(go, yo);              /* ((G-Y)O+YO)=(G1 G3 G5 G7) */
+      ge = _mm_packs_pu16(ge, ge);            /* (G0 G2 G4 G6 ** ** ** **) */
+      go = _mm_packs_pu16(go, go);            /* (G1 G3 G5 G7 ** ** ** **) */
+
+      be = _mm_add_pi16(be, ye);              /* (YE+(B-Y)E)=(B0 B2 B4 B6) */
+      bo = _mm_add_pi16(bo, yo);              /* (YO+(B-Y)O)=(B1 B3 B5 B7) */
+      be = _mm_packs_pu16(be, be);            /* (B0 B2 B4 B6 ** ** ** **) */
+      bo = _mm_packs_pu16(bo, bo);            /* (B1 B3 B5 B7 ** ** ** **) */
+
+#if RGB_PIXELSIZE == 3
+
+      /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
+      /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
+      mmA = _mm_unpacklo_pi8(mmA, mmC);       /* (00 10 02 12 04 14 06 16) */
+      mmE = _mm_unpacklo_pi8(mmE, mmB);       /* (20 01 22 03 24 05 26 07) */
+      mmD = _mm_unpacklo_pi8(mmD, mmF);       /* (11 21 13 23 15 25 17 27) */
+
+      mmH = _mm_srli_si64(mmA, 2 * BYTE_BIT);
+
+      mmG = _mm_unpackhi_pi16(mmA, mmE);      /* (04 14 24 05 06 16 26 07) */
+      mmA = _mm_unpacklo_pi16(mmA, mmE);      /* (00 10 20 01 02 12 22 03) */
+
+      mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT);
+      mmB = _mm_srli_si64(mmD, 2 * BYTE_BIT);  /* (13 23 15 25 17 27 -- --) */
+
+      mmC = _mm_unpackhi_pi16(mmD, mmH);      /* (15 25 06 16 17 27 -- --) */
+      mmD = _mm_unpacklo_pi16(mmD, mmH);      /* (11 21 02 12 13 23 04 14) */
+
+      mmF = _mm_unpackhi_pi16(mmE, mmB);      /* (26 07 17 27 -- -- -- --) */
+      mmE = _mm_unpacklo_pi16(mmE, mmB);      /* (22 03 13 23 24 05 15 25) */
+
+      mmA = _mm_unpacklo_pi32(mmA, mmD);      /* (00 10 20 01 11 21 02 12) */
+      mmE = _mm_unpacklo_pi32(mmE, mmG);      /* (22 03 13 23 04 14 24 05) */
+      mmC = _mm_unpacklo_pi32(mmC, mmF);      /* (15 25 06 16 26 07 17 27) */
+
+      if (num_cols >= 8) {
+        if (!(((long)outptr) & 7)) {
+          _mm_store_si64((__m64 *)outptr, mmA);
+          _mm_store_si64((__m64 *)(outptr + 8), mmE);
+          _mm_store_si64((__m64 *)(outptr + 16), mmC);
+        } else {
+          _mm_storeu_si64((__m64 *)outptr, mmA);
+          _mm_storeu_si64((__m64 *)(outptr + 8), mmE);
+          _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+        }
+        outptr += RGB_PIXELSIZE * 8;
+      } else {
+        col = num_cols * 3;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 16\r\n"
+            "move     $9, %4\r\n"
+            "mov.s    $f4, %1\r\n"
+            "mov.s    $f6, %3\r\n"
+            "move     $10, %5\r\n"
+            "bltu     $9, $8, 1f\r\n"
+            "nop      \r\n"
+            "gssdlc1  $f4, 7($10)\r\n"
+            "gssdrc1  $f4, 0($10)\r\n"
+            "gssdlc1  $f6, 7+8($10)\r\n"
+            "gssdrc1  $f6, 8($10)\r\n"
+            "mov.s    $f4, %2\r\n"
+            "subu     $9, $9, 16\r\n"
+            PTR_ADDU  "$10, $10, 16\r\n"
+            "b        2f\r\n"
+            "nop      \r\n"
+
+            "1:       \r\n"
+            "li       $8, 8\r\n"              /* st8 */
+            "bltu     $9, $8, 2f\r\n"
+            "nop      \r\n"
+            "gssdlc1  $f4, 7($10)\r\n"
+            "gssdrc1  $f4, 0($10)\r\n"
+            "mov.s    $f4, %3\r\n"
+            "subu     $9, $9, 8\r\n"
+            PTR_ADDU  "$10, $10, 8\r\n"
+
+            "2:       \r\n"
+            "li       $8, 4\r\n"              /* st4 */
+            "mfc1     $11, $f4\r\n"
+            "bltu     $9, $8, 3f\r\n"
+            "nop      \r\n"
+            "swl      $11, 3($10)\r\n"
+            "swr      $11, 0($10)\r\n"
+            "li       $8, 32\r\n"
+            "mtc1     $8, $f6\r\n"
+            "dsrl     $f4, $f4, $f6\r\n"
+            "mfc1     $11, $f4\r\n"
+            "subu     $9, $9, 4\r\n"
+            PTR_ADDU  "$10, $10, 4\r\n"
+
+            "3:       \r\n"
+            "li       $8, 2\r\n"              /* st2 */
+            "bltu     $9, $8, 4f\r\n"
+            "nop      \r\n"
+            "ush      $11, 0($10)\r\n"
+            "srl      $11, 16\r\n"
+            "subu     $9, $9, 2\r\n"
+            PTR_ADDU  "$10, $10, 2\r\n"
+
+            "4:       \r\n"
+            "li       $8, 1\r\n"              /* st1 */
+            "bltu     $9, $8, 5f\r\n"
+            "nop      \r\n"
+            "sb       $11, 0($10)\r\n"
+
+            "5:       \r\n"
+            "nop      \r\n"                   /* end */
+            : "=m" (*outptr)
+            : "f" (mmA), "f" (mmC), "f" (mmE), "r" (col), "r" (outptr)
+            : "$f4", "$f6", "$8", "$9", "$10", "$11", "memory"
+           );
+      }
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+      xe = _mm_cmpeq_pi8(xe, xe);
+      xo = _mm_cmpeq_pi8(xo, xo);
+#else
+      xe = _mm_xor_si64(xe, xe);
+      xo = _mm_xor_si64(xo, xo);
+#endif
+      /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
+      /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
+      /* mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) */
+      /* mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) */
+
+      mmA = _mm_unpacklo_pi8(mmA, mmC);       /* (00 10 02 12 04 14 06 16) */
+      mmE = _mm_unpacklo_pi8(mmE, mmG);       /* (20 30 22 32 24 34 26 36) */
+      mmB = _mm_unpacklo_pi8(mmB, mmD);       /* (01 11 03 13 05 15 07 17) */
+      mmF = _mm_unpacklo_pi8(mmF, mmH);       /* (21 31 23 33 25 35 27 37) */
+
+      mmC = _mm_unpackhi_pi16(mmA, mmE);      /* (04 14 24 34 06 16 26 36) */
+      mmA = _mm_unpacklo_pi16(mmA, mmE);      /* (00 10 20 30 02 12 22 32) */
+      mmG = _mm_unpackhi_pi16(mmB, mmF);      /* (05 15 25 35 07 17 27 37) */
+      mmB = _mm_unpacklo_pi16(mmB, mmF);      /* (01 11 21 31 03 13 23 33) */
+
+      mmD = _mm_unpackhi_pi32(mmA, mmB);      /* (02 12 22 32 03 13 23 33) */
+      mmA = _mm_unpacklo_pi32(mmA, mmB);      /* (00 10 20 30 01 11 21 31) */
+      mmH = _mm_unpackhi_pi32(mmC, mmG);      /* (06 16 26 36 07 17 27 37) */
+      mmC = _mm_unpacklo_pi32(mmC, mmG);      /* (04 14 24 34 05 15 25 35) */
+
+      if (num_cols >= 8) {
+        if (!(((long)outptr) & 7)) {
+          _mm_store_si64((__m64 *)outptr, mmA);
+          _mm_store_si64((__m64 *)(outptr + 8), mmD);
+          _mm_store_si64((__m64 *)(outptr + 16), mmC);
+          _mm_store_si64((__m64 *)(outptr + 24), mmH);
+        } else {
+          _mm_storeu_si64((__m64 *)outptr, mmA);
+          _mm_storeu_si64((__m64 *)(outptr + 8), mmD);
+          _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+          _mm_storeu_si64((__m64 *)(outptr + 24), mmH);
+        }
+        outptr += RGB_PIXELSIZE * 8;
+      } else {
+        col = num_cols;
+        asm(".set noreorder\r\n"              /* st16 */
+
+            "li       $8, 4\r\n"
+            "move     $9, %6\r\n"
+            "move     $10, %7\r\n"
+            "mov.s    $f4, %2\r\n"
+            "mov.s    $f6, %4\r\n"
+            "bltu     $9, $8, 1f\r\n"
+            "nop      \r\n"
+            "gssdlc1  $f4, 7($10)\r\n"
+            "gssdrc1  $f4, 0($10)\r\n"
+            "gssdlc1  $f6, 7+8($10)\r\n"
+            "gssdrc1  $f6, 8($10)\r\n"
+            "mov.s    $f4, %3\r\n"
+            "mov.s    $f6, %5\r\n"
+            "subu     $9, $9, 4\r\n"
+            PTR_ADDU  "$10, $10, 16\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"              /* st8 */
+            "bltu     $9, $8, 2f\r\n"
+            "nop      \r\n"
+            "gssdlc1  $f4, 7($10)\r\n"
+            "gssdrc1  $f4, 0($10)\r\n"
+            "mov.s    $f4, $f6\r\n"
+            "subu     $9, $9, 2\r\n"
+            PTR_ADDU  "$10, $10, 8\r\n"
+
+            "2:       \r\n"
+            "li       $8, 1\r\n"              /* st4 */
+            "bltu     $9, $8, 3f\r\n"
+            "nop      \r\n"
+            "gsswlc1  $f4, 3($10)\r\n"
+            "gsswrc1  $f4, 0($10)\r\n"
+
+            "3:       \r\n"
+            "li       %1, 0\r\n"              /* end */
+            : "=m" (*outptr), "=r" (col)
+            : "f" (mmA), "f" (mmC), "f" (mmD), "f" (mmH), "r" (col),
+              "r" (outptr)
+            : "$f4", "$f6", "$8", "$9", "$10", "memory"
+           );
+      }
+
+#endif
+
+    }
+  }
+}
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jdcolor-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jdcolor-mmi.c
new file mode 100644
index 0000000000..2c58263dbd
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jdcolor-mmi.c
@@ -0,0 +1,139 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_344  ((short)22554)  /* FIX(0.34414) */
+#define F_0_402  ((short)26345)  /* FIX(1.40200) - FIX(1) */
+#define F_0_285  ((short)18734)  /* FIX(1) - FIX(0.71414) */
+#define F_0_228  ((short)14942)  /* FIX(2) - FIX(1.77200) */
+
+enum const_index {
+  index_PW_ONE,
+  index_PW_F0402,
+  index_PW_MF0228,
+  index_PW_MF0344_F0285,
+  index_PD_ONEHALF
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi16(1, 1, 1, 1),
+  _uint64_set_pi16(F_0_402, F_0_402, F_0_402, F_0_402),
+  _uint64_set_pi16(-F_0_228, -F_0_228, -F_0_228, -F_0_228),
+  _uint64_set_pi16(F_0_285, -F_0_344, F_0_285, -F_0_344),
+  _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1)))
+};
+
+#define PW_ONE           get_const_value(index_PW_ONE)
+#define PW_F0402         get_const_value(index_PW_F0402)
+#define PW_MF0228        get_const_value(index_PW_MF0228)
+#define PW_MF0344_F0285  get_const_value(index_PW_MF0344_F0285)
+#define PD_ONEHALF       get_const_value(index_PD_ONEHALF)
+
+#define RGBX_FILLER_0XFF  1
+
+
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi  jsimd_ycc_extrgb_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi  jsimd_ycc_extrgbx_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi  jsimd_ycc_extbgr_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi  jsimd_ycc_extbgrx_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi  jsimd_ycc_extxbgr_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi  jsimd_ycc_extxrgb_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jdmerge-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jdmerge-mmi.c
new file mode 100644
index 0000000000..0a39bd5680
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jdmerge-mmi.c
@@ -0,0 +1,149 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_344  ((short)22554)  /* FIX(0.34414) */
+#define F_0_402  ((short)26345)  /* FIX(1.40200) - FIX(1) */
+#define F_0_285  ((short)18734)  /* FIX(1) - FIX(0.71414) */
+#define F_0_228  ((short)14942)  /* FIX(2) - FIX(1.77200) */
+
+enum const_index {
+  index_PW_ONE,
+  index_PW_F0402,
+  index_PW_MF0228,
+  index_PW_MF0344_F0285,
+  index_PD_ONEHALF
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi16(1, 1, 1, 1),
+  _uint64_set_pi16(F_0_402, F_0_402, F_0_402, F_0_402),
+  _uint64_set_pi16(-F_0_228, -F_0_228, -F_0_228, -F_0_228),
+  _uint64_set_pi16(F_0_285, -F_0_344, F_0_285, -F_0_344),
+  _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1)))
+};
+
+#define PW_ONE           get_const_value(index_PW_ONE)
+#define PW_F0402         get_const_value(index_PW_F0402)
+#define PW_MF0228        get_const_value(index_PW_MF0228)
+#define PW_MF0344_F0285  get_const_value(index_PW_MF0344_F0285)
+#define PD_ONEHALF       get_const_value(index_PD_ONEHALF)
+
+#define RGBX_FILLER_0XFF  1
+
+
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extrgb_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extrgb_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extrgbx_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extrgbx_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extbgr_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extbgr_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extbgrx_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extbgrx_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extxbgr_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extxbgr_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extxrgb_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extxrgb_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jdmrgext-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jdmrgext-mmi.c
new file mode 100644
index 0000000000..be09ff2a65
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jdmrgext-mmi.c
@@ -0,0 +1,615 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA  re
+#define mmB  ro
+#elif RGB_GREEN == 0
+#define mmA  ge
+#define mmB  go
+#elif RGB_BLUE == 0
+#define mmA  be
+#define mmB  bo
+#else
+#define mmA  xe
+#define mmB  xo
+#endif
+
+#if RGB_RED == 1
+#define mmC  re
+#define mmD  ro
+#elif RGB_GREEN == 1
+#define mmC  ge
+#define mmD  go
+#elif RGB_BLUE == 1
+#define mmC  be
+#define mmD  bo
+#else
+#define mmC  xe
+#define mmD  xo
+#endif
+
+#if RGB_RED == 2
+#define mmE  re
+#define mmF  ro
+#elif RGB_GREEN == 2
+#define mmE  ge
+#define mmF  go
+#elif RGB_BLUE == 2
+#define mmE  be
+#define mmF  bo
+#else
+#define mmE  xe
+#define mmF  xo
+#endif
+
+#if RGB_RED == 3
+#define mmG  re
+#define mmH  ro
+#elif RGB_GREEN == 3
+#define mmG  ge
+#define mmH  go
+#elif RGB_BLUE == 3
+#define mmG  be
+#define mmH  bo
+#else
+#define mmG  xe
+#define mmH  xo
+#endif
+
+
+void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
+                                    JSAMPIMAGE input_buf,
+                                    JDIMENSION in_row_group_ctr,
+                                    JSAMPARRAY output_buf)
+{
+  JSAMPROW outptr, inptr0, inptr1, inptr2;
+  int num_cols, col;
+  __m64 ythise, ythiso, ythis, ynexte, ynexto, ynext, yl, y;
+  __m64 cbl, cbl2, cbh, cbh2, cb, crl, crl2, crh, crh2, cr;
+  __m64 rle, rlo, rl, rhe, rho, rh, re, ro;
+  __m64 ga, gb, gle, glo, gl, gc, gd, ghe, gho, gh, ge, go;
+  __m64 ble, blo, bl, bhe, bho, bh, be, bo, xe = 0.0, xo = 0.0;
+  __m64 decenter, mask, zero = 0.0;
+#if RGB_PIXELSIZE == 4
+  __m64 mm8, mm9;
+#endif
+
+  inptr0 = input_buf[0][in_row_group_ctr];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr = output_buf[0];
+
+  for (num_cols = output_width >> 1; num_cols > 0; num_cols -= 8,
+       inptr0 += 16, inptr1 += 8, inptr2 += 8) {
+
+    cb = _mm_load_si64((__m64 *)inptr1);
+    cr = _mm_load_si64((__m64 *)inptr2);
+    ythis = _mm_load_si64((__m64 *)inptr0);
+    ynext = _mm_load_si64((__m64 *)inptr0 + 1);
+
+    mask = decenter = 0.0;
+    mask = _mm_cmpeq_pi16(mask, mask);
+    decenter = _mm_cmpeq_pi16(decenter, decenter);
+    mask = _mm_srli_pi16(mask, BYTE_BIT);   /* {0xFF 0x00 0xFF 0x00 ..} */
+    decenter = _mm_slli_pi16(decenter, 7);  /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+    cbl = _mm_unpacklo_pi8(cb, zero);         /* Cb(0123) */
+    cbh = _mm_unpackhi_pi8(cb, zero);         /* Cb(4567) */
+    crl = _mm_unpacklo_pi8(cr, zero);         /* Cr(0123) */
+    crh = _mm_unpackhi_pi8(cr, zero);         /* Cr(4567) */
+    cbl = _mm_add_pi16(cbl, decenter);
+    cbh = _mm_add_pi16(cbh, decenter);
+    crl = _mm_add_pi16(crl, decenter);
+    crh = _mm_add_pi16(crh, decenter);
+
+    /* (Original)
+     * R = Y                + 1.40200 * Cr
+     * G = Y - 0.34414 * Cb - 0.71414 * Cr
+     * B = Y + 1.77200 * Cb
+     *
+     * (This implementation)
+     * R = Y                + 0.40200 * Cr + Cr
+     * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+     * B = Y - 0.22800 * Cb + Cb + Cb
+     */
+
+    cbl2 = _mm_add_pi16(cbl, cbl);            /* 2*CbL */
+    cbh2 = _mm_add_pi16(cbh, cbh);            /* 2*CbH */
+    crl2 = _mm_add_pi16(crl, crl);            /* 2*CrL */
+    crh2 = _mm_add_pi16(crh, crh);            /* 2*CrH */
+
+    bl = _mm_mulhi_pi16(cbl2, PW_MF0228);     /* (2*CbL * -FIX(0.22800) */
+    bh = _mm_mulhi_pi16(cbh2, PW_MF0228);     /* (2*CbH * -FIX(0.22800) */
+    rl = _mm_mulhi_pi16(crl2, PW_F0402);      /* (2*CrL * FIX(0.40200)) */
+    rh = _mm_mulhi_pi16(crh2, PW_F0402);      /* (2*CrH * FIX(0.40200)) */
+
+    bl = _mm_add_pi16(bl, PW_ONE);
+    bh = _mm_add_pi16(bh, PW_ONE);
+    bl = _mm_srai_pi16(bl, 1);                /* (CbL * -FIX(0.22800)) */
+    bh = _mm_srai_pi16(bh, 1);                /* (CbH * -FIX(0.22800)) */
+    rl = _mm_add_pi16(rl, PW_ONE);
+    rh = _mm_add_pi16(rh, PW_ONE);
+    rl = _mm_srai_pi16(rl, 1);                /* (CrL * FIX(0.40200)) */
+    rh = _mm_srai_pi16(rh, 1);                /* (CrH * FIX(0.40200)) */
+
+    bl = _mm_add_pi16(bl, cbl);
+    bh = _mm_add_pi16(bh, cbh);
+    bl = _mm_add_pi16(bl, cbl);               /* (CbL * FIX(1.77200))=(B-Y)L */
+    bh = _mm_add_pi16(bh, cbh);               /* (CbH * FIX(1.77200))=(B-Y)H */
+    rl = _mm_add_pi16(rl, crl);               /* (CrL * FIX(1.40200))=(R-Y)L */
+    rh = _mm_add_pi16(rh, crh);               /* (CrH * FIX(1.40200))=(R-Y)H */
+
+    ga = _mm_unpacklo_pi16(cbl, crl);
+    gb = _mm_unpackhi_pi16(cbl, crl);
+    ga = _mm_madd_pi16(ga, PW_MF0344_F0285);
+    gb = _mm_madd_pi16(gb, PW_MF0344_F0285);
+    gc = _mm_unpacklo_pi16(cbh, crh);
+    gd = _mm_unpackhi_pi16(cbh, crh);
+    gc = _mm_madd_pi16(gc, PW_MF0344_F0285);
+    gd = _mm_madd_pi16(gd, PW_MF0344_F0285);
+
+    ga = _mm_add_pi32(ga, PD_ONEHALF);
+    gb = _mm_add_pi32(gb, PD_ONEHALF);
+    ga = _mm_srai_pi32(ga, SCALEBITS);
+    gb = _mm_srai_pi32(gb, SCALEBITS);
+    gc = _mm_add_pi32(gc, PD_ONEHALF);
+    gd = _mm_add_pi32(gd, PD_ONEHALF);
+    gc = _mm_srai_pi32(gc, SCALEBITS);
+    gd = _mm_srai_pi32(gd, SCALEBITS);
+
+    gl = _mm_packs_pi32(ga, gb);           /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
+    gh = _mm_packs_pi32(gc, gd);           /* CbH*-FIX(0.344)+CrH*FIX(0.285) */
+    gl = _mm_sub_pi16(gl, crl);    /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
+    gh = _mm_sub_pi16(gh, crh);    /* CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H */
+
+    ythise = _mm_and_si64(mask, ythis);       /* Y(0246) */
+    ythiso = _mm_srli_pi16(ythis, BYTE_BIT);  /* Y(1357) */
+    ynexte = _mm_and_si64(mask, ynext);       /* Y(8ACE) */
+    ynexto = _mm_srli_pi16(ynext, BYTE_BIT);  /* Y(9BDF) */
+
+    rle = _mm_add_pi16(rl, ythise);           /* (R0 R2 R4 R6) */
+    rlo = _mm_add_pi16(rl, ythiso);           /* (R1 R3 R5 R7) */
+    rhe = _mm_add_pi16(rh, ynexte);           /* (R8 RA RC RE) */
+    rho = _mm_add_pi16(rh, ynexto);           /* (R9 RB RD RF) */
+    re = _mm_packs_pu16(rle, rhe);            /* (R0 R2 R4 R6 R8 RA RC RE) */
+    ro = _mm_packs_pu16(rlo, rho);            /* (R1 R3 R5 R7 R9 RB RD RF) */
+
+    gle = _mm_add_pi16(gl, ythise);           /* (G0 G2 G4 G6) */
+    glo = _mm_add_pi16(gl, ythiso);           /* (G1 G3 G5 G7) */
+    ghe = _mm_add_pi16(gh, ynexte);           /* (G8 GA GC GE) */
+    gho = _mm_add_pi16(gh, ynexto);           /* (G9 GB GD GF) */
+    ge = _mm_packs_pu16(gle, ghe);            /* (G0 G2 G4 G6 G8 GA GC GE) */
+    go = _mm_packs_pu16(glo, gho);            /* (G1 G3 G5 G7 G9 GB GD GF) */
+
+    ble = _mm_add_pi16(bl, ythise);           /* (B0 B2 B4 B6) */
+    blo = _mm_add_pi16(bl, ythiso);           /* (B1 B3 B5 B7) */
+    bhe = _mm_add_pi16(bh, ynexte);           /* (B8 BA BC BE) */
+    bho = _mm_add_pi16(bh, ynexto);           /* (B9 BB BD BF) */
+    be = _mm_packs_pu16(ble, bhe);            /* (B0 B2 B4 B6 B8 BA BC BE) */
+    bo = _mm_packs_pu16(blo, bho);            /* (B1 B3 B5 B7 B9 BB BD BF) */
+
+#if RGB_PIXELSIZE == 3
+
+    /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
+    /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
+    /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
+    mmG = _mm_unpacklo_pi8(mmA, mmC);         /* (00 10 02 12 04 14 06 16) */
+    mmA = _mm_unpackhi_pi8(mmA, mmC);         /* (08 18 0A 1A 0C 1C 0E 1E) */
+    mmH = _mm_unpacklo_pi8(mmE, mmB);         /* (20 01 22 03 24 05 26 07) */
+    mmE = _mm_unpackhi_pi8(mmE, mmB);         /* (28 09 2A 0B 2C 0D 2E 0F) */
+    mmC = _mm_unpacklo_pi8(mmD, mmF);         /* (11 21 13 23 15 25 17 27) */
+    mmD = _mm_unpackhi_pi8(mmD, mmF);         /* (19 29 1B 2B 1D 2D 1F 2F) */
+
+    mmB = _mm_unpacklo_pi16(mmG, mmA);        /* (00 10 08 18 02 12 0A 1A) */
+    mmA = _mm_unpackhi_pi16(mmG, mmA);        /* (04 14 0C 1C 06 16 0E 1E) */
+    mmF = _mm_unpacklo_pi16(mmH, mmE);        /* (20 01 28 09 22 03 2A 0B) */
+    mmE = _mm_unpackhi_pi16(mmH, mmE);        /* (24 05 2C 0D 26 07 2E 0F) */
+    mmH = _mm_unpacklo_pi16(mmC, mmD);        /* (11 21 19 29 13 23 1B 2B) */
+    mmG = _mm_unpackhi_pi16(mmC, mmD);        /* (15 25 1D 2D 17 27 1F 2F) */
+
+    mmC = _mm_unpacklo_pi16(mmB, mmF);        /* (00 10 20 01 08 18 28 09) */
+    mmB = _mm_srli_si64(mmB, 4 * BYTE_BIT);
+    mmB = _mm_unpacklo_pi16(mmH, mmB);        /* (11 21 02 12 19 29 0A 1A) */
+    mmD = _mm_unpackhi_pi16(mmF, mmH);        /* (22 03 13 23 2A 0B 1B 2B) */
+    mmF = _mm_unpacklo_pi16(mmA, mmE);        /* (04 14 24 05 0C 1C 2C 0D) */
+    mmA = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+    mmH = _mm_unpacklo_pi16(mmG, mmA);        /* (15 25 06 16 1D 2D 0E 1E) */
+    mmG = _mm_unpackhi_pi16(mmE, mmG);        /* (26 07 17 27 2E 0F 1F 2F) */
+
+    mmA = _mm_unpacklo_pi32(mmC, mmB);        /* (00 10 20 01 11 21 02 12) */
+    mmE = _mm_unpackhi_pi32(mmC, mmB);        /* (08 18 28 09 19 29 0A 1A) */
+    mmB = _mm_unpacklo_pi32(mmD, mmF);        /* (22 03 13 23 04 14 24 05) */
+    mmF = _mm_unpackhi_pi32(mmD, mmF);        /* (2A 0B 1B 2B 0C 1C 2C 0D) */
+    mmC = _mm_unpacklo_pi32(mmH, mmG);        /* (15 25 06 16 26 07 17 27) */
+    mmG = _mm_unpackhi_pi32(mmH, mmG);        /* (1D 2D 0E 1E 2E 0F 1F 2F) */
+
+    if (num_cols >= 8) {
+      if (!(((long)outptr) & 7)) {
+        _mm_store_si64((__m64 *)outptr, mmA);
+        _mm_store_si64((__m64 *)(outptr + 8), mmB);
+        _mm_store_si64((__m64 *)(outptr + 16), mmC);
+        _mm_store_si64((__m64 *)(outptr + 24), mmE);
+        _mm_store_si64((__m64 *)(outptr + 32), mmF);
+        _mm_store_si64((__m64 *)(outptr + 40), mmG);
+      } else {
+        _mm_storeu_si64((__m64 *)outptr, mmA);
+        _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
+        _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+        _mm_storeu_si64((__m64 *)(outptr + 24), mmE);
+        _mm_storeu_si64((__m64 *)(outptr + 32), mmF);
+        _mm_storeu_si64((__m64 *)(outptr + 40), mmG);
+      }
+      outptr += RGB_PIXELSIZE * 16;
+    } else {
+      if (output_width & 1)
+        col = num_cols * 6 + 3;
+      else
+        col = num_cols * 6;
+
+      asm(".set noreorder\r\n"                /* st24 */
+
+          "li       $8, 24\r\n"
+          "move     $9, %7\r\n"
+          "mov.s    $f4, %1\r\n"
+          "mov.s    $f6, %2\r\n"
+          "mov.s    $f8, %3\r\n"
+          "move     $10, %8\r\n"
+          "bltu     $9, $8, 1f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "gssdlc1  $f6, 7+8($10)\r\n"
+          "gssdrc1  $f6, 8($10)\r\n"
+          "gssdlc1  $f8, 7+16($10)\r\n"
+          "gssdrc1  $f8, 16($10)\r\n"
+          "mov.s    $f4, %4\r\n"
+          "mov.s    $f6, %5\r\n"
+          "mov.s    $f8, %6\r\n"
+          "subu     $9, $9, 24\r\n"
+          PTR_ADDU  "$10, $10, 24\r\n"
+
+          "1:       \r\n"
+          "li       $8, 16\r\n"               /* st16 */
+          "bltu     $9, $8, 2f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "gssdlc1  $f6, 7+8($10)\r\n"
+          "gssdrc1  $f6, 8($10)\r\n"
+          "mov.s    $f4, $f8\r\n"
+          "subu     $9, $9, 16\r\n"
+          PTR_ADDU  "$10, $10, 16\r\n"
+
+          "2:       \r\n"
+          "li       $8,  8\r\n"               /* st8 */
+          "bltu     $9, $8, 3f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "mov.s    $f4, $f6\r\n"
+          "subu     $9, $9, 8\r\n"
+          PTR_ADDU  "$10, $10, 8\r\n"
+
+          "3:       \r\n"
+          "li       $8,  4\r\n"               /* st4 */
+          "mfc1     $11, $f4\r\n"
+          "bltu     $9, $8, 4f\r\n"
+          "nop      \r\n"
+          "swl      $11, 3($10)\r\n"
+          "swr      $11, 0($10)\r\n"
+          "li       $8, 32\r\n"
+          "mtc1     $8, $f6\r\n"
+          "dsrl     $f4, $f4, $f6\r\n"
+          "mfc1     $11, $f4\r\n"
+          "subu     $9, $9, 4\r\n"
+          PTR_ADDU  "$10, $10, 4\r\n"
+
+          "4:       \r\n"
+          "li       $8, 2\r\n"                /* st2 */
+          "bltu     $9, $8, 5f\r\n"
+          "nop      \r\n"
+          "ush      $11, 0($10)\r\n"
+          "srl      $11, 16\r\n"
+          "subu     $9, $9, 2\r\n"
+          PTR_ADDU  "$10, $10, 2\r\n"
+
+          "5:       \r\n"
+          "li       $8, 1\r\n"                /* st1 */
+          "bltu     $9, $8, 6f\r\n"
+          "nop      \r\n"
+          "sb       $11, 0($10)\r\n"
+
+          "6:       \r\n"
+          "nop      \r\n"                     /* end */
+          : "=m" (*outptr)
+          : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmE), "f" (mmF),
+            "f" (mmG), "r" (col), "r" (outptr)
+          : "$f4", "$f6", "$f8", "$8", "$9", "$10", "$11", "memory"
+         );
+    }
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+    xe = _mm_cmpeq_pi8(xe, xe);
+    xo = _mm_cmpeq_pi8(xo, xo);
+#else
+    xe = _mm_xor_si64(xe, xe);
+    xo = _mm_xor_si64(xo, xo);
+#endif
+    /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
+    /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
+    /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
+    /* mmG=(30 32 34 36 38 3A 3C 3E), mmH=(31 33 35 37 39 3B 3D 3F) */
+
+    mm8 = _mm_unpacklo_pi8(mmA, mmC);         /* (00 10 02 12 04 14 06 16) */
+    mm9 = _mm_unpackhi_pi8(mmA, mmC);         /* (08 18 0A 1A 0C 1C 0E 1E) */
+    mmA = _mm_unpacklo_pi8(mmE, mmG);         /* (20 30 22 32 24 34 26 36) */
+    mmE = _mm_unpackhi_pi8(mmE, mmG);         /* (28 38 2A 3A 2C 3C 2E 3E) */
+
+    mmG = _mm_unpacklo_pi8(mmB, mmD);         /* (01 11 03 13 05 15 07 17) */
+    mmB = _mm_unpackhi_pi8(mmB, mmD);         /* (09 19 0B 1B 0D 1D 0F 1F) */
+    mmD = _mm_unpacklo_pi8(mmF, mmH);         /* (21 31 23 33 25 35 27 37) */
+    mmF = _mm_unpackhi_pi8(mmF, mmH);         /* (29 39 2B 3B 2D 3D 2F 3F) */
+
+    mmH = _mm_unpacklo_pi16(mm8, mmA);        /* (00 10 20 30 02 12 22 32) */
+    mm8 = _mm_unpackhi_pi16(mm8, mmA);        /* (04 14 24 34 06 16 26 36) */
+    mmA = _mm_unpacklo_pi16(mmG, mmD);        /* (01 11 21 31 03 13 23 33) */
+    mmD = _mm_unpackhi_pi16(mmG, mmD);        /* (05 15 25 35 07 17 27 37) */
+
+    mmG = _mm_unpackhi_pi16(mm9, mmE);        /* (0C 1C 2C 3C 0E 1E 2E 3E) */
+    mm9 = _mm_unpacklo_pi16(mm9, mmE);        /* (08 18 28 38 0A 1A 2A 3A) */
+    mmE = _mm_unpacklo_pi16(mmB, mmF);        /* (09 19 29 39 0B 1B 2B 3B) */
+    mmF = _mm_unpackhi_pi16(mmB, mmF);        /* (0D 1D 2D 3D 0F 1F 2F 3F) */
+
+    mmB = _mm_unpackhi_pi32(mmH, mmA);        /* (02 12 22 32 03 13 23 33) */
+    mmA = _mm_unpacklo_pi32(mmH, mmA);        /* (00 10 20 30 01 11 21 31) */
+    mmC = _mm_unpacklo_pi32(mm8, mmD);        /* (04 14 24 34 05 15 25 35) */
+    mmD = _mm_unpackhi_pi32(mm8, mmD);        /* (06 16 26 36 07 17 27 37) */
+
+    mmH = _mm_unpackhi_pi32(mmG, mmF);        /* (0E 1E 2E 3E 0F 1F 2F 3F) */
+    mmG = _mm_unpacklo_pi32(mmG, mmF);        /* (0C 1C 2C 3C 0D 1D 2D 3D) */
+    mmF = _mm_unpackhi_pi32(mm9, mmE);        /* (0A 1A 2A 3A 0B 1B 2B 3B) */
+    mmE = _mm_unpacklo_pi32(mm9, mmE);        /* (08 18 28 38 09 19 29 39) */
+
+    if (num_cols >= 8) {
+      if (!(((long)outptr) & 7)) {
+        _mm_store_si64((__m64 *)outptr, mmA);
+        _mm_store_si64((__m64 *)(outptr + 8), mmB);
+        _mm_store_si64((__m64 *)(outptr + 16), mmC);
+        _mm_store_si64((__m64 *)(outptr + 24), mmD);
+        _mm_store_si64((__m64 *)(outptr + 32), mmE);
+        _mm_store_si64((__m64 *)(outptr + 40), mmF);
+        _mm_store_si64((__m64 *)(outptr + 48), mmG);
+        _mm_store_si64((__m64 *)(outptr + 56), mmH);
+      } else {
+        _mm_storeu_si64((__m64 *)outptr, mmA);
+        _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
+        _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+        _mm_storeu_si64((__m64 *)(outptr + 24), mmD);
+        _mm_storeu_si64((__m64 *)(outptr + 32), mmE);
+        _mm_storeu_si64((__m64 *)(outptr + 40), mmF);
+        _mm_storeu_si64((__m64 *)(outptr + 48), mmG);
+        _mm_storeu_si64((__m64 *)(outptr + 56), mmH);
+      }
+      outptr += RGB_PIXELSIZE * 16;
+    } else {
+      if (output_width & 1)
+        col = num_cols * 2 + 1;
+      else
+        col = num_cols * 2;
+      asm(".set noreorder\r\n"                /* st32 */
+
+          "li       $8, 8\r\n"
+          "move     $9, %10\r\n"
+          "move     $10, %11\r\n"
+          "mov.s    $f4, %2\r\n"
+          "mov.s    $f6, %3\r\n"
+          "mov.s    $f8, %4\r\n"
+          "mov.s    $f10, %5\r\n"
+          "bltu     $9, $8, 1f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "gssdlc1  $f6, 7+8($10)\r\n"
+          "gssdrc1  $f6, 8($10)\r\n"
+          "gssdlc1  $f8, 7+16($10)\r\n"
+          "gssdrc1  $f8, 16($10)\r\n"
+          "gssdlc1  $f10, 7+24($10)\r\n"
+          "gssdrc1  $f10, 24($10)\r\n"
+          "mov.s    $f4, %6\r\n"
+          "mov.s    $f6, %7\r\n"
+          "mov.s    $f8, %8\r\n"
+          "mov.s    $f10, %9\r\n"
+          "subu     $9, $9, 8\r\n"
+          PTR_ADDU  "$10, $10, 32\r\n"
+
+          "1:       \r\n"
+          "li       $8, 4\r\n"                /* st16 */
+          "bltu     $9, $8, 2f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "gssdlc1  $f6, 7+8($10)\r\n"
+          "gssdrc1  $f6, 8($10)\r\n"
+          "mov.s    $f4, $f8\r\n"
+          "mov.s    $f6, $f10\r\n"
+          "subu     $9, $9, 4\r\n"
+          PTR_ADDU  "$10, $10, 16\r\n"
+
+          "2:       \r\n"
+          "li       $8, 2\r\n"                /* st8 */
+          "bltu     $9, $8, 3f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "mov.s    $f4, $f6\r\n"
+          "subu     $9, $9, 2\r\n"
+          PTR_ADDU  "$10, $10, 8\r\n"
+
+          "3:       \r\n"
+          "li       $8, 1\r\n"                /* st4 */
+          "bltu     $9, $8, 4f\r\n"
+          "nop      \r\n"
+          "gsswlc1  $f4, 3($10)\r\n"
+          "gsswrc1  $f4, 0($10)\r\n"
+
+          "4:       \r\n"
+          "li       %1, 0\r\n"                /* end */
+          : "=m" (*outptr), "=r" (col)
+          : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmD), "f" (mmE), "f" (mmF),
+            "f" (mmG), "f" (mmH), "r" (col), "r" (outptr)
+          : "$f4", "$f6", "$f8", "$f10", "$8", "$9", "$10", "memory"
+         );
+    }
+
+#endif
+
+  }
+
+  if (!((output_width >> 1) & 7)) {
+    if (output_width & 1) {
+      cb = _mm_load_si64((__m64 *)inptr1);
+      cr = _mm_load_si64((__m64 *)inptr2);
+      y = _mm_load_si64((__m64 *)inptr0);
+
+      decenter = 0.0;
+      decenter = _mm_cmpeq_pi16(decenter, decenter);
+      decenter = _mm_slli_pi16(decenter, 7);  /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+      cbl = _mm_unpacklo_pi8(cb, zero);       /* Cb(0123) */
+      crl = _mm_unpacklo_pi8(cr, zero);       /* Cr(0123) */
+      cbl = _mm_add_pi16(cbl, decenter);
+      crl = _mm_add_pi16(crl, decenter);
+
+      cbl2 = _mm_add_pi16(cbl, cbl);          /* 2*CbL */
+      crl2 = _mm_add_pi16(crl, crl);          /* 2*CrL */
+      bl = _mm_mulhi_pi16(cbl2, PW_MF0228);   /* (2*CbL * -FIX(0.22800) */
+      rl = _mm_mulhi_pi16(crl2, PW_F0402);    /* (2*CrL * FIX(0.40200)) */
+
+      bl = _mm_add_pi16(bl, PW_ONE);
+      bl = _mm_srai_pi16(bl, 1);              /* (CbL * -FIX(0.22800)) */
+      rl = _mm_add_pi16(rl, PW_ONE);
+      rl = _mm_srai_pi16(rl, 1);              /* (CrL * FIX(0.40200)) */
+
+      bl = _mm_add_pi16(bl, cbl);
+      bl = _mm_add_pi16(bl, cbl);             /* (CbL * FIX(1.77200))=(B-Y)L */
+      rl = _mm_add_pi16(rl, crl);             /* (CrL * FIX(1.40200))=(R-Y)L */
+
+      gl = _mm_unpacklo_pi16(cbl, crl);
+      gl = _mm_madd_pi16(gl, PW_MF0344_F0285);
+      gl = _mm_add_pi32(gl, PD_ONEHALF);
+      gl = _mm_srai_pi32(gl, SCALEBITS);
+      gl = _mm_packs_pi32(gl, zero);       /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
+      gl = _mm_sub_pi16(gl, crl);  /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
+
+      yl = _mm_unpacklo_pi8(y, zero);         /* Y(0123) */
+      rl = _mm_add_pi16(rl, yl);              /* (R0 R1 R2 R3) */
+      gl = _mm_add_pi16(gl, yl);              /* (G0 G1 G2 G3) */
+      bl = _mm_add_pi16(bl, yl);              /* (B0 B1 B2 B3) */
+      re = _mm_packs_pu16(rl, rl);
+      ge = _mm_packs_pu16(gl, gl);
+      be = _mm_packs_pu16(bl, bl);
+#if RGB_PIXELSIZE == 3
+      mmA = _mm_unpacklo_pi8(mmA, mmC);
+      mmA = _mm_unpacklo_pi16(mmA, mmE);
+      asm(".set noreorder\r\n"
+
+          "move    $8, %2\r\n"
+          "mov.s   $f4, %1\r\n"
+          "mfc1    $9, $f4\r\n"
+          "ush     $9, 0($8)\r\n"
+          "srl     $9, 16\r\n"
+          "sb      $9, 2($8)\r\n"
+          : "=m" (*outptr)
+          : "f" (mmA), "r" (outptr)
+          : "$f4", "$8", "$9", "memory"
+         );
+#else  /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+      xe = _mm_cmpeq_pi8(xe, xe);
+#else
+      xe = _mm_xor_si64(xe, xe);
+#endif
+      mmA = _mm_unpacklo_pi8(mmA, mmC);
+      mmE = _mm_unpacklo_pi8(mmE, mmG);
+      mmA = _mm_unpacklo_pi16(mmA, mmE);
+      asm(".set noreorder\r\n"
+
+          "move    $8, %2\r\n"
+          "mov.s   $f4, %1\r\n"
+          "gsswlc1 $f4, 3($8)\r\n"
+          "gsswrc1 $f4, 0($8)\r\n"
+          : "=m" (*outptr)
+          : "f" (mmA), "r" (outptr)
+          : "$f4", "$8", "memory"
+         );
+#endif
+    }
+  }
+}
+
+
+void jsimd_h2v2_merged_upsample_mmi(JDIMENSION output_width,
+                                    JSAMPIMAGE input_buf,
+                                    JDIMENSION in_row_group_ctr,
+                                    JSAMPARRAY output_buf)
+{
+  JSAMPROW inptr, outptr;
+
+  inptr = input_buf[0][in_row_group_ctr];
+  outptr = output_buf[0];
+
+  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2];
+  jsimd_h2v1_merged_upsample_mmi(output_width, input_buf, in_row_group_ctr,
+                                 output_buf);
+
+  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2 + 1];
+  output_buf[0] = output_buf[1];
+  jsimd_h2v1_merged_upsample_mmi(output_width, input_buf, in_row_group_ctr,
+                                 output_buf);
+
+  input_buf[0][in_row_group_ctr] = inptr;
+  output_buf[0] = outptr;
+}
+
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jdsample-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jdsample-mmi.c
new file mode 100644
index 0000000000..8ae94e7dcf
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jdsample-mmi.c
@@ -0,0 +1,304 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, 2018-2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           ZhangLixia  <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA UPSAMPLING */
+
+#include "jsimd_mmi.h"
+
+
+enum const_index {
+  index_PW_ONE,
+  index_PW_TWO,
+  index_PW_THREE,
+  index_PW_SEVEN,
+  index_PW_EIGHT,
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi16(1, 1, 1, 1),
+  _uint64_set_pi16(2, 2, 2, 2),
+  _uint64_set_pi16(3, 3, 3, 3),
+  _uint64_set_pi16(7, 7, 7, 7),
+  _uint64_set_pi16(8, 8, 8, 8),
+};
+
+#define PW_ONE    get_const_value(index_PW_ONE)
+#define PW_TWO    get_const_value(index_PW_TWO)
+#define PW_THREE  get_const_value(index_PW_THREE)
+#define PW_SEVEN  get_const_value(index_PW_SEVEN)
+#define PW_EIGHT  get_const_value(index_PW_EIGHT)
+
+
+#define PROCESS_ROW(row, wkoffset, bias1, bias2, shift) { \
+  __m64 samp123X, samp3XXX, samp1234, sampX012, samp_1012; \
+  __m64 sampXXX4, sampX456, samp3456, samp567X, samp7XXX, samp5678; \
+  __m64 outle, outhe, outlo, outho, outl, outh; \
+  \
+  samp123X = _mm_srli_si64(samp0123, 2 * BYTE_BIT);  /* ( 1 2 3 -) */ \
+  sampXXX4 = _mm_slli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( - - - 4) */ \
+  samp3XXX = _mm_srli_si64(samp0123, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( 3 - - -) */ \
+  sampX456 = _mm_slli_si64(samp4567, 2 * BYTE_BIT);  /* ( - 4 5 6) */ \
+  \
+  samp1234 = _mm_or_si64(samp123X, sampXXX4);  /* ( 1 2 3 4) */ \
+  samp3456 = _mm_or_si64(samp3XXX, sampX456);  /* ( 3 4 5 6) */ \
+  \
+  sampX012 = _mm_slli_si64(samp0123, 2 * BYTE_BIT);  /* ( - 0 1 2) */ \
+  samp567X = _mm_srli_si64(samp4567, 2 * BYTE_BIT);  /* ( 5 6 7 -) */ \
+  samp7XXX = _mm_srli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( 7 - - -) */ \
+  \
+  samp_1012 = _mm_or_si64(sampX012, wk[row]);            /* (-1 0 1 2) */ \
+  samp5678 = _mm_or_si64(samp567X, wk[row + wkoffset]);  /* ( 5 6 7 8) */ \
+  \
+  wk[row] = samp7XXX; \
+  \
+  samp0123 = _mm_mullo_pi16(samp0123, PW_THREE); \
+  samp4567 = _mm_mullo_pi16(samp4567, PW_THREE); \
+  samp_1012 = _mm_add_pi16(samp_1012, bias1); \
+  samp3456 = _mm_add_pi16(samp3456, bias1); \
+  samp1234 = _mm_add_pi16(samp1234, bias2); \
+  samp5678 = _mm_add_pi16(samp5678, bias2); \
+  \
+  outle = _mm_add_pi16(samp_1012, samp0123); \
+  outhe = _mm_add_pi16(samp3456, samp4567); \
+  outle = _mm_srli_pi16(outle, shift);        /* ( 0  2  4  6) */ \
+  outhe = _mm_srli_pi16(outhe, shift);        /* ( 8 10 12 14) */ \
+  outlo = _mm_add_pi16(samp1234, samp0123); \
+  outho = _mm_add_pi16(samp5678, samp4567); \
+  outlo = _mm_srli_pi16(outlo, shift);        /* ( 1  3  5  7) */ \
+  outho = _mm_srli_pi16(outho, shift);        /* ( 9 11 13 15) */ \
+  \
+  outlo = _mm_slli_pi16(outlo, BYTE_BIT); \
+  outho = _mm_slli_pi16(outho, BYTE_BIT); \
+  outl = _mm_or_si64(outle, outlo);           /* ( 0  1  2  3  4  5  6  7) */ \
+  outh = _mm_or_si64(outhe, outho);           /* ( 8  9 10 11 12 13 14 15) */ \
+  \
+  _mm_store_si64((__m64 *)outptr##row, outl); \
+  _mm_store_si64((__m64 *)outptr##row + 1, outh); \
+}
+
+void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor,
+                                   JDIMENSION downsampled_width,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
+  int inrow, outrow, incol, tmp, tmp1;
+  __m64 this_1l, this_1h, this_1, thiscolsum_1l, thiscolsum_1h;
+  __m64 this0l, this0h, this0;
+  __m64 this1l, this1h, this1, thiscolsum1l, thiscolsum1h;
+  __m64 next_1l, next_1h, next_1, nextcolsum_1l, nextcolsum_1h;
+  __m64 next0l, next0h, next0;
+  __m64 next1l, next1h, next1, nextcolsum1l, nextcolsum1h;
+  __m64 mask0 = 0.0, masklast, samp0123, samp4567, wk[4], zero = 0.0;
+
+  mask0 = _mm_cmpeq_pi8(mask0, mask0);
+  masklast = _mm_slli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+  mask0 = _mm_srli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+
+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+
+    inptr_1 = input_data[inrow - 1];
+    inptr0 = input_data[inrow];
+    inptr1 = input_data[inrow + 1];
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    if (downsampled_width & 7) {
+      tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
+      tmp1 = downsampled_width * sizeof(JSAMPLE);
+      asm(PTR_ADDU  "$8, %3, %6\r\n"
+          "lb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %3, %7\r\n"
+          "sb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %4, %6\r\n"
+          "lb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %4, %7\r\n"
+          "sb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %5, %6\r\n"
+          "lb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %5, %7\r\n"
+          "sb       $9, ($8)\r\n"
+          : "=m" (*inptr_1), "=m" (*inptr0), "=m" (*inptr1)
+          : "r" (inptr_1), "r" (inptr0), "r" (inptr1), "r" (tmp), "r" (tmp1)
+          : "$8", "$9"
+         );
+    }
+
+    /* process the first column block */
+    this0 = _mm_load_si64((__m64 *)inptr0);    /* row[ 0][0] */
+    this_1 = _mm_load_si64((__m64 *)inptr_1);  /* row[-1][0] */
+    this1 = _mm_load_si64((__m64 *)inptr1);    /* row[ 1][0] */
+
+    this0l = _mm_unpacklo_pi8(this0, zero);    /* row[ 0][0]( 0 1 2 3) */
+    this0h = _mm_unpackhi_pi8(this0, zero);    /* row[ 0][0]( 4 5 6 7) */
+    this_1l = _mm_unpacklo_pi8(this_1, zero);  /* row[-1][0]( 0 1 2 3) */
+    this_1h = _mm_unpackhi_pi8(this_1, zero);  /* row[-1][0]( 4 5 6 7) */
+    this1l = _mm_unpacklo_pi8(this1, zero);    /* row[+1][0]( 0 1 2 3) */
+    this1h = _mm_unpackhi_pi8(this1, zero);    /* row[+1][0]( 4 5 6 7) */
+
+    this0l = _mm_mullo_pi16(this0l, PW_THREE);
+    this0h = _mm_mullo_pi16(this0h, PW_THREE);
+
+    thiscolsum_1l = _mm_add_pi16(this_1l, this0l);  /* ( 0 1 2 3) */
+    thiscolsum_1h = _mm_add_pi16(this_1h, this0h);  /* ( 4 5 6 7) */
+    thiscolsum1l = _mm_add_pi16(this0l, this1l);    /* ( 0 1 2 3) */
+    thiscolsum1h = _mm_add_pi16(this0h, this1h);    /* ( 4 5 6 7) */
+
+    /* temporarily save the intermediate data */
+    _mm_store_si64((__m64 *)outptr0, thiscolsum_1l);
+    _mm_store_si64((__m64 *)outptr0 + 1, thiscolsum_1h);
+    _mm_store_si64((__m64 *)outptr1, thiscolsum1l);
+    _mm_store_si64((__m64 *)outptr1 + 1, thiscolsum1h);
+
+    wk[0] = _mm_and_si64(thiscolsum_1l, mask0);  /* ( 0 - - -) */
+    wk[1] = _mm_and_si64(thiscolsum1l, mask0);   /* ( 0 - - -) */
+
+    for (incol = downsampled_width; incol > 0;
+         incol -= 8, inptr_1 += 8, inptr0 += 8, inptr1 += 8,
+         outptr0 += 16, outptr1 += 16) {
+
+      if (incol > 8) {
+        /* process the next column block */
+        next0 = _mm_load_si64((__m64 *)inptr0 + 1);    /* row[ 0][1] */
+        next_1 = _mm_load_si64((__m64 *)inptr_1 + 1);  /* row[-1][1] */
+        next1 = _mm_load_si64((__m64 *)inptr1 + 1);    /* row[+1][1] */
+
+        next0l = _mm_unpacklo_pi8(next0, zero);    /* row[ 0][1]( 0 1 2 3) */
+        next0h = _mm_unpackhi_pi8(next0, zero);    /* row[ 0][1]( 4 5 6 7) */
+        next_1l = _mm_unpacklo_pi8(next_1, zero);  /* row[-1][1]( 0 1 2 3) */
+        next_1h = _mm_unpackhi_pi8(next_1, zero);  /* row[-1][1]( 4 5 6 7) */
+        next1l = _mm_unpacklo_pi8(next1, zero);    /* row[+1][1]( 0 1 2 3) */
+        next1h = _mm_unpackhi_pi8(next1, zero);    /* row[+1][1]( 4 5 6 7) */
+
+        next0l = _mm_mullo_pi16(next0l, PW_THREE);
+        next0h = _mm_mullo_pi16(next0h, PW_THREE);
+
+        nextcolsum_1l = _mm_add_pi16(next_1l, next0l);  /* ( 0 1 2 3) */
+        nextcolsum_1h = _mm_add_pi16(next_1h, next0h);  /* ( 4 5 6 7) */
+        nextcolsum1l = _mm_add_pi16(next0l, next1l);    /* ( 0 1 2 3) */
+        nextcolsum1h = _mm_add_pi16(next0h, next1h);    /* ( 4 5 6 7) */
+
+        /* temporarily save the intermediate data */
+        _mm_store_si64((__m64 *)outptr0 + 2, nextcolsum_1l);
+        _mm_store_si64((__m64 *)outptr0 + 3, nextcolsum_1h);
+        _mm_store_si64((__m64 *)outptr1 + 2, nextcolsum1l);
+        _mm_store_si64((__m64 *)outptr1 + 3, nextcolsum1h);
+
+        wk[2] = _mm_slli_si64(nextcolsum_1l, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( - - - 0) */
+        wk[3] = _mm_slli_si64(nextcolsum1l, (SIZEOF_MMWORD - 2) * BYTE_BIT);   /* ( - - - 0) */
+      } else {
+        __m64 tmp;
+
+        /* process the last column block */
+        tmp = _mm_load_si64((__m64 *)outptr0 + 1);
+        wk[2] = _mm_and_si64(masklast, tmp);        /* ( - - - 7) */
+        tmp = _mm_load_si64((__m64 *)outptr1 + 1);
+        wk[3] = _mm_and_si64(masklast, tmp);        /* ( - - - 7) */
+      }
+
+      /* process the upper row */
+      samp0123 = _mm_load_si64((__m64 *)outptr0);      /* ( 0 1 2 3) */ \
+      samp4567 = _mm_load_si64((__m64 *)outptr0 + 1);  /* ( 4 5 6 7) */ \
+      PROCESS_ROW(0, 2, PW_EIGHT, PW_SEVEN, 4)
+
+      /* process the lower row */
+      samp0123 = _mm_load_si64((__m64 *)outptr1);      /* ( 0 1 2 3) */ \
+      samp4567 = _mm_load_si64((__m64 *)outptr1 + 1);  /* ( 4 5 6 7) */ \
+      PROCESS_ROW(1, 2, PW_EIGHT, PW_SEVEN, 4)
+    }
+  }
+}
+
+
+void jsimd_h2v1_fancy_upsample_mmi(int max_v_samp_factor,
+                                   JDIMENSION downsampled_width,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr0, outptr0;
+  int inrow, incol, tmp, tmp1;
+  __m64 thisl, this, nextl, next;
+  __m64 mask0 = 0.0, masklast, samp0123, samp4567, wk[2], zero = 0.0;
+
+  mask0 = _mm_cmpeq_pi8(mask0, mask0);
+  masklast = _mm_slli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+  mask0 = _mm_srli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+
+    inptr0 = input_data[inrow];
+    outptr0 = output_data[inrow];
+
+    if (downsampled_width & 7) {
+      tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
+      tmp1 = downsampled_width * sizeof(JSAMPLE);
+      asm(PTR_ADDU  "$8, %1, %2\r\n"
+          "lb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %1, %3\r\n"
+          "sb       $9, ($8)\r\n"
+          : "=m" (*inptr0)
+          : "r" (inptr0), "r" (tmp), "r" (tmp1)
+          : "$8", "$9"
+         );
+    }
+
+    /* process the first column block */
+    this = _mm_load_si64((__m64 *)inptr0);    /* row[ 0][0] */
+    thisl = _mm_unpacklo_pi8(this, zero);     /* row[ 0][0]( 0 1 2 3) */
+    wk[0] = _mm_and_si64(thisl, mask0);       /* ( 0 - - -) */
+
+    for (incol = downsampled_width; incol > 0;
+         incol -= 8, inptr0 += 8, outptr0 += 16) {
+
+      if (incol > 8) {
+        /* process the next column block */
+        next = _mm_load_si64((__m64 *)inptr0 + 1);  /* row[ 0][1] */
+        nextl = _mm_unpacklo_pi8(next, zero);       /* row[ 0][1]( 0 1 2 3) */
+        wk[1] = _mm_slli_si64(nextl, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( - - - 0) */
+      } else {
+        __m64 thish;
+
+        /* process the last column block */
+        this = _mm_load_si64((__m64 *)inptr0);  /* row[ 0][0] */
+        thish = _mm_unpackhi_pi8(this, zero);   /* row[ 0][1]( 4 5 6 7) */
+        wk[1] = _mm_and_si64(masklast, thish);  /* ( - - - 7) */
+      }
+
+      /* process the row */
+      this = _mm_load_si64((__m64 *)inptr0);    /* row[ 0][0] */
+      samp0123 = _mm_unpacklo_pi8(this, zero);  /* ( 0 1 2 3) */
+      samp4567 = _mm_unpackhi_pi8(this, zero);  /* ( 4 5 6 7) */
+      PROCESS_ROW(0, 1, PW_ONE, PW_TWO, 2)
+    }
+  }
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jfdctfst-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jfdctfst-mmi.c
new file mode 100644
index 0000000000..f7caf09a88
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jfdctfst-mmi.c
@@ -0,0 +1,255 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, 2018-2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  LiuQingfa <liuqingfa-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER FORWARD DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS  8
+
+#define F_0_382  ((short)98)   /* FIX(0.382683433) */
+#define F_0_541  ((short)139)  /* FIX(0.541196100) */
+#define F_0_707  ((short)181)  /* FIX(0.707106781) */
+#define F_1_306  ((short)334)  /* FIX(1.306562965) */
+
+#define PRE_MULTIPLY_SCALE_BITS  2
+#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+enum const_index {
+  index_PW_F0707,
+  index_PW_F0382,
+  index_PW_F0541,
+  index_PW_F1306
+};
+
+static uint64_t const_value[] = {
+  _uint64_set1_pi16(F_0_707),
+  _uint64_set1_pi16(F_0_382),
+  _uint64_set1_pi16(F_0_541),
+  _uint64_set1_pi16(F_1_306)
+};
+
+#define PW_F0707  get_const_value(index_PW_F0707)
+#define PW_F0382  get_const_value(index_PW_F0382)
+#define PW_F0541  get_const_value(index_PW_F0541)
+#define PW_F1306  get_const_value(index_PW_F1306)
+
+
+#define DO_FDCT_MULTIPLY(out, in, multiplier) { \
+  __m64 mulhi, mullo, mul12, mul34; \
+  \
+  mullo = _mm_mullo_pi16(in, multiplier); \
+  mulhi = _mm_mulhi_pi16(in, multiplier); \
+  mul12 = _mm_unpacklo_pi16(mullo, mulhi); \
+  mul34 = _mm_unpackhi_pi16(mullo, mulhi); \
+  mul12 = _mm_srai_pi32(mul12, CONST_BITS); \
+  mul34 = _mm_srai_pi32(mul34, CONST_BITS); \
+  out = _mm_packs_pi32(mul12, mul34); \
+}
+
+#define DO_FDCT_COMMON() { \
+  \
+  /* Even part */ \
+  \
+  tmp10 = _mm_add_pi16(tmp0, tmp3); \
+  tmp13 = _mm_sub_pi16(tmp0, tmp3); \
+  tmp11 = _mm_add_pi16(tmp1, tmp2); \
+  tmp12 = _mm_sub_pi16(tmp1, tmp2); \
+  \
+  out0 = _mm_add_pi16(tmp10, tmp11); \
+  out4 = _mm_sub_pi16(tmp10, tmp11); \
+  \
+  z1 = _mm_add_pi16(tmp12, tmp13); \
+  DO_FDCT_MULTIPLY(z1, z1, PW_F0707) \
+  \
+  out2 = _mm_add_pi16(tmp13, z1); \
+  out6 = _mm_sub_pi16(tmp13, z1); \
+  \
+  /* Odd part */ \
+  \
+  tmp10 = _mm_add_pi16(tmp4, tmp5); \
+  tmp11 = _mm_add_pi16(tmp5, tmp6); \
+  tmp12 = _mm_add_pi16(tmp6, tmp7); \
+  \
+  z5 = _mm_sub_pi16(tmp10, tmp12); \
+  DO_FDCT_MULTIPLY(z5, z5, PW_F0382) \
+  \
+  DO_FDCT_MULTIPLY(z2, tmp10, PW_F0541) \
+  z2 = _mm_add_pi16(z2, z5); \
+  \
+  DO_FDCT_MULTIPLY(z4, tmp12, PW_F1306) \
+  z4 = _mm_add_pi16(z4, z5); \
+  \
+  DO_FDCT_MULTIPLY(z3, tmp11, PW_F0707) \
+  \
+  z11 = _mm_add_pi16(tmp7, z3); \
+  z13 = _mm_sub_pi16(tmp7, z3); \
+  \
+  out5 = _mm_add_pi16(z13, z2); \
+  out3 = _mm_sub_pi16(z13, z2); \
+  out1 = _mm_add_pi16(z11, z4); \
+  out7 = _mm_sub_pi16(z11, z4); \
+}
+
+#define DO_FDCT_PASS1() { \
+  __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+  __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+  __m64 col0, col1, col2, col3, col4, col5, col6, col7; \
+  \
+  row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]);     /* (00 01 02 03) */ \
+  row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \
+  row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]);     /* (10 11 12 13) */ \
+  row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \
+  row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]);     /* (20 21 22 23) */ \
+  row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \
+  row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]);     /* (30 31 32 33) */ \
+  row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  row23a = _mm_unpacklo_pi16(row2l, row3l);   /* row23a=(20 30 21 31) */ \
+  row23b = _mm_unpackhi_pi16(row2l, row3l);   /* row23b=(22 32 23 33) */ \
+  row23c = _mm_unpacklo_pi16(row2h, row3h);   /* row23c=(24 34 25 35) */ \
+  row23d = _mm_unpackhi_pi16(row2h, row3h);   /* row23d=(26 36 27 37) */ \
+  \
+  row01a = _mm_unpacklo_pi16(row0l, row1l);   /* row01a=(00 10 01 11) */ \
+  row01b = _mm_unpackhi_pi16(row0l, row1l);   /* row01b=(02 12 03 13) */ \
+  row01c = _mm_unpacklo_pi16(row0h, row1h);   /* row01c=(04 14 05 15) */ \
+  row01d = _mm_unpackhi_pi16(row0h, row1h);   /* row01d=(06 16 07 17) */ \
+  \
+  col0 = _mm_unpacklo_pi32(row01a, row23a);   /* col0=(00 10 20 30) */ \
+  col1 = _mm_unpackhi_pi32(row01a, row23a);   /* col1=(01 11 21 31) */ \
+  col6 = _mm_unpacklo_pi32(row01d, row23d);   /* col6=(06 16 26 36) */ \
+  col7 = _mm_unpackhi_pi32(row01d, row23d);   /* col7=(07 17 27 37) */ \
+  \
+  tmp6 = _mm_sub_pi16(col1, col6);            /* tmp6=col1-col6 */ \
+  tmp7 = _mm_sub_pi16(col0, col7);            /* tmp7=col0-col7 */ \
+  tmp1 = _mm_add_pi16(col1, col6);            /* tmp1=col1+col6 */ \
+  tmp0 = _mm_add_pi16(col0, col7);            /* tmp0=col0+col7 */ \
+  \
+  col2 = _mm_unpacklo_pi32(row01b, row23b);   /* col2=(02 12 22 32) */ \
+  col3 = _mm_unpackhi_pi32(row01b, row23b);   /* col3=(03 13 23 33) */ \
+  col4 = _mm_unpacklo_pi32(row01c, row23c);   /* col4=(04 14 24 34) */ \
+  col5 = _mm_unpackhi_pi32(row01c, row23c);   /* col5=(05 15 25 35) */ \
+  \
+  tmp3 = _mm_add_pi16(col3, col4);            /* tmp3=col3+col4 */ \
+  tmp2 = _mm_add_pi16(col2, col5);            /* tmp2=col2+col5 */ \
+  tmp4 = _mm_sub_pi16(col3, col4);            /* tmp4=col3-col4 */ \
+  tmp5 = _mm_sub_pi16(col2, col5);            /* tmp5=col2-col5 */ \
+  \
+  DO_FDCT_COMMON() \
+  \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4], out4); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4], out5); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4], out6); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4], out7); \
+}
+
+#define DO_FDCT_PASS2() { \
+  __m64 col0l, col0h, col1l, col1h, col2l, col2h, col3l, col3h; \
+  __m64 col01a, col01b, col01c, col01d, col23a, col23b, col23c, col23d; \
+  __m64 row0, row1, row2, row3, row4, row5, row6, row7; \
+  \
+  col0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]);  /* (00 10 20 30) */ \
+  col1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]);  /* (01 11 21 31) */ \
+  col2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]);  /* (02 12 22 32) */ \
+  col3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]);  /* (03 13 23 33) */ \
+  col0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 4]);  /* (40 50 60 70) */ \
+  col1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 5]);  /* (41 51 61 71) */ \
+  col2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 6]);  /* (42 52 62 72) */ \
+  col3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 7]);  /* (43 53 63 73) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  col23a = _mm_unpacklo_pi16(col2l, col3l);   /* col23a=(02 03 12 13) */ \
+  col23b = _mm_unpackhi_pi16(col2l, col3l);   /* col23b=(22 23 32 33) */ \
+  col23c = _mm_unpacklo_pi16(col2h, col3h);   /* col23c=(42 43 52 53) */ \
+  col23d = _mm_unpackhi_pi16(col2h, col3h);   /* col23d=(62 63 72 73) */ \
+  \
+  col01a = _mm_unpacklo_pi16(col0l, col1l);   /* col01a=(00 01 10 11) */ \
+  col01b = _mm_unpackhi_pi16(col0l, col1l);   /* col01b=(20 21 30 31) */ \
+  col01c = _mm_unpacklo_pi16(col0h, col1h);   /* col01c=(40 41 50 51) */ \
+  col01d = _mm_unpackhi_pi16(col0h, col1h);   /* col01d=(60 61 70 71) */ \
+  \
+  row0 = _mm_unpacklo_pi32(col01a, col23a);   /* row0=(00 01 02 03) */ \
+  row1 = _mm_unpackhi_pi32(col01a, col23a);   /* row1=(10 11 12 13) */ \
+  row6 = _mm_unpacklo_pi32(col01d, col23d);   /* row6=(60 61 62 63) */ \
+  row7 = _mm_unpackhi_pi32(col01d, col23d);   /* row7=(70 71 72 73) */ \
+  \
+  tmp6 = _mm_sub_pi16(row1, row6);            /* tmp6=row1-row6 */ \
+  tmp7 = _mm_sub_pi16(row0, row7);            /* tmp7=row0-row7 */ \
+  tmp1 = _mm_add_pi16(row1, row6);            /* tmp1=row1+row6 */ \
+  tmp0 = _mm_add_pi16(row0, row7);            /* tmp0=row0+row7 */ \
+  \
+  row2 = _mm_unpacklo_pi32(col01b, col23b);   /* row2=(20 21 22 23) */ \
+  row3 = _mm_unpackhi_pi32(col01b, col23b);   /* row3=(30 31 32 33) */ \
+  row4 = _mm_unpacklo_pi32(col01c, col23c);   /* row4=(40 41 42 43) */ \
+  row5 = _mm_unpackhi_pi32(col01c, col23c);   /* row5=(50 51 52 53) */ \
+  \
+  tmp3 = _mm_add_pi16(row3, row4);            /* tmp3=row3+row4 */ \
+  tmp2 = _mm_add_pi16(row2, row5);            /* tmp2=row2+row5 */ \
+  tmp4 = _mm_sub_pi16(row3, row4);            /* tmp4=row3-row4 */ \
+  tmp5 = _mm_sub_pi16(row2, row5);            /* tmp5=row2-row5 */ \
+  \
+  DO_FDCT_COMMON() \
+  \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 4], out4); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 5], out5); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 6], out6); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 7], out7); \
+}
+
+void jsimd_fdct_ifast_mmi(DCTELEM *data)
+{
+  __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+  __m64 tmp10, tmp11, tmp12, tmp13, z1, z2, z3, z4, z5, z11, z13;
+  DCTELEM *dataptr = data;
+
+  /* Pass 1: process rows. */
+
+  DO_FDCT_PASS1()
+  dataptr += DCTSIZE * 4;
+  DO_FDCT_PASS1()
+
+  /* Pass 2: process columns. */
+
+  dataptr = data;
+  DO_FDCT_PASS2()
+  dataptr += 4;
+  DO_FDCT_PASS2()
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jfdctint-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jfdctint-mmi.c
new file mode 100644
index 0000000000..7f4dfe9123
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jfdctint-mmi.c
@@ -0,0 +1,398 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, 2018, 2020, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* ACCURATE INTEGER FORWARD DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+#define FIX_0_298  ((short)2446)   /* FIX(0.298631336) */
+#define FIX_0_390  ((short)3196)   /* FIX(0.390180644) */
+#define FIX_0_541  ((short)4433)   /* FIX(0.541196100) */
+#define FIX_0_765  ((short)6270)   /* FIX(0.765366865) */
+#define FIX_0_899  ((short)7373)   /* FIX(0.899976223) */
+#define FIX_1_175  ((short)9633)   /* FIX(1.175875602) */
+#define FIX_1_501  ((short)12299)  /* FIX(1.501321110) */
+#define FIX_1_847  ((short)15137)  /* FIX(1.847759065) */
+#define FIX_1_961  ((short)16069)  /* FIX(1.961570560) */
+#define FIX_2_053  ((short)16819)  /* FIX(2.053119869) */
+#define FIX_2_562  ((short)20995)  /* FIX(2.562915447) */
+#define FIX_3_072  ((short)25172)  /* FIX(3.072711026) */
+
+enum const_index {
+  index_PW_F130_F054,
+  index_PW_F054_MF130,
+  index_PW_MF078_F117,
+  index_PW_F117_F078,
+  index_PW_MF060_MF089,
+  index_PW_MF089_F060,
+  index_PW_MF050_MF256,
+  index_PW_MF256_F050,
+  index_PD_DESCALE_P1,
+  index_PD_DESCALE_P2,
+  index_PW_DESCALE_P2X
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi16(FIX_0_541, (FIX_0_541 + FIX_0_765),
+                   FIX_0_541, (FIX_0_541 + FIX_0_765)),
+  _uint64_set_pi16((FIX_0_541 - FIX_1_847), FIX_0_541,
+                   (FIX_0_541 - FIX_1_847), FIX_0_541),
+  _uint64_set_pi16(FIX_1_175, (FIX_1_175 - FIX_1_961),
+                   FIX_1_175, (FIX_1_175 - FIX_1_961)),
+  _uint64_set_pi16((FIX_1_175 - FIX_0_390), FIX_1_175,
+                   (FIX_1_175 - FIX_0_390), FIX_1_175),
+  _uint64_set_pi16(-FIX_0_899, (FIX_0_298 - FIX_0_899),
+                   -FIX_0_899, (FIX_0_298 - FIX_0_899)),
+  _uint64_set_pi16((FIX_1_501 - FIX_0_899), -FIX_0_899,
+                   (FIX_1_501 - FIX_0_899), -FIX_0_899),
+  _uint64_set_pi16(-FIX_2_562, (FIX_2_053 - FIX_2_562),
+                   -FIX_2_562, (FIX_2_053 - FIX_2_562)),
+  _uint64_set_pi16((FIX_3_072 - FIX_2_562), -FIX_2_562,
+                   (FIX_3_072 - FIX_2_562), -FIX_2_562),
+  _uint64_set_pi32((1 << (DESCALE_P1 - 1)), (1 << (DESCALE_P1 - 1))),
+  _uint64_set_pi32((1 << (DESCALE_P2 - 1)), (1 << (DESCALE_P2 - 1))),
+  _uint64_set_pi16((1 << (PASS1_BITS - 1)), (1 << (PASS1_BITS - 1)),
+                   (1 << (PASS1_BITS - 1)), (1 << (PASS1_BITS - 1)))
+};
+
+#define PW_F130_F054    get_const_value(index_PW_F130_F054)
+#define PW_F054_MF130   get_const_value(index_PW_F054_MF130)
+#define PW_MF078_F117   get_const_value(index_PW_MF078_F117)
+#define PW_F117_F078    get_const_value(index_PW_F117_F078)
+#define PW_MF060_MF089  get_const_value(index_PW_MF060_MF089)
+#define PW_MF089_F060   get_const_value(index_PW_MF089_F060)
+#define PW_MF050_MF256  get_const_value(index_PW_MF050_MF256)
+#define PW_MF256_F050   get_const_value(index_PW_MF256_F050)
+#define PD_DESCALE_P1   get_const_value(index_PD_DESCALE_P1)
+#define PD_DESCALE_P2   get_const_value(index_PD_DESCALE_P2)
+#define PW_DESCALE_P2X  get_const_value(index_PW_DESCALE_P2X)
+
+
+#define DO_FDCT_COMMON(PASS) { \
+  __m64 tmp1312l, tmp1312h, tmp47l, tmp47h, tmp4l, tmp4h, tmp7l, tmp7h; \
+  __m64 tmp56l, tmp56h, tmp5l, tmp5h, tmp6l, tmp6h; \
+  __m64 out1l, out1h, out2l, out2h, out3l, out3h; \
+  __m64 out5l, out5h, out6l, out6h, out7l, out7h; \
+  __m64 z34l, z34h, z3l, z3h, z4l, z4h, z3, z4; \
+  \
+  /* (Original) \
+   * z1 = (tmp12 + tmp13) * 0.541196100; \
+   * out2 = z1 + tmp13 * 0.765366865; \
+   * out6 = z1 + tmp12 * -1.847759065; \
+   * \
+   * (This implementation) \
+   * out2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
+   * out6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
+   */ \
+  \
+  tmp1312l = _mm_unpacklo_pi16(tmp13, tmp12); \
+  tmp1312h = _mm_unpackhi_pi16(tmp13, tmp12); \
+  \
+  out2l = _mm_madd_pi16(tmp1312l, PW_F130_F054); \
+  out2h = _mm_madd_pi16(tmp1312h, PW_F130_F054); \
+  out6l = _mm_madd_pi16(tmp1312l, PW_F054_MF130); \
+  out6h = _mm_madd_pi16(tmp1312h, PW_F054_MF130); \
+  \
+  out2l = _mm_add_pi32(out2l, PD_DESCALE_P##PASS); \
+  out2h = _mm_add_pi32(out2h, PD_DESCALE_P##PASS); \
+  out2l = _mm_srai_pi32(out2l, DESCALE_P##PASS); \
+  out2h = _mm_srai_pi32(out2h, DESCALE_P##PASS); \
+  \
+  out6l = _mm_add_pi32(out6l, PD_DESCALE_P##PASS); \
+  out6h = _mm_add_pi32(out6h, PD_DESCALE_P##PASS); \
+  out6l = _mm_srai_pi32(out6l, DESCALE_P##PASS); \
+  out6h = _mm_srai_pi32(out6h, DESCALE_P##PASS); \
+  \
+  out2 = _mm_packs_pi32(out2l, out2h); \
+  out6 = _mm_packs_pi32(out6l, out6h); \
+  \
+  /* Odd part */ \
+  \
+  z3 = _mm_add_pi16(tmp4, tmp6); \
+  z4 = _mm_add_pi16(tmp5, tmp7); \
+  \
+  /* (Original) \
+   * z5 = (z3 + z4) * 1.175875602; \
+   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644; \
+   * z3 += z5;  z4 += z5; \
+   * \
+   * (This implementation) \
+   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+   */ \
+  \
+  z34l = _mm_unpacklo_pi16(z3, z4); \
+  z34h = _mm_unpackhi_pi16(z3, z4); \
+  z3l = _mm_madd_pi16(z34l, PW_MF078_F117); \
+  z3h = _mm_madd_pi16(z34h, PW_MF078_F117); \
+  z4l = _mm_madd_pi16(z34l, PW_F117_F078); \
+  z4h = _mm_madd_pi16(z34h, PW_F117_F078); \
+  \
+  /* (Original) \
+   * z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6; \
+   * tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869; \
+   * tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110; \
+   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447; \
+   * out7 = tmp4 + z1 + z3;  out5 = tmp5 + z2 + z4; \
+   * out3 = tmp6 + z2 + z3;  out1 = tmp7 + z1 + z4; \
+   * \
+   * (This implementation) \
+   * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
+   * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
+   * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
+   * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
+   * out7 = tmp4 + z3;  out5 = tmp5 + z4; \
+   * out3 = tmp6 + z3;  out1 = tmp7 + z4; \
+   */ \
+  \
+  tmp47l = _mm_unpacklo_pi16(tmp4, tmp7); \
+  tmp47h = _mm_unpackhi_pi16(tmp4, tmp7); \
+  \
+  tmp4l = _mm_madd_pi16(tmp47l, PW_MF060_MF089); \
+  tmp4h = _mm_madd_pi16(tmp47h, PW_MF060_MF089); \
+  tmp7l = _mm_madd_pi16(tmp47l, PW_MF089_F060); \
+  tmp7h = _mm_madd_pi16(tmp47h, PW_MF089_F060); \
+  \
+  out7l = _mm_add_pi32(tmp4l, z3l); \
+  out7h = _mm_add_pi32(tmp4h, z3h); \
+  out1l = _mm_add_pi32(tmp7l, z4l); \
+  out1h = _mm_add_pi32(tmp7h, z4h); \
+  \
+  out7l = _mm_add_pi32(out7l, PD_DESCALE_P##PASS); \
+  out7h = _mm_add_pi32(out7h, PD_DESCALE_P##PASS); \
+  out7l = _mm_srai_pi32(out7l, DESCALE_P##PASS); \
+  out7h = _mm_srai_pi32(out7h, DESCALE_P##PASS); \
+  \
+  out1l = _mm_add_pi32(out1l, PD_DESCALE_P##PASS); \
+  out1h = _mm_add_pi32(out1h, PD_DESCALE_P##PASS); \
+  out1l = _mm_srai_pi32(out1l, DESCALE_P##PASS); \
+  out1h = _mm_srai_pi32(out1h, DESCALE_P##PASS); \
+  \
+  out7 = _mm_packs_pi32(out7l, out7h); \
+  out1 = _mm_packs_pi32(out1l, out1h); \
+  \
+  tmp56l = _mm_unpacklo_pi16(tmp5, tmp6); \
+  tmp56h = _mm_unpackhi_pi16(tmp5, tmp6); \
+  \
+  tmp5l = _mm_madd_pi16(tmp56l, PW_MF050_MF256); \
+  tmp5h = _mm_madd_pi16(tmp56h, PW_MF050_MF256); \
+  tmp6l = _mm_madd_pi16(tmp56l, PW_MF256_F050); \
+  tmp6h = _mm_madd_pi16(tmp56h, PW_MF256_F050); \
+  \
+  out5l = _mm_add_pi32(tmp5l, z4l); \
+  out5h = _mm_add_pi32(tmp5h, z4h); \
+  out3l = _mm_add_pi32(tmp6l, z3l); \
+  out3h = _mm_add_pi32(tmp6h, z3h); \
+  \
+  out5l = _mm_add_pi32(out5l, PD_DESCALE_P##PASS); \
+  out5h = _mm_add_pi32(out5h, PD_DESCALE_P##PASS); \
+  out5l = _mm_srai_pi32(out5l, DESCALE_P##PASS); \
+  out5h = _mm_srai_pi32(out5h, DESCALE_P##PASS); \
+  \
+  out3l = _mm_add_pi32(out3l, PD_DESCALE_P##PASS); \
+  out3h = _mm_add_pi32(out3h, PD_DESCALE_P##PASS); \
+  out3l = _mm_srai_pi32(out3l, DESCALE_P##PASS); \
+  out3h = _mm_srai_pi32(out3h, DESCALE_P##PASS); \
+  \
+  out5 = _mm_packs_pi32(out5l, out5h); \
+  out3 = _mm_packs_pi32(out3l, out3h); \
+}
+
+#define DO_FDCT_PASS1() { \
+  __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+  __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+  __m64 col0, col1, col2, col3, col4, col5, col6, col7; \
+  __m64 tmp10, tmp11; \
+  \
+  row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]);     /* (00 01 02 03) */ \
+  row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \
+  row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]);     /* (10 11 12 13) */ \
+  row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \
+  row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]);     /* (20 21 22 23) */ \
+  row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \
+  row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]);     /* (30 31 32 33) */ \
+  row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  row23a = _mm_unpacklo_pi16(row2l, row3l);   /* row23a=(20 30 21 31) */ \
+  row23b = _mm_unpackhi_pi16(row2l, row3l);   /* row23b=(22 32 23 33) */ \
+  row23c = _mm_unpacklo_pi16(row2h, row3h);   /* row23c=(24 34 25 35) */ \
+  row23d = _mm_unpackhi_pi16(row2h, row3h);   /* row23d=(26 36 27 37) */ \
+  \
+  row01a = _mm_unpacklo_pi16(row0l, row1l);   /* row01a=(00 10 01 11) */ \
+  row01b = _mm_unpackhi_pi16(row0l, row1l);   /* row01b=(02 12 03 13) */ \
+  row01c = _mm_unpacklo_pi16(row0h, row1h);   /* row01c=(04 14 05 15) */ \
+  row01d = _mm_unpackhi_pi16(row0h, row1h);   /* row01d=(06 16 07 17) */ \
+  \
+  col0 = _mm_unpacklo_pi32(row01a, row23a);   /* col0=(00 10 20 30) */ \
+  col1 = _mm_unpackhi_pi32(row01a, row23a);   /* col1=(01 11 21 31) */ \
+  col6 = _mm_unpacklo_pi32(row01d, row23d);   /* col6=(06 16 26 36) */ \
+  col7 = _mm_unpackhi_pi32(row01d, row23d);   /* col7=(07 17 27 37) */ \
+  \
+  tmp6 = _mm_sub_pi16(col1, col6);            /* tmp6=col1-col6 */ \
+  tmp7 = _mm_sub_pi16(col0, col7);            /* tmp7=col0-col7 */ \
+  tmp1 = _mm_add_pi16(col1, col6);            /* tmp1=col1+col6 */ \
+  tmp0 = _mm_add_pi16(col0, col7);            /* tmp0=col0+col7 */ \
+  \
+  col2 = _mm_unpacklo_pi32(row01b, row23b);   /* col2=(02 12 22 32) */ \
+  col3 = _mm_unpackhi_pi32(row01b, row23b);   /* col3=(03 13 23 33) */ \
+  col4 = _mm_unpacklo_pi32(row01c, row23c);   /* col4=(04 14 24 34) */ \
+  col5 = _mm_unpackhi_pi32(row01c, row23c);   /* col5=(05 15 25 35) */ \
+  \
+  tmp3 = _mm_add_pi16(col3, col4);            /* tmp3=col3+col4 */ \
+  tmp2 = _mm_add_pi16(col2, col5);            /* tmp2=col2+col5 */ \
+  tmp4 = _mm_sub_pi16(col3, col4);            /* tmp4=col3-col4 */ \
+  tmp5 = _mm_sub_pi16(col2, col5);            /* tmp5=col2-col5 */ \
+  \
+  /* Even part */ \
+  \
+  tmp10 = _mm_add_pi16(tmp0, tmp3);           /* tmp10=tmp0+tmp3 */ \
+  tmp13 = _mm_sub_pi16(tmp0, tmp3);           /* tmp13=tmp0-tmp3 */ \
+  tmp11 = _mm_add_pi16(tmp1, tmp2);           /* tmp11=tmp1+tmp2 */ \
+  tmp12 = _mm_sub_pi16(tmp1, tmp2);           /* tmp12=tmp1-tmp2 */ \
+  \
+  out0 = _mm_add_pi16(tmp10, tmp11);          /* out0=tmp10+tmp11 */ \
+  out4 = _mm_sub_pi16(tmp10, tmp11);          /* out4=tmp10-tmp11 */ \
+  out0 = _mm_slli_pi16(out0, PASS1_BITS); \
+  out4 = _mm_slli_pi16(out4, PASS1_BITS); \
+  \
+  DO_FDCT_COMMON(1) \
+  \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4], out4); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4], out5); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4], out6); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4], out7); \
+}
+
+#define DO_FDCT_PASS2() { \
+  __m64 col0l, col0h, col1l, col1h, col2l, col2h, col3l, col3h; \
+  __m64 col01a, col01b, col01c, col01d, col23a, col23b, col23c, col23d; \
+  __m64 row0, row1, row2, row3, row4, row5, row6, row7; \
+  __m64 tmp10, tmp11; \
+  \
+  col0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]);  /* (00 10 20 30) */ \
+  col1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]);  /* (01 11 21 31) */ \
+  col2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]);  /* (02 12 22 32) */ \
+  col3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]);  /* (03 13 23 33) */ \
+  col0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 4]);  /* (40 50 60 70) */ \
+  col1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 5]);  /* (41 51 61 71) */ \
+  col2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 6]);  /* (42 52 62 72) */ \
+  col3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 7]);  /* (43 53 63 73) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  col23a = _mm_unpacklo_pi16(col2l, col3l);   /* col23a=(02 03 12 13) */ \
+  col23b = _mm_unpackhi_pi16(col2l, col3l);   /* col23b=(22 23 32 33) */ \
+  col23c = _mm_unpacklo_pi16(col2h, col3h);   /* col23c=(42 43 52 53) */ \
+  col23d = _mm_unpackhi_pi16(col2h, col3h);   /* col23d=(62 63 72 73) */ \
+  \
+  col01a = _mm_unpacklo_pi16(col0l, col1l);   /* col01a=(00 01 10 11) */ \
+  col01b = _mm_unpackhi_pi16(col0l, col1l);   /* col01b=(20 21 30 31) */ \
+  col01c = _mm_unpacklo_pi16(col0h, col1h);   /* col01c=(40 41 50 51) */ \
+  col01d = _mm_unpackhi_pi16(col0h, col1h);   /* col01d=(60 61 70 71) */ \
+  \
+  row0 = _mm_unpacklo_pi32(col01a, col23a);   /* row0=(00 01 02 03) */ \
+  row1 = _mm_unpackhi_pi32(col01a, col23a);   /* row1=(10 11 12 13) */ \
+  row6 = _mm_unpacklo_pi32(col01d, col23d);   /* row6=(60 61 62 63) */ \
+  row7 = _mm_unpackhi_pi32(col01d, col23d);   /* row7=(70 71 72 73) */ \
+  \
+  tmp6 = _mm_sub_pi16(row1, row6);            /* tmp6=row1-row6 */ \
+  tmp7 = _mm_sub_pi16(row0, row7);            /* tmp7=row0-row7 */ \
+  tmp1 = _mm_add_pi16(row1, row6);            /* tmp1=row1+row6 */ \
+  tmp0 = _mm_add_pi16(row0, row7);            /* tmp0=row0+row7 */ \
+  \
+  row2 = _mm_unpacklo_pi32(col01b, col23b);   /* row2=(20 21 22 23) */ \
+  row3 = _mm_unpackhi_pi32(col01b, col23b);   /* row3=(30 31 32 33) */ \
+  row4 = _mm_unpacklo_pi32(col01c, col23c);   /* row4=(40 41 42 43) */ \
+  row5 = _mm_unpackhi_pi32(col01c, col23c);   /* row5=(50 51 52 53) */ \
+  \
+  tmp3 = _mm_add_pi16(row3, row4);            /* tmp3=row3+row4 */ \
+  tmp2 = _mm_add_pi16(row2, row5);            /* tmp2=row2+row5 */ \
+  tmp4 = _mm_sub_pi16(row3, row4);            /* tmp4=row3-row4 */ \
+  tmp5 = _mm_sub_pi16(row2, row5);            /* tmp5=row2-row5 */ \
+  \
+  /* Even part */ \
+  \
+  tmp10 = _mm_add_pi16(tmp0, tmp3);           /* tmp10=tmp0+tmp3 */ \
+  tmp13 = _mm_sub_pi16(tmp0, tmp3);           /* tmp13=tmp0-tmp3 */ \
+  tmp11 = _mm_add_pi16(tmp1, tmp2);           /* tmp11=tmp1+tmp2 */ \
+  tmp12 = _mm_sub_pi16(tmp1, tmp2);           /* tmp12=tmp1-tmp2 */ \
+  \
+  out0 = _mm_add_pi16(tmp10, tmp11);          /* out0=tmp10+tmp11 */ \
+  out4 = _mm_sub_pi16(tmp10, tmp11);          /* out4=tmp10-tmp11 */ \
+  \
+  out0 = _mm_add_pi16(out0, PW_DESCALE_P2X); \
+  out4 = _mm_add_pi16(out4, PW_DESCALE_P2X); \
+  out0 = _mm_srai_pi16(out0, PASS1_BITS); \
+  out4 = _mm_srai_pi16(out4, PASS1_BITS); \
+  \
+  DO_FDCT_COMMON(2) \
+  \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 4], out4); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 5], out5); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 6], out6); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 7], out7); \
+}
+
+void jsimd_fdct_islow_mmi(DCTELEM *data)
+{
+  __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+  __m64 tmp12, tmp13;
+  DCTELEM *dataptr = data;
+
+  /* Pass 1: process rows. */
+
+  DO_FDCT_PASS1()
+  dataptr += DCTSIZE * 4;
+  DO_FDCT_PASS1()
+
+  /* Pass 2: process columns. */
+
+  dataptr = data;
+  DO_FDCT_PASS2()
+  dataptr += 4;
+  DO_FDCT_PASS2()
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jidctfst-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jidctfst-mmi.c
new file mode 100644
index 0000000000..503bb35a3c
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jidctfst-mmi.c
@@ -0,0 +1,395 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, 2018-2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  LiuQingfa <liuqingfa-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER INVERSE DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS  8
+#define PASS1_BITS  2
+
+#define FIX_1_082  ((short)277)                   /* FIX(1.082392200) */
+#define FIX_1_414  ((short)362)                   /* FIX(1.414213562) */
+#define FIX_1_847  ((short)473)                   /* FIX(1.847759065) */
+#define FIX_2_613  ((short)669)                   /* FIX(2.613125930) */
+#define FIX_1_613  ((short)(FIX_2_613 - 256 * 3)) /* FIX(2.613125930) - FIX(1) */
+
+#define PRE_MULTIPLY_SCALE_BITS  2
+#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+enum const_index {
+  index_PW_F1082,
+  index_PW_F1414,
+  index_PW_F1847,
+  index_PW_MF1613,
+  index_PB_CENTERJSAMP
+};
+
+static uint64_t const_value[] = {
+  _uint64_set1_pi16(FIX_1_082 << CONST_SHIFT),
+  _uint64_set1_pi16(FIX_1_414 << CONST_SHIFT),
+  _uint64_set1_pi16(FIX_1_847 << CONST_SHIFT),
+  _uint64_set1_pi16(-FIX_1_613 << CONST_SHIFT),
+  _uint64_set1_pi8(CENTERJSAMPLE)
+};
+
+#define PW_F1414        get_const_value(index_PW_F1414)
+#define PW_F1847        get_const_value(index_PW_F1847)
+#define PW_MF1613       get_const_value(index_PW_MF1613)
+#define PW_F1082        get_const_value(index_PW_F1082)
+#define PB_CENTERJSAMP  get_const_value(index_PB_CENTERJSAMP)
+
+
+#define test_m32_zero(mm32)  (!(*(uint32_t *)&mm32))
+#define test_m64_zero(mm64)  (!(*(uint64_t *)&mm64))
+
+
+#define DO_IDCT_COMMON() { \
+  tmp7 = _mm_add_pi16(z11, z13); \
+  \
+  tmp11 = _mm_sub_pi16(z11, z13); \
+  tmp11 = _mm_slli_pi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \
+  tmp11 = _mm_mulhi_pi16(tmp11, PW_F1414); \
+  \
+  tmp10 = _mm_slli_pi16(z12, PRE_MULTIPLY_SCALE_BITS); \
+  tmp12 = _mm_slli_pi16(z10, PRE_MULTIPLY_SCALE_BITS); \
+  \
+  /* To avoid overflow... \
+   * \
+   * (Original) \
+   * tmp12 = -2.613125930 * z10 + z5; \
+   * \
+   * (This implementation) \
+   * tmp12 = (-1.613125930 - 1) * z10 + z5; \
+   *       = -1.613125930 * z10 - z10 + z5; \
+   */ \
+  \
+  z5 = _mm_add_pi16(tmp10, tmp12); \
+  z5 = _mm_mulhi_pi16(z5, PW_F1847); \
+  \
+  tmp10 = _mm_mulhi_pi16(tmp10, PW_F1082); \
+  tmp10 = _mm_sub_pi16(tmp10, z5); \
+  tmp12 = _mm_mulhi_pi16(tmp12, PW_MF1613); \
+  tmp12 = _mm_sub_pi16(tmp12, z10); \
+  tmp12 = _mm_sub_pi16(tmp12, z10); \
+  tmp12 = _mm_sub_pi16(tmp12, z10); \
+  tmp12 = _mm_add_pi16(tmp12, z5); \
+  \
+  /* Final output stage */ \
+  \
+  tmp6 = _mm_sub_pi16(tmp12, tmp7); \
+  tmp5 = _mm_sub_pi16(tmp11, tmp6); \
+  tmp4 = _mm_add_pi16(tmp10, tmp5); \
+  \
+  out0 = _mm_add_pi16(tmp0, tmp7); \
+  out7 = _mm_sub_pi16(tmp0, tmp7); \
+  out1 = _mm_add_pi16(tmp1, tmp6); \
+  out6 = _mm_sub_pi16(tmp1, tmp6); \
+  \
+  out2 = _mm_add_pi16(tmp2, tmp5); \
+  out5 = _mm_sub_pi16(tmp2, tmp5); \
+  out4 = _mm_add_pi16(tmp3, tmp4); \
+  out3 = _mm_sub_pi16(tmp3, tmp4); \
+}
+
+#define DO_IDCT_PASS1(iter) { \
+  __m64 col0l, col1l, col2l, col3l, col4l, col5l, col6l, col7l; \
+  __m64 quant0l, quant1l, quant2l, quant3l; \
+  __m64 quant4l, quant5l, quant6l, quant7l; \
+  __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+  __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+  __m32 col0a, col1a, mm0; \
+  \
+  col0a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 1]); \
+  col1a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 2]); \
+  mm0 = _mm_or_si32(col0a, col1a); \
+  \
+  if (test_m32_zero(mm0)) { \
+    __m64 mm1, mm2; \
+    \
+    col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); \
+    col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); \
+    col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); \
+    col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); \
+    col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); \
+    col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); \
+    col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); \
+    col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); \
+    \
+    mm1 = _mm_or_si64(col1l, col3l); \
+    mm2 = _mm_or_si64(col2l, col4l); \
+    mm1 = _mm_or_si64(mm1, col5l); \
+    mm2 = _mm_or_si64(mm2, col6l); \
+    mm1 = _mm_or_si64(mm1, col7l); \
+    mm1 = _mm_or_si64(mm1, mm2); \
+    \
+    if (test_m64_zero(mm1)) { \
+      __m64 dcval, dcvall, dcvalh, row0, row1, row2, row3; \
+      \
+      /* AC terms all zero */ \
+      \
+      quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+      \
+      dcval = _mm_mullo_pi16(col0l, quant0l);    /* dcval=(00 10 20 30) */ \
+      \
+      dcvall = _mm_unpacklo_pi16(dcval, dcval);  /* dcvall=(00 00 10 10) */ \
+      dcvalh = _mm_unpackhi_pi16(dcval, dcval);  /* dcvalh=(20 20 30 30) */ \
+      \
+      row0 = _mm_unpacklo_pi32(dcvall, dcvall);  /* row0=(00 00 00 00) */ \
+      row1 = _mm_unpackhi_pi32(dcvall, dcvall);  /* row1=(10 10 10 10) */ \
+      row2 = _mm_unpacklo_pi32(dcvalh, dcvalh);  /* row2=(20 20 20 20) */ \
+      row3 = _mm_unpackhi_pi32(dcvalh, dcvalh);  /* row3=(30 30 30 30) */ \
+      \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3); \
+      \
+      goto nextcolumn##iter; \
+    } \
+  } \
+  \
+  /* Even part */ \
+  \
+  col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]);  /* (00 10 20 30) */ \
+  col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]);  /* (02 12 22 32) */ \
+  col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]);  /* (04 14 24 34) */ \
+  col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]);  /* (06 16 26 36) */ \
+  \
+  quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+  quant2l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 2]); \
+  quant4l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 4]); \
+  quant6l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 6]); \
+  \
+  tmp0 = _mm_mullo_pi16(col0l, quant0l); \
+  tmp1 = _mm_mullo_pi16(col2l, quant2l); \
+  tmp2 = _mm_mullo_pi16(col4l, quant4l); \
+  tmp3 = _mm_mullo_pi16(col6l, quant6l); \
+  \
+  tmp10 = _mm_add_pi16(tmp0, tmp2); \
+  tmp11 = _mm_sub_pi16(tmp0, tmp2); \
+  tmp13 = _mm_add_pi16(tmp1, tmp3); \
+  \
+  tmp12 = _mm_sub_pi16(tmp1, tmp3); \
+  tmp12 = _mm_slli_pi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \
+  tmp12 = _mm_mulhi_pi16(tmp12, PW_F1414); \
+  tmp12 = _mm_sub_pi16(tmp12, tmp13); \
+  \
+  tmp0 = _mm_add_pi16(tmp10, tmp13); \
+  tmp3 = _mm_sub_pi16(tmp10, tmp13); \
+  tmp1 = _mm_add_pi16(tmp11, tmp12); \
+  tmp2 = _mm_sub_pi16(tmp11, tmp12); \
+  \
+  /* Odd part */ \
+  \
+  col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]);  /* (01 11 21 31) */ \
+  col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]);  /* (03 13 23 33) */ \
+  col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]);  /* (05 15 25 35) */ \
+  col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]);  /* (07 17 27 37) */ \
+  \
+  quant1l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 1]); \
+  quant3l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 3]); \
+  quant5l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 5]); \
+  quant7l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 7]); \
+  \
+  tmp4 = _mm_mullo_pi16(col1l, quant1l); \
+  tmp5 = _mm_mullo_pi16(col3l, quant3l); \
+  tmp6 = _mm_mullo_pi16(col5l, quant5l); \
+  tmp7 = _mm_mullo_pi16(col7l, quant7l); \
+  \
+  z13 = _mm_add_pi16(tmp6, tmp5); \
+  z10 = _mm_sub_pi16(tmp6, tmp5); \
+  z11 = _mm_add_pi16(tmp4, tmp7); \
+  z12 = _mm_sub_pi16(tmp4, tmp7); \
+  \
+  DO_IDCT_COMMON() \
+  \
+  /* out0=(00 10 20 30), out1=(01 11 21 31) */ \
+  /* out2=(02 12 22 32), out3=(03 13 23 33) */ \
+  /* out4=(04 14 24 34), out5=(05 15 25 35) */ \
+  /* out6=(06 16 26 36), out7=(07 17 27 37) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  row01a = _mm_unpacklo_pi16(out0, out1);     /* row01a=(00 01 10 11) */ \
+  row23a = _mm_unpackhi_pi16(out0, out1);     /* row23a=(20 21 30 31) */ \
+  row01d = _mm_unpacklo_pi16(out6, out7);     /* row01d=(06 07 16 17) */ \
+  row23d = _mm_unpackhi_pi16(out6, out7);     /* row23d=(26 27 36 37) */ \
+  \
+  row01b = _mm_unpacklo_pi16(out2, out3);     /* row01b=(02 03 12 13) */ \
+  row23b = _mm_unpackhi_pi16(out2, out3);     /* row23b=(22 23 32 33) */ \
+  row01c = _mm_unpacklo_pi16(out4, out5);     /* row01c=(04 05 14 15) */ \
+  row23c = _mm_unpackhi_pi16(out4, out5);     /* row23c=(24 25 34 35) */ \
+  \
+  row0l = _mm_unpacklo_pi32(row01a, row01b);  /* row0l=(00 01 02 03) */ \
+  row1l = _mm_unpackhi_pi32(row01a, row01b);  /* row1l=(10 11 12 13) */ \
+  row2l = _mm_unpacklo_pi32(row23a, row23b);  /* row2l=(20 21 22 23) */ \
+  row3l = _mm_unpackhi_pi32(row23a, row23b);  /* row3l=(30 31 32 33) */ \
+  \
+  row0h = _mm_unpacklo_pi32(row01c, row01d);  /* row0h=(04 05 06 07) */ \
+  row1h = _mm_unpackhi_pi32(row01c, row01d);  /* row1h=(14 15 16 17) */ \
+  row2h = _mm_unpacklo_pi32(row23c, row23d);  /* row2h=(24 25 26 27) */ \
+  row3h = _mm_unpackhi_pi32(row23c, row23d);  /* row3h=(34 35 36 37) */ \
+  \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3h); \
+}
+
+#define DO_IDCT_PASS2(ctr) { \
+  __m64 row0l, row1l, row2l, row3l, row4l, row5l, row6l, row7l; \
+  __m64 col0123a, col0123b, col0123c, col0123d; \
+  __m64 col01l, col01h, col23l, col23h; \
+  __m64 col0, col1, col2, col3; \
+  __m64 row06, row17, row24, row35; \
+  \
+  row0l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 0]);  /* (00 01 02 03) */ \
+  row1l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 1]);  /* (10 11 12 13) */ \
+  row2l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 2]);  /* (20 21 22 23) */ \
+  row3l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 3]);  /* (30 31 32 33) */ \
+  row4l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 4]);  /* (40 41 42 43) */ \
+  row5l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 5]);  /* (50 51 52 53) */ \
+  row6l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 6]);  /* (60 61 62 63) */ \
+  row7l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 7]);  /* (70 71 72 73) */ \
+  \
+  /* Even part */ \
+  \
+  tmp10 = _mm_add_pi16(row0l, row4l); \
+  tmp11 = _mm_sub_pi16(row0l, row4l); \
+  tmp13 = _mm_add_pi16(row2l, row6l); \
+  \
+  tmp12 = _mm_sub_pi16(row2l, row6l); \
+  tmp12 = _mm_slli_pi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \
+  tmp12 = _mm_mulhi_pi16(tmp12, PW_F1414); \
+  tmp12 = _mm_sub_pi16(tmp12, tmp13); \
+  \
+  tmp0 = _mm_add_pi16(tmp10, tmp13); \
+  tmp3 = _mm_sub_pi16(tmp10, tmp13); \
+  tmp1 = _mm_add_pi16(tmp11, tmp12); \
+  tmp2 = _mm_sub_pi16(tmp11, tmp12); \
+  \
+  /* Odd part */ \
+  \
+  z13 = _mm_add_pi16(row5l, row3l); \
+  z10 = _mm_sub_pi16(row5l, row3l); \
+  z11 = _mm_add_pi16(row1l, row7l); \
+  z12 = _mm_sub_pi16(row1l, row7l); \
+  \
+  DO_IDCT_COMMON() \
+  \
+  /* out0=(00 01 02 03), out1=(10 11 12 13) */ \
+  /* out2=(20 21 22 23), out3=(30 31 32 33) */ \
+  /* out4=(40 41 42 43), out5=(50 51 52 53) */ \
+  /* out6=(60 61 62 63), out7=(70 71 72 73) */ \
+  \
+  out0 = _mm_srai_pi16(out0, PASS1_BITS + 3); \
+  out1 = _mm_srai_pi16(out1, PASS1_BITS + 3); \
+  out2 = _mm_srai_pi16(out2, PASS1_BITS + 3); \
+  out3 = _mm_srai_pi16(out3, PASS1_BITS + 3); \
+  out4 = _mm_srai_pi16(out4, PASS1_BITS + 3); \
+  out5 = _mm_srai_pi16(out5, PASS1_BITS + 3); \
+  out6 = _mm_srai_pi16(out6, PASS1_BITS + 3); \
+  out7 = _mm_srai_pi16(out7, PASS1_BITS + 3); \
+  \
+  row06 = _mm_packs_pi16(out0, out6);  /* row06=(00 01 02 03 60 61 62 63) */ \
+  row17 = _mm_packs_pi16(out1, out7);  /* row17=(10 11 12 13 70 71 72 73) */ \
+  row24 = _mm_packs_pi16(out2, out4);  /* row24=(20 21 22 23 40 41 42 43) */ \
+  row35 = _mm_packs_pi16(out3, out5);  /* row35=(30 31 32 33 50 51 52 53) */ \
+  \
+  row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \
+  row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \
+  row24 = _mm_add_pi8(row24, PB_CENTERJSAMP); \
+  row35 = _mm_add_pi8(row35, PB_CENTERJSAMP); \
+  \
+  /* Transpose coefficients */ \
+  \
+  col0123a = _mm_unpacklo_pi8(row06, row17);  /* col0123a=(00 10 01 11 02 12 03 13) */ \
+  col0123d = _mm_unpackhi_pi8(row06, row17);  /* col0123d=(60 70 61 71 62 72 63 73) */ \
+  col0123b = _mm_unpacklo_pi8(row24, row35);  /* col0123b=(20 30 21 31 22 32 23 33) */ \
+  col0123c = _mm_unpackhi_pi8(row24, row35);  /* col0123c=(40 50 41 51 42 52 43 53) */ \
+  \
+  col01l = _mm_unpacklo_pi16(col0123a, col0123b);  /* col01l=(00 10 20 30 01 11 21 31) */ \
+  col23l = _mm_unpackhi_pi16(col0123a, col0123b);  /* col23l=(02 12 22 32 03 13 23 33) */ \
+  col01h = _mm_unpacklo_pi16(col0123c, col0123d);  /* col01h=(40 50 60 70 41 51 61 71) */ \
+  col23h = _mm_unpackhi_pi16(col0123c, col0123d);  /* col23h=(42 52 62 72 43 53 63 73) */ \
+  \
+  col0 = _mm_unpacklo_pi32(col01l, col01h);   /* col0=(00 10 20 30 40 50 60 70) */ \
+  col1 = _mm_unpackhi_pi32(col01l, col01h);   /* col1=(01 11 21 31 41 51 61 71) */ \
+  col2 = _mm_unpacklo_pi32(col23l, col23h);   /* col2=(02 12 22 32 42 52 62 72) */ \
+  col3 = _mm_unpackhi_pi32(col23l, col23h);   /* col3=(03 13 23 33 43 53 63 73) */ \
+  \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 2] + output_col), col2); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 3] + output_col), col3); \
+}
+
+void jsimd_idct_ifast_mmi(void *dct_table, JCOEFPTR coef_block,
+                          JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m64 tmp10, tmp11, tmp12, tmp13;
+  __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+  __m64 z5, z10, z11, z12, z13;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE *quantptr;
+  JCOEF *wsptr;
+  JCOEF workspace[DCTSIZE2];  /* buffers data between passes */
+
+  /* Pass 1: process columns. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *)dct_table;
+  wsptr = workspace;
+
+  DO_IDCT_PASS1(1)
+nextcolumn1:
+  inptr += 4;
+  quantptr += 4;
+  wsptr += DCTSIZE * 4;
+  DO_IDCT_PASS1(2)
+nextcolumn2:
+
+  /* Pass 2: process rows. */
+
+  wsptr = workspace;
+
+  DO_IDCT_PASS2(0)
+  wsptr += 4;
+  DO_IDCT_PASS2(4)
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jidctint-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jidctint-mmi.c
new file mode 100644
index 0000000000..cd3db980c5
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jidctint-mmi.c
@@ -0,0 +1,571 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, 2018, 2020, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* ACCUATE INTEGER INVERSE DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+#define CENTERJSAMPLE  128
+
+#define FIX_0_298  ((short)2446)  /* FIX(0.298631336) */
+#define FIX_0_390  ((short)3196)  /* FIX(0.390180644) */
+#define FIX_0_899  ((short)7373)  /* FIX(0.899976223) */
+#define FIX_0_541  ((short)4433)  /* FIX(0.541196100) */
+#define FIX_0_765  ((short)6270)  /* FIX(0.765366865) */
+#define FIX_1_175  ((short)9633)  /* FIX(1.175875602) */
+#define FIX_1_501  ((short)12299) /* FIX(1.501321110) */
+#define FIX_1_847  ((short)15137) /* FIX(1.847759065) */
+#define FIX_1_961  ((short)16069) /* FIX(1.961570560) */
+#define FIX_2_053  ((short)16819) /* FIX(2.053119869) */
+#define FIX_2_562  ((short)20995) /* FIX(2.562915447) */
+#define FIX_3_072  ((short)25172) /* FIX(3.072711026) */
+
+enum const_index {
+  index_PW_F130_F054,
+  index_PW_F054_MF130,
+  index_PW_MF078_F117,
+  index_PW_F117_F078,
+  index_PW_MF060_MF089,
+  index_PW_MF089_F060,
+  index_PW_MF050_MF256,
+  index_PW_MF256_F050,
+  index_PD_DESCALE_P1,
+  index_PD_DESCALE_P2,
+  index_PB_CENTERJSAMP
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi16(FIX_0_541, (FIX_0_541 + FIX_0_765),
+                   FIX_0_541, (FIX_0_541 + FIX_0_765)),
+  _uint64_set_pi16((FIX_0_541 - FIX_1_847), FIX_0_541,
+                   (FIX_0_541 - FIX_1_847), FIX_0_541),
+  _uint64_set_pi16(FIX_1_175, (FIX_1_175 - FIX_1_961),
+                   FIX_1_175, (FIX_1_175 - FIX_1_961)),
+  _uint64_set_pi16((FIX_1_175 - FIX_0_390), FIX_1_175,
+                   (FIX_1_175 - FIX_0_390), FIX_1_175),
+  _uint64_set_pi16(-FIX_0_899, (FIX_0_298 - FIX_0_899),
+                   -FIX_0_899, (FIX_0_298 - FIX_0_899)),
+  _uint64_set_pi16((FIX_1_501 - FIX_0_899), -FIX_0_899,
+                   (FIX_1_501 - FIX_0_899), -FIX_0_899),
+  _uint64_set_pi16(-FIX_2_562, (FIX_2_053 - FIX_2_562),
+                   -FIX_2_562, (FIX_2_053 - FIX_2_562)),
+  _uint64_set_pi16((FIX_3_072 - FIX_2_562), -FIX_2_562,
+                   (FIX_3_072 - FIX_2_562), -FIX_2_562),
+  _uint64_set_pi32((1 << (DESCALE_P1 - 1)), (1 << (DESCALE_P1 - 1))),
+  _uint64_set_pi32((1 << (DESCALE_P2 - 1)), (1 << (DESCALE_P2 - 1))),
+  _uint64_set_pi8(CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE,
+                  CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE)
+};
+
+#define PW_F130_F054    get_const_value(index_PW_F130_F054)
+#define PW_F054_MF130   get_const_value(index_PW_F054_MF130)
+#define PW_MF078_F117   get_const_value(index_PW_MF078_F117)
+#define PW_F117_F078    get_const_value(index_PW_F117_F078)
+#define PW_MF060_MF089  get_const_value(index_PW_MF060_MF089)
+#define PW_MF089_F060   get_const_value(index_PW_MF089_F060)
+#define PW_MF050_MF256  get_const_value(index_PW_MF050_MF256)
+#define PW_MF256_F050   get_const_value(index_PW_MF256_F050)
+#define PD_DESCALE_P1   get_const_value(index_PD_DESCALE_P1)
+#define PD_DESCALE_P2   get_const_value(index_PD_DESCALE_P2)
+#define PB_CENTERJSAMP  get_const_value(index_PB_CENTERJSAMP)
+
+
+#define test_m32_zero(mm32)  (!(*(uint32_t *)&mm32))
+#define test_m64_zero(mm64)  (!(*(uint64_t *)&mm64))
+
+
+#define DO_IDCT_COMMON(PASS) { \
+  __m64 tmp0_3l, tmp0_3h, tmp1_2l, tmp1_2h; \
+  __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
+  __m64 z34l, z34h, z3l, z3h, z4l, z4h, z3, z4; \
+  __m64 out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h; \
+  __m64 out4l, out4h, out5l, out5h, out6l, out6h, out7l, out7h; \
+  \
+  z3 = _mm_add_pi16(tmp0, tmp2); \
+  z4 = _mm_add_pi16(tmp1, tmp3); \
+  \
+  /* (Original) \
+   * z5 = (z3 + z4) * 1.175875602; \
+   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644; \
+   * z3 += z5;  z4 += z5; \
+   * \
+   * (This implementation) \
+   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+   */ \
+  \
+  z34l = _mm_unpacklo_pi16(z3, z4); \
+  z34h = _mm_unpackhi_pi16(z3, z4); \
+  z3l = _mm_madd_pi16(z34l, PW_MF078_F117); \
+  z3h = _mm_madd_pi16(z34h, PW_MF078_F117); \
+  z4l = _mm_madd_pi16(z34l, PW_F117_F078); \
+  z4h = _mm_madd_pi16(z34h, PW_F117_F078); \
+  \
+  /* (Original) \
+   * z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2; \
+   * tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869; \
+   * tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110; \
+   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447; \
+   * tmp0 += z1 + z3;  tmp1 += z2 + z4; \
+   * tmp2 += z2 + z3;  tmp3 += z1 + z4; \
+   * \
+   * (This implementation) \
+   * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
+   * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
+   * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
+   * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
+   * tmp0 += z3;  tmp1 += z4; \
+   * tmp2 += z3;  tmp3 += z4; \
+   */ \
+  \
+  tmp0_3l = _mm_unpacklo_pi16(tmp0, tmp3); \
+  tmp0_3h = _mm_unpackhi_pi16(tmp0, tmp3); \
+  \
+  tmp0l = _mm_madd_pi16(tmp0_3l, PW_MF060_MF089); \
+  tmp0h = _mm_madd_pi16(tmp0_3h, PW_MF060_MF089); \
+  tmp3l = _mm_madd_pi16(tmp0_3l, PW_MF089_F060); \
+  tmp3h = _mm_madd_pi16(tmp0_3h, PW_MF089_F060); \
+  \
+  tmp0l = _mm_add_pi32(tmp0l, z3l); \
+  tmp0h = _mm_add_pi32(tmp0h, z3h); \
+  tmp3l = _mm_add_pi32(tmp3l, z4l); \
+  tmp3h = _mm_add_pi32(tmp3h, z4h); \
+  \
+  tmp1_2l = _mm_unpacklo_pi16(tmp1, tmp2); \
+  tmp1_2h = _mm_unpackhi_pi16(tmp1, tmp2); \
+  \
+  tmp1l = _mm_madd_pi16(tmp1_2l, PW_MF050_MF256); \
+  tmp1h = _mm_madd_pi16(tmp1_2h, PW_MF050_MF256); \
+  tmp2l = _mm_madd_pi16(tmp1_2l, PW_MF256_F050); \
+  tmp2h = _mm_madd_pi16(tmp1_2h, PW_MF256_F050); \
+  \
+  tmp1l = _mm_add_pi32(tmp1l, z4l); \
+  tmp1h = _mm_add_pi32(tmp1h, z4h); \
+  tmp2l = _mm_add_pi32(tmp2l, z3l); \
+  tmp2h = _mm_add_pi32(tmp2h, z3h); \
+  \
+  /* Final output stage */ \
+  \
+  out0l = _mm_add_pi32(tmp10l, tmp3l); \
+  out0h = _mm_add_pi32(tmp10h, tmp3h); \
+  out7l = _mm_sub_pi32(tmp10l, tmp3l); \
+  out7h = _mm_sub_pi32(tmp10h, tmp3h); \
+  \
+  out0l = _mm_add_pi32(out0l, PD_DESCALE_P##PASS); \
+  out0h = _mm_add_pi32(out0h, PD_DESCALE_P##PASS); \
+  out0l = _mm_srai_pi32(out0l, DESCALE_P##PASS); \
+  out0h = _mm_srai_pi32(out0h, DESCALE_P##PASS); \
+  \
+  out7l = _mm_add_pi32(out7l, PD_DESCALE_P##PASS); \
+  out7h = _mm_add_pi32(out7h, PD_DESCALE_P##PASS); \
+  out7l = _mm_srai_pi32(out7l, DESCALE_P##PASS); \
+  out7h = _mm_srai_pi32(out7h, DESCALE_P##PASS); \
+  \
+  out0 = _mm_packs_pi32(out0l, out0h); \
+  out7 = _mm_packs_pi32(out7l, out7h); \
+  \
+  out1l = _mm_add_pi32(tmp11l, tmp2l); \
+  out1h = _mm_add_pi32(tmp11h, tmp2h); \
+  out6l = _mm_sub_pi32(tmp11l, tmp2l); \
+  out6h = _mm_sub_pi32(tmp11h, tmp2h); \
+  \
+  out1l = _mm_add_pi32(out1l, PD_DESCALE_P##PASS); \
+  out1h = _mm_add_pi32(out1h, PD_DESCALE_P##PASS); \
+  out1l = _mm_srai_pi32(out1l, DESCALE_P##PASS); \
+  out1h = _mm_srai_pi32(out1h, DESCALE_P##PASS); \
+  \
+  out6l = _mm_add_pi32(out6l, PD_DESCALE_P##PASS); \
+  out6h = _mm_add_pi32(out6h, PD_DESCALE_P##PASS); \
+  out6l = _mm_srai_pi32(out6l, DESCALE_P##PASS); \
+  out6h = _mm_srai_pi32(out6h, DESCALE_P##PASS); \
+  \
+  out1 = _mm_packs_pi32(out1l, out1h); \
+  out6 = _mm_packs_pi32(out6l, out6h); \
+  \
+  out2l = _mm_add_pi32(tmp12l, tmp1l); \
+  out2h = _mm_add_pi32(tmp12h, tmp1h); \
+  out5l = _mm_sub_pi32(tmp12l, tmp1l); \
+  out5h = _mm_sub_pi32(tmp12h, tmp1h); \
+  \
+  out2l = _mm_add_pi32(out2l, PD_DESCALE_P##PASS); \
+  out2h = _mm_add_pi32(out2h, PD_DESCALE_P##PASS); \
+  out2l = _mm_srai_pi32(out2l, DESCALE_P##PASS); \
+  out2h = _mm_srai_pi32(out2h, DESCALE_P##PASS); \
+  \
+  out5l = _mm_add_pi32(out5l, PD_DESCALE_P##PASS); \
+  out5h = _mm_add_pi32(out5h, PD_DESCALE_P##PASS); \
+  out5l = _mm_srai_pi32(out5l, DESCALE_P##PASS); \
+  out5h = _mm_srai_pi32(out5h, DESCALE_P##PASS); \
+  \
+  out2 = _mm_packs_pi32(out2l, out2h); \
+  out5 = _mm_packs_pi32(out5l, out5h); \
+  \
+  out3l = _mm_add_pi32(tmp13l, tmp0l); \
+  out3h = _mm_add_pi32(tmp13h, tmp0h); \
+  \
+  out4l = _mm_sub_pi32(tmp13l, tmp0l); \
+  out4h = _mm_sub_pi32(tmp13h, tmp0h); \
+  \
+  out3l = _mm_add_pi32(out3l, PD_DESCALE_P##PASS); \
+  out3h = _mm_add_pi32(out3h, PD_DESCALE_P##PASS); \
+  out3l = _mm_srai_pi32(out3l, DESCALE_P##PASS); \
+  out3h = _mm_srai_pi32(out3h, DESCALE_P##PASS); \
+  \
+  out4l = _mm_add_pi32(out4l, PD_DESCALE_P##PASS); \
+  out4h = _mm_add_pi32(out4h, PD_DESCALE_P##PASS); \
+  out4l = _mm_srai_pi32(out4l, DESCALE_P##PASS); \
+  out4h = _mm_srai_pi32(out4h, DESCALE_P##PASS); \
+  \
+  out3 = _mm_packs_pi32(out3l, out3h); \
+  out4 = _mm_packs_pi32(out4l, out4h); \
+}
+
+#define DO_IDCT_PASS1(iter) { \
+  __m64 col0l, col1l, col2l, col3l, col4l, col5l, col6l, col7l; \
+  __m64 quant0l, quant1l, quant2l, quant3l; \
+  __m64 quant4l, quant5l, quant6l, quant7l; \
+  __m64 z23, z2, z3, z23l, z23h; \
+  __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+  __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+  __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
+  __m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \
+  __m32 col0a, col1a, mm0; \
+  \
+  col0a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 1]); \
+  col1a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 2]); \
+  mm0 = _mm_or_si32(col0a, col1a); \
+  \
+  if (test_m32_zero(mm0)) { \
+    __m64 mm1, mm2; \
+    \
+    col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); \
+    col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); \
+    col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); \
+    col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); \
+    col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); \
+    col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); \
+    col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); \
+    col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); \
+    \
+    mm1 = _mm_or_si64(col1l, col3l); \
+    mm2 = _mm_or_si64(col2l, col4l); \
+    mm1 = _mm_or_si64(mm1, col5l); \
+    mm2 = _mm_or_si64(mm2, col6l); \
+    mm1 = _mm_or_si64(mm1, col7l); \
+    mm1 = _mm_or_si64(mm1, mm2); \
+    \
+    if (test_m64_zero(mm1)) { \
+      __m64 dcval, dcvall, dcvalh, row0, row1, row2, row3; \
+      \
+      /* AC terms all zero */ \
+      \
+      quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+      \
+      dcval = _mm_mullo_pi16(col0l, quant0l); \
+      dcval = _mm_slli_pi16(dcval, PASS1_BITS);  /* dcval=(00 10 20 30) */ \
+      \
+      dcvall = _mm_unpacklo_pi16(dcval, dcval);  /* dcvall=(00 00 10 10) */ \
+      dcvalh = _mm_unpackhi_pi16(dcval, dcval);  /* dcvalh=(20 20 30 30) */ \
+      \
+      row0 = _mm_unpacklo_pi32(dcvall, dcvall);  /* row0=(00 00 00 00) */ \
+      row1 = _mm_unpackhi_pi32(dcvall, dcvall);  /* row1=(10 10 10 10) */ \
+      row2 = _mm_unpacklo_pi32(dcvalh, dcvalh);  /* row2=(20 20 20 20) */ \
+      row3 = _mm_unpackhi_pi32(dcvalh, dcvalh);  /* row3=(30 30 30 30) */ \
+      \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3); \
+      \
+      goto nextcolumn##iter; \
+    } \
+  } \
+  \
+  /* Even part \
+   * \
+   * (Original) \
+   * z1 = (z2 + z3) * 0.541196100; \
+   * tmp2 = z1 + z3 * -1.847759065; \
+   * tmp3 = z1 + z2 * 0.765366865; \
+   * \
+   * (This implementation) \
+   * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
+   * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
+   */ \
+  \
+  col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]);  /* (00 10 20 30) */ \
+  col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]);  /* (02 12 22 32) */ \
+  col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]);  /* (04 14 24 34) */ \
+  col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]);  /* (06 16 26 36) */ \
+  \
+  quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+  quant2l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 2]); \
+  quant4l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 4]); \
+  quant6l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 6]); \
+  \
+  z2 = _mm_mullo_pi16(col2l, quant2l); \
+  z3 = _mm_mullo_pi16(col6l, quant6l); \
+  \
+  z23l = _mm_unpacklo_pi16(z2, z3); \
+  z23h = _mm_unpackhi_pi16(z2, z3); \
+  tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \
+  tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \
+  tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \
+  tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \
+  \
+  z2 = _mm_mullo_pi16(col0l, quant0l); \
+  z3 = _mm_mullo_pi16(col4l, quant4l); \
+  \
+  z23 = _mm_add_pi16(z2, z3); \
+  tmp0l = _mm_loadlo_pi16_f(z23); \
+  tmp0h = _mm_loadhi_pi16_f(z23); \
+  tmp0l = _mm_srai_pi32(tmp0l, (16 - CONST_BITS)); \
+  tmp0h = _mm_srai_pi32(tmp0h, (16 - CONST_BITS)); \
+  \
+  tmp10l = _mm_add_pi32(tmp0l, tmp3l); \
+  tmp10h = _mm_add_pi32(tmp0h, tmp3h); \
+  tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \
+  tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \
+  \
+  z23 = _mm_sub_pi16(z2, z3); \
+  tmp1l = _mm_loadlo_pi16_f(z23); \
+  tmp1h = _mm_loadhi_pi16_f(z23); \
+  tmp1l = _mm_srai_pi32(tmp1l, (16 - CONST_BITS)); \
+  tmp1h = _mm_srai_pi32(tmp1h, (16 - CONST_BITS)); \
+  \
+  tmp11l = _mm_add_pi32(tmp1l, tmp2l); \
+  tmp11h = _mm_add_pi32(tmp1h, tmp2h); \
+  tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \
+  tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \
+  \
+  /* Odd part */ \
+  \
+  col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]);  /* (01 11 21 31) */ \
+  col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]);  /* (03 13 23 33) */ \
+  col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]);  /* (05 15 25 35) */ \
+  col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]);  /* (07 17 27 37) */ \
+  \
+  quant1l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 1]); \
+  quant3l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 3]); \
+  quant5l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 5]); \
+  quant7l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 7]); \
+  \
+  tmp0 = _mm_mullo_pi16(col7l, quant7l); \
+  tmp1 = _mm_mullo_pi16(col5l, quant5l); \
+  tmp2 = _mm_mullo_pi16(col3l, quant3l); \
+  tmp3 = _mm_mullo_pi16(col1l, quant1l); \
+  \
+  DO_IDCT_COMMON(1) \
+  \
+  /* out0=(00 10 20 30), out1=(01 11 21 31) */ \
+  /* out2=(02 12 22 32), out3=(03 13 23 33) */ \
+  /* out4=(04 14 24 34), out5=(05 15 25 35) */ \
+  /* out6=(06 16 26 36), out7=(07 17 27 37) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  row01a = _mm_unpacklo_pi16(out0, out1);     /* row01a=(00 01 10 11) */ \
+  row23a = _mm_unpackhi_pi16(out0, out1);     /* row23a=(20 21 30 31) */ \
+  row01d = _mm_unpacklo_pi16(out6, out7);     /* row01d=(06 07 16 17) */ \
+  row23d = _mm_unpackhi_pi16(out6, out7);     /* row23d=(26 27 36 37) */ \
+  \
+  row01b = _mm_unpacklo_pi16(out2, out3);     /* row01b=(02 03 12 13) */ \
+  row23b = _mm_unpackhi_pi16(out2, out3);     /* row23b=(22 23 32 33) */ \
+  row01c = _mm_unpacklo_pi16(out4, out5);     /* row01c=(04 05 14 15) */ \
+  row23c = _mm_unpackhi_pi16(out4, out5);     /* row23c=(24 25 34 35) */ \
+  \
+  row0l = _mm_unpacklo_pi32(row01a, row01b);  /* row0l=(00 01 02 03) */ \
+  row1l = _mm_unpackhi_pi32(row01a, row01b);  /* row1l=(10 11 12 13) */ \
+  row2l = _mm_unpacklo_pi32(row23a, row23b);  /* row2l=(20 21 22 23) */ \
+  row3l = _mm_unpackhi_pi32(row23a, row23b);  /* row3l=(30 31 32 33) */ \
+  \
+  row0h = _mm_unpacklo_pi32(row01c, row01d);  /* row0h=(04 05 06 07) */ \
+  row1h = _mm_unpackhi_pi32(row01c, row01d);  /* row1h=(14 15 16 17) */ \
+  row2h = _mm_unpacklo_pi32(row23c, row23d);  /* row2h=(24 25 26 27) */ \
+  row3h = _mm_unpackhi_pi32(row23c, row23d);  /* row3h=(34 35 36 37) */ \
+  \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3h); \
+}
+
+#define DO_IDCT_PASS2(ctr) { \
+  __m64 row0l, row1l, row2l, row3l, row4l, row5l, row6l, row7l; \
+  __m64 z23, z23l, z23h; \
+  __m64 col0123a, col0123b, col0123c, col0123d; \
+  __m64 col01l, col01h, col23l, col23h, row06, row17, row24, row35; \
+  __m64 col0, col1, col2, col3; \
+  __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
+  __m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \
+  \
+  row0l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 0]);  /* (00 01 02 03) */ \
+  row1l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 1]);  /* (10 11 12 13) */ \
+  row2l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 2]);  /* (20 21 22 23) */ \
+  row3l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 3]);  /* (30 31 32 33) */ \
+  row4l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 4]);  /* (40 41 42 43) */ \
+  row5l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 5]);  /* (50 51 52 53) */ \
+  row6l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 6]);  /* (60 61 62 63) */ \
+  row7l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 7]);  /* (70 71 72 73) */ \
+  \
+  /* Even part \
+   * \
+   * (Original) \
+   * z1 = (z2 + z3) * 0.541196100; \
+   * tmp2 = z1 + z3 * -1.847759065; \
+   * tmp3 = z1 + z2 * 0.765366865; \
+   * \
+   * (This implementation) \
+   * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
+   * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
+   */ \
+  \
+  z23l = _mm_unpacklo_pi16(row2l, row6l); \
+  z23h = _mm_unpackhi_pi16(row2l, row6l); \
+  \
+  tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \
+  tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \
+  tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \
+  tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \
+  \
+  z23 = _mm_add_pi16(row0l, row4l); \
+  tmp0l = _mm_loadlo_pi16_f(z23); \
+  tmp0h = _mm_loadhi_pi16_f(z23); \
+  tmp0l = _mm_srai_pi32(tmp0l, (16 - CONST_BITS)); \
+  tmp0h = _mm_srai_pi32(tmp0h, (16 - CONST_BITS)); \
+  \
+  tmp10l = _mm_add_pi32(tmp0l, tmp3l); \
+  tmp10h = _mm_add_pi32(tmp0h, tmp3h); \
+  tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \
+  tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \
+  \
+  z23 = _mm_sub_pi16(row0l, row4l); \
+  tmp1l = _mm_loadlo_pi16_f(z23); \
+  tmp1h = _mm_loadhi_pi16_f(z23); \
+  tmp1l = _mm_srai_pi32(tmp1l, (16 - CONST_BITS)); \
+  tmp1h = _mm_srai_pi32(tmp1h, (16 - CONST_BITS)); \
+  \
+  tmp11l = _mm_add_pi32(tmp1l, tmp2l); \
+  tmp11h = _mm_add_pi32(tmp1h, tmp2h); \
+  tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \
+  tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \
+  \
+  /* Odd part */ \
+  \
+  tmp0 = row7l; \
+  tmp1 = row5l; \
+  tmp2 = row3l; \
+  tmp3 = row1l; \
+  \
+  DO_IDCT_COMMON(2) \
+  \
+  /* out0=(00 01 02 03), out1=(10 11 12 13) */ \
+  /* out2=(20 21 22 23), out3=(30 31 32 33) */ \
+  /* out4=(40 41 42 43), out5=(50 51 52 53) */ \
+  /* out6=(60 61 62 63), out7=(70 71 72 73) */ \
+  \
+  row06 = _mm_packs_pi16(out0, out6);  /* row06=(00 01 02 03 60 61 62 63) */ \
+  row17 = _mm_packs_pi16(out1, out7);  /* row17=(10 11 12 13 70 71 72 73) */ \
+  row24 = _mm_packs_pi16(out2, out4);  /* row24=(20 21 22 23 40 41 42 43) */ \
+  row35 = _mm_packs_pi16(out3, out5);  /* row35=(30 31 32 33 50 51 52 53) */ \
+  \
+  row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \
+  row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \
+  row24 = _mm_add_pi8(row24, PB_CENTERJSAMP); \
+  row35 = _mm_add_pi8(row35, PB_CENTERJSAMP); \
+  \
+  /* Transpose coefficients */ \
+  \
+  col0123a = _mm_unpacklo_pi8(row06, row17);  /* col0123a=(00 10 01 11 02 12 03 13) */ \
+  col0123d = _mm_unpackhi_pi8(row06, row17);  /* col0123d=(60 70 61 71 62 72 63 73) */ \
+  col0123b = _mm_unpacklo_pi8(row24, row35);  /* col0123b=(20 30 21 31 22 32 23 33) */ \
+  col0123c = _mm_unpackhi_pi8(row24, row35);  /* col0123c=(40 50 41 51 42 52 43 53) */ \
+  \
+  col01l = _mm_unpacklo_pi16(col0123a, col0123b);  /* col01l=(00 10 20 30 01 11 21 31) */ \
+  col23l = _mm_unpackhi_pi16(col0123a, col0123b);  /* col23l=(02 12 22 32 03 13 23 33) */ \
+  col01h = _mm_unpacklo_pi16(col0123c, col0123d);  /* col01h=(40 50 60 70 41 51 61 71) */ \
+  col23h = _mm_unpackhi_pi16(col0123c, col0123d);  /* col23h=(42 52 62 72 43 53 63 73) */ \
+  \
+  col0 = _mm_unpacklo_pi32(col01l, col01h);   /* col0=(00 10 20 30 40 50 60 70) */ \
+  col1 = _mm_unpackhi_pi32(col01l, col01h);   /* col1=(01 11 21 31 41 51 61 71) */ \
+  col2 = _mm_unpacklo_pi32(col23l, col23h);   /* col2=(02 12 22 32 42 52 62 72) */ \
+  col3 = _mm_unpackhi_pi32(col23l, col23h);   /* col3=(03 13 23 33 43 53 63 73) */ \
+  \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 2] + output_col), col2); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 3] + output_col), col3); \
+}
+
+void jsimd_idct_islow_mmi(void *dct_table, JCOEFPTR coef_block,
+                          JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  __m64 tmp0, tmp1, tmp2, tmp3;
+  __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE *quantptr;
+  JCOEF *wsptr;
+  JCOEF workspace[DCTSIZE2];  /* buffers data between passes */
+
+  /* Pass 1: process columns. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *)dct_table;
+  wsptr = workspace;
+
+  DO_IDCT_PASS1(1)
+nextcolumn1:
+  inptr += 4;
+  quantptr += 4;
+  wsptr += DCTSIZE * 4;
+  DO_IDCT_PASS1(2)
+nextcolumn2:
+
+  /* Pass 2: process rows. */
+
+  wsptr = workspace;
+
+  DO_IDCT_PASS2(0)
+  wsptr += 4;
+  DO_IDCT_PASS2(4)
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jquanti-mmi.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jquanti-mmi.c
new file mode 100644
index 0000000000..339002fd80
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jquanti-mmi.c
@@ -0,0 +1,124 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ * Copyright (C) 2018-2019, D. R. Commander.  All Rights Reserved.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define DO_QUANT() { \
+  __m64 rowl, rowh, rowls, rowhs, rowlsave, rowhsave; \
+  __m64 corrl, corrh, recipl, reciph, scalel, scaleh; \
+  \
+  rowl = _mm_load_si64((__m64 *)&workspace[0]); \
+  rowh = _mm_load_si64((__m64 *)&workspace[4]); \
+  \
+  /* Branch-less absolute value */ \
+  rowls = _mm_srai_pi16(rowl, (WORD_BIT - 1));  /* -1 if value < 0, */ \
+                                                /* 0 otherwise */ \
+  rowhs = _mm_srai_pi16(rowh, (WORD_BIT - 1)); \
+  \
+  rowl = _mm_xor_si64(rowl, rowls);           /* val = -val */ \
+  rowh = _mm_xor_si64(rowh, rowhs); \
+  rowl = _mm_sub_pi16(rowl, rowls); \
+  rowh = _mm_sub_pi16(rowh, rowhs); \
+  \
+  corrl = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1]);  /* correction */ \
+  corrh = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1 + 4]); \
+  \
+  rowlsave = rowl = _mm_add_pi16(rowl, corrl);  /* correction + roundfactor */ \
+  rowhsave = rowh = _mm_add_pi16(rowh, corrh); \
+  \
+  recipl = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0]);  /* reciprocal */ \
+  reciph = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0 + 4]); \
+  \
+  rowl = _mm_mulhi_pi16(rowl, recipl); \
+  rowh = _mm_mulhi_pi16(rowh, reciph); \
+  \
+  /* reciprocal is always negative (MSB=1), so we always need to add the */ \
+  /* initial value (input value is never negative as we inverted it at the */ \
+  /* start of this routine) */ \
+  rowlsave = rowl = _mm_add_pi16(rowl, rowlsave); \
+  rowhsave = rowh = _mm_add_pi16(rowh, rowhsave); \
+  \
+  scalel = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2]);  /* scale */ \
+  scaleh = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2 + 4]); \
+  \
+  rowl = _mm_mulhi_pi16(rowl, scalel); \
+  rowh = _mm_mulhi_pi16(rowh, scaleh); \
+  \
+  /* determine if scale is negative */ \
+  scalel = _mm_srai_pi16(scalel, (WORD_BIT - 1)); \
+  scaleh = _mm_srai_pi16(scaleh, (WORD_BIT - 1)); \
+  \
+  /* and add input if it is */ \
+  scalel = _mm_and_si64(scalel, rowlsave); \
+  scaleh = _mm_and_si64(scaleh, rowhsave); \
+  rowl = _mm_add_pi16(rowl, scalel); \
+  rowh = _mm_add_pi16(rowh, scaleh); \
+  \
+  /* then check if negative input */ \
+  rowlsave = _mm_srai_pi16(rowlsave, (WORD_BIT - 1)); \
+  rowhsave = _mm_srai_pi16(rowhsave, (WORD_BIT - 1)); \
+  \
+  /* and add scale if it is */ \
+  rowlsave = _mm_and_si64(rowlsave, scalel); \
+  rowhsave = _mm_and_si64(rowhsave, scaleh); \
+  rowl = _mm_add_pi16(rowl, rowlsave); \
+  rowh = _mm_add_pi16(rowh, rowhsave); \
+  \
+  rowl = _mm_xor_si64(rowl, rowls);           /* val = -val */ \
+  rowh = _mm_xor_si64(rowh, rowhs); \
+  rowl = _mm_sub_pi16(rowl, rowls); \
+  rowh = _mm_sub_pi16(rowh, rowhs); \
+  \
+  _mm_store_si64((__m64 *)&output_ptr[0], rowl); \
+  _mm_store_si64((__m64 *)&output_ptr[4], rowh); \
+  \
+  workspace += DCTSIZE; \
+  divisors += DCTSIZE; \
+  output_ptr += DCTSIZE; \
+}
+
+
+void jsimd_quantize_mmi(JCOEFPTR coef_block, DCTELEM *divisors,
+                        DCTELEM *workspace)
+{
+  JCOEFPTR output_ptr = coef_block;
+
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jsimd.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jsimd.c
new file mode 100644
index 0000000000..e8f1af562b
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jsimd.c
@@ -0,0 +1,870 @@
+/*
+ * jsimd_mips64.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ * Copyright (C) 2015, 2018, Matthieu Darbois.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 64-bit MIPS architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+static unsigned int simd_support = ~0;
+
+#if defined(__linux__)
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT  (1024 * 1024)
+
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+  char *p;
+
+  if (*feature == 0)
+    return 0;
+  if (strncmp(buffer, "ASEs implemented", 16) != 0)
+    return 0;
+  buffer += 16;
+  while (isspace(*buffer))
+    buffer++;
+
+  /* Check if 'feature' is present in the buffer as a separate word */
+  while ((p = strstr(buffer, feature))) {
+    if (p > buffer && !isspace(*(p - 1))) {
+      buffer++;
+      continue;
+    }
+    p += strlen(feature);
+    if (*p != 0 && !isspace(*p)) {
+      buffer++;
+      continue;
+    }
+    return 1;
+  }
+  return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+  char *buffer = (char *)malloc(bufsize);
+  FILE *fd;
+
+  simd_support = 0;
+
+  if (!buffer)
+    return 0;
+
+  fd = fopen("/proc/cpuinfo", "r");
+  if (fd) {
+    while (fgets(buffer, bufsize, fd)) {
+      if (!strchr(buffer, '\n') && !feof(fd)) {
+        /* "impossible" happened - insufficient size of the buffer! */
+        fclose(fd);
+        free(buffer);
+        return 0;
+      }
+      if (check_feature(buffer, "loongson-mmi"))
+        simd_support |= JSIMD_MMI;
+    }
+    fclose(fd);
+  }
+  free(buffer);
+  return 1;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char *env = NULL;
+#endif
+#if defined(__linux__)
+  int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+#if defined(__linux__)
+  while (!parse_proc_cpuinfo(bufsize)) {
+    bufsize *= 2;
+    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+      break;
+  }
+#elif defined(__mips_loongson_vector_rev)
+  /* Only enable MMI by default on non-Linux platforms when the compiler flags
+   * support it. */
+  simd_support |= JSIMD_MMI;
+#endif
+
+#ifndef NO_GETENV
+  /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCEMMI");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = JSIMD_MMI;
+  env = getenv("JSIMD_FORCENONE");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = 0;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_c_can_null_convert(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*mmifct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_extrgb_ycc_convert_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_extrgbx_ycc_convert_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_extbgr_ycc_convert_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_extbgrx_ycc_convert_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_extxbgr_ycc_convert_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_extxrgb_ycc_convert_mmi;
+    break;
+  default:
+    mmifct = jsimd_rgb_ycc_convert_mmi;
+    break;
+  }
+
+  mmifct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*mmifct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_extrgb_gray_convert_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_extrgbx_gray_convert_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_extbgr_gray_convert_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_extbgrx_gray_convert_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_extxbgr_gray_convert_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_extxrgb_gray_convert_mmi;
+    break;
+  default:
+    mmifct = jsimd_rgb_gray_convert_mmi;
+    break;
+  }
+
+  mmifct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_ycc_extrgb_convert_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_ycc_extrgbx_convert_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_ycc_extbgr_convert_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_ycc_extbgrx_convert_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_ycc_extxbgr_convert_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_ycc_extxrgb_convert_mmi;
+    break;
+  default:
+    mmifct = jsimd_ycc_rgb_convert_mmi;
+    break;
+  }
+
+  mmifct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+}
+
+GLOBAL(void)
+jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                     JSAMPIMAGE output_buf, JDIMENSION output_row,
+                     int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_smooth_downsample(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_mmi(cinfo->image_width, cinfo->max_v_samp_factor,
+                            compptr->v_samp_factor, compptr->width_in_blocks,
+                            input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
+                             jpeg_component_info *compptr,
+                             JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_int_upsample(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                   JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_fancy_upsample_mmi(cinfo->max_v_samp_factor,
+                                compptr->downsampled_width, input_data,
+                                output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_mmi(cinfo->max_v_samp_factor,
+                                compptr->downsampled_width, input_data,
+                                output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_h2v2_extrgb_merged_upsample_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_h2v2_extrgbx_merged_upsample_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_h2v2_extbgr_merged_upsample_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_h2v2_extbgrx_merged_upsample_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_h2v2_extxbgr_merged_upsample_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_h2v2_extxrgb_merged_upsample_mmi;
+    break;
+  default:
+    mmifct = jsimd_h2v2_merged_upsample_mmi;
+    break;
+  }
+
+  mmifct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_h2v1_extrgb_merged_upsample_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_h2v1_extrgbx_merged_upsample_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_h2v1_extbgr_merged_upsample_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_h2v1_extbgrx_merged_upsample_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_h2v1_extxbgr_merged_upsample_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_h2v1_extxrgb_merged_upsample_mmi;
+    break;
+  default:
+    mmifct = jsimd_h2v1_merged_upsample_mmi;
+    break;
+  }
+
+  mmifct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  jsimd_fdct_islow_mmi(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_mmi(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  jsimd_quantize_mmi(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_6x6(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_12x12(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_islow_mmi(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_ifast_mmi(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return NULL;
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, JCOEF *values, size_t *zerobits)
+{
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, JCOEF *absvalues, size_t *bits)
+{
+  return 0;
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jsimd_mmi.h b/3rdparty/libjpeg-turbo/src/simd/mips64/jsimd_mmi.h
new file mode 100644
index 0000000000..5e4261c9d9
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jsimd_mmi.h
@@ -0,0 +1,69 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           QingfaLiu   <liuqingfa-hf@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jdct.h"
+#include "loongson-mmintrin.h"
+
+
+/* Common code */
+#if defined(_ABI64) && _MIPS_SIM == _ABI64
+# define PTR_ADDU  "daddu "
+# define PTR_SLL   "dsll "
+#else
+# define PTR_ADDU  "addu "
+# define PTR_SLL   "sll "
+#endif
+
+#define SIZEOF_MMWORD  8
+#define BYTE_BIT  8
+#define WORD_BIT  16
+#define SCALEBITS  16
+
+#define _uint64_set_pi8(a, b, c, d, e, f, g, h) \
+  (((uint64_t)(uint8_t)a << 56) | \
+   ((uint64_t)(uint8_t)b << 48) | \
+   ((uint64_t)(uint8_t)c << 40) | \
+   ((uint64_t)(uint8_t)d << 32) | \
+   ((uint64_t)(uint8_t)e << 24) | \
+   ((uint64_t)(uint8_t)f << 16) | \
+   ((uint64_t)(uint8_t)g << 8)  | \
+   ((uint64_t)(uint8_t)h))
+#define _uint64_set1_pi8(a)  _uint64_set_pi8(a, a, a, a, a, a, a, a)
+#define _uint64_set_pi16(a, b, c, d) \
+  (((uint64_t)(uint16_t)a << 48) | \
+   ((uint64_t)(uint16_t)b << 32) | \
+   ((uint64_t)(uint16_t)c << 16) | \
+   ((uint64_t)(uint16_t)d))
+#define _uint64_set1_pi16(a)  _uint64_set_pi16(a, a, a, a)
+#define _uint64_set_pi32(a, b) \
+  (((uint64_t)(uint32_t)a << 32) | \
+   ((uint64_t)(uint32_t)b))
+
+#define get_const_value(index)  (*(__m64 *)&const_value[index])
diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/loongson-mmintrin.h b/3rdparty/libjpeg-turbo/src/simd/mips64/loongson-mmintrin.h
new file mode 100644
index 0000000000..db9b35ab60
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/mips64/loongson-mmintrin.h
@@ -0,0 +1,1334 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Copyright (C) 2019, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#ifndef __LOONGSON_MMINTRIN_H__
+#define __LOONGSON_MMINTRIN_H__
+
+#include <stdint.h>
+
+
+#define FUNCTION_ATTRIBS \
+  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+
+
+/* Vectors are stored in 64-bit floating-point registers. */
+typedef double __m64;
+
+/* Having a 32-bit datatype allows us to use 32-bit loads in places like
+   load8888. */
+typedef float __m32;
+
+
+/********** Set Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setzero_si64(void)
+{
+  return 0.0;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set_pi8(uint8_t __b7, uint8_t __b6, uint8_t __b5, uint8_t __b4,
+            uint8_t __b3, uint8_t __b2, uint8_t __b1, uint8_t __b0)
+{
+  __m64 ret;
+  uint32_t lo = ((uint32_t)__b6 << 24) |
+                ((uint32_t)__b4 << 16) |
+                ((uint32_t)__b2 << 8) |
+                (uint32_t)__b0;
+  uint32_t hi = ((uint32_t)__b7 << 24) |
+                ((uint32_t)__b5 << 16) |
+                ((uint32_t)__b3 << 8) |
+                (uint32_t)__b1;
+
+  asm("mtc1      %1, %0\n\t"
+      "mtc1      %2, $f0\n\t"
+      "punpcklbh %0, %0, $f0\n\t"
+      : "=f" (ret)
+      : "r" (lo), "r" (hi)
+      : "$f0"
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set_pi16(uint16_t __h3, uint16_t __h2, uint16_t __h1, uint16_t __h0)
+{
+  __m64 ret;
+  uint32_t lo = ((uint32_t)__h2 << 16) | (uint32_t)__h0;
+  uint32_t hi = ((uint32_t)__h3 << 16) | (uint32_t)__h1;
+
+  asm("mtc1      %1, %0\n\t"
+      "mtc1      %2, $f0\n\t"
+      "punpcklhw %0, %0, $f0\n\t"
+      : "=f" (ret)
+      : "r" (lo), "r" (hi)
+      : "$f0"
+     );
+
+  return ret;
+}
+
+#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
+  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set_pi32(uint32_t __i1, uint32_t __i0)
+{
+  if (__builtin_constant_p(__i1) && __builtin_constant_p(__i0)) {
+    uint64_t val = ((uint64_t)__i1 << 32) |
+                   ((uint64_t)__i0 <<  0);
+
+    return *(__m64 *)&val;
+  } else if (__i1 == __i0) {
+    uint64_t imm = _MM_SHUFFLE(1, 0, 1, 0);
+    __m64 ret;
+
+    asm("pshufh %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
+       );
+
+    return ret;
+  } else {
+    uint64_t val = ((uint64_t)__i1 << 32) |
+                   ((uint64_t)__i0 <<  0);
+
+    return *(__m64 *)&val;
+  }
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set1_pi8(uint8_t __b0)
+{
+  __m64 ret;
+
+  asm("sll    $8, %1, 8\n\t"
+      "or     %1, %1, $8\n\t"
+      "mtc1   %1, %0\n\t"
+      "mtc1   $0, $f0\n\t"
+      "pshufh %0, %0, $f0\n\t"
+      : "=f" (ret)
+      : "r" (__b0)
+      : "$8", "$f0"
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set1_pi16(uint16_t __h0)
+{
+  __m64 ret;
+
+  asm("mtc1   %1, %0\n\t"
+      "mtc1   $0, $f0\n\t"
+      "pshufh %0, %0, $f0\n\t"
+      : "=f" (ret)
+      : "r" (__h0)
+      : "$8", "$f0"
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set1_pi32(unsigned __i0)
+{
+  return _mm_set_pi32(__i0, __i0);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setr_pi8(uint8_t __h0, uint8_t __h1, uint8_t __h2, uint8_t __h3,
+             uint8_t __h4, uint8_t __h5, uint8_t __h6, uint8_t __h7)
+{
+  return _mm_set_pi8(__h7, __h6, __h5, __h4,
+                     __h3, __h2, __h1, __h0);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setr_pi16(uint16_t __w0, uint16_t __w1, uint16_t __w2, uint16_t __w3)
+{
+  return _mm_set_pi16(__w3, __w2, __w1, __w0);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setr_pi32(uint32_t __i0, uint32_t __i1)
+{
+  return _mm_set_pi32(__i1, __i0);
+}
+
+
+/********** Arithmetic Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddd %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddsb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddsh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddusb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pu16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddush %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_avg_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pavgb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_avg_pu16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pavgh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_madd_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmaddhw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_max_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmaxsh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_max_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmaxub %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_min_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pminsh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_min_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pminub %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline int FUNCTION_ATTRIBS
+_mm_movemask_pi8(__m64 __m1)
+{
+  int ret;
+
+  asm("pmovmskb %0, %1\n\t"
+      : "=r" (ret)
+      : "y" (__m1)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmulhh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mulhi_pu16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmulhuh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mullo_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmullh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mul_pu32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmuluw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sad_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psadbh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_asub_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pasubub %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_biadd_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("biadd %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubd %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubsb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubsh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubusb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pu16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubush %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+/********** Logical Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_and_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("and %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_andnot_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("andn %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_or_si32(__m32 __m1, __m32 __m2)
+{
+  __m32 ret;
+
+  asm("or %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_or_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("or %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_xor_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("xor %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+/********** Shift Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_slli_pi16(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("psllh  %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_slli_pi32(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("psllw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_slli_si64(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("dsll  %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srli_pi16(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("psrlh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srli_pi32(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("psrlw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srli_si64(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("dsrl  %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srai_pi16(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("psrah %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srai_pi32(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("psraw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srai_si64(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("dsra %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+
+/********** Conversion Intrinsics **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+to_m64(uint64_t x)
+{
+  return *(__m64 *)&x;
+}
+
+extern __inline uint64_t FUNCTION_ATTRIBS
+to_uint64(__m64 x)
+{
+  return *(uint64_t *)&x;
+}
+
+
+/********** Comparison Intrinsics **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmpeqb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmpeqh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmpeqw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmpgtb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmpgth %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmpgtw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmplt_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmpltb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmplt_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmplth %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmplt_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmpltw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+/********** Miscellaneous Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("packsshb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("packsswh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pi32_f(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("packsswh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pu16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("packushb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_extract_pi16(__m64 __m, int64_t __pos)
+{
+  __m64 ret;
+
+  asm("pextrh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__pos)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_insert_pi16(__m64 __m1, __m64 __m2, int64_t __pos)
+{
+  __m64 ret;
+
+  switch (__pos) {
+  case 0:
+
+    asm("pinsrh_0 %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2), "i" (__pos)
+       );
+
+    break;
+
+  case 1:
+
+    asm("pinsrh_1 %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2), "i" (__pos)
+       );
+
+    break;
+  case 2:
+
+    asm("pinsrh_2 %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2), "i" (__pos)
+       );
+
+    break;
+
+  case 3:
+
+    asm("pinsrh_3 %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2), "i" (__pos)
+       );
+
+    break;
+  }
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_shuffle_pi16(__m64 __m, int64_t __n)
+{
+  __m64 ret;
+
+  asm("pshufh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__n)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpckhbh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi8_f(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpckhbh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpckhhw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi16_f(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpckhhw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpckhwd %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpcklbh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+/* Since punpcklbh cares about the high 32-bits, we use the __m64 datatype,
+   which preserves the data. */
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi8_f64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpcklbh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+/* Since punpcklbh doesn't care about the high 32-bits, we use the __m32,
+   datatype, which allows load8888 to use 32-bit loads. */
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi8_f(__m32 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpcklbh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpcklhw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi16_f(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpcklhw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpcklwd %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi32_f(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpcklwd %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline void FUNCTION_ATTRIBS
+_mm_store_pi32(__m32 *dest, __m64 src)
+{
+  src = _mm_packs_pu16(src, _mm_setzero_si64());
+
+  asm("swc1 %1, %0\n\t"
+      : "=m" (*dest)
+      : "f" (src)
+      : "memory"
+     );
+}
+
+extern __inline void FUNCTION_ATTRIBS
+_mm_store_si64(__m64 *dest, __m64 src)
+{
+  asm("sdc1 %1, %0 \n\t"
+      : "=m" (*dest)
+      : "f" (src)
+      : "memory"
+     );
+}
+
+extern __inline void FUNCTION_ATTRIBS
+_mm_storeu_si64(__m64 *dest, __m64 src)
+{
+  asm("gssdlc1 %1, 7(%0) \n\t"
+      "gssdrc1 %1, 0(%0) \n\t"
+      :
+      : "r" (dest), "f" (src)
+      : "memory"
+     );
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_load_si32(const __m32 *src)
+{
+  __m32 ret;
+
+  asm("lwc1 %0, %1\n\t"
+      : "=f" (ret)
+      : "m" (*src)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_load_si64(const __m64 *src)
+{
+  __m64 ret;
+
+  asm("ldc1 %0, %1\n\t"
+      : "=f" (ret)
+      : "m" (*src)
+      : "memory"
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadu_si64(const __m64 *src)
+{
+  __m64 ret;
+
+  asm("gsldlc1 %0,  7(%1)\n\t"
+      "gsldrc1 %0,  0(%1)\n\t"
+      : "=f" (ret)
+      : "r" (src)
+      : "memory"
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi8(const uint32_t *src)
+{
+  return _mm_unpacklo_pi8_f(*(__m32 *)src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi8_f(__m64 src)
+{
+  return _mm_unpacklo_pi8_f64(src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadhi_pi8_f(__m64 src)
+{
+  return _mm_unpackhi_pi8_f(src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi16(__m64 src)
+{
+  return _mm_unpacklo_pi16(src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi16_f(__m64 src)
+{
+  return _mm_unpacklo_pi16_f(_mm_setzero_si64(), src);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadhi_pi16(__m64 src)
+{
+  return _mm_unpackhi_pi16(src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadhi_pi16_f(__m64 src)
+{
+  return _mm_unpackhi_pi16_f(_mm_setzero_si64(), src);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_expand_alpha(__m64 pixel)
+{
+  return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(3, 3, 3, 3));
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_expand_alpha_rev(__m64 pixel)
+{
+  return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(0, 0, 0, 0));
+}
+
+#endif  /* __LOONGSON_MMINTRIN_H__ */
diff --git a/3rdparty/libjpeg-turbo/src/simd/nasm/jcolsamp.inc b/3rdparty/libjpeg-turbo/src/simd/nasm/jcolsamp.inc
new file mode 100644
index 0000000000..6f6d7f29d1
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/nasm/jcolsamp.inc
@@ -0,0 +1,135 @@
+;
+; jcolsamp.inc - private declarations for color conversion & up/downsampling
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+
+; --------------------------------------------------------------------------
+
+; pseudo-resisters to make ordering of RGB configurable
+;
+%if RGB_RED == 0
+%define mmA  mm0
+%define mmB  mm1
+%define xmmA  xmm0
+%define xmmB  xmm1
+%define ymmA  ymm0
+%define ymmB  ymm1
+%elif RGB_GREEN == 0
+%define mmA  mm2
+%define mmB  mm3
+%define xmmA  xmm2
+%define xmmB  xmm3
+%define ymmA  ymm2
+%define ymmB  ymm3
+%elif RGB_BLUE == 0
+%define mmA  mm4
+%define mmB  mm5
+%define xmmA  xmm4
+%define xmmB  xmm5
+%define ymmA  ymm4
+%define ymmB  ymm5
+%else
+%define mmA  mm6
+%define mmB  mm7
+%define xmmA  xmm6
+%define xmmB  xmm7
+%define ymmA  ymm6
+%define ymmB  ymm7
+%endif
+
+%if RGB_RED == 1
+%define mmC  mm0
+%define mmD  mm1
+%define xmmC  xmm0
+%define xmmD  xmm1
+%define ymmC  ymm0
+%define ymmD  ymm1
+%elif RGB_GREEN == 1
+%define mmC  mm2
+%define mmD  mm3
+%define xmmC  xmm2
+%define xmmD  xmm3
+%define ymmC  ymm2
+%define ymmD  ymm3
+%elif RGB_BLUE == 1
+%define mmC  mm4
+%define mmD  mm5
+%define xmmC  xmm4
+%define xmmD  xmm5
+%define ymmC  ymm4
+%define ymmD  ymm5
+%else
+%define mmC  mm6
+%define mmD  mm7
+%define xmmC  xmm6
+%define xmmD  xmm7
+%define ymmC  ymm6
+%define ymmD  ymm7
+%endif
+
+%if RGB_RED == 2
+%define mmE  mm0
+%define mmF  mm1
+%define xmmE  xmm0
+%define xmmF  xmm1
+%define ymmE  ymm0
+%define ymmF  ymm1
+%elif RGB_GREEN == 2
+%define mmE  mm2
+%define mmF  mm3
+%define xmmE  xmm2
+%define xmmF  xmm3
+%define ymmE  ymm2
+%define ymmF  ymm3
+%elif RGB_BLUE == 2
+%define mmE  mm4
+%define mmF  mm5
+%define xmmE  xmm4
+%define xmmF  xmm5
+%define ymmE  ymm4
+%define ymmF  ymm5
+%else
+%define mmE  mm6
+%define mmF  mm7
+%define xmmE  xmm6
+%define xmmF  xmm7
+%define ymmE  ymm6
+%define ymmF  ymm7
+%endif
+
+%if RGB_RED == 3
+%define mmG  mm0
+%define mmH  mm1
+%define xmmG  xmm0
+%define xmmH  xmm1
+%define ymmG  ymm0
+%define ymmH  ymm1
+%elif RGB_GREEN == 3
+%define mmG  mm2
+%define mmH  mm3
+%define xmmG  xmm2
+%define xmmH  xmm3
+%define ymmG  ymm2
+%define ymmH  ymm3
+%elif RGB_BLUE == 3
+%define mmG  mm4
+%define mmH  mm5
+%define xmmG  xmm4
+%define xmmH  xmm5
+%define ymmG  ymm4
+%define ymmH  ymm5
+%else
+%define mmG  mm6
+%define mmH  mm7
+%define xmmG  xmm6
+%define xmmH  xmm7
+%define ymmG  ymm6
+%define ymmH  ymm7
+%endif
+
+; --------------------------------------------------------------------------
diff --git a/3rdparty/libjpeg-turbo/src/simd/nasm/jdct.inc b/3rdparty/libjpeg-turbo/src/simd/nasm/jdct.inc
new file mode 100644
index 0000000000..9192f66f0c
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/nasm/jdct.inc
@@ -0,0 +1,31 @@
+;
+; jdct.inc - private declarations for forward & reverse DCT subsystems
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2018, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+
+; Each IDCT routine is responsible for range-limiting its results and
+; converting them to unsigned form (0..MAXJSAMPLE).  The raw outputs could
+; be quite far out of range if the input data is corrupt, so a bulletproof
+; range-limiting step is required.  We use a mask-and-table-lookup method
+; to do the combined operations quickly.
+;
+%define RANGE_MASK  (MAXJSAMPLE * 4 + 3)  ; 2 bits wider than legal samples
+
+%define ROW(n, b, s)  ((b) + (n) * (s))
+%define COL(n, b, s)  ((b) + (n) * (s) * DCTSIZE)
+
+%define DWBLOCK(m, n, b, s) \
+  ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_DWORD)
+%define MMBLOCK(m, n, b, s) \
+  ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_MMWORD)
+%define XMMBLOCK(m, n, b, s) \
+  ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_XMMWORD)
+%define YMMBLOCK(m, n, b, s) \
+  ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_YMMWORD)
+
+; --------------------------------------------------------------------------
diff --git a/3rdparty/libjpeg-turbo/src/simd/nasm/jsimdcfg.inc b/3rdparty/libjpeg-turbo/src/simd/nasm/jsimdcfg.inc
new file mode 100644
index 0000000000..667024a5f9
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/nasm/jsimdcfg.inc
@@ -0,0 +1,93 @@
+;
+; Automatically generated include file from jsimdcfg.inc.h
+;
+;
+; -- jpeglib.h
+;
+%define DCTSIZE 8
+%define DCTSIZE2 64
+;
+; -- jmorecfg.h
+;
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define EXT_RGB_RED 0
+%define EXT_RGB_GREEN 1
+%define EXT_RGB_BLUE 2
+%define EXT_RGB_PIXELSIZE 3
+%define EXT_RGBX_RED 0
+%define EXT_RGBX_GREEN 1
+%define EXT_RGBX_BLUE 2
+%define EXT_RGBX_PIXELSIZE 4
+%define EXT_BGR_RED 2
+%define EXT_BGR_GREEN 1
+%define EXT_BGR_BLUE 0
+%define EXT_BGR_PIXELSIZE 3
+%define EXT_BGRX_RED 2
+%define EXT_BGRX_GREEN 1
+%define EXT_BGRX_BLUE 0
+%define EXT_BGRX_PIXELSIZE 4
+%define EXT_XBGR_RED 3
+%define EXT_XBGR_GREEN 2
+%define EXT_XBGR_BLUE 1
+%define EXT_XBGR_PIXELSIZE 4
+%define EXT_XRGB_RED 1
+%define EXT_XRGB_GREEN 2
+%define EXT_XRGB_BLUE 3
+%define EXT_XRGB_PIXELSIZE 4
+%define RGBX_FILLER_0XFF 1
+; Representation of a single sample (pixel element value).
+; On this SIMD implementation, this must be 'unsigned char'.
+;
+%define JSAMPLE byte ; unsigned char
+%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE)
+%define CENTERJSAMPLE 128
+; Representation of a DCT frequency coefficient.
+; On this SIMD implementation, this must be 'short'.
+;
+%define JCOEF word ; short
+%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF)
+; Datatype used for image dimensions.
+; On this SIMD implementation, this must be 'unsigned int'.
+;
+%define JDIMENSION dword ; unsigned int
+%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION)
+%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h)
+%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h)
+%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h)
+%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h)
+%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW)
+%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY)
+%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE)
+%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR)
+;
+; -- jdct.h
+;
+; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
+; the DCT is to be performed in-place in that buffer.
+; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
+;
+%define DCTELEM word ; short
+%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM)
+%define float FP32 ; float
+%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(float)
+; To maximize parallelism, Type short is changed to short.
+;
+%define ISLOW_MULT_TYPE word ; must be short
+%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE)
+%define IFAST_MULT_TYPE word ; must be short
+%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE)
+%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors
+%define FLOAT_MULT_TYPE FP32 ; must be float
+%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE)
+;
+; -- jsimd.h
+;
+%define JSIMD_NONE 0x00
+%define JSIMD_MMX 0x01
+%define JSIMD_3DNOW 0x02
+%define JSIMD_SSE 0x04
+%define JSIMD_SSE2 0x08
+%define JSIMD_AVX2 0x80
diff --git a/3rdparty/libjpeg-turbo/src/simd/nasm/jsimdcfg.inc.h b/3rdparty/libjpeg-turbo/src/simd/nasm/jsimdcfg.inc.h
new file mode 100644
index 0000000000..bf2a45ad50
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/nasm/jsimdcfg.inc.h
@@ -0,0 +1,133 @@
+/*
+ * This file generates the include file for the assembly
+ * implementations by abusing the C preprocessor.
+ *
+ * Note: Some things are manually defined as they need to
+ * be mapped to NASM types.
+ */
+
+;
+; Automatically generated include file from jsimdcfg.inc.h
+;
+
+#define JPEG_INTERNALS
+
+#include "../jpeglib.h"
+#include "../jconfig.h"
+#include "../jmorecfg.h"
+#include "jsimd.h"
+
+;
+; -- jpeglib.h
+;
+
+%define _cpp_protection_DCTSIZE   DCTSIZE
+%define _cpp_protection_DCTSIZE2  DCTSIZE2
+
+;
+; -- jmorecfg.h
+;
+
+%define _cpp_protection_RGB_RED             RGB_RED
+%define _cpp_protection_RGB_GREEN           RGB_GREEN
+%define _cpp_protection_RGB_BLUE            RGB_BLUE
+%define _cpp_protection_RGB_PIXELSIZE       RGB_PIXELSIZE
+
+%define _cpp_protection_EXT_RGB_RED         EXT_RGB_RED
+%define _cpp_protection_EXT_RGB_GREEN       EXT_RGB_GREEN
+%define _cpp_protection_EXT_RGB_BLUE        EXT_RGB_BLUE
+%define _cpp_protection_EXT_RGB_PIXELSIZE   EXT_RGB_PIXELSIZE
+
+%define _cpp_protection_EXT_RGBX_RED        EXT_RGBX_RED
+%define _cpp_protection_EXT_RGBX_GREEN      EXT_RGBX_GREEN
+%define _cpp_protection_EXT_RGBX_BLUE       EXT_RGBX_BLUE
+%define _cpp_protection_EXT_RGBX_PIXELSIZE  EXT_RGBX_PIXELSIZE
+
+%define _cpp_protection_EXT_BGR_RED         EXT_BGR_RED
+%define _cpp_protection_EXT_BGR_GREEN       EXT_BGR_GREEN
+%define _cpp_protection_EXT_BGR_BLUE        EXT_BGR_BLUE
+%define _cpp_protection_EXT_BGR_PIXELSIZE   EXT_BGR_PIXELSIZE
+
+%define _cpp_protection_EXT_BGRX_RED        EXT_BGRX_RED
+%define _cpp_protection_EXT_BGRX_GREEN      EXT_BGRX_GREEN
+%define _cpp_protection_EXT_BGRX_BLUE       EXT_BGRX_BLUE
+%define _cpp_protection_EXT_BGRX_PIXELSIZE  EXT_BGRX_PIXELSIZE
+
+%define _cpp_protection_EXT_XBGR_RED        EXT_XBGR_RED
+%define _cpp_protection_EXT_XBGR_GREEN      EXT_XBGR_GREEN
+%define _cpp_protection_EXT_XBGR_BLUE       EXT_XBGR_BLUE
+%define _cpp_protection_EXT_XBGR_PIXELSIZE  EXT_XBGR_PIXELSIZE
+
+%define _cpp_protection_EXT_XRGB_RED        EXT_XRGB_RED
+%define _cpp_protection_EXT_XRGB_GREEN      EXT_XRGB_GREEN
+%define _cpp_protection_EXT_XRGB_BLUE       EXT_XRGB_BLUE
+%define _cpp_protection_EXT_XRGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+
+%define RGBX_FILLER_0XFF  1
+
+; Representation of a single sample (pixel element value).
+; On this SIMD implementation, this must be 'unsigned char'.
+;
+
+%define JSAMPLE            byte            ; unsigned char
+%define SIZEOF_JSAMPLE     SIZEOF_BYTE     ; sizeof(JSAMPLE)
+
+%define _cpp_protection_CENTERJSAMPLE  CENTERJSAMPLE
+
+; Representation of a DCT frequency coefficient.
+; On this SIMD implementation, this must be 'short'.
+;
+%define JCOEF              word            ; short
+%define SIZEOF_JCOEF       SIZEOF_WORD     ; sizeof(JCOEF)
+
+; Datatype used for image dimensions.
+; On this SIMD implementation, this must be 'unsigned int'.
+;
+%define JDIMENSION         dword           ; unsigned int
+%define SIZEOF_JDIMENSION  SIZEOF_DWORD    ; sizeof(JDIMENSION)
+
+%define JSAMPROW           POINTER         ; JSAMPLE *     (jpeglib.h)
+%define JSAMPARRAY         POINTER         ; JSAMPROW *    (jpeglib.h)
+%define JSAMPIMAGE         POINTER         ; JSAMPARRAY *  (jpeglib.h)
+%define JCOEFPTR           POINTER         ; JCOEF *       (jpeglib.h)
+%define SIZEOF_JSAMPROW    SIZEOF_POINTER  ; sizeof(JSAMPROW)
+%define SIZEOF_JSAMPARRAY  SIZEOF_POINTER  ; sizeof(JSAMPARRAY)
+%define SIZEOF_JSAMPIMAGE  SIZEOF_POINTER  ; sizeof(JSAMPIMAGE)
+%define SIZEOF_JCOEFPTR    SIZEOF_POINTER  ; sizeof(JCOEFPTR)
+
+;
+; -- jdct.h
+;
+
+; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
+; the DCT is to be performed in-place in that buffer.
+; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
+;
+%define DCTELEM                 word         ; short
+%define SIZEOF_DCTELEM          SIZEOF_WORD  ; sizeof(DCTELEM)
+
+%define FAST_FLOAT              FP32         ; float
+%define SIZEOF_FAST_FLOAT       SIZEOF_FP32  ; sizeof(FAST_FLOAT)
+
+; To maximize parallelism, Type MULTIPLIER is changed to short.
+;
+%define ISLOW_MULT_TYPE         word         ; must be short
+%define SIZEOF_ISLOW_MULT_TYPE  SIZEOF_WORD  ; sizeof(ISLOW_MULT_TYPE)
+
+%define IFAST_MULT_TYPE         word         ; must be short
+%define SIZEOF_IFAST_MULT_TYPE  SIZEOF_WORD  ; sizeof(IFAST_MULT_TYPE)
+%define IFAST_SCALE_BITS        2            ; fractional bits in scale factors
+
+%define FLOAT_MULT_TYPE         FP32         ; must be float
+%define SIZEOF_FLOAT_MULT_TYPE  SIZEOF_FP32  ; sizeof(FLOAT_MULT_TYPE)
+
+;
+; -- jsimd.h
+;
+
+%define _cpp_protection_JSIMD_NONE   JSIMD_NONE
+%define _cpp_protection_JSIMD_MMX    JSIMD_MMX
+%define _cpp_protection_JSIMD_3DNOW  JSIMD_3DNOW
+%define _cpp_protection_JSIMD_SSE    JSIMD_SSE
+%define _cpp_protection_JSIMD_SSE2   JSIMD_SSE2
+%define _cpp_protection_JSIMD_AVX2   JSIMD_AVX2
diff --git a/3rdparty/libjpeg-turbo/src/simd/nasm/jsimdext.inc b/3rdparty/libjpeg-turbo/src/simd/nasm/jsimdext.inc
new file mode 100644
index 0000000000..e8d50b0349
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/nasm/jsimdext.inc
@@ -0,0 +1,520 @@
+;
+; jsimdext.inc - common declarations
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2010, 2016, 2018-2019, D. R. Commander.
+; Copyright (C) 2018, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
+;
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+;
+; This software is provided 'as-is', without any express or implied
+; warranty.  In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+;    claim that you wrote the original software. If you use this software
+;    in a product, an acknowledgment in the product documentation would be
+;    appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+;    misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+; ==========================================================================
+;  System-dependent configurations
+
+%ifdef WIN32    ; ----(nasm -fwin32 -DWIN32 ...)--------
+; * Microsoft Visual C++
+; * MinGW (Minimalist GNU for Windows)
+; * CygWin
+; * LCC-Win32
+
+; -- segment definition --
+;
+%ifdef __YASM_VER__
+%define SEG_TEXT   .text  align=32
+%define SEG_CONST  .rdata align=32
+%else
+%define SEG_TEXT   .text  align=32 public use32 class=CODE
+%define SEG_CONST  .rdata align=32 public use32 class=CONST
+%endif
+
+%elifdef WIN64  ; ----(nasm -fwin64 -DWIN64 ...)--------
+; * Microsoft Visual C++
+
+; -- segment definition --
+;
+%ifdef __YASM_VER__
+%define SEG_TEXT    .text  align=32
+%define SEG_CONST   .rdata align=32
+%else
+%define SEG_TEXT    .text  align=32 public use64 class=CODE
+%define SEG_CONST   .rdata align=32 public use64 class=CONST
+%endif
+%define EXTN(name)  name                ; foo() -> foo
+
+%elifdef OBJ32  ; ----(nasm -fobj -DOBJ32 ...)----------
+; * Borland C++ (Win32)
+
+; -- segment definition --
+;
+%define SEG_TEXT   _text align=32 public use32 class=CODE
+%define SEG_CONST  _data align=32 public use32 class=DATA
+
+%elifdef ELF    ; ----(nasm -felf[64] -DELF ...)------------
+; * Linux
+; * *BSD family Unix using elf format
+; * Unix System V, including Solaris x86, UnixWare and SCO Unix
+
+; mark stack as non-executable
+section .note.GNU-stack noalloc noexec nowrite progbits
+
+; -- segment definition --
+;
+%ifdef __x86_64__
+%define SEG_TEXT   .text   progbits align=32
+%define SEG_CONST  .rodata progbits align=32
+%else
+%define SEG_TEXT   .text   progbits alloc exec   nowrite align=32
+%define SEG_CONST  .rodata progbits alloc noexec nowrite align=32
+%endif
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_  ; ELF supports PIC
+%define EXTN(name)  name                   ; foo() -> foo
+
+%elifdef AOUT   ; ----(nasm -faoutb/aout -DAOUT ...)----
+; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
+; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)
+
+; -- segment definition --
+;
+%define SEG_TEXT   .text
+%define SEG_CONST  .data
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_  ; BSD-style a.out supports PIC
+
+%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
+; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
+
+; -- segment definition --
+;
+%define SEG_TEXT   .text  ;align=32     ; nasm doesn't accept align=32. why?
+%define SEG_CONST  .rodata align=32
+
+; The generation of position-independent code (PIC) is the default on Darwin.
+;
+%define PIC
+%define GOT_SYMBOL  _MACHO_PIC_         ; Mach-O style code-relative addressing
+
+%else           ; ----(Other case)----------------------
+
+; -- segment definition --
+;
+%define SEG_TEXT   .text
+%define SEG_CONST  .data
+
+%endif          ; ----------------------------------------------
+
+; ==========================================================================
+
+; --------------------------------------------------------------------------
+;  Common types
+;
+%ifdef __x86_64__
+%ifnidn __OUTPUT_FORMAT__, elfx32
+%define POINTER         qword           ; general pointer type
+%define SIZEOF_POINTER  SIZEOF_QWORD    ; sizeof(POINTER)
+%define POINTER_BIT     QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
+%define resp            resq
+%define dp              dq
+%define raxp            rax
+%define rbxp            rbx
+%define rcxp            rcx
+%define rdxp            rdx
+%define rsip            rsi
+%define rdip            rdi
+%define rbpp            rbp
+%define rspp            rsp
+%define r8p             r8
+%define r9p             r9
+%define r10p            r10
+%define r11p            r11
+%define r12p            r12
+%define r13p            r13
+%define r14p            r14
+%define r15p            r15
+%endif
+%endif
+%ifndef raxp
+%define POINTER         dword           ; general pointer type
+%define SIZEOF_POINTER  SIZEOF_DWORD    ; sizeof(POINTER)
+%define POINTER_BIT     DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
+%define resp            resd
+%define dp              dd
+; x86_64 ILP32 ABI (x32)
+%define raxp            eax
+%define rbxp            ebx
+%define rcxp            ecx
+%define rdxp            edx
+%define rsip            esi
+%define rdip            edi
+%define rbpp            ebp
+%define rspp            esp
+%define r8p             r8d
+%define r9p             r9d
+%define r10p            r10d
+%define r11p            r11d
+%define r12p            r12d
+%define r13p            r13d
+%define r14p            r14d
+%define r15p            r15d
+%endif
+
+%define INT             dword           ; signed integer type
+%define SIZEOF_INT      SIZEOF_DWORD    ; sizeof(INT)
+%define INT_BIT         DWORD_BIT       ; sizeof(INT)*BYTE_BIT
+
+%define FP32            dword           ; IEEE754 single
+%define SIZEOF_FP32     SIZEOF_DWORD    ; sizeof(FP32)
+%define FP32_BIT        DWORD_BIT       ; sizeof(FP32)*BYTE_BIT
+
+%define MMWORD          qword           ; int64  (MMX register)
+%define SIZEOF_MMWORD   SIZEOF_QWORD    ; sizeof(MMWORD)
+%define MMWORD_BIT      QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT
+
+; NASM is buggy and doesn't properly handle operand sizes for SSE
+; instructions, so for now we have to define XMMWORD as blank.
+%define XMMWORD                         ; int128 (SSE register)
+%define SIZEOF_XMMWORD  SIZEOF_OWORD    ; sizeof(XMMWORD)
+%define XMMWORD_BIT     OWORD_BIT       ; sizeof(XMMWORD)*BYTE_BIT
+
+%define YMMWORD                         ; int256 (AVX register)
+%define SIZEOF_YMMWORD  SIZEOF_YWORD    ; sizeof(YMMWORD)
+%define YMMWORD_BIT     YWORD_BIT       ; sizeof(YMMWORD)*BYTE_BIT
+
+; Similar hacks for when we load a dword or MMWORD into an xmm# register
+%define XMM_DWORD
+%define XMM_MMWORD
+
+%define SIZEOF_BYTE   1                 ; sizeof(byte)
+%define SIZEOF_WORD   2                 ; sizeof(word)
+%define SIZEOF_DWORD  4                 ; sizeof(dword)
+%define SIZEOF_QWORD  8                 ; sizeof(qword)
+%define SIZEOF_OWORD  16                ; sizeof(oword)
+%define SIZEOF_YWORD  32                ; sizeof(yword)
+
+%define BYTE_BIT      8                 ; CHAR_BIT in C
+%define WORD_BIT      16                ; sizeof(word)*BYTE_BIT
+%define DWORD_BIT     32                ; sizeof(dword)*BYTE_BIT
+%define QWORD_BIT     64                ; sizeof(qword)*BYTE_BIT
+%define OWORD_BIT     128               ; sizeof(oword)*BYTE_BIT
+%define YWORD_BIT     256               ; sizeof(yword)*BYTE_BIT
+
+; --------------------------------------------------------------------------
+;  External Symbol Name
+;
+%ifndef EXTN
+%define EXTN(name)  _ %+ name           ; foo() -> _foo
+%endif
+
+; --------------------------------------------------------------------------
+;  Hidden symbols
+;
+%ifdef ELF      ; ----(nasm -felf[64] -DELF ...)--------
+%define GLOBAL_FUNCTION(name)  global EXTN(name):function hidden
+%define GLOBAL_DATA(name)      global EXTN(name):data hidden
+%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
+%ifdef __YASM_VER__
+%define GLOBAL_FUNCTION(name)  global EXTN(name):private_extern
+%define GLOBAL_DATA(name)      global EXTN(name):private_extern
+%else
+%if __NASM_VERSION_ID__ >= 0x020E0000
+%define GLOBAL_FUNCTION(name)  global EXTN(name):private_extern
+%define GLOBAL_DATA(name)      global EXTN(name):private_extern
+%endif
+%endif
+%endif
+
+%ifndef GLOBAL_FUNCTION
+%define GLOBAL_FUNCTION(name)  global EXTN(name)
+%endif
+%ifndef GLOBAL_DATA
+%define GLOBAL_DATA(name)      global EXTN(name)
+%endif
+
+; --------------------------------------------------------------------------
+;  Macros for position-independent code (PIC) support
+;
+%ifndef GOT_SYMBOL
+%undef PIC
+%endif
+
+%ifdef PIC  ; -------------------------------------------
+
+%ifidn GOT_SYMBOL, _MACHO_PIC_  ; --------------------
+
+; At present, nasm doesn't seem to support PIC generation for Mach-O.
+; The PIC support code below is a little tricky.
+
+    SECTION     SEG_CONST
+const_base:
+
+%define GOTOFF(got, sym)  (got) + (sym) - const_base
+
+%imacro get_GOT 1
+    ; NOTE: this macro destroys ecx resister.
+    call        %%geteip
+    add         ecx, byte (%%ref - $)
+    jmp         short %%adjust
+%%geteip:
+    mov         ecx, POINTER [esp]
+    ret
+%%adjust:
+    push        ebp
+    xor         ebp, ebp                ; ebp = 0
+%ifidni %1, ebx  ; (%1 == ebx)
+    ; db 0x8D,0x9C + jmp near const_base =
+    ;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
+    db          0x8D, 0x9C              ; 8D,9C
+    jmp         near const_base         ; E9,(const_base-%%ref)
+%%ref:
+%else  ; (%1 != ebx)
+    ; db 0x8D,0x8C + jmp near const_base =
+    ;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
+    db          0x8D, 0x8C              ; 8D,8C
+    jmp         near const_base         ; E9,(const_base-%%ref)
+%%ref:
+    mov         %1, ecx
+%endif  ; (%1 == ebx)
+    pop         ebp
+%endmacro
+
+%else     ; GOT_SYMBOL != _MACHO_PIC_ ----------------
+
+%define GOTOFF(got, sym)  (got) + (sym) wrt ..gotoff
+
+%imacro get_GOT 1
+    extern      GOT_SYMBOL
+    call        %%geteip
+    add         %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
+    jmp         short %%done
+%%geteip:
+    mov         %1, POINTER [esp]
+    ret
+%%done:
+%endmacro
+
+%endif    ; GOT_SYMBOL == _MACHO_PIC_ ----------------
+
+%imacro pushpic 1.nolist
+    push        %1
+%endmacro
+%imacro poppic  1.nolist
+    pop         %1
+%endmacro
+%imacro movpic  2.nolist
+    mov         %1, %2
+%endmacro
+
+%else    ; !PIC -----------------------------------------
+
+%define GOTOFF(got, sym)  (sym)
+
+%imacro get_GOT 1.nolist
+%endmacro
+%imacro pushpic 1.nolist
+%endmacro
+%imacro poppic  1.nolist
+%endmacro
+%imacro movpic  2.nolist
+%endmacro
+
+%endif   ;  PIC -----------------------------------------
+
+; --------------------------------------------------------------------------
+;  Align the next instruction on {2,4,8,16,..}-byte boundary.
+;  ".balign n,,m" in GNU as
+;
+%define MSKLE(x, y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
+%define FILLB(b, n)  (($$-(b)) & ((n)-1))
+
+%imacro alignx 1-2.nolist 0xFFFF
+%%bs: \
+  times MSKLE(FILLB(%%bs, %1), %2) & MSKLE(16, FILLB($, %1)) & FILLB($, %1) \
+        db 0x90                                      ; nop
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 9 \
+        db 0x8D, 0x9C, 0x23, 0x00, 0x00, 0x00, 0x00  ; lea ebx,[ebx+0x00000000]
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 7 \
+        db 0x8D, 0xAC, 0x25, 0x00, 0x00, 0x00, 0x00  ; lea ebp,[ebp+0x00000000]
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 6 \
+        db 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00        ; lea ebp,[ebp+0x00000000]
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 4 \
+        db 0x8D, 0x6C, 0x25, 0x00                    ; lea ebp,[ebp+0x00]
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 3 \
+        db 0x8D, 0x6D, 0x00                          ; lea ebp,[ebp+0x00]
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 2 \
+        db 0x8B, 0xED                                ; mov ebp,ebp
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 1 \
+        db 0x90                                      ; nop
+%endmacro
+
+; Align the next data on {2,4,8,16,..}-byte boundary.
+;
+%imacro alignz 1.nolist
+    align       %1, db 0                ; filling zeros
+%endmacro
+
+%ifdef __x86_64__
+
+%ifdef WIN64
+
+%imacro collect_args 1
+    sub         rsp, SIZEOF_XMMWORD
+    movaps      XMMWORD [rsp], xmm6
+    sub         rsp, SIZEOF_XMMWORD
+    movaps      XMMWORD [rsp], xmm7
+    mov         r10, rcx
+%if %1 > 1
+    mov         r11, rdx
+%endif
+%if %1 > 2
+    push        r12
+    mov         r12, r8
+%endif
+%if %1 > 3
+    push        r13
+    mov         r13, r9
+%endif
+%if %1 > 4
+    push        r14
+    mov         r14, [rax+48]
+%endif
+%if %1 > 5
+    push        r15
+    mov         r15, [rax+56]
+%endif
+    push        rsi
+    push        rdi
+%endmacro
+
+%imacro uncollect_args 1
+    pop         rdi
+    pop         rsi
+%if %1 > 5
+    pop         r15
+%endif
+%if %1 > 4
+    pop         r14
+%endif
+%if %1 > 3
+    pop         r13
+%endif
+%if %1 > 2
+    pop         r12
+%endif
+    movaps      xmm7, XMMWORD [rsp]
+    add         rsp, SIZEOF_XMMWORD
+    movaps      xmm6, XMMWORD [rsp]
+    add         rsp, SIZEOF_XMMWORD
+%endmacro
+
+%imacro push_xmm 1
+    sub         rsp, %1 * SIZEOF_XMMWORD
+    movaps      XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8
+%if %1 > 1
+    movaps      XMMWORD [rsp+1*SIZEOF_XMMWORD], xmm9
+%endif
+%if %1 > 2
+    movaps      XMMWORD [rsp+2*SIZEOF_XMMWORD], xmm10
+%endif
+%if %1 > 3
+    movaps      XMMWORD [rsp+3*SIZEOF_XMMWORD], xmm11
+%endif
+%endmacro
+
+%imacro pop_xmm 1
+    movaps      xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD]
+%if %1 > 1
+    movaps      xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD]
+%endif
+%if %1 > 2
+    movaps      xmm10, XMMWORD [rsp+2*SIZEOF_XMMWORD]
+%endif
+%if %1 > 3
+    movaps      xmm11, XMMWORD [rsp+3*SIZEOF_XMMWORD]
+%endif
+    add         rsp, %1 * SIZEOF_XMMWORD
+%endmacro
+
+%else
+
+%imacro collect_args 1
+    push        r10
+    mov         r10, rdi
+%if %1 > 1
+    push        r11
+    mov         r11, rsi
+%endif
+%if %1 > 2
+    push        r12
+    mov         r12, rdx
+%endif
+%if %1 > 3
+    push        r13
+    mov         r13, rcx
+%endif
+%if %1 > 4
+    push        r14
+    mov         r14, r8
+%endif
+%if %1 > 5
+    push        r15
+    mov         r15, r9
+%endif
+%endmacro
+
+%imacro uncollect_args 1
+%if %1 > 5
+    pop         r15
+%endif
+%if %1 > 4
+    pop         r14
+%endif
+%if %1 > 3
+    pop         r13
+%endif
+%if %1 > 2
+    pop         r12
+%endif
+%if %1 > 1
+    pop         r11
+%endif
+    pop         r10
+%endmacro
+
+%imacro push_xmm 1
+%endmacro
+
+%imacro pop_xmm 1
+%endmacro
+
+%endif
+
+%endif
+
+; --------------------------------------------------------------------------
+;  Defines picked up from the C headers
+;
+%include "jsimdcfg.inc"
+
+; --------------------------------------------------------------------------
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jccolext-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jccolext-altivec.c
new file mode 100644
index 0000000000..170f90ff80
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jccolext-altivec.c
@@ -0,0 +1,269 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2014, Jay Foad.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-altivec.c */
+
+
+void jsimd_rgb_ycc_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf,
+                                   JSAMPIMAGE output_buf,
+                                   JDIMENSION output_row, int num_rows)
+{
+  JSAMPROW inptr, outptr0, outptr1, outptr2;
+  int pitch = img_width * RGB_PIXELSIZE, num_cols;
+#if __BIG_ENDIAN__
+  int offset;
+#endif
+  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
+
+  __vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 },
+    rgbg0, rgbg1, rgbg2, rgbg3, y, cb, cr;
+#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
+  __vector unsigned char rgb3 = { 0 };
+#endif
+#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
+  __vector unsigned char rgb4 = { 0 };
+#endif
+  __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
+  __vector unsigned short yl, yh, crl, crh, cbl, cbh;
+  __vector int y0, y1, y2, y3, cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3;
+
+  /* Constants */
+  __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
+    pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) },
+    pw_mf016_mf033 = { __4X2(-F_0_168, -F_0_331) },
+    pw_mf008_mf041 = { __4X2(-F_0_081, -F_0_418) };
+  __vector unsigned short pw_f050_f000 = { __4X2(F_0_500, 0) };
+  __vector int pd_onehalf = { __4X(ONE_HALF) },
+    pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) };
+  __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
+    shift_pack_index =
+      {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
+#else
+    shift_pack_index =
+      {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
+#endif
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+
+    for (num_cols = pitch; num_cols > 0;
+         num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
+         outptr0 += 16, outptr1 += 16, outptr2 += 16) {
+
+#if __BIG_ENDIAN__
+      /* Load 16 pixels == 48 or 64 bytes */
+      offset = (size_t)inptr & 15;
+      if (offset) {
+        __vector unsigned char unaligned_shift_index;
+        int bytes = num_cols + offset;
+
+        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+          /* Slow path to prevent buffer overread.  Since there is no way to
+           * read a partial AltiVec register, overread would occur on the last
+           * chunk of the last image row if the right edge is not on a 16-byte
+           * boundary.  It could also occur on other rows if the bytes per row
+           * is low enough.  Since we can't determine whether we're on the last
+           * image row, we have to assume every row is the last.
+           */
+          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+          rgb0 = vec_ld(0, tmpbuf);
+          rgb1 = vec_ld(16, tmpbuf);
+          rgb2 = vec_ld(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_ld(48, tmpbuf);
+#endif
+        } else {
+          /* Fast path */
+          rgb0 = vec_ld(0, inptr);
+          if (bytes > 16)
+            rgb1 = vec_ld(16, inptr);
+          if (bytes > 32)
+            rgb2 = vec_ld(32, inptr);
+          if (bytes > 48)
+            rgb3 = vec_ld(48, inptr);
+#if RGB_PIXELSIZE == 4
+          if (bytes > 64)
+            rgb4 = vec_ld(64, inptr);
+#endif
+          unaligned_shift_index = vec_lvsl(0, inptr);
+          rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+          rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+          rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
+#endif
+        }
+      } else {
+#endif /* __BIG_ENDIAN__ */
+        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+          /* Slow path */
+          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+          rgb0 = VEC_LD(0, tmpbuf);
+          rgb1 = VEC_LD(16, tmpbuf);
+          rgb2 = VEC_LD(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          rgb3 = VEC_LD(48, tmpbuf);
+#endif
+        } else {
+          /* Fast path */
+          rgb0 = VEC_LD(0, inptr);
+          if (num_cols > 16)
+            rgb1 = VEC_LD(16, inptr);
+          if (num_cols > 32)
+            rgb2 = VEC_LD(32, inptr);
+#if RGB_PIXELSIZE == 4
+          if (num_cols > 48)
+            rgb3 = VEC_LD(48, inptr);
+#endif
+        }
+#if __BIG_ENDIAN__
+      }
+#endif
+
+#if RGB_PIXELSIZE == 3
+      /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+       *
+       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+       */
+      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
+      rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
+      rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
+      rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
+#else
+      /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+       *
+       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+       */
+      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
+      rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
+      rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
+      rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
+#endif
+
+      /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
+       * bg0 = B0 G0 B1 G1 B2 G2 B3 G3
+       * ...
+       *
+       * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+       * support unsigned vectors.
+       */
+      rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
+      bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
+      rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
+      bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
+      rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
+      bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
+      rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
+      bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);
+
+      /* (Original)
+       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+       *
+       * (This implementation)
+       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+       */
+
+      /* Calculate Y values */
+
+      y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
+      y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
+      y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
+      y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
+      y0 = vec_msums(bg0, pw_f0114_f0250, y0);
+      y1 = vec_msums(bg1, pw_f0114_f0250, y1);
+      y2 = vec_msums(bg2, pw_f0114_f0250, y2);
+      y3 = vec_msums(bg3, pw_f0114_f0250, y3);
+      /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word from
+       * each dword into a new 16-bit vector, which is the equivalent of
+       * descaling the 32-bit results (right-shifting by 16 bits) and then
+       * packing them.
+       */
+      yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
+                    shift_pack_index);
+      yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
+                    shift_pack_index);
+      y = vec_pack(yl, yh);
+      vec_st(y, 0, outptr0);
+
+      /* Calculate Cb values */
+      cb0 = vec_msums(rg0, pw_mf016_mf033, pd_onehalfm1_cj);
+      cb1 = vec_msums(rg1, pw_mf016_mf033, pd_onehalfm1_cj);
+      cb2 = vec_msums(rg2, pw_mf016_mf033, pd_onehalfm1_cj);
+      cb3 = vec_msums(rg3, pw_mf016_mf033, pd_onehalfm1_cj);
+      cb0 = (__vector int)vec_msum((__vector unsigned short)bg0, pw_f050_f000,
+                                   (__vector unsigned int)cb0);
+      cb1 = (__vector int)vec_msum((__vector unsigned short)bg1, pw_f050_f000,
+                                   (__vector unsigned int)cb1);
+      cb2 = (__vector int)vec_msum((__vector unsigned short)bg2, pw_f050_f000,
+                                   (__vector unsigned int)cb2);
+      cb3 = (__vector int)vec_msum((__vector unsigned short)bg3, pw_f050_f000,
+                                   (__vector unsigned int)cb3);
+      cbl = vec_perm((__vector unsigned short)cb0,
+                     (__vector unsigned short)cb1, shift_pack_index);
+      cbh = vec_perm((__vector unsigned short)cb2,
+                     (__vector unsigned short)cb3, shift_pack_index);
+      cb = vec_pack(cbl, cbh);
+      vec_st(cb, 0, outptr1);
+
+      /* Calculate Cr values */
+      cr0 = vec_msums(bg0, pw_mf008_mf041, pd_onehalfm1_cj);
+      cr1 = vec_msums(bg1, pw_mf008_mf041, pd_onehalfm1_cj);
+      cr2 = vec_msums(bg2, pw_mf008_mf041, pd_onehalfm1_cj);
+      cr3 = vec_msums(bg3, pw_mf008_mf041, pd_onehalfm1_cj);
+      cr0 = (__vector int)vec_msum((__vector unsigned short)rg0, pw_f050_f000,
+                                   (__vector unsigned int)cr0);
+      cr1 = (__vector int)vec_msum((__vector unsigned short)rg1, pw_f050_f000,
+                                   (__vector unsigned int)cr1);
+      cr2 = (__vector int)vec_msum((__vector unsigned short)rg2, pw_f050_f000,
+                                   (__vector unsigned int)cr2);
+      cr3 = (__vector int)vec_msum((__vector unsigned short)rg3, pw_f050_f000,
+                                   (__vector unsigned int)cr3);
+      crl = vec_perm((__vector unsigned short)cr0,
+                     (__vector unsigned short)cr1, shift_pack_index);
+      crh = vec_perm((__vector unsigned short)cr2,
+                     (__vector unsigned short)cr3, shift_pack_index);
+      cr = vec_pack(crl, crh);
+      vec_st(cr, 0, outptr2);
+    }
+  }
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jccolor-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jccolor-altivec.c
new file mode 100644
index 0000000000..d670dbcda3
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jccolor-altivec.c
@@ -0,0 +1,116 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> YCC CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_081  5329                 /* FIX(0.08131) */
+#define F_0_114  7471                 /* FIX(0.11400) */
+#define F_0_168  11059                /* FIX(0.16874) */
+#define F_0_250  16384                /* FIX(0.25000) */
+#define F_0_299  19595                /* FIX(0.29900) */
+#define F_0_331  21709                /* FIX(0.33126) */
+#define F_0_418  27439                /* FIX(0.41869) */
+#define F_0_500  32768                /* FIX(0.50000) */
+#define F_0_587  38470                /* FIX(0.58700) */
+#define F_0_337  (F_0_587 - F_0_250)  /* FIX(0.58700) - FIX(0.25000) */
+
+#define SCALEBITS  16
+#define ONE_HALF  (1 << (SCALEBITS - 1))
+
+
+#define RGBG_INDEX0 \
+  {  0,  1,  3,  4,  6,  7,  9, 10,  2,  1,  5,  4,  8,  7, 11, 10 }
+#define RGBG_INDEX1 \
+  { 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 }
+#define RGBG_INDEX2 \
+  {  8,  9, 11, 12, 14, 15, 17, 18, 10,  9, 13, 12, 16, 15, 19, 18 }
+#define RGBG_INDEX3 \
+  {  4,  5,  7,  8, 10, 11, 13, 14,  6,  5,  9,  8, 12, 11, 15, 14 }
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_altivec  jsimd_extrgb_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define RGBG_INDEX \
+  {  0,  1,  4,  5,  8,  9, 12, 13,  2,  1,  6,  5, 10,  9, 14, 13 }
+#define jsimd_rgb_ycc_convert_altivec  jsimd_extrgbx_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define RGBG_INDEX0 \
+  {  2,  1,  5,  4,  8,  7, 11, 10,  0,  1,  3,  4,  6,  7,  9, 10 }
+#define RGBG_INDEX1 \
+  { 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 }
+#define RGBG_INDEX2 \
+  { 10,  9, 13, 12, 16, 15, 19, 18,  8,  9, 11, 12, 14, 15, 17, 18 }
+#define RGBG_INDEX3 \
+  {  6,  5,  9,  8, 12, 11, 15, 14,  4,  5,  7,  8, 10, 11, 13, 14 }
+#define jsimd_rgb_ycc_convert_altivec  jsimd_extbgr_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define RGBG_INDEX \
+  {  2,  1,  6,  5, 10,  9, 14, 13,  0,  1,  4,  5,  8,  9, 12, 13 }
+#define jsimd_rgb_ycc_convert_altivec  jsimd_extbgrx_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define RGBG_INDEX \
+  {  3,  2,  7,  6, 11, 10, 15, 14,  1,  2,  5,  6,  9, 10, 13, 14 }
+#define jsimd_rgb_ycc_convert_altivec  jsimd_extxbgr_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define RGBG_INDEX \
+  {  1,  2,  5,  6,  9, 10, 13, 14,  3,  2,  7,  6, 11, 10, 15, 14 }
+#define jsimd_rgb_ycc_convert_altivec  jsimd_extxrgb_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jcgray-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jcgray-altivec.c
new file mode 100644
index 0000000000..a11a7e7021
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jcgray-altivec.c
@@ -0,0 +1,111 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> GRAYSCALE CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_114  7471                 /* FIX(0.11400) */
+#define F_0_250  16384                /* FIX(0.25000) */
+#define F_0_299  19595                /* FIX(0.29900) */
+#define F_0_587  38470                /* FIX(0.58700) */
+#define F_0_337  (F_0_587 - F_0_250)  /* FIX(0.58700) - FIX(0.25000) */
+
+#define SCALEBITS  16
+#define ONE_HALF  (1 << (SCALEBITS - 1))
+
+
+#define RGBG_INDEX0 \
+  {  0,  1,  3,  4,  6,  7,  9, 10,  2,  1,  5,  4,  8,  7, 11, 10 }
+#define RGBG_INDEX1 \
+  { 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 }
+#define RGBG_INDEX2 \
+  {  8,  9, 11, 12, 14, 15, 17, 18, 10,  9, 13, 12, 16, 15, 19, 18 }
+#define RGBG_INDEX3 \
+  {  4,  5,  7,  8, 10, 11, 13, 14,  6,  5,  9,  8, 12, 11, 15, 14 }
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_altivec  jsimd_extrgb_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define RGBG_INDEX \
+  {  0,  1,  4,  5,  8,  9, 12, 13,  2,  1,  6,  5, 10,  9, 14, 13 }
+#define jsimd_rgb_gray_convert_altivec  jsimd_extrgbx_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define RGBG_INDEX0 \
+  {  2,  1,  5,  4,  8,  7, 11, 10,  0,  1,  3,  4,  6,  7,  9, 10 }
+#define RGBG_INDEX1 \
+  { 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 }
+#define RGBG_INDEX2 \
+  { 10,  9, 13, 12, 16, 15, 19, 18,  8,  9, 11, 12, 14, 15, 17, 18 }
+#define RGBG_INDEX3 \
+  {  6,  5,  9,  8, 12, 11, 15, 14,  4,  5,  7,  8, 10, 11, 13, 14 }
+#define jsimd_rgb_gray_convert_altivec  jsimd_extbgr_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define RGBG_INDEX \
+  {  2,  1,  6,  5, 10,  9, 14, 13,  0,  1,  4,  5,  8,  9, 12, 13 }
+#define jsimd_rgb_gray_convert_altivec  jsimd_extbgrx_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define RGBG_INDEX \
+  {  3,  2,  7,  6, 11, 10, 15, 14,  1,  2,  5,  6,  9, 10, 13, 14 }
+#define jsimd_rgb_gray_convert_altivec  jsimd_extxbgr_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define RGBG_INDEX \
+  {  1,  2,  5,  6,  9, 10, 13, 14,  3,  2,  7,  6, 11, 10, 15, 14 }
+#define jsimd_rgb_gray_convert_altivec  jsimd_extxrgb_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jcgryext-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jcgryext-altivec.c
new file mode 100644
index 0000000000..b280cbbded
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jcgryext-altivec.c
@@ -0,0 +1,228 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2014, Jay Foad.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-altivec.c */
+
+
+void jsimd_rgb_gray_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf,
+                                    JSAMPIMAGE output_buf,
+                                    JDIMENSION output_row, int num_rows)
+{
+  JSAMPROW inptr, outptr;
+  int pitch = img_width * RGB_PIXELSIZE, num_cols;
+#if __BIG_ENDIAN__
+  int offset;
+  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
+#endif
+
+  __vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 },
+    rgbg0, rgbg1, rgbg2, rgbg3, y;
+#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
+  __vector unsigned char rgb3 = { 0 };
+#endif
+#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
+  __vector unsigned char rgb4 = { 0 };
+#endif
+  __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
+  __vector unsigned short yl, yh;
+  __vector int y0, y1, y2, y3;
+
+  /* Constants */
+  __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
+    pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) };
+  __vector int pd_onehalf = { __4X(ONE_HALF) };
+  __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
+    shift_pack_index =
+      { 0, 1, 4, 5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
+#else
+    shift_pack_index =
+      { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
+#endif
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr = output_buf[0][output_row];
+    output_row++;
+
+    for (num_cols = pitch; num_cols > 0;
+         num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
+         outptr += 16) {
+
+#if __BIG_ENDIAN__
+      /* Load 16 pixels == 48 or 64 bytes */
+      offset = (size_t)inptr & 15;
+      if (offset) {
+        __vector unsigned char unaligned_shift_index;
+        int bytes = num_cols + offset;
+
+        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+          /* Slow path to prevent buffer overread.  Since there is no way to
+           * read a partial AltiVec register, overread would occur on the last
+           * chunk of the last image row if the right edge is not on a 16-byte
+           * boundary.  It could also occur on other rows if the bytes per row
+           * is low enough.  Since we can't determine whether we're on the last
+           * image row, we have to assume every row is the last.
+           */
+          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+          rgb0 = vec_ld(0, tmpbuf);
+          rgb1 = vec_ld(16, tmpbuf);
+          rgb2 = vec_ld(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_ld(48, tmpbuf);
+#endif
+        } else {
+          /* Fast path */
+          rgb0 = vec_ld(0, inptr);
+          if (bytes > 16)
+            rgb1 = vec_ld(16, inptr);
+          if (bytes > 32)
+            rgb2 = vec_ld(32, inptr);
+          if (bytes > 48)
+            rgb3 = vec_ld(48, inptr);
+#if RGB_PIXELSIZE == 4
+          if (bytes > 64)
+            rgb4 = vec_ld(64, inptr);
+#endif
+          unaligned_shift_index = vec_lvsl(0, inptr);
+          rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+          rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+          rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
+#endif
+        }
+      } else {
+        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+          /* Slow path */
+          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+          rgb0 = vec_ld(0, tmpbuf);
+          rgb1 = vec_ld(16, tmpbuf);
+          rgb2 = vec_ld(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_ld(48, tmpbuf);
+#endif
+        } else {
+          /* Fast path */
+          rgb0 = vec_ld(0, inptr);
+          if (num_cols > 16)
+            rgb1 = vec_ld(16, inptr);
+          if (num_cols > 32)
+            rgb2 = vec_ld(32, inptr);
+#if RGB_PIXELSIZE == 4
+          if (num_cols > 48)
+            rgb3 = vec_ld(48, inptr);
+#endif
+        }
+      }
+#else
+      /* Little endian */
+      rgb0 = vec_vsx_ld(0, inptr);
+      if (num_cols > 16)
+        rgb1 = vec_vsx_ld(16, inptr);
+      if (num_cols > 32)
+        rgb2 = vec_vsx_ld(32, inptr);
+#if RGB_PIXELSIZE == 4
+      if (num_cols > 48)
+        rgb3 = vec_vsx_ld(48, inptr);
+#endif
+#endif
+
+#if RGB_PIXELSIZE == 3
+      /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+       *
+       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+       */
+      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
+      rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
+      rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
+      rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
+#else
+      /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+       *
+       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+       */
+      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
+      rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
+      rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
+      rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
+#endif
+
+      /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
+       * bg0 = B0 G0 B1 G1 B2 G2 B3 G3
+       * ...
+       *
+       * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+       * support unsigned vectors.
+       */
+      rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
+      bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
+      rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
+      bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
+      rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
+      bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
+      rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
+      bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);
+
+      /* (Original)
+       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+       *
+       * (This implementation)
+       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+       */
+
+      /* Calculate Y values */
+
+      y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
+      y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
+      y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
+      y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
+      y0 = vec_msums(bg0, pw_f0114_f0250, y0);
+      y1 = vec_msums(bg1, pw_f0114_f0250, y1);
+      y2 = vec_msums(bg2, pw_f0114_f0250, y2);
+      y3 = vec_msums(bg3, pw_f0114_f0250, y3);
+      /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word from
+       * each dword into a new 16-bit vector, which is the equivalent of
+       * descaling the 32-bit results (right-shifting by 16 bits) and then
+       * packing them.
+       */
+      yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
+                    shift_pack_index);
+      yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
+                    shift_pack_index);
+      y = vec_pack(yl, yh);
+      vec_st(y, 0, outptr);
+    }
+  }
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jcsample-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jcsample-altivec.c
new file mode 100644
index 0000000000..6e25b8db90
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jcsample-altivec.c
@@ -0,0 +1,159 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA DOWNSAMPLING */
+
+#include "jsimd_altivec.h"
+#include "jcsample.h"
+
+
+void jsimd_h2v1_downsample_altivec(JDIMENSION image_width,
+                                   int max_v_samp_factor,
+                                   JDIMENSION v_samp_factor,
+                                   JDIMENSION width_in_blocks,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY output_data)
+{
+  int outrow, outcol;
+  JDIMENSION output_cols = width_in_blocks * DCTSIZE;
+  JSAMPROW inptr, outptr;
+
+  __vector unsigned char this0, next0, out;
+  __vector unsigned short this0e, this0o, next0e, next0o, outl, outh;
+
+  /* Constants */
+  __vector unsigned short pw_bias = { __4X2(0, 1) },
+    pw_one = { __8X(1) };
+  __vector unsigned char even_odd_index =
+    {  0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15 },
+    pb_zero = { __16X(0) };
+
+  expand_right_edge(input_data, max_v_samp_factor, image_width,
+                    output_cols * 2);
+
+  for (outrow = 0; outrow < v_samp_factor; outrow++) {
+    outptr = output_data[outrow];
+    inptr = input_data[outrow];
+
+    for (outcol = output_cols; outcol > 0;
+         outcol -= 16, inptr += 32, outptr += 16) {
+
+      this0 = vec_ld(0, inptr);
+      this0 = vec_perm(this0, this0, even_odd_index);
+      this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
+      this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
+      outl = vec_add(this0e, this0o);
+      outl = vec_add(outl, pw_bias);
+      outl = vec_sr(outl, pw_one);
+
+      if (outcol > 8) {
+        next0 = vec_ld(16, inptr);
+        next0 = vec_perm(next0, next0, even_odd_index);
+        next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
+        next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
+        outh = vec_add(next0e, next0o);
+        outh = vec_add(outh, pw_bias);
+        outh = vec_sr(outh, pw_one);
+      } else
+        outh = vec_splat_u16(0);
+
+      out = vec_pack(outl, outh);
+      vec_st(out, 0, outptr);
+    }
+  }
+}
+
+
+void
+jsimd_h2v2_downsample_altivec(JDIMENSION image_width, int max_v_samp_factor,
+                              JDIMENSION v_samp_factor,
+                              JDIMENSION width_in_blocks,
+                              JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  int inrow, outrow, outcol;
+  JDIMENSION output_cols = width_in_blocks * DCTSIZE;
+  JSAMPROW inptr0, inptr1, outptr;
+
+  __vector unsigned char this0, next0, this1, next1, out;
+  __vector unsigned short this0e, this0o, next0e, next0o, this1e, this1o,
+    next1e, next1o, out0l, out0h, out1l, out1h, outl, outh;
+
+  /* Constants */
+  __vector unsigned short pw_bias = { __4X2(1, 2) },
+    pw_two = { __8X(2) };
+  __vector unsigned char even_odd_index =
+    {  0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15 },
+    pb_zero = { __16X(0) };
+
+  expand_right_edge(input_data, max_v_samp_factor, image_width,
+                    output_cols * 2);
+
+  for (inrow = 0, outrow = 0; outrow < v_samp_factor;
+       inrow += 2, outrow++) {
+
+    inptr0 = input_data[inrow];
+    inptr1 = input_data[inrow + 1];
+    outptr = output_data[outrow];
+
+    for (outcol = output_cols; outcol > 0;
+         outcol -= 16, inptr0 += 32, inptr1 += 32, outptr += 16) {
+
+      this0 = vec_ld(0, inptr0);
+      this0 = vec_perm(this0, this0, even_odd_index);
+      this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
+      this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
+      out0l = vec_add(this0e, this0o);
+
+      this1 = vec_ld(0, inptr1);
+      this1 = vec_perm(this1, this1, even_odd_index);
+      this1e = (__vector unsigned short)VEC_UNPACKHU(this1);
+      this1o = (__vector unsigned short)VEC_UNPACKLU(this1);
+      out1l = vec_add(this1e, this1o);
+
+      outl = vec_add(out0l, out1l);
+      outl = vec_add(outl, pw_bias);
+      outl = vec_sr(outl, pw_two);
+
+      if (outcol > 8) {
+        next0 = vec_ld(16, inptr0);
+        next0 = vec_perm(next0, next0, even_odd_index);
+        next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
+        next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
+        out0h = vec_add(next0e, next0o);
+
+        next1 = vec_ld(16, inptr1);
+        next1 = vec_perm(next1, next1, even_odd_index);
+        next1e = (__vector unsigned short)VEC_UNPACKHU(next1);
+        next1o = (__vector unsigned short)VEC_UNPACKLU(next1);
+        out1h = vec_add(next1e, next1o);
+
+        outh = vec_add(out0h, out1h);
+        outh = vec_add(outh, pw_bias);
+        outh = vec_sr(outh, pw_two);
+      } else
+        outh = vec_splat_u16(0);
+
+      out = vec_pack(outl, outh);
+      vec_st(out, 0, outptr);
+    }
+  }
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jcsample.h b/3rdparty/libjpeg-turbo/src/simd/powerpc/jcsample.h
new file mode 100644
index 0000000000..bd07fcc4ed
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jcsample.h
@@ -0,0 +1,28 @@
+/*
+ * jcsample.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+LOCAL(void)
+expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
+                  JDIMENSION output_cols)
+{
+  register JSAMPROW ptr;
+  register JSAMPLE pixval;
+  register int count;
+  int row;
+  int numcols = (int)(output_cols - input_cols);
+
+  if (numcols > 0) {
+    for (row = 0; row < num_rows; row++) {
+      ptr = image_data[row] + input_cols;
+      pixval = ptr[-1];
+      for (count = numcols; count > 0; count--)
+        *ptr++ = pixval;
+    }
+  }
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jdcolext-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jdcolext-altivec.c
new file mode 100644
index 0000000000..68d52bd8a2
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jdcolext-altivec.c
@@ -0,0 +1,276 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-altivec.c */
+
+
+void jsimd_ycc_rgb_convert_altivec(JDIMENSION out_width, JSAMPIMAGE input_buf,
+                                   JDIMENSION input_row, JSAMPARRAY output_buf,
+                                   int num_rows)
+{
+  JSAMPROW outptr, inptr0, inptr1, inptr2;
+  int pitch = out_width * RGB_PIXELSIZE, num_cols;
+#if __BIG_ENDIAN__
+  int offset;
+#endif
+  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
+
+  __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
+    y, cb, cr;
+#if __BIG_ENDIAN__
+  __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
+#if RGB_PIXELSIZE == 4
+  __vector unsigned char out4;
+#endif
+#endif
+#if RGB_PIXELSIZE == 4
+  __vector unsigned char rgb3;
+#endif
+  __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, yl, yh, cbl, cbh,
+    crl, crh, rl, rh, gl, gh, bl, bh, g0w, g1w, g2w, g3w;
+  __vector int g0, g1, g2, g3;
+
+  /* Constants
+   * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17
+   * high-order bits, not 16.
+   */
+  __vector short pw_f0402 = { __8X(F_0_402 >> 1) },
+    pw_mf0228 = { __8X(-F_0_228 >> 1) },
+    pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) },
+    pw_one = { __8X(1) }, pw_255 = { __8X(255) },
+    pw_cj = { __8X(CENTERJSAMPLE) };
+  __vector int pd_onehalf = { __4X(ONE_HALF) };
+  __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
+    shift_pack_index =
+      {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
+#else
+    shift_pack_index =
+      {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
+#endif
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+
+    for (num_cols = pitch; num_cols > 0;
+         num_cols -= RGB_PIXELSIZE * 16, outptr += RGB_PIXELSIZE * 16,
+         inptr0 += 16, inptr1 += 16, inptr2 += 16) {
+
+      y = vec_ld(0, inptr0);
+      /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+       * support unsigned vectors.
+       */
+      yl = (__vector signed short)VEC_UNPACKHU(y);
+      yh = (__vector signed short)VEC_UNPACKLU(y);
+
+      cb = vec_ld(0, inptr1);
+      cbl = (__vector signed short)VEC_UNPACKHU(cb);
+      cbh = (__vector signed short)VEC_UNPACKLU(cb);
+      cbl = vec_sub(cbl, pw_cj);
+      cbh = vec_sub(cbh, pw_cj);
+
+      cr = vec_ld(0, inptr2);
+      crl = (__vector signed short)VEC_UNPACKHU(cr);
+      crh = (__vector signed short)VEC_UNPACKLU(cr);
+      crl = vec_sub(crl, pw_cj);
+      crh = vec_sub(crh, pw_cj);
+
+      /* (Original)
+       * R = Y                + 1.40200 * Cr
+       * G = Y - 0.34414 * Cb - 0.71414 * Cr
+       * B = Y + 1.77200 * Cb
+       *
+       * (This implementation)
+       * R = Y                + 0.40200 * Cr + Cr
+       * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+       * B = Y - 0.22800 * Cb + Cb + Cb
+       */
+      bl = vec_add(cbl, cbl);
+      bh = vec_add(cbh, cbh);
+      bl = vec_madds(bl, pw_mf0228, pw_one);
+      bh = vec_madds(bh, pw_mf0228, pw_one);
+      bl = vec_sra(bl, (__vector unsigned short)pw_one);
+      bh = vec_sra(bh, (__vector unsigned short)pw_one);
+      bl = vec_add(bl, cbl);
+      bh = vec_add(bh, cbh);
+      bl = vec_add(bl, cbl);
+      bh = vec_add(bh, cbh);
+      bl = vec_add(bl, yl);
+      bh = vec_add(bh, yh);
+
+      rl = vec_add(crl, crl);
+      rh = vec_add(crh, crh);
+      rl = vec_madds(rl, pw_f0402, pw_one);
+      rh = vec_madds(rh, pw_f0402, pw_one);
+      rl = vec_sra(rl, (__vector unsigned short)pw_one);
+      rh = vec_sra(rh, (__vector unsigned short)pw_one);
+      rl = vec_add(rl, crl);
+      rh = vec_add(rh, crh);
+      rl = vec_add(rl, yl);
+      rh = vec_add(rh, yh);
+
+      g0w = vec_mergeh(cbl, crl);
+      g1w = vec_mergel(cbl, crl);
+      g0 = vec_msums(g0w, pw_mf0344_f0285, pd_onehalf);
+      g1 = vec_msums(g1w, pw_mf0344_f0285, pd_onehalf);
+      g2w = vec_mergeh(cbh, crh);
+      g3w = vec_mergel(cbh, crh);
+      g2 = vec_msums(g2w, pw_mf0344_f0285, pd_onehalf);
+      g3 = vec_msums(g3w, pw_mf0344_f0285, pd_onehalf);
+      /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word from
+       * each dword into a new 16-bit vector, which is the equivalent of
+       * descaling the 32-bit results (right-shifting by 16 bits) and then
+       * packing them.
+       */
+      gl = vec_perm((__vector short)g0, (__vector short)g1, shift_pack_index);
+      gh = vec_perm((__vector short)g2, (__vector short)g3, shift_pack_index);
+      gl = vec_sub(gl, crl);
+      gh = vec_sub(gh, crh);
+      gl = vec_add(gl, yl);
+      gh = vec_add(gh, yh);
+
+      rg0 = vec_mergeh(rl, gl);
+      bx0 = vec_mergeh(bl, pw_255);
+      rg1 = vec_mergel(rl, gl);
+      bx1 = vec_mergel(bl, pw_255);
+      rg2 = vec_mergeh(rh, gh);
+      bx2 = vec_mergeh(bh, pw_255);
+      rg3 = vec_mergel(rh, gh);
+      bx3 = vec_mergel(bh, pw_255);
+
+      rgbx0 = vec_packsu(rg0, bx0);
+      rgbx1 = vec_packsu(rg1, bx1);
+      rgbx2 = vec_packsu(rg2, bx2);
+      rgbx3 = vec_packsu(rg3, bx3);
+
+#if RGB_PIXELSIZE == 3
+      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+       *
+       * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+       */
+      rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0);
+      rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1);
+      rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2);
+#else
+      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+       *
+       * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+       */
+      rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX);
+      rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX);
+      rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX);
+      rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
+#endif
+
+#if __BIG_ENDIAN__
+      offset = (size_t)outptr & 15;
+      if (offset) {
+        __vector unsigned char unaligned_shift_index;
+        int bytes = num_cols + offset;
+
+        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+          /* Slow path to prevent buffer overwrite.  Since there is no way to
+           * write a partial AltiVec register, overwrite would occur on the
+           * last chunk of the last image row if the right edge is not on a
+           * 16-byte boundary.  It could also occur on other rows if the bytes
+           * per row is low enough.  Since we can't determine whether we're on
+           * the last image row, we have to assume every row is the last.
+           */
+          vec_st(rgb0, 0, tmpbuf);
+          vec_st(rgb1, 16, tmpbuf);
+          vec_st(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          vec_st(rgb3, 48, tmpbuf);
+#endif
+          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
+        } else {
+          /* Fast path */
+          unaligned_shift_index = vec_lvsl(0, outptr);
+          edgel = vec_ld(0, outptr);
+          edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr);
+          edges = vec_perm(edgeh, edgel, unaligned_shift_index);
+          unaligned_shift_index = vec_lvsr(0, outptr);
+          out0 = vec_perm(edges, rgb0, unaligned_shift_index);
+          out1 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+          out2 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+          out3 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+          out4 = vec_perm(rgb3, edges, unaligned_shift_index);
+#else
+          out3 = vec_perm(rgb2, edges, unaligned_shift_index);
+#endif
+          vec_st(out0, 0, outptr);
+          if (bytes > 16)
+            vec_st(out1, 16, outptr);
+          if (bytes > 32)
+            vec_st(out2, 32, outptr);
+          if (bytes > 48)
+            vec_st(out3, 48, outptr);
+#if RGB_PIXELSIZE == 4
+          if (bytes > 64)
+            vec_st(out4, 64, outptr);
+#endif
+        }
+      } else {
+#endif /* __BIG_ENDIAN__ */
+        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+          /* Slow path */
+          VEC_ST(rgb0, 0, tmpbuf);
+          VEC_ST(rgb1, 16, tmpbuf);
+          VEC_ST(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          VEC_ST(rgb3, 48, tmpbuf);
+#endif
+          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
+        } else {
+          /* Fast path */
+          VEC_ST(rgb0, 0, outptr);
+          if (num_cols > 16)
+            VEC_ST(rgb1, 16, outptr);
+          if (num_cols > 32)
+            VEC_ST(rgb2, 32, outptr);
+#if RGB_PIXELSIZE == 4
+          if (num_cols > 48)
+            VEC_ST(rgb3, 48, outptr);
+#endif
+        }
+#if __BIG_ENDIAN__
+      }
+#endif
+    }
+  }
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jdcolor-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jdcolor-altivec.c
new file mode 100644
index 0000000000..eb35b67176
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jdcolor-altivec.c
@@ -0,0 +1,106 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_344  22554              /* FIX(0.34414) */
+#define F_0_714  46802              /* FIX(0.71414) */
+#define F_1_402  91881              /* FIX(1.40200) */
+#define F_1_772  116130             /* FIX(1.77200) */
+#define F_0_402  (F_1_402 - 65536)  /* FIX(1.40200) - FIX(1) */
+#define F_0_285  (65536 - F_0_714)  /* FIX(1) - FIX(0.71414) */
+#define F_0_228  (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
+
+#define SCALEBITS  16
+#define ONE_HALF  (1 << (SCALEBITS - 1))
+
+#define RGB_INDEX0 \
+  {  0,  1,  8,  2,  3, 10,  4,  5, 12,  6,  7, 14, 16, 17, 24, 18 }
+#define RGB_INDEX1 \
+  {  3, 10,  4,  5, 12,  6,  7, 14, 16, 17, 24, 18, 19, 26, 20, 21 }
+#define RGB_INDEX2 \
+  { 12,  6,  7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 }
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extrgb_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define RGB_INDEX \
+  {  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15 }
+#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extrgbx_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define RGB_INDEX0 \
+  {  8,  1,  0, 10,  3,  2, 12,  5,  4, 14,  7,  6, 24, 17, 16, 26 }
+#define RGB_INDEX1 \
+  {  3,  2, 12,  5,  4, 14,  7,  6, 24, 17, 16, 26, 19, 18, 28, 21 }
+#define RGB_INDEX2 \
+  {  4, 14,  7,  6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 }
+#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extbgr_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define RGB_INDEX \
+  {  8,  1,  0,  9, 10,  3,  2, 11, 12,  5,  4, 13, 14,  7,  6, 15 }
+#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extbgrx_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define RGB_INDEX \
+  {  9,  8,  1,  0, 11, 10,  3,  2, 13, 12,  5,  4, 15, 14,  7,  6 }
+#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extxbgr_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define RGB_INDEX \
+  {  9,  0,  1,  8, 11,  2,  3, 10, 13,  4,  5, 12, 15,  6,  7, 14 }
+#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extxrgb_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jdmerge-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jdmerge-altivec.c
new file mode 100644
index 0000000000..79c577f141
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jdmerge-altivec.c
@@ -0,0 +1,130 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* MERGED YCC --> RGB CONVERSION AND UPSAMPLING */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_344  22554              /* FIX(0.34414) */
+#define F_0_714  46802              /* FIX(0.71414) */
+#define F_1_402  91881              /* FIX(1.40200) */
+#define F_1_772  116130             /* FIX(1.77200) */
+#define F_0_402  (F_1_402 - 65536)  /* FIX(1.40200) - FIX(1) */
+#define F_0_285  (65536 - F_0_714)  /* FIX(1) - FIX(0.71414) */
+#define F_0_228  (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
+
+#define SCALEBITS  16
+#define ONE_HALF  (1 << (SCALEBITS - 1))
+
+#define RGB_INDEX0 \
+  {  0,  1,  8,  2,  3, 10,  4,  5, 12,  6,  7, 14, 16, 17, 24, 18 }
+#define RGB_INDEX1 \
+  {  3, 10,  4,  5, 12,  6,  7, 14, 16, 17, 24, 18, 19, 26, 20, 21 }
+#define RGB_INDEX2 \
+  { 12,  6,  7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 }
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_altivec \
+  jsimd_h2v1_extrgb_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+  jsimd_h2v2_extrgb_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define RGB_INDEX \
+  {  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15 }
+#define jsimd_h2v1_merged_upsample_altivec \
+  jsimd_h2v1_extrgbx_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+  jsimd_h2v2_extrgbx_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define RGB_INDEX0 \
+  {  8,  1,  0, 10,  3,  2, 12,  5,  4, 14,  7,  6, 24, 17, 16, 26 }
+#define RGB_INDEX1 \
+  {  3,  2, 12,  5,  4, 14,  7,  6, 24, 17, 16, 26, 19, 18, 28, 21 }
+#define RGB_INDEX2 \
+  {  4, 14,  7,  6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 }
+#define jsimd_h2v1_merged_upsample_altivec \
+  jsimd_h2v1_extbgr_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+  jsimd_h2v2_extbgr_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define RGB_INDEX \
+  {  8,  1,  0,  9, 10,  3,  2, 11, 12,  5,  4, 13, 14,  7,  6, 15 }
+#define jsimd_h2v1_merged_upsample_altivec \
+  jsimd_h2v1_extbgrx_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+  jsimd_h2v2_extbgrx_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define RGB_INDEX \
+  {  9,  8,  1,  0, 11, 10,  3,  2, 13, 12,  5,  4, 15, 14,  7,  6 }
+#define jsimd_h2v1_merged_upsample_altivec \
+  jsimd_h2v1_extxbgr_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+  jsimd_h2v2_extxbgr_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define RGB_INDEX \
+  {  9,  0,  1,  8, 11,  2,  3, 10, 13,  4,  5, 12, 15,  6,  7, 14 }
+#define jsimd_h2v1_merged_upsample_altivec \
+  jsimd_h2v1_extxrgb_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+  jsimd_h2v2_extxrgb_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jdmrgext-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jdmrgext-altivec.c
new file mode 100644
index 0000000000..40f02c33ea
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jdmrgext-altivec.c
@@ -0,0 +1,329 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-altivec.c */
+
+
+void jsimd_h2v1_merged_upsample_altivec(JDIMENSION output_width,
+                                        JSAMPIMAGE input_buf,
+                                        JDIMENSION in_row_group_ctr,
+                                        JSAMPARRAY output_buf)
+{
+  JSAMPROW outptr, inptr0, inptr1, inptr2;
+  int pitch = output_width * RGB_PIXELSIZE, num_cols, yloop;
+#if __BIG_ENDIAN__
+  int offset;
+#endif
+  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
+
+  __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
+    y, cb, cr;
+#if __BIG_ENDIAN__
+  __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
+#if RGB_PIXELSIZE == 4
+  __vector unsigned char out4;
+#endif
+#endif
+#if RGB_PIXELSIZE == 4
+  __vector unsigned char rgb3;
+#endif
+  __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, ye, yo, cbl, cbh,
+    crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w,
+    rl, rh, gl, gh, bl, bh, re, ro, ge, go, be, bo;
+  __vector int g_y0, g_y1, g_y2, g_y3;
+
+  /* Constants
+   * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17
+   * high-order bits, not 16.
+   */
+  __vector short pw_f0402 = { __8X(F_0_402 >> 1) },
+    pw_mf0228 = { __8X(-F_0_228 >> 1) },
+    pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) },
+    pw_one = { __8X(1) }, pw_255 = { __8X(255) },
+    pw_cj = { __8X(CENTERJSAMPLE) };
+  __vector int pd_onehalf = { __4X(ONE_HALF) };
+  __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
+    shift_pack_index =
+      {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 },
+    even_index =
+      {  0, 16,  0, 18,  0, 20,  0, 22,  0, 24,  0, 26,  0, 28,  0, 30 },
+    odd_index =
+      {  0, 17,  0, 19,  0, 21,  0, 23,  0, 25,  0, 27,  0, 29,  0, 31 };
+#else
+    shift_pack_index =
+      {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 },
+    even_index =
+      { 16,  0, 18,  0, 20,  0, 22,  0, 24,  0, 26,  0, 28,  0, 30,  0 },
+    odd_index =
+      { 17,  0, 19,  0, 21,  0, 23,  0, 25,  0, 27,  0, 29,  0, 31,  0 };
+#endif
+
+  inptr0 = input_buf[0][in_row_group_ctr];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr = output_buf[0];
+
+  for (num_cols = pitch; num_cols > 0; inptr1 += 16, inptr2 += 16) {
+
+    cb = vec_ld(0, inptr1);
+    /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+     * support unsigned vectors.
+     */
+    cbl = (__vector signed short)VEC_UNPACKHU(cb);
+    cbh = (__vector signed short)VEC_UNPACKLU(cb);
+    cbl = vec_sub(cbl, pw_cj);
+    cbh = vec_sub(cbh, pw_cj);
+
+    cr = vec_ld(0, inptr2);
+    crl = (__vector signed short)VEC_UNPACKHU(cr);
+    crh = (__vector signed short)VEC_UNPACKLU(cr);
+    crl = vec_sub(crl, pw_cj);
+    crh = vec_sub(crh, pw_cj);
+
+    /* (Original)
+     * R = Y                + 1.40200 * Cr
+     * G = Y - 0.34414 * Cb - 0.71414 * Cr
+     * B = Y + 1.77200 * Cb
+     *
+     * (This implementation)
+     * R = Y                + 0.40200 * Cr + Cr
+     * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+     * B = Y - 0.22800 * Cb + Cb + Cb
+     */
+    b_yl = vec_add(cbl, cbl);
+    b_yh = vec_add(cbh, cbh);
+    b_yl = vec_madds(b_yl, pw_mf0228, pw_one);
+    b_yh = vec_madds(b_yh, pw_mf0228, pw_one);
+    b_yl = vec_sra(b_yl, (__vector unsigned short)pw_one);
+    b_yh = vec_sra(b_yh, (__vector unsigned short)pw_one);
+    b_yl = vec_add(b_yl, cbl);
+    b_yh = vec_add(b_yh, cbh);
+    b_yl = vec_add(b_yl, cbl);
+    b_yh = vec_add(b_yh, cbh);
+
+    r_yl = vec_add(crl, crl);
+    r_yh = vec_add(crh, crh);
+    r_yl = vec_madds(r_yl, pw_f0402, pw_one);
+    r_yh = vec_madds(r_yh, pw_f0402, pw_one);
+    r_yl = vec_sra(r_yl, (__vector unsigned short)pw_one);
+    r_yh = vec_sra(r_yh, (__vector unsigned short)pw_one);
+    r_yl = vec_add(r_yl, crl);
+    r_yh = vec_add(r_yh, crh);
+
+    g_y0w = vec_mergeh(cbl, crl);
+    g_y1w = vec_mergel(cbl, crl);
+    g_y0 = vec_msums(g_y0w, pw_mf0344_f0285, pd_onehalf);
+    g_y1 = vec_msums(g_y1w, pw_mf0344_f0285, pd_onehalf);
+    g_y2w = vec_mergeh(cbh, crh);
+    g_y3w = vec_mergel(cbh, crh);
+    g_y2 = vec_msums(g_y2w, pw_mf0344_f0285, pd_onehalf);
+    g_y3 = vec_msums(g_y3w, pw_mf0344_f0285, pd_onehalf);
+    /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word from
+     * each dword into a new 16-bit vector, which is the equivalent of
+     * descaling the 32-bit results (right-shifting by 16 bits) and then
+     * packing them.
+     */
+    g_yl = vec_perm((__vector short)g_y0, (__vector short)g_y1,
+                    shift_pack_index);
+    g_yh = vec_perm((__vector short)g_y2, (__vector short)g_y3,
+                    shift_pack_index);
+    g_yl = vec_sub(g_yl, crl);
+    g_yh = vec_sub(g_yh, crh);
+
+    for (yloop = 0; yloop < 2 && num_cols > 0; yloop++,
+         num_cols -= RGB_PIXELSIZE * 16,
+         outptr += RGB_PIXELSIZE * 16, inptr0 += 16) {
+
+      y = vec_ld(0, inptr0);
+      ye = (__vector signed short)vec_perm(pb_zero, y, even_index);
+      yo = (__vector signed short)vec_perm(pb_zero, y, odd_index);
+
+      if (yloop == 0) {
+        be = vec_add(b_yl, ye);
+        bo = vec_add(b_yl, yo);
+        re = vec_add(r_yl, ye);
+        ro = vec_add(r_yl, yo);
+        ge = vec_add(g_yl, ye);
+        go = vec_add(g_yl, yo);
+      } else {
+        be = vec_add(b_yh, ye);
+        bo = vec_add(b_yh, yo);
+        re = vec_add(r_yh, ye);
+        ro = vec_add(r_yh, yo);
+        ge = vec_add(g_yh, ye);
+        go = vec_add(g_yh, yo);
+      }
+
+      rl = vec_mergeh(re, ro);
+      rh = vec_mergel(re, ro);
+      gl = vec_mergeh(ge, go);
+      gh = vec_mergel(ge, go);
+      bl = vec_mergeh(be, bo);
+      bh = vec_mergel(be, bo);
+
+      rg0 = vec_mergeh(rl, gl);
+      bx0 = vec_mergeh(bl, pw_255);
+      rg1 = vec_mergel(rl, gl);
+      bx1 = vec_mergel(bl, pw_255);
+      rg2 = vec_mergeh(rh, gh);
+      bx2 = vec_mergeh(bh, pw_255);
+      rg3 = vec_mergel(rh, gh);
+      bx3 = vec_mergel(bh, pw_255);
+
+      rgbx0 = vec_packsu(rg0, bx0);
+      rgbx1 = vec_packsu(rg1, bx1);
+      rgbx2 = vec_packsu(rg2, bx2);
+      rgbx3 = vec_packsu(rg3, bx3);
+
+#if RGB_PIXELSIZE == 3
+      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+       *
+       * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+       */
+      rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0);
+      rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1);
+      rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2);
+#else
+      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+       *
+       * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+       */
+      rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX);
+      rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX);
+      rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX);
+      rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
+#endif
+
+#if __BIG_ENDIAN__
+      offset = (size_t)outptr & 15;
+      if (offset) {
+        __vector unsigned char unaligned_shift_index;
+        int bytes = num_cols + offset;
+
+        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+          /* Slow path to prevent buffer overwrite.  Since there is no way to
+           * write a partial AltiVec register, overwrite would occur on the
+           * last chunk of the last image row if the right edge is not on a
+           * 16-byte boundary.  It could also occur on other rows if the bytes
+           * per row is low enough.  Since we can't determine whether we're on
+           * the last image row, we have to assume every row is the last.
+           */
+          vec_st(rgb0, 0, tmpbuf);
+          vec_st(rgb1, 16, tmpbuf);
+          vec_st(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          vec_st(rgb3, 48, tmpbuf);
+#endif
+          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
+        } else {
+          /* Fast path */
+          unaligned_shift_index = vec_lvsl(0, outptr);
+          edgel = vec_ld(0, outptr);
+          edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr);
+          edges = vec_perm(edgeh, edgel, unaligned_shift_index);
+          unaligned_shift_index = vec_lvsr(0, outptr);
+          out0 = vec_perm(edges, rgb0, unaligned_shift_index);
+          out1 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+          out2 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+          out3 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+          out4 = vec_perm(rgb3, edges, unaligned_shift_index);
+#else
+          out3 = vec_perm(rgb2, edges, unaligned_shift_index);
+#endif
+          vec_st(out0, 0, outptr);
+          if (bytes > 16)
+            vec_st(out1, 16, outptr);
+          if (bytes > 32)
+            vec_st(out2, 32, outptr);
+          if (bytes > 48)
+            vec_st(out3, 48, outptr);
+#if RGB_PIXELSIZE == 4
+          if (bytes > 64)
+            vec_st(out4, 64, outptr);
+#endif
+        }
+      } else {
+#endif /* __BIG_ENDIAN__ */
+        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+          /* Slow path */
+          VEC_ST(rgb0, 0, tmpbuf);
+          VEC_ST(rgb1, 16, tmpbuf);
+          VEC_ST(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          VEC_ST(rgb3, 48, tmpbuf);
+#endif
+          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
+        } else {
+          /* Fast path */
+          VEC_ST(rgb0, 0, outptr);
+          if (num_cols > 16)
+            VEC_ST(rgb1, 16, outptr);
+          if (num_cols > 32)
+            VEC_ST(rgb2, 32, outptr);
+#if RGB_PIXELSIZE == 4
+          if (num_cols > 48)
+            VEC_ST(rgb3, 48, outptr);
+#endif
+        }
+#if __BIG_ENDIAN__
+      }
+#endif
+    }
+  }
+}
+
+
+void jsimd_h2v2_merged_upsample_altivec(JDIMENSION output_width,
+                                        JSAMPIMAGE input_buf,
+                                        JDIMENSION in_row_group_ctr,
+                                        JSAMPARRAY output_buf)
+{
+  JSAMPROW inptr, outptr;
+
+  inptr = input_buf[0][in_row_group_ctr];
+  outptr = output_buf[0];
+
+  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2];
+  jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr,
+                                     output_buf);
+
+  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2 + 1];
+  output_buf[0] = output_buf[1];
+  jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr,
+                                     output_buf);
+
+  input_buf[0][in_row_group_ctr] = inptr;
+  output_buf[0] = outptr;
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jdsample-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jdsample-altivec.c
new file mode 100644
index 0000000000..04df0cf108
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jdsample-altivec.c
@@ -0,0 +1,400 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA UPSAMPLING */
+
+#include "jsimd_altivec.h"
+
+
+void jsimd_h2v1_fancy_upsample_altivec(int max_v_samp_factor,
+                                       JDIMENSION downsampled_width,
+                                       JSAMPARRAY input_data,
+                                       JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr;
+  int inrow, incol;
+
+  __vector unsigned char this0, last0, p_last0, next0 = { 0 }, p_next0,
+    out;
+  __vector short this0e, this0o, this0l, this0h, last0l, last0h,
+    next0l, next0h, outle, outhe, outlo, outho;
+
+  /* Constants */
+  __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
+    last_index_col0 =
+      {  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14 },
+    last_index =
+      { 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 },
+    next_index =
+      {  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16 },
+    next_index_lastcol =
+      {  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15 },
+#if __BIG_ENDIAN__
+    merge_pack_index =
+      {  1, 17,  3, 19,  5, 21,  7, 23,  9, 25, 11, 27, 13, 29, 15, 31 };
+#else
+    merge_pack_index =
+      {  0, 16,  2, 18,  4, 20,  6, 22,  8, 24, 10, 26, 12, 28, 14, 30 };
+#endif
+  __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr = output_data[inrow];
+
+    if (downsampled_width & 15)
+      inptr[downsampled_width] = inptr[downsampled_width - 1];
+
+    this0 = vec_ld(0, inptr);
+    p_last0 = vec_perm(this0, this0, last_index_col0);
+    last0 = this0;
+
+    for (incol = downsampled_width; incol > 0;
+         incol -= 16, inptr += 16, outptr += 32) {
+
+      if (downsampled_width - incol > 0) {
+        p_last0 = vec_perm(last0, this0, last_index);
+        last0 = this0;
+      }
+
+      if (incol <= 16)
+        p_next0 = vec_perm(this0, this0, next_index_lastcol);
+      else {
+        next0 = vec_ld(16, inptr);
+        p_next0 = vec_perm(this0, next0, next_index);
+      }
+
+      this0e = (__vector short)vec_mule(this0, pb_three);
+      this0o = (__vector short)vec_mulo(this0, pb_three);
+      this0l = vec_mergeh(this0e, this0o);
+      this0h = vec_mergel(this0e, this0o);
+
+      last0l = (__vector short)VEC_UNPACKHU(p_last0);
+      last0h = (__vector short)VEC_UNPACKLU(p_last0);
+      last0l = vec_add(last0l, pw_one);
+
+      next0l = (__vector short)VEC_UNPACKHU(p_next0);
+      next0h = (__vector short)VEC_UNPACKLU(p_next0);
+      next0l = vec_add(next0l, pw_two);
+
+      outle = vec_add(this0l, last0l);
+      outlo = vec_add(this0l, next0l);
+      outle = vec_sr(outle, (__vector unsigned short)pw_two);
+      outlo = vec_sr(outlo, (__vector unsigned short)pw_two);
+
+      out = vec_perm((__vector unsigned char)outle,
+                     (__vector unsigned char)outlo, merge_pack_index);
+      vec_st(out, 0, outptr);
+
+      if (incol > 8) {
+        last0h = vec_add(last0h, pw_one);
+        next0h = vec_add(next0h, pw_two);
+
+        outhe = vec_add(this0h, last0h);
+        outho = vec_add(this0h, next0h);
+        outhe = vec_sr(outhe, (__vector unsigned short)pw_two);
+        outho = vec_sr(outho, (__vector unsigned short)pw_two);
+
+        out = vec_perm((__vector unsigned char)outhe,
+                       (__vector unsigned char)outho, merge_pack_index);
+        vec_st(out, 16, outptr);
+      }
+
+      this0 = next0;
+    }
+  }
+}
+
+
+void jsimd_h2v2_fancy_upsample_altivec(int max_v_samp_factor,
+                                       JDIMENSION downsampled_width,
+                                       JSAMPARRAY input_data,
+                                       JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
+  int inrow, outrow, incol;
+
+  __vector unsigned char this_1, this0, this1, out;
+  __vector short this_1l, this_1h, this0l, this0h, this1l, this1h,
+    lastcolsum_1h, lastcolsum1h,
+    p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
+    thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
+    nextcolsum_1l = { 0 }, nextcolsum_1h = { 0 },
+    nextcolsum1l = { 0 }, nextcolsum1h = { 0 },
+    p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
+    tmpl, tmph, outle, outhe, outlo, outho;
+
+  /* Constants */
+  __vector unsigned char pb_zero = { __16X(0) },
+    last_index_col0 =
+      {  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13 },
+    last_index =
+      { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 },
+    next_index =
+      {  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17 },
+    next_index_lastcol =
+      {  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 14, 15 },
+#if __BIG_ENDIAN__
+    merge_pack_index =
+      {  1, 17,  3, 19,  5, 21,  7, 23,  9, 25, 11, 27, 13, 29, 15, 31 };
+#else
+    merge_pack_index =
+      {  0, 16,  2, 18,  4, 20,  6, 22,  8, 24, 10, 26, 12, 28, 14, 30 };
+#endif
+  __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
+    pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
+  __vector unsigned short pw_four = { __8X(4) };
+
+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+
+    inptr_1 = input_data[inrow - 1];
+    inptr0 = input_data[inrow];
+    inptr1 = input_data[inrow + 1];
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    if (downsampled_width & 15) {
+      inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
+      inptr0[downsampled_width] = inptr0[downsampled_width - 1];
+      inptr1[downsampled_width] = inptr1[downsampled_width - 1];
+    }
+
+    this0 = vec_ld(0, inptr0);
+    this0l = (__vector short)VEC_UNPACKHU(this0);
+    this0h = (__vector short)VEC_UNPACKLU(this0);
+    this0l = vec_mladd(this0l, pw_three, pw_zero);
+    this0h = vec_mladd(this0h, pw_three, pw_zero);
+
+    this_1 = vec_ld(0, inptr_1);
+    this_1l = (__vector short)VEC_UNPACKHU(this_1);
+    this_1h = (__vector short)VEC_UNPACKLU(this_1);
+    thiscolsum_1l = vec_add(this0l, this_1l);
+    thiscolsum_1h = vec_add(this0h, this_1h);
+    lastcolsum_1h = thiscolsum_1h;
+    p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0);
+    p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
+
+    this1 = vec_ld(0, inptr1);
+    this1l = (__vector short)VEC_UNPACKHU(this1);
+    this1h = (__vector short)VEC_UNPACKLU(this1);
+    thiscolsum1l = vec_add(this0l, this1l);
+    thiscolsum1h = vec_add(this0h, this1h);
+    lastcolsum1h = thiscolsum1h;
+    p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0);
+    p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
+
+    for (incol = downsampled_width; incol > 0;
+         incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16,
+         outptr0 += 32, outptr1 += 32) {
+
+      if (downsampled_width - incol > 0) {
+        p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index);
+        p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
+        p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index);
+        p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
+        lastcolsum_1h = thiscolsum_1h;  lastcolsum1h = thiscolsum1h;
+      }
+
+      if (incol <= 16) {
+        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
+        p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h,
+                                   next_index_lastcol);
+        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
+        p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h,
+                                  next_index_lastcol);
+      } else {
+        this0 = vec_ld(16, inptr0);
+        this0l = (__vector short)VEC_UNPACKHU(this0);
+        this0h = (__vector short)VEC_UNPACKLU(this0);
+        this0l = vec_mladd(this0l, pw_three, pw_zero);
+        this0h = vec_mladd(this0h, pw_three, pw_zero);
+
+        this_1 = vec_ld(16, inptr_1);
+        this_1l = (__vector short)VEC_UNPACKHU(this_1);
+        this_1h = (__vector short)VEC_UNPACKLU(this_1);
+        nextcolsum_1l = vec_add(this0l, this_1l);
+        nextcolsum_1h = vec_add(this0h, this_1h);
+        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
+        p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);
+
+        this1 = vec_ld(16, inptr1);
+        this1l = (__vector short)VEC_UNPACKHU(this1);
+        this1h = (__vector short)VEC_UNPACKLU(this1);
+        nextcolsum1l = vec_add(this0l, this1l);
+        nextcolsum1h = vec_add(this0h, this1h);
+        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
+        p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index);
+      }
+
+      /* Process the upper row */
+
+      tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero);
+      outle = vec_add(tmpl, p_lastcolsum_1l);
+      outle = vec_add(outle, pw_eight);
+      outle = vec_sr(outle, pw_four);
+
+      outlo = vec_add(tmpl, p_nextcolsum_1l);
+      outlo = vec_add(outlo, pw_seven);
+      outlo = vec_sr(outlo, pw_four);
+
+      out = vec_perm((__vector unsigned char)outle,
+                     (__vector unsigned char)outlo, merge_pack_index);
+      vec_st(out, 0, outptr0);
+
+      if (incol > 8) {
+        tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero);
+        outhe = vec_add(tmph, p_lastcolsum_1h);
+        outhe = vec_add(outhe, pw_eight);
+        outhe = vec_sr(outhe, pw_four);
+
+        outho = vec_add(tmph, p_nextcolsum_1h);
+        outho = vec_add(outho, pw_seven);
+        outho = vec_sr(outho, pw_four);
+
+        out = vec_perm((__vector unsigned char)outhe,
+                       (__vector unsigned char)outho, merge_pack_index);
+        vec_st(out, 16, outptr0);
+      }
+
+      /* Process the lower row */
+
+      tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero);
+      outle = vec_add(tmpl, p_lastcolsum1l);
+      outle = vec_add(outle, pw_eight);
+      outle = vec_sr(outle, pw_four);
+
+      outlo = vec_add(tmpl, p_nextcolsum1l);
+      outlo = vec_add(outlo, pw_seven);
+      outlo = vec_sr(outlo, pw_four);
+
+      out = vec_perm((__vector unsigned char)outle,
+                     (__vector unsigned char)outlo, merge_pack_index);
+      vec_st(out, 0, outptr1);
+
+      if (incol > 8) {
+        tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero);
+        outhe = vec_add(tmph, p_lastcolsum1h);
+        outhe = vec_add(outhe, pw_eight);
+        outhe = vec_sr(outhe, pw_four);
+
+        outho = vec_add(tmph, p_nextcolsum1h);
+        outho = vec_add(outho, pw_seven);
+        outho = vec_sr(outho, pw_four);
+
+        out = vec_perm((__vector unsigned char)outhe,
+                       (__vector unsigned char)outho, merge_pack_index);
+        vec_st(out, 16, outptr1);
+      }
+
+      thiscolsum_1l = nextcolsum_1l;  thiscolsum_1h = nextcolsum_1h;
+      thiscolsum1l = nextcolsum1l;  thiscolsum1h = nextcolsum1h;
+    }
+  }
+}
+
+
+/* These are rarely used (mainly just for decompressing YCCK images) */
+
+void jsimd_h2v1_upsample_altivec(int max_v_samp_factor,
+                                 JDIMENSION output_width,
+                                 JSAMPARRAY input_data,
+                                 JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr;
+  int inrow, incol;
+
+  __vector unsigned char in, inl, inh;
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr = output_data[inrow];
+
+    for (incol = (output_width + 31) & (~31); incol > 0;
+         incol -= 64, inptr += 32, outptr += 64) {
+
+      in = vec_ld(0, inptr);
+      inl = vec_mergeh(in, in);
+      inh = vec_mergel(in, in);
+
+      vec_st(inl, 0, outptr);
+      vec_st(inh, 16, outptr);
+
+      if (incol > 32) {
+        in = vec_ld(16, inptr);
+        inl = vec_mergeh(in, in);
+        inh = vec_mergel(in, in);
+
+        vec_st(inl, 32, outptr);
+        vec_st(inh, 48, outptr);
+      }
+    }
+  }
+}
+
+
+void jsimd_h2v2_upsample_altivec(int max_v_samp_factor,
+                                 JDIMENSION output_width,
+                                 JSAMPARRAY input_data,
+                                 JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr0, outptr1;
+  int inrow, outrow, incol;
+
+  __vector unsigned char in, inl, inh;
+
+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+
+    inptr = input_data[inrow];
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    for (incol = (output_width + 31) & (~31); incol > 0;
+         incol -= 64, inptr += 32, outptr0 += 64, outptr1 += 64) {
+
+      in = vec_ld(0, inptr);
+      inl = vec_mergeh(in, in);
+      inh = vec_mergel(in, in);
+
+      vec_st(inl, 0, outptr0);
+      vec_st(inl, 0, outptr1);
+
+      vec_st(inh, 16, outptr0);
+      vec_st(inh, 16, outptr1);
+
+      if (incol > 32) {
+        in = vec_ld(16, inptr);
+        inl = vec_mergeh(in, in);
+        inh = vec_mergel(in, in);
+
+        vec_st(inl, 32, outptr0);
+        vec_st(inl, 32, outptr1);
+
+        vec_st(inh, 48, outptr0);
+        vec_st(inh, 48, outptr1);
+      }
+    }
+  }
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jfdctfst-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jfdctfst-altivec.c
new file mode 100644
index 0000000000..ad9af81e0c
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jfdctfst-altivec.c
@@ -0,0 +1,154 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER FORWARD DCT
+ *
+ * This is similar to the SSE2 implementation, except that we left-shift the
+ * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
+ * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
+ *   the elements in arg3 + the most significant 17 bits of
+ *     (the elements in arg1 * the elements in arg2).
+ */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_382  98   /* FIX(0.382683433) */
+#define F_0_541  139  /* FIX(0.541196100) */
+#define F_0_707  181  /* FIX(0.707106781) */
+#define F_1_306  334  /* FIX(1.306562965) */
+
+#define CONST_BITS  8
+#define PRE_MULTIPLY_SCALE_BITS  2
+#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
+
+
+#define DO_FDCT() { \
+  /* Even part */ \
+  \
+  tmp10 = vec_add(tmp0, tmp3); \
+  tmp13 = vec_sub(tmp0, tmp3); \
+  tmp11 = vec_add(tmp1, tmp2); \
+  tmp12 = vec_sub(tmp1, tmp2); \
+  \
+  out0  = vec_add(tmp10, tmp11); \
+  out4  = vec_sub(tmp10, tmp11); \
+  \
+  z1 = vec_add(tmp12, tmp13); \
+  z1 = vec_sl(z1, pre_multiply_scale_bits); \
+  z1 = vec_madds(z1, pw_0707, pw_zero); \
+  \
+  out2 = vec_add(tmp13, z1); \
+  out6 = vec_sub(tmp13, z1); \
+  \
+  /* Odd part */ \
+  \
+  tmp10 = vec_add(tmp4, tmp5); \
+  tmp11 = vec_add(tmp5, tmp6); \
+  tmp12 = vec_add(tmp6, tmp7); \
+  \
+  tmp10 = vec_sl(tmp10, pre_multiply_scale_bits); \
+  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
+  z5 = vec_sub(tmp10, tmp12); \
+  z5 = vec_madds(z5, pw_0382, pw_zero); \
+  \
+  z2 = vec_madds(tmp10, pw_0541, z5); \
+  z4 = vec_madds(tmp12, pw_1306, z5); \
+  \
+  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
+  z3 = vec_madds(tmp11, pw_0707, pw_zero); \
+  \
+  z11 = vec_add(tmp7, z3); \
+  z13 = vec_sub(tmp7, z3); \
+  \
+  out5 = vec_add(z13, z2); \
+  out3 = vec_sub(z13, z2); \
+  out1 = vec_add(z11, z4); \
+  out7 = vec_sub(z11, z4); \
+}
+
+
+void jsimd_fdct_ifast_altivec(DCTELEM *data)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    z1, z2, z3, z4, z5, z11, z13,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+
+  /* Constants */
+  __vector short pw_zero = { __8X(0) },
+    pw_0382 = { __8X(F_0_382 << CONST_SHIFT) },
+    pw_0541 = { __8X(F_0_541 << CONST_SHIFT) },
+    pw_0707 = { __8X(F_0_707 << CONST_SHIFT) },
+    pw_1306 = { __8X(F_1_306 << CONST_SHIFT) };
+  __vector unsigned short
+    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) };
+
+  /* Pass 1: process rows */
+
+  row0 = vec_ld(0, data);
+  row1 = vec_ld(16, data);
+  row2 = vec_ld(32, data);
+  row3 = vec_ld(48, data);
+  row4 = vec_ld(64, data);
+  row5 = vec_ld(80, data);
+  row6 = vec_ld(96, data);
+  row7 = vec_ld(112, data);
+
+  TRANSPOSE(row, col);
+
+  tmp0 = vec_add(col0, col7);
+  tmp7 = vec_sub(col0, col7);
+  tmp1 = vec_add(col1, col6);
+  tmp6 = vec_sub(col1, col6);
+  tmp2 = vec_add(col2, col5);
+  tmp5 = vec_sub(col2, col5);
+  tmp3 = vec_add(col3, col4);
+  tmp4 = vec_sub(col3, col4);
+
+  DO_FDCT();
+
+  /* Pass 2: process columns */
+
+  TRANSPOSE(out, row);
+
+  tmp0 = vec_add(row0, row7);
+  tmp7 = vec_sub(row0, row7);
+  tmp1 = vec_add(row1, row6);
+  tmp6 = vec_sub(row1, row6);
+  tmp2 = vec_add(row2, row5);
+  tmp5 = vec_sub(row2, row5);
+  tmp3 = vec_add(row3, row4);
+  tmp4 = vec_sub(row3, row4);
+
+  DO_FDCT();
+
+  vec_st(out0, 0, data);
+  vec_st(out1, 16, data);
+  vec_st(out2, 32, data);
+  vec_st(out3, 48, data);
+  vec_st(out4, 64, data);
+  vec_st(out5, 80, data);
+  vec_st(out6, 96, data);
+  vec_st(out7, 112, data);
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jfdctint-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jfdctint-altivec.c
new file mode 100644
index 0000000000..3d4f017103
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jfdctint-altivec.c
@@ -0,0 +1,258 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* ACCURATE INTEGER FORWARD DCT */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_298  2446   /* FIX(0.298631336) */
+#define F_0_390  3196   /* FIX(0.390180644) */
+#define F_0_541  4433   /* FIX(0.541196100) */
+#define F_0_765  6270   /* FIX(0.765366865) */
+#define F_0_899  7373   /* FIX(0.899976223) */
+#define F_1_175  9633   /* FIX(1.175875602) */
+#define F_1_501  12299  /* FIX(1.501321110) */
+#define F_1_847  15137  /* FIX(1.847759065) */
+#define F_1_961  16069  /* FIX(1.961570560) */
+#define F_2_053  16819  /* FIX(2.053119869) */
+#define F_2_562  20995  /* FIX(2.562915447) */
+#define F_3_072  25172  /* FIX(3.072711026) */
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+
+#define DO_FDCT_COMMON(PASS) { \
+  /* (Original) \
+   * z1 = (tmp12 + tmp13) * 0.541196100; \
+   * data2 = z1 + tmp13 * 0.765366865; \
+   * data6 = z1 + tmp12 * -1.847759065; \
+   * \
+   * (This implementation) \
+   * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
+   * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
+   */ \
+  \
+  tmp1312l = vec_mergeh(tmp13, tmp12); \
+  tmp1312h = vec_mergel(tmp13, tmp12); \
+  \
+  out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS); \
+  out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS); \
+  out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS); \
+  out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS); \
+  \
+  out2l = vec_sra(out2l, descale_p##PASS); \
+  out2h = vec_sra(out2h, descale_p##PASS); \
+  out6l = vec_sra(out6l, descale_p##PASS); \
+  out6h = vec_sra(out6h, descale_p##PASS); \
+  \
+  out2 = vec_pack(out2l, out2h); \
+  out6 = vec_pack(out6l, out6h); \
+  \
+  /* Odd part */ \
+  \
+  z3 = vec_add(tmp4, tmp6); \
+  z4 = vec_add(tmp5, tmp7); \
+  \
+  /* (Original) \
+   * z5 = (z3 + z4) * 1.175875602; \
+   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644; \
+   * z3 += z5;  z4 += z5; \
+   * \
+   * (This implementation) \
+   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+   */ \
+  \
+  z34l = vec_mergeh(z3, z4); \
+  z34h = vec_mergel(z3, z4); \
+  \
+  z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS); \
+  z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS); \
+  z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS); \
+  z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS); \
+  \
+  /* (Original) \
+   * z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6; \
+   * tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869; \
+   * tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110; \
+   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447; \
+   * data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4; \
+   * data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4; \
+   * \
+   * (This implementation) \
+   * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
+   * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
+   * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
+   * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
+   * data7 = tmp4 + z3;  data5 = tmp5 + z4; \
+   * data3 = tmp6 + z3;  data1 = tmp7 + z4; \
+   */ \
+  \
+  tmp47l = vec_mergeh(tmp4, tmp7); \
+  tmp47h = vec_mergel(tmp4, tmp7); \
+  \
+  out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l); \
+  out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h); \
+  out1l = vec_msums(tmp47l, pw_mf089_f060, z4l); \
+  out1h = vec_msums(tmp47h, pw_mf089_f060, z4h); \
+  \
+  out7l = vec_sra(out7l, descale_p##PASS); \
+  out7h = vec_sra(out7h, descale_p##PASS); \
+  out1l = vec_sra(out1l, descale_p##PASS); \
+  out1h = vec_sra(out1h, descale_p##PASS); \
+  \
+  out7 = vec_pack(out7l, out7h); \
+  out1 = vec_pack(out1l, out1h); \
+  \
+  tmp56l = vec_mergeh(tmp5, tmp6); \
+  tmp56h = vec_mergel(tmp5, tmp6); \
+  \
+  out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l); \
+  out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h); \
+  out3l = vec_msums(tmp56l, pw_mf256_f050, z3l); \
+  out3h = vec_msums(tmp56h, pw_mf256_f050, z3h); \
+  \
+  out5l = vec_sra(out5l, descale_p##PASS); \
+  out5h = vec_sra(out5h, descale_p##PASS); \
+  out3l = vec_sra(out3l, descale_p##PASS); \
+  out3h = vec_sra(out3h, descale_p##PASS); \
+  \
+  out5 = vec_pack(out5l, out5h); \
+  out3 = vec_pack(out3l, out3h); \
+}
+
+#define DO_FDCT_PASS1() { \
+  /* Even part */ \
+  \
+  tmp10 = vec_add(tmp0, tmp3); \
+  tmp13 = vec_sub(tmp0, tmp3); \
+  tmp11 = vec_add(tmp1, tmp2); \
+  tmp12 = vec_sub(tmp1, tmp2); \
+  \
+  out0  = vec_add(tmp10, tmp11); \
+  out0  = vec_sl(out0, pass1_bits); \
+  out4  = vec_sub(tmp10, tmp11); \
+  out4  = vec_sl(out4, pass1_bits); \
+  \
+  DO_FDCT_COMMON(1); \
+}
+
+#define DO_FDCT_PASS2() { \
+  /* Even part */ \
+  \
+  tmp10 = vec_add(tmp0, tmp3); \
+  tmp13 = vec_sub(tmp0, tmp3); \
+  tmp11 = vec_add(tmp1, tmp2); \
+  tmp12 = vec_sub(tmp1, tmp2); \
+  \
+  out0  = vec_add(tmp10, tmp11); \
+  out0  = vec_add(out0, pw_descale_p2x); \
+  out0  = vec_sra(out0, pass1_bits); \
+  out4  = vec_sub(tmp10, tmp11); \
+  out4  = vec_add(out4, pw_descale_p2x); \
+  out4  = vec_sra(out4, pass1_bits); \
+  \
+  DO_FDCT_COMMON(2); \
+}
+
+
+void jsimd_fdct_islow_altivec(DCTELEM *data)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
+    z3, z4, z34l, z34h,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __vector int z3l, z3h, z4l, z4h,
+    out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
+    out7l, out7h;
+
+  /* Constants */
+  __vector short
+    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
+    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
+    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
+    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
+    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
+    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
+    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
+    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
+    pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
+  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
+  __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
+    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
+  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
+    descale_p2 = { __4X(DESCALE_P2) };
+
+  /* Pass 1: process rows */
+
+  row0 = vec_ld(0, data);
+  row1 = vec_ld(16, data);
+  row2 = vec_ld(32, data);
+  row3 = vec_ld(48, data);
+  row4 = vec_ld(64, data);
+  row5 = vec_ld(80, data);
+  row6 = vec_ld(96, data);
+  row7 = vec_ld(112, data);
+
+  TRANSPOSE(row, col);
+
+  tmp0 = vec_add(col0, col7);
+  tmp7 = vec_sub(col0, col7);
+  tmp1 = vec_add(col1, col6);
+  tmp6 = vec_sub(col1, col6);
+  tmp2 = vec_add(col2, col5);
+  tmp5 = vec_sub(col2, col5);
+  tmp3 = vec_add(col3, col4);
+  tmp4 = vec_sub(col3, col4);
+
+  DO_FDCT_PASS1();
+
+  /* Pass 2: process columns */
+
+  TRANSPOSE(out, row);
+
+  tmp0 = vec_add(row0, row7);
+  tmp7 = vec_sub(row0, row7);
+  tmp1 = vec_add(row1, row6);
+  tmp6 = vec_sub(row1, row6);
+  tmp2 = vec_add(row2, row5);
+  tmp5 = vec_sub(row2, row5);
+  tmp3 = vec_add(row3, row4);
+  tmp4 = vec_sub(row3, row4);
+
+  DO_FDCT_PASS2();
+
+  vec_st(out0, 0, data);
+  vec_st(out1, 16, data);
+  vec_st(out2, 32, data);
+  vec_st(out3, 48, data);
+  vec_st(out4, 64, data);
+  vec_st(out5, 80, data);
+  vec_st(out6, 96, data);
+  vec_st(out7, 112, data);
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jidctfst-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jidctfst-altivec.c
new file mode 100644
index 0000000000..456c6c6174
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jidctfst-altivec.c
@@ -0,0 +1,255 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER INVERSE DCT
+ *
+ * This is similar to the SSE2 implementation, except that we left-shift the
+ * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
+ * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
+ *   the elements in arg3 + the most significant 17 bits of
+ *     (the elements in arg1 * the elements in arg2).
+ */
+
+#include "jsimd_altivec.h"
+
+
+#define F_1_082  277              /* FIX(1.082392200) */
+#define F_1_414  362              /* FIX(1.414213562) */
+#define F_1_847  473              /* FIX(1.847759065) */
+#define F_2_613  669              /* FIX(2.613125930) */
+#define F_1_613  (F_2_613 - 256)  /* FIX(2.613125930) - FIX(1) */
+
+#define CONST_BITS  8
+#define PASS1_BITS  2
+#define PRE_MULTIPLY_SCALE_BITS  2
+#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
+
+
+#define DO_IDCT(in) { \
+  /* Even part */ \
+  \
+  tmp10 = vec_add(in##0, in##4); \
+  tmp11 = vec_sub(in##0, in##4); \
+  tmp13 = vec_add(in##2, in##6); \
+  \
+  tmp12 = vec_sub(in##2, in##6); \
+  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
+  tmp12 = vec_madds(tmp12, pw_F1414, pw_zero); \
+  tmp12 = vec_sub(tmp12, tmp13); \
+  \
+  tmp0 = vec_add(tmp10, tmp13); \
+  tmp3 = vec_sub(tmp10, tmp13); \
+  tmp1 = vec_add(tmp11, tmp12); \
+  tmp2 = vec_sub(tmp11, tmp12); \
+  \
+  /* Odd part */ \
+  \
+  z13 = vec_add(in##5, in##3); \
+  z10 = vec_sub(in##5, in##3); \
+  z10s = vec_sl(z10, pre_multiply_scale_bits); \
+  z11 = vec_add(in##1, in##7); \
+  z12s = vec_sub(in##1, in##7); \
+  z12s = vec_sl(z12s, pre_multiply_scale_bits); \
+  \
+  tmp11 = vec_sub(z11, z13); \
+  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
+  tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \
+  \
+  tmp7 = vec_add(z11, z13); \
+  \
+  /* To avoid overflow... \
+   * \
+   * (Original) \
+   * tmp12 = -2.613125930 * z10 + z5; \
+   * \
+   * (This implementation) \
+   * tmp12 = (-1.613125930 - 1) * z10 + z5; \
+   *       = -1.613125930 * z10 - z10 + z5; \
+   */ \
+  \
+  z5 = vec_add(z10s, z12s); \
+  z5 = vec_madds(z5, pw_F1847, pw_zero); \
+  \
+  tmp10 = vec_madds(z12s, pw_F1082, pw_zero); \
+  tmp10 = vec_sub(tmp10, z5); \
+  tmp12 = vec_madds(z10s, pw_MF1613, z5); \
+  tmp12 = vec_sub(tmp12, z10); \
+  \
+  tmp6 = vec_sub(tmp12, tmp7); \
+  tmp5 = vec_sub(tmp11, tmp6); \
+  tmp4 = vec_add(tmp10, tmp5); \
+  \
+  out0 = vec_add(tmp0, tmp7); \
+  out1 = vec_add(tmp1, tmp6); \
+  out2 = vec_add(tmp2, tmp5); \
+  out3 = vec_sub(tmp3, tmp4); \
+  out4 = vec_add(tmp3, tmp4); \
+  out5 = vec_sub(tmp2, tmp5); \
+  out6 = vec_sub(tmp1, tmp6); \
+  out7 = vec_sub(tmp0, tmp7); \
+}
+
+
+void jsimd_idct_ifast_altivec(void *dct_table_, JCOEFPTR coef_block,
+                              JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  short *dct_table = (short *)dct_table_;
+  int *outptr;
+
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    z5, z10, z10s, z11, z12s, z13,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __vector signed char outb;
+
+  /* Constants */
+  __vector short pw_zero = { __8X(0) },
+    pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) },
+    pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) },
+    pw_MF1613 = { __8X(-F_1_613 << CONST_SHIFT) },
+    pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) };
+  __vector unsigned short
+    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) },
+    pass1_bits3 = { __8X(PASS1_BITS + 3) };
+  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
+
+  /* Pass 1: process columns */
+
+  col0 = vec_ld(0, coef_block);
+  col1 = vec_ld(16, coef_block);
+  col2 = vec_ld(32, coef_block);
+  col3 = vec_ld(48, coef_block);
+  col4 = vec_ld(64, coef_block);
+  col5 = vec_ld(80, coef_block);
+  col6 = vec_ld(96, coef_block);
+  col7 = vec_ld(112, coef_block);
+
+  tmp1 = vec_or(col1, col2);
+  tmp2 = vec_or(col3, col4);
+  tmp1 = vec_or(tmp1, tmp2);
+  tmp3 = vec_or(col5, col6);
+  tmp3 = vec_or(tmp3, col7);
+  tmp1 = vec_or(tmp1, tmp3);
+
+  quant0 = vec_ld(0, dct_table);
+  col0 = vec_mladd(col0, quant0, pw_zero);
+
+  if (vec_all_eq(tmp1, pw_zero)) {
+    /* AC terms all zero */
+
+    row0 = vec_splat(col0, 0);
+    row1 = vec_splat(col0, 1);
+    row2 = vec_splat(col0, 2);
+    row3 = vec_splat(col0, 3);
+    row4 = vec_splat(col0, 4);
+    row5 = vec_splat(col0, 5);
+    row6 = vec_splat(col0, 6);
+    row7 = vec_splat(col0, 7);
+
+  } else {
+
+    quant1 = vec_ld(16, dct_table);
+    quant2 = vec_ld(32, dct_table);
+    quant3 = vec_ld(48, dct_table);
+    quant4 = vec_ld(64, dct_table);
+    quant5 = vec_ld(80, dct_table);
+    quant6 = vec_ld(96, dct_table);
+    quant7 = vec_ld(112, dct_table);
+
+    col1 = vec_mladd(col1, quant1, pw_zero);
+    col2 = vec_mladd(col2, quant2, pw_zero);
+    col3 = vec_mladd(col3, quant3, pw_zero);
+    col4 = vec_mladd(col4, quant4, pw_zero);
+    col5 = vec_mladd(col5, quant5, pw_zero);
+    col6 = vec_mladd(col6, quant6, pw_zero);
+    col7 = vec_mladd(col7, quant7, pw_zero);
+
+    DO_IDCT(col);
+
+    TRANSPOSE(out, row);
+  }
+
+  /* Pass 2: process rows */
+
+  DO_IDCT(row);
+
+  out0 = vec_sra(out0, pass1_bits3);
+  out1 = vec_sra(out1, pass1_bits3);
+  out2 = vec_sra(out2, pass1_bits3);
+  out3 = vec_sra(out3, pass1_bits3);
+  out4 = vec_sra(out4, pass1_bits3);
+  out5 = vec_sra(out5, pass1_bits3);
+  out6 = vec_sra(out6, pass1_bits3);
+  out7 = vec_sra(out7, pass1_bits3);
+
+  TRANSPOSE(out, col);
+
+  outb = vec_packs(col0, col0);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[0] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col1, col1);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[1] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col2, col2);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[2] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col3, col3);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[3] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col4, col4);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[4] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col5, col5);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[5] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col6, col6);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[6] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col7, col7);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[7] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jidctint-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jidctint-altivec.c
new file mode 100644
index 0000000000..60e619f11d
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jidctint-altivec.c
@@ -0,0 +1,357 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* ACCURATE INTEGER INVERSE DCT */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_298  2446   /* FIX(0.298631336) */
+#define F_0_390  3196   /* FIX(0.390180644) */
+#define F_0_541  4433   /* FIX(0.541196100) */
+#define F_0_765  6270   /* FIX(0.765366865) */
+#define F_0_899  7373   /* FIX(0.899976223) */
+#define F_1_175  9633   /* FIX(1.175875602) */
+#define F_1_501  12299  /* FIX(1.501321110) */
+#define F_1_847  15137  /* FIX(1.847759065) */
+#define F_1_961  16069  /* FIX(1.961570560) */
+#define F_2_053  16819  /* FIX(2.053119869) */
+#define F_2_562  20995  /* FIX(2.562915447) */
+#define F_3_072  25172  /* FIX(3.072711026) */
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+
+
+#define DO_IDCT(in, PASS) { \
+  /* Even part \
+   * \
+   * (Original) \
+   * z1 = (z2 + z3) * 0.541196100; \
+   * tmp2 = z1 + z3 * -1.847759065; \
+   * tmp3 = z1 + z2 * 0.765366865; \
+   * \
+   * (This implementation) \
+   * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
+   * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
+   */ \
+  \
+  in##26l = vec_mergeh(in##2, in##6); \
+  in##26h = vec_mergel(in##2, in##6); \
+  \
+  tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero); \
+  tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero); \
+  tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero); \
+  tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero); \
+  \
+  tmp0 = vec_add(in##0, in##4); \
+  tmp1 = vec_sub(in##0, in##4); \
+  \
+  tmp0l = vec_unpackh(tmp0); \
+  tmp0h = vec_unpackl(tmp0); \
+  tmp0l = vec_sl(tmp0l, const_bits); \
+  tmp0h = vec_sl(tmp0h, const_bits); \
+  tmp0l = vec_add(tmp0l, pd_descale_p##PASS); \
+  tmp0h = vec_add(tmp0h, pd_descale_p##PASS); \
+  \
+  tmp10l = vec_add(tmp0l, tmp3l); \
+  tmp10h = vec_add(tmp0h, tmp3h); \
+  tmp13l = vec_sub(tmp0l, tmp3l); \
+  tmp13h = vec_sub(tmp0h, tmp3h); \
+  \
+  tmp1l = vec_unpackh(tmp1); \
+  tmp1h = vec_unpackl(tmp1); \
+  tmp1l = vec_sl(tmp1l, const_bits); \
+  tmp1h = vec_sl(tmp1h, const_bits); \
+  tmp1l = vec_add(tmp1l, pd_descale_p##PASS); \
+  tmp1h = vec_add(tmp1h, pd_descale_p##PASS); \
+  \
+  tmp11l = vec_add(tmp1l, tmp2l); \
+  tmp11h = vec_add(tmp1h, tmp2h); \
+  tmp12l = vec_sub(tmp1l, tmp2l); \
+  tmp12h = vec_sub(tmp1h, tmp2h); \
+  \
+  /* Odd part */ \
+  \
+  z3 = vec_add(in##3, in##7); \
+  z4 = vec_add(in##1, in##5); \
+  \
+  /* (Original) \
+   * z5 = (z3 + z4) * 1.175875602; \
+   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644; \
+   * z3 += z5;  z4 += z5; \
+   * \
+   * (This implementation) \
+   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+   */ \
+  \
+  z34l = vec_mergeh(z3, z4); \
+  z34h = vec_mergel(z3, z4); \
+  \
+  z3l = vec_msums(z34l, pw_mf078_f117, pd_zero); \
+  z3h = vec_msums(z34h, pw_mf078_f117, pd_zero); \
+  z4l = vec_msums(z34l, pw_f117_f078, pd_zero); \
+  z4h = vec_msums(z34h, pw_f117_f078, pd_zero); \
+  \
+  /* (Original) \
+   * z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2; \
+   * tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869; \
+   * tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110; \
+   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447; \
+   * tmp0 += z1 + z3;  tmp1 += z2 + z4; \
+   * tmp2 += z2 + z3;  tmp3 += z1 + z4; \
+   * \
+   * (This implementation) \
+   * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
+   * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
+   * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
+   * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
+   * tmp0 += z3;  tmp1 += z4; \
+   * tmp2 += z3;  tmp3 += z4; \
+   */ \
+  \
+  in##71l = vec_mergeh(in##7, in##1); \
+  in##71h = vec_mergel(in##7, in##1); \
+  \
+  tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l); \
+  tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h); \
+  tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l); \
+  tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h); \
+  \
+  in##53l = vec_mergeh(in##5, in##3); \
+  in##53h = vec_mergel(in##5, in##3); \
+  \
+  tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l); \
+  tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h); \
+  tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l); \
+  tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h); \
+  \
+  /* Final output stage */ \
+  \
+  out0l = vec_add(tmp10l, tmp3l); \
+  out0h = vec_add(tmp10h, tmp3h); \
+  out7l = vec_sub(tmp10l, tmp3l); \
+  out7h = vec_sub(tmp10h, tmp3h); \
+  \
+  out0l = vec_sra(out0l, descale_p##PASS); \
+  out0h = vec_sra(out0h, descale_p##PASS); \
+  out7l = vec_sra(out7l, descale_p##PASS); \
+  out7h = vec_sra(out7h, descale_p##PASS); \
+  \
+  out0 = vec_pack(out0l, out0h); \
+  out7 = vec_pack(out7l, out7h); \
+  \
+  out1l = vec_add(tmp11l, tmp2l); \
+  out1h = vec_add(tmp11h, tmp2h); \
+  out6l = vec_sub(tmp11l, tmp2l); \
+  out6h = vec_sub(tmp11h, tmp2h); \
+  \
+  out1l = vec_sra(out1l, descale_p##PASS); \
+  out1h = vec_sra(out1h, descale_p##PASS); \
+  out6l = vec_sra(out6l, descale_p##PASS); \
+  out6h = vec_sra(out6h, descale_p##PASS); \
+  \
+  out1 = vec_pack(out1l, out1h); \
+  out6 = vec_pack(out6l, out6h); \
+  \
+  out2l = vec_add(tmp12l, tmp1l); \
+  out2h = vec_add(tmp12h, tmp1h); \
+  out5l = vec_sub(tmp12l, tmp1l); \
+  out5h = vec_sub(tmp12h, tmp1h); \
+  \
+  out2l = vec_sra(out2l, descale_p##PASS); \
+  out2h = vec_sra(out2h, descale_p##PASS); \
+  out5l = vec_sra(out5l, descale_p##PASS); \
+  out5h = vec_sra(out5h, descale_p##PASS); \
+  \
+  out2 = vec_pack(out2l, out2h); \
+  out5 = vec_pack(out5l, out5h); \
+  \
+  out3l = vec_add(tmp13l, tmp0l); \
+  out3h = vec_add(tmp13h, tmp0h); \
+  out4l = vec_sub(tmp13l, tmp0l); \
+  out4h = vec_sub(tmp13h, tmp0h); \
+  \
+  out3l = vec_sra(out3l, descale_p##PASS); \
+  out3h = vec_sra(out3h, descale_p##PASS); \
+  out4l = vec_sra(out4l, descale_p##PASS); \
+  out4h = vec_sra(out4h, descale_p##PASS); \
+  \
+  out3 = vec_pack(out3l, out3h); \
+  out4 = vec_pack(out4l, out4h); \
+}
+
+
+void jsimd_idct_islow_altivec(void *dct_table_, JCOEFPTR coef_block,
+                              JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  short *dct_table = (short *)dct_table_;
+  int *outptr;
+
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
+    tmp0, tmp1, tmp2, tmp3, z3, z4,
+    z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
+    row71l, row71h, row26l, row26h, row53l, row53h,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
+    tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
+    z3l, z3h, z4l, z4h,
+    out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
+    out5l, out5h, out6l, out6h, out7l, out7h;
+  __vector signed char outb;
+
+  /* Constants */
+  __vector short pw_zero = { __8X(0) },
+    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
+    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
+    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
+    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
+    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
+    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
+    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
+    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) };
+  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
+  __vector int pd_zero = { __4X(0) },
+    pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
+    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
+  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
+    descale_p2 = { __4X(DESCALE_P2) },
+    const_bits = { __4X(CONST_BITS) };
+  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
+
+  /* Pass 1: process columns */
+
+  col0 = vec_ld(0, coef_block);
+  col1 = vec_ld(16, coef_block);
+  col2 = vec_ld(32, coef_block);
+  col3 = vec_ld(48, coef_block);
+  col4 = vec_ld(64, coef_block);
+  col5 = vec_ld(80, coef_block);
+  col6 = vec_ld(96, coef_block);
+  col7 = vec_ld(112, coef_block);
+
+  tmp1 = vec_or(col1, col2);
+  tmp2 = vec_or(col3, col4);
+  tmp1 = vec_or(tmp1, tmp2);
+  tmp3 = vec_or(col5, col6);
+  tmp3 = vec_or(tmp3, col7);
+  tmp1 = vec_or(tmp1, tmp3);
+
+  quant0 = vec_ld(0, dct_table);
+  col0 = vec_mladd(col0, quant0, pw_zero);
+
+  if (vec_all_eq(tmp1, pw_zero)) {
+    /* AC terms all zero */
+
+    col0 = vec_sl(col0, pass1_bits);
+
+    row0 = vec_splat(col0, 0);
+    row1 = vec_splat(col0, 1);
+    row2 = vec_splat(col0, 2);
+    row3 = vec_splat(col0, 3);
+    row4 = vec_splat(col0, 4);
+    row5 = vec_splat(col0, 5);
+    row6 = vec_splat(col0, 6);
+    row7 = vec_splat(col0, 7);
+
+  } else {
+
+    quant1 = vec_ld(16, dct_table);
+    quant2 = vec_ld(32, dct_table);
+    quant3 = vec_ld(48, dct_table);
+    quant4 = vec_ld(64, dct_table);
+    quant5 = vec_ld(80, dct_table);
+    quant6 = vec_ld(96, dct_table);
+    quant7 = vec_ld(112, dct_table);
+
+    col1 = vec_mladd(col1, quant1, pw_zero);
+    col2 = vec_mladd(col2, quant2, pw_zero);
+    col3 = vec_mladd(col3, quant3, pw_zero);
+    col4 = vec_mladd(col4, quant4, pw_zero);
+    col5 = vec_mladd(col5, quant5, pw_zero);
+    col6 = vec_mladd(col6, quant6, pw_zero);
+    col7 = vec_mladd(col7, quant7, pw_zero);
+
+    DO_IDCT(col, 1);
+
+    TRANSPOSE(out, row);
+  }
+
+  /* Pass 2: process rows */
+
+  DO_IDCT(row, 2);
+
+  TRANSPOSE(out, col);
+
+  outb = vec_packs(col0, col0);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[0] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col1, col1);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[1] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col2, col2);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[2] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col3, col3);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[3] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col4, col4);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[4] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col5, col5);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[5] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col6, col6);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[6] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col7, col7);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[7] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jquanti-altivec.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jquanti-altivec.c
new file mode 100644
index 0000000000..7d6e32542b
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jquanti-altivec.c
@@ -0,0 +1,250 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+/* NOTE: The address will either be aligned or offset by 8 bytes, so we can
+ * always get the data we want by using a single vector load (although we may
+ * have to permute the result.)
+ */
+#if __BIG_ENDIAN__
+
+#define LOAD_ROW(row) { \
+  elemptr = sample_data[row] + start_col; \
+  in##row = vec_ld(0, elemptr); \
+  if ((size_t)elemptr & 15) \
+    in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \
+}
+
+#else
+
+#define LOAD_ROW(row) { \
+  elemptr = sample_data[row] + start_col; \
+  in##row = vec_vsx_ld(0, elemptr); \
+}
+
+#endif
+
+
+void jsimd_convsamp_altivec(JSAMPARRAY sample_data, JDIMENSION start_col,
+                            DCTELEM *workspace)
+{
+  JSAMPROW elemptr;
+
+  __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
+  __vector short out0, out1, out2, out3, out4, out5, out6, out7;
+
+  /* Constants */
+  __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) };
+  __vector unsigned char pb_zero = { __16X(0) };
+
+  LOAD_ROW(0);
+  LOAD_ROW(1);
+  LOAD_ROW(2);
+  LOAD_ROW(3);
+  LOAD_ROW(4);
+  LOAD_ROW(5);
+  LOAD_ROW(6);
+  LOAD_ROW(7);
+
+  out0 = (__vector short)VEC_UNPACKHU(in0);
+  out1 = (__vector short)VEC_UNPACKHU(in1);
+  out2 = (__vector short)VEC_UNPACKHU(in2);
+  out3 = (__vector short)VEC_UNPACKHU(in3);
+  out4 = (__vector short)VEC_UNPACKHU(in4);
+  out5 = (__vector short)VEC_UNPACKHU(in5);
+  out6 = (__vector short)VEC_UNPACKHU(in6);
+  out7 = (__vector short)VEC_UNPACKHU(in7);
+
+  out0 = vec_sub(out0, pw_centerjsamp);
+  out1 = vec_sub(out1, pw_centerjsamp);
+  out2 = vec_sub(out2, pw_centerjsamp);
+  out3 = vec_sub(out3, pw_centerjsamp);
+  out4 = vec_sub(out4, pw_centerjsamp);
+  out5 = vec_sub(out5, pw_centerjsamp);
+  out6 = vec_sub(out6, pw_centerjsamp);
+  out7 = vec_sub(out7, pw_centerjsamp);
+
+  vec_st(out0, 0, workspace);
+  vec_st(out1, 16, workspace);
+  vec_st(out2, 32, workspace);
+  vec_st(out3, 48, workspace);
+  vec_st(out4, 64, workspace);
+  vec_st(out5, 80, workspace);
+  vec_st(out6, 96, workspace);
+  vec_st(out7, 112, workspace);
+}
+
+
+#define WORD_BIT  16
+
+/* There is no AltiVec 16-bit unsigned multiply instruction, hence this.
+   We basically need an unsigned equivalent of vec_madds(). */
+
+#define MULTIPLY(vs0, vs1, out) { \
+  tmpe = vec_mule((__vector unsigned short)vs0, \
+                  (__vector unsigned short)vs1); \
+  tmpo = vec_mulo((__vector unsigned short)vs0, \
+                  (__vector unsigned short)vs1); \
+  out = (__vector short)vec_perm((__vector unsigned short)tmpe, \
+                                 (__vector unsigned short)tmpo, \
+                                 shift_pack_index); \
+}
+
+void jsimd_quantize_altivec(JCOEFPTR coef_block, DCTELEM *divisors,
+                            DCTELEM *workspace)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
+    corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7,
+    recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7,
+    scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7;
+  __vector unsigned int tmpe, tmpo;
+
+  /* Constants */
+  __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
+#if __BIG_ENDIAN__
+  __vector unsigned char shift_pack_index =
+    {  0,  1, 16, 17,  4,  5, 20, 21,  8,  9, 24, 25, 12, 13, 28, 29 };
+#else
+  __vector unsigned char shift_pack_index =
+    {  2,  3, 18, 19,  6,  7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 };
+#endif
+
+  row0 = vec_ld(0, workspace);
+  row1 = vec_ld(16, workspace);
+  row2 = vec_ld(32, workspace);
+  row3 = vec_ld(48, workspace);
+  row4 = vec_ld(64, workspace);
+  row5 = vec_ld(80, workspace);
+  row6 = vec_ld(96, workspace);
+  row7 = vec_ld(112, workspace);
+
+  /* Branch-less absolute value */
+  row0s = vec_sra(row0, pw_word_bit_m1);
+  row1s = vec_sra(row1, pw_word_bit_m1);
+  row2s = vec_sra(row2, pw_word_bit_m1);
+  row3s = vec_sra(row3, pw_word_bit_m1);
+  row4s = vec_sra(row4, pw_word_bit_m1);
+  row5s = vec_sra(row5, pw_word_bit_m1);
+  row6s = vec_sra(row6, pw_word_bit_m1);
+  row7s = vec_sra(row7, pw_word_bit_m1);
+  row0 = vec_xor(row0, row0s);
+  row1 = vec_xor(row1, row1s);
+  row2 = vec_xor(row2, row2s);
+  row3 = vec_xor(row3, row3s);
+  row4 = vec_xor(row4, row4s);
+  row5 = vec_xor(row5, row5s);
+  row6 = vec_xor(row6, row6s);
+  row7 = vec_xor(row7, row7s);
+  row0 = vec_sub(row0, row0s);
+  row1 = vec_sub(row1, row1s);
+  row2 = vec_sub(row2, row2s);
+  row3 = vec_sub(row3, row3s);
+  row4 = vec_sub(row4, row4s);
+  row5 = vec_sub(row5, row5s);
+  row6 = vec_sub(row6, row6s);
+  row7 = vec_sub(row7, row7s);
+
+  corr0 = vec_ld(DCTSIZE2 * 2, divisors);
+  corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors);
+  corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors);
+  corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors);
+  corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors);
+  corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors);
+  corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors);
+  corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors);
+
+  row0 = vec_add(row0, corr0);
+  row1 = vec_add(row1, corr1);
+  row2 = vec_add(row2, corr2);
+  row3 = vec_add(row3, corr3);
+  row4 = vec_add(row4, corr4);
+  row5 = vec_add(row5, corr5);
+  row6 = vec_add(row6, corr6);
+  row7 = vec_add(row7, corr7);
+
+  recip0 = vec_ld(0, divisors);
+  recip1 = vec_ld(16, divisors);
+  recip2 = vec_ld(32, divisors);
+  recip3 = vec_ld(48, divisors);
+  recip4 = vec_ld(64, divisors);
+  recip5 = vec_ld(80, divisors);
+  recip6 = vec_ld(96, divisors);
+  recip7 = vec_ld(112, divisors);
+
+  MULTIPLY(row0, recip0, row0);
+  MULTIPLY(row1, recip1, row1);
+  MULTIPLY(row2, recip2, row2);
+  MULTIPLY(row3, recip3, row3);
+  MULTIPLY(row4, recip4, row4);
+  MULTIPLY(row5, recip5, row5);
+  MULTIPLY(row6, recip6, row6);
+  MULTIPLY(row7, recip7, row7);
+
+  scale0 = vec_ld(DCTSIZE2 * 4, divisors);
+  scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors);
+  scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors);
+  scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors);
+  scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors);
+  scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors);
+  scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors);
+  scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors);
+
+  MULTIPLY(row0, scale0, row0);
+  MULTIPLY(row1, scale1, row1);
+  MULTIPLY(row2, scale2, row2);
+  MULTIPLY(row3, scale3, row3);
+  MULTIPLY(row4, scale4, row4);
+  MULTIPLY(row5, scale5, row5);
+  MULTIPLY(row6, scale6, row6);
+  MULTIPLY(row7, scale7, row7);
+
+  row0 = vec_xor(row0, row0s);
+  row1 = vec_xor(row1, row1s);
+  row2 = vec_xor(row2, row2s);
+  row3 = vec_xor(row3, row3s);
+  row4 = vec_xor(row4, row4s);
+  row5 = vec_xor(row5, row5s);
+  row6 = vec_xor(row6, row6s);
+  row7 = vec_xor(row7, row7s);
+  row0 = vec_sub(row0, row0s);
+  row1 = vec_sub(row1, row1s);
+  row2 = vec_sub(row2, row2s);
+  row3 = vec_sub(row3, row3s);
+  row4 = vec_sub(row4, row4s);
+  row5 = vec_sub(row5, row5s);
+  row6 = vec_sub(row6, row6s);
+  row7 = vec_sub(row7, row7s);
+
+  vec_st(row0, 0, coef_block);
+  vec_st(row1, 16, coef_block);
+  vec_st(row2, 32, coef_block);
+  vec_st(row3, 48, coef_block);
+  vec_st(row4, 64, coef_block);
+  vec_st(row5, 80, coef_block);
+  vec_st(row6, 96, coef_block);
+  vec_st(row7, 112, coef_block);
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jsimd.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jsimd.c
new file mode 100644
index 0000000000..b9e86dcfac
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jsimd.c
@@ -0,0 +1,881 @@
+/*
+ * jsimd_powerpc.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014-2016, 2018, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * PowerPC architecture.
+ */
+
+#ifdef __amigaos4__
+/* This must be defined first as it re-defines GLOBAL otherwise */
+#include <proto/exec.h>
+#endif
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+#if defined(__OpenBSD__)
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <machine/cpu.h>
+#elif defined(__FreeBSD__)
+#include <machine/cpu.h>
+#include <sys/auxv.h>
+#endif
+
+static unsigned int simd_support = ~0;
+
+#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT  (1024 * 1024)
+
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+  char *p;
+
+  if (*feature == 0)
+    return 0;
+  if (strncmp(buffer, "cpu", 3) != 0)
+    return 0;
+  buffer += 3;
+  while (isspace(*buffer))
+    buffer++;
+
+  /* Check if 'feature' is present in the buffer as a separate word */
+  while ((p = strstr(buffer, feature))) {
+    if (p > buffer && !isspace(*(p - 1))) {
+      buffer++;
+      continue;
+    }
+    p += strlen(feature);
+    if (*p != 0 && !isspace(*p)) {
+      buffer++;
+      continue;
+    }
+    return 1;
+  }
+  return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+  char *buffer = (char *)malloc(bufsize);
+  FILE *fd;
+
+  simd_support = 0;
+
+  if (!buffer)
+    return 0;
+
+  fd = fopen("/proc/cpuinfo", "r");
+  if (fd) {
+    while (fgets(buffer, bufsize, fd)) {
+      if (!strchr(buffer, '\n') && !feof(fd)) {
+        /* "impossible" happened - insufficient size of the buffer! */
+        fclose(fd);
+        free(buffer);
+        return 0;
+      }
+      if (check_feature(buffer, "altivec"))
+        simd_support |= JSIMD_ALTIVEC;
+    }
+    fclose(fd);
+  }
+  free(buffer);
+  return 1;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char *env = NULL;
+#endif
+#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+  int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#elif defined(__amigaos4__)
+  uint32 altivec = 0;
+#elif defined(__OpenBSD__)
+  int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC };
+  int altivec;
+  size_t len = sizeof(altivec);
+#elif defined(__FreeBSD__)
+  unsigned long cpufeatures = 0;
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+#if defined(__ALTIVEC__) || defined(__APPLE__)
+  simd_support |= JSIMD_ALTIVEC;
+#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  while (!parse_proc_cpuinfo(bufsize)) {
+    bufsize *= 2;
+    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+      break;
+  }
+#elif defined(__amigaos4__)
+  IExec->GetCPUInfoTags(GCIT_VectorUnit, &altivec, TAG_DONE);
+  if (altivec == VECTORTYPE_ALTIVEC)
+    simd_support |= JSIMD_ALTIVEC;
+#elif defined(__OpenBSD__)
+  if (sysctl(mib, 2, &altivec, &len, NULL, 0) == 0 && altivec != 0)
+    simd_support |= JSIMD_ALTIVEC;
+#elif defined(__FreeBSD__)
+  elf_aux_info(AT_HWCAP, &cpufeatures, sizeof(cpufeatures));
+  if (cpufeatures & PPC_FEATURE_HAS_ALTIVEC)
+    simd_support |= JSIMD_ALTIVEC;
+#endif
+
+#ifndef NO_GETENV
+  /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCEALTIVEC");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = JSIMD_ALTIVEC;
+  env = getenv("JSIMD_FORCENONE");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = 0;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*altivecfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    altivecfct = jsimd_extrgb_ycc_convert_altivec;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    altivecfct = jsimd_extrgbx_ycc_convert_altivec;
+    break;
+  case JCS_EXT_BGR:
+    altivecfct = jsimd_extbgr_ycc_convert_altivec;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    altivecfct = jsimd_extbgrx_ycc_convert_altivec;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    altivecfct = jsimd_extxbgr_ycc_convert_altivec;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    altivecfct = jsimd_extxrgb_ycc_convert_altivec;
+    break;
+  default:
+    altivecfct = jsimd_rgb_ycc_convert_altivec;
+    break;
+  }
+
+  altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*altivecfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    altivecfct = jsimd_extrgb_gray_convert_altivec;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    altivecfct = jsimd_extrgbx_gray_convert_altivec;
+    break;
+  case JCS_EXT_BGR:
+    altivecfct = jsimd_extbgr_gray_convert_altivec;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    altivecfct = jsimd_extbgrx_gray_convert_altivec;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    altivecfct = jsimd_extxbgr_gray_convert_altivec;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    altivecfct = jsimd_extxrgb_gray_convert_altivec;
+    break;
+  default:
+    altivecfct = jsimd_rgb_gray_convert_altivec;
+    break;
+  }
+
+  altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    altivecfct = jsimd_ycc_extrgb_convert_altivec;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    altivecfct = jsimd_ycc_extrgbx_convert_altivec;
+    break;
+  case JCS_EXT_BGR:
+    altivecfct = jsimd_ycc_extbgr_convert_altivec;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    altivecfct = jsimd_ycc_extbgrx_convert_altivec;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    altivecfct = jsimd_ycc_extxbgr_convert_altivec;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    altivecfct = jsimd_ycc_extxrgb_convert_altivec;
+    break;
+  default:
+    altivecfct = jsimd_ycc_rgb_convert_altivec;
+    break;
+  }
+
+  altivecfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
+                                compptr->v_samp_factor,
+                                compptr->width_in_blocks, input_data,
+                                output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v1_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
+                                compptr->v_samp_factor,
+                                compptr->width_in_blocks, input_data,
+                                output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
+                              input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
+                              input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_fancy_upsample_altivec(cinfo->max_v_samp_factor,
+                                    compptr->downsampled_width, input_data,
+                                    output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_altivec(cinfo->max_v_samp_factor,
+                                    compptr->downsampled_width, input_data,
+                                    output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    altivecfct = jsimd_h2v2_extrgb_merged_upsample_altivec;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    altivecfct = jsimd_h2v2_extrgbx_merged_upsample_altivec;
+    break;
+  case JCS_EXT_BGR:
+    altivecfct = jsimd_h2v2_extbgr_merged_upsample_altivec;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    altivecfct = jsimd_h2v2_extbgrx_merged_upsample_altivec;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    altivecfct = jsimd_h2v2_extxbgr_merged_upsample_altivec;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    altivecfct = jsimd_h2v2_extxrgb_merged_upsample_altivec;
+    break;
+  default:
+    altivecfct = jsimd_h2v2_merged_upsample_altivec;
+    break;
+  }
+
+  altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    altivecfct = jsimd_h2v1_extrgb_merged_upsample_altivec;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    altivecfct = jsimd_h2v1_extrgbx_merged_upsample_altivec;
+    break;
+  case JCS_EXT_BGR:
+    altivecfct = jsimd_h2v1_extbgr_merged_upsample_altivec;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    altivecfct = jsimd_h2v1_extbgrx_merged_upsample_altivec;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    altivecfct = jsimd_h2v1_extxbgr_merged_upsample_altivec;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    altivecfct = jsimd_h2v1_extxrgb_merged_upsample_altivec;
+    break;
+  default:
+    altivecfct = jsimd_h2v1_merged_upsample_altivec;
+    break;
+  }
+
+  altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  jsimd_convsamp_altivec(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  jsimd_fdct_islow_altivec(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_altivec(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  jsimd_quantize_altivec(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_islow_altivec(compptr->dct_table, coef_block, output_buf,
+                           output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_ifast_altivec(compptr->dct_table, coef_block, output_buf,
+                           output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return NULL;
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, JCOEF *values, size_t *zerobits)
+{
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, JCOEF *absvalues, size_t *bits)
+{
+  return 0;
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jsimd_altivec.h b/3rdparty/libjpeg-turbo/src/simd/powerpc/jsimd_altivec.h
new file mode 100644
index 0000000000..e8bdb06a54
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jsimd_altivec.h
@@ -0,0 +1,98 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include <altivec.h>
+
+
+/* Common code */
+
+#define __4X(a)      a, a, a, a
+#define __4X2(a, b)  a, b, a, b, a, b, a, b
+#define __8X(a)      __4X(a), __4X(a)
+#define __16X(a)     __8X(a), __8X(a)
+
+#define TRANSPOSE(row, col) { \
+  __vector short row04l, row04h, row15l, row15h, \
+                 row26l, row26h, row37l, row37h; \
+  __vector short col01e, col01o, col23e, col23o, \
+                 col45e, col45o, col67e, col67o; \
+  \
+                                       /* transpose coefficients (phase 1) */ \
+  row04l = vec_mergeh(row##0, row##4); /* row04l=(00 40 01 41 02 42 03 43) */ \
+  row04h = vec_mergel(row##0, row##4); /* row04h=(04 44 05 45 06 46 07 47) */ \
+  row15l = vec_mergeh(row##1, row##5); /* row15l=(10 50 11 51 12 52 13 53) */ \
+  row15h = vec_mergel(row##1, row##5); /* row15h=(14 54 15 55 16 56 17 57) */ \
+  row26l = vec_mergeh(row##2, row##6); /* row26l=(20 60 21 61 22 62 23 63) */ \
+  row26h = vec_mergel(row##2, row##6); /* row26h=(24 64 25 65 26 66 27 67) */ \
+  row37l = vec_mergeh(row##3, row##7); /* row37l=(30 70 31 71 32 72 33 73) */ \
+  row37h = vec_mergel(row##3, row##7); /* row37h=(34 74 35 75 36 76 37 77) */ \
+  \
+                                       /* transpose coefficients (phase 2) */ \
+  col01e = vec_mergeh(row04l, row26l); /* col01e=(00 20 40 60 01 21 41 61) */ \
+  col23e = vec_mergel(row04l, row26l); /* col23e=(02 22 42 62 03 23 43 63) */ \
+  col45e = vec_mergeh(row04h, row26h); /* col45e=(04 24 44 64 05 25 45 65) */ \
+  col67e = vec_mergel(row04h, row26h); /* col67e=(06 26 46 66 07 27 47 67) */ \
+  col01o = vec_mergeh(row15l, row37l); /* col01o=(10 30 50 70 11 31 51 71) */ \
+  col23o = vec_mergel(row15l, row37l); /* col23o=(12 32 52 72 13 33 53 73) */ \
+  col45o = vec_mergeh(row15h, row37h); /* col45o=(14 34 54 74 15 35 55 75) */ \
+  col67o = vec_mergel(row15h, row37h); /* col67o=(16 36 56 76 17 37 57 77) */ \
+  \
+                                       /* transpose coefficients (phase 3) */ \
+  col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */ \
+  col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */ \
+  col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */ \
+  col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */ \
+  col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */ \
+  col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */ \
+  col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */ \
+  col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */ \
+}
+
+#ifndef min
+#define min(a, b)  ((a) < (b) ? (a) : (b))
+#endif
+
+
+/* Macros to abstract big/little endian bit twiddling */
+
+#if __BIG_ENDIAN__
+
+#define VEC_LD(a, b)     vec_ld(a, b)
+#define VEC_ST(a, b, c)  vec_st(a, b, c)
+#define VEC_UNPACKHU(a)  vec_mergeh(pb_zero, a)
+#define VEC_UNPACKLU(a)  vec_mergel(pb_zero, a)
+
+#else
+
+#define VEC_LD(a, b)     vec_vsx_ld(a, b)
+#define VEC_ST(a, b, c)  vec_vsx_st(a, b, c)
+#define VEC_UNPACKHU(a)  vec_mergeh(a, pb_zero)
+#define VEC_UNPACKLU(a)  vec_mergel(a, pb_zero)
+
+#endif
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolext-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolext-avx2.asm
new file mode 100644
index 0000000000..ffb527db00
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolext-avx2.asm
@@ -0,0 +1,559 @@
+;
+; jccolext.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                            int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
+%define WK_NUM  8
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2)
+
+EXTN(jsimd_rgb_ycc_convert_avx2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 5
+    push        rbx
+
+    mov         ecx, r10d
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx
+
+    mov         rsi, r12
+    mov         ecx, r13d
+    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
+    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+    pop         rcx
+
+    mov         rsi, r11
+    mov         eax, r14d
+    test        rax, rax
+    jle         near .return
+.rowloop:
+    push        rdx
+    push        rbx
+    push        rdi
+    push        rsi
+    push        rcx                     ; col
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr0
+    mov         rbxp, JSAMPROW [rbx]    ; outptr1
+    mov         rdxp, JSAMPROW [rdx]    ; outptr2
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        rax
+    push        rdx
+    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_BYTE
+    movzx       rax, byte [rsi+rcx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_WORD
+    movzx       rdx, word [rsi+rcx]
+    shl         rax, WORD_BIT
+    or          rax, rdx
+.column_ld4:
+    vmovd       xmmA, eax
+    pop         rdx
+    pop         rax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_DWORD
+    vmovd       xmmF, XMM_DWORD [rsi+rcx]
+    vpslldq     xmmA, xmmA, SIZEOF_DWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_MMWORD
+    vmovq       xmmB, XMM_MMWORD [rsi+rcx]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    sub         rcx, byte SIZEOF_XMMWORD
+    vmovdqu     xmmB, XMM_MMWORD [rsi+rcx]
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    vpor        ymmA, ymmB
+.column_ld32:
+    test        cl, SIZEOF_YMMWORD
+    jz          short .column_ld64
+    sub         rcx, byte SIZEOF_YMMWORD
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+.column_ld64:
+    test        cl, 2*SIZEOF_YMMWORD
+    mov         rcx, SIZEOF_YMMWORD
+    jz          short .rgb_ycc_cnv
+    vmovdqa     ymmB, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_ycc_cnv
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vmovdqu     ymmC, ymmA
+    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+    vmovdqa     ymmG, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+    vmovdqa     ymmD, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+    vmovdqa     ymmE, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+    vpxor       ymmH, ymmH, ymmH
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmB, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+    vmovdqa     ymmF, ymmD
+    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_XMMWORD/16
+    vmovd       xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_XMMWORD/8
+    vmovq       xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_XMMWORD/4
+    vmovdqa     xmmF, xmmA
+    vperm2i128  ymmF, ymmF, ymmF, 1
+    vmovdqu     xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+    vpor        ymmA, ymmA, ymmF
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_XMMWORD/2
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    mov         rcx, SIZEOF_YMMWORD
+    jz          short .rgb_ycc_cnv
+    vmovdqa     ymmE, ymmA
+    vmovdqa     ymmH, ymmF
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_ycc_cnv
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+    vmovdqu     ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmB, ymmA
+    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+    vmovdqa     ymmB, ymmF
+    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmD, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+                                      ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+    vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+                                      ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+    vmovdqa     ymmC, ymmF
+    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+                                      ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+    vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+                                      ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+    vmovdqa     ymmB, ymmA
+    vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+                                      ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+    vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+                                      ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+    vmovdqa     ymmG, ymmD
+    vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+                                      ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+    vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+                                      ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+    vmovdqa     ymmE, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                      ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+                                      ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vmovdqa     ymmH, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+                                      ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+                                      ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+    vpxor       ymmF, ymmF, ymmF
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmD, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+    vmovdqa     ymmG, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vpunpcklbw  ymmF, ymmF, ymmH
+    vpunpckhbw  ymmH, ymmH, ymmH
+    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=RE
+    vmovdqa     YMMWORD [wk(1)], ymm1   ; wk(1)=RO
+    vmovdqa     YMMWORD [wk(2)], ymm4   ; wk(2)=BE
+    vmovdqa     YMMWORD [wk(3)], ymm5   ; wk(3)=BO
+
+    vmovdqa     ymm6, ymm1
+    vpunpcklwd  ymm1, ymm1, ymm3
+    vpunpckhwd  ymm6, ymm6, ymm3
+    vmovdqa     ymm7, ymm1
+    vmovdqa     ymm4, ymm6
+    vpmaddwd    ymm1, ymm1, [rel PW_F0299_F0337]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+    vpmaddwd    ymm7, ymm7, [rel PW_MF016_MF033]  ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+    vpmaddwd    ymm4, ymm4, [rel PW_MF016_MF033]  ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+    vmovdqa     YMMWORD [wk(4)], ymm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+    vmovdqa     YMMWORD [wk(5)], ymm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vpxor       ymm1, ymm1, ymm1
+    vpxor       ymm6, ymm6, ymm6
+    vpunpcklwd  ymm1, ymm1, ymm5        ; ymm1=BOL
+    vpunpckhwd  ymm6, ymm6, ymm5        ; ymm6=BOH
+    vpsrld      ymm1, ymm1, 1           ; ymm1=BOL*FIX(0.500)
+    vpsrld      ymm6, ymm6, 1           ; ymm6=BOH*FIX(0.500)
+
+    vmovdqa     ymm5, [rel PD_ONEHALFM1_CJ]  ; ymm5=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm7, ymm7, ymm1
+    vpaddd      ymm4, ymm4, ymm6
+    vpaddd      ymm7, ymm7, ymm5
+    vpaddd      ymm4, ymm4, ymm5
+    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CbOL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbOH
+    vpackssdw   ymm7, ymm7, ymm4        ; ymm7=CbO
+
+    vmovdqa     ymm1, YMMWORD [wk(2)]   ; ymm1=BE
+
+    vmovdqa     ymm6, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm2
+    vpunpckhwd  ymm6, ymm6, ymm2
+    vmovdqa     ymm5, ymm0
+    vmovdqa     ymm4, ymm6
+    vpmaddwd    ymm0, ymm0, [rel PW_F0299_F0337]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+    vpmaddwd    ymm5, ymm5, [rel PW_MF016_MF033]  ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+    vpmaddwd    ymm4, ymm4, [rel PW_MF016_MF033]  ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+    vmovdqa     YMMWORD [wk(6)], ymm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+    vmovdqa     YMMWORD [wk(7)], ymm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vpxor       ymm0, ymm0, ymm0
+    vpxor       ymm6, ymm6, ymm6
+    vpunpcklwd  ymm0, ymm0, ymm1        ; ymm0=BEL
+    vpunpckhwd  ymm6, ymm6, ymm1        ; ymm6=BEH
+    vpsrld      ymm0, ymm0, 1           ; ymm0=BEL*FIX(0.500)
+    vpsrld      ymm6, ymm6, 1           ; ymm6=BEH*FIX(0.500)
+
+    vmovdqa     ymm1, [rel PD_ONEHALFM1_CJ]  ; ymm1=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm5, ymm5, ymm0
+    vpaddd      ymm4, ymm4, ymm6
+    vpaddd      ymm5, ymm5, ymm1
+    vpaddd      ymm4, ymm4, ymm1
+    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CbEL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbEH
+    vpackssdw   ymm5, ymm5, ymm4        ; ymm5=CbE
+
+    vpsllw      ymm7, ymm7, BYTE_BIT
+    vpor        ymm5, ymm5, ymm7        ; ymm5=Cb
+    vmovdqu     YMMWORD [rbx], ymm5     ; Save Cb
+
+    vmovdqa     ymm0, YMMWORD [wk(3)]   ; ymm0=BO
+    vmovdqa     ymm6, YMMWORD [wk(2)]   ; ymm6=BE
+    vmovdqa     ymm1, YMMWORD [wk(1)]   ; ymm1=RO
+
+    vmovdqa     ymm4, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm3
+    vpunpckhwd  ymm4, ymm4, ymm3
+    vmovdqa     ymm7, ymm0
+    vmovdqa     ymm5, ymm4
+    vpmaddwd    ymm0, ymm0, [rel PW_F0114_F0250]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+    vpmaddwd    ymm7, ymm7, [rel PW_MF008_MF041]  ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+    vpmaddwd    ymm5, ymm5, [rel PW_MF008_MF041]  ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+    vmovdqa     ymm3, [rel PD_ONEHALF]            ; ymm3=[PD_ONEHALF]
+
+    vpaddd      ymm0, ymm0, YMMWORD [wk(4)]
+    vpaddd      ymm4, ymm4, YMMWORD [wk(5)]
+    vpaddd      ymm0, ymm0, ymm3
+    vpaddd      ymm4, ymm4, ymm3
+    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
+    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO
+
+    vpxor       ymm3, ymm3, ymm3
+    vpxor       ymm4, ymm4, ymm4
+    vpunpcklwd  ymm3, ymm3, ymm1        ; ymm3=ROL
+    vpunpckhwd  ymm4, ymm4, ymm1        ; ymm4=ROH
+    vpsrld      ymm3, ymm3, 1           ; ymm3=ROL*FIX(0.500)
+    vpsrld      ymm4, ymm4, 1           ; ymm4=ROH*FIX(0.500)
+
+    vmovdqa     ymm1, [rel PD_ONEHALFM1_CJ]  ; ymm1=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm7, ymm7, ymm3
+    vpaddd      ymm5, ymm5, ymm4
+    vpaddd      ymm7, ymm7, ymm1
+    vpaddd      ymm5, ymm5, ymm1
+    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CrOL
+    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrOH
+    vpackssdw   ymm7, ymm7, ymm5        ; ymm7=CrO
+
+    vmovdqa     ymm3, YMMWORD [wk(0)]   ; ymm3=RE
+
+    vmovdqa     ymm4, ymm6
+    vpunpcklwd  ymm6, ymm6, ymm2
+    vpunpckhwd  ymm4, ymm4, ymm2
+    vmovdqa     ymm1, ymm6
+    vmovdqa     ymm5, ymm4
+    vpmaddwd    ymm6, ymm6, [rel PW_F0114_F0250]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+    vpmaddwd    ymm1, ymm1, [rel PW_MF008_MF041]  ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+    vpmaddwd    ymm5, ymm5, [rel PW_MF008_MF041]  ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+    vmovdqa     ymm2, [rel PD_ONEHALF]            ; ymm2=[PD_ONEHALF]
+
+    vpaddd      ymm6, ymm6, YMMWORD [wk(6)]
+    vpaddd      ymm4, ymm4, YMMWORD [wk(7)]
+    vpaddd      ymm6, ymm6, ymm2
+    vpaddd      ymm4, ymm4, ymm2
+    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
+    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE
+
+    vpsllw      ymm0, ymm0, BYTE_BIT
+    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
+    vmovdqu     YMMWORD [rdi], ymm6     ; Save Y
+
+    vpxor       ymm2, ymm2, ymm2
+    vpxor       ymm4, ymm4, ymm4
+    vpunpcklwd  ymm2, ymm2, ymm3        ; ymm2=REL
+    vpunpckhwd  ymm4, ymm4, ymm3        ; ymm4=REH
+    vpsrld      ymm2, ymm2, 1           ; ymm2=REL*FIX(0.500)
+    vpsrld      ymm4, ymm4, 1           ; ymm4=REH*FIX(0.500)
+
+    vmovdqa     ymm0, [rel PD_ONEHALFM1_CJ]  ; ymm0=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm1, ymm1, ymm2
+    vpaddd      ymm5, ymm5, ymm4
+    vpaddd      ymm1, ymm1, ymm0
+    vpaddd      ymm5, ymm5, ymm0
+    vpsrld      ymm1, ymm1, SCALEBITS   ; ymm1=CrEL
+    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrEH
+    vpackssdw   ymm1, ymm1, ymm5        ; ymm1=CrE
+
+    vpsllw      ymm7, ymm7, BYTE_BIT
+    vpor        ymm1, ymm1, ymm7        ; ymm1=Cr
+    vmovdqu     YMMWORD [rdx], ymm1     ; Save Cr
+
+    sub         rcx, byte SIZEOF_YMMWORD
+    add         rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
+    add         rdi, byte SIZEOF_YMMWORD           ; outptr0
+    add         rbx, byte SIZEOF_YMMWORD           ; outptr1
+    add         rdx, byte SIZEOF_YMMWORD           ; outptr2
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    test        rcx, rcx
+    jnz         near .column_ld1
+
+    pop         rcx                     ; col
+    pop         rsi
+    pop         rdi
+    pop         rbx
+    pop         rdx
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         rdi, byte SIZEOF_JSAMPROW
+    add         rbx, byte SIZEOF_JSAMPROW
+    add         rdx, byte SIZEOF_JSAMPROW
+    dec         rax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    vzeroupper
+    uncollect_args 5
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolext-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolext-sse2.asm
new file mode 100644
index 0000000000..af70ed6010
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolext-sse2.asm
@@ -0,0 +1,484 @@
+;
+; jccolext.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                            int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  8
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 5
+    push        rbx
+
+    mov         ecx, r10d
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx
+
+    mov         rsi, r12
+    mov         ecx, r13d
+    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
+    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+    pop         rcx
+
+    mov         rsi, r11
+    mov         eax, r14d
+    test        rax, rax
+    jle         near .return
+.rowloop:
+    push        rdx
+    push        rbx
+    push        rdi
+    push        rsi
+    push        rcx                     ; col
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr0
+    mov         rbxp, JSAMPROW [rbx]    ; outptr1
+    mov         rdxp, JSAMPROW [rdx]    ; outptr2
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        rax
+    push        rdx
+    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_BYTE
+    movzx       rax, byte [rsi+rcx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_WORD
+    movzx       rdx, word [rsi+rcx]
+    shl         rax, WORD_BIT
+    or          rax, rdx
+.column_ld4:
+    movd        xmmA, eax
+    pop         rdx
+    pop         rax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_DWORD
+    movd        xmmF, XMM_DWORD [rsi+rcx]
+    pslldq      xmmA, SIZEOF_DWORD
+    por         xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_MMWORD
+    movq        xmmB, XMM_MMWORD [rsi+rcx]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    movdqa      xmmF, xmmA
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    mov         rcx, SIZEOF_XMMWORD
+    jmp         short .rgb_ycc_cnv
+.column_ld32:
+    test        cl, 2*SIZEOF_XMMWORD
+    mov         rcx, SIZEOF_XMMWORD
+    jz          short .rgb_ycc_cnv
+    movdqa      xmmB, xmmA
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_ycc_cnv
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    movdqa      xmmG, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+    movdqa      xmmD, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+    movdqa      xmmE, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+    pxor        xmmH, xmmH
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmB, xmmE
+    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+    movdqa      xmmF, xmmD
+    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_XMMWORD/16
+    movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_XMMWORD/8
+    movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmE
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_XMMWORD/4
+    movdqa      xmmE, xmmA
+    movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    mov         rcx, SIZEOF_XMMWORD
+    jz          short .rgb_ycc_cnv
+    movdqa      xmmF, xmmA
+    movdqa      xmmH, xmmE
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_ycc_cnv
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+    movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+    punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+    movdqa      xmmC, xmmF
+    punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+    punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+    movdqa      xmmB, xmmA
+    punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+    punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+    movdqa      xmmG, xmmD
+    punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+    punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+    movdqa      xmmE, xmmA
+    punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+    movdqa      xmmH, xmmB
+    punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+    pxor        xmmF, xmmF
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmD, xmmB
+    punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+    movdqa      xmmG, xmmE
+    punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+    punpcklbw   xmmF, xmmH
+    punpckhbw   xmmH, xmmH
+    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=RE
+    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=RO
+    movdqa      XMMWORD [wk(2)], xmm4   ; wk(2)=BE
+    movdqa      XMMWORD [wk(3)], xmm5   ; wk(3)=BO
+
+    movdqa      xmm6, xmm1
+    punpcklwd   xmm1, xmm3
+    punpckhwd   xmm6, xmm3
+    movdqa      xmm7, xmm1
+    movdqa      xmm4, xmm6
+    pmaddwd     xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+    pmaddwd     xmm7, [rel PW_MF016_MF033]  ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+    pmaddwd     xmm4, [rel PW_MF016_MF033]  ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+    movdqa      XMMWORD [wk(4)], xmm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+    movdqa      XMMWORD [wk(5)], xmm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    pxor        xmm1, xmm1
+    pxor        xmm6, xmm6
+    punpcklwd   xmm1, xmm5              ; xmm1=BOL
+    punpckhwd   xmm6, xmm5              ; xmm6=BOH
+    psrld       xmm1, 1                 ; xmm1=BOL*FIX(0.500)
+    psrld       xmm6, 1                 ; xmm6=BOH*FIX(0.500)
+
+    movdqa      xmm5, [rel PD_ONEHALFM1_CJ]  ; xmm5=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm7, xmm1
+    paddd       xmm4, xmm6
+    paddd       xmm7, xmm5
+    paddd       xmm4, xmm5
+    psrld       xmm7, SCALEBITS         ; xmm7=CbOL
+    psrld       xmm4, SCALEBITS         ; xmm4=CbOH
+    packssdw    xmm7, xmm4              ; xmm7=CbO
+
+    movdqa      xmm1, XMMWORD [wk(2)]   ; xmm1=BE
+
+    movdqa      xmm6, xmm0
+    punpcklwd   xmm0, xmm2
+    punpckhwd   xmm6, xmm2
+    movdqa      xmm5, xmm0
+    movdqa      xmm4, xmm6
+    pmaddwd     xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+    pmaddwd     xmm5, [rel PW_MF016_MF033]  ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+    pmaddwd     xmm4, [rel PW_MF016_MF033]  ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+    movdqa      XMMWORD [wk(6)], xmm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+    movdqa      XMMWORD [wk(7)], xmm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    pxor        xmm0, xmm0
+    pxor        xmm6, xmm6
+    punpcklwd   xmm0, xmm1              ; xmm0=BEL
+    punpckhwd   xmm6, xmm1              ; xmm6=BEH
+    psrld       xmm0, 1                 ; xmm0=BEL*FIX(0.500)
+    psrld       xmm6, 1                 ; xmm6=BEH*FIX(0.500)
+
+    movdqa      xmm1, [rel PD_ONEHALFM1_CJ]  ; xmm1=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm5, xmm0
+    paddd       xmm4, xmm6
+    paddd       xmm5, xmm1
+    paddd       xmm4, xmm1
+    psrld       xmm5, SCALEBITS         ; xmm5=CbEL
+    psrld       xmm4, SCALEBITS         ; xmm4=CbEH
+    packssdw    xmm5, xmm4              ; xmm5=CbE
+
+    psllw       xmm7, BYTE_BIT
+    por         xmm5, xmm7              ; xmm5=Cb
+    movdqa      XMMWORD [rbx], xmm5     ; Save Cb
+
+    movdqa      xmm0, XMMWORD [wk(3)]   ; xmm0=BO
+    movdqa      xmm6, XMMWORD [wk(2)]   ; xmm6=BE
+    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=RO
+
+    movdqa      xmm4, xmm0
+    punpcklwd   xmm0, xmm3
+    punpckhwd   xmm4, xmm3
+    movdqa      xmm7, xmm0
+    movdqa      xmm5, xmm4
+    pmaddwd     xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+    pmaddwd     xmm7, [rel PW_MF008_MF041]  ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+    pmaddwd     xmm5, [rel PW_MF008_MF041]  ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+    movdqa      xmm3, [rel PD_ONEHALF]      ; xmm3=[PD_ONEHALF]
+
+    paddd       xmm0, XMMWORD [wk(4)]
+    paddd       xmm4, XMMWORD [wk(5)]
+    paddd       xmm0, xmm3
+    paddd       xmm4, xmm3
+    psrld       xmm0, SCALEBITS         ; xmm0=YOL
+    psrld       xmm4, SCALEBITS         ; xmm4=YOH
+    packssdw    xmm0, xmm4              ; xmm0=YO
+
+    pxor        xmm3, xmm3
+    pxor        xmm4, xmm4
+    punpcklwd   xmm3, xmm1              ; xmm3=ROL
+    punpckhwd   xmm4, xmm1              ; xmm4=ROH
+    psrld       xmm3, 1                 ; xmm3=ROL*FIX(0.500)
+    psrld       xmm4, 1                 ; xmm4=ROH*FIX(0.500)
+
+    movdqa      xmm1, [rel PD_ONEHALFM1_CJ]  ; xmm1=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm7, xmm3
+    paddd       xmm5, xmm4
+    paddd       xmm7, xmm1
+    paddd       xmm5, xmm1
+    psrld       xmm7, SCALEBITS         ; xmm7=CrOL
+    psrld       xmm5, SCALEBITS         ; xmm5=CrOH
+    packssdw    xmm7, xmm5              ; xmm7=CrO
+
+    movdqa      xmm3, XMMWORD [wk(0)]   ; xmm3=RE
+
+    movdqa      xmm4, xmm6
+    punpcklwd   xmm6, xmm2
+    punpckhwd   xmm4, xmm2
+    movdqa      xmm1, xmm6
+    movdqa      xmm5, xmm4
+    pmaddwd     xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+    pmaddwd     xmm1, [rel PW_MF008_MF041]  ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+    pmaddwd     xmm5, [rel PW_MF008_MF041]  ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+    movdqa      xmm2, [rel PD_ONEHALF]      ; xmm2=[PD_ONEHALF]
+
+    paddd       xmm6, XMMWORD [wk(6)]
+    paddd       xmm4, XMMWORD [wk(7)]
+    paddd       xmm6, xmm2
+    paddd       xmm4, xmm2
+    psrld       xmm6, SCALEBITS         ; xmm6=YEL
+    psrld       xmm4, SCALEBITS         ; xmm4=YEH
+    packssdw    xmm6, xmm4              ; xmm6=YE
+
+    psllw       xmm0, BYTE_BIT
+    por         xmm6, xmm0              ; xmm6=Y
+    movdqa      XMMWORD [rdi], xmm6     ; Save Y
+
+    pxor        xmm2, xmm2
+    pxor        xmm4, xmm4
+    punpcklwd   xmm2, xmm3              ; xmm2=REL
+    punpckhwd   xmm4, xmm3              ; xmm4=REH
+    psrld       xmm2, 1                 ; xmm2=REL*FIX(0.500)
+    psrld       xmm4, 1                 ; xmm4=REH*FIX(0.500)
+
+    movdqa      xmm0, [rel PD_ONEHALFM1_CJ]  ; xmm0=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm1, xmm2
+    paddd       xmm5, xmm4
+    paddd       xmm1, xmm0
+    paddd       xmm5, xmm0
+    psrld       xmm1, SCALEBITS         ; xmm1=CrEL
+    psrld       xmm5, SCALEBITS         ; xmm5=CrEH
+    packssdw    xmm1, xmm5              ; xmm1=CrE
+
+    psllw       xmm7, BYTE_BIT
+    por         xmm1, xmm7              ; xmm1=Cr
+    movdqa      XMMWORD [rdx], xmm1     ; Save Cr
+
+    sub         rcx, byte SIZEOF_XMMWORD
+    add         rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
+    add         rdi, byte SIZEOF_XMMWORD                ; outptr0
+    add         rbx, byte SIZEOF_XMMWORD                ; outptr1
+    add         rdx, byte SIZEOF_XMMWORD                ; outptr2
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+    test        rcx, rcx
+    jnz         near .column_ld1
+
+    pop         rcx                     ; col
+    pop         rsi
+    pop         rdi
+    pop         rbx
+    pop         rdx
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         rdi, byte SIZEOF_JSAMPROW
+    add         rbx, byte SIZEOF_JSAMPROW
+    add         rdx, byte SIZEOF_JSAMPROW
+    dec         rax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    uncollect_args 5
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolor-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolor-avx2.asm
new file mode 100644
index 0000000000..16b78298dc
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolor-avx2.asm
@@ -0,0 +1,121 @@
+;
+; jccolor.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_081 equ  5329                ; FIX(0.08131)
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_168 equ 11059                ; FIX(0.16874)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_331 equ 21709                ; FIX(0.33126)
+F_0_418 equ 27439                ; FIX(0.41869)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_ycc_convert_avx2)
+
+EXTN(jconst_rgb_ycc_convert_avx2):
+
+PW_F0299_F0337  times 8 dw  F_0_299,  F_0_337
+PW_F0114_F0250  times 8 dw  F_0_114,  F_0_250
+PW_MF016_MF033  times 8 dw -F_0_168, -F_0_331
+PW_MF008_MF041  times 8 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 8 dd  (1 << (SCALEBITS - 1)) - 1 + \
+                            (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF      times 8 dd  (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extrgb_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extrgbx_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extbgr_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extbgrx_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extxbgr_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extxrgb_ycc_convert_avx2
+%include "jccolext-avx2.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolor-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolor-sse2.asm
new file mode 100644
index 0000000000..e2955c2134
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolor-sse2.asm
@@ -0,0 +1,120 @@
+;
+; jccolor.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_081 equ  5329                ; FIX(0.08131)
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_168 equ 11059                ; FIX(0.16874)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_331 equ 21709                ; FIX(0.33126)
+F_0_418 equ 27439                ; FIX(0.41869)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_ycc_convert_sse2)
+
+EXTN(jconst_rgb_ycc_convert_sse2):
+
+PW_F0299_F0337  times 4 dw  F_0_299,  F_0_337
+PW_F0114_F0250  times 4 dw  F_0_114,  F_0_250
+PW_MF016_MF033  times 4 dw -F_0_168, -F_0_331
+PW_MF008_MF041  times 4 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 4 dd  (1 << (SCALEBITS - 1)) - 1 + \
+                            (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF      times 4 dd  (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extrgb_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extrgbx_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extbgr_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extbgrx_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extxbgr_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extxrgb_ycc_convert_sse2
+%include "jccolext-sse2.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgray-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgray-avx2.asm
new file mode 100644
index 0000000000..591255bb11
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgray-avx2.asm
@@ -0,0 +1,113 @@
+;
+; jcgray.asm - grayscale colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_gray_convert_avx2)
+
+EXTN(jconst_rgb_gray_convert_avx2):
+
+PW_F0299_F0337 times 8 dw F_0_299, F_0_337
+PW_F0114_F0250 times 8 dw F_0_114, F_0_250
+PD_ONEHALF     times 8 dd (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extrgb_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extrgbx_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extbgr_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extbgrx_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extxbgr_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extxrgb_gray_convert_avx2
+%include "jcgryext-avx2.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgray-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgray-sse2.asm
new file mode 100644
index 0000000000..e389904f2f
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgray-sse2.asm
@@ -0,0 +1,112 @@
+;
+; jcgray.asm - grayscale colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_gray_convert_sse2)
+
+EXTN(jconst_rgb_gray_convert_sse2):
+
+PW_F0299_F0337 times 4 dw F_0_299, F_0_337
+PW_F0114_F0250 times 4 dw F_0_114, F_0_250
+PD_ONEHALF     times 4 dd (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extrgb_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extrgbx_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extbgr_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extbgrx_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extxbgr_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extxrgb_gray_convert_sse2
+%include "jcgryext-sse2.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgryext-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgryext-avx2.asm
new file mode 100644
index 0000000000..ddcc2c0a2f
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgryext-avx2.asm
@@ -0,0 +1,438 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                             int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
+%define WK_NUM  2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2)
+
+EXTN(jsimd_rgb_gray_convert_avx2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 5
+    push        rbx
+
+    mov         ecx, r10d
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx
+
+    mov         rsi, r12
+    mov         ecx, r13d
+    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+
+    pop         rcx
+
+    mov         rsi, r11
+    mov         eax, r14d
+    test        rax, rax
+    jle         near .return
+.rowloop:
+    push        rdi
+    push        rsi
+    push        rcx                     ; col
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr0
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        rax
+    push        rdx
+    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_BYTE
+    movzx       rax, byte [rsi+rcx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_WORD
+    movzx       rdx, word [rsi+rcx]
+    shl         rax, WORD_BIT
+    or          rax, rdx
+.column_ld4:
+    vmovd       xmmA, eax
+    pop         rdx
+    pop         rax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_DWORD
+    vmovd       xmmF, XMM_DWORD [rsi+rcx]
+    vpslldq     xmmA, xmmA, SIZEOF_DWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_MMWORD
+    vmovq       xmmB, XMM_MMWORD [rsi+rcx]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    sub         rcx, byte SIZEOF_XMMWORD
+    vmovdqu     xmmB, XMM_MMWORD [rsi+rcx]
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    vpor        ymmA, ymmB
+.column_ld32:
+    test        cl, SIZEOF_YMMWORD
+    jz          short .column_ld64
+    sub         rcx, byte SIZEOF_YMMWORD
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+.column_ld64:
+    test        cl, 2*SIZEOF_YMMWORD
+    mov         rcx, SIZEOF_YMMWORD
+    jz          short .rgb_gray_cnv
+    vmovdqa     ymmB, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_gray_cnv
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vmovdqu     ymmC, ymmA
+    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+    vmovdqa     ymmG, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+    vmovdqa     ymmD, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+    vmovdqa     ymmE, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+    vpxor       ymmH, ymmH, ymmH
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmB, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+    vmovdqa     ymmF, ymmD
+    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_XMMWORD/16
+    vmovd       xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_XMMWORD/8
+    vmovq       xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_XMMWORD/4
+    vmovdqa     xmmF, xmmA
+    vperm2i128  ymmF, ymmF, ymmF, 1
+    vmovdqu     xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+    vpor        ymmA, ymmA, ymmF
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_XMMWORD/2
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    mov         rcx, SIZEOF_YMMWORD
+    jz          short .rgb_gray_cnv
+    vmovdqa     ymmE, ymmA
+    vmovdqa     ymmH, ymmF
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_gray_cnv
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+    vmovdqu     ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmB, ymmA
+    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+    vmovdqa     ymmB, ymmF
+    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmD, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+                                      ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+    vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+                                      ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+    vmovdqa     ymmC, ymmF
+    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+                                      ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+    vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+                                      ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+    vmovdqa     ymmB, ymmA
+    vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+                                      ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+    vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+                                      ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+    vmovdqa     ymmG, ymmD
+    vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+                                      ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+    vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+                                      ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+    vmovdqa     ymmE, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                      ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+                                      ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vmovdqa     ymmH, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+                                      ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+                                      ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+    vpxor       ymmF, ymmF, ymmF
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmD, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+    vmovdqa     ymmG, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vpunpcklbw  ymmF, ymmF, ymmH
+    vpunpckhbw  ymmH, ymmH, ymmH
+    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+    vmovdqa     ymm6, ymm1
+    vpunpcklwd  ymm1, ymm1, ymm3
+    vpunpckhwd  ymm6, ymm6, ymm3
+    vpmaddwd    ymm1, ymm1, [rel PW_F0299_F0337]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vmovdqa     ymm7, ymm6              ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vmovdqa     ymm6, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm2
+    vpunpckhwd  ymm6, ymm6, ymm2
+    vpmaddwd    ymm0, ymm0, [rel PW_F0299_F0337]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+    vmovdqa     YMMWORD [wk(1)], ymm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vmovdqa     ymm0, ymm5              ; ymm0=BO
+    vmovdqa     ymm6, ymm4              ; ymm6=BE
+
+    vmovdqa     ymm4, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm3
+    vpunpckhwd  ymm4, ymm4, ymm3
+    vpmaddwd    ymm0, ymm0, [rel PW_F0114_F0250]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+    vmovdqa     ymm3, [rel PD_ONEHALF]            ; ymm3=[PD_ONEHALF]
+
+    vpaddd      ymm0, ymm0, ymm1
+    vpaddd      ymm4, ymm4, ymm7
+    vpaddd      ymm0, ymm0, ymm3
+    vpaddd      ymm4, ymm4, ymm3
+    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
+    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO
+
+    vmovdqa     ymm4, ymm6
+    vpunpcklwd  ymm6, ymm6, ymm2
+    vpunpckhwd  ymm4, ymm4, ymm2
+    vpmaddwd    ymm6, ymm6, [rel PW_F0114_F0250]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+    vmovdqa     ymm2, [rel PD_ONEHALF]            ; ymm2=[PD_ONEHALF]
+
+    vpaddd      ymm6, ymm6, YMMWORD [wk(0)]
+    vpaddd      ymm4, ymm4, YMMWORD [wk(1)]
+    vpaddd      ymm6, ymm6, ymm2
+    vpaddd      ymm4, ymm4, ymm2
+    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
+    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE
+
+    vpsllw      ymm0, ymm0, BYTE_BIT
+    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
+    vmovdqu     YMMWORD [rdi], ymm6     ; Save Y
+
+    sub         rcx, byte SIZEOF_YMMWORD
+    add         rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
+    add         rdi, byte SIZEOF_YMMWORD           ; outptr0
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    test        rcx, rcx
+    jnz         near .column_ld1
+
+    pop         rcx                     ; col
+    pop         rsi
+    pop         rdi
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         rdi, byte SIZEOF_JSAMPROW
+    dec         rax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    vzeroupper
+    uncollect_args 5
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgryext-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgryext-sse2.asm
new file mode 100644
index 0000000000..f1d399a63b
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgryext-sse2.asm
@@ -0,0 +1,363 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                             int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)
+
+EXTN(jsimd_rgb_gray_convert_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 5
+    push        rbx
+
+    mov         ecx, r10d
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx
+
+    mov         rsi, r12
+    mov         ecx, r13d
+    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+
+    pop         rcx
+
+    mov         rsi, r11
+    mov         eax, r14d
+    test        rax, rax
+    jle         near .return
+.rowloop:
+    push        rdi
+    push        rsi
+    push        rcx                     ; col
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr0
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        rax
+    push        rdx
+    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_BYTE
+    movzx       rax, byte [rsi+rcx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_WORD
+    movzx       rdx, word [rsi+rcx]
+    shl         rax, WORD_BIT
+    or          rax, rdx
+.column_ld4:
+    movd        xmmA, eax
+    pop         rdx
+    pop         rax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_DWORD
+    movd        xmmF, XMM_DWORD [rsi+rcx]
+    pslldq      xmmA, SIZEOF_DWORD
+    por         xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_MMWORD
+    movq        xmmB, XMM_MMWORD [rsi+rcx]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    movdqa      xmmF, xmmA
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    mov         rcx, SIZEOF_XMMWORD
+    jmp         short .rgb_gray_cnv
+.column_ld32:
+    test        cl, 2*SIZEOF_XMMWORD
+    mov         rcx, SIZEOF_XMMWORD
+    jz          short .rgb_gray_cnv
+    movdqa      xmmB, xmmA
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_gray_cnv
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    movdqa      xmmG, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+    movdqa      xmmD, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+    movdqa      xmmE, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+    pxor        xmmH, xmmH
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmB, xmmE
+    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+    movdqa      xmmF, xmmD
+    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_XMMWORD/16
+    movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_XMMWORD/8
+    movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmE
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_XMMWORD/4
+    movdqa      xmmE, xmmA
+    movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    mov         rcx, SIZEOF_XMMWORD
+    jz          short .rgb_gray_cnv
+    movdqa      xmmF, xmmA
+    movdqa      xmmH, xmmE
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_gray_cnv
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+    movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+    punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+    movdqa      xmmC, xmmF
+    punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+    punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+    movdqa      xmmB, xmmA
+    punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+    punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+    movdqa      xmmG, xmmD
+    punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+    punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+    movdqa      xmmE, xmmA
+    punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+    movdqa      xmmH, xmmB
+    punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+    pxor        xmmF, xmmF
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmD, xmmB
+    punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+    movdqa      xmmG, xmmE
+    punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+    punpcklbw   xmmF, xmmH
+    punpckhbw   xmmH, xmmH
+    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+    movdqa      xmm6, xmm1
+    punpcklwd   xmm1, xmm3
+    punpckhwd   xmm6, xmm3
+    pmaddwd     xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    movdqa      xmm7, xmm6              ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    movdqa      xmm6, xmm0
+    punpcklwd   xmm0, xmm2
+    punpckhwd   xmm6, xmm2
+    pmaddwd     xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    movdqa      xmm0, xmm5              ; xmm0=BO
+    movdqa      xmm6, xmm4              ; xmm6=BE
+
+    movdqa      xmm4, xmm0
+    punpcklwd   xmm0, xmm3
+    punpckhwd   xmm4, xmm3
+    pmaddwd     xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+    movdqa      xmm3, [rel PD_ONEHALF]      ; xmm3=[PD_ONEHALF]
+
+    paddd       xmm0, xmm1
+    paddd       xmm4, xmm7
+    paddd       xmm0, xmm3
+    paddd       xmm4, xmm3
+    psrld       xmm0, SCALEBITS         ; xmm0=YOL
+    psrld       xmm4, SCALEBITS         ; xmm4=YOH
+    packssdw    xmm0, xmm4              ; xmm0=YO
+
+    movdqa      xmm4, xmm6
+    punpcklwd   xmm6, xmm2
+    punpckhwd   xmm4, xmm2
+    pmaddwd     xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+    movdqa      xmm2, [rel PD_ONEHALF]      ; xmm2=[PD_ONEHALF]
+
+    paddd       xmm6, XMMWORD [wk(0)]
+    paddd       xmm4, XMMWORD [wk(1)]
+    paddd       xmm6, xmm2
+    paddd       xmm4, xmm2
+    psrld       xmm6, SCALEBITS         ; xmm6=YEL
+    psrld       xmm4, SCALEBITS         ; xmm4=YEH
+    packssdw    xmm6, xmm4              ; xmm6=YE
+
+    psllw       xmm0, BYTE_BIT
+    por         xmm6, xmm0              ; xmm6=Y
+    movdqa      XMMWORD [rdi], xmm6     ; Save Y
+
+    sub         rcx, byte SIZEOF_XMMWORD
+    add         rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
+    add         rdi, byte SIZEOF_XMMWORD                ; outptr0
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+    test        rcx, rcx
+    jnz         near .column_ld1
+
+    pop         rcx                     ; col
+    pop         rsi
+    pop         rdi
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         rdi, byte SIZEOF_JSAMPROW
+    dec         rax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    uncollect_args 5
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jchuff-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jchuff-sse2.asm
new file mode 100644
index 0000000000..9ea6df946e
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jchuff-sse2.asm
@@ -0,0 +1,583 @@
+;
+; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
+;
+; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, D. R. Commander.
+; Copyright (C) 2015, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation for Huffman coding of one block.
+; The following code is based on jchuff.c; see jchuff.c for more details.
+
+%include "jsimdext.inc"
+
+struc working_state
+.next_output_byte:   resp 1     ; => next byte to write in buffer
+.free_in_buffer:     resp 1     ; # of byte spaces remaining in buffer
+.cur.put_buffer.simd resq 1     ; current bit accumulation buffer
+.cur.free_bits       resd 1     ; # of bits available in it
+.cur.last_dc_val     resd 4     ; last DC coef for each component
+.cinfo:              resp 1     ; dump_buffer needs access to this
+endstruc
+
+struc c_derived_tbl
+.ehufco:             resd 256   ; code for each symbol
+.ehufsi:             resb 256   ; length of code for each symbol
+; If no code has been allocated for a symbol S, ehufsi[S] contains 0
+endstruc
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_huff_encode_one_block)
+
+EXTN(jconst_huff_encode_one_block):
+
+jpeg_mask_bits dd 0x0000, 0x0001, 0x0003, 0x0007
+               dd 0x000f, 0x001f, 0x003f, 0x007f
+               dd 0x00ff, 0x01ff, 0x03ff, 0x07ff
+               dd 0x0fff, 0x1fff, 0x3fff, 0x7fff
+
+    alignz      32
+
+times 1 << 14 db 15
+times 1 << 13 db 14
+times 1 << 12 db 13
+times 1 << 11 db 12
+times 1 << 10 db 11
+times 1 <<  9 db 10
+times 1 <<  8 db  9
+times 1 <<  7 db  8
+times 1 <<  6 db  7
+times 1 <<  5 db  6
+times 1 <<  4 db  5
+times 1 <<  3 db  4
+times 1 <<  2 db  3
+times 1 <<  1 db  2
+times 1 <<  0 db  1
+times 1       db  0
+jpeg_nbits_table:
+times 1       db  0
+times 1 <<  0 db  1
+times 1 <<  1 db  2
+times 1 <<  2 db  3
+times 1 <<  3 db  4
+times 1 <<  4 db  5
+times 1 <<  5 db  6
+times 1 <<  6 db  7
+times 1 <<  7 db  8
+times 1 <<  8 db  9
+times 1 <<  9 db 10
+times 1 << 10 db 11
+times 1 << 11 db 12
+times 1 << 12 db 13
+times 1 << 13 db 14
+times 1 << 14 db 15
+times 1 << 15 db 16
+
+    alignz      32
+
+%define NBITS(x)      nbits_base + x
+%define MASK_BITS(x)  NBITS((x) * 4) + (jpeg_mask_bits - jpeg_nbits_table)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+; Shorthand used to describe SIMD operations:
+; wN:  xmmN treated as eight signed 16-bit values
+; wN[i]:  perform the same operation on all eight signed 16-bit values, i=0..7
+; bN:  xmmN treated as 16 unsigned 8-bit values
+; bN[i]:  perform the same operation on all 16 unsigned 8-bit values, i=0..15
+; Contents of SIMD registers are shown in memory order.
+
+; Fill the bit buffer to capacity with the leading bits from code, then output
+; the bit buffer and put the remaining bits from code into the bit buffer.
+;
+; Usage:
+; code - contains the bits to shift into the bit buffer (LSB-aligned)
+; %1 - the label to which to jump when the macro completes
+; %2 (optional) - extra instructions to execute after nbits has been set
+;
+; Upon completion, free_bits will be set to the number of remaining bits from
+; code, and put_buffer will contain those remaining bits.  temp and code will
+; be clobbered.
+;
+; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
+; macro in jchuff.c.
+
+%macro EMIT_QWORD 1-2
+    add         nbitsb, free_bitsb      ; nbits += free_bits;
+    neg         free_bitsb              ; free_bits = -free_bits;
+    mov         tempd, code             ; temp = code;
+    shl         put_buffer, nbitsb      ; put_buffer <<= nbits;
+    mov         nbitsb, free_bitsb      ; nbits = free_bits;
+    neg         free_bitsb              ; free_bits = -free_bits;
+    shr         tempd, nbitsb           ; temp >>= nbits;
+    or          tempq, put_buffer       ; temp |= put_buffer;
+    movq        xmm0, tempq             ; xmm0.u64 = { temp, 0 };
+    bswap       tempq                   ; temp = htonl(temp);
+    mov         put_buffer, codeq       ; put_buffer = code;
+    pcmpeqb     xmm0, xmm1              ; b0[i] = (b0[i] == 0xFF ? 0xFF : 0);
+    %2
+    pmovmskb    code, xmm0              ; code = 0;  code |= ((b0[i] >> 7) << i);
+    mov         qword [buffer], tempq   ; memcpy(buffer, &temp, 8);
+                                        ; (speculative; will be overwritten if
+                                        ; code contains any 0xFF bytes)
+    add         free_bitsb, 64          ; free_bits += 64;
+    add         bufferp, 8              ; buffer += 8;
+    test        code, code              ; if (code == 0)  /* No 0xFF bytes */
+    jz          %1                      ;   return;
+    ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
+    ; bytes in the qword.
+    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer-7], 0      ; buffer[-7] = 0;
+    sbb         bufferp, 6              ; buffer -= (6 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], temph    ; buffer[0] = temp[1];
+    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    shr         tempq, 16               ; temp >>= 16;
+    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
+    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], temph    ; buffer[0] = temp[1];
+    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    shr         tempq, 16               ; temp >>= 16;
+    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
+    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], temph    ; buffer[0] = temp[1];
+    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    shr         tempd, 16               ; temp >>= 16;
+    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
+    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], temph    ; buffer[0] = temp[1];
+    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    jmp         %1                      ; return;
+%endmacro
+
+;
+; Encode a single block's worth of coefficients.
+;
+; GLOBAL(JOCTET *)
+; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
+;                                  JCOEFPTR block, int last_dc_val,
+;                                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
+;
+; NOTES:
+; When shuffling data, we try to avoid pinsrw as much as possible, since it is
+; slow on many CPUs.  Its reciprocal throughput (issue latency) is 1 even on
+; modern CPUs, so chains of pinsrw instructions (even with different outputs)
+; can limit performance.  pinsrw is a VectorPath instruction on AMD K8 and
+; requires 2 µops (with memory operand) on Intel.  In either case, only one
+; pinsrw instruction can be decoded per cycle (and nothing else if they are
+; back-to-back), so out-of-order execution cannot be used to work around long
+; pinsrw chains (though for Sandy Bridge and later, this may be less of a
+; problem if the code runs from the µop cache.)
+;
+; We use tzcnt instead of bsf without checking for support.  The instruction is
+; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
+; rep bsf.)  The destination (first) operand of bsf (and tzcnt on some CPUs) is
+; an input dependency (although the behavior is not formally defined, Intel
+; CPUs usually leave the destination unmodified if the source is zero.)  This
+; can prevent out-of-order execution, so we clear the destination before
+; invoking tzcnt.
+;
+; Initial register allocation
+; rax - buffer
+; rbx - temp
+; rcx - nbits
+; rdx - block --> free_bits
+; rsi - nbits_base
+; rdi - t
+; rbp - code
+; r8  - dctbl --> code_temp
+; r9  - actbl
+; r10 - state
+; r11 - index
+; r12 - put_buffer
+
+%define buffer       rax
+%ifdef WIN64
+%define bufferp      rax
+%else
+%define bufferp      raxp
+%endif
+%define tempq        rbx
+%define tempd        ebx
+%define tempb        bl
+%define temph        bh
+%define nbitsq       rcx
+%define nbits        ecx
+%define nbitsb       cl
+%define block        rdx
+%define nbits_base   rsi
+%define t            rdi
+%define td           edi
+%define codeq        rbp
+%define code         ebp
+%define dctbl        r8
+%define actbl        r9
+%define state        r10
+%define index        r11
+%define indexd       r11d
+%define put_buffer   r12
+%define put_bufferd  r12d
+
+; Step 1: Re-arrange input data according to jpeg_natural_order
+; xx 01 02 03 04 05 06 07      xx 01 08 16 09 02 03 10
+; 08 09 10 11 12 13 14 15      17 24 32 25 18 11 04 05
+; 16 17 18 19 20 21 22 23      12 19 26 33 40 48 41 34
+; 24 25 26 27 28 29 30 31 ==>  27 20 13 06 07 14 21 28
+; 32 33 34 35 36 37 38 39      35 42 49 56 57 50 43 36
+; 40 41 42 43 44 45 46 47      29 22 15 23 30 37 44 51
+; 48 49 50 51 52 53 54 55      58 59 52 45 38 31 39 46
+; 56 57 58 59 60 61 62 63      53 60 61 54 47 55 62 63
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
+
+EXTN(jsimd_huff_encode_one_block_sse2):
+
+%ifdef WIN64
+
+; rcx = working_state *state
+; rdx = JOCTET *buffer
+; r8 = JCOEFPTR block
+; r9 = int last_dc_val
+; [rax+48] = c_derived_tbl *dctbl
+; [rax+56] = c_derived_tbl *actbl
+
+                                                          ;X: X = code stream
+    mov         buffer, rdx
+    mov         block, r8
+    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
+    push        rbx
+    push        rbp
+    movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
+    push        rsi
+    push        rdi
+    push        r12
+    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]   ;B: w1 = 08 09 10 11 12 13 14 15
+    mov         state, rcx
+    movsx       code, word [block]                        ;Z:     code = block[0];
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    sub         code, r9d                                 ;Z:     code -= last_dc_val;
+    mov         dctbl, POINTER [rsp+6*8+4*8]
+    mov         actbl, POINTER [rsp+6*8+5*8]
+    punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
+    lea         nbits_base, [rel jpeg_nbits_table]
+    add         rsp, -DCTSIZE2 * SIZEOF_WORD
+    mov         t, rsp
+
+%else
+
+; rdi = working_state *state
+; rsi = JOCTET *buffer
+; rdx = JCOEFPTR block
+; rcx = int last_dc_val
+; r8 = c_derived_tbl *dctbl
+; r9 = c_derived_tbl *actbl
+
+                                                          ;X: X = code stream
+    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
+    push        rbx
+    push        rbp
+    movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
+    push        r12
+    mov         state, rdi
+    mov         buffer, rsi
+    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]   ;B: w1 = 08 09 10 11 12 13 14 15
+    movsx       codeq, word [block]                       ;Z:     code = block[0];
+    lea         nbits_base, [rel jpeg_nbits_table]
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    sub         codeq, rcx                                ;Z:     code -= last_dc_val;
+    punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
+    lea         t, [rsp - DCTSIZE2 * SIZEOF_WORD]         ;   use red zone for t_
+
+%endif
+
+    pshuflw     xmm0, xmm0, 11001001b                     ;A: w0 = 01 08 xx 09 02 03 10 11
+    pinsrw      xmm0, word [block + 16 * SIZEOF_WORD], 2  ;A: w0 = 01 08 16 09 02 03 10 11
+    punpckhdq   xmm3, xmm1                                ;D: w3 = 04 05 12 13 06 07 14 15
+    punpcklqdq  xmm1, xmm3                                ;B: w1 = 08 09 10 11 04 05 12 13
+    pinsrw      xmm0, word [block + 17 * SIZEOF_WORD], 7  ;A: w0 = 01 08 16 09 02 03 10 17
+                                                          ;A:      (Row 0, offset 1)
+    pcmpgtw     xmm4, xmm0                                ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
+    paddw       xmm0, xmm4                                ;A: w0[i] += w4[i];
+    movaps      XMMWORD [t + 0 * SIZEOF_WORD], xmm0       ;A: t[i] = w0[i];
+
+    movq        xmm2, qword [block + 24 * SIZEOF_WORD]    ;B: w2 = 24 25 26 27 -- -- -- --
+    pshuflw     xmm2, xmm2, 11011000b                     ;B: w2 = 24 26 25 27 -- -- -- --
+    pslldq      xmm1, 1 * SIZEOF_WORD                     ;B: w1 = -- 08 09 10 11 04 05 12
+    movups      xmm5, XMMWORD [block + 48 * SIZEOF_WORD]  ;H: w5 = 48 49 50 51 52 53 54 55
+    movsd       xmm1, xmm2                                ;B: w1 = 24 26 25 27 11 04 05 12
+    punpcklqdq  xmm2, xmm5                                ;C: w2 = 24 26 25 27 48 49 50 51
+    pinsrw      xmm1, word [block + 32 * SIZEOF_WORD], 1  ;B: w1 = 24 32 25 27 11 04 05 12
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    psrldq      xmm3, 2 * SIZEOF_WORD                     ;D: w3 = 12 13 06 07 14 15 -- --
+    pcmpeqw     xmm0, xmm4                                ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
+    pinsrw      xmm1, word [block + 18 * SIZEOF_WORD], 3  ;B: w1 = 24 32 25 18 11 04 05 12
+                                                          ;        (Row 1, offset 1)
+    pcmpgtw     xmm4, xmm1                                ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
+    paddw       xmm1, xmm4                                ;B: w1[i] += w4[i];
+    movaps      XMMWORD [t + 8 * SIZEOF_WORD], xmm1       ;B: t[i+8] = w1[i];
+    pxor        xmm4, xmm4                                ;B: w4[i] = 0;
+    pcmpeqw     xmm1, xmm4                                ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
+
+    packsswb    xmm0, xmm1                                ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
+                                                          ;    w/ signed saturation
+
+    pinsrw      xmm3, word [block + 20 * SIZEOF_WORD], 0  ;D: w3 = 20 13 06 07 14 15 -- --
+    pinsrw      xmm3, word [block + 21 * SIZEOF_WORD], 5  ;D: w3 = 20 13 06 07 14 21 -- --
+    pinsrw      xmm3, word [block + 28 * SIZEOF_WORD], 6  ;D: w3 = 20 13 06 07 14 21 28 --
+    pinsrw      xmm3, word [block + 35 * SIZEOF_WORD], 7  ;D: w3 = 20 13 06 07 14 21 28 35
+                                                          ;        (Row 3, offset 1)
+    pcmpgtw     xmm4, xmm3                                ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
+    paddw       xmm3, xmm4                                ;D: w3[i] += w4[i];
+    movaps      XMMWORD [t + 24 * SIZEOF_WORD], xmm3      ;D: t[i+24] = w3[i];
+    pxor        xmm4, xmm4                                ;D: w4[i] = 0;
+    pcmpeqw     xmm3, xmm4                                ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
+
+    pinsrw      xmm2, word [block + 19 * SIZEOF_WORD], 0  ;C: w2 = 19 26 25 27 48 49 50 51
+    cmp         code, 1 << 31                             ;Z:     Set CF if code < 0x80000000,
+                                                          ;Z:     i.e. if code is positive
+    pinsrw      xmm2, word [block + 33 * SIZEOF_WORD], 2  ;C: w2 = 19 26 33 27 48 49 50 51
+    pinsrw      xmm2, word [block + 40 * SIZEOF_WORD], 3  ;C: w2 = 19 26 33 40 48 49 50 51
+    adc         code, -1                                  ;Z:     code += -1 + (code >= 0 ? 1 : 0);
+    pinsrw      xmm2, word [block + 41 * SIZEOF_WORD], 5  ;C: w2 = 19 26 33 40 48 41 50 51
+    pinsrw      xmm2, word [block + 34 * SIZEOF_WORD], 6  ;C: w2 = 19 26 33 40 48 41 34 51
+    movsxd      codeq, code                               ;Z:     sign extend code
+    pinsrw      xmm2, word [block + 27 * SIZEOF_WORD], 7  ;C: w2 = 19 26 33 40 48 41 34 27
+                                                          ;        (Row 2, offset 1)
+    pcmpgtw     xmm4, xmm2                                ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
+    paddw       xmm2, xmm4                                ;C: w2[i] += w4[i];
+    movaps      XMMWORD [t + 16 * SIZEOF_WORD], xmm2      ;C: t[i+16] = w2[i];
+    pxor        xmm4, xmm4                                ;C: w4[i] = 0;
+    pcmpeqw     xmm2, xmm4                                ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
+
+    packsswb    xmm2, xmm3                                ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
+                                                          ;    w/ signed saturation
+
+    movzx       nbitsq, byte [NBITS(codeq)]               ;Z:     nbits = JPEG_NBITS(code);
+    movdqa      xmm3, xmm5                                ;H: w3 = 48 49 50 51 52 53 54 55
+    pmovmskb    tempd, xmm2                               ;Z:     temp = 0;  temp |= ((b2[i] >> 7) << i);
+    pmovmskb    put_bufferd, xmm0                         ;Z:     put_buffer = 0;  put_buffer |= ((b0[i] >> 7) << i);
+    movups      xmm0, XMMWORD [block + 56 * SIZEOF_WORD]  ;H: w0 = 56 57 58 59 60 61 62 63
+    punpckhdq   xmm3, xmm0                                ;H: w3 = 52 53 60 61 54 55 62 63
+    shl         tempd, 16                                 ;Z:     temp <<= 16;
+    psrldq      xmm3, 1 * SIZEOF_WORD                     ;H: w3 = 53 60 61 54 55 62 63 --
+    pxor        xmm2, xmm2                                ;H: w2[i] = 0;
+    or          put_bufferd, tempd                        ;Z:     put_buffer |= temp;
+    pshuflw     xmm3, xmm3, 00111001b                     ;H: w3 = 60 61 54 53 55 62 63 --
+    movq        xmm1, qword [block + 44 * SIZEOF_WORD]    ;G: w1 = 44 45 46 47 -- -- -- --
+    unpcklps    xmm5, xmm0                                ;E: w5 = 48 49 56 57 50 51 58 59
+    pxor        xmm0, xmm0                                ;H: w0[i] = 0;
+    pinsrw      xmm3, word [block + 47 * SIZEOF_WORD], 3  ;H: w3 = 60 61 54 47 55 62 63 --
+                                                          ;        (Row 7, offset 1)
+    pcmpgtw     xmm2, xmm3                                ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
+    paddw       xmm3, xmm2                                ;H: w3[i] += w2[i];
+    movaps      XMMWORD [t + 56 * SIZEOF_WORD], xmm3      ;H: t[i+56] = w3[i];
+    movq        xmm4, qword [block + 36 * SIZEOF_WORD]    ;G: w4 = 36 37 38 39 -- -- -- --
+    pcmpeqw     xmm3, xmm0                                ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
+    punpckldq   xmm4, xmm1                                ;G: w4 = 36 37 44 45 38 39 46 47
+    mov         tempd, [dctbl + c_derived_tbl.ehufco + nbitsq * 4]
+                                                          ;Z:     temp = dctbl->ehufco[nbits];
+    movdqa      xmm1, xmm4                                ;F: w1 = 36 37 44 45 38 39 46 47
+    psrldq      xmm4, 1 * SIZEOF_WORD                     ;G: w4 = 37 44 45 38 39 46 47 --
+    shufpd      xmm1, xmm5, 10b                           ;F: w1 = 36 37 44 45 50 51 58 59
+    and         code, dword [MASK_BITS(nbitsq)]           ;Z:     code &= (1 << nbits) - 1;
+    pshufhw     xmm4, xmm4, 11010011b                     ;G: w4 = 37 44 45 38 -- 39 46 --
+    pslldq      xmm1, 1 * SIZEOF_WORD                     ;F: w1 = -- 36 37 44 45 50 51 58
+    shl         tempq, nbitsb                             ;Z:     temp <<= nbits;
+    pinsrw      xmm4, word [block + 59 * SIZEOF_WORD], 0  ;G: w4 = 59 44 45 38 -- 39 46 --
+    pshufd      xmm1, xmm1, 11011000b                     ;F: w1 = -- 36 45 50 37 44 51 58
+    pinsrw      xmm4, word [block + 52 * SIZEOF_WORD], 1  ;G: w4 = 59 52 45 38 -- 39 46 --
+    or          code, tempd                               ;Z:     code |= temp;
+    movlps      xmm1, qword [block + 20 * SIZEOF_WORD]    ;F: w1 = 20 21 22 23 37 44 51 58
+    pinsrw      xmm4, word [block + 31 * SIZEOF_WORD], 4  ;G: w4 = 59 52 45 38 31 39 46 --
+    pshuflw     xmm1, xmm1, 01110010b                     ;F: w1 = 22 20 23 21 37 44 51 58
+    pinsrw      xmm4, word [block + 53 * SIZEOF_WORD], 7  ;G: w4 = 59 52 45 38 31 39 46 53
+                                                          ;        (Row 6, offset 1)
+    pxor        xmm2, xmm2                                ;G: w2[i] = 0;
+    pcmpgtw     xmm0, xmm4                                ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
+    pinsrw      xmm1, word [block + 15 * SIZEOF_WORD], 1  ;F: w1 = 22 15 23 21 37 44 51 58
+    paddw       xmm4, xmm0                                ;G: w4[i] += w0[i];
+    movaps      XMMWORD [t + 48 * SIZEOF_WORD], xmm4      ;G: t[48+i] = w4[i];
+    pinsrw      xmm1, word [block + 30 * SIZEOF_WORD], 3  ;F: w1 = 22 15 23 30 37 44 51 58
+                                                          ;        (Row 5, offset 1)
+    pcmpeqw     xmm4, xmm2                                ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
+    pinsrw      xmm5, word [block + 42 * SIZEOF_WORD], 0  ;E: w5 = 42 49 56 57 50 51 58 59
+
+    packsswb    xmm4, xmm3                                ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
+                                                          ;    w/ signed saturation
+
+    pxor        xmm0, xmm0                                ;F: w0[i] = 0;
+    pinsrw      xmm5, word [block + 43 * SIZEOF_WORD], 5  ;E: w5 = 42 49 56 57 50 43 58 59
+    pcmpgtw     xmm2, xmm1                                ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
+    pmovmskb    tempd, xmm4                               ;Z:     temp = 0;  temp |= ((b4[i] >> 7) << i);
+    pinsrw      xmm5, word [block + 36 * SIZEOF_WORD], 6  ;E: w5 = 42 49 56 57 50 43 36 59
+    paddw       xmm1, xmm2                                ;F: w1[i] += w2[i];
+    movaps      XMMWORD [t + 40 * SIZEOF_WORD], xmm1      ;F: t[40+i] = w1[i];
+    pinsrw      xmm5, word [block + 29 * SIZEOF_WORD], 7  ;E: w5 = 42 49 56 57 50 43 36 29
+                                                          ;        (Row 4, offset 1)
+%undef block
+%define free_bitsq  rdx
+%define free_bitsd  edx
+%define free_bitsb  dl
+    pcmpeqw     xmm1, xmm0                                ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
+    shl         tempq, 48                                 ;Z:     temp <<= 48;
+    pxor        xmm2, xmm2                                ;E: w2[i] = 0;
+    pcmpgtw     xmm0, xmm5                                ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
+    paddw       xmm5, xmm0                                ;E: w5[i] += w0[i];
+    or          tempq, put_buffer                         ;Z:     temp |= put_buffer;
+    movaps      XMMWORD [t + 32 * SIZEOF_WORD], xmm5      ;E: t[32+i] = w5[i];
+    lea         t, [dword t - 2]                          ;Z:     t = &t[-1];
+    pcmpeqw     xmm5, xmm2                                ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
+
+    packsswb    xmm5, xmm1                                ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
+                                                          ;    w/ signed saturation
+
+    add         nbitsb, byte [dctbl + c_derived_tbl.ehufsi + nbitsq]
+                                                          ;Z:     nbits += dctbl->ehufsi[nbits];
+%undef dctbl
+%define code_temp  r8d
+    pmovmskb    indexd, xmm5                              ;Z:     index = 0;  index |= ((b5[i] >> 7) << i);
+    mov         free_bitsd, [state+working_state.cur.free_bits]
+                                                          ;Z:     free_bits = state->cur.free_bits;
+    pcmpeqw     xmm1, xmm1                                ;Z:     b1[i] = 0xFF;
+    shl         index, 32                                 ;Z:     index <<= 32;
+    mov         put_buffer, [state+working_state.cur.put_buffer.simd]
+                                                          ;Z:     put_buffer = state->cur.put_buffer.simd;
+    or          index, tempq                              ;Z:     index |= temp;
+    not         index                                     ;Z:     index = ~index;
+    sub         free_bitsb, nbitsb                        ;Z:     if ((free_bits -= nbits) >= 0)
+    jnl         .ENTRY_SKIP_EMIT_CODE                     ;Z:       goto .ENTRY_SKIP_EMIT_CODE;
+    align       16
+.EMIT_CODE:                                               ;Z:     .EMIT_CODE:
+    EMIT_QWORD  .BLOOP_COND                               ;Z:     insert code, flush buffer, goto .BLOOP_COND
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.BRLOOP:                                                  ; do {
+    lea         code_temp, [nbitsq - 16]                  ;   code_temp = nbits - 16;
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+                                                          ;   nbits = actbl->ehufsi[0xf0];
+    mov         code, [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+                                                          ;   code = actbl->ehufco[0xf0];
+    sub         free_bitsb, nbitsb                        ;   if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_BRLOOP_CODE                         ;     goto .EMIT_BRLOOP_CODE;
+    shl         put_buffer, nbitsb                        ;   put_buffer <<= nbits;
+    mov         nbits, code_temp                          ;   nbits = code_temp;
+    or          put_buffer, codeq                         ;   put_buffer |= code;
+    cmp         nbits, 16                                 ;   if (nbits <= 16)
+    jle         .ERLOOP                                   ;     break;
+    jmp         .BRLOOP                                   ; } while (1);
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+    times 5     nop
+.ENTRY_SKIP_EMIT_CODE:                                    ; .ENTRY_SKIP_EMIT_CODE:
+    shl         put_buffer, nbitsb                        ; put_buffer <<= nbits;
+    or          put_buffer, codeq                         ; put_buffer |= code;
+.BLOOP_COND:                                              ; .BLOOP_COND:
+    test        index, index                              ; if (index != 0)
+    jz          .ELOOP                                    ; {
+.BLOOP:                                                   ;   do {
+    xor         nbits, nbits                              ;     nbits = 0;  /* kill tzcnt input dependency */
+    tzcnt       nbitsq, index                             ;     nbits = # of trailing 0 bits in index
+    inc         nbits                                     ;     ++nbits;
+    lea         t, [t + nbitsq * 2]                       ;     t = &t[nbits];
+    shr         index, nbitsb                             ;     index >>= nbits;
+.EMIT_BRLOOP_CODE_END:                                    ; .EMIT_BRLOOP_CODE_END:
+    cmp         nbits, 16                                 ;     if (nbits > 16)
+    jg          .BRLOOP                                   ;       goto .BRLOOP;
+.ERLOOP:                                                  ; .ERLOOP:
+    movsx       codeq, word [t]                           ;     code = *t;
+    lea         tempd, [nbitsq * 2]                       ;     temp = nbits * 2;
+    movzx       nbits, byte [NBITS(codeq)]                ;     nbits = JPEG_NBITS(code);
+    lea         tempd, [nbitsq + tempq * 8]               ;     temp = temp * 8 + nbits;
+    mov         code_temp, [actbl + c_derived_tbl.ehufco + (tempq - 16) * 4]
+                                                          ;     code_temp = actbl->ehufco[temp-16];
+    shl         code_temp, nbitsb                         ;     code_temp <<= nbits;
+    and         code, dword [MASK_BITS(nbitsq)]           ;     code &= (1 << nbits) - 1;
+    add         nbitsb, [actbl + c_derived_tbl.ehufsi + (tempq - 16)]
+                                                          ;     free_bits -= actbl->ehufsi[temp-16];
+    or          code, code_temp                           ;     code |= code_temp;
+    sub         free_bitsb, nbitsb                        ;     if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_CODE                                ;       goto .EMIT_CODE;
+    shl         put_buffer, nbitsb                        ;     put_buffer <<= nbits;
+    or          put_buffer, codeq                         ;     put_buffer |= code;
+    test        index, index
+    jnz         .BLOOP                                    ;   } while (index != 0);
+.ELOOP:                                                   ; }  /* index != 0 */
+    sub         td, esp                                   ; t -= (WIN64: &t_[0], UNIX: &t_[64]);
+%ifdef WIN64
+    cmp         td, (DCTSIZE2 - 2) * SIZEOF_WORD          ; if (t != 62)
+%else
+    cmp         td, -2 * SIZEOF_WORD                      ; if (t != -2)
+%endif
+    je          .EFN                                      ; {
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
+                                                          ;   nbits = actbl->ehufsi[0];
+    mov         code, [actbl + c_derived_tbl.ehufco + 0]  ;   code = actbl->ehufco[0];
+    sub         free_bitsb, nbitsb                        ;   if ((free_bits -= nbits) <= 0)
+    jg          .EFN_SKIP_EMIT_CODE                       ;   {
+    EMIT_QWORD  .EFN                                      ;     insert code, flush buffer
+    align       16
+.EFN_SKIP_EMIT_CODE:                                      ;   } else {
+    shl         put_buffer, nbitsb                        ;     put_buffer <<= nbits;
+    or          put_buffer, codeq                         ;     put_buffer |= code;
+.EFN:                                                     ; } }
+    mov         [state + working_state.cur.put_buffer.simd], put_buffer
+                                                          ; state->cur.put_buffer.simd = put_buffer;
+    mov         byte [state + working_state.cur.free_bits], free_bitsb
+                                                          ; state->cur.free_bits = free_bits;
+%ifdef WIN64
+    sub         rsp, -DCTSIZE2 * SIZEOF_WORD
+    pop         r12
+    pop         rdi
+    pop         rsi
+    pop         rbp
+    pop         rbx
+%else
+    pop         r12
+    pop         rbp
+    pop         rbx
+%endif
+    ret
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_BRLOOP_CODE:
+    EMIT_QWORD  .EMIT_BRLOOP_CODE_END, { mov nbits, code_temp }
+                                                          ; insert code, flush buffer,
+                                                          ; nbits = code_temp, goto .EMIT_BRLOOP_CODE_END
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcphuff-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcphuff-sse2.asm
new file mode 100644
index 0000000000..01b5c0235f
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcphuff-sse2.asm
@@ -0,0 +1,639 @@
+;
+; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
+; (64-bit SSE2)
+;
+; Copyright (C) 2016, 2018, Matthieu Darbois
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation of data preparation for progressive
+; Huffman encoding.  See jcphuff.c for more details.
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+; --------------------------------------------------------------------------
+; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
+; jsimd_encode_mcu_AC_refine_prepare_sse2()
+
+%macro LOAD16 0
+    pxor        N0, N0
+    pxor        N1, N1
+
+    mov         T0d, INT [LUT +  0*SIZEOF_INT]
+    mov         T1d, INT [LUT +  8*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 0
+    pinsrw      X1, word [BLOCK + T1 * 2], 0
+
+    mov         T0d, INT [LUT +  1*SIZEOF_INT]
+    mov         T1d, INT [LUT +  9*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 1
+    pinsrw      X1, word [BLOCK + T1 * 2], 1
+
+    mov         T0d, INT [LUT +  2*SIZEOF_INT]
+    mov         T1d, INT [LUT + 10*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 2
+    pinsrw      X1, word [BLOCK + T1 * 2], 2
+
+    mov         T0d, INT [LUT +  3*SIZEOF_INT]
+    mov         T1d, INT [LUT + 11*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 3
+    pinsrw      X1, word [BLOCK + T1 * 2], 3
+
+    mov         T0d, INT [LUT +  4*SIZEOF_INT]
+    mov         T1d, INT [LUT + 12*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 4
+    pinsrw      X1, word [BLOCK + T1 * 2], 4
+
+    mov         T0d, INT [LUT +  5*SIZEOF_INT]
+    mov         T1d, INT [LUT + 13*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 5
+    pinsrw      X1, word [BLOCK + T1 * 2], 5
+
+    mov         T0d, INT [LUT +  6*SIZEOF_INT]
+    mov         T1d, INT [LUT + 14*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 6
+    pinsrw      X1, word [BLOCK + T1 * 2], 6
+
+    mov         T0d, INT [LUT +  7*SIZEOF_INT]
+    mov         T1d, INT [LUT + 15*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 7
+    pinsrw      X1, word [BLOCK + T1 * 2], 7
+%endmacro
+
+%macro LOAD15 0
+    pxor        N0, N0
+    pxor        N1, N1
+    pxor        X1, X1
+
+    mov         T0d, INT [LUT +  0*SIZEOF_INT]
+    mov         T1d, INT [LUT +  8*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 0
+    pinsrw      X1, word [BLOCK + T1 * 2], 0
+
+    mov         T0d, INT [LUT +  1*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 1
+
+    mov         T0d, INT [LUT +  2*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 2
+
+    mov         T0d, INT [LUT +  3*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 3
+
+    mov         T0d, INT [LUT +  4*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 4
+
+    mov         T0d, INT [LUT +  5*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 5
+
+    mov         T0d, INT [LUT +  6*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 6
+
+    mov         T0d, INT [LUT +  7*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 7
+
+    cmp         LENEND, 2
+    jl          %%.ELOAD15
+    mov         T1d, INT [LUT +  9*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 1
+
+    cmp         LENEND, 3
+    jl          %%.ELOAD15
+    mov         T1d, INT [LUT + 10*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 2
+
+    cmp         LENEND, 4
+    jl          %%.ELOAD15
+    mov         T1d, INT [LUT + 11*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 3
+
+    cmp         LENEND, 5
+    jl          %%.ELOAD15
+    mov         T1d, INT [LUT + 12*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 4
+
+    cmp         LENEND, 6
+    jl          %%.ELOAD15
+    mov         T1d, INT [LUT + 13*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 5
+
+    cmp         LENEND, 7
+    jl          %%.ELOAD15
+    mov         T1d, INT [LUT + 14*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 6
+%%.ELOAD15:
+%endmacro
+
+%macro LOAD8 0
+    pxor        N0, N0
+
+    mov         T0d, INT [LUT +  0*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 0
+
+    mov         T0d, INT [LUT +  1*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 1
+
+    mov         T0d, INT [LUT +  2*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 2
+
+    mov         T0d, INT [LUT +  3*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 3
+
+    mov         T0d, INT [LUT +  4*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 4
+
+    mov         T0d, INT [LUT +  5*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 5
+
+    mov         T0d, INT [LUT +  6*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 6
+
+    mov         T0d, INT [LUT +  7*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 7
+%endmacro
+
+%macro LOAD7 0
+    pxor        N0, N0
+    pxor        X0, X0
+
+    mov         T1d, INT [LUT +  0*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 0
+
+    cmp         LENEND, 2
+    jl          %%.ELOAD7
+    mov         T1d, INT [LUT +  1*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 1
+
+    cmp         LENEND, 3
+    jl          %%.ELOAD7
+    mov         T1d, INT [LUT +  2*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 2
+
+    cmp         LENEND, 4
+    jl          %%.ELOAD7
+    mov         T1d, INT [LUT +  3*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 3
+
+    cmp         LENEND, 5
+    jl          %%.ELOAD7
+    mov         T1d, INT [LUT +  4*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 4
+
+    cmp         LENEND, 6
+    jl          %%.ELOAD7
+    mov         T1d, INT [LUT +  5*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 5
+
+    cmp         LENEND, 7
+    jl          %%.ELOAD7
+    mov         T1d, INT [LUT +  6*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 6
+%%.ELOAD7:
+%endmacro
+
+%macro REDUCE0 0
+    movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
+    movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
+    movdqa      xmm2, XMMWORD [VALUES + (16*2)]
+    movdqa      xmm3, XMMWORD [VALUES + (24*2)]
+    movdqa      xmm4, XMMWORD [VALUES + (32*2)]
+    movdqa      xmm5, XMMWORD [VALUES + (40*2)]
+    movdqa      xmm6, XMMWORD [VALUES + (48*2)]
+    movdqa      xmm7, XMMWORD [VALUES + (56*2)]
+
+    pcmpeqw     xmm0, ZERO
+    pcmpeqw     xmm1, ZERO
+    pcmpeqw     xmm2, ZERO
+    pcmpeqw     xmm3, ZERO
+    pcmpeqw     xmm4, ZERO
+    pcmpeqw     xmm5, ZERO
+    pcmpeqw     xmm6, ZERO
+    pcmpeqw     xmm7, ZERO
+
+    packsswb    xmm0, xmm1
+    packsswb    xmm2, xmm3
+    packsswb    xmm4, xmm5
+    packsswb    xmm6, xmm7
+
+    pmovmskb    eax, xmm0
+    pmovmskb    ecx, xmm2
+    pmovmskb    edx, xmm4
+    pmovmskb    esi, xmm6
+
+    shl         rcx, 16
+    shl         rdx, 32
+    shl         rsi, 48
+
+    or          rax, rcx
+    or          rdx, rsi
+    or          rax, rdx
+
+    not         rax
+
+    mov         MMWORD [r15], rax
+%endmacro
+
+;
+; Prepare data for jsimd_encode_mcu_AC_first().
+;
+; GLOBAL(void)
+; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
+;                                        const int *jpeg_natural_order_start,
+;                                        int Sl, int Al, JCOEF *values,
+;                                        size_t *zerobits)
+;
+; r10 = const JCOEF *block
+; r11 = const int *jpeg_natural_order_start
+; r12 = int Sl
+; r13 = int Al
+; r14 = JCOEF *values
+; r15 = size_t *zerobits
+
+%define ZERO    xmm9
+%define X0      xmm0
+%define X1      xmm1
+%define N0      xmm2
+%define N1      xmm3
+%define AL      xmm4
+%define K       eax
+%define LUT     r11
+%define T0      rcx
+%define T0d     ecx
+%define T1      rdx
+%define T1d     edx
+%define BLOCK   r10
+%define VALUES  r14
+%define LEN     r12d
+%define LENEND  r13d
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [rbp - 16]
+    collect_args 6
+
+    movdqa      XMMWORD [rbp - 16], ZERO
+
+    movd        AL, r13d
+    pxor        ZERO, ZERO
+    mov         K, LEN
+    mov         LENEND, LEN
+    and         K, -16
+    and         LENEND, 7
+    shr         K, 4
+    jz          .ELOOP16
+.BLOOP16:
+    LOAD16
+    pcmpgtw     N0, X0
+    pcmpgtw     N1, X1
+    paddw       X0, N0
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL
+    psrlw       X1, AL
+    pxor        N0, X0
+    pxor        N1, X1
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+    add         VALUES, 16*2
+    add         LUT, 16*SIZEOF_INT
+    dec         K
+    jnz         .BLOOP16
+    test        LEN, 15
+    je          .PADDING
+.ELOOP16:
+    test        LEN, 8
+    jz          .TRY7
+    test        LEN, 7
+    jz          .TRY8
+
+    LOAD15
+    pcmpgtw     N0, X0
+    pcmpgtw     N1, X1
+    paddw       X0, N0
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL
+    psrlw       X1, AL
+    pxor        N0, X0
+    pxor        N1, X1
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+    add         VALUES, 16*2
+    jmp         .PADDING
+.TRY8:
+    LOAD8
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    pxor        N0, X0
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+    add         VALUES, 8*2
+    jmp         .PADDING
+.TRY7:
+    LOAD7
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    pxor        N0, X0
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+    add         VALUES, 8*2
+.PADDING:
+    mov         K, LEN
+    add         K, 7
+    and         K, -8
+    shr         K, 3
+    sub         K, DCTSIZE2/8
+    jz          .EPADDING
+    align       16
+.ZEROLOOP:
+    movdqa      XMMWORD [VALUES + 0], ZERO
+    add         VALUES, 8*2
+    inc         K
+    jnz         .ZEROLOOP
+.EPADDING:
+    sub         VALUES, DCTSIZE2*2
+
+    REDUCE0
+
+    movdqa      ZERO, XMMWORD [rbp - 16]
+    uncollect_args 6
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+%undef ZERO
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef LUT
+%undef T0
+%undef T0d
+%undef T1
+%undef T1d
+%undef BLOCK
+%undef VALUES
+%undef LEN
+%undef LENEND
+
+;
+; Prepare data for jsimd_encode_mcu_AC_refine().
+;
+; GLOBAL(int)
+; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
+;                                         const int *jpeg_natural_order_start,
+;                                         int Sl, int Al, JCOEF *absvalues,
+;                                         size_t *bits)
+;
+; r10 = const JCOEF *block
+; r11 = const int *jpeg_natural_order_start
+; r12 = int Sl
+; r13 = int Al
+; r14 = JCOEF *values
+; r15 = size_t *bits
+
+%define ZERO    xmm9
+%define ONE     xmm5
+%define X0      xmm0
+%define X1      xmm1
+%define N0      xmm2
+%define N1      xmm3
+%define AL      xmm4
+%define K       eax
+%define KK      r9d
+%define EOB     r8d
+%define SIGN    rdi
+%define LUT     r11
+%define T0      rcx
+%define T0d     ecx
+%define T1      rdx
+%define T1d     edx
+%define BLOCK   r10
+%define VALUES  r14
+%define LEN     r12d
+%define LENEND  r13d
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [rbp - 16]
+    collect_args 6
+
+    movdqa      XMMWORD [rbp - 16], ZERO
+
+    xor         SIGN, SIGN
+    xor         EOB, EOB
+    xor         KK, KK
+    movd        AL, r13d
+    pxor        ZERO, ZERO
+    pcmpeqw     ONE, ONE
+    psrlw       ONE, 15
+    mov         K, LEN
+    mov         LENEND, LEN
+    and         K, -16
+    and         LENEND, 7
+    shr         K, 4
+    jz          .ELOOPR16
+.BLOOPR16:
+    LOAD16
+    pcmpgtw     N0, X0
+    pcmpgtw     N1, X1
+    paddw       X0, N0
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL
+    psrlw       X1, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    pcmpeqw     X0, ONE
+    pcmpeqw     X1, ONE
+    packsswb    N0, N1
+    packsswb    X0, X1
+    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
+    shr         SIGN, 16                ; make room for sizebits
+    shl         T0, 48
+    or          SIGN, T0
+    bsr         T1d, T1d                ;  idx = 16 - (__builtin_clz(idx)>>1);
+    jz          .CONTINUER16            ; if (idx) {
+    mov         EOB, KK
+    add         EOB, T1d                ; EOB = k + idx;
+.CONTINUER16:
+    add         VALUES, 16*2
+    add         LUT, 16*SIZEOF_INT
+    add         KK, 16
+    dec         K
+    jnz         .BLOOPR16
+    test        LEN, 15
+    je          .PADDINGR
+.ELOOPR16:
+    test        LEN, 8
+    jz          .TRYR7
+    test        LEN, 7
+    jz          .TRYR8
+
+    LOAD15
+    pcmpgtw     N0, X0
+    pcmpgtw     N1, X1
+    paddw       X0, N0
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL
+    psrlw       X1, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    pcmpeqw     X0, ONE
+    pcmpeqw     X1, ONE
+    packsswb    N0, N1
+    packsswb    X0, X1
+    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
+    shr         SIGN, 16                ; make room for sizebits
+    shl         T0, 48
+    or          SIGN, T0
+    bsr         T1d, T1d                ;  idx = 16 - (__builtin_clz(idx)>>1);
+    jz          .CONTINUER15            ; if (idx) {
+    mov         EOB, KK
+    add         EOB, T1d                ; EOB = k + idx;
+.CONTINUER15:
+    add         VALUES, 16*2
+    jmp         .PADDINGR
+.TRYR8:
+    LOAD8
+
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    pcmpeqw     X0, ONE
+    packsswb    N0, ZERO
+    packsswb    X0, ZERO
+    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
+    shr         SIGN, 8                 ; make room for sizebits
+    shl         T0, 56
+    or          SIGN, T0
+    bsr         T1d, T1d                ;  idx = 16 - (__builtin_clz(idx)>>1);
+    jz          .CONTINUER8             ; if (idx) {
+    mov         EOB, KK
+    add         EOB, T1d                ; EOB = k + idx;
+.CONTINUER8:
+    add         VALUES, 8*2
+    jmp         .PADDINGR
+.TRYR7:
+    LOAD7
+
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    pcmpeqw     X0, ONE
+    packsswb    N0, ZERO
+    packsswb    X0, ZERO
+    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
+    shr         SIGN, 8                 ; make room for sizebits
+    shl         T0, 56
+    or          SIGN, T0
+    bsr         T1d, T1d                ; idx = 16 - (__builtin_clz(idx)>>1);
+    jz          .CONTINUER7             ; if (idx) {
+    mov         EOB, KK
+    add         EOB, T1d                ; EOB = k + idx;
+.CONTINUER7:
+    add         VALUES, 8*2
+.PADDINGR:
+    mov         K, LEN
+    add         K, 7
+    and         K, -8
+    shr         K, 3
+    sub         K, DCTSIZE2/8
+    jz          .EPADDINGR
+    align       16
+.ZEROLOOPR:
+    movdqa      XMMWORD [VALUES + 0], ZERO
+    shr         SIGN, 8
+    add         VALUES, 8*2
+    inc         K
+    jnz         .ZEROLOOPR
+.EPADDINGR:
+    not         SIGN
+    sub         VALUES, DCTSIZE2*2
+    mov         MMWORD [r15+SIZEOF_MMWORD], SIGN
+
+    REDUCE0
+
+    mov         eax, EOB
+    movdqa      ZERO, XMMWORD [rbp - 16]
+    uncollect_args 6
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+%undef ZERO
+%undef ONE
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef KK
+%undef EOB
+%undef SIGN
+%undef LUT
+%undef T0
+%undef T0d
+%undef T1
+%undef T1d
+%undef BLOCK
+%undef VALUES
+%undef LEN
+%undef LENEND
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcsample-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcsample-avx2.asm
new file mode 100644
index 0000000000..b32527aebe
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcsample-avx2.asm
@@ -0,0 +1,367 @@
+;
+; jcsample.asm - downsampling (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)
+
+EXTN(jsimd_h2v1_downsample_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 6
+
+    mov         ecx, r13d
+    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
+    jz          near .return
+
+    mov         edx, r10d
+
+    ; -- expand_right_edge
+
+    push        rcx
+    shl         rcx, 1                  ; output_cols * 2
+    sub         rcx, rdx
+    jle         short .expand_end
+
+    mov         rax, r11
+    test        rax, rax
+    jle         short .expand_end
+
+    cld
+    mov         rsi, r14                ; input_data
+.expandloop:
+    push        rax
+    push        rcx
+
+    mov         rdip, JSAMPROW [rsi]
+    add         rdi, rdx
+    mov         al, JSAMPLE [rdi-1]
+
+    rep stosb
+
+    pop         rcx
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW
+    dec         rax
+    jg          short .expandloop
+
+.expand_end:
+    pop         rcx                     ; output_cols
+
+    ; -- h2v1_downsample
+
+    mov         eax, r12d               ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         rdx, 0x00010000         ; bias pattern
+    vmovd       xmm7, edx
+    vpshufd     xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
+    vpcmpeqw    ymm6, ymm6, ymm6
+    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         rsi, r14                ; input_data
+    mov         rdi, r15                ; output_data
+.rowloop:
+    push        rcx
+    push        rdi
+    push        rsi
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         short .columnloop
+
+.columnloop_r24:
+    ; rcx can possibly be 8, 16, 24
+    cmp         rcx, 24
+    jne         .columnloop_r16
+    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     xmm1, XMMWORD [rsi+1*SIZEOF_YMMWORD]
+    mov         rcx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop_r16:
+    cmp         rcx, 16
+    jne         .columnloop_r8
+    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vpxor       ymm1, ymm1, ymm1
+    mov         rcx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop_r8:
+    vmovdqu     xmm0, XMMWORD[rsi+0*SIZEOF_YMMWORD]
+    vpxor       ymm1, ymm1, ymm1
+    mov         rcx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop:
+    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+
+.downsample:
+    vpsrlw      ymm2, ymm0, BYTE_BIT
+    vpand       ymm0, ymm0, ymm6
+    vpsrlw      ymm3, ymm1, BYTE_BIT
+    vpand       ymm1, ymm1, ymm6
+
+    vpaddw      ymm0, ymm0, ymm2
+    vpaddw      ymm1, ymm1, ymm3
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm1, ymm1, ymm7
+    vpsrlw      ymm0, ymm0, 1
+    vpsrlw      ymm1, ymm1, 1
+
+    vpackuswb   ymm0, ymm0, ymm1
+    vpermq      ymm0, ymm0, 0xd8
+
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+
+    sub         rcx, byte SIZEOF_YMMWORD    ; outcol
+    add         rsi, byte 2*SIZEOF_YMMWORD  ; inptr
+    add         rdi, byte 1*SIZEOF_YMMWORD  ; outptr
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         short .columnloop
+    test        rcx, rcx
+    jnz         near .columnloop_r24
+
+    pop         rsi
+    pop         rdi
+    pop         rcx
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         rax                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    uncollect_args 6
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)
+
+EXTN(jsimd_h2v2_downsample_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 6
+
+    mov         ecx, r13d
+    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
+    jz          near .return
+
+    mov         edx, r10d
+
+    ; -- expand_right_edge
+
+    push        rcx
+    shl         rcx, 1                  ; output_cols * 2
+    sub         rcx, rdx
+    jle         short .expand_end
+
+    mov         rax, r11
+    test        rax, rax
+    jle         short .expand_end
+
+    cld
+    mov         rsi, r14                ; input_data
+.expandloop:
+    push        rax
+    push        rcx
+
+    mov         rdip, JSAMPROW [rsi]
+    add         rdi, rdx
+    mov         al, JSAMPLE [rdi-1]
+
+    rep stosb
+
+    pop         rcx
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW
+    dec         rax
+    jg          short .expandloop
+
+.expand_end:
+    pop         rcx                     ; output_cols
+
+    ; -- h2v2_downsample
+
+    mov         eax, r12d               ; rowctr
+    test        rax, rax
+    jle         near .return
+
+    mov         rdx, 0x00020001         ; bias pattern
+    vmovd       xmm7, edx
+    vpcmpeqw    ymm6, ymm6, ymm6
+    vpshufd     xmm7, xmm7, 0x00        ; ymm7={1, 2, 1, 2, 1, 2, 1, 2}
+    vperm2i128  ymm7, ymm7, ymm7, 0
+    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         rsi, r14                ; input_data
+    mov         rdi, r15                ; output_data
+.rowloop:
+    push        rcx
+    push        rdi
+    push        rsi
+
+    mov         rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
+    mov         rdip, JSAMPROW [rdi]                    ; outptr
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         short .columnloop
+
+.columnloop_r24:
+    cmp         rcx, 24
+    jne         .columnloop_r16
+    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     xmm2, XMMWORD [rdx+1*SIZEOF_YMMWORD]
+    vmovdqu     xmm3, XMMWORD [rsi+1*SIZEOF_YMMWORD]
+    mov         rcx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop_r16:
+    cmp         rcx, 16
+    jne         .columnloop_r8
+    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vpxor       ymm2, ymm2, ymm2
+    vpxor       ymm3, ymm3, ymm3
+    mov         rcx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop_r8:
+    vmovdqu     xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+    vmovdqu     xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    vpxor       ymm2, ymm2, ymm2
+    vpxor       ymm3, ymm3, ymm3
+    mov         rcx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop:
+    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm2, YMMWORD [rdx+1*SIZEOF_YMMWORD]
+    vmovdqu     ymm3, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+
+.downsample:
+    vpand       ymm4, ymm0, ymm6
+    vpsrlw      ymm0, ymm0, BYTE_BIT
+    vpand       ymm5, ymm1, ymm6
+    vpsrlw      ymm1, ymm1, BYTE_BIT
+    vpaddw      ymm0, ymm0, ymm4
+    vpaddw      ymm1, ymm1, ymm5
+
+    vpand       ymm4, ymm2, ymm6
+    vpsrlw      ymm2, ymm2, BYTE_BIT
+    vpand       ymm5, ymm3, ymm6
+    vpsrlw      ymm3, ymm3, BYTE_BIT
+    vpaddw      ymm2, ymm2, ymm4
+    vpaddw      ymm3, ymm3, ymm5
+
+    vpaddw      ymm0, ymm0, ymm1
+    vpaddw      ymm2, ymm2, ymm3
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm2, ymm2, ymm7
+    vpsrlw      ymm0, ymm0, 2
+    vpsrlw      ymm2, ymm2, 2
+
+    vpackuswb   ymm0, ymm0, ymm2
+    vpermq      ymm0, ymm0, 0xd8
+
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+
+    sub         rcx, byte SIZEOF_YMMWORD    ; outcol
+    add         rdx, byte 2*SIZEOF_YMMWORD  ; inptr0
+    add         rsi, byte 2*SIZEOF_YMMWORD  ; inptr1
+    add         rdi, byte 1*SIZEOF_YMMWORD  ; outptr
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    test        rcx, rcx
+    jnz         near .columnloop_r24
+
+    pop         rsi
+    pop         rdi
+    pop         rcx
+
+    add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
+    dec         rax                          ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    uncollect_args 6
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcsample-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcsample-sse2.asm
new file mode 100644
index 0000000000..2fcfe4567a
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcsample-sse2.asm
@@ -0,0 +1,330 @@
+;
+; jcsample.asm - downsampling (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)
+
+EXTN(jsimd_h2v1_downsample_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 6
+
+    mov         ecx, r13d
+    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
+    jz          near .return
+
+    mov         edx, r10d
+
+    ; -- expand_right_edge
+
+    push        rcx
+    shl         rcx, 1                  ; output_cols * 2
+    sub         rcx, rdx
+    jle         short .expand_end
+
+    mov         rax, r11
+    test        rax, rax
+    jle         short .expand_end
+
+    cld
+    mov         rsi, r14                ; input_data
+.expandloop:
+    push        rax
+    push        rcx
+
+    mov         rdip, JSAMPROW [rsi]
+    add         rdi, rdx
+    mov         al, JSAMPLE [rdi-1]
+
+    rep stosb
+
+    pop         rcx
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW
+    dec         rax
+    jg          short .expandloop
+
+.expand_end:
+    pop         rcx                     ; output_cols
+
+    ; -- h2v1_downsample
+
+    mov         eax, r12d               ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         rdx, 0x00010000         ; bias pattern
+    movd        xmm7, edx
+    pcmpeqw     xmm6, xmm6
+    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         rsi, r14                ; input_data
+    mov         rdi, r15                ; output_data
+.rowloop:
+    push        rcx
+    push        rdi
+    push        rsi
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         short .columnloop
+
+.columnloop_r8:
+    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    pxor        xmm1, xmm1
+    mov         rcx, SIZEOF_XMMWORD
+    jmp         short .downsample
+
+.columnloop:
+    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqa      xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm1
+
+    pand        xmm0, xmm6
+    psrlw       xmm2, BYTE_BIT
+    pand        xmm1, xmm6
+    psrlw       xmm3, BYTE_BIT
+
+    paddw       xmm0, xmm2
+    paddw       xmm1, xmm3
+    paddw       xmm0, xmm7
+    paddw       xmm1, xmm7
+    psrlw       xmm0, 1
+    psrlw       xmm1, 1
+
+    packuswb    xmm0, xmm1
+
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+
+    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
+    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
+    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         short .columnloop
+    test        rcx, rcx
+    jnz         short .columnloop_r8
+
+    pop         rsi
+    pop         rdi
+    pop         rcx
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         rax                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    uncollect_args 6
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)
+
+EXTN(jsimd_h2v2_downsample_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 6
+
+    mov         ecx, r13d
+    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
+    jz          near .return
+
+    mov         edx, r10d
+
+    ; -- expand_right_edge
+
+    push        rcx
+    shl         rcx, 1                  ; output_cols * 2
+    sub         rcx, rdx
+    jle         short .expand_end
+
+    mov         rax, r11
+    test        rax, rax
+    jle         short .expand_end
+
+    cld
+    mov         rsi, r14                ; input_data
+.expandloop:
+    push        rax
+    push        rcx
+
+    mov         rdip, JSAMPROW [rsi]
+    add         rdi, rdx
+    mov         al, JSAMPLE [rdi-1]
+
+    rep stosb
+
+    pop         rcx
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW
+    dec         rax
+    jg          short .expandloop
+
+.expand_end:
+    pop         rcx                     ; output_cols
+
+    ; -- h2v2_downsample
+
+    mov         eax, r12d               ; rowctr
+    test        rax, rax
+    jle         near .return
+
+    mov         rdx, 0x00020001         ; bias pattern
+    movd        xmm7, edx
+    pcmpeqw     xmm6, xmm6
+    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         rsi, r14                ; input_data
+    mov         rdi, r15                ; output_data
+.rowloop:
+    push        rcx
+    push        rdi
+    push        rsi
+
+    mov         rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
+    mov         rdip, JSAMPROW [rdi]                    ; outptr
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         short .columnloop
+
+.columnloop_r8:
+    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    pxor        xmm2, xmm2
+    pxor        xmm3, xmm3
+    mov         rcx, SIZEOF_XMMWORD
+    jmp         short .downsample
+
+.columnloop:
+    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqa      xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+    movdqa      xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+    movdqa      xmm4, xmm0
+    movdqa      xmm5, xmm1
+    pand        xmm0, xmm6
+    psrlw       xmm4, BYTE_BIT
+    pand        xmm1, xmm6
+    psrlw       xmm5, BYTE_BIT
+    paddw       xmm0, xmm4
+    paddw       xmm1, xmm5
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm5, xmm3
+    pand        xmm2, xmm6
+    psrlw       xmm4, BYTE_BIT
+    pand        xmm3, xmm6
+    psrlw       xmm5, BYTE_BIT
+    paddw       xmm2, xmm4
+    paddw       xmm3, xmm5
+
+    paddw       xmm0, xmm1
+    paddw       xmm2, xmm3
+    paddw       xmm0, xmm7
+    paddw       xmm2, xmm7
+    psrlw       xmm0, 2
+    psrlw       xmm2, 2
+
+    packuswb    xmm0, xmm2
+
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+
+    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
+    add         rdx, byte 2*SIZEOF_XMMWORD  ; inptr0
+    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr1
+    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+    test        rcx, rcx
+    jnz         near .columnloop_r8
+
+    pop         rsi
+    pop         rdi
+    pop         rcx
+
+    add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
+    dec         rax                          ; rowctr
+    jg          near .rowloop
+
+.return:
+    uncollect_args 6
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolext-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolext-avx2.asm
new file mode 100644
index 0000000000..2370fda642
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolext-avx2.asm
@@ -0,0 +1,496 @@
+;
+; jdcolext.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_avx2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+;                            JDIMENSION input_row, JSAMPARRAY output_buf,
+;                            int num_rows)
+;
+
+; r10d = JDIMENSION out_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION input_row
+; r13 = JSAMPARRAY output_buf
+; r14d = int num_rows
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
+%define WK_NUM  2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_avx2)
+
+EXTN(jsimd_ycc_rgb_convert_avx2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 5
+    push        rbx
+
+    mov         ecx, r10d               ; num_cols
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx
+
+    mov         rdi, r11
+    mov         ecx, r12d
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+    pop         rcx
+
+    mov         rdi, r13
+    mov         eax, r14d
+    test        rax, rax
+    jle         near .return
+.rowloop:
+    push        rax
+    push        rdi
+    push        rdx
+    push        rbx
+    push        rsi
+    push        rcx                     ; col
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr0
+    mov         rbxp, JSAMPROW [rbx]    ; inptr1
+    mov         rdxp, JSAMPROW [rdx]    ; inptr2
+    mov         rdip, JSAMPROW [rdi]    ; outptr
+.columnloop:
+
+    vmovdqu     ymm5, YMMWORD [rbx]     ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+    vmovdqu     ymm1, YMMWORD [rdx]     ; ymm1=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpcmpeqw    ymm0, ymm0, ymm0
+    vpcmpeqw    ymm7, ymm7, ymm7
+    vpsrlw      ymm0, ymm0, BYTE_BIT    ; ymm0={0xFF 0x00 0xFF 0x00 ..}
+    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    vpand       ymm4, ymm0, ymm5        ; ymm4=Cb(02468ACEGIKMOQSU)=CbE
+    vpsrlw      ymm5, ymm5, BYTE_BIT    ; ymm5=Cb(13579BDFHJLNPRTV)=CbO
+    vpand       ymm0, ymm0, ymm1        ; ymm0=Cr(02468ACEGIKMOQSU)=CrE
+    vpsrlw      ymm1, ymm1, BYTE_BIT    ; ymm1=Cr(13579BDFHJLNPRTV)=CrO
+
+    vpaddw      ymm2, ymm4, ymm7
+    vpaddw      ymm3, ymm5, ymm7
+    vpaddw      ymm6, ymm0, ymm7
+    vpaddw      ymm7, ymm1, ymm7
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    vpaddw      ymm4, ymm2, ymm2             ; ymm4=2*CbE
+    vpaddw      ymm5, ymm3, ymm3             ; ymm5=2*CbO
+    vpaddw      ymm0, ymm6, ymm6             ; ymm0=2*CrE
+    vpaddw      ymm1, ymm7, ymm7             ; ymm1=2*CrO
+
+    vpmulhw     ymm4, ymm4, [rel PW_MF0228]  ; ymm4=(2*CbE * -FIX(0.22800))
+    vpmulhw     ymm5, ymm5, [rel PW_MF0228]  ; ymm5=(2*CbO * -FIX(0.22800))
+    vpmulhw     ymm0, ymm0, [rel PW_F0402]   ; ymm0=(2*CrE * FIX(0.40200))
+    vpmulhw     ymm1, ymm1, [rel PW_F0402]   ; ymm1=(2*CrO * FIX(0.40200))
+
+    vpaddw      ymm4, ymm4, [rel PW_ONE]
+    vpaddw      ymm5, ymm5, [rel PW_ONE]
+    vpsraw      ymm4, ymm4, 1                ; ymm4=(CbE * -FIX(0.22800))
+    vpsraw      ymm5, ymm5, 1                ; ymm5=(CbO * -FIX(0.22800))
+    vpaddw      ymm0, ymm0, [rel PW_ONE]
+    vpaddw      ymm1, ymm1, [rel PW_ONE]
+    vpsraw      ymm0, ymm0, 1                ; ymm0=(CrE * FIX(0.40200))
+    vpsraw      ymm1, ymm1, 1                ; ymm1=(CrO * FIX(0.40200))
+
+    vpaddw      ymm4, ymm4, ymm2
+    vpaddw      ymm5, ymm5, ymm3
+    vpaddw      ymm4, ymm4, ymm2             ; ymm4=(CbE * FIX(1.77200))=(B-Y)E
+    vpaddw      ymm5, ymm5, ymm3             ; ymm5=(CbO * FIX(1.77200))=(B-Y)O
+    vpaddw      ymm0, ymm0, ymm6             ; ymm0=(CrE * FIX(1.40200))=(R-Y)E
+    vpaddw      ymm1, ymm1, ymm7             ; ymm1=(CrO * FIX(1.40200))=(R-Y)O
+
+    vmovdqa     YMMWORD [wk(0)], ymm4        ; wk(0)=(B-Y)E
+    vmovdqa     YMMWORD [wk(1)], ymm5        ; wk(1)=(B-Y)O
+
+    vpunpckhwd  ymm4, ymm2, ymm6
+    vpunpcklwd  ymm2, ymm2, ymm6
+    vpmaddwd    ymm2, ymm2, [rel PW_MF0344_F0285]
+    vpmaddwd    ymm4, ymm4, [rel PW_MF0344_F0285]
+    vpunpckhwd  ymm5, ymm3, ymm7
+    vpunpcklwd  ymm3, ymm3, ymm7
+    vpmaddwd    ymm3, ymm3, [rel PW_MF0344_F0285]
+    vpmaddwd    ymm5, ymm5, [rel PW_MF0344_F0285]
+
+    vpaddd      ymm2, ymm2, [rel PD_ONEHALF]
+    vpaddd      ymm4, ymm4, [rel PD_ONEHALF]
+    vpsrad      ymm2, ymm2, SCALEBITS
+    vpsrad      ymm4, ymm4, SCALEBITS
+    vpaddd      ymm3, ymm3, [rel PD_ONEHALF]
+    vpaddd      ymm5, ymm5, [rel PD_ONEHALF]
+    vpsrad      ymm3, ymm3, SCALEBITS
+    vpsrad      ymm5, ymm5, SCALEBITS
+
+    vpackssdw   ymm2, ymm2, ymm4             ; ymm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+    vpackssdw   ymm3, ymm3, ymm5             ; ymm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+    vpsubw      ymm2, ymm2, ymm6             ; ymm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+    vpsubw      ymm3, ymm3, ymm7             ; ymm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+    vmovdqu     ymm5, YMMWORD [rsi]          ; ymm5=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpcmpeqw    ymm4, ymm4, ymm4
+    vpsrlw      ymm4, ymm4, BYTE_BIT         ; ymm4={0xFF 0x00 0xFF 0x00 ..}
+    vpand       ymm4, ymm4, ymm5             ; ymm4=Y(02468ACEGIKMOQSU)=YE
+    vpsrlw      ymm5, ymm5, BYTE_BIT         ; ymm5=Y(13579BDFHJLNPRTV)=YO
+
+    vpaddw      ymm0, ymm0, ymm4             ; ymm0=((R-Y)E+YE)=RE=R(02468ACEGIKMOQSU)
+    vpaddw      ymm1, ymm1, ymm5             ; ymm1=((R-Y)O+YO)=RO=R(13579BDFHJLNPRTV)
+    vpackuswb   ymm0, ymm0, ymm0             ; ymm0=R(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm1, ymm1, ymm1             ; ymm1=R(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm2, ymm2, ymm4             ; ymm2=((G-Y)E+YE)=GE=G(02468ACEGIKMOQSU)
+    vpaddw      ymm3, ymm3, ymm5             ; ymm3=((G-Y)O+YO)=GO=G(13579BDFHJLNPRTV)
+    vpackuswb   ymm2, ymm2, ymm2             ; ymm2=G(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm3, ymm3, ymm3             ; ymm3=G(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm4, ymm4, YMMWORD [wk(0)]  ; ymm4=(YE+(B-Y)E)=BE=B(02468ACEGIKMOQSU)
+    vpaddw      ymm5, ymm5, YMMWORD [wk(1)]  ; ymm5=(YO+(B-Y)O)=BO=B(13579BDFHJLNPRTV)
+    vpackuswb   ymm4, ymm4, ymm4             ; ymm4=B(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm5, ymm5, ymm5             ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+    ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmB        ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+                                        ;       2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+    vpunpcklbw  ymmD, ymmD, ymmF        ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+                                        ;       1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+    vpsrldq     ymmH, ymmA, 2           ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+                                        ;       0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+    vpunpckhwd  ymmG, ymmA, ymmE        ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+                                        ;       0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+                                        ;       0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+    vpsrldq     ymmE, ymmE, 2           ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+                                        ;       2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+    vpsrldq     ymmB, ymmD, 2           ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+                                        ;       1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+    vpunpckhwd  ymmC, ymmD, ymmH        ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+                                        ;       1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+    vpunpcklwd  ymmD, ymmD, ymmH        ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+                                        ;       1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+    vpunpckhwd  ymmF, ymmE, ymmB        ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+                                        ;       2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+    vpunpcklwd  ymmE, ymmE, ymmB        ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+                                        ;       2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+    vpshufd     ymmH, ymmA, 0x4E        ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+                                        ;       0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+    vpunpckldq  ymmA, ymmA, ymmD        ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+    vpunpckhdq  ymmD, ymmD, ymmE        ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+    vpunpckldq  ymmE, ymmE, ymmH        ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+                                        ;       2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+    vpshufd     ymmH, ymmG, 0x4E        ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+                                        ;       0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+    vpunpckldq  ymmG, ymmG, ymmC        ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+                                        ;       0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+    vpunpckhdq  ymmC, ymmC, ymmF        ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+                                        ;       1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+    vpunpckldq  ymmF, ymmF, ymmH        ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+    vpunpcklqdq ymmH, ymmA, ymmE        ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vpunpcklqdq ymmG, ymmD, ymmG        ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+    vpunpcklqdq ymmC, ymmF, ymmC        ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vperm2i128  ymmA, ymmH, ymmG, 0x20  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vperm2i128  ymmD, ymmC, ymmH, 0x30  ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vperm2i128  ymmF, ymmG, ymmC, 0x31  ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        rdi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD
+    jz          near .nextrow
+
+    add         rsi, byte SIZEOF_YMMWORD  ; inptr0
+    add         rbx, byte SIZEOF_YMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st64:
+    lea         rcx, [rcx+rcx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         rcx, byte 2*SIZEOF_YMMWORD
+    jb          short .column_st32
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmF
+    sub         rcx, byte 2*SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st32:
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st31
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    add         rdi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         rcx, byte SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st31:
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    sub         rcx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    vmovq       XMM_MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_MMWORD
+    sub         rcx, byte SIZEOF_MMWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    vmovd       XMM_DWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_DWORD
+    sub         rcx, byte SIZEOF_DWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of rax to the output when it has enough
+    ; space.
+    vmovd       eax, xmmA
+    cmp         rcx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [rdi], ax
+    add         rdi, byte SIZEOF_WORD
+    sub         rcx, byte SIZEOF_WORD
+    shr         rax, 16
+.column_st1:
+    ; Store the lower 1 byte of rax to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .nextrow
+    mov         byte [rdi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    vpcmpeqb    ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpcmpeqb    ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+    vpxor       ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpxor       ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+    ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmG        ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+                                        ;       2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+    vpunpcklbw  ymmB, ymmB, ymmD        ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+                                        ;       0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+    vpunpcklbw  ymmF, ymmF, ymmH        ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+                                        ;       2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+    vpunpckhwd  ymmC, ymmA, ymmE        ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+                                        ;       0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+                                        ;       0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+    vpunpckhwd  ymmG, ymmB, ymmF        ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+                                        ;       0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+    vpunpcklwd  ymmB, ymmB, ymmF        ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+                                        ;       0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+    vpunpckhdq  ymmE, ymmA, ymmB        ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vpunpckldq  ymmB, ymmA, ymmB        ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vpunpckhdq  ymmF, ymmC, ymmG        ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+    vpunpckldq  ymmG, ymmC, ymmG        ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+    vperm2i128  ymmA, ymmB, ymmE, 0x20  ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    vperm2i128  ymmD, ymmG, ymmF, 0x20  ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    vperm2i128  ymmC, ymmB, ymmE, 0x31  ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vperm2i128  ymmH, ymmG, ymmF, 0x31  ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        rdi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+    vmovntdq    YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+    vmovdqu     YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+    add         rdi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD
+    jz          near .nextrow
+
+    add         rsi, byte SIZEOF_YMMWORD  ; inptr0
+    add         rbx, byte SIZEOF_YMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st64:
+    cmp         rcx, byte SIZEOF_YMMWORD/2
+    jb          short .column_st32
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmC
+    vmovdqa     ymmD, ymmH
+    sub         rcx, byte SIZEOF_YMMWORD/2
+.column_st32:
+    cmp         rcx, byte SIZEOF_YMMWORD/4
+    jb          short .column_st16
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    add         rdi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         rcx, byte SIZEOF_YMMWORD/4
+.column_st16:
+    cmp         rcx, byte SIZEOF_YMMWORD/8
+    jb          short .column_st15
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD/8
+.column_st15:
+    ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_YMMWORD/16
+    jb          short .column_st7
+    vmovq       MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_YMMWORD/16*4
+    sub         rcx, byte SIZEOF_YMMWORD/16
+    vpsrldq     xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+    ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .nextrow
+    vmovd       XMM_DWORD [rdi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.nextrow:
+    pop         rcx
+    pop         rsi
+    pop         rbx
+    pop         rdx
+    pop         rdi
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW
+    add         rbx, byte SIZEOF_JSAMPROW
+    add         rdx, byte SIZEOF_JSAMPROW
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_buf
+    dec         rax                        ; num_rows
+    jg          near .rowloop
+
+    sfence                              ; flush the write buffer
+
+.return:
+    pop         rbx
+    vzeroupper
+    uncollect_args 5
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolext-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolext-sse2.asm
new file mode 100644
index 0000000000..e07c8d7518
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolext-sse2.asm
@@ -0,0 +1,439 @@
+;
+; jdcolext.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+;                            JDIMENSION input_row, JSAMPARRAY output_buf,
+;                            int num_rows)
+;
+
+; r10d = JDIMENSION out_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION input_row
+; r13 = JSAMPARRAY output_buf
+; r14d = int num_rows
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2)
+
+EXTN(jsimd_ycc_rgb_convert_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 5
+    push        rbx
+
+    mov         ecx, r10d               ; num_cols
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx
+
+    mov         rdi, r11
+    mov         ecx, r12d
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+    pop         rcx
+
+    mov         rdi, r13
+    mov         eax, r14d
+    test        rax, rax
+    jle         near .return
+.rowloop:
+    push        rax
+    push        rdi
+    push        rdx
+    push        rbx
+    push        rsi
+    push        rcx                     ; col
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr0
+    mov         rbxp, JSAMPROW [rbx]    ; inptr1
+    mov         rdxp, JSAMPROW [rdx]    ; inptr2
+    mov         rdip, JSAMPROW [rdi]    ; outptr
+.columnloop:
+
+    movdqa      xmm5, XMMWORD [rbx]     ; xmm5=Cb(0123456789ABCDEF)
+    movdqa      xmm1, XMMWORD [rdx]     ; xmm1=Cr(0123456789ABCDEF)
+
+    pcmpeqw     xmm4, xmm4
+    pcmpeqw     xmm7, xmm7
+    psrlw       xmm4, BYTE_BIT
+    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+    movdqa      xmm0, xmm4              ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+    pand        xmm4, xmm5              ; xmm4=Cb(02468ACE)=CbE
+    psrlw       xmm5, BYTE_BIT          ; xmm5=Cb(13579BDF)=CbO
+    pand        xmm0, xmm1              ; xmm0=Cr(02468ACE)=CrE
+    psrlw       xmm1, BYTE_BIT          ; xmm1=Cr(13579BDF)=CrO
+
+    paddw       xmm4, xmm7
+    paddw       xmm5, xmm7
+    paddw       xmm0, xmm7
+    paddw       xmm1, xmm7
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    movdqa      xmm2, xmm4              ; xmm2=CbE
+    movdqa      xmm3, xmm5              ; xmm3=CbO
+    paddw       xmm4, xmm4              ; xmm4=2*CbE
+    paddw       xmm5, xmm5              ; xmm5=2*CbO
+    movdqa      xmm6, xmm0              ; xmm6=CrE
+    movdqa      xmm7, xmm1              ; xmm7=CrO
+    paddw       xmm0, xmm0              ; xmm0=2*CrE
+    paddw       xmm1, xmm1              ; xmm1=2*CrO
+
+    pmulhw      xmm4, [rel PW_MF0228]   ; xmm4=(2*CbE * -FIX(0.22800))
+    pmulhw      xmm5, [rel PW_MF0228]   ; xmm5=(2*CbO * -FIX(0.22800))
+    pmulhw      xmm0, [rel PW_F0402]    ; xmm0=(2*CrE * FIX(0.40200))
+    pmulhw      xmm1, [rel PW_F0402]    ; xmm1=(2*CrO * FIX(0.40200))
+
+    paddw       xmm4, [rel PW_ONE]
+    paddw       xmm5, [rel PW_ONE]
+    psraw       xmm4, 1                 ; xmm4=(CbE * -FIX(0.22800))
+    psraw       xmm5, 1                 ; xmm5=(CbO * -FIX(0.22800))
+    paddw       xmm0, [rel PW_ONE]
+    paddw       xmm1, [rel PW_ONE]
+    psraw       xmm0, 1                 ; xmm0=(CrE * FIX(0.40200))
+    psraw       xmm1, 1                 ; xmm1=(CrO * FIX(0.40200))
+
+    paddw       xmm4, xmm2
+    paddw       xmm5, xmm3
+    paddw       xmm4, xmm2              ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+    paddw       xmm5, xmm3              ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+    paddw       xmm0, xmm6              ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+    paddw       xmm1, xmm7              ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=(B-Y)E
+    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(B-Y)O
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm5, xmm3
+    punpcklwd   xmm2, xmm6
+    punpckhwd   xmm4, xmm6
+    pmaddwd     xmm2, [rel PW_MF0344_F0285]
+    pmaddwd     xmm4, [rel PW_MF0344_F0285]
+    punpcklwd   xmm3, xmm7
+    punpckhwd   xmm5, xmm7
+    pmaddwd     xmm3, [rel PW_MF0344_F0285]
+    pmaddwd     xmm5, [rel PW_MF0344_F0285]
+
+    paddd       xmm2, [rel PD_ONEHALF]
+    paddd       xmm4, [rel PD_ONEHALF]
+    psrad       xmm2, SCALEBITS
+    psrad       xmm4, SCALEBITS
+    paddd       xmm3, [rel PD_ONEHALF]
+    paddd       xmm5, [rel PD_ONEHALF]
+    psrad       xmm3, SCALEBITS
+    psrad       xmm5, SCALEBITS
+
+    packssdw    xmm2, xmm4              ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+    packssdw    xmm3, xmm5              ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+    psubw       xmm2, xmm6              ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+    psubw       xmm3, xmm7              ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+    movdqa      xmm5, XMMWORD [rsi]     ; xmm5=Y(0123456789ABCDEF)
+
+    pcmpeqw     xmm4, xmm4
+    psrlw       xmm4, BYTE_BIT          ; xmm4={0xFF 0x00 0xFF 0x00 ..}
+    pand        xmm4, xmm5              ; xmm4=Y(02468ACE)=YE
+    psrlw       xmm5, BYTE_BIT          ; xmm5=Y(13579BDF)=YO
+
+    paddw       xmm0, xmm4              ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+    paddw       xmm1, xmm5              ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+    packuswb    xmm0, xmm0              ; xmm0=R(02468ACE********)
+    packuswb    xmm1, xmm1              ; xmm1=R(13579BDF********)
+
+    paddw       xmm2, xmm4              ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+    paddw       xmm3, xmm5              ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+    packuswb    xmm2, xmm2              ; xmm2=G(02468ACE********)
+    packuswb    xmm3, xmm3              ; xmm3=G(13579BDF********)
+
+    paddw       xmm4, XMMWORD [wk(0)]   ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+    paddw       xmm5, XMMWORD [wk(1)]   ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+    packuswb    xmm4, xmm4              ; xmm4=B(02468ACE********)
+    packuswb    xmm5, xmm5              ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+    punpcklbw   xmmA, xmmC        ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmB        ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+    punpcklbw   xmmD, xmmF        ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+    movdqa      xmmG, xmmA
+    movdqa      xmmH, xmmA
+    punpcklwd   xmmA, xmmE        ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+    punpckhwd   xmmG, xmmE        ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+    psrldq      xmmH, 2           ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+    psrldq      xmmE, 2           ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+    movdqa      xmmC, xmmD
+    movdqa      xmmB, xmmD
+    punpcklwd   xmmD, xmmH        ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+    punpckhwd   xmmC, xmmH        ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+    psrldq      xmmB, 2           ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+    movdqa      xmmF, xmmE
+    punpcklwd   xmmE, xmmB        ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+    punpckhwd   xmmF, xmmB        ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+    pshufd      xmmH, xmmA, 0x4E  ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+    movdqa      xmmB, xmmE
+    punpckldq   xmmA, xmmD        ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+    punpckldq   xmmE, xmmH        ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+    punpckhdq   xmmD, xmmB        ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+    pshufd      xmmH, xmmG, 0x4E  ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+    movdqa      xmmB, xmmF
+    punpckldq   xmmG, xmmC        ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+    punpckldq   xmmF, xmmH        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+    punpckhdq   xmmC, xmmB        ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+    punpcklqdq  xmmA, xmmE        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    punpcklqdq  xmmD, xmmG        ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    punpcklqdq  xmmF, xmmC        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        rdi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_XMMWORD
+    jz          near .nextrow
+
+    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
+    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st32:
+    lea         rcx, [rcx+rcx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         rcx, byte 2*SIZEOF_XMMWORD
+    jb          short .column_st16
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmF
+    sub         rcx, byte 2*SIZEOF_XMMWORD
+    jmp         short .column_st15
+.column_st16:
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         rcx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    movq        XMM_MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_MMWORD
+    sub         rcx, byte SIZEOF_MMWORD
+    psrldq      xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    movd        XMM_DWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_DWORD
+    sub         rcx, byte SIZEOF_DWORD
+    psrldq      xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of rax to the output when it has enough
+    ; space.
+    movd        eax, xmmA
+    cmp         rcx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [rdi], ax
+    add         rdi, byte SIZEOF_WORD
+    sub         rcx, byte SIZEOF_WORD
+    shr         rax, 16
+.column_st1:
+    ; Store the lower 1 byte of rax to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .nextrow
+    mov         byte [rdi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%else
+    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%endif
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+    punpcklbw   xmmA, xmmC  ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+    punpcklbw   xmmB, xmmD  ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+    punpcklbw   xmmF, xmmH  ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+    movdqa      xmmC, xmmA
+    punpcklwd   xmmA, xmmE  ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+    punpckhwd   xmmC, xmmE  ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+    movdqa      xmmG, xmmB
+    punpcklwd   xmmB, xmmF  ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+    punpckhwd   xmmG, xmmF  ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpckldq   xmmA, xmmB  ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    punpckhdq   xmmD, xmmB  ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    movdqa      xmmH, xmmC
+    punpckldq   xmmC, xmmG  ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    punpckhdq   xmmH, xmmG  ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        rdi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+    movntdq     XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+    movdqu      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_XMMWORD
+    jz          near .nextrow
+
+    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
+    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st32:
+    cmp         rcx, byte SIZEOF_XMMWORD/2
+    jb          short .column_st16
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmC
+    movdqa      xmmD, xmmH
+    sub         rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+    cmp         rcx, byte SIZEOF_XMMWORD/4
+    jb          short .column_st15
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+    ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_XMMWORD/8
+    jb          short .column_st7
+    movq        MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_XMMWORD/8*4
+    sub         rcx, byte SIZEOF_XMMWORD/8
+    psrldq      xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+    ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .nextrow
+    movd        XMM_DWORD [rdi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.nextrow:
+    pop         rcx
+    pop         rsi
+    pop         rbx
+    pop         rdx
+    pop         rdi
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW
+    add         rbx, byte SIZEOF_JSAMPROW
+    add         rdx, byte SIZEOF_JSAMPROW
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_buf
+    dec         rax                        ; num_rows
+    jg          near .rowloop
+
+    sfence                              ; flush the write buffer
+
+.return:
+    pop         rbx
+    uncollect_args 5
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolor-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolor-avx2.asm
new file mode 100644
index 0000000000..43de9db04d
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolor-avx2.asm
@@ -0,0 +1,118 @@
+;
+; jdcolor.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_ycc_rgb_convert_avx2)
+
+EXTN(jconst_ycc_rgb_convert_avx2):
+
+PW_F0402        times 16 dw  F_0_402
+PW_MF0228       times 16 dw -F_0_228
+PW_MF0344_F0285 times 8  dw -F_0_344, F_0_285
+PW_ONE          times 16 dw  1
+PD_ONEHALF      times 8  dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extrgb_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extrgbx_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extbgrx_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extxbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extxrgb_convert_avx2
+%include "jdcolext-avx2.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolor-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolor-sse2.asm
new file mode 100644
index 0000000000..b3f1fec07e
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolor-sse2.asm
@@ -0,0 +1,117 @@
+;
+; jdcolor.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_ycc_rgb_convert_sse2)
+
+EXTN(jconst_ycc_rgb_convert_sse2):
+
+PW_F0402        times 8 dw  F_0_402
+PW_MF0228       times 8 dw -F_0_228
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
+PW_ONE          times 8 dw  1
+PD_ONEHALF      times 4 dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extrgb_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extrgbx_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extbgr_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extbgrx_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extxbgr_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extxrgb_convert_sse2
+%include "jdcolext-sse2.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmerge-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmerge-avx2.asm
new file mode 100644
index 0000000000..9515a17013
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmerge-avx2.asm
@@ -0,0 +1,136 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_merged_upsample_avx2)
+
+EXTN(jconst_merged_upsample_avx2):
+
+PW_F0402        times 16 dw  F_0_402
+PW_MF0228       times 16 dw -F_0_228
+PW_MF0344_F0285 times 8  dw -F_0_344, F_0_285
+PW_ONE          times 16 dw  1
+PD_ONEHALF      times 8  dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extrgbx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extrgbx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extbgrx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extbgrx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extxbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extxbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extxrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extxrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmerge-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmerge-sse2.asm
new file mode 100644
index 0000000000..aedccc20f6
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmerge-sse2.asm
@@ -0,0 +1,135 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_merged_upsample_sse2)
+
+EXTN(jconst_merged_upsample_sse2):
+
+PW_F0402        times 8 dw  F_0_402
+PW_MF0228       times 8 dw -F_0_228
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
+PW_ONE          times 8 dw  1
+PD_ONEHALF      times 4 dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extrgbx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extrgbx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extbgrx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extbgrx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extxbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extxbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extxrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extxrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmrgext-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmrgext-avx2.asm
new file mode 100644
index 0000000000..8b264b4f03
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmrgext-avx2.asm
@@ -0,0 +1,596 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (64-bit AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_avx2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
+%define WK_NUM  3
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_avx2)
+
+EXTN(jsimd_h2v1_merged_upsample_avx2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 4
+    push        rbx
+
+    mov         ecx, r10d               ; col
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx
+
+    mov         rdi, r11
+    mov         ecx, r12d
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rdi, r13
+    mov         rsip, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rbxp, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]  ; inptr1
+    mov         rdxp, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]  ; inptr2
+    mov         rdip, JSAMPROW [rdi]                      ; outptr
+
+    pop         rcx                     ; col
+
+.columnloop:
+
+    vmovdqu     ymm6, YMMWORD [rbx]     ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+    vmovdqu     ymm7, YMMWORD [rdx]     ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
+    vpcmpeqw    ymm3, ymm3, ymm3
+    vpsllw      ymm3, ymm3, 7           ; ymm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    vpermq      ymm6, ymm6, 0xd8        ; ymm6=Cb(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+    vpermq      ymm7, ymm7, 0xd8        ; ymm7=Cr(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+    vpunpcklbw  ymm4, ymm6, ymm1        ; ymm4=Cb(0123456789ABCDEF)=CbL
+    vpunpckhbw  ymm6, ymm6, ymm1        ; ymm6=Cb(GHIJKLMNOPQRSTUV)=CbH
+    vpunpcklbw  ymm0, ymm7, ymm1        ; ymm0=Cr(0123456789ABCDEF)=CrL
+    vpunpckhbw  ymm7, ymm7, ymm1        ; ymm7=Cr(GHIJKLMNOPQRSTUV)=CrH
+
+    vpaddw      ymm5, ymm6, ymm3
+    vpaddw      ymm2, ymm4, ymm3
+    vpaddw      ymm1, ymm7, ymm3
+    vpaddw      ymm3, ymm0, ymm3
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    vpaddw      ymm6, ymm5, ymm5             ; ymm6=2*CbH
+    vpaddw      ymm4, ymm2, ymm2             ; ymm4=2*CbL
+    vpaddw      ymm7, ymm1, ymm1             ; ymm7=2*CrH
+    vpaddw      ymm0, ymm3, ymm3             ; ymm0=2*CrL
+
+    vpmulhw     ymm6, ymm6, [rel PW_MF0228]  ; ymm6=(2*CbH * -FIX(0.22800))
+    vpmulhw     ymm4, ymm4, [rel PW_MF0228]  ; ymm4=(2*CbL * -FIX(0.22800))
+    vpmulhw     ymm7, ymm7, [rel PW_F0402]   ; ymm7=(2*CrH * FIX(0.40200))
+    vpmulhw     ymm0, ymm0, [rel PW_F0402]   ; ymm0=(2*CrL * FIX(0.40200))
+
+    vpaddw      ymm6, ymm6, [rel PW_ONE]
+    vpaddw      ymm4, ymm4, [rel PW_ONE]
+    vpsraw      ymm6, ymm6, 1                ; ymm6=(CbH * -FIX(0.22800))
+    vpsraw      ymm4, ymm4, 1                ; ymm4=(CbL * -FIX(0.22800))
+    vpaddw      ymm7, ymm7, [rel PW_ONE]
+    vpaddw      ymm0, ymm0, [rel PW_ONE]
+    vpsraw      ymm7, ymm7, 1                ; ymm7=(CrH * FIX(0.40200))
+    vpsraw      ymm0, ymm0, 1                ; ymm0=(CrL * FIX(0.40200))
+
+    vpaddw      ymm6, ymm6, ymm5
+    vpaddw      ymm4, ymm4, ymm2
+    vpaddw      ymm6, ymm6, ymm5             ; ymm6=(CbH * FIX(1.77200))=(B-Y)H
+    vpaddw      ymm4, ymm4, ymm2             ; ymm4=(CbL * FIX(1.77200))=(B-Y)L
+    vpaddw      ymm7, ymm7, ymm1             ; ymm7=(CrH * FIX(1.40200))=(R-Y)H
+    vpaddw      ymm0, ymm0, ymm3             ; ymm0=(CrL * FIX(1.40200))=(R-Y)L
+
+    vmovdqa     YMMWORD [wk(0)], ymm6        ; wk(0)=(B-Y)H
+    vmovdqa     YMMWORD [wk(1)], ymm7        ; wk(1)=(R-Y)H
+
+    vpunpckhwd  ymm6, ymm5, ymm1
+    vpunpcklwd  ymm5, ymm5, ymm1
+    vpmaddwd    ymm5, ymm5, [rel PW_MF0344_F0285]
+    vpmaddwd    ymm6, ymm6, [rel PW_MF0344_F0285]
+    vpunpckhwd  ymm7, ymm2, ymm3
+    vpunpcklwd  ymm2, ymm2, ymm3
+    vpmaddwd    ymm2, ymm2, [rel PW_MF0344_F0285]
+    vpmaddwd    ymm7, ymm7, [rel PW_MF0344_F0285]
+
+    vpaddd      ymm5, ymm5, [rel PD_ONEHALF]
+    vpaddd      ymm6, ymm6, [rel PD_ONEHALF]
+    vpsrad      ymm5, ymm5, SCALEBITS
+    vpsrad      ymm6, ymm6, SCALEBITS
+    vpaddd      ymm2, ymm2, [rel PD_ONEHALF]
+    vpaddd      ymm7, ymm7, [rel PD_ONEHALF]
+    vpsrad      ymm2, ymm2, SCALEBITS
+    vpsrad      ymm7, ymm7, SCALEBITS
+
+    vpackssdw   ymm5, ymm5, ymm6        ; ymm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+    vpackssdw   ymm2, ymm2, ymm7        ; ymm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+    vpsubw      ymm5, ymm5, ymm1        ; ymm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+    vpsubw      ymm2, ymm2, ymm3        ; ymm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+    vmovdqa     YMMWORD [wk(2)], ymm5   ; wk(2)=(G-Y)H
+
+    mov         al, 2                   ; Yctr
+    jmp         short .Yloop_1st
+
+.Yloop_2nd:
+    vmovdqa     ymm0, YMMWORD [wk(1)]   ; ymm0=(R-Y)H
+    vmovdqa     ymm2, YMMWORD [wk(2)]   ; ymm2=(G-Y)H
+    vmovdqa     ymm4, YMMWORD [wk(0)]   ; ymm4=(B-Y)H
+
+.Yloop_1st:
+    vmovdqu     ymm7, YMMWORD [rsi]     ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpcmpeqw    ymm6, ymm6, ymm6
+    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+    vpand       ymm6, ymm6, ymm7        ; ymm6=Y(02468ACEGIKMOQSU)=YE
+    vpsrlw      ymm7, ymm7, BYTE_BIT    ; ymm7=Y(13579BDFHJLNPRTV)=YO
+
+    vmovdqa     ymm1, ymm0              ; ymm1=ymm0=(R-Y)(L/H)
+    vmovdqa     ymm3, ymm2              ; ymm3=ymm2=(G-Y)(L/H)
+    vmovdqa     ymm5, ymm4              ; ymm5=ymm4=(B-Y)(L/H)
+
+    vpaddw      ymm0, ymm0, ymm6        ; ymm0=((R-Y)+YE)=RE=R(02468ACEGIKMOQSU)
+    vpaddw      ymm1, ymm1, ymm7        ; ymm1=((R-Y)+YO)=RO=R(13579BDFHJLNPRTV)
+    vpackuswb   ymm0, ymm0, ymm0        ; ymm0=R(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm1, ymm1, ymm1        ; ymm1=R(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm2, ymm2, ymm6        ; ymm2=((G-Y)+YE)=GE=G(02468ACEGIKMOQSU)
+    vpaddw      ymm3, ymm3, ymm7        ; ymm3=((G-Y)+YO)=GO=G(13579BDFHJLNPRTV)
+    vpackuswb   ymm2, ymm2, ymm2        ; ymm2=G(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm3, ymm3, ymm3        ; ymm3=G(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm4, ymm4, ymm6        ; ymm4=((B-Y)+YE)=BE=B(02468ACEGIKMOQSU)
+    vpaddw      ymm5, ymm5, ymm7        ; ymm5=((B-Y)+YO)=BO=B(13579BDFHJLNPRTV)
+    vpackuswb   ymm4, ymm4, ymm4        ; ymm4=B(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm5, ymm5, ymm5        ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+    ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmB        ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+                                        ;       2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+    vpunpcklbw  ymmD, ymmD, ymmF        ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+                                        ;       1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+    vpsrldq     ymmH, ymmA, 2           ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+                                        ;       0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+    vpunpckhwd  ymmG, ymmA, ymmE        ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+                                        ;       0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+                                        ;       0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+    vpsrldq     ymmE, ymmE, 2           ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+                                        ;       2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+    vpsrldq     ymmB, ymmD, 2           ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+                                        ;       1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+    vpunpckhwd  ymmC, ymmD, ymmH        ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+                                        ;       1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+    vpunpcklwd  ymmD, ymmD, ymmH        ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+                                        ;       1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+    vpunpckhwd  ymmF, ymmE, ymmB        ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+                                        ;       2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+    vpunpcklwd  ymmE, ymmE, ymmB        ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+                                        ;       2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+    vpshufd     ymmH, ymmA, 0x4E        ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+                                        ;       0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+    vpunpckldq  ymmA, ymmA, ymmD        ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+    vpunpckhdq  ymmD, ymmD, ymmE        ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+    vpunpckldq  ymmE, ymmE, ymmH        ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+                                        ;       2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+    vpshufd     ymmH, ymmG, 0x4E        ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+                                        ;       0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+    vpunpckldq  ymmG, ymmG, ymmC        ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+                                        ;       0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+    vpunpckhdq  ymmC, ymmC, ymmF        ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+                                        ;       1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+    vpunpckldq  ymmF, ymmF, ymmH        ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+    vpunpcklqdq ymmH, ymmA, ymmE        ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vpunpcklqdq ymmG, ymmD, ymmG        ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+    vpunpcklqdq ymmC, ymmF, ymmC        ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vperm2i128  ymmA, ymmH, ymmG, 0x20  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vperm2i128  ymmD, ymmC, ymmH, 0x30  ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vperm2i128  ymmF, ymmG, ymmC, 0x31  ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        rdi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD
+    jz          near .endcolumn
+
+    add         rsi, byte SIZEOF_YMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         rbx, byte SIZEOF_YMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st64:
+    lea         rcx, [rcx+rcx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         rcx, byte 2*SIZEOF_YMMWORD
+    jb          short .column_st32
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmF
+    sub         rcx, byte 2*SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st32:
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st31
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    add         rdi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         rcx, byte SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st31:
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    sub         rcx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    vmovq       XMM_MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_MMWORD
+    sub         rcx, byte SIZEOF_MMWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    vmovd       XMM_DWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_DWORD
+    sub         rcx, byte SIZEOF_DWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of rax to the output when it has enough
+    ; space.
+    vmovd       eax, xmmA
+    cmp         rcx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [rdi], ax
+    add         rdi, byte SIZEOF_WORD
+    sub         rcx, byte SIZEOF_WORD
+    shr         rax, 16
+.column_st1:
+    ; Store the lower 1 byte of rax to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .endcolumn
+    mov         byte [rdi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    vpcmpeqb    ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpcmpeqb    ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+    vpxor       ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpxor       ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+    ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmG        ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+                                        ;       2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+    vpunpcklbw  ymmB, ymmB, ymmD        ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+                                        ;       0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+    vpunpcklbw  ymmF, ymmF, ymmH        ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+                                        ;       2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+    vpunpckhwd  ymmC, ymmA, ymmE        ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+                                        ;       0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+                                        ;       0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+    vpunpckhwd  ymmG, ymmB, ymmF        ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+                                        ;       0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+    vpunpcklwd  ymmB, ymmB, ymmF        ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+                                        ;       0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+    vpunpckhdq  ymmE, ymmA, ymmB        ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vpunpckldq  ymmB, ymmA, ymmB        ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vpunpckhdq  ymmF, ymmC, ymmG        ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+    vpunpckldq  ymmG, ymmC, ymmG        ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+    vperm2i128  ymmA, ymmB, ymmE, 0x20  ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    vperm2i128  ymmD, ymmG, ymmF, 0x20  ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    vperm2i128  ymmC, ymmB, ymmE, 0x31  ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vperm2i128  ymmH, ymmG, ymmF, 0x31  ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        rdi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+    vmovntdq    YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+    vmovdqu     YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+    add         rdi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD
+    jz          near .endcolumn
+
+    add         rsi, byte SIZEOF_YMMWORD  ; inptr0
+    dec         al
+    jnz         near .Yloop_2nd
+
+    add         rbx, byte SIZEOF_YMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st64:
+    cmp         rcx, byte SIZEOF_YMMWORD/2
+    jb          short .column_st32
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmC
+    vmovdqa     ymmD, ymmH
+    sub         rcx, byte SIZEOF_YMMWORD/2
+.column_st32:
+    cmp         rcx, byte SIZEOF_YMMWORD/4
+    jb          short .column_st16
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    add         rdi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         rcx, byte SIZEOF_YMMWORD/4
+.column_st16:
+    cmp         rcx, byte SIZEOF_YMMWORD/8
+    jb          short .column_st15
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    sub         rcx, byte SIZEOF_YMMWORD/8
+.column_st15:
+    ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_YMMWORD/16
+    jb          short .column_st7
+    vmovq       MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_YMMWORD/16*4
+    sub         rcx, byte SIZEOF_YMMWORD/16
+    vpsrldq     xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+    ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .endcolumn
+    vmovd       XMM_DWORD [rdi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+    sfence                              ; flush the write buffer
+
+.return:
+    pop         rbx
+    vzeroupper
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_avx2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_avx2)
+
+EXTN(jsimd_h2v2_merged_upsample_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+    push        rbx
+
+    mov         eax, r10d
+
+    mov         rdi, r11
+    mov         ecx, r12d
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rdi, r13
+    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+
+    sub         rsp, SIZEOF_JSAMPARRAY*4
+    mov         JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip  ; intpr00
+    mov         JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp  ; intpr1
+    mov         JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp  ; intpr2
+    mov         rbx, rsp
+
+    push        rdi
+    push        rcx
+    push        rax
+
+    %ifdef WIN64
+    mov         r8, rcx
+    mov         r9, rdi
+    mov         rcx, rax
+    mov         rdx, rbx
+    %else
+    mov         rdx, rcx
+    mov         rcx, rdi
+    mov         rdi, rax
+    mov         rsi, rbx
+    %endif
+
+    call        EXTN(jsimd_h2v1_merged_upsample_avx2)
+
+    pop         rax
+    pop         rcx
+    pop         rdi
+    mov         rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+
+    add         rdi, byte SIZEOF_JSAMPROW  ; outptr1
+    add         rsi, byte SIZEOF_JSAMPROW  ; inptr01
+
+    mov         JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip  ; intpr00
+    mov         JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp  ; intpr1
+    mov         JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp  ; intpr2
+    mov         rbx, rsp
+
+    push        rdi
+    push        rcx
+    push        rax
+
+    %ifdef WIN64
+    mov         r8, rcx
+    mov         r9, rdi
+    mov         rcx, rax
+    mov         rdx, rbx
+    %else
+    mov         rdx, rcx
+    mov         rcx, rdi
+    mov         rdi, rax
+    mov         rsi, rbx
+    %endif
+
+    call        EXTN(jsimd_h2v1_merged_upsample_avx2)
+
+    pop         rax
+    pop         rcx
+    pop         rdi
+    mov         rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+    add         rsp, SIZEOF_JSAMPARRAY*4
+
+    pop         rbx
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmrgext-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmrgext-sse2.asm
new file mode 100644
index 0000000000..eb3ab9dbd9
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmrgext-sse2.asm
@@ -0,0 +1,538 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  3
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2)
+
+EXTN(jsimd_h2v1_merged_upsample_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 4
+    push        rbx
+
+    mov         ecx, r10d               ; col
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx
+
+    mov         rdi, r11
+    mov         ecx, r12d
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rdi, r13
+    mov         rsip, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rbxp, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]  ; inptr1
+    mov         rdxp, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]  ; inptr2
+    mov         rdip, JSAMPROW [rdi]                      ; outptr
+
+    pop         rcx                     ; col
+
+.columnloop:
+
+    movdqa      xmm6, XMMWORD [rbx]     ; xmm6=Cb(0123456789ABCDEF)
+    movdqa      xmm7, XMMWORD [rdx]     ; xmm7=Cr(0123456789ABCDEF)
+
+    pxor        xmm1, xmm1              ; xmm1=(all 0's)
+    pcmpeqw     xmm3, xmm3
+    psllw       xmm3, 7                 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    movdqa      xmm4, xmm6
+    punpckhbw   xmm6, xmm1              ; xmm6=Cb(89ABCDEF)=CbH
+    punpcklbw   xmm4, xmm1              ; xmm4=Cb(01234567)=CbL
+    movdqa      xmm0, xmm7
+    punpckhbw   xmm7, xmm1              ; xmm7=Cr(89ABCDEF)=CrH
+    punpcklbw   xmm0, xmm1              ; xmm0=Cr(01234567)=CrL
+
+    paddw       xmm6, xmm3
+    paddw       xmm4, xmm3
+    paddw       xmm7, xmm3
+    paddw       xmm0, xmm3
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    movdqa      xmm5, xmm6              ; xmm5=CbH
+    movdqa      xmm2, xmm4              ; xmm2=CbL
+    paddw       xmm6, xmm6              ; xmm6=2*CbH
+    paddw       xmm4, xmm4              ; xmm4=2*CbL
+    movdqa      xmm1, xmm7              ; xmm1=CrH
+    movdqa      xmm3, xmm0              ; xmm3=CrL
+    paddw       xmm7, xmm7              ; xmm7=2*CrH
+    paddw       xmm0, xmm0              ; xmm0=2*CrL
+
+    pmulhw      xmm6, [rel PW_MF0228]   ; xmm6=(2*CbH * -FIX(0.22800))
+    pmulhw      xmm4, [rel PW_MF0228]   ; xmm4=(2*CbL * -FIX(0.22800))
+    pmulhw      xmm7, [rel PW_F0402]    ; xmm7=(2*CrH * FIX(0.40200))
+    pmulhw      xmm0, [rel PW_F0402]    ; xmm0=(2*CrL * FIX(0.40200))
+
+    paddw       xmm6, [rel PW_ONE]
+    paddw       xmm4, [rel PW_ONE]
+    psraw       xmm6, 1                 ; xmm6=(CbH * -FIX(0.22800))
+    psraw       xmm4, 1                 ; xmm4=(CbL * -FIX(0.22800))
+    paddw       xmm7, [rel PW_ONE]
+    paddw       xmm0, [rel PW_ONE]
+    psraw       xmm7, 1                 ; xmm7=(CrH * FIX(0.40200))
+    psraw       xmm0, 1                 ; xmm0=(CrL * FIX(0.40200))
+
+    paddw       xmm6, xmm5
+    paddw       xmm4, xmm2
+    paddw       xmm6, xmm5              ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+    paddw       xmm4, xmm2              ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+    paddw       xmm7, xmm1              ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+    paddw       xmm0, xmm3              ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+    movdqa      XMMWORD [wk(0)], xmm6   ; wk(0)=(B-Y)H
+    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=(R-Y)H
+
+    movdqa      xmm6, xmm5
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm5, xmm1
+    punpckhwd   xmm6, xmm1
+    pmaddwd     xmm5, [rel PW_MF0344_F0285]
+    pmaddwd     xmm6, [rel PW_MF0344_F0285]
+    punpcklwd   xmm2, xmm3
+    punpckhwd   xmm7, xmm3
+    pmaddwd     xmm2, [rel PW_MF0344_F0285]
+    pmaddwd     xmm7, [rel PW_MF0344_F0285]
+
+    paddd       xmm5, [rel PD_ONEHALF]
+    paddd       xmm6, [rel PD_ONEHALF]
+    psrad       xmm5, SCALEBITS
+    psrad       xmm6, SCALEBITS
+    paddd       xmm2, [rel PD_ONEHALF]
+    paddd       xmm7, [rel PD_ONEHALF]
+    psrad       xmm2, SCALEBITS
+    psrad       xmm7, SCALEBITS
+
+    packssdw    xmm5, xmm6              ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+    packssdw    xmm2, xmm7              ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+    psubw       xmm5, xmm1              ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+    psubw       xmm2, xmm3              ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+    movdqa      XMMWORD [wk(2)], xmm5   ; wk(2)=(G-Y)H
+
+    mov         al, 2                   ; Yctr
+    jmp         short .Yloop_1st
+
+.Yloop_2nd:
+    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(R-Y)H
+    movdqa      xmm2, XMMWORD [wk(2)]   ; xmm2=(G-Y)H
+    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(B-Y)H
+
+.Yloop_1st:
+    movdqa      xmm7, XMMWORD [rsi]     ; xmm7=Y(0123456789ABCDEF)
+
+    pcmpeqw     xmm6, xmm6
+    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+    pand        xmm6, xmm7              ; xmm6=Y(02468ACE)=YE
+    psrlw       xmm7, BYTE_BIT          ; xmm7=Y(13579BDF)=YO
+
+    movdqa      xmm1, xmm0              ; xmm1=xmm0=(R-Y)(L/H)
+    movdqa      xmm3, xmm2              ; xmm3=xmm2=(G-Y)(L/H)
+    movdqa      xmm5, xmm4              ; xmm5=xmm4=(B-Y)(L/H)
+
+    paddw       xmm0, xmm6              ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+    paddw       xmm1, xmm7              ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+    packuswb    xmm0, xmm0              ; xmm0=R(02468ACE********)
+    packuswb    xmm1, xmm1              ; xmm1=R(13579BDF********)
+
+    paddw       xmm2, xmm6              ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+    paddw       xmm3, xmm7              ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+    packuswb    xmm2, xmm2              ; xmm2=G(02468ACE********)
+    packuswb    xmm3, xmm3              ; xmm3=G(13579BDF********)
+
+    paddw       xmm4, xmm6              ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+    paddw       xmm5, xmm7              ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+    packuswb    xmm4, xmm4              ; xmm4=B(02468ACE********)
+    packuswb    xmm5, xmm5              ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+    punpcklbw   xmmA, xmmC        ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmB        ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+    punpcklbw   xmmD, xmmF        ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+    movdqa      xmmG, xmmA
+    movdqa      xmmH, xmmA
+    punpcklwd   xmmA, xmmE        ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+    punpckhwd   xmmG, xmmE        ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+    psrldq      xmmH, 2           ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+    psrldq      xmmE, 2           ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+    movdqa      xmmC, xmmD
+    movdqa      xmmB, xmmD
+    punpcklwd   xmmD, xmmH        ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+    punpckhwd   xmmC, xmmH        ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+    psrldq      xmmB, 2           ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+    movdqa      xmmF, xmmE
+    punpcklwd   xmmE, xmmB        ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+    punpckhwd   xmmF, xmmB        ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+    pshufd      xmmH, xmmA, 0x4E  ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+    movdqa      xmmB, xmmE
+    punpckldq   xmmA, xmmD        ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+    punpckldq   xmmE, xmmH        ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+    punpckhdq   xmmD, xmmB        ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+    pshufd      xmmH, xmmG, 0x4E  ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+    movdqa      xmmB, xmmF
+    punpckldq   xmmG, xmmC        ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+    punpckldq   xmmF, xmmH        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+    punpckhdq   xmmC, xmmB        ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+    punpcklqdq  xmmA, xmmE        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    punpcklqdq  xmmD, xmmG        ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    punpcklqdq  xmmF, xmmC        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        rdi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_XMMWORD
+    jz          near .endcolumn
+
+    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st32:
+    lea         rcx, [rcx+rcx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         rcx, byte 2*SIZEOF_XMMWORD
+    jb          short .column_st16
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmF
+    sub         rcx, byte 2*SIZEOF_XMMWORD
+    jmp         short .column_st15
+.column_st16:
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         rcx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    movq        XMM_MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_MMWORD
+    sub         rcx, byte SIZEOF_MMWORD
+    psrldq      xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    movd        XMM_DWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_DWORD
+    sub         rcx, byte SIZEOF_DWORD
+    psrldq      xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of rax to the output when it has enough
+    ; space.
+    movd        eax, xmmA
+    cmp         rcx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [rdi], ax
+    add         rdi, byte SIZEOF_WORD
+    sub         rcx, byte SIZEOF_WORD
+    shr         rax, 16
+.column_st1:
+    ; Store the lower 1 byte of rax to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .endcolumn
+    mov         byte [rdi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%else
+    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%endif
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+    punpcklbw   xmmA, xmmC  ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+    punpcklbw   xmmB, xmmD  ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+    punpcklbw   xmmF, xmmH  ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+    movdqa      xmmC, xmmA
+    punpcklwd   xmmA, xmmE  ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+    punpckhwd   xmmC, xmmE  ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+    movdqa      xmmG, xmmB
+    punpcklwd   xmmB, xmmF  ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+    punpckhwd   xmmG, xmmF  ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpckldq   xmmA, xmmB  ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    punpckhdq   xmmD, xmmB  ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    movdqa      xmmH, xmmC
+    punpckldq   xmmC, xmmG  ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    punpckhdq   xmmH, xmmG  ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        rdi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+    movntdq     XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+    movdqu      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_XMMWORD
+    jz          near .endcolumn
+
+    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st32:
+    cmp         rcx, byte SIZEOF_XMMWORD/2
+    jb          short .column_st16
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmC
+    movdqa      xmmD, xmmH
+    sub         rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+    cmp         rcx, byte SIZEOF_XMMWORD/4
+    jb          short .column_st15
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+    ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_XMMWORD/8
+    jb          short .column_st7
+    movq        XMM_MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_XMMWORD/8*4
+    sub         rcx, byte SIZEOF_XMMWORD/8
+    psrldq      xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+    ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .endcolumn
+    movd        XMM_DWORD [rdi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+    sfence                              ; flush the write buffer
+
+.return:
+    pop         rbx
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2)
+
+EXTN(jsimd_h2v2_merged_upsample_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+    push        rbx
+
+    mov         eax, r10d
+
+    mov         rdi, r11
+    mov         ecx, r12d
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rdi, r13
+    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+
+    sub         rsp, SIZEOF_JSAMPARRAY*4
+    mov         JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip  ; intpr00
+    mov         JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp  ; intpr1
+    mov         JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp  ; intpr2
+    mov         rbx, rsp
+
+    push        rdi
+    push        rcx
+    push        rax
+
+    %ifdef WIN64
+    mov         r8, rcx
+    mov         r9, rdi
+    mov         rcx, rax
+    mov         rdx, rbx
+    %else
+    mov         rdx, rcx
+    mov         rcx, rdi
+    mov         rdi, rax
+    mov         rsi, rbx
+    %endif
+
+    call        EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+    pop         rax
+    pop         rcx
+    pop         rdi
+    mov         rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+
+    add         rdi, byte SIZEOF_JSAMPROW  ; outptr1
+    add         rsi, byte SIZEOF_JSAMPROW  ; inptr01
+
+    mov         JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip  ; intpr00
+    mov         JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp  ; intpr1
+    mov         JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp  ; intpr2
+    mov         rbx, rsp
+
+    push        rdi
+    push        rcx
+    push        rax
+
+    %ifdef WIN64
+    mov         r8, rcx
+    mov         r9, rdi
+    mov         rcx, rax
+    mov         rdx, rbx
+    %else
+    mov         rdx, rcx
+    mov         rcx, rdi
+    mov         rdi, rax
+    mov         rsi, rbx
+    %endif
+
+    call        EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+    pop         rax
+    pop         rcx
+    pop         rdi
+    mov         rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+    add         rsp, SIZEOF_JSAMPARRAY*4
+
+    pop         rbx
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdsample-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdsample-avx2.asm
new file mode 100644
index 0000000000..1e4979f933
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdsample-avx2.asm
@@ -0,0 +1,696 @@
+;
+; jdsample.asm - upsampling (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fancy_upsample_avx2)
+
+EXTN(jconst_fancy_upsample_avx2):
+
+PW_ONE   times 16 dw 1
+PW_TWO   times 16 dw 2
+PW_THREE times 16 dw 3
+PW_SEVEN times 16 dw 7
+PW_EIGHT times 16 dw 8
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v1_fancy_upsample_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    push_xmm    3
+    collect_args 4
+
+    mov         eax, r11d               ; colctr
+    test        rax, rax
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          near .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
+
+    vpxor       ymm0, ymm0, ymm0                 ; ymm0=(all 0's)
+    vpcmpeqb    xmm9, xmm9, xmm9
+    vpsrldq     xmm10, xmm9, (SIZEOF_XMMWORD-1)  ; (ff -- -- -- ... -- --) LSB is ff
+
+    vpslldq     xmm9, xmm9, (SIZEOF_XMMWORD-1)
+    vperm2i128  ymm9, ymm9, ymm9, 1              ; (---- ---- ... ---- ---- ff) MSB is ff
+
+.rowloop:
+    push        rax                     ; colctr
+    push        rdi
+    push        rsi
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
+
+    test        rax, SIZEOF_YMMWORD-1
+    jz          short .skip
+    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+.skip:
+    vpand       ymm7, ymm10, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+
+    add         rax, byte SIZEOF_YMMWORD-1
+    and         rax, byte -SIZEOF_YMMWORD
+    cmp         rax, byte SIZEOF_YMMWORD
+    ja          short .columnloop
+
+.columnloop_last:
+    vpand       ymm6, ymm9, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    jmp         short .upsample
+
+.columnloop:
+    vmovdqu     ymm6, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    vperm2i128  ymm6, ymm0, ymm6, 0x20
+    vpslldq     ymm6, ymm6, 15
+
+.upsample:
+    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm1=( 0  1  2 ... 29 30 31)
+
+    vperm2i128  ymm2, ymm0, ymm1, 0x20
+    vpalignr    ymm2, ymm1, ymm2, 15            ; ymm2=(--  0  1 ... 28 29 30)
+    vperm2i128  ymm4, ymm0, ymm1, 0x03
+    vpalignr    ymm3, ymm4, ymm1, 1             ; ymm3=( 1  2  3 ... 30 31 --)
+
+    vpor        ymm2, ymm2, ymm7                ; ymm2=(-1  0  1 ... 28 29 30)
+    vpor        ymm3, ymm3, ymm6                ; ymm3=( 1  2  3 ... 30 31 32)
+
+    vpsrldq     ymm7, ymm4, (SIZEOF_XMMWORD-1)  ; ymm7=(31 -- -- ... -- -- --)
+
+    vpunpckhbw  ymm4, ymm1, ymm0                ; ymm4=( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm5, ymm1, ymm0                ; ymm5=( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm1, ymm5, ymm4, 0x20          ; ymm1=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm4, ymm5, ymm4, 0x31          ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm5, ymm2, ymm0                ; ymm5=( 7  8  9 10 11 12 13 14 23 24 25 26 27 28 29 30)
+    vpunpcklbw  ymm6, ymm2, ymm0                ; ymm6=(-1  0  1  2  3  4  5  6 15 16 17 18 19 20 21 22)
+    vperm2i128  ymm2, ymm6, ymm5, 0x20          ; ymm2=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+    vperm2i128  ymm5, ymm6, ymm5, 0x31          ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vpunpckhbw  ymm6, ymm3, ymm0                ; ymm6=( 1  2  3  4  5  6  7  8 17 18 19 20 21 22 23 24)
+    vpunpcklbw  ymm8, ymm3, ymm0                ; ymm8=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
+    vperm2i128  ymm3, ymm8, ymm6, 0x20          ; ymm3=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
+    vperm2i128  ymm6, ymm8, ymm6, 0x31          ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+    vpmullw     ymm1, ymm1, [rel PW_THREE]
+    vpmullw     ymm4, ymm4, [rel PW_THREE]
+    vpaddw      ymm2, ymm2, [rel PW_ONE]
+    vpaddw      ymm5, ymm5, [rel PW_ONE]
+    vpaddw      ymm3, ymm3, [rel PW_TWO]
+    vpaddw      ymm6, ymm6, [rel PW_TWO]
+
+    vpaddw      ymm2, ymm2, ymm1
+    vpaddw      ymm5, ymm5, ymm4
+    vpsrlw      ymm2, ymm2, 2                   ; ymm2=OutLE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
+    vpsrlw      ymm5, ymm5, 2                   ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+    vpaddw      ymm3, ymm3, ymm1
+    vpaddw      ymm6, ymm6, ymm4
+    vpsrlw      ymm3, ymm3, 2                   ; ymm3=OutLO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
+    vpsrlw      ymm6, ymm6, 2                   ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+    vpsllw      ymm3, ymm3, BYTE_BIT
+    vpsllw      ymm6, ymm6, BYTE_BIT
+    vpor        ymm2, ymm2, ymm3                ; ymm2=OutL=( 0  1  2 ... 29 30 31)
+    vpor        ymm5, ymm5, ymm6                ; ymm5=OutH=(32 33 34 ... 61 62 63)
+
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm5
+
+    sub         rax, byte SIZEOF_YMMWORD
+    add         rsi, byte 1*SIZEOF_YMMWORD  ; inptr
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    cmp         rax, byte SIZEOF_YMMWORD
+    ja          near .columnloop
+    test        eax, eax
+    jnz         near .columnloop_last
+
+    pop         rsi
+    pop         rdi
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         rcx                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    uncollect_args 4
+    pop_xmm     3
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
+%define WK_NUM  4
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v2_fancy_upsample_avx2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    push_xmm    3
+    collect_args 4
+    push        rbx
+
+    mov         eax, r11d               ; colctr
+    test        rax, rax
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          near .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
+.rowloop:
+    push        rax                     ; colctr
+    push        rcx
+    push        rdi
+    push        rsi
+
+    mov         rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
+    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
+
+    vpxor       ymm8, ymm8, ymm8                 ; ymm8=(all 0's)
+    vpcmpeqb    xmm9, xmm9, xmm9
+    vpsrldq     xmm10, xmm9, (SIZEOF_XMMWORD-2)  ; (ffff ---- ---- ... ---- ----) LSB is ffff
+    vpslldq     xmm9, xmm9, (SIZEOF_XMMWORD-2)
+    vperm2i128  ymm9, ymm9, ymm9, 1              ; (---- ---- ... ---- ---- ffff) MSB is ffff
+
+    test        rax, SIZEOF_YMMWORD-1
+    jz          short .skip
+    push        rdx
+    mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+    pop         rdx
+.skip:
+    ; -- process the first column block
+
+    vmovdqu     ymm0, YMMWORD [rbx+0*SIZEOF_YMMWORD]  ; ymm0=row[ 0][0]
+    vmovdqu     ymm1, YMMWORD [rcx+0*SIZEOF_YMMWORD]  ; ymm1=row[-1][0]
+    vmovdqu     ymm2, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm2=row[+1][0]
+
+    vpunpckhbw  ymm4, ymm0, ymm8        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm5, ymm0, ymm8        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm5, ymm1, ymm8        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm6, ymm1, ymm8        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm6, ymm2, ymm8        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm3, ymm2, ymm8        ; ymm3=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm2, ymm3, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm6, ymm3, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpmullw     ymm0, ymm0, [rel PW_THREE]
+    vpmullw     ymm4, ymm4, [rel PW_THREE]
+
+    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vmovdqu     YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1  ; temporarily save
+    vmovdqu     YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5  ; the intermediate data
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm6
+
+    vpand       ymm1, ymm1, ymm10       ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vpand       ymm2, ymm2, ymm10       ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+
+    vmovdqa     YMMWORD [wk(0)], ymm1
+    vmovdqa     YMMWORD [wk(1)], ymm2
+
+    add         rax, byte SIZEOF_YMMWORD-1
+    and         rax, byte -SIZEOF_YMMWORD
+    cmp         rax, byte SIZEOF_YMMWORD
+    ja          short .columnloop
+
+.columnloop_last:
+    ; -- process the last column block
+
+    vpand       ymm1, ymm9, YMMWORD [rdx+1*SIZEOF_YMMWORD]
+    vpand       ymm2, ymm9, YMMWORD [rdi+1*SIZEOF_YMMWORD]
+
+    vmovdqa     YMMWORD [wk(2)], ymm1   ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+    vmovdqa     YMMWORD [wk(3)], ymm2   ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+
+    jmp         near .upsample
+
+.columnloop:
+    ; -- process the next column block
+
+    vmovdqu     ymm0, YMMWORD [rbx+1*SIZEOF_YMMWORD]  ; ymm0=row[ 0][1]
+    vmovdqu     ymm1, YMMWORD [rcx+1*SIZEOF_YMMWORD]  ; ymm1=row[-1][1]
+    vmovdqu     ymm2, YMMWORD [rsi+1*SIZEOF_YMMWORD]  ; ymm2=row[+1][1]
+
+    vpunpckhbw  ymm4, ymm0, ymm8        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm5, ymm0, ymm8        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm5, ymm1, ymm8        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm6, ymm1, ymm8        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm6, ymm2, ymm8        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm7, ymm2, ymm8        ; ymm7=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm2, ymm7, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm6, ymm7, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpmullw     ymm0, ymm0, [rel PW_THREE]
+    vpmullw     ymm4, ymm4, [rel PW_THREE]
+
+    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vmovdqu     YMMWORD [rdx+2*SIZEOF_YMMWORD], ymm1  ; temporarily save
+    vmovdqu     YMMWORD [rdx+3*SIZEOF_YMMWORD], ymm5  ; the intermediate data
+    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymm2
+    vmovdqu     YMMWORD [rdi+3*SIZEOF_YMMWORD], ymm6
+
+    vperm2i128  ymm1, ymm8, ymm1, 0x20
+    vpslldq     ymm1, ymm1, 14          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
+    vperm2i128  ymm2, ymm8, ymm2, 0x20
+    vpslldq     ymm2, ymm2, 14          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
+
+    vmovdqa     YMMWORD [wk(2)], ymm1
+    vmovdqa     YMMWORD [wk(3)], ymm2
+
+.upsample:
+    ; -- process the upper row
+
+    vmovdqu     ymm7, YMMWORD [rdx+0*SIZEOF_YMMWORD]  ; ymm7=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vmovdqu     ymm3, YMMWORD [rdx+1*SIZEOF_YMMWORD]  ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vperm2i128  ymm0, ymm8, ymm7, 0x03
+    vpalignr    ymm0, ymm0, ymm7, 2     ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
+    vperm2i128  ymm4, ymm8, ymm3, 0x20
+    vpslldq     ymm4, ymm4, 14          ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+    vperm2i128  ymm5, ymm8, ymm7, 0x03
+    vpsrldq     ymm5, ymm5, 14          ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm6, ymm8, ymm3, 0x20
+    vpalignr    ymm6, ymm3, ymm6, 14    ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vpor        ymm0, ymm0, ymm4        ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
+    vpor        ymm5, ymm5, ymm6        ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vperm2i128  ymm2, ymm8, ymm3, 0x03
+    vpalignr    ymm2, ymm2, ymm3, 2     ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+    vperm2i128  ymm4, ymm8, ymm3, 0x03
+    vpsrldq     ymm4, ymm4, 14          ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm1, ymm8, ymm7, 0x20
+    vpalignr    ymm1, ymm7, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+
+    vpor        ymm1, ymm1, YMMWORD [wk(0)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+    vpor        ymm2, ymm2, YMMWORD [wk(2)]  ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+    vmovdqa     YMMWORD [wk(0)], ymm4
+
+    vpmullw     ymm7, ymm7, [rel PW_THREE]
+    vpmullw     ymm3, ymm3, [rel PW_THREE]
+    vpaddw      ymm1, ymm1, [rel PW_EIGHT]
+    vpaddw      ymm5, ymm5, [rel PW_EIGHT]
+    vpaddw      ymm0, ymm0, [rel PW_SEVEN]
+    vpaddw      ymm2, [rel PW_SEVEN]
+
+    vpaddw      ymm1, ymm1, ymm7
+    vpaddw      ymm5, ymm5, ymm3
+    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out0LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
+    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm2, ymm2, ymm3
+    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out0LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
+    vpsrlw      ymm2, ymm2, 4           ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+    vpsllw      ymm0, ymm0, BYTE_BIT
+    vpsllw      ymm2, ymm2, BYTE_BIT
+    vpor        ymm1, ymm1, ymm0        ; ymm1=Out0L=( 0  1  2 ... 29 30 31)
+    vpor        ymm5, ymm5, ymm2        ; ymm5=Out0H=(32 33 34 ... 61 62 63)
+
+    vmovdqu     YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1
+    vmovdqu     YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5
+
+    ; -- process the lower row
+
+    vmovdqu     ymm6, YMMWORD [rdi+0*SIZEOF_YMMWORD]  ; ymm6=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vmovdqu     ymm4, YMMWORD [rdi+1*SIZEOF_YMMWORD]  ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vperm2i128  ymm7, ymm8, ymm6, 0x03
+    vpalignr    ymm7, ymm7, ymm6, 2     ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
+    vperm2i128  ymm3, ymm8, ymm4, 0x20
+    vpslldq     ymm3, ymm3, 14          ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+    vperm2i128  ymm0, ymm8, ymm6, 0x03
+    vpsrldq     ymm0, ymm0, 14          ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm2, ymm8, ymm4, 0x20
+    vpalignr    ymm2, ymm4, ymm2, 14    ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vpor        ymm7, ymm7, ymm3        ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
+    vpor        ymm0, ymm0, ymm2        ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vperm2i128  ymm5, ymm8, ymm4, 0x03
+    vpalignr    ymm5, ymm5, ymm4, 2     ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+    vperm2i128  ymm3, ymm8, ymm4, 0x03
+    vpsrldq     ymm3, ymm3, 14          ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm1, ymm8, ymm6, 0x20
+    vpalignr    ymm1, ymm6, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+
+    vpor        ymm1, ymm1, YMMWORD [wk(1)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+    vpor        ymm5, ymm5, YMMWORD [wk(3)]  ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+    vmovdqa     YMMWORD [wk(1)], ymm3
+
+    vpmullw     ymm6, ymm6, [rel PW_THREE]
+    vpmullw     ymm4, ymm4, [rel PW_THREE]
+    vpaddw      ymm1, ymm1, [rel PW_EIGHT]
+    vpaddw      ymm0, ymm0, [rel PW_EIGHT]
+    vpaddw      ymm7, ymm7, [rel PW_SEVEN]
+    vpaddw      ymm5, ymm5, [rel PW_SEVEN]
+
+    vpaddw      ymm1, ymm1, ymm6
+    vpaddw      ymm0, ymm0, ymm4
+    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out1LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
+    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+    vpaddw      ymm7, ymm7, ymm6
+    vpaddw      ymm5, ymm5, ymm4
+    vpsrlw      ymm7, ymm7, 4           ; ymm7=Out1LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
+    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+    vpsllw      ymm7, ymm7, BYTE_BIT
+    vpsllw      ymm5, ymm5, BYTE_BIT
+    vpor        ymm1, ymm1, ymm7        ; ymm1=Out1L=( 0  1  2 ... 29 30 31)
+    vpor        ymm0, ymm0, ymm5        ; ymm0=Out1H=(32 33 34 ... 61 62 63)
+
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm1
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm0
+
+    sub         rax, byte SIZEOF_YMMWORD
+    add         rcx, byte 1*SIZEOF_YMMWORD  ; inptr1(above)
+    add         rbx, byte 1*SIZEOF_YMMWORD  ; inptr0
+    add         rsi, byte 1*SIZEOF_YMMWORD  ; inptr1(below)
+    add         rdx, byte 2*SIZEOF_YMMWORD  ; outptr0
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr1
+    cmp         rax, byte SIZEOF_YMMWORD
+    ja          near .columnloop
+    test        rax, rax
+    jnz         near .columnloop_last
+
+    pop         rsi
+    pop         rdi
+    pop         rcx
+    pop         rax
+
+    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         rcx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    vzeroupper
+    uncollect_args 4
+    pop_xmm     3
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)
+
+EXTN(jsimd_h2v1_upsample_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+
+    mov         edx, r11d
+    add         rdx, byte (SIZEOF_YMMWORD-1)
+    and         rdx, -SIZEOF_YMMWORD
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          short .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
+.rowloop:
+    push        rdi
+    push        rsi
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
+    mov         rax, rdx                ; colctr
+.columnloop:
+
+    cmp         rax, byte SIZEOF_YMMWORD
+    ja          near .above_16
+
+    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vpunpckhbw  xmm1, xmm0, xmm0
+    vpunpcklbw  xmm0, xmm0, xmm0
+
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+    vmovdqu     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+    jmp         short .nextrow
+
+.above_16:
+    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+
+    vpermq      ymm0, ymm0, 0xd8
+    vpunpckhbw  ymm1, ymm0, ymm0
+    vpunpcklbw  ymm0, ymm0, ymm0
+
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1
+
+    sub         rax, byte 2*SIZEOF_YMMWORD
+    jz          short .nextrow
+
+    add         rsi, byte SIZEOF_YMMWORD    ; inptr
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    jmp         short .columnloop
+
+.nextrow:
+    pop         rsi
+    pop         rdi
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         rcx                        ; rowctr
+    jg          short .rowloop
+
+.return:
+    vzeroupper
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)
+
+EXTN(jsimd_h2v2_upsample_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+    push        rbx
+
+    mov         edx, r11d
+    add         rdx, byte (SIZEOF_YMMWORD-1)
+    and         rdx, -SIZEOF_YMMWORD
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          near .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
+.rowloop:
+    push        rdi
+    push        rsi
+
+    mov         rsip, JSAMPROW [rsi]                   ; inptr
+    mov         rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
+    mov         rax, rdx                               ; colctr
+.columnloop:
+
+    cmp         rax, byte SIZEOF_YMMWORD
+    ja          short .above_16
+
+    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    vpunpckhbw  xmm1, xmm0, xmm0
+    vpunpcklbw  xmm0, xmm0, xmm0
+
+    vmovdqu     XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
+    vmovdqu     XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+    vmovdqu     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+    jmp         near .nextrow
+
+.above_16:
+    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+
+    vpermq      ymm0, ymm0, 0xd8
+    vpunpckhbw  ymm1, ymm0, ymm0
+    vpunpcklbw  ymm0, ymm0, ymm0
+
+    vmovdqu     YMMWORD [rbx+0*SIZEOF_YMMWORD], ymm0
+    vmovdqu     YMMWORD [rbx+1*SIZEOF_YMMWORD], ymm1
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1
+
+    sub         rax, byte 2*SIZEOF_YMMWORD
+    jz          short .nextrow
+
+    add         rsi, byte SIZEOF_YMMWORD  ; inptr
+    add         rbx, 2*SIZEOF_YMMWORD     ; outptr0
+    add         rdi, 2*SIZEOF_YMMWORD     ; outptr1
+    jmp         short .columnloop
+
+.nextrow:
+    pop         rsi
+    pop         rdi
+
+    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         rcx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    vzeroupper
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdsample-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdsample-sse2.asm
new file mode 100644
index 0000000000..38dbceec26
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdsample-sse2.asm
@@ -0,0 +1,665 @@
+;
+; jdsample.asm - upsampling (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fancy_upsample_sse2)
+
+EXTN(jconst_fancy_upsample_sse2):
+
+PW_ONE   times 8 dw 1
+PW_TWO   times 8 dw 2
+PW_THREE times 8 dw 3
+PW_SEVEN times 8 dw 7
+PW_EIGHT times 8 dw 8
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v1_fancy_upsample_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+
+    mov         eax, r11d               ; colctr
+    test        rax, rax
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          near .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
+.rowloop:
+    push        rax                     ; colctr
+    push        rdi
+    push        rsi
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
+
+    test        rax, SIZEOF_XMMWORD-1
+    jz          short .skip
+    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+.skip:
+    pxor        xmm0, xmm0              ; xmm0=(all 0's)
+    pcmpeqb     xmm7, xmm7
+    psrldq      xmm7, (SIZEOF_XMMWORD-1)
+    pand        xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+
+    add         rax, byte SIZEOF_XMMWORD-1
+    and         rax, byte -SIZEOF_XMMWORD
+    cmp         rax, byte SIZEOF_XMMWORD
+    ja          short .columnloop
+
+.columnloop_last:
+    pcmpeqb     xmm6, xmm6
+    pslldq      xmm6, (SIZEOF_XMMWORD-1)
+    pand        xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    jmp         short .upsample
+
+.columnloop:
+    movdqa      xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    pslldq      xmm6, (SIZEOF_XMMWORD-1)
+
+.upsample:
+    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqa      xmm2, xmm1
+    movdqa      xmm3, xmm1                ; xmm1=( 0  1  2 ... 13 14 15)
+    pslldq      xmm2, 1                   ; xmm2=(--  0  1 ... 12 13 14)
+    psrldq      xmm3, 1                   ; xmm3=( 1  2  3 ... 14 15 --)
+
+    por         xmm2, xmm7                ; xmm2=(-1  0  1 ... 12 13 14)
+    por         xmm3, xmm6                ; xmm3=( 1  2  3 ... 14 15 16)
+
+    movdqa      xmm7, xmm1
+    psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; xmm7=(15 -- -- ... -- -- --)
+
+    movdqa      xmm4, xmm1
+    punpcklbw   xmm1, xmm0                ; xmm1=( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm4, xmm0                ; xmm4=( 8  9 10 11 12 13 14 15)
+    movdqa      xmm5, xmm2
+    punpcklbw   xmm2, xmm0                ; xmm2=(-1  0  1  2  3  4  5  6)
+    punpckhbw   xmm5, xmm0                ; xmm5=( 7  8  9 10 11 12 13 14)
+    movdqa      xmm6, xmm3
+    punpcklbw   xmm3, xmm0                ; xmm3=( 1  2  3  4  5  6  7  8)
+    punpckhbw   xmm6, xmm0                ; xmm6=( 9 10 11 12 13 14 15 16)
+
+    pmullw      xmm1, [rel PW_THREE]
+    pmullw      xmm4, [rel PW_THREE]
+    paddw       xmm2, [rel PW_ONE]
+    paddw       xmm5, [rel PW_ONE]
+    paddw       xmm3, [rel PW_TWO]
+    paddw       xmm6, [rel PW_TWO]
+
+    paddw       xmm2, xmm1
+    paddw       xmm5, xmm4
+    psrlw       xmm2, 2                 ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
+    psrlw       xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+    paddw       xmm3, xmm1
+    paddw       xmm6, xmm4
+    psrlw       xmm3, 2                 ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
+    psrlw       xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+    psllw       xmm3, BYTE_BIT
+    psllw       xmm6, BYTE_BIT
+    por         xmm2, xmm3              ; xmm2=OutL=( 0  1  2 ... 13 14 15)
+    por         xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
+
+    sub         rax, byte SIZEOF_XMMWORD
+    add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr
+    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
+    cmp         rax, byte SIZEOF_XMMWORD
+    ja          near .columnloop
+    test        eax, eax
+    jnz         near .columnloop_last
+
+    pop         rsi
+    pop         rdi
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         rcx                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  4
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v2_fancy_upsample_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 4
+    push        rbx
+
+    mov         eax, r11d               ; colctr
+    test        rax, rax
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          near .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
+.rowloop:
+    push        rax                     ; colctr
+    push        rcx
+    push        rdi
+    push        rsi
+
+    mov         rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
+    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
+
+    test        rax, SIZEOF_XMMWORD-1
+    jz          short .skip
+    push        rdx
+    mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+    pop         rdx
+.skip:
+    ; -- process the first column block
+
+    movdqa      xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]  ; xmm0=row[ 0][0]
+    movdqa      xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]  ; xmm1=row[-1][0]
+    movdqa      xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; xmm2=row[+1][0]
+
+    pxor        xmm3, xmm3              ; xmm3=(all 0's)
+    movdqa      xmm4, xmm0
+    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm5, xmm1
+    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm6, xmm2
+    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+    pmullw      xmm0, [rel PW_THREE]
+    pmullw      xmm4, [rel PW_THREE]
+
+    pcmpeqb     xmm7, xmm7
+    psrldq      xmm7, (SIZEOF_XMMWORD-2)
+
+    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+    movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1  ; temporarily save
+    movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5  ; the intermediate data
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
+
+    pand        xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
+    pand        xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)
+
+    movdqa      XMMWORD [wk(0)], xmm1
+    movdqa      XMMWORD [wk(1)], xmm2
+
+    add         rax, byte SIZEOF_XMMWORD-1
+    and         rax, byte -SIZEOF_XMMWORD
+    cmp         rax, byte SIZEOF_XMMWORD
+    ja          short .columnloop
+
+.columnloop_last:
+    ; -- process the last column block
+
+    pcmpeqb     xmm1, xmm1
+    pslldq      xmm1, (SIZEOF_XMMWORD-2)
+    movdqa      xmm2, xmm1
+
+    pand        xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+    pand        xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
+
+    movdqa      XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
+    movdqa      XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)
+
+    jmp         near .upsample
+
+.columnloop:
+    ; -- process the next column block
+
+    movdqa      xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]  ; xmm0=row[ 0][1]
+    movdqa      xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]  ; xmm1=row[-1][1]
+    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]  ; xmm2=row[+1][1]
+
+    pxor        xmm3, xmm3              ; xmm3=(all 0's)
+    movdqa      xmm4, xmm0
+    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm5, xmm1
+    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm6, xmm2
+    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+    pmullw      xmm0, [rel PW_THREE]
+    pmullw      xmm4, [rel PW_THREE]
+
+    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+    movdqa      XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1  ; temporarily save
+    movdqa      XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5  ; the intermediate data
+    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
+
+    pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; xmm1=(-- -- -- -- -- -- --  0)
+    pslldq      xmm2, (SIZEOF_XMMWORD-2)  ; xmm2=(-- -- -- -- -- -- --  0)
+
+    movdqa      XMMWORD [wk(2)], xmm1
+    movdqa      XMMWORD [wk(3)], xmm2
+
+.upsample:
+    ; -- process the upper row
+
+    movdqa      xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+    movdqa      xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm0, xmm7                ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
+    movdqa      xmm4, xmm3                ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
+    psrldq      xmm0, 2                   ; xmm0=( 1  2  3  4  5  6  7 --)
+    pslldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(-- -- -- -- -- -- --  8)
+    movdqa      xmm5, xmm7
+    movdqa      xmm6, xmm3
+    psrldq      xmm5, (SIZEOF_XMMWORD-2)  ; xmm5=( 7 -- -- -- -- -- -- --)
+    pslldq      xmm6, 2                   ; xmm6=(--  8  9 10 11 12 13 14)
+
+    por         xmm0, xmm4                ; xmm0=( 1  2  3  4  5  6  7  8)
+    por         xmm5, xmm6                ; xmm5=( 7  8  9 10 11 12 13 14)
+
+    movdqa      xmm1, xmm7
+    movdqa      xmm2, xmm3
+    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
+    psrldq      xmm2, 2                   ; xmm2=( 9 10 11 12 13 14 15 --)
+    movdqa      xmm4, xmm3
+    psrldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(15 -- -- -- -- -- -- --)
+
+    por         xmm1, XMMWORD [wk(0)]     ; xmm1=(-1  0  1  2  3  4  5  6)
+    por         xmm2, XMMWORD [wk(2)]     ; xmm2=( 9 10 11 12 13 14 15 16)
+
+    movdqa      XMMWORD [wk(0)], xmm4
+
+    pmullw      xmm7, [rel PW_THREE]
+    pmullw      xmm3, [rel PW_THREE]
+    paddw       xmm1, [rel PW_EIGHT]
+    paddw       xmm5, [rel PW_EIGHT]
+    paddw       xmm0, [rel PW_SEVEN]
+    paddw       xmm2, [rel PW_SEVEN]
+
+    paddw       xmm1, xmm7
+    paddw       xmm5, xmm3
+    psrlw       xmm1, 4                 ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
+    psrlw       xmm5, 4                 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+    paddw       xmm0, xmm7
+    paddw       xmm2, xmm3
+    psrlw       xmm0, 4                 ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
+    psrlw       xmm2, 4                 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+    psllw       xmm0, BYTE_BIT
+    psllw       xmm2, BYTE_BIT
+    por         xmm1, xmm0              ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
+    por         xmm5, xmm2              ; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+    movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
+    movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
+
+    ; -- process the lower row
+
+    movdqa      xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
+    movdqa      xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm7, xmm6                ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
+    movdqa      xmm3, xmm4                ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
+    psrldq      xmm7, 2                   ; xmm7=( 1  2  3  4  5  6  7 --)
+    pslldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(-- -- -- -- -- -- --  8)
+    movdqa      xmm0, xmm6
+    movdqa      xmm2, xmm4
+    psrldq      xmm0, (SIZEOF_XMMWORD-2)  ; xmm0=( 7 -- -- -- -- -- -- --)
+    pslldq      xmm2, 2                   ; xmm2=(--  8  9 10 11 12 13 14)
+
+    por         xmm7, xmm3                ; xmm7=( 1  2  3  4  5  6  7  8)
+    por         xmm0, xmm2                ; xmm0=( 7  8  9 10 11 12 13 14)
+
+    movdqa      xmm1, xmm6
+    movdqa      xmm5, xmm4
+    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
+    psrldq      xmm5, 2                   ; xmm5=( 9 10 11 12 13 14 15 --)
+    movdqa      xmm3, xmm4
+    psrldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(15 -- -- -- -- -- -- --)
+
+    por         xmm1, XMMWORD [wk(1)]     ; xmm1=(-1  0  1  2  3  4  5  6)
+    por         xmm5, XMMWORD [wk(3)]     ; xmm5=( 9 10 11 12 13 14 15 16)
+
+    movdqa      XMMWORD [wk(1)], xmm3
+
+    pmullw      xmm6, [rel PW_THREE]
+    pmullw      xmm4, [rel PW_THREE]
+    paddw       xmm1, [rel PW_EIGHT]
+    paddw       xmm0, [rel PW_EIGHT]
+    paddw       xmm7, [rel PW_SEVEN]
+    paddw       xmm5, [rel PW_SEVEN]
+
+    paddw       xmm1, xmm6
+    paddw       xmm0, xmm4
+    psrlw       xmm1, 4                 ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
+    psrlw       xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+    paddw       xmm7, xmm6
+    paddw       xmm5, xmm4
+    psrlw       xmm7, 4                 ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
+    psrlw       xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+    psllw       xmm7, BYTE_BIT
+    psllw       xmm5, BYTE_BIT
+    por         xmm1, xmm7              ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
+    por         xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
+    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
+
+    sub         rax, byte SIZEOF_XMMWORD
+    add         rcx, byte 1*SIZEOF_XMMWORD  ; inptr1(above)
+    add         rbx, byte 1*SIZEOF_XMMWORD  ; inptr0
+    add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr1(below)
+    add         rdx, byte 2*SIZEOF_XMMWORD  ; outptr0
+    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr1
+    cmp         rax, byte SIZEOF_XMMWORD
+    ja          near .columnloop
+    test        rax, rax
+    jnz         near .columnloop_last
+
+    pop         rsi
+    pop         rdi
+    pop         rcx
+    pop         rax
+
+    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         rcx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)
+
+EXTN(jsimd_h2v1_upsample_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+
+    mov         edx, r11d
+    add         rdx, byte (2*SIZEOF_XMMWORD)-1
+    and         rdx, byte -(2*SIZEOF_XMMWORD)
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          short .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
+.rowloop:
+    push        rdi
+    push        rsi
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
+    mov         rax, rdx                ; colctr
+.columnloop:
+
+    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+
+    movdqa      xmm1, xmm0
+    punpcklbw   xmm0, xmm0
+    punpckhbw   xmm1, xmm1
+
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+    sub         rax, byte 2*SIZEOF_XMMWORD
+    jz          short .nextrow
+
+    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, xmm2
+    punpckhbw   xmm3, xmm3
+
+    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
+
+    sub         rax, byte 2*SIZEOF_XMMWORD
+    jz          short .nextrow
+
+    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
+    add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr
+    jmp         short .columnloop
+
+.nextrow:
+    pop         rsi
+    pop         rdi
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         rcx                        ; rowctr
+    jg          short .rowloop
+
+.return:
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)
+
+EXTN(jsimd_h2v2_upsample_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+    push        rbx
+
+    mov         edx, r11d
+    add         rdx, byte (2*SIZEOF_XMMWORD)-1
+    and         rdx, byte -(2*SIZEOF_XMMWORD)
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          near .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
+.rowloop:
+    push        rdi
+    push        rsi
+
+    mov         rsip, JSAMPROW [rsi]                   ; inptr
+    mov         rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
+    mov         rax, rdx                               ; colctr
+.columnloop:
+
+    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+
+    movdqa      xmm1, xmm0
+    punpcklbw   xmm0, xmm0
+    punpckhbw   xmm1, xmm1
+
+    movdqa      XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
+    movdqa      XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+    sub         rax, byte 2*SIZEOF_XMMWORD
+    jz          short .nextrow
+
+    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, xmm2
+    punpckhbw   xmm3, xmm3
+
+    movdqa      XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
+    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
+
+    sub         rax, byte 2*SIZEOF_XMMWORD
+    jz          short .nextrow
+
+    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
+    add         rbx, byte 4*SIZEOF_XMMWORD  ; outptr0
+    add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr1
+    jmp         short .columnloop
+
+.nextrow:
+    pop         rsi
+    pop         rdi
+
+    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         rcx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctflt-sse.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctflt-sse.asm
new file mode 100644
index 0000000000..ef2796649b
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctflt-sse.asm
@@ -0,0 +1,355 @@
+;
+; jfdctflt.asm - floating-point FDCT (64-bit SSE)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro  unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+    shufps      %1, %2, 0x44
+%endmacro
+
+%macro  unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+    shufps      %1, %2, 0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_float_sse)
+
+EXTN(jconst_fdct_float_sse):
+
+PD_0_382 times 4 dd 0.382683432365089771728460
+PD_0_707 times 4 dd 0.707106781186547524400844
+PD_0_541 times 4 dd 0.541196100146196984399723
+PD_1_306 times 4 dd 1.306562964876376527856643
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_sse(FAST_FLOAT *data)
+;
+
+; r10 = FAST_FLOAT *data
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_float_sse)
+
+EXTN(jsimd_fdct_float_sse):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 1
+
+    ; ---- Pass 1: process rows.
+
+    mov         rdx, r10                ; (FAST_FLOAT *)
+    mov         rcx, DCTSIZE/4
+.rowloop:
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+    ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+    movaps      xmm4, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm1              ; xmm0=(20 30 21 31)
+    unpckhps    xmm4, xmm1              ; xmm4=(22 32 23 33)
+    movaps      xmm5, xmm2              ; transpose coefficients(phase 1)
+    unpcklps    xmm2, xmm3              ; xmm2=(24 34 25 35)
+    unpckhps    xmm5, xmm3              ; xmm5=(26 36 27 37)
+
+    movaps      xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+    ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+    movaps      XMMWORD [wk(0)], xmm4   ; wk(0)=(22 32 23 33)
+    movaps      XMMWORD [wk(1)], xmm2   ; wk(1)=(24 34 25 35)
+
+    movaps      xmm4, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
+    unpckhps    xmm4, xmm7              ; xmm4=(02 12 03 13)
+    movaps      xmm2, xmm1              ; transpose coefficients(phase 1)
+    unpcklps    xmm1, xmm3              ; xmm1=(04 14 05 15)
+    unpckhps    xmm2, xmm3              ; xmm2=(06 16 07 17)
+
+    movaps      xmm7, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm0              ; xmm6=(00 10 20 30)=data0
+    unpckhps2   xmm7, xmm0              ; xmm7=(01 11 21 31)=data1
+    movaps      xmm3, xmm2              ; transpose coefficients(phase 2)
+    unpcklps2   xmm2, xmm5              ; xmm2=(06 16 26 36)=data6
+    unpckhps2   xmm3, xmm5              ; xmm3=(07 17 27 37)=data7
+
+    movaps      xmm0, xmm7
+    movaps      xmm5, xmm6
+    subps       xmm7, xmm2              ; xmm7=data1-data6=tmp6
+    subps       xmm6, xmm3              ; xmm6=data0-data7=tmp7
+    addps       xmm0, xmm2              ; xmm0=data1+data6=tmp1
+    addps       xmm5, xmm3              ; xmm5=data0+data7=tmp0
+
+    movaps      xmm2, XMMWORD [wk(0)]   ; xmm2=(22 32 23 33)
+    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=(24 34 25 35)
+    movaps      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
+    movaps      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+    movaps      xmm7, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(02 12 22 32)=data2
+    unpckhps2   xmm7, xmm2              ; xmm7=(03 13 23 33)=data3
+    movaps      xmm6, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm3              ; xmm1=(04 14 24 34)=data4
+    unpckhps2   xmm6, xmm3              ; xmm6=(05 15 25 35)=data5
+
+    movaps      xmm2, xmm7
+    movaps      xmm3, xmm4
+    addps       xmm7, xmm1              ; xmm7=data3+data4=tmp3
+    addps       xmm4, xmm6              ; xmm4=data2+data5=tmp2
+    subps       xmm2, xmm1              ; xmm2=data3-data4=tmp4
+    subps       xmm3, xmm6              ; xmm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movaps      xmm1, xmm5
+    movaps      xmm6, xmm0
+    subps       xmm5, xmm7              ; xmm5=tmp13
+    subps       xmm0, xmm4              ; xmm0=tmp12
+    addps       xmm1, xmm7              ; xmm1=tmp10
+    addps       xmm6, xmm4              ; xmm6=tmp11
+
+    addps       xmm0, xmm5
+    mulps       xmm0, [rel PD_0_707]    ; xmm0=z1
+
+    movaps      xmm7, xmm1
+    movaps      xmm4, xmm5
+    subps       xmm1, xmm6              ; xmm1=data4
+    subps       xmm5, xmm0              ; xmm5=data6
+    addps       xmm7, xmm6              ; xmm7=data0
+    addps       xmm4, xmm0              ; xmm4=data2
+
+    movaps      XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+    ; -- Odd part
+
+    movaps      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
+    movaps      xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
+
+    addps       xmm2, xmm3              ; xmm2=tmp10
+    addps       xmm3, xmm6              ; xmm3=tmp11
+    addps       xmm6, xmm0              ; xmm6=tmp12, xmm0=tmp7
+
+    mulps       xmm3, [rel PD_0_707]    ; xmm3=z3
+
+    movaps      xmm1, xmm2              ; xmm1=tmp10
+    subps       xmm2, xmm6
+    mulps       xmm2, [rel PD_0_382]    ; xmm2=z5
+    mulps       xmm1, [rel PD_0_541]    ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+    mulps       xmm6, [rel PD_1_306]    ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+    addps       xmm1, xmm2              ; xmm1=z2
+    addps       xmm6, xmm2              ; xmm6=z4
+
+    movaps      xmm5, xmm0
+    subps       xmm0, xmm3              ; xmm0=z13
+    addps       xmm5, xmm3              ; xmm5=z11
+
+    movaps      xmm7, xmm0
+    movaps      xmm4, xmm5
+    subps       xmm0, xmm1              ; xmm0=data3
+    subps       xmm5, xmm6              ; xmm5=data7
+    addps       xmm7, xmm1              ; xmm7=data5
+    addps       xmm4, xmm6              ; xmm4=data1
+
+    movaps      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+    add         rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
+    dec         rcx
+    jnz         near .rowloop
+
+    ; ---- Pass 2: process columns.
+
+    mov         rdx, r10                ; (FAST_FLOAT *)
+    mov         rcx, DCTSIZE/4
+.columnloop:
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+    ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+    movaps      xmm4, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm1              ; xmm0=(02 03 12 13)
+    unpckhps    xmm4, xmm1              ; xmm4=(22 23 32 33)
+    movaps      xmm5, xmm2              ; transpose coefficients(phase 1)
+    unpcklps    xmm2, xmm3              ; xmm2=(42 43 52 53)
+    unpckhps    xmm5, xmm3              ; xmm5=(62 63 72 73)
+
+    movaps      xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+    ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+    movaps      XMMWORD [wk(0)], xmm4   ; wk(0)=(22 23 32 33)
+    movaps      XMMWORD [wk(1)], xmm2   ; wk(1)=(42 43 52 53)
+
+    movaps      xmm4, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 01 10 11)
+    unpckhps    xmm4, xmm7              ; xmm4=(20 21 30 31)
+    movaps      xmm2, xmm1              ; transpose coefficients(phase 1)
+    unpcklps    xmm1, xmm3              ; xmm1=(40 41 50 51)
+    unpckhps    xmm2, xmm3              ; xmm2=(60 61 70 71)
+
+    movaps      xmm7, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm0              ; xmm6=(00 01 02 03)=data0
+    unpckhps2   xmm7, xmm0              ; xmm7=(10 11 12 13)=data1
+    movaps      xmm3, xmm2              ; transpose coefficients(phase 2)
+    unpcklps2   xmm2, xmm5              ; xmm2=(60 61 62 63)=data6
+    unpckhps2   xmm3, xmm5              ; xmm3=(70 71 72 73)=data7
+
+    movaps      xmm0, xmm7
+    movaps      xmm5, xmm6
+    subps       xmm7, xmm2              ; xmm7=data1-data6=tmp6
+    subps       xmm6, xmm3              ; xmm6=data0-data7=tmp7
+    addps       xmm0, xmm2              ; xmm0=data1+data6=tmp1
+    addps       xmm5, xmm3              ; xmm5=data0+data7=tmp0
+
+    movaps      xmm2, XMMWORD [wk(0)]   ; xmm2=(22 23 32 33)
+    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53)
+    movaps      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
+    movaps      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+    movaps      xmm7, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(20 21 22 23)=data2
+    unpckhps2   xmm7, xmm2              ; xmm7=(30 31 32 33)=data3
+    movaps      xmm6, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm3              ; xmm1=(40 41 42 43)=data4
+    unpckhps2   xmm6, xmm3              ; xmm6=(50 51 52 53)=data5
+
+    movaps      xmm2, xmm7
+    movaps      xmm3, xmm4
+    addps       xmm7, xmm1              ; xmm7=data3+data4=tmp3
+    addps       xmm4, xmm6              ; xmm4=data2+data5=tmp2
+    subps       xmm2, xmm1              ; xmm2=data3-data4=tmp4
+    subps       xmm3, xmm6              ; xmm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movaps      xmm1, xmm5
+    movaps      xmm6, xmm0
+    subps       xmm5, xmm7              ; xmm5=tmp13
+    subps       xmm0, xmm4              ; xmm0=tmp12
+    addps       xmm1, xmm7              ; xmm1=tmp10
+    addps       xmm6, xmm4              ; xmm6=tmp11
+
+    addps       xmm0, xmm5
+    mulps       xmm0, [rel PD_0_707]    ; xmm0=z1
+
+    movaps      xmm7, xmm1
+    movaps      xmm4, xmm5
+    subps       xmm1, xmm6              ; xmm1=data4
+    subps       xmm5, xmm0              ; xmm5=data6
+    addps       xmm7, xmm6              ; xmm7=data0
+    addps       xmm4, xmm0              ; xmm4=data2
+
+    movaps      XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+    ; -- Odd part
+
+    movaps      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
+    movaps      xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
+
+    addps       xmm2, xmm3              ; xmm2=tmp10
+    addps       xmm3, xmm6              ; xmm3=tmp11
+    addps       xmm6, xmm0              ; xmm6=tmp12, xmm0=tmp7
+
+    mulps       xmm3, [rel PD_0_707]    ; xmm3=z3
+
+    movaps      xmm1, xmm2              ; xmm1=tmp10
+    subps       xmm2, xmm6
+    mulps       xmm2, [rel PD_0_382]    ; xmm2=z5
+    mulps       xmm1, [rel PD_0_541]    ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+    mulps       xmm6, [rel PD_1_306]    ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+    addps       xmm1, xmm2              ; xmm1=z2
+    addps       xmm6, xmm2              ; xmm6=z4
+
+    movaps      xmm5, xmm0
+    subps       xmm0, xmm3              ; xmm0=z13
+    addps       xmm5, xmm3              ; xmm5=z11
+
+    movaps      xmm7, xmm0
+    movaps      xmm4, xmm5
+    subps       xmm0, xmm1              ; xmm0=data3
+    subps       xmm5, xmm6              ; xmm5=data7
+    addps       xmm7, xmm1              ; xmm7=data5
+    addps       xmm4, xmm6              ; xmm4=data1
+
+    movaps      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+    add         rdx, byte 4*SIZEOF_FAST_FLOAT
+    dec         rcx
+    jnz         near .columnloop
+
+    uncollect_args 1
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctfst-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctfst-sse2.asm
new file mode 100644
index 0000000000..2e1bfe6e8c
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctfst-sse2.asm
@@ -0,0 +1,389 @@
+;
+; jfdctfst.asm - fast integer FDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  8  ; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382 equ  98  ; FIX(0.382683433)
+F_0_541 equ 139  ; FIX(0.541196100)
+F_0_707 equ 181  ; FIX(0.707106781)
+F_1_306 equ 334  ; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS)  ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS)  ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS)  ; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS  2
+%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_ifast_sse2)
+
+EXTN(jconst_fdct_ifast_sse2):
+
+PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
+PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
+PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
+PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_sse2(DCTELEM *data)
+;
+
+; r10 = DCTELEM *data
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_ifast_sse2)
+
+EXTN(jsimd_fdct_ifast_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 1
+
+    ; ---- Pass 1: process rows.
+
+    mov         rdx, r10                ; (DCTELEM *)
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+    ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+    ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm4, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm1              ; xmm0=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm4, xmm1              ; xmm4=(04 14 05 15 06 16 07 17)
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm3              ; xmm2=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm5, xmm3              ; xmm5=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+    ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+    ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
+    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm2, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm7              ; xmm6=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm2, xmm7              ; xmm2=(44 54 45 55 46 56 47 57)
+    movdqa      xmm5, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm3              ; xmm1=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm5, xmm3              ; xmm5=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm7, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm1              ; xmm6=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm7, xmm1              ; xmm7=(42 52 62 72 43 53 63 73)
+    movdqa      xmm3, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm5              ; xmm2=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm3, xmm5              ; xmm3=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=(42 52 62 72 43 53 63 73)
+    movdqa      XMMWORD [wk(1)], xmm2   ; wk(1)=(44 54 64 74 45 55 65 75)
+
+    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm1              ; xmm0=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm7, xmm1              ; xmm7=(02 12 22 32 03 13 23 33)
+    movdqa      xmm2, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm5              ; xmm4=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm2, xmm5              ; xmm2=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm1, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm6              ; xmm0=(00 10 20 30 40 50 60 70)=data0
+    punpckhqdq  xmm1, xmm6              ; xmm1=(01 11 21 31 41 51 61 71)=data1
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm3              ; xmm2=(06 16 26 36 46 56 66 76)=data6
+    punpckhqdq  xmm5, xmm3              ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+    movdqa      xmm6, xmm1
+    movdqa      xmm3, xmm0
+    psubw       xmm1, xmm2              ; xmm1=data1-data6=tmp6
+    psubw       xmm0, xmm5              ; xmm0=data0-data7=tmp7
+    paddw       xmm6, xmm2              ; xmm6=data1+data6=tmp1
+    paddw       xmm3, xmm5              ; xmm3=data0+data7=tmp0
+
+    movdqa      xmm2, XMMWORD [wk(0)]   ; xmm2=(42 52 62 72 43 53 63 73)
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(44 54 64 74 45 55 65 75)
+    movdqa      XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
+
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm2              ; xmm7=(02 12 22 32 42 52 62 72)=data2
+    punpckhqdq  xmm1, xmm2              ; xmm1=(03 13 23 33 43 53 63 73)=data3
+    movdqa      xmm0, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm5              ; xmm4=(04 14 24 34 44 54 64 74)=data4
+    punpckhqdq  xmm0, xmm5              ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+    movdqa      xmm2, xmm1
+    movdqa      xmm5, xmm7
+    paddw       xmm1, xmm4              ; xmm1=data3+data4=tmp3
+    paddw       xmm7, xmm0              ; xmm7=data2+data5=tmp2
+    psubw       xmm2, xmm4              ; xmm2=data3-data4=tmp4
+    psubw       xmm5, xmm0              ; xmm5=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm0, xmm6
+    psubw       xmm3, xmm1              ; xmm3=tmp13
+    psubw       xmm6, xmm7              ; xmm6=tmp12
+    paddw       xmm4, xmm1              ; xmm4=tmp10
+    paddw       xmm0, xmm7              ; xmm0=tmp11
+
+    paddw       xmm6, xmm3
+    psllw       xmm6, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm6, [rel PW_F0707]    ; xmm6=z1
+
+    movdqa      xmm1, xmm4
+    movdqa      xmm7, xmm3
+    psubw       xmm4, xmm0              ; xmm4=data4
+    psubw       xmm3, xmm6              ; xmm3=data6
+    paddw       xmm1, xmm0              ; xmm1=data0
+    paddw       xmm7, xmm6              ; xmm7=data2
+
+    movdqa      xmm0, XMMWORD [wk(0)]   ; xmm0=tmp6
+    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=tmp7
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=data4
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=data6
+
+    ; -- Odd part
+
+    paddw       xmm2, xmm5              ; xmm2=tmp10
+    paddw       xmm5, xmm0              ; xmm5=tmp11
+    paddw       xmm0, xmm6              ; xmm0=tmp12, xmm6=tmp7
+
+    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
+
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm5, [rel PW_F0707]    ; xmm5=z3
+
+    movdqa      xmm4, xmm2              ; xmm4=tmp10
+    psubw       xmm2, xmm0
+    pmulhw      xmm2, [rel PW_F0382]    ; xmm2=z5
+    pmulhw      xmm4, [rel PW_F0541]    ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+    pmulhw      xmm0, [rel PW_F1306]    ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+    paddw       xmm4, xmm2              ; xmm4=z2
+    paddw       xmm0, xmm2              ; xmm0=z4
+
+    movdqa      xmm3, xmm6
+    psubw       xmm6, xmm5              ; xmm6=z13
+    paddw       xmm3, xmm5              ; xmm3=z11
+
+    movdqa      xmm2, xmm6
+    movdqa      xmm5, xmm3
+    psubw       xmm6, xmm4              ; xmm6=data3
+    psubw       xmm3, xmm0              ; xmm3=data7
+    paddw       xmm2, xmm4              ; xmm2=data5
+    paddw       xmm5, xmm0              ; xmm5=data1
+
+    ; ---- Pass 2: process columns.
+
+    ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+    ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm4, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm5              ; xmm1=(00 01 10 11 20 21 30 31)
+    punpckhwd   xmm4, xmm5              ; xmm4=(40 41 50 51 60 61 70 71)
+    movdqa      xmm0, xmm7              ; transpose coefficients(phase 1)
+    punpcklwd   xmm7, xmm6              ; xmm7=(02 03 12 13 22 23 32 33)
+    punpckhwd   xmm0, xmm6              ; xmm0=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=col4
+    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=col6
+
+    ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+    ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=(02 03 12 13 22 23 32 33)
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm7, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm2              ; xmm5=(04 05 14 15 24 25 34 35)
+    punpckhwd   xmm7, xmm2              ; xmm7=(44 45 54 55 64 65 74 75)
+    movdqa      xmm0, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm3              ; xmm6=(06 07 16 17 26 27 36 37)
+    punpckhwd   xmm0, xmm3              ; xmm0=(46 47 56 57 66 67 76 77)
+
+    movdqa      xmm2, xmm5              ; transpose coefficients(phase 2)
+    punpckldq   xmm5, xmm6              ; xmm5=(04 05 06 07 14 15 16 17)
+    punpckhdq   xmm2, xmm6              ; xmm2=(24 25 26 27 34 35 36 37)
+    movdqa      xmm3, xmm7              ; transpose coefficients(phase 2)
+    punpckldq   xmm7, xmm0              ; xmm7=(44 45 46 47 54 55 56 57)
+    punpckhdq   xmm3, xmm0              ; xmm3=(64 65 66 67 74 75 76 77)
+
+    movdqa      xmm6, XMMWORD [wk(0)]   ; xmm6=(02 03 12 13 22 23 32 33)
+    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(42 43 52 53 62 63 72 73)
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(24 25 26 27 34 35 36 37)
+    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=(44 45 46 47 54 55 56 57)
+
+    movdqa      xmm2, xmm1              ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm6              ; xmm1=(00 01 02 03 10 11 12 13)
+    punpckhdq   xmm2, xmm6              ; xmm2=(20 21 22 23 30 31 32 33)
+    movdqa      xmm7, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm0              ; xmm4=(40 41 42 43 50 51 52 53)
+    punpckhdq   xmm7, xmm0              ; xmm7=(60 61 62 63 70 71 72 73)
+
+    movdqa      xmm6, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm5              ; xmm1=(00 01 02 03 04 05 06 07)=data0
+    punpckhqdq  xmm6, xmm5              ; xmm6=(10 11 12 13 14 15 16 17)=data1
+    movdqa      xmm0, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm3              ; xmm7=(60 61 62 63 64 65 66 67)=data6
+    punpckhqdq  xmm0, xmm3              ; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+    movdqa      xmm5, xmm6
+    movdqa      xmm3, xmm1
+    psubw       xmm6, xmm7              ; xmm6=data1-data6=tmp6
+    psubw       xmm1, xmm0              ; xmm1=data0-data7=tmp7
+    paddw       xmm5, xmm7              ; xmm5=data1+data6=tmp1
+    paddw       xmm3, xmm0              ; xmm3=data0+data7=tmp0
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=(24 25 26 27 34 35 36 37)
+    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(44 45 46 47 54 55 56 57)
+    movdqa      XMMWORD [wk(0)], xmm6   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=tmp7
+
+    movdqa      xmm6, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm7              ; xmm2=(20 21 22 23 24 25 26 27)=data2
+    punpckhqdq  xmm6, xmm7              ; xmm6=(30 31 32 33 34 35 36 37)=data3
+    movdqa      xmm1, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm0              ; xmm4=(40 41 42 43 44 45 46 47)=data4
+    punpckhqdq  xmm1, xmm0              ; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+    movdqa      xmm7, xmm6
+    movdqa      xmm0, xmm2
+    paddw       xmm6, xmm4              ; xmm6=data3+data4=tmp3
+    paddw       xmm2, xmm1              ; xmm2=data2+data5=tmp2
+    psubw       xmm7, xmm4              ; xmm7=data3-data4=tmp4
+    psubw       xmm0, xmm1              ; xmm0=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm1, xmm5
+    psubw       xmm3, xmm6              ; xmm3=tmp13
+    psubw       xmm5, xmm2              ; xmm5=tmp12
+    paddw       xmm4, xmm6              ; xmm4=tmp10
+    paddw       xmm1, xmm2              ; xmm1=tmp11
+
+    paddw       xmm5, xmm3
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm5, [rel PW_F0707]    ; xmm5=z1
+
+    movdqa      xmm6, xmm4
+    movdqa      xmm2, xmm3
+    psubw       xmm4, xmm1              ; xmm4=data4
+    psubw       xmm3, xmm5              ; xmm3=data6
+    paddw       xmm6, xmm1              ; xmm6=data0
+    paddw       xmm2, xmm5              ; xmm2=data2
+
+    movdqa      XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
+    movdqa      XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
+    movdqa      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
+    movdqa      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+    ; -- Odd part
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=tmp6
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
+
+    paddw       xmm7, xmm0              ; xmm7=tmp10
+    paddw       xmm0, xmm1              ; xmm0=tmp11
+    paddw       xmm1, xmm5              ; xmm1=tmp12, xmm5=tmp7
+
+    psllw       xmm7, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm1, PRE_MULTIPLY_SCALE_BITS
+
+    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm0, [rel PW_F0707]    ; xmm0=z3
+
+    movdqa      xmm4, xmm7              ; xmm4=tmp10
+    psubw       xmm7, xmm1
+    pmulhw      xmm7, [rel PW_F0382]    ; xmm7=z5
+    pmulhw      xmm4, [rel PW_F0541]    ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+    pmulhw      xmm1, [rel PW_F1306]    ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+    paddw       xmm4, xmm7              ; xmm4=z2
+    paddw       xmm1, xmm7              ; xmm1=z4
+
+    movdqa      xmm3, xmm5
+    psubw       xmm5, xmm0              ; xmm5=z13
+    paddw       xmm3, xmm0              ; xmm3=z11
+
+    movdqa      xmm6, xmm5
+    movdqa      xmm2, xmm3
+    psubw       xmm5, xmm4              ; xmm5=data3
+    psubw       xmm3, xmm1              ; xmm3=data7
+    paddw       xmm6, xmm4              ; xmm6=data5
+    paddw       xmm2, xmm1              ; xmm2=data1
+
+    movdqa      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
+    movdqa      XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
+    movdqa      XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
+    movdqa      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+    uncollect_args 1
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctint-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctint-avx2.asm
new file mode 100644
index 0000000000..e56258b48a
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctint-avx2.asm
@@ -0,0 +1,320 @@
+;
+; jfdctint.asm - accurate integer FDCT (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+
+%macro dotranspose 8
+    ; %1=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
+    ; %2=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
+    ; %3=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
+    ; %4=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
+
+    vpunpcklwd  %5, %1, %2
+    vpunpckhwd  %6, %1, %2
+    vpunpcklwd  %7, %3, %4
+    vpunpckhwd  %8, %3, %4
+    ; transpose coefficients(phase 1)
+    ; %5=(00 10 01 11 02 12 03 13  40 50 41 51 42 52 43 53)
+    ; %6=(04 14 05 15 06 16 07 17  44 54 45 55 46 56 47 57)
+    ; %7=(20 30 21 31 22 32 23 33  60 70 61 71 62 72 63 73)
+    ; %8=(24 34 25 35 26 36 27 37  64 74 65 75 66 76 67 77)
+
+    vpunpckldq  %1, %5, %7
+    vpunpckhdq  %2, %5, %7
+    vpunpckldq  %3, %6, %8
+    vpunpckhdq  %4, %6, %8
+    ; transpose coefficients(phase 2)
+    ; %1=(00 10 20 30 01 11 21 31  40 50 60 70 41 51 61 71)
+    ; %2=(02 12 22 32 03 13 23 33  42 52 62 72 43 53 63 73)
+    ; %3=(04 14 24 34 05 15 25 35  44 54 64 74 45 55 65 75)
+    ; %4=(06 16 26 36 07 17 27 37  46 56 66 76 47 57 67 77)
+
+    vpermq      %1, %1, 0x8D
+    vpermq      %2, %2, 0x8D
+    vpermq      %3, %3, 0xD8
+    vpermq      %4, %4, 0xD8
+    ; transpose coefficients(phase 3)
+    ; %1=(01 11 21 31 41 51 61 71  00 10 20 30 40 50 60 70)
+    ; %2=(03 13 23 33 43 53 63 73  02 12 22 32 42 52 62 72)
+    ; %3=(04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75)
+    ; %4=(06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77)
+%endmacro
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+; %9:    Pass (1 or 2)
+
+%macro dodct 9
+    vpsubw      %5, %1, %4              ; %5=data1_0-data6_7=tmp6_7
+    vpaddw      %6, %1, %4              ; %6=data1_0+data6_7=tmp1_0
+    vpaddw      %7, %2, %3              ; %7=data3_2+data4_5=tmp3_2
+    vpsubw      %8, %2, %3              ; %8=data3_2-data4_5=tmp4_5
+
+    ; -- Even part
+
+    vperm2i128  %6, %6, %6, 0x01        ; %6=tmp0_1
+    vpaddw      %1, %6, %7              ; %1=tmp0_1+tmp3_2=tmp10_11
+    vpsubw      %6, %6, %7              ; %6=tmp0_1-tmp3_2=tmp13_12
+
+    vperm2i128  %7, %1, %1, 0x01        ; %7=tmp11_10
+    vpsignw     %1, %1, [rel PW_1_NEG1]  ; %1=tmp10_neg11
+    vpaddw      %7, %7, %1              ; %7=(tmp10+tmp11)_(tmp10-tmp11)
+%if %9 == 1
+    vpsllw      %1, %7, PASS1_BITS      ; %1=data0_4
+%else
+    vpaddw      %7, %7, [rel PW_DESCALE_P2X]
+    vpsraw      %1, %7, PASS1_BITS      ; %1=data0_4
+%endif
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    vperm2i128  %7, %6, %6, 0x01        ; %7=tmp12_13
+    vpunpcklwd  %2, %6, %7
+    vpunpckhwd  %6, %6, %7
+    vpmaddwd    %2, %2, [rel PW_F130_F054_MF130_F054]  ; %2=data2_6L
+    vpmaddwd    %6, %6, [rel PW_F130_F054_MF130_F054]  ; %6=data2_6H
+
+    vpaddd      %2, %2, [rel PD_DESCALE_P %+ %9]
+    vpaddd      %6, %6, [rel PD_DESCALE_P %+ %9]
+    vpsrad      %2, %2, DESCALE_P %+ %9
+    vpsrad      %6, %6, DESCALE_P %+ %9
+
+    vpackssdw   %3, %2, %6              ; %6=data2_6
+
+    ; -- Odd part
+
+    vpaddw      %7, %8, %5              ; %7=tmp4_5+tmp6_7=z3_4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    vperm2i128  %2, %7, %7, 0x01        ; %2=z4_3
+    vpunpcklwd  %6, %7, %2
+    vpunpckhwd  %7, %7, %2
+    vpmaddwd    %6, %6, [rel PW_MF078_F117_F078_F117]  ; %6=z3_4L
+    vpmaddwd    %7, %7, [rel PW_MF078_F117_F078_F117]  ; %7=z3_4H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    vperm2i128  %4, %5, %5, 0x01        ; %4=tmp7_6
+    vpunpcklwd  %2, %8, %4
+    vpunpckhwd  %4, %8, %4
+    vpmaddwd    %2, %2, [rel PW_MF060_MF089_MF050_MF256]  ; %2=tmp4_5L
+    vpmaddwd    %4, %4, [rel PW_MF060_MF089_MF050_MF256]  ; %4=tmp4_5H
+
+    vpaddd      %2, %2, %6              ; %2=data7_5L
+    vpaddd      %4, %4, %7              ; %4=data7_5H
+
+    vpaddd      %2, %2, [rel PD_DESCALE_P %+ %9]
+    vpaddd      %4, %4, [rel PD_DESCALE_P %+ %9]
+    vpsrad      %2, %2, DESCALE_P %+ %9
+    vpsrad      %4, %4, DESCALE_P %+ %9
+
+    vpackssdw   %4, %2, %4              ; %4=data7_5
+
+    vperm2i128  %2, %8, %8, 0x01        ; %2=tmp5_4
+    vpunpcklwd  %8, %5, %2
+    vpunpckhwd  %5, %5, %2
+    vpmaddwd    %8, %8, [rel PW_F050_MF256_F060_MF089]  ; %8=tmp6_7L
+    vpmaddwd    %5, %5, [rel PW_F050_MF256_F060_MF089]  ; %5=tmp6_7H
+
+    vpaddd      %8, %8, %6              ; %8=data3_1L
+    vpaddd      %5, %5, %7              ; %5=data3_1H
+
+    vpaddd      %8, %8, [rel PD_DESCALE_P %+ %9]
+    vpaddd      %5, %5, [rel PD_DESCALE_P %+ %9]
+    vpsrad      %8, %8, DESCALE_P %+ %9
+    vpsrad      %5, %5, DESCALE_P %+ %9
+
+    vpackssdw   %2, %8, %5              ; %2=data3_1
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_islow_avx2)
+
+EXTN(jconst_fdct_islow_avx2):
+
+PW_F130_F054_MF130_F054    times 4  dw  (F_0_541 + F_0_765),  F_0_541
+                           times 4  dw  (F_0_541 - F_1_847),  F_0_541
+PW_MF078_F117_F078_F117    times 4  dw  (F_1_175 - F_1_961),  F_1_175
+                           times 4  dw  (F_1_175 - F_0_390),  F_1_175
+PW_MF060_MF089_MF050_MF256 times 4  dw  (F_0_298 - F_0_899), -F_0_899
+                           times 4  dw  (F_2_053 - F_2_562), -F_2_562
+PW_F050_MF256_F060_MF089   times 4  dw  (F_3_072 - F_2_562), -F_2_562
+                           times 4  dw  (F_1_501 - F_0_899), -F_0_899
+PD_DESCALE_P1              times 8  dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2              times 8  dd  1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X             times 16 dw  1 << (PASS1_BITS - 1)
+PW_1_NEG1                  times 8  dw  1
+                           times 8  dw -1
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_avx2(DCTELEM *data)
+;
+
+; r10 = DCTELEM *data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_islow_avx2)
+
+EXTN(jsimd_fdct_islow_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 1
+
+    ; ---- Pass 1: process rows.
+
+    vmovdqu     ymm4, YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)]
+    vmovdqu     ymm5, YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)]
+    vmovdqu     ymm6, YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)]
+    vmovdqu     ymm7, YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)]
+    ; ymm4=(00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17)
+    ; ymm5=(20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37)
+    ; ymm6=(40 41 42 43 44 45 46 47  50 51 52 53 54 55 56 57)
+    ; ymm7=(60 61 62 63 64 65 66 67  70 71 72 73 74 75 76 77)
+
+    vperm2i128  ymm0, ymm4, ymm6, 0x20
+    vperm2i128  ymm1, ymm4, ymm6, 0x31
+    vperm2i128  ymm2, ymm5, ymm7, 0x20
+    vperm2i128  ymm3, ymm5, ymm7, 0x31
+    ; ymm0=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
+    ; ymm1=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
+    ; ymm2=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
+    ; ymm3=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
+
+    dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+
+    dodct       ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1
+    ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5
+
+    ; ---- Pass 2: process columns.
+
+    vperm2i128  ymm4, ymm1, ymm3, 0x20  ; ymm4=data3_7
+    vperm2i128  ymm1, ymm1, ymm3, 0x31  ; ymm1=data1_5
+
+    dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+
+    dodct       ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2
+    ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5
+
+    vperm2i128 ymm3, ymm0, ymm1, 0x30   ; ymm3=data0_1
+    vperm2i128 ymm5, ymm2, ymm1, 0x20   ; ymm5=data2_3
+    vperm2i128 ymm6, ymm0, ymm4, 0x31   ; ymm6=data4_5
+    vperm2i128 ymm7, ymm2, ymm4, 0x21   ; ymm7=data6_7
+
+    vmovdqu     YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm3
+    vmovdqu     YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm5
+    vmovdqu     YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm6
+    vmovdqu     YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm7
+
+    vzeroupper
+    uncollect_args 1
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctint-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctint-sse2.asm
new file mode 100644
index 0000000000..ec1f383ccb
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctint-sse2.asm
@@ -0,0 +1,619 @@
+;
+; jfdctint.asm - accurate integer FDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_islow_sse2)
+
+EXTN(jconst_fdct_islow_sse2):
+
+PW_F130_F054   times 4 dw  (F_0_541 + F_0_765),  F_0_541
+PW_F054_MF130  times 4 dw  F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117  times 4 dw  (F_1_175 - F_1_961),  F_1_175
+PW_F117_F078   times 4 dw  F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 4 dw  (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060  times 4 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 4 dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050  times 4 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1  times 4 dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2  times 4 dd  1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X times 8 dw  1 << (PASS1_BITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_sse2(DCTELEM *data)
+;
+
+; r10 = DCTELEM *data
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  6
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_islow_sse2)
+
+EXTN(jsimd_fdct_islow_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 1
+
+    ; ---- Pass 1: process rows.
+
+    mov         rdx, r10                ; (DCTELEM *)
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+    ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+    ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm4, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm1              ; xmm0=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm4, xmm1              ; xmm4=(04 14 05 15 06 16 07 17)
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm3              ; xmm2=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm5, xmm3              ; xmm5=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+    ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+    ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
+    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm2, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm7              ; xmm6=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm2, xmm7              ; xmm2=(44 54 45 55 46 56 47 57)
+    movdqa      xmm5, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm3              ; xmm1=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm5, xmm3              ; xmm5=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm7, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm1              ; xmm6=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm7, xmm1              ; xmm7=(42 52 62 72 43 53 63 73)
+    movdqa      xmm3, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm5              ; xmm2=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm3, xmm5              ; xmm3=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
+    movdqa      XMMWORD [wk(2)], xmm7   ; wk(2)=(42 52 62 72 43 53 63 73)
+    movdqa      XMMWORD [wk(3)], xmm2   ; wk(3)=(44 54 64 74 45 55 65 75)
+
+    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm1              ; xmm0=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm7, xmm1              ; xmm7=(02 12 22 32 03 13 23 33)
+    movdqa      xmm2, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm5              ; xmm4=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm2, xmm5              ; xmm2=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm1, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm6              ; xmm0=(00 10 20 30 40 50 60 70)=data0
+    punpckhqdq  xmm1, xmm6              ; xmm1=(01 11 21 31 41 51 61 71)=data1
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm3              ; xmm2=(06 16 26 36 46 56 66 76)=data6
+    punpckhqdq  xmm5, xmm3              ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+    movdqa      xmm6, xmm1
+    movdqa      xmm3, xmm0
+    psubw       xmm1, xmm2              ; xmm1=data1-data6=tmp6
+    psubw       xmm0, xmm5              ; xmm0=data0-data7=tmp7
+    paddw       xmm6, xmm2              ; xmm6=data1+data6=tmp1
+    paddw       xmm3, xmm5              ; xmm3=data0+data7=tmp0
+
+    movdqa      xmm2, XMMWORD [wk(2)]   ; xmm2=(42 52 62 72 43 53 63 73)
+    movdqa      xmm5, XMMWORD [wk(3)]   ; xmm5=(44 54 64 74 45 55 65 75)
+    movdqa      XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
+
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm2              ; xmm7=(02 12 22 32 42 52 62 72)=data2
+    punpckhqdq  xmm1, xmm2              ; xmm1=(03 13 23 33 43 53 63 73)=data3
+    movdqa      xmm0, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm5              ; xmm4=(04 14 24 34 44 54 64 74)=data4
+    punpckhqdq  xmm0, xmm5              ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+    movdqa      xmm2, xmm1
+    movdqa      xmm5, xmm7
+    paddw       xmm1, xmm4              ; xmm1=data3+data4=tmp3
+    paddw       xmm7, xmm0              ; xmm7=data2+data5=tmp2
+    psubw       xmm2, xmm4              ; xmm2=data3-data4=tmp4
+    psubw       xmm5, xmm0              ; xmm5=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm0, xmm6
+    paddw       xmm3, xmm1              ; xmm3=tmp10
+    paddw       xmm6, xmm7              ; xmm6=tmp11
+    psubw       xmm4, xmm1              ; xmm4=tmp13
+    psubw       xmm0, xmm7              ; xmm0=tmp12
+
+    movdqa      xmm1, xmm3
+    paddw       xmm3, xmm6              ; xmm3=tmp10+tmp11
+    psubw       xmm1, xmm6              ; xmm1=tmp10-tmp11
+
+    psllw       xmm3, PASS1_BITS        ; xmm3=data0
+    psllw       xmm1, PASS1_BITS        ; xmm1=data4
+
+    movdqa      XMMWORD [wk(2)], xmm3   ; wk(2)=data0
+    movdqa      XMMWORD [wk(3)], xmm1   ; wk(3)=data4
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    movdqa      xmm7, xmm4              ; xmm4=tmp13
+    movdqa      xmm6, xmm4
+    punpcklwd   xmm7, xmm0              ; xmm0=tmp12
+    punpckhwd   xmm6, xmm0
+    movdqa      xmm4, xmm7
+    movdqa      xmm0, xmm6
+    pmaddwd     xmm7, [rel PW_F130_F054]   ; xmm7=data2L
+    pmaddwd     xmm6, [rel PW_F130_F054]   ; xmm6=data2H
+    pmaddwd     xmm4, [rel PW_F054_MF130]  ; xmm4=data6L
+    pmaddwd     xmm0, [rel PW_F054_MF130]  ; xmm0=data6H
+
+    paddd       xmm7, [rel PD_DESCALE_P1]
+    paddd       xmm6, [rel PD_DESCALE_P1]
+    psrad       xmm7, DESCALE_P1
+    psrad       xmm6, DESCALE_P1
+    paddd       xmm4, [rel PD_DESCALE_P1]
+    paddd       xmm0, [rel PD_DESCALE_P1]
+    psrad       xmm4, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+
+    packssdw    xmm7, xmm6              ; xmm7=data2
+    packssdw    xmm4, xmm0              ; xmm4=data6
+
+    movdqa      XMMWORD [wk(4)], xmm7   ; wk(4)=data2
+    movdqa      XMMWORD [wk(5)], xmm4   ; wk(5)=data6
+
+    ; -- Odd part
+
+    movdqa      xmm3, XMMWORD [wk(0)]   ; xmm3=tmp6
+    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp7
+
+    movdqa      xmm6, xmm2              ; xmm2=tmp4
+    movdqa      xmm0, xmm5              ; xmm5=tmp5
+    paddw       xmm6, xmm3              ; xmm6=z3
+    paddw       xmm0, xmm1              ; xmm0=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm7, xmm6
+    movdqa      xmm4, xmm6
+    punpcklwd   xmm7, xmm0
+    punpckhwd   xmm4, xmm0
+    movdqa      xmm6, xmm7
+    movdqa      xmm0, xmm4
+    pmaddwd     xmm7, [rel PW_MF078_F117]  ; xmm7=z3L
+    pmaddwd     xmm4, [rel PW_MF078_F117]  ; xmm4=z3H
+    pmaddwd     xmm6, [rel PW_F117_F078]   ; xmm6=z4L
+    pmaddwd     xmm0, [rel PW_F117_F078]   ; xmm0=z4H
+
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=z3L
+    movdqa      XMMWORD [wk(1)], xmm4   ; wk(1)=z3H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    movdqa      xmm7, xmm2
+    movdqa      xmm4, xmm2
+    punpcklwd   xmm7, xmm1
+    punpckhwd   xmm4, xmm1
+    movdqa      xmm2, xmm7
+    movdqa      xmm1, xmm4
+    pmaddwd     xmm7, [rel PW_MF060_MF089]  ; xmm7=tmp4L
+    pmaddwd     xmm4, [rel PW_MF060_MF089]  ; xmm4=tmp4H
+    pmaddwd     xmm2, [rel PW_MF089_F060]   ; xmm2=tmp7L
+    pmaddwd     xmm1, [rel PW_MF089_F060]   ; xmm1=tmp7H
+
+    paddd       xmm7, XMMWORD [wk(0)]   ; xmm7=data7L
+    paddd       xmm4, XMMWORD [wk(1)]   ; xmm4=data7H
+    paddd       xmm2, xmm6              ; xmm2=data1L
+    paddd       xmm1, xmm0              ; xmm1=data1H
+
+    paddd       xmm7, [rel PD_DESCALE_P1]
+    paddd       xmm4, [rel PD_DESCALE_P1]
+    psrad       xmm7, DESCALE_P1
+    psrad       xmm4, DESCALE_P1
+    paddd       xmm2, [rel PD_DESCALE_P1]
+    paddd       xmm1, [rel PD_DESCALE_P1]
+    psrad       xmm2, DESCALE_P1
+    psrad       xmm1, DESCALE_P1
+
+    packssdw    xmm7, xmm4              ; xmm7=data7
+    packssdw    xmm2, xmm1              ; xmm2=data1
+
+    movdqa      xmm4, xmm5
+    movdqa      xmm1, xmm5
+    punpcklwd   xmm4, xmm3
+    punpckhwd   xmm1, xmm3
+    movdqa      xmm5, xmm4
+    movdqa      xmm3, xmm1
+    pmaddwd     xmm4, [rel PW_MF050_MF256]  ; xmm4=tmp5L
+    pmaddwd     xmm1, [rel PW_MF050_MF256]  ; xmm1=tmp5H
+    pmaddwd     xmm5, [rel PW_MF256_F050]   ; xmm5=tmp6L
+    pmaddwd     xmm3, [rel PW_MF256_F050]   ; xmm3=tmp6H
+
+    paddd       xmm4, xmm6              ; xmm4=data5L
+    paddd       xmm1, xmm0              ; xmm1=data5H
+    paddd       xmm5, XMMWORD [wk(0)]   ; xmm5=data3L
+    paddd       xmm3, XMMWORD [wk(1)]   ; xmm3=data3H
+
+    paddd       xmm4, [rel PD_DESCALE_P1]
+    paddd       xmm1, [rel PD_DESCALE_P1]
+    psrad       xmm4, DESCALE_P1
+    psrad       xmm1, DESCALE_P1
+    paddd       xmm5, [rel PD_DESCALE_P1]
+    paddd       xmm3, [rel PD_DESCALE_P1]
+    psrad       xmm5, DESCALE_P1
+    psrad       xmm3, DESCALE_P1
+
+    packssdw    xmm4, xmm1              ; xmm4=data5
+    packssdw    xmm5, xmm3              ; xmm5=data3
+
+    ; ---- Pass 2: process columns.
+
+    movdqa      xmm6, XMMWORD [wk(2)]   ; xmm6=col0
+    movdqa      xmm0, XMMWORD [wk(4)]   ; xmm0=col2
+
+    ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+    ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm1, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm2              ; xmm6=(00 01 10 11 20 21 30 31)
+    punpckhwd   xmm1, xmm2              ; xmm1=(40 41 50 51 60 61 70 71)
+    movdqa      xmm3, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm5              ; xmm0=(02 03 12 13 22 23 32 33)
+    punpckhwd   xmm3, xmm5              ; xmm3=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm2, XMMWORD [wk(3)]   ; xmm2=col4
+    movdqa      xmm5, XMMWORD [wk(5)]   ; xmm5=col6
+
+    ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+    ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=(02 03 12 13 22 23 32 33)
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm0, xmm2              ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm4              ; xmm2=(04 05 14 15 24 25 34 35)
+    punpckhwd   xmm0, xmm4              ; xmm0=(44 45 54 55 64 65 74 75)
+    movdqa      xmm3, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm7              ; xmm5=(06 07 16 17 26 27 36 37)
+    punpckhwd   xmm3, xmm7              ; xmm3=(46 47 56 57 66 67 76 77)
+
+    movdqa      xmm4, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm5              ; xmm2=(04 05 06 07 14 15 16 17)
+    punpckhdq   xmm4, xmm5              ; xmm4=(24 25 26 27 34 35 36 37)
+    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm3              ; xmm0=(44 45 46 47 54 55 56 57)
+    punpckhdq   xmm7, xmm3              ; xmm7=(64 65 66 67 74 75 76 77)
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=(02 03 12 13 22 23 32 33)
+    movdqa      xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53 62 63 72 73)
+    movdqa      XMMWORD [wk(2)], xmm4   ; wk(2)=(24 25 26 27 34 35 36 37)
+    movdqa      XMMWORD [wk(3)], xmm0   ; wk(3)=(44 45 46 47 54 55 56 57)
+
+    movdqa      xmm4, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm5              ; xmm6=(00 01 02 03 10 11 12 13)
+    punpckhdq   xmm4, xmm5              ; xmm4=(20 21 22 23 30 31 32 33)
+    movdqa      xmm0, xmm1              ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm3              ; xmm1=(40 41 42 43 50 51 52 53)
+    punpckhdq   xmm0, xmm3              ; xmm0=(60 61 62 63 70 71 72 73)
+
+    movdqa      xmm5, xmm6              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm6, xmm2              ; xmm6=(00 01 02 03 04 05 06 07)=data0
+    punpckhqdq  xmm5, xmm2              ; xmm5=(10 11 12 13 14 15 16 17)=data1
+    movdqa      xmm3, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm7              ; xmm0=(60 61 62 63 64 65 66 67)=data6
+    punpckhqdq  xmm3, xmm7              ; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+    movdqa      xmm2, xmm5
+    movdqa      xmm7, xmm6
+    psubw       xmm5, xmm0              ; xmm5=data1-data6=tmp6
+    psubw       xmm6, xmm3              ; xmm6=data0-data7=tmp7
+    paddw       xmm2, xmm0              ; xmm2=data1+data6=tmp1
+    paddw       xmm7, xmm3              ; xmm7=data0+data7=tmp0
+
+    movdqa      xmm0, XMMWORD [wk(2)]   ; xmm0=(24 25 26 27 34 35 36 37)
+    movdqa      xmm3, XMMWORD [wk(3)]   ; xmm3=(44 45 46 47 54 55 56 57)
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+    movdqa      xmm5, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm0              ; xmm4=(20 21 22 23 24 25 26 27)=data2
+    punpckhqdq  xmm5, xmm0              ; xmm5=(30 31 32 33 34 35 36 37)=data3
+    movdqa      xmm6, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm3              ; xmm1=(40 41 42 43 44 45 46 47)=data4
+    punpckhqdq  xmm6, xmm3              ; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+    movdqa      xmm0, xmm5
+    movdqa      xmm3, xmm4
+    paddw       xmm5, xmm1              ; xmm5=data3+data4=tmp3
+    paddw       xmm4, xmm6              ; xmm4=data2+data5=tmp2
+    psubw       xmm0, xmm1              ; xmm0=data3-data4=tmp4
+    psubw       xmm3, xmm6              ; xmm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm1, xmm7
+    movdqa      xmm6, xmm2
+    paddw       xmm7, xmm5              ; xmm7=tmp10
+    paddw       xmm2, xmm4              ; xmm2=tmp11
+    psubw       xmm1, xmm5              ; xmm1=tmp13
+    psubw       xmm6, xmm4              ; xmm6=tmp12
+
+    movdqa      xmm5, xmm7
+    paddw       xmm7, xmm2              ; xmm7=tmp10+tmp11
+    psubw       xmm5, xmm2              ; xmm5=tmp10-tmp11
+
+    paddw       xmm7, [rel PW_DESCALE_P2X]
+    paddw       xmm5, [rel PW_DESCALE_P2X]
+    psraw       xmm7, PASS1_BITS        ; xmm7=data0
+    psraw       xmm5, PASS1_BITS        ; xmm5=data4
+
+    movdqa      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7
+    movdqa      XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    movdqa      xmm4, xmm1              ; xmm1=tmp13
+    movdqa      xmm2, xmm1
+    punpcklwd   xmm4, xmm6              ; xmm6=tmp12
+    punpckhwd   xmm2, xmm6
+    movdqa      xmm1, xmm4
+    movdqa      xmm6, xmm2
+    pmaddwd     xmm4, [rel PW_F130_F054]   ; xmm4=data2L
+    pmaddwd     xmm2, [rel PW_F130_F054]   ; xmm2=data2H
+    pmaddwd     xmm1, [rel PW_F054_MF130]  ; xmm1=data6L
+    pmaddwd     xmm6, [rel PW_F054_MF130]  ; xmm6=data6H
+
+    paddd       xmm4, [rel PD_DESCALE_P2]
+    paddd       xmm2, [rel PD_DESCALE_P2]
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm2, DESCALE_P2
+    paddd       xmm1, [rel PD_DESCALE_P2]
+    paddd       xmm6, [rel PD_DESCALE_P2]
+    psrad       xmm1, DESCALE_P2
+    psrad       xmm6, DESCALE_P2
+
+    packssdw    xmm4, xmm2              ; xmm4=data2
+    packssdw    xmm1, xmm6              ; xmm1=data6
+
+    movdqa      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4
+    movdqa      XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1
+
+    ; -- Odd part
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp6
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
+
+    movdqa      xmm2, xmm0              ; xmm0=tmp4
+    movdqa      xmm6, xmm3              ; xmm3=tmp5
+    paddw       xmm2, xmm7              ; xmm2=z3
+    paddw       xmm6, xmm5              ; xmm6=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm1, xmm2
+    punpcklwd   xmm4, xmm6
+    punpckhwd   xmm1, xmm6
+    movdqa      xmm2, xmm4
+    movdqa      xmm6, xmm1
+    pmaddwd     xmm4, [rel PW_MF078_F117]  ; xmm4=z3L
+    pmaddwd     xmm1, [rel PW_MF078_F117]  ; xmm1=z3H
+    pmaddwd     xmm2, [rel PW_F117_F078]   ; xmm2=z4L
+    pmaddwd     xmm6, [rel PW_F117_F078]   ; xmm6=z4H
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=z3L
+    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=z3H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    movdqa      xmm4, xmm0
+    movdqa      xmm1, xmm0
+    punpcklwd   xmm4, xmm5
+    punpckhwd   xmm1, xmm5
+    movdqa      xmm0, xmm4
+    movdqa      xmm5, xmm1
+    pmaddwd     xmm4, [rel PW_MF060_MF089]  ; xmm4=tmp4L
+    pmaddwd     xmm1, [rel PW_MF060_MF089]  ; xmm1=tmp4H
+    pmaddwd     xmm0, [rel PW_MF089_F060]   ; xmm0=tmp7L
+    pmaddwd     xmm5, [rel PW_MF089_F060]   ; xmm5=tmp7H
+
+    paddd       xmm4,  XMMWORD [wk(0)]  ; xmm4=data7L
+    paddd       xmm1,  XMMWORD [wk(1)]  ; xmm1=data7H
+    paddd       xmm0, xmm2              ; xmm0=data1L
+    paddd       xmm5, xmm6              ; xmm5=data1H
+
+    paddd       xmm4, [rel PD_DESCALE_P2]
+    paddd       xmm1, [rel PD_DESCALE_P2]
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm1, DESCALE_P2
+    paddd       xmm0, [rel PD_DESCALE_P2]
+    paddd       xmm5, [rel PD_DESCALE_P2]
+    psrad       xmm0, DESCALE_P2
+    psrad       xmm5, DESCALE_P2
+
+    packssdw    xmm4, xmm1              ; xmm4=data7
+    packssdw    xmm0, xmm5              ; xmm0=data1
+
+    movdqa      XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4
+    movdqa      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0
+
+    movdqa      xmm1, xmm3
+    movdqa      xmm5, xmm3
+    punpcklwd   xmm1, xmm7
+    punpckhwd   xmm5, xmm7
+    movdqa      xmm3, xmm1
+    movdqa      xmm7, xmm5
+    pmaddwd     xmm1, [rel PW_MF050_MF256]  ; xmm1=tmp5L
+    pmaddwd     xmm5, [rel PW_MF050_MF256]  ; xmm5=tmp5H
+    pmaddwd     xmm3, [rel PW_MF256_F050]   ; xmm3=tmp6L
+    pmaddwd     xmm7, [rel PW_MF256_F050]   ; xmm7=tmp6H
+
+    paddd       xmm1, xmm2              ; xmm1=data5L
+    paddd       xmm5, xmm6              ; xmm5=data5H
+    paddd       xmm3, XMMWORD [wk(0)]   ; xmm3=data3L
+    paddd       xmm7, XMMWORD [wk(1)]   ; xmm7=data3H
+
+    paddd       xmm1, [rel PD_DESCALE_P2]
+    paddd       xmm5, [rel PD_DESCALE_P2]
+    psrad       xmm1, DESCALE_P2
+    psrad       xmm5, DESCALE_P2
+    paddd       xmm3, [rel PD_DESCALE_P2]
+    paddd       xmm7, [rel PD_DESCALE_P2]
+    psrad       xmm3, DESCALE_P2
+    psrad       xmm7, DESCALE_P2
+
+    packssdw    xmm1, xmm5              ; xmm1=data5
+    packssdw    xmm3, xmm7              ; xmm3=data3
+
+    movdqa      XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
+    movdqa      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
+
+    uncollect_args 1
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctflt-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctflt-sse2.asm
new file mode 100644
index 0000000000..60bf961896
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctflt-sse2.asm
@@ -0,0 +1,482 @@
+;
+; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+    shufps      %1, %2, 0x44
+%endmacro
+
+%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+    shufps      %1, %2, 0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_float_sse2)
+
+EXTN(jconst_idct_float_sse2):
+
+PD_1_414        times 4  dd  1.414213562373095048801689
+PD_1_847        times 4  dd  1.847759065022573512256366
+PD_1_082        times 4  dd  1.082392200292393968799446
+PD_M2_613       times 4  dd -2.613125929752753055713286
+PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3)
+PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp  rbp + 0
+%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        2
+%define workspace     wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+                                        ; FAST_FLOAT workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_float_sse2)
+
+EXTN(jsimd_idct_float_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [workspace]
+    collect_args 4
+    push        rbx
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+    mov         rdx, r10                ; quantptr
+    mov         rsi, r11                ; inptr
+    lea         rdi, [workspace]        ; FAST_FLOAT *wsptr
+    mov         rcx, DCTSIZE/4          ; ctr
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, xmm2
+    por         xmm3, xmm4
+    por         xmm5, xmm6
+    por         xmm1, xmm3
+    por         xmm5, xmm7
+    por         xmm1, xmm5
+    packsswb    xmm1, xmm1
+    movd        eax, xmm1
+    test        rax, rax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+
+    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
+    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
+    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
+
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm1, xmm0
+    movaps      xmm2, xmm0
+    movaps      xmm3, xmm0
+
+    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
+    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
+    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
+    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
+    jmp         near .nextcolumn
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+
+    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
+    punpcklwd   xmm1, xmm1                  ; xmm1=(20 20 21 21 22 22 23 23)
+    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
+    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in2=(20 21 22 23)
+    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
+    cvtdq2ps    xmm1, xmm1                  ; xmm1=in2=(20 21 22 23)
+
+    punpcklwd   xmm2, xmm2                  ; xmm2=(40 40 41 41 42 42 43 43)
+    punpcklwd   xmm3, xmm3                  ; xmm3=(60 60 61 61 62 62 63 63)
+    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in4=(40 41 42 43)
+    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in6=(60 61 62 63)
+    cvtdq2ps    xmm2, xmm2                  ; xmm2=in4=(40 41 42 43)
+    cvtdq2ps    xmm3, xmm3                  ; xmm3=in6=(60 61 62 63)
+
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm4, xmm0
+    movaps      xmm5, xmm1
+    subps       xmm0, xmm2              ; xmm0=tmp11
+    subps       xmm1, xmm3
+    addps       xmm4, xmm2              ; xmm4=tmp10
+    addps       xmm5, xmm3              ; xmm5=tmp13
+
+    mulps       xmm1, [rel PD_1_414]
+    subps       xmm1, xmm5              ; xmm1=tmp12
+
+    movaps      xmm6, xmm4
+    movaps      xmm7, xmm0
+    subps       xmm4, xmm5              ; xmm4=tmp3
+    subps       xmm0, xmm1              ; xmm0=tmp2
+    addps       xmm6, xmm5              ; xmm6=tmp0
+    addps       xmm7, xmm1              ; xmm7=tmp1
+
+    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
+    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
+
+    ; -- Odd part
+
+    movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+
+    punpcklwd   xmm2, xmm2                  ; xmm2=(10 10 11 11 12 12 13 13)
+    punpcklwd   xmm3, xmm3                  ; xmm3=(30 30 31 31 32 32 33 33)
+    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in1=(10 11 12 13)
+    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in3=(30 31 32 33)
+    cvtdq2ps    xmm2, xmm2                  ; xmm2=in1=(10 11 12 13)
+    cvtdq2ps    xmm3, xmm3                  ; xmm3=in3=(30 31 32 33)
+
+    punpcklwd   xmm5, xmm5                  ; xmm5=(50 50 51 51 52 52 53 53)
+    punpcklwd   xmm1, xmm1                  ; xmm1=(70 70 71 71 72 72 73 73)
+    psrad       xmm5, (DWORD_BIT-WORD_BIT)  ; xmm5=in5=(50 51 52 53)
+    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in7=(70 71 72 73)
+    cvtdq2ps    xmm5, xmm5                  ; xmm5=in5=(50 51 52 53)
+    cvtdq2ps    xmm1, xmm1                  ; xmm1=in7=(70 71 72 73)
+
+    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm4, xmm2
+    movaps      xmm0, xmm5
+    addps       xmm2, xmm1              ; xmm2=z11
+    addps       xmm5, xmm3              ; xmm5=z13
+    subps       xmm4, xmm1              ; xmm4=z12
+    subps       xmm0, xmm3              ; xmm0=z10
+
+    movaps      xmm1, xmm2
+    subps       xmm2, xmm5
+    addps       xmm1, xmm5              ; xmm1=tmp7
+
+    mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11
+
+    movaps      xmm3, xmm0
+    addps       xmm0, xmm4
+    mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
+    mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
+    mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
+    addps       xmm3, xmm0              ; xmm3=tmp12
+    subps       xmm4, xmm0              ; xmm4=tmp10
+
+    ; -- Final output stage
+
+    subps       xmm3, xmm1              ; xmm3=tmp6
+    movaps      xmm5, xmm6
+    movaps      xmm0, xmm7
+    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
+    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
+    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
+    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
+    subps       xmm2, xmm3              ; xmm2=tmp5
+
+    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
+    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
+    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
+    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)
+
+    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
+
+    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
+    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
+
+    addps       xmm4, xmm2              ; xmm4=tmp4
+    movaps      xmm0, xmm7
+    movaps      xmm3, xmm5
+    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
+    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
+    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
+    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)
+
+    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
+    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
+    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
+    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
+    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
+    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)
+
+    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
+    unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
+    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
+    unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)
+
+    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
+    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
+    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
+
+    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
+    unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
+    unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
+    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
+    unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)
+
+    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
+    movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
+    movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+    add         rsi, byte 4*SIZEOF_JCOEF               ; coef_block
+    add         rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
+    add         rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
+    dec         rcx                                    ; ctr
+    jnz         near .columnloop
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         rax, [original_rbp]
+    lea         rsi, [workspace]        ; FAST_FLOAT *wsptr
+    mov         rdi, r12                ; (JSAMPROW *)
+    mov         eax, r13d
+    mov         rcx, DCTSIZE/4          ; ctr
+.rowloop:
+
+    ; -- Even part
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
+
+    movaps      xmm4, xmm0
+    movaps      xmm5, xmm1
+    subps       xmm0, xmm2              ; xmm0=tmp11
+    subps       xmm1, xmm3
+    addps       xmm4, xmm2              ; xmm4=tmp10
+    addps       xmm5, xmm3              ; xmm5=tmp13
+
+    mulps       xmm1, [rel PD_1_414]
+    subps       xmm1, xmm5              ; xmm1=tmp12
+
+    movaps      xmm6, xmm4
+    movaps      xmm7, xmm0
+    subps       xmm4, xmm5              ; xmm4=tmp3
+    subps       xmm0, xmm1              ; xmm0=tmp2
+    addps       xmm6, xmm5              ; xmm6=tmp0
+    addps       xmm7, xmm1              ; xmm7=tmp1
+
+    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
+    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
+
+    ; -- Odd part
+
+    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
+
+    movaps      xmm4, xmm2
+    movaps      xmm0, xmm5
+    addps       xmm2, xmm1              ; xmm2=z11
+    addps       xmm5, xmm3              ; xmm5=z13
+    subps       xmm4, xmm1              ; xmm4=z12
+    subps       xmm0, xmm3              ; xmm0=z10
+
+    movaps      xmm1, xmm2
+    subps       xmm2, xmm5
+    addps       xmm1, xmm5              ; xmm1=tmp7
+
+    mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11
+
+    movaps      xmm3, xmm0
+    addps       xmm0, xmm4
+    mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
+    mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
+    mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
+    addps       xmm3, xmm0              ; xmm3=tmp12
+    subps       xmm4, xmm0              ; xmm4=tmp10
+
+    ; -- Final output stage
+
+    subps       xmm3, xmm1              ; xmm3=tmp6
+    movaps      xmm5, xmm6
+    movaps      xmm0, xmm7
+    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
+    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
+    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
+    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
+    subps       xmm2, xmm3              ; xmm2=tmp5
+
+    movaps      xmm1, [rel PD_RNDINT_MAGIC]  ; xmm1=[rel PD_RNDINT_MAGIC]
+    pcmpeqd     xmm3, xmm3
+    psrld       xmm3, WORD_BIT          ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+    addps       xmm6, xmm1              ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+    addps       xmm7, xmm1              ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+    addps       xmm0, xmm1              ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+    addps       xmm5, xmm1              ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+    pand        xmm6, xmm3              ; xmm6=(00 -- 10 -- 20 -- 30 --)
+    pslld       xmm7, WORD_BIT          ; xmm7=(-- 01 -- 11 -- 21 -- 31)
+    pand        xmm0, xmm3              ; xmm0=(06 -- 16 -- 26 -- 36 --)
+    pslld       xmm5, WORD_BIT          ; xmm5=(-- 07 -- 17 -- 27 -- 37)
+    por         xmm6, xmm7              ; xmm6=(00 01 10 11 20 21 30 31)
+    por         xmm0, xmm5              ; xmm0=(06 07 16 17 26 27 36 37)
+
+    movaps      xmm1,  XMMWORD [wk(0)]  ; xmm1=tmp2
+    movaps      xmm3,  XMMWORD [wk(1)]  ; xmm3=tmp3
+
+    addps       xmm4, xmm2              ; xmm4=tmp4
+    movaps      xmm7, xmm1
+    movaps      xmm5, xmm3
+    addps       xmm1, xmm2              ; xmm1=data2=(02 12 22 32)
+    addps       xmm3, xmm4              ; xmm3=data4=(04 14 24 34)
+    subps       xmm7, xmm2              ; xmm7=data5=(05 15 25 35)
+    subps       xmm5, xmm4              ; xmm5=data3=(03 13 23 33)
+
+    movaps      xmm2, [rel PD_RNDINT_MAGIC]  ; xmm2=[rel PD_RNDINT_MAGIC]
+    pcmpeqd     xmm4, xmm4
+    psrld       xmm4, WORD_BIT          ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+    addps       xmm3, xmm2              ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+    addps       xmm7, xmm2              ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+    addps       xmm1, xmm2              ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+    addps       xmm5, xmm2              ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+    pand        xmm3, xmm4              ; xmm3=(04 -- 14 -- 24 -- 34 --)
+    pslld       xmm7, WORD_BIT          ; xmm7=(-- 05 -- 15 -- 25 -- 35)
+    pand        xmm1, xmm4              ; xmm1=(02 -- 12 -- 22 -- 32 --)
+    pslld       xmm5, WORD_BIT          ; xmm5=(-- 03 -- 13 -- 23 -- 33)
+    por         xmm3, xmm7              ; xmm3=(04 05 14 15 24 25 34 35)
+    por         xmm1, xmm5              ; xmm1=(02 03 12 13 22 23 32 33)
+
+    movdqa      xmm2, [rel PB_CENTERJSAMP]  ; xmm2=[rel PB_CENTERJSAMP]
+
+    packsswb    xmm6, xmm3        ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+    packsswb    xmm1, xmm0        ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+    paddb       xmm6, xmm2
+    paddb       xmm1, xmm2
+
+    movdqa      xmm4, xmm6        ; transpose coefficients(phase 2)
+    punpcklwd   xmm6, xmm1        ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+    punpckhwd   xmm4, xmm1        ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+    movdqa      xmm7, xmm6        ; transpose coefficients(phase 3)
+    punpckldq   xmm6, xmm4        ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    punpckhdq   xmm7, xmm4        ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+    pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rbxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
+    mov         rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    mov         rbxp, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
+    movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
+
+    add         rsi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
+    add         rdi, byte 4*SIZEOF_JSAMPROW
+    dec         rcx                            ; ctr
+    jnz         near .rowloop
+
+    pop         rbx
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctfst-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctfst-sse2.asm
new file mode 100644
index 0000000000..cb97fdfbb2
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctfst-sse2.asm
@@ -0,0 +1,491 @@
+;
+; jidctfst.asm - fast integer IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  8  ; 14 is also OK.
+%define PASS1_BITS  2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ 277              ; FIX(1.082392200)
+F_1_414 equ 362              ; FIX(1.414213562)
+F_1_847 equ 473              ; FIX(1.847759065)
+F_2_613 equ 669              ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256)  ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS)  ; FIX(1.082392200)
+F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS)  ; FIX(1.414213562)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS)  ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS))         ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS  2
+%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_ifast_sse2)
+
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414       times 8  dw  F_1_414 << CONST_SHIFT
+PW_F1847       times 8  dw  F_1_847 << CONST_SHIFT
+PW_MF1613      times 8  dw -F_1_613 << CONST_SHIFT
+PW_F1082       times 8  dw  F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP times 16 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_sse2(void *dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = jpeg_component_info *compptr
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp  rbp + 0
+%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_ifast_sse2)
+
+EXTN(jsimd_idct_ifast_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 4
+
+    ; ---- Pass 1: process columns from input.
+
+    mov         rdx, r10                ; quantptr
+    mov         rsi, r11                ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, xmm0
+    packsswb    xmm1, xmm1
+    packsswb    xmm1, xmm1
+    movd        eax, xmm1
+    test        rax, rax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movdqa      xmm7, xmm0              ; xmm0=in0=(00 01 02 03 04 05 06 07)
+    punpcklwd   xmm0, xmm0              ; xmm0=(00 00 01 01 02 02 03 03)
+    punpckhwd   xmm7, xmm7              ; xmm7=(04 04 05 05 06 06 07 07)
+
+    pshufd      xmm6, xmm0, 0x00        ; xmm6=col0=(00 00 00 00 00 00 00 00)
+    pshufd      xmm2, xmm0, 0x55        ; xmm2=col1=(01 01 01 01 01 01 01 01)
+    pshufd      xmm5, xmm0, 0xAA        ; xmm5=col2=(02 02 02 02 02 02 02 02)
+    pshufd      xmm0, xmm0, 0xFF        ; xmm0=col3=(03 03 03 03 03 03 03 03)
+    pshufd      xmm1, xmm7, 0x00        ; xmm1=col4=(04 04 04 04 04 04 04 04)
+    pshufd      xmm4, xmm7, 0x55        ; xmm4=col5=(05 05 05 05 05 05 05 05)
+    pshufd      xmm3, xmm7, 0xAA        ; xmm3=col6=(06 06 06 06 06 06 06 06)
+    pshufd      xmm7, xmm7, 0xFF        ; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=col1
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=col3
+    jmp         near .column_end
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movdqa      xmm4, xmm0
+    movdqa      xmm5, xmm1
+    psubw       xmm0, xmm2              ; xmm0=tmp11
+    psubw       xmm1, xmm3
+    paddw       xmm4, xmm2              ; xmm4=tmp10
+    paddw       xmm5, xmm3              ; xmm5=tmp13
+
+    psllw       xmm1, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm1, [rel PW_F1414]
+    psubw       xmm1, xmm5              ; xmm1=tmp12
+
+    movdqa      xmm6, xmm4
+    movdqa      xmm7, xmm0
+    psubw       xmm4, xmm5              ; xmm4=tmp3
+    psubw       xmm0, xmm1              ; xmm0=tmp2
+    paddw       xmm6, xmm5              ; xmm6=tmp0
+    paddw       xmm7, xmm1              ; xmm7=tmp1
+
+    movdqa      XMMWORD [wk(1)], xmm4   ; wk(1)=tmp3
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=tmp2
+
+    ; -- Odd part
+
+    movdqa      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+    movdqa      xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm0, xmm5
+    psubw       xmm2, xmm1              ; xmm2=z12
+    psubw       xmm5, xmm3              ; xmm5=z10
+    paddw       xmm4, xmm1              ; xmm4=z11
+    paddw       xmm0, xmm3              ; xmm0=z13
+
+    movdqa      xmm1, xmm5              ; xmm1=z10(unscaled)
+    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+
+    movdqa      xmm3, xmm4
+    psubw       xmm4, xmm0
+    paddw       xmm3, xmm0              ; xmm3=tmp7
+
+    psllw       xmm4, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm4, [rel PW_F1414]    ; xmm4=tmp11
+
+    ; To avoid overflow...
+    ;
+    ; (Original)
+    ; tmp12 = -2.613125930 * z10 + z5;
+    ;
+    ; (This implementation)
+    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+    ;       = -1.613125930 * z10 - z10 + z5;
+
+    movdqa      xmm0, xmm5
+    paddw       xmm5, xmm2
+    pmulhw      xmm5, [rel PW_F1847]    ; xmm5=z5
+    pmulhw      xmm0, [rel PW_MF1613]
+    pmulhw      xmm2, [rel PW_F1082]
+    psubw       xmm0, xmm1
+    psubw       xmm2, xmm5              ; xmm2=tmp10
+    paddw       xmm0, xmm5              ; xmm0=tmp12
+
+    ; -- Final output stage
+
+    psubw       xmm0, xmm3              ; xmm0=tmp6
+    movdqa      xmm1, xmm6
+    movdqa      xmm5, xmm7
+    paddw       xmm6, xmm3              ; xmm6=data0=(00 01 02 03 04 05 06 07)
+    paddw       xmm7, xmm0              ; xmm7=data1=(10 11 12 13 14 15 16 17)
+    psubw       xmm1, xmm3              ; xmm1=data7=(70 71 72 73 74 75 76 77)
+    psubw       xmm5, xmm0              ; xmm5=data6=(60 61 62 63 64 65 66 67)
+    psubw       xmm4, xmm0              ; xmm4=tmp5
+
+    movdqa      xmm3, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm7              ; xmm6=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm3, xmm7              ; xmm3=(04 14 05 15 06 16 07 17)
+    movdqa      xmm0, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm1              ; xmm5=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm0, xmm1              ; xmm0=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3
+
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(60 70 61 71 62 72 63 73)
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(64 74 65 75 66 76 67 77)
+
+    paddw       xmm2, xmm4              ; xmm2=tmp4
+    movdqa      xmm5, xmm7
+    movdqa      xmm0, xmm1
+    paddw       xmm7, xmm4              ; xmm7=data2=(20 21 22 23 24 25 26 27)
+    paddw       xmm1, xmm2              ; xmm1=data4=(40 41 42 43 44 45 46 47)
+    psubw       xmm5, xmm4              ; xmm5=data5=(50 51 52 53 54 55 56 57)
+    psubw       xmm0, xmm2              ; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm4, xmm7              ; transpose coefficients(phase 1)
+    punpcklwd   xmm7, xmm0              ; xmm7=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm4, xmm0              ; xmm4=(24 34 25 35 26 36 27 37)
+    movdqa      xmm2, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm5              ; xmm1=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm2, xmm5              ; xmm2=(44 54 45 55 46 56 47 57)
+
+    movdqa      xmm0, xmm3              ; transpose coefficients(phase 2)
+    punpckldq   xmm3, xmm4              ; xmm3=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm0, xmm4              ; xmm0=(06 16 26 36 07 17 27 37)
+    movdqa      xmm5, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm7              ; xmm6=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm5, xmm7              ; xmm5=(02 12 22 32 03 13 23 33)
+
+    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(60 70 61 71 62 72 63 73)
+    movdqa      xmm7, XMMWORD [wk(1)]   ; xmm7=(64 74 65 75 66 76 67 77)
+
+    movdqa      XMMWORD [wk(0)], xmm3   ; wk(0)=(04 14 24 34 05 15 25 35)
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm3, xmm1              ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm4              ; xmm1=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm3, xmm4              ; xmm3=(42 52 62 72 43 53 63 73)
+    movdqa      xmm0, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm7              ; xmm2=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm0, xmm7              ; xmm0=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm4, xmm6              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm6, xmm1              ; xmm6=col0=(00 10 20 30 40 50 60 70)
+    punpckhqdq  xmm4, xmm1              ; xmm4=col1=(01 11 21 31 41 51 61 71)
+    movdqa      xmm7, xmm5              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm5, xmm3              ; xmm5=col2=(02 12 22 32 42 52 62 72)
+    punpckhqdq  xmm7, xmm3              ; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(04 14 24 34 05 15 25 35)
+    movdqa      xmm3, XMMWORD [wk(1)]   ; xmm3=(06 16 26 36 07 17 27 37)
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=col1
+    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=col3
+
+    movdqa      xmm4, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm2              ; xmm1=col4=(04 14 24 34 44 54 64 74)
+    punpckhqdq  xmm4, xmm2              ; xmm4=col5=(05 15 25 35 45 55 65 75)
+    movdqa      xmm7, xmm3              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm3, xmm0              ; xmm3=col6=(06 16 26 36 46 56 66 76)
+    punpckhqdq  xmm7, xmm0              ; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         rax, [original_rbp]
+    mov         rdi, r12                ; (JSAMPROW *)
+    mov         eax, r13d
+
+    ; -- Even part
+
+    ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+    movdqa      xmm2, xmm6
+    movdqa      xmm0, xmm5
+    psubw       xmm6, xmm1              ; xmm6=tmp11
+    psubw       xmm5, xmm3
+    paddw       xmm2, xmm1              ; xmm2=tmp10
+    paddw       xmm0, xmm3              ; xmm0=tmp13
+
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm5, [rel PW_F1414]
+    psubw       xmm5, xmm0              ; xmm5=tmp12
+
+    movdqa      xmm1, xmm2
+    movdqa      xmm3, xmm6
+    psubw       xmm2, xmm0              ; xmm2=tmp3
+    psubw       xmm6, xmm5              ; xmm6=tmp2
+    paddw       xmm1, xmm0              ; xmm1=tmp0
+    paddw       xmm3, xmm5              ; xmm3=tmp1
+
+    movdqa      xmm0, XMMWORD [wk(0)]   ; xmm0=col1
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=col3
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=tmp3
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp2
+
+    ; -- Odd part
+
+    ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+    movdqa      xmm2, xmm0
+    movdqa      xmm6, xmm4
+    psubw       xmm0, xmm7              ; xmm0=z12
+    psubw       xmm4, xmm5              ; xmm4=z10
+    paddw       xmm2, xmm7              ; xmm2=z11
+    paddw       xmm6, xmm5              ; xmm6=z13
+
+    movdqa      xmm7, xmm4              ; xmm7=z10(unscaled)
+    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm4, PRE_MULTIPLY_SCALE_BITS
+
+    movdqa      xmm5, xmm2
+    psubw       xmm2, xmm6
+    paddw       xmm5, xmm6              ; xmm5=tmp7
+
+    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm2, [rel PW_F1414]    ; xmm2=tmp11
+
+    ; To avoid overflow...
+    ;
+    ; (Original)
+    ; tmp12 = -2.613125930 * z10 + z5;
+    ;
+    ; (This implementation)
+    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+    ;       = -1.613125930 * z10 - z10 + z5;
+
+    movdqa      xmm6, xmm4
+    paddw       xmm4, xmm0
+    pmulhw      xmm4, [rel PW_F1847]    ; xmm4=z5
+    pmulhw      xmm6, [rel PW_MF1613]
+    pmulhw      xmm0, [rel PW_F1082]
+    psubw       xmm6, xmm7
+    psubw       xmm0, xmm4              ; xmm0=tmp10
+    paddw       xmm6, xmm4              ; xmm6=tmp12
+
+    ; -- Final output stage
+
+    psubw       xmm6, xmm5              ; xmm6=tmp6
+    movdqa      xmm7, xmm1
+    movdqa      xmm4, xmm3
+    paddw       xmm1, xmm5              ; xmm1=data0=(00 10 20 30 40 50 60 70)
+    paddw       xmm3, xmm6              ; xmm3=data1=(01 11 21 31 41 51 61 71)
+    psraw       xmm1, (PASS1_BITS+3)    ; descale
+    psraw       xmm3, (PASS1_BITS+3)    ; descale
+    psubw       xmm7, xmm5              ; xmm7=data7=(07 17 27 37 47 57 67 77)
+    psubw       xmm4, xmm6              ; xmm4=data6=(06 16 26 36 46 56 66 76)
+    psraw       xmm7, (PASS1_BITS+3)    ; descale
+    psraw       xmm4, (PASS1_BITS+3)    ; descale
+    psubw       xmm2, xmm6              ; xmm2=tmp5
+
+    packsswb    xmm1, xmm4        ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    packsswb    xmm3, xmm7        ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp2
+    movdqa      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp3
+
+    paddw       xmm0, xmm2              ; xmm0=tmp4
+    movdqa      xmm4, xmm5
+    movdqa      xmm7, xmm6
+    paddw       xmm5, xmm2              ; xmm5=data2=(02 12 22 32 42 52 62 72)
+    paddw       xmm6, xmm0              ; xmm6=data4=(04 14 24 34 44 54 64 74)
+    psraw       xmm5, (PASS1_BITS+3)    ; descale
+    psraw       xmm6, (PASS1_BITS+3)    ; descale
+    psubw       xmm4, xmm2              ; xmm4=data5=(05 15 25 35 45 55 65 75)
+    psubw       xmm7, xmm0              ; xmm7=data3=(03 13 23 33 43 53 63 73)
+    psraw       xmm4, (PASS1_BITS+3)    ; descale
+    psraw       xmm7, (PASS1_BITS+3)    ; descale
+
+    movdqa      xmm2, [rel PB_CENTERJSAMP]  ; xmm2=[rel PB_CENTERJSAMP]
+
+    packsswb    xmm5, xmm6        ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+    packsswb    xmm7, xmm4        ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+    paddb       xmm1, xmm2
+    paddb       xmm3, xmm2
+    paddb       xmm5, xmm2
+    paddb       xmm7, xmm2
+
+    movdqa      xmm0, xmm1        ; transpose coefficients(phase 1)
+    punpcklbw   xmm1, xmm3        ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+    punpckhbw   xmm0, xmm3        ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+    movdqa      xmm6, xmm5        ; transpose coefficients(phase 1)
+    punpcklbw   xmm5, xmm7        ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+    punpckhbw   xmm6, xmm7        ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+    movdqa      xmm4, xmm1        ; transpose coefficients(phase 2)
+    punpcklwd   xmm1, xmm5        ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+    punpckhwd   xmm4, xmm5        ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+    movdqa      xmm2, xmm6        ; transpose coefficients(phase 2)
+    punpcklwd   xmm6, xmm0        ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+    punpckhwd   xmm2, xmm0        ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+    movdqa      xmm3, xmm1        ; transpose coefficients(phase 3)
+    punpckldq   xmm1, xmm6        ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    punpckhdq   xmm3, xmm6        ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+    movdqa      xmm7, xmm4        ; transpose coefficients(phase 3)
+    punpckldq   xmm4, xmm2        ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+    punpckhdq   xmm7, xmm2        ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+    pshufd      xmm5, xmm1, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    pshufd      xmm0, xmm3, 0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+    pshufd      xmm6, xmm4, 0x4E  ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+    pshufd      xmm2, xmm7, 0x4E  ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+    mov         rdxp, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
+
+    mov         rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
+    mov         rdxp, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
+
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctint-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctint-avx2.asm
new file mode 100644
index 0000000000..ca7e317f6e
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctint-avx2.asm
@@ -0,0 +1,418 @@
+;
+; jidctint.asm - accurate integer IDCT (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit inverse matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+
+%macro dotranspose 8
+    ; %5=(00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71)
+    ; %6=(03 13 23 33 43 53 63 73  02 12 22 32 42 52 62 72)
+    ; %7=(04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75)
+    ; %8=(07 17 27 37 47 57 67 77  06 16 26 36 46 56 66 76)
+
+    vpermq      %5, %1, 0xD8
+    vpermq      %6, %2, 0x72
+    vpermq      %7, %3, 0xD8
+    vpermq      %8, %4, 0x72
+    ; transpose coefficients(phase 1)
+    ; %5=(00 10 20 30 01 11 21 31  40 50 60 70 41 51 61 71)
+    ; %6=(02 12 22 32 03 13 23 33  42 52 62 72 43 53 63 73)
+    ; %7=(04 14 24 34 05 15 25 35  44 54 64 74 45 55 65 75)
+    ; %8=(06 16 26 36 07 17 27 37  46 56 66 76 47 57 67 77)
+
+    vpunpcklwd  %1, %5, %6
+    vpunpckhwd  %2, %5, %6
+    vpunpcklwd  %3, %7, %8
+    vpunpckhwd  %4, %7, %8
+    ; transpose coefficients(phase 2)
+    ; %1=(00 02 10 12 20 22 30 32  40 42 50 52 60 62 70 72)
+    ; %2=(01 03 11 13 21 23 31 33  41 43 51 53 61 63 71 73)
+    ; %3=(04 06 14 16 24 26 34 36  44 46 54 56 64 66 74 76)
+    ; %4=(05 07 15 17 25 27 35 37  45 47 55 57 65 67 75 77)
+
+    vpunpcklwd  %5, %1, %2
+    vpunpcklwd  %6, %3, %4
+    vpunpckhwd  %7, %1, %2
+    vpunpckhwd  %8, %3, %4
+    ; transpose coefficients(phase 3)
+    ; %5=(00 01 02 03 10 11 12 13  40 41 42 43 50 51 52 53)
+    ; %6=(04 05 06 07 14 15 16 17  44 45 46 47 54 55 56 57)
+    ; %7=(20 21 22 23 30 31 32 33  60 61 62 63 70 71 72 73)
+    ; %8=(24 25 26 27 34 35 36 37  64 65 66 67 74 75 76 77)
+
+    vpunpcklqdq %1, %5, %6
+    vpunpckhqdq %2, %5, %6
+    vpunpcklqdq %3, %7, %8
+    vpunpckhqdq %4, %7, %8
+    ; transpose coefficients(phase 4)
+    ; %1=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
+    ; %2=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
+    ; %3=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
+    ; %4=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
+%endmacro
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit accurate integer inverse DCT using AVX2 instructions
+; %1-%4:  Input/output registers
+; %5-%12: Temp registers
+; %9:     Pass (1 or 2)
+
+%macro dodct 13
+    ; -- Even part
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    vperm2i128  %6, %3, %3, 0x01        ; %6=in6_2
+    vpunpcklwd  %5, %3, %6              ; %5=in26_62L
+    vpunpckhwd  %6, %3, %6              ; %6=in26_62H
+    vpmaddwd    %5, %5, [rel PW_F130_F054_MF130_F054]  ; %5=tmp3_2L
+    vpmaddwd    %6, %6, [rel PW_F130_F054_MF130_F054]  ; %6=tmp3_2H
+
+    vperm2i128  %7, %1, %1, 0x01        ; %7=in4_0
+    vpsignw     %1, %1, [rel PW_1_NEG1]
+    vpaddw      %7, %7, %1              ; %7=(in0+in4)_(in0-in4)
+
+    vpxor       %1, %1, %1
+    vpunpcklwd  %8, %1, %7              ; %8=tmp0_1L
+    vpunpckhwd  %1, %1, %7              ; %1=tmp0_1H
+    vpsrad      %8, %8, (16-CONST_BITS)  ; vpsrad %8,16 & vpslld %8,CONST_BITS
+    vpsrad      %1, %1, (16-CONST_BITS)  ; vpsrad %1,16 & vpslld %1,CONST_BITS
+
+    vpsubd      %11, %8, %5             ; %11=tmp0_1L-tmp3_2L=tmp13_12L
+    vpaddd      %9, %8, %5              ; %9=tmp0_1L+tmp3_2L=tmp10_11L
+    vpsubd      %12, %1, %6             ; %12=tmp0_1H-tmp3_2H=tmp13_12H
+    vpaddd      %10, %1, %6             ; %10=tmp0_1H+tmp3_2H=tmp10_11H
+
+    ; -- Odd part
+
+    vpaddw      %1, %4, %2              ; %1=in7_5+in3_1=z3_4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    vperm2i128  %8, %1, %1, 0x01        ; %8=z4_3
+    vpunpcklwd  %7, %1, %8              ; %7=z34_43L
+    vpunpckhwd  %8, %1, %8              ; %8=z34_43H
+    vpmaddwd    %7, %7, [rel PW_MF078_F117_F078_F117]  ; %7=z3_4L
+    vpmaddwd    %8, %8, [rel PW_MF078_F117_F078_F117]  ; %8=z3_4H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    vperm2i128  %2, %2, %2, 0x01        ; %2=in1_3
+    vpunpcklwd  %3, %4, %2              ; %3=in71_53L
+    vpunpckhwd  %4, %4, %2              ; %4=in71_53H
+
+    vpmaddwd    %5, %3, [rel PW_MF060_MF089_MF050_MF256]  ; %5=tmp0_1L
+    vpmaddwd    %6, %4, [rel PW_MF060_MF089_MF050_MF256]  ; %6=tmp0_1H
+    vpaddd      %5, %5, %7              ; %5=tmp0_1L+z3_4L=tmp0_1L
+    vpaddd      %6, %6, %8              ; %6=tmp0_1H+z3_4H=tmp0_1H
+
+    vpmaddwd    %3, %3, [rel PW_MF089_F060_MF256_F050]  ; %3=tmp3_2L
+    vpmaddwd    %4, %4, [rel PW_MF089_F060_MF256_F050]  ; %4=tmp3_2H
+    vperm2i128  %7, %7, %7, 0x01        ; %7=z4_3L
+    vperm2i128  %8, %8, %8, 0x01        ; %8=z4_3H
+    vpaddd      %7, %3, %7              ; %7=tmp3_2L+z4_3L=tmp3_2L
+    vpaddd      %8, %4, %8              ; %8=tmp3_2H+z4_3H=tmp3_2H
+
+    ; -- Final output stage
+
+    vpaddd      %1, %9, %7              ; %1=tmp10_11L+tmp3_2L=data0_1L
+    vpaddd      %2, %10, %8             ; %2=tmp10_11H+tmp3_2H=data0_1H
+    vpaddd      %1, %1, [rel PD_DESCALE_P %+ %13]
+    vpaddd      %2, %2, [rel PD_DESCALE_P %+ %13]
+    vpsrad      %1, %1, DESCALE_P %+ %13
+    vpsrad      %2, %2, DESCALE_P %+ %13
+    vpackssdw   %1, %1, %2              ; %1=data0_1
+
+    vpsubd      %3, %9, %7              ; %3=tmp10_11L-tmp3_2L=data7_6L
+    vpsubd      %4, %10, %8             ; %4=tmp10_11H-tmp3_2H=data7_6H
+    vpaddd      %3, %3, [rel PD_DESCALE_P %+ %13]
+    vpaddd      %4, %4, [rel PD_DESCALE_P %+ %13]
+    vpsrad      %3, %3, DESCALE_P %+ %13
+    vpsrad      %4, %4, DESCALE_P %+ %13
+    vpackssdw   %4, %3, %4              ; %4=data7_6
+
+    vpaddd      %7, %11, %5             ; %7=tmp13_12L+tmp0_1L=data3_2L
+    vpaddd      %8, %12, %6             ; %8=tmp13_12H+tmp0_1H=data3_2H
+    vpaddd      %7, %7, [rel PD_DESCALE_P %+ %13]
+    vpaddd      %8, %8, [rel PD_DESCALE_P %+ %13]
+    vpsrad      %7, %7, DESCALE_P %+ %13
+    vpsrad      %8, %8, DESCALE_P %+ %13
+    vpackssdw   %2, %7, %8              ; %2=data3_2
+
+    vpsubd      %7, %11, %5             ; %7=tmp13_12L-tmp0_1L=data4_5L
+    vpsubd      %8, %12, %6             ; %8=tmp13_12H-tmp0_1H=data4_5H
+    vpaddd      %7, %7, [rel PD_DESCALE_P %+ %13]
+    vpaddd      %8, %8, [rel PD_DESCALE_P %+ %13]
+    vpsrad      %7, %7, DESCALE_P %+ %13
+    vpsrad      %8, %8, DESCALE_P %+ %13
+    vpackssdw   %3, %7, %8              ; %3=data4_5
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_islow_avx2)
+
+EXTN(jconst_idct_islow_avx2):
+
+PW_F130_F054_MF130_F054    times 4  dw  (F_0_541 + F_0_765),  F_0_541
+                           times 4  dw  (F_0_541 - F_1_847),  F_0_541
+PW_MF078_F117_F078_F117    times 4  dw  (F_1_175 - F_1_961),  F_1_175
+                           times 4  dw  (F_1_175 - F_0_390),  F_1_175
+PW_MF060_MF089_MF050_MF256 times 4  dw  (F_0_298 - F_0_899), -F_0_899
+                           times 4  dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF089_F060_MF256_F050   times 4  dw -F_0_899, (F_1_501 - F_0_899)
+                           times 4  dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1              times 8  dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2              times 8  dd  1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP             times 32 db  CENTERJSAMPLE
+PW_1_NEG1                  times 8  dw  1
+                           times 8  dw -1
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_avx2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = jpeg_component_info *compptr
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_islow_avx2)
+
+EXTN(jsimd_idct_islow_avx2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    push_xmm    4
+    collect_args 4
+
+    ; ---- Pass 1: process columns.
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2
+    mov         eax, dword [DWBLOCK(1,0,r11,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,r11,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,r11,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,r11,SIZEOF_JCOEF)]
+    vpor        xmm0, xmm0, XMMWORD [XMMBLOCK(3,0,r11,SIZEOF_JCOEF)]
+    vpor        xmm1, xmm1, XMMWORD [XMMBLOCK(4,0,r11,SIZEOF_JCOEF)]
+    vpor        xmm0, xmm0, XMMWORD [XMMBLOCK(5,0,r11,SIZEOF_JCOEF)]
+    vpor        xmm1, xmm1, XMMWORD [XMMBLOCK(6,0,r11,SIZEOF_JCOEF)]
+    vpor        xmm0, xmm0, XMMWORD [XMMBLOCK(7,0,r11,SIZEOF_JCOEF)]
+    vpor        xmm1, xmm1, xmm0
+    vpacksswb   xmm1, xmm1, xmm1
+    vpacksswb   xmm1, xmm1, xmm1
+    movd        eax, xmm1
+    test        rax, rax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm5, XMMWORD [XMMBLOCK(0,0,r11,SIZEOF_JCOEF)]
+    vpmullw     xmm5, xmm5, XMMWORD [XMMBLOCK(0,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+
+    vpsllw      xmm5, xmm5, PASS1_BITS
+
+    vpunpcklwd  xmm4, xmm5, xmm5        ; xmm4=(00 00 01 01 02 02 03 03)
+    vpunpckhwd  xmm5, xmm5, xmm5        ; xmm5=(04 04 05 05 06 06 07 07)
+    vinserti128 ymm4, ymm4, xmm5, 1
+
+    vpshufd     ymm0, ymm4, 0x00        ; ymm0=col0_4=(00 00 00 00 00 00 00 00  04 04 04 04 04 04 04 04)
+    vpshufd     ymm1, ymm4, 0x55        ; ymm1=col1_5=(01 01 01 01 01 01 01 01  05 05 05 05 05 05 05 05)
+    vpshufd     ymm2, ymm4, 0xAA        ; ymm2=col2_6=(02 02 02 02 02 02 02 02  06 06 06 06 06 06 06 06)
+    vpshufd     ymm3, ymm4, 0xFF        ; ymm3=col3_7=(03 03 03 03 03 03 03 03  07 07 07 07 07 07 07 07)
+
+    jmp         near .column_end
+%endif
+.columnDCT:
+
+    vmovdqu     ymm4, YMMWORD [YMMBLOCK(0,0,r11,SIZEOF_JCOEF)]  ; ymm4=in0_1
+    vmovdqu     ymm5, YMMWORD [YMMBLOCK(2,0,r11,SIZEOF_JCOEF)]  ; ymm5=in2_3
+    vmovdqu     ymm6, YMMWORD [YMMBLOCK(4,0,r11,SIZEOF_JCOEF)]  ; ymm6=in4_5
+    vmovdqu     ymm7, YMMWORD [YMMBLOCK(6,0,r11,SIZEOF_JCOEF)]  ; ymm7=in6_7
+    vpmullw     ymm4, ymm4, YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+    vpmullw     ymm5, ymm5, YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+    vpmullw     ymm6, ymm6, YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+    vpmullw     ymm7, ymm7, YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+
+    vperm2i128  ymm0, ymm4, ymm6, 0x20  ; ymm0=in0_4
+    vperm2i128  ymm1, ymm5, ymm4, 0x31  ; ymm1=in3_1
+    vperm2i128  ymm2, ymm5, ymm7, 0x20  ; ymm2=in2_6
+    vperm2i128  ymm3, ymm7, ymm6, 0x31  ; ymm3=in7_5
+
+    dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 1
+    ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6
+
+    dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+    ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7
+
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows.
+
+    vperm2i128  ymm4, ymm3, ymm1, 0x31  ; ymm3=in7_5
+    vperm2i128  ymm1, ymm3, ymm1, 0x20  ; ymm1=in3_1
+
+    dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 2
+    ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6
+
+    dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+    ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7
+
+    vpacksswb   ymm0, ymm0, ymm1        ; ymm0=data01_45
+    vpacksswb   ymm1, ymm2, ymm4        ; ymm1=data23_67
+    vpaddb      ymm0, ymm0, [rel PB_CENTERJSAMP]
+    vpaddb      ymm1, ymm1, [rel PB_CENTERJSAMP]
+
+    vextracti128 xmm6, ymm1, 1          ; xmm3=data67
+    vextracti128 xmm4, ymm0, 1          ; xmm2=data45
+    vextracti128 xmm2, ymm1, 0          ; xmm1=data23
+    vextracti128 xmm0, ymm0, 0          ; xmm0=data01
+
+    vpshufd     xmm1, xmm0, 0x4E  ; xmm1=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    vpshufd     xmm3, xmm2, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+    vpshufd     xmm5, xmm4, 0x4E  ; xmm5=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+    vpshufd     xmm7, xmm6, 0x4E  ; xmm7=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+    vzeroupper
+
+    mov         eax, r13d
+
+    mov         rdxp, JSAMPROW [r12+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r12+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm0
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
+
+    mov         rdxp, JSAMPROW [r12+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r12+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+    mov         rdxp, JSAMPROW [r12+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r12+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
+
+    mov         rdxp, JSAMPROW [r12+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r12+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
+
+    uncollect_args 4
+    pop_xmm     4
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctint-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctint-sse2.asm
new file mode 100644
index 0000000000..7aa869bc0b
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctint-sse2.asm
@@ -0,0 +1,847 @@
+;
+; jidctint.asm - accurate integer IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2020, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_islow_sse2)
+
+EXTN(jconst_idct_islow_sse2):
+
+PW_F130_F054   times 4  dw  (F_0_541 + F_0_765),  F_0_541
+PW_F054_MF130  times 4  dw  F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117  times 4  dw  (F_1_175 - F_1_961),  F_1_175
+PW_F117_F078   times 4  dw  F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 4  dw  (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060  times 4  dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 4  dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050  times 4  dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1  times 4  dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2  times 4  dd  1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP times 16 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_sse2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = jpeg_component_info *compptr
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp  rbp + 0
+%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        12
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_islow_sse2)
+
+EXTN(jsimd_idct_islow_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 4
+
+    ; ---- Pass 1: process columns from input.
+
+    mov         rdx, r10                ; quantptr
+    mov         rsi, r11                ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
+    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, xmm0
+    packsswb    xmm1, xmm1
+    packsswb    xmm1, xmm1
+    movd        eax, xmm1
+    test        rax, rax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    psllw       xmm5, PASS1_BITS
+
+    movdqa      xmm4, xmm5              ; xmm5=in0=(00 01 02 03 04 05 06 07)
+    punpcklwd   xmm5, xmm5              ; xmm5=(00 00 01 01 02 02 03 03)
+    punpckhwd   xmm4, xmm4              ; xmm4=(04 04 05 05 06 06 07 07)
+
+    pshufd      xmm7, xmm5, 0x00        ; xmm7=col0=(00 00 00 00 00 00 00 00)
+    pshufd      xmm6, xmm5, 0x55        ; xmm6=col1=(01 01 01 01 01 01 01 01)
+    pshufd      xmm1, xmm5, 0xAA        ; xmm1=col2=(02 02 02 02 02 02 02 02)
+    pshufd      xmm5, xmm5, 0xFF        ; xmm5=col3=(03 03 03 03 03 03 03 03)
+    pshufd      xmm0, xmm4, 0x00        ; xmm0=col4=(04 04 04 04 04 04 04 04)
+    pshufd      xmm3, xmm4, 0x55        ; xmm3=col5=(05 05 05 05 05 05 05 05)
+    pshufd      xmm2, xmm4, 0xAA        ; xmm2=col6=(06 06 06 06 06 06 06 06)
+    pshufd      xmm4, xmm4, 0xFF        ; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+    movdqa      XMMWORD [wk(8)], xmm6   ; wk(8)=col1
+    movdqa      XMMWORD [wk(9)], xmm5   ; wk(9)=col3
+    movdqa      XMMWORD [wk(10)], xmm3  ; wk(10)=col5
+    movdqa      XMMWORD [wk(11)], xmm4  ; wk(11)=col7
+    jmp         near .column_end
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    movdqa      xmm4, xmm1              ; xmm1=in2=z2
+    movdqa      xmm5, xmm1
+    punpcklwd   xmm4, xmm3              ; xmm3=in6=z3
+    punpckhwd   xmm5, xmm3
+    movdqa      xmm1, xmm4
+    movdqa      xmm3, xmm5
+    pmaddwd     xmm4, [rel PW_F130_F054]   ; xmm4=tmp3L
+    pmaddwd     xmm5, [rel PW_F130_F054]   ; xmm5=tmp3H
+    pmaddwd     xmm1, [rel PW_F054_MF130]  ; xmm1=tmp2L
+    pmaddwd     xmm3, [rel PW_F054_MF130]  ; xmm3=tmp2H
+
+    movdqa      xmm6, xmm0
+    paddw       xmm0, xmm2              ; xmm0=in0+in4
+    psubw       xmm6, xmm2              ; xmm6=in0-in4
+
+    pxor        xmm7, xmm7
+    pxor        xmm2, xmm2
+    punpcklwd   xmm7, xmm0              ; xmm7=tmp0L
+    punpckhwd   xmm2, xmm0              ; xmm2=tmp0H
+    psrad       xmm7, (16-CONST_BITS)   ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+    psrad       xmm2, (16-CONST_BITS)   ; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+    movdqa      xmm0, xmm7
+    paddd       xmm7, xmm4              ; xmm7=tmp10L
+    psubd       xmm0, xmm4              ; xmm0=tmp13L
+    movdqa      xmm4, xmm2
+    paddd       xmm2, xmm5              ; xmm2=tmp10H
+    psubd       xmm4, xmm5              ; xmm4=tmp13H
+
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp10L
+    movdqa      XMMWORD [wk(1)], xmm2   ; wk(1)=tmp10H
+    movdqa      XMMWORD [wk(2)], xmm0   ; wk(2)=tmp13L
+    movdqa      XMMWORD [wk(3)], xmm4   ; wk(3)=tmp13H
+
+    pxor        xmm5, xmm5
+    pxor        xmm7, xmm7
+    punpcklwd   xmm5, xmm6              ; xmm5=tmp1L
+    punpckhwd   xmm7, xmm6              ; xmm7=tmp1H
+    psrad       xmm5, (16-CONST_BITS)   ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+    psrad       xmm7, (16-CONST_BITS)   ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+    movdqa      xmm2, xmm5
+    paddd       xmm5, xmm1              ; xmm5=tmp11L
+    psubd       xmm2, xmm1              ; xmm2=tmp12L
+    movdqa      xmm0, xmm7
+    paddd       xmm7, xmm3              ; xmm7=tmp11H
+    psubd       xmm0, xmm3              ; xmm0=tmp12H
+
+    movdqa      XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
+    movdqa      XMMWORD [wk(5)], xmm7   ; wk(5)=tmp11H
+    movdqa      XMMWORD [wk(6)], xmm2   ; wk(6)=tmp12L
+    movdqa      XMMWORD [wk(7)], xmm0   ; wk(7)=tmp12H
+
+    ; -- Odd part
+
+    movdqa      xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movdqa      xmm5, xmm6
+    movdqa      xmm7, xmm4
+    paddw       xmm5, xmm3              ; xmm5=z3
+    paddw       xmm7, xmm1              ; xmm7=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm2, xmm5
+    movdqa      xmm0, xmm5
+    punpcklwd   xmm2, xmm7
+    punpckhwd   xmm0, xmm7
+    movdqa      xmm5, xmm2
+    movdqa      xmm7, xmm0
+    pmaddwd     xmm2, [rel PW_MF078_F117]  ; xmm2=z3L
+    pmaddwd     xmm0, [rel PW_MF078_F117]  ; xmm0=z3H
+    pmaddwd     xmm5, [rel PW_F117_F078]   ; xmm5=z4L
+    pmaddwd     xmm7, [rel PW_F117_F078]   ; xmm7=z4H
+
+    movdqa      XMMWORD [wk(10)], xmm2  ; wk(10)=z3L
+    movdqa      XMMWORD [wk(11)], xmm0  ; wk(11)=z3H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    movdqa      xmm2, xmm3
+    movdqa      xmm0, xmm3
+    punpcklwd   xmm2, xmm4
+    punpckhwd   xmm0, xmm4
+    movdqa      xmm3, xmm2
+    movdqa      xmm4, xmm0
+    pmaddwd     xmm2, [rel PW_MF060_MF089]  ; xmm2=tmp0L
+    pmaddwd     xmm0, [rel PW_MF060_MF089]  ; xmm0=tmp0H
+    pmaddwd     xmm3, [rel PW_MF089_F060]   ; xmm3=tmp3L
+    pmaddwd     xmm4, [rel PW_MF089_F060]   ; xmm4=tmp3H
+
+    paddd       xmm2, XMMWORD [wk(10)]  ; xmm2=tmp0L
+    paddd       xmm0, XMMWORD [wk(11)]  ; xmm0=tmp0H
+    paddd       xmm3, xmm5              ; xmm3=tmp3L
+    paddd       xmm4, xmm7              ; xmm4=tmp3H
+
+    movdqa      XMMWORD [wk(8)], xmm2   ; wk(8)=tmp0L
+    movdqa      XMMWORD [wk(9)], xmm0   ; wk(9)=tmp0H
+
+    movdqa      xmm2, xmm1
+    movdqa      xmm0, xmm1
+    punpcklwd   xmm2, xmm6
+    punpckhwd   xmm0, xmm6
+    movdqa      xmm1, xmm2
+    movdqa      xmm6, xmm0
+    pmaddwd     xmm2, [rel PW_MF050_MF256]  ; xmm2=tmp1L
+    pmaddwd     xmm0, [rel PW_MF050_MF256]  ; xmm0=tmp1H
+    pmaddwd     xmm1, [rel PW_MF256_F050]   ; xmm1=tmp2L
+    pmaddwd     xmm6, [rel PW_MF256_F050]   ; xmm6=tmp2H
+
+    paddd       xmm2, xmm5              ; xmm2=tmp1L
+    paddd       xmm0, xmm7              ; xmm0=tmp1H
+    paddd       xmm1, XMMWORD [wk(10)]  ; xmm1=tmp2L
+    paddd       xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
+
+    movdqa      XMMWORD [wk(10)], xmm2  ; wk(10)=tmp1L
+    movdqa      XMMWORD [wk(11)], xmm0  ; wk(11)=tmp1H
+
+    ; -- Final output stage
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
+    movdqa      xmm7, XMMWORD [wk(1)]   ; xmm7=tmp10H
+
+    movdqa      xmm2, xmm5
+    movdqa      xmm0, xmm7
+    paddd       xmm5, xmm3              ; xmm5=data0L
+    paddd       xmm7, xmm4              ; xmm7=data0H
+    psubd       xmm2, xmm3              ; xmm2=data7L
+    psubd       xmm0, xmm4              ; xmm0=data7H
+
+    movdqa      xmm3, [rel PD_DESCALE_P1]  ; xmm3=[rel PD_DESCALE_P1]
+
+    paddd       xmm5, xmm3
+    paddd       xmm7, xmm3
+    psrad       xmm5, DESCALE_P1
+    psrad       xmm7, DESCALE_P1
+    paddd       xmm2, xmm3
+    paddd       xmm0, xmm3
+    psrad       xmm2, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+
+    packssdw    xmm5, xmm7              ; xmm5=data0=(00 01 02 03 04 05 06 07)
+    packssdw    xmm2, xmm0              ; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+    movdqa      xmm4, XMMWORD [wk(4)]   ; xmm4=tmp11L
+    movdqa      xmm3, XMMWORD [wk(5)]   ; xmm3=tmp11H
+
+    movdqa      xmm7, xmm4
+    movdqa      xmm0, xmm3
+    paddd       xmm4, xmm1              ; xmm4=data1L
+    paddd       xmm3, xmm6              ; xmm3=data1H
+    psubd       xmm7, xmm1              ; xmm7=data6L
+    psubd       xmm0, xmm6              ; xmm0=data6H
+
+    movdqa      xmm1, [rel PD_DESCALE_P1]  ; xmm1=[rel PD_DESCALE_P1]
+
+    paddd       xmm4, xmm1
+    paddd       xmm3, xmm1
+    psrad       xmm4, DESCALE_P1
+    psrad       xmm3, DESCALE_P1
+    paddd       xmm7, xmm1
+    paddd       xmm0, xmm1
+    psrad       xmm7, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+
+    packssdw    xmm4, xmm3              ; xmm4=data1=(10 11 12 13 14 15 16 17)
+    packssdw    xmm7, xmm0              ; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+    movdqa      xmm6, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm4              ; xmm5=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm6, xmm4              ; xmm6=(04 14 05 15 06 16 07 17)
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 1)
+    punpcklwd   xmm7, xmm2              ; xmm7=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm1, xmm2              ; xmm1=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm3, XMMWORD [wk(6)]   ; xmm3=tmp12L
+    movdqa      xmm0, XMMWORD [wk(7)]   ; xmm0=tmp12H
+    movdqa      xmm4, XMMWORD [wk(10)]  ; xmm4=tmp1L
+    movdqa      xmm2, XMMWORD [wk(11)]  ; xmm2=tmp1H
+
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 01 11 02 12 03 13)
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=(04 14 05 15 06 16 07 17)
+    movdqa      XMMWORD [wk(4)], xmm7   ; wk(4)=(60 70 61 71 62 72 63 73)
+    movdqa      XMMWORD [wk(5)], xmm1   ; wk(5)=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm5, xmm3
+    movdqa      xmm6, xmm0
+    paddd       xmm3, xmm4              ; xmm3=data2L
+    paddd       xmm0, xmm2              ; xmm0=data2H
+    psubd       xmm5, xmm4              ; xmm5=data5L
+    psubd       xmm6, xmm2              ; xmm6=data5H
+
+    movdqa      xmm7, [rel PD_DESCALE_P1]  ; xmm7=[rel PD_DESCALE_P1]
+
+    paddd       xmm3, xmm7
+    paddd       xmm0, xmm7
+    psrad       xmm3, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+    paddd       xmm5, xmm7
+    paddd       xmm6, xmm7
+    psrad       xmm5, DESCALE_P1
+    psrad       xmm6, DESCALE_P1
+
+    packssdw    xmm3, xmm0              ; xmm3=data2=(20 21 22 23 24 25 26 27)
+    packssdw    xmm5, xmm6              ; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+    movdqa      xmm1, XMMWORD [wk(2)]   ; xmm1=tmp13L
+    movdqa      xmm4, XMMWORD [wk(3)]   ; xmm4=tmp13H
+    movdqa      xmm2, XMMWORD [wk(8)]   ; xmm2=tmp0L
+    movdqa      xmm7, XMMWORD [wk(9)]   ; xmm7=tmp0H
+
+    movdqa      xmm0, xmm1
+    movdqa      xmm6, xmm4
+    paddd       xmm1, xmm2              ; xmm1=data3L
+    paddd       xmm4, xmm7              ; xmm4=data3H
+    psubd       xmm0, xmm2              ; xmm0=data4L
+    psubd       xmm6, xmm7              ; xmm6=data4H
+
+    movdqa      xmm2, [rel PD_DESCALE_P1]  ; xmm2=[rel PD_DESCALE_P1]
+
+    paddd       xmm1, xmm2
+    paddd       xmm4, xmm2
+    psrad       xmm1, DESCALE_P1
+    psrad       xmm4, DESCALE_P1
+    paddd       xmm0, xmm2
+    paddd       xmm6, xmm2
+    psrad       xmm0, DESCALE_P1
+    psrad       xmm6, DESCALE_P1
+
+    packssdw    xmm1, xmm4              ; xmm1=data3=(30 31 32 33 34 35 36 37)
+    packssdw    xmm0, xmm6              ; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=(00 10 01 11 02 12 03 13)
+    movdqa      xmm2, XMMWORD [wk(1)]   ; xmm2=(04 14 05 15 06 16 07 17)
+
+    movdqa      xmm4, xmm3              ; transpose coefficients(phase 1)
+    punpcklwd   xmm3, xmm1              ; xmm3=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm4, xmm1              ; xmm4=(24 34 25 35 26 36 27 37)
+    movdqa      xmm6, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm5              ; xmm0=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm6, xmm5              ; xmm6=(44 54 45 55 46 56 47 57)
+
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 2)
+    punpckldq   xmm7, xmm3              ; xmm7=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm1, xmm3              ; xmm1=(02 12 22 32 03 13 23 33)
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm4              ; xmm2=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm5, xmm4              ; xmm5=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm3, XMMWORD [wk(4)]   ; xmm3=(60 70 61 71 62 72 63 73)
+    movdqa      xmm4, XMMWORD [wk(5)]   ; xmm4=(64 74 65 75 66 76 67 77)
+
+    movdqa      XMMWORD [wk(6)], xmm2   ; wk(6)=(04 14 24 34 05 15 25 35)
+    movdqa      XMMWORD [wk(7)], xmm5   ; wk(7)=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm2, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm3              ; xmm0=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm2, xmm3              ; xmm2=(42 52 62 72 43 53 63 73)
+    movdqa      xmm5, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm4              ; xmm6=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm5, xmm4              ; xmm5=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm3, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm0              ; xmm7=col0=(00 10 20 30 40 50 60 70)
+    punpckhqdq  xmm3, xmm0              ; xmm3=col1=(01 11 21 31 41 51 61 71)
+    movdqa      xmm4, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm2              ; xmm1=col2=(02 12 22 32 42 52 62 72)
+    punpckhqdq  xmm4, xmm2              ; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm0, XMMWORD [wk(6)]   ; xmm0=(04 14 24 34 05 15 25 35)
+    movdqa      xmm2, XMMWORD [wk(7)]   ; xmm2=(06 16 26 36 07 17 27 37)
+
+    movdqa      XMMWORD [wk(8)], xmm3   ; wk(8)=col1
+    movdqa      XMMWORD [wk(9)], xmm4   ; wk(9)=col3
+
+    movdqa      xmm3, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm6              ; xmm0=col4=(04 14 24 34 44 54 64 74)
+    punpckhqdq  xmm3, xmm6              ; xmm3=col5=(05 15 25 35 45 55 65 75)
+    movdqa      xmm4, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm5              ; xmm2=col6=(06 16 26 36 46 56 66 76)
+    punpckhqdq  xmm4, xmm5              ; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+    movdqa      XMMWORD [wk(10)], xmm3  ; wk(10)=col5
+    movdqa      XMMWORD [wk(11)], xmm4  ; wk(11)=col7
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         rax, [original_rbp]
+    mov         rdi, r12                ; (JSAMPROW *)
+    mov         eax, r13d
+
+    ; -- Even part
+
+    ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    movdqa      xmm6, xmm1              ; xmm1=in2=z2
+    movdqa      xmm5, xmm1
+    punpcklwd   xmm6, xmm2              ; xmm2=in6=z3
+    punpckhwd   xmm5, xmm2
+    movdqa      xmm1, xmm6
+    movdqa      xmm2, xmm5
+    pmaddwd     xmm6, [rel PW_F130_F054]   ; xmm6=tmp3L
+    pmaddwd     xmm5, [rel PW_F130_F054]   ; xmm5=tmp3H
+    pmaddwd     xmm1, [rel PW_F054_MF130]  ; xmm1=tmp2L
+    pmaddwd     xmm2, [rel PW_F054_MF130]  ; xmm2=tmp2H
+
+    movdqa      xmm3, xmm7
+    paddw       xmm7, xmm0              ; xmm7=in0+in4
+    psubw       xmm3, xmm0              ; xmm3=in0-in4
+
+    pxor        xmm4, xmm4
+    pxor        xmm0, xmm0
+    punpcklwd   xmm4, xmm7              ; xmm4=tmp0L
+    punpckhwd   xmm0, xmm7              ; xmm0=tmp0H
+    psrad       xmm4, (16-CONST_BITS)   ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+    psrad       xmm0, (16-CONST_BITS)   ; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+    movdqa      xmm7, xmm4
+    paddd       xmm4, xmm6              ; xmm4=tmp10L
+    psubd       xmm7, xmm6              ; xmm7=tmp13L
+    movdqa      xmm6, xmm0
+    paddd       xmm0, xmm5              ; xmm0=tmp10H
+    psubd       xmm6, xmm5              ; xmm6=tmp13H
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=tmp10L
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp10H
+    movdqa      XMMWORD [wk(2)], xmm7   ; wk(2)=tmp13L
+    movdqa      XMMWORD [wk(3)], xmm6   ; wk(3)=tmp13H
+
+    pxor        xmm5, xmm5
+    pxor        xmm4, xmm4
+    punpcklwd   xmm5, xmm3              ; xmm5=tmp1L
+    punpckhwd   xmm4, xmm3              ; xmm4=tmp1H
+    psrad       xmm5, (16-CONST_BITS)   ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+    psrad       xmm4, (16-CONST_BITS)   ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+    movdqa      xmm0, xmm5
+    paddd       xmm5, xmm1              ; xmm5=tmp11L
+    psubd       xmm0, xmm1              ; xmm0=tmp12L
+    movdqa      xmm7, xmm4
+    paddd       xmm4, xmm2              ; xmm4=tmp11H
+    psubd       xmm7, xmm2              ; xmm7=tmp12H
+
+    movdqa      XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
+    movdqa      XMMWORD [wk(5)], xmm4   ; wk(5)=tmp11H
+    movdqa      XMMWORD [wk(6)], xmm0   ; wk(6)=tmp12L
+    movdqa      XMMWORD [wk(7)], xmm7   ; wk(7)=tmp12H
+
+    ; -- Odd part
+
+    movdqa      xmm6, XMMWORD [wk(9)]   ; xmm6=col3
+    movdqa      xmm3, XMMWORD [wk(8)]   ; xmm3=col1
+    movdqa      xmm1, XMMWORD [wk(11)]  ; xmm1=col7
+    movdqa      xmm2, XMMWORD [wk(10)]  ; xmm2=col5
+
+    movdqa      xmm5, xmm6
+    movdqa      xmm4, xmm3
+    paddw       xmm5, xmm1              ; xmm5=z3
+    paddw       xmm4, xmm2              ; xmm4=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm0, xmm5
+    movdqa      xmm7, xmm5
+    punpcklwd   xmm0, xmm4
+    punpckhwd   xmm7, xmm4
+    movdqa      xmm5, xmm0
+    movdqa      xmm4, xmm7
+    pmaddwd     xmm0, [rel PW_MF078_F117]  ; xmm0=z3L
+    pmaddwd     xmm7, [rel PW_MF078_F117]  ; xmm7=z3H
+    pmaddwd     xmm5, [rel PW_F117_F078]   ; xmm5=z4L
+    pmaddwd     xmm4, [rel PW_F117_F078]   ; xmm4=z4H
+
+    movdqa      XMMWORD [wk(10)], xmm0  ; wk(10)=z3L
+    movdqa      XMMWORD [wk(11)], xmm7  ; wk(11)=z3H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    movdqa      xmm0, xmm1
+    movdqa      xmm7, xmm1
+    punpcklwd   xmm0, xmm3
+    punpckhwd   xmm7, xmm3
+    movdqa      xmm1, xmm0
+    movdqa      xmm3, xmm7
+    pmaddwd     xmm0, [rel PW_MF060_MF089]  ; xmm0=tmp0L
+    pmaddwd     xmm7, [rel PW_MF060_MF089]  ; xmm7=tmp0H
+    pmaddwd     xmm1, [rel PW_MF089_F060]   ; xmm1=tmp3L
+    pmaddwd     xmm3, [rel PW_MF089_F060]   ; xmm3=tmp3H
+
+    paddd       xmm0, XMMWORD [wk(10)]  ; xmm0=tmp0L
+    paddd       xmm7, XMMWORD [wk(11)]  ; xmm7=tmp0H
+    paddd       xmm1, xmm5              ; xmm1=tmp3L
+    paddd       xmm3, xmm4              ; xmm3=tmp3H
+
+    movdqa      XMMWORD [wk(8)], xmm0   ; wk(8)=tmp0L
+    movdqa      XMMWORD [wk(9)], xmm7   ; wk(9)=tmp0H
+
+    movdqa      xmm0, xmm2
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm0, xmm6
+    punpckhwd   xmm7, xmm6
+    movdqa      xmm2, xmm0
+    movdqa      xmm6, xmm7
+    pmaddwd     xmm0, [rel PW_MF050_MF256]  ; xmm0=tmp1L
+    pmaddwd     xmm7, [rel PW_MF050_MF256]  ; xmm7=tmp1H
+    pmaddwd     xmm2, [rel PW_MF256_F050]   ; xmm2=tmp2L
+    pmaddwd     xmm6, [rel PW_MF256_F050]   ; xmm6=tmp2H
+
+    paddd       xmm0, xmm5              ; xmm0=tmp1L
+    paddd       xmm7, xmm4              ; xmm7=tmp1H
+    paddd       xmm2, XMMWORD [wk(10)]  ; xmm2=tmp2L
+    paddd       xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
+
+    movdqa      XMMWORD [wk(10)], xmm0  ; wk(10)=tmp1L
+    movdqa      XMMWORD [wk(11)], xmm7  ; wk(11)=tmp1H
+
+    ; -- Final output stage
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
+    movdqa      xmm4, XMMWORD [wk(1)]   ; xmm4=tmp10H
+
+    movdqa      xmm0, xmm5
+    movdqa      xmm7, xmm4
+    paddd       xmm5, xmm1              ; xmm5=data0L
+    paddd       xmm4, xmm3              ; xmm4=data0H
+    psubd       xmm0, xmm1              ; xmm0=data7L
+    psubd       xmm7, xmm3              ; xmm7=data7H
+
+    movdqa      xmm1, [rel PD_DESCALE_P2]  ; xmm1=[rel PD_DESCALE_P2]
+
+    paddd       xmm5, xmm1
+    paddd       xmm4, xmm1
+    psrad       xmm5, DESCALE_P2
+    psrad       xmm4, DESCALE_P2
+    paddd       xmm0, xmm1
+    paddd       xmm7, xmm1
+    psrad       xmm0, DESCALE_P2
+    psrad       xmm7, DESCALE_P2
+
+    packssdw    xmm5, xmm4              ; xmm5=data0=(00 10 20 30 40 50 60 70)
+    packssdw    xmm0, xmm7              ; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+    movdqa      xmm3, XMMWORD [wk(4)]   ; xmm3=tmp11L
+    movdqa      xmm1, XMMWORD [wk(5)]   ; xmm1=tmp11H
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm7, xmm1
+    paddd       xmm3, xmm2              ; xmm3=data1L
+    paddd       xmm1, xmm6              ; xmm1=data1H
+    psubd       xmm4, xmm2              ; xmm4=data6L
+    psubd       xmm7, xmm6              ; xmm7=data6H
+
+    movdqa      xmm2, [rel PD_DESCALE_P2]  ; xmm2=[rel PD_DESCALE_P2]
+
+    paddd       xmm3, xmm2
+    paddd       xmm1, xmm2
+    psrad       xmm3, DESCALE_P2
+    psrad       xmm1, DESCALE_P2
+    paddd       xmm4, xmm2
+    paddd       xmm7, xmm2
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm7, DESCALE_P2
+
+    packssdw    xmm3, xmm1              ; xmm3=data1=(01 11 21 31 41 51 61 71)
+    packssdw    xmm4, xmm7              ; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+    packsswb    xmm5, xmm4              ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    packsswb    xmm3, xmm0              ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    movdqa      xmm6, XMMWORD [wk(6)]   ; xmm6=tmp12L
+    movdqa      xmm2, XMMWORD [wk(7)]   ; xmm2=tmp12H
+    movdqa      xmm1, XMMWORD [wk(10)]  ; xmm1=tmp1L
+    movdqa      xmm7, XMMWORD [wk(11)]  ; xmm7=tmp1H
+
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    movdqa      xmm4, xmm6
+    movdqa      xmm0, xmm2
+    paddd       xmm6, xmm1              ; xmm6=data2L
+    paddd       xmm2, xmm7              ; xmm2=data2H
+    psubd       xmm4, xmm1              ; xmm4=data5L
+    psubd       xmm0, xmm7              ; xmm0=data5H
+
+    movdqa      xmm5, [rel PD_DESCALE_P2]  ; xmm5=[rel PD_DESCALE_P2]
+
+    paddd       xmm6, xmm5
+    paddd       xmm2, xmm5
+    psrad       xmm6, DESCALE_P2
+    psrad       xmm2, DESCALE_P2
+    paddd       xmm4, xmm5
+    paddd       xmm0, xmm5
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm0, DESCALE_P2
+
+    packssdw    xmm6, xmm2              ; xmm6=data2=(02 12 22 32 42 52 62 72)
+    packssdw    xmm4, xmm0              ; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+    movdqa      xmm3, XMMWORD [wk(2)]   ; xmm3=tmp13L
+    movdqa      xmm1, XMMWORD [wk(3)]   ; xmm1=tmp13H
+    movdqa      xmm7, XMMWORD [wk(8)]   ; xmm7=tmp0L
+    movdqa      xmm5, XMMWORD [wk(9)]   ; xmm5=tmp0H
+
+    movdqa      xmm2, xmm3
+    movdqa      xmm0, xmm1
+    paddd       xmm3, xmm7              ; xmm3=data3L
+    paddd       xmm1, xmm5              ; xmm1=data3H
+    psubd       xmm2, xmm7              ; xmm2=data4L
+    psubd       xmm0, xmm5              ; xmm0=data4H
+
+    movdqa      xmm7, [rel PD_DESCALE_P2]  ; xmm7=[rel PD_DESCALE_P2]
+
+    paddd       xmm3, xmm7
+    paddd       xmm1, xmm7
+    psrad       xmm3, DESCALE_P2
+    psrad       xmm1, DESCALE_P2
+    paddd       xmm2, xmm7
+    paddd       xmm0, xmm7
+    psrad       xmm2, DESCALE_P2
+    psrad       xmm0, DESCALE_P2
+
+    movdqa      xmm5, [rel PB_CENTERJSAMP]  ; xmm5=[rel PB_CENTERJSAMP]
+
+    packssdw    xmm3, xmm1             ; xmm3=data3=(03 13 23 33 43 53 63 73)
+    packssdw    xmm2, xmm0             ; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+    movdqa      xmm7, XMMWORD [wk(0)]  ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    movdqa      xmm1, XMMWORD [wk(1)]  ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    packsswb    xmm6, xmm2             ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+    packsswb    xmm3, xmm4             ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+    paddb       xmm7, xmm5
+    paddb       xmm1, xmm5
+    paddb       xmm6, xmm5
+    paddb       xmm3, xmm5
+
+    movdqa      xmm0, xmm7        ; transpose coefficients(phase 1)
+    punpcklbw   xmm7, xmm1        ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+    punpckhbw   xmm0, xmm1        ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+    movdqa      xmm2, xmm6        ; transpose coefficients(phase 1)
+    punpcklbw   xmm6, xmm3        ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+    punpckhbw   xmm2, xmm3        ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+    movdqa      xmm4, xmm7        ; transpose coefficients(phase 2)
+    punpcklwd   xmm7, xmm6        ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+    punpckhwd   xmm4, xmm6        ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+    movdqa      xmm5, xmm2        ; transpose coefficients(phase 2)
+    punpcklwd   xmm2, xmm0        ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+    punpckhwd   xmm5, xmm0        ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+    movdqa      xmm1, xmm7        ; transpose coefficients(phase 3)
+    punpckldq   xmm7, xmm2        ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    punpckhdq   xmm1, xmm2        ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+    movdqa      xmm3, xmm4        ; transpose coefficients(phase 3)
+    punpckldq   xmm4, xmm5        ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+    punpckhdq   xmm3, xmm5        ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+    pshufd      xmm6, xmm7, 0x4E  ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    pshufd      xmm0, xmm1, 0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+    pshufd      xmm2, xmm4, 0x4E  ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+    pshufd      xmm5, xmm3, 0x4E  ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
+    mov         rdxp, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+    mov         rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
+    mov         rdxp, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
+
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctred-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctred-sse2.asm
new file mode 100644
index 0000000000..4ece9d891c
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctred-sse2.asm
@@ -0,0 +1,574 @@
+;
+; jidctred.asm - reduced-size IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS    13
+%define PASS1_BITS    2
+
+%define DESCALE_P1_4  (CONST_BITS - PASS1_BITS + 1)
+%define DESCALE_P2_4  (CONST_BITS + PASS1_BITS + 3 + 1)
+%define DESCALE_P1_2  (CONST_BITS - PASS1_BITS + 2)
+%define DESCALE_P2_2  (CONST_BITS + PASS1_BITS + 3 + 2)
+
+%if CONST_BITS == 13
+F_0_211 equ  1730  ; FIX(0.211164243)
+F_0_509 equ  4176  ; FIX(0.509795579)
+F_0_601 equ  4926  ; FIX(0.601344887)
+F_0_720 equ  5906  ; FIX(0.720959822)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_850 equ  6967  ; FIX(0.850430095)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_061 equ  8697  ; FIX(1.061594337)
+F_1_272 equ 10426  ; FIX(1.272758580)
+F_1_451 equ 11893  ; FIX(1.451774981)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_2_172 equ 17799  ; FIX(2.172734803)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_624 equ 29692  ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS)  ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS)  ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS)  ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS)  ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS)  ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS)  ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS)  ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS)  ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS)  ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS)  ; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_red_sse2)
+
+EXTN(jconst_idct_red_sse2):
+
+PW_F184_MF076   times 4  dw  F_1_847, -F_0_765
+PW_F256_F089    times 4  dw  F_2_562,  F_0_899
+PW_F106_MF217   times 4  dw  F_1_061, -F_2_172
+PW_MF060_MF050  times 4  dw -F_0_601, -F_0_509
+PW_F145_MF021   times 4  dw  F_1_451, -F_0_211
+PW_F362_MF127   times 4  dw  F_3_624, -F_1_272
+PW_F085_MF072   times 4  dw  F_0_850, -F_0_720
+PD_DESCALE_P1_4 times 4  dd  1 << (DESCALE_P1_4 - 1)
+PD_DESCALE_P2_4 times 4  dd  1 << (DESCALE_P2_4 - 1)
+PD_DESCALE_P1_2 times 4  dd  1 << (DESCALE_P1_2 - 1)
+PD_DESCALE_P2_2 times 4  dd  1 << (DESCALE_P2_2 - 1)
+PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_sse2(void *dct_table, JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp  rbp + 0
+%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_4x4_sse2)
+
+EXTN(jsimd_idct_4x4_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 4
+
+    ; ---- Pass 1: process columns from input.
+
+    mov         rdx, r10                ; quantptr
+    mov         rsi, r11                ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    jnz         short .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, xmm1
+    packsswb    xmm0, xmm0
+    packsswb    xmm0, xmm0
+    movd        eax, xmm0
+    test        rax, rax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    psllw       xmm0, PASS1_BITS
+
+    movdqa      xmm3, xmm0        ; xmm0=in0=(00 01 02 03 04 05 06 07)
+    punpcklwd   xmm0, xmm0        ; xmm0=(00 00 01 01 02 02 03 03)
+    punpckhwd   xmm3, xmm3        ; xmm3=(04 04 05 05 06 06 07 07)
+
+    pshufd      xmm1, xmm0, 0x50  ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+    pshufd      xmm0, xmm0, 0xFA  ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+    pshufd      xmm6, xmm3, 0x50  ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+    pshufd      xmm3, xmm3, 0xFA  ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+    jmp         near .column_end
+%endif
+.columnDCT:
+
+    ; -- Odd part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movdqa      xmm4, xmm0
+    movdqa      xmm5, xmm0
+    punpcklwd   xmm4, xmm1
+    punpckhwd   xmm5, xmm1
+    movdqa      xmm0, xmm4
+    movdqa      xmm1, xmm5
+    pmaddwd     xmm4, [rel PW_F256_F089]   ; xmm4=(tmp2L)
+    pmaddwd     xmm5, [rel PW_F256_F089]   ; xmm5=(tmp2H)
+    pmaddwd     xmm0, [rel PW_F106_MF217]  ; xmm0=(tmp0L)
+    pmaddwd     xmm1, [rel PW_F106_MF217]  ; xmm1=(tmp0H)
+
+    movdqa      xmm6, xmm2
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm6, xmm3
+    punpckhwd   xmm7, xmm3
+    movdqa      xmm2, xmm6
+    movdqa      xmm3, xmm7
+    pmaddwd     xmm6, [rel PW_MF060_MF050]  ; xmm6=(tmp2L)
+    pmaddwd     xmm7, [rel PW_MF060_MF050]  ; xmm7=(tmp2H)
+    pmaddwd     xmm2, [rel PW_F145_MF021]   ; xmm2=(tmp0L)
+    pmaddwd     xmm3, [rel PW_F145_MF021]   ; xmm3=(tmp0H)
+
+    paddd       xmm6, xmm4              ; xmm6=tmp2L
+    paddd       xmm7, xmm5              ; xmm7=tmp2H
+    paddd       xmm2, xmm0              ; xmm2=tmp0L
+    paddd       xmm3, xmm1              ; xmm3=tmp0H
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=tmp0L
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=tmp0H
+
+    ; -- Even part
+
+    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    pxor        xmm1, xmm1
+    pxor        xmm2, xmm2
+    punpcklwd   xmm1, xmm4               ; xmm1=tmp0L
+    punpckhwd   xmm2, xmm4               ; xmm2=tmp0H
+    psrad       xmm1, (16-CONST_BITS-1)  ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+    psrad       xmm2, (16-CONST_BITS-1)  ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+    movdqa      xmm3, xmm5              ; xmm5=in2=z2
+    punpcklwd   xmm5, xmm0              ; xmm0=in6=z3
+    punpckhwd   xmm3, xmm0
+    pmaddwd     xmm5, [rel PW_F184_MF076]  ; xmm5=tmp2L
+    pmaddwd     xmm3, [rel PW_F184_MF076]  ; xmm3=tmp2H
+
+    movdqa      xmm4, xmm1
+    movdqa      xmm0, xmm2
+    paddd       xmm1, xmm5              ; xmm1=tmp10L
+    paddd       xmm2, xmm3              ; xmm2=tmp10H
+    psubd       xmm4, xmm5              ; xmm4=tmp12L
+    psubd       xmm0, xmm3              ; xmm0=tmp12H
+
+    ; -- Final output stage
+
+    movdqa      xmm5, xmm1
+    movdqa      xmm3, xmm2
+    paddd       xmm1, xmm6              ; xmm1=data0L
+    paddd       xmm2, xmm7              ; xmm2=data0H
+    psubd       xmm5, xmm6              ; xmm5=data3L
+    psubd       xmm3, xmm7              ; xmm3=data3H
+
+    movdqa      xmm6, [rel PD_DESCALE_P1_4]  ; xmm6=[rel PD_DESCALE_P1_4]
+
+    paddd       xmm1, xmm6
+    paddd       xmm2, xmm6
+    psrad       xmm1, DESCALE_P1_4
+    psrad       xmm2, DESCALE_P1_4
+    paddd       xmm5, xmm6
+    paddd       xmm3, xmm6
+    psrad       xmm5, DESCALE_P1_4
+    psrad       xmm3, DESCALE_P1_4
+
+    packssdw    xmm1, xmm2              ; xmm1=data0=(00 01 02 03 04 05 06 07)
+    packssdw    xmm5, xmm3              ; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp0L
+    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=tmp0H
+
+    movdqa      xmm2, xmm4
+    movdqa      xmm3, xmm0
+    paddd       xmm4, xmm7              ; xmm4=data1L
+    paddd       xmm0, xmm6              ; xmm0=data1H
+    psubd       xmm2, xmm7              ; xmm2=data2L
+    psubd       xmm3, xmm6              ; xmm3=data2H
+
+    movdqa      xmm7, [rel PD_DESCALE_P1_4]  ; xmm7=[rel PD_DESCALE_P1_4]
+
+    paddd       xmm4, xmm7
+    paddd       xmm0, xmm7
+    psrad       xmm4, DESCALE_P1_4
+    psrad       xmm0, DESCALE_P1_4
+    paddd       xmm2, xmm7
+    paddd       xmm3, xmm7
+    psrad       xmm2, DESCALE_P1_4
+    psrad       xmm3, DESCALE_P1_4
+
+    packssdw    xmm4, xmm0        ; xmm4=data1=(10 11 12 13 14 15 16 17)
+    packssdw    xmm2, xmm3        ; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+    movdqa      xmm6, xmm1        ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm4        ; xmm1=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm6, xmm4        ; xmm6=(04 14 05 15 06 16 07 17)
+    movdqa      xmm7, xmm2        ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm5        ; xmm2=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm7, xmm5        ; xmm7=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm0, xmm1        ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm2        ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm0, xmm2        ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+    movdqa      xmm3, xmm6        ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm7        ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm3, xmm7        ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows, store into output array.
+
+    mov         rax, [original_rbp]
+    mov         rdi, r12                ; (JSAMPROW *)
+    mov         eax, r13d
+
+    ; -- Even part
+
+    pxor        xmm4, xmm4
+    punpcklwd   xmm4, xmm1               ; xmm4=tmp0
+    psrad       xmm4, (16-CONST_BITS-1)  ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+    ; -- Odd part
+
+    punpckhwd   xmm1, xmm0
+    punpckhwd   xmm6, xmm3
+    movdqa      xmm5, xmm1
+    movdqa      xmm2, xmm6
+    pmaddwd     xmm1, [rel PW_F256_F089]    ; xmm1=(tmp2)
+    pmaddwd     xmm6, [rel PW_MF060_MF050]  ; xmm6=(tmp2)
+    pmaddwd     xmm5, [rel PW_F106_MF217]   ; xmm5=(tmp0)
+    pmaddwd     xmm2, [rel PW_F145_MF021]   ; xmm2=(tmp0)
+
+    paddd       xmm6, xmm1              ; xmm6=tmp2
+    paddd       xmm2, xmm5              ; xmm2=tmp0
+
+    ; -- Even part
+
+    punpcklwd   xmm0, xmm3
+    pmaddwd     xmm0, [rel PW_F184_MF076]  ; xmm0=tmp2
+
+    movdqa      xmm7, xmm4
+    paddd       xmm4, xmm0              ; xmm4=tmp10
+    psubd       xmm7, xmm0              ; xmm7=tmp12
+
+    ; -- Final output stage
+
+    movdqa      xmm1, [rel PD_DESCALE_P2_4]  ; xmm1=[rel PD_DESCALE_P2_4]
+
+    movdqa      xmm5, xmm4
+    movdqa      xmm3, xmm7
+    paddd       xmm4, xmm6              ; xmm4=data0=(00 10 20 30)
+    paddd       xmm7, xmm2              ; xmm7=data1=(01 11 21 31)
+    psubd       xmm5, xmm6              ; xmm5=data3=(03 13 23 33)
+    psubd       xmm3, xmm2              ; xmm3=data2=(02 12 22 32)
+
+    paddd       xmm4, xmm1
+    paddd       xmm7, xmm1
+    psrad       xmm4, DESCALE_P2_4
+    psrad       xmm7, DESCALE_P2_4
+    paddd       xmm5, xmm1
+    paddd       xmm3, xmm1
+    psrad       xmm5, DESCALE_P2_4
+    psrad       xmm3, DESCALE_P2_4
+
+    packssdw    xmm4, xmm3              ; xmm4=(00 10 20 30 02 12 22 32)
+    packssdw    xmm7, xmm5              ; xmm7=(01 11 21 31 03 13 23 33)
+
+    movdqa      xmm0, xmm4              ; transpose coefficients(phase 1)
+    punpcklwd   xmm4, xmm7              ; xmm4=(00 01 10 11 20 21 30 31)
+    punpckhwd   xmm0, xmm7              ; xmm0=(02 03 12 13 22 23 32 33)
+
+    movdqa      xmm6, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm0              ; xmm4=(00 01 02 03 10 11 12 13)
+    punpckhdq   xmm6, xmm0              ; xmm6=(20 21 22 23 30 31 32 33)
+
+    packsswb    xmm4, xmm6              ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+    paddb       xmm4, [rel PB_CENTERJSAMP]
+
+    pshufd      xmm2, xmm4, 0x39        ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+    pshufd      xmm1, xmm4, 0x4E        ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+    pshufd      xmm3, xmm4, 0x93        ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    movd        XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+    movd        XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
+    mov         rdxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+    movd        XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
+    movd        XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_sse2(void *dct_table, JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_2x2_sse2)
+
+EXTN(jsimd_idct_2x2_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+    push        rbx
+
+    ; ---- Pass 1: process columns from input.
+
+    mov         rdx, r10                ; quantptr
+    mov         rsi, r11                ; inptr
+
+    ; | input:                  | result:        |
+    ; | 00 01 ** 03 ** 05 ** 07 |                |
+    ; | 10 11 ** 13 ** 15 ** 17 |                |
+    ; | ** ** ** ** ** ** ** ** |                |
+    ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+    ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+    ; | 50 51 ** 53 ** 55 ** 57 |                |
+    ; | ** ** ** ** ** ** ** ** |                |
+    ; | 70 71 ** 73 ** 75 ** 77 |                |
+
+    ; -- Odd part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+    ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+    pcmpeqd     xmm7, xmm7
+    pslld       xmm7, WORD_BIT          ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+    movdqa      xmm4, xmm0              ; xmm4=(10 11 ** 13 ** 15 ** 17)
+    movdqa      xmm5, xmm2              ; xmm5=(50 51 ** 53 ** 55 ** 57)
+    punpcklwd   xmm4, xmm1              ; xmm4=(10 30 11 31 ** ** 13 33)
+    punpcklwd   xmm5, xmm3              ; xmm5=(50 70 51 71 ** ** 53 73)
+    pmaddwd     xmm4, [rel PW_F362_MF127]
+    pmaddwd     xmm5, [rel PW_F085_MF072]
+
+    psrld       xmm0, WORD_BIT          ; xmm0=(11 -- 13 -- 15 -- 17 --)
+    pand        xmm1, xmm7              ; xmm1=(-- 31 -- 33 -- 35 -- 37)
+    psrld       xmm2, WORD_BIT          ; xmm2=(51 -- 53 -- 55 -- 57 --)
+    pand        xmm3, xmm7              ; xmm3=(-- 71 -- 73 -- 75 -- 77)
+    por         xmm0, xmm1              ; xmm0=(11 31 13 33 15 35 17 37)
+    por         xmm2, xmm3              ; xmm2=(51 71 53 73 55 75 57 77)
+    pmaddwd     xmm0, [rel PW_F362_MF127]
+    pmaddwd     xmm2, [rel PW_F085_MF072]
+
+    paddd       xmm4, xmm5              ; xmm4=tmp0[col0 col1 **** col3]
+    paddd       xmm0, xmm2              ; xmm0=tmp0[col1 col3 col5 col7]
+
+    ; -- Even part
+
+    movdqa      xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+    movdqa      xmm1, xmm6              ; xmm1=(00 01 ** 03 ** 05 ** 07)
+    pslld       xmm6, WORD_BIT          ; xmm6=(-- 00 -- ** -- ** -- **)
+    pand        xmm1, xmm7              ; xmm1=(-- 01 -- 03 -- 05 -- 07)
+    psrad       xmm6, (WORD_BIT-CONST_BITS-2)  ; xmm6=tmp10[col0 **** **** ****]
+    psrad       xmm1, (WORD_BIT-CONST_BITS-2)  ; xmm1=tmp10[col1 col3 col5 col7]
+
+    ; -- Final output stage
+
+    movdqa      xmm3, xmm6
+    movdqa      xmm5, xmm1
+    paddd       xmm6, xmm4      ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+    paddd       xmm1, xmm0      ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+    psubd       xmm3, xmm4      ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+    psubd       xmm5, xmm0      ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+    movdqa      xmm2, [rel PD_DESCALE_P1_2]  ; xmm2=[rel PD_DESCALE_P1_2]
+
+    punpckldq   xmm6, xmm3              ; xmm6=(A0 B0 ** **)
+
+    movdqa      xmm7, xmm1
+    punpcklqdq  xmm1, xmm5              ; xmm1=(A1 A3 B1 B3)
+    punpckhqdq  xmm7, xmm5              ; xmm7=(A5 A7 B5 B7)
+
+    paddd       xmm6, xmm2
+    psrad       xmm6, DESCALE_P1_2
+
+    paddd       xmm1, xmm2
+    paddd       xmm7, xmm2
+    psrad       xmm1, DESCALE_P1_2
+    psrad       xmm7, DESCALE_P1_2
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows, store into output array.
+
+    mov         rdi, r12                ; (JSAMPROW *)
+    mov         eax, r13d
+
+    ; | input:| result:|
+    ; | A0 B0 |        |
+    ; | A1 B1 | C0 C1  |
+    ; | A3 B3 | D0 D1  |
+    ; | A5 B5 |        |
+    ; | A7 B7 |        |
+
+    ; -- Odd part
+
+    packssdw    xmm1, xmm1              ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+    packssdw    xmm7, xmm7              ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+    pmaddwd     xmm1, [rel PW_F362_MF127]
+    pmaddwd     xmm7, [rel PW_F085_MF072]
+
+    paddd       xmm1, xmm7              ; xmm1=tmp0[row0 row1 row0 row1]
+
+    ; -- Even part
+
+    pslld       xmm6, (CONST_BITS+2)    ; xmm6=tmp10[row0 row1 **** ****]
+
+    ; -- Final output stage
+
+    movdqa      xmm4, xmm6
+    paddd       xmm6, xmm1     ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+    psubd       xmm4, xmm1     ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+    punpckldq   xmm6, xmm4     ; xmm6=(C0 D0 C1 D1)
+
+    paddd       xmm6, [rel PD_DESCALE_P2_2]
+    psrad       xmm6, DESCALE_P2_2
+
+    packssdw    xmm6, xmm6              ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+    packsswb    xmm6, xmm6              ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+    paddb       xmm6, [rel PB_CENTERJSAMP]
+
+    pextrw      ebx, xmm6, 0x00         ; ebx=(C0 D0 -- --)
+    pextrw      ecx, xmm6, 0x01         ; ecx=(C1 D1 -- --)
+
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    mov         word [rdx+rax*SIZEOF_JSAMPLE], bx
+    mov         word [rsi+rax*SIZEOF_JSAMPLE], cx
+
+    pop         rbx
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jquantf-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jquantf-sse2.asm
new file mode 100644
index 0000000000..ab2e3954f6
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jquantf-sse2.asm
@@ -0,0 +1,155 @@
+;
+; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                           FAST_FLOAT *workspace);
+;
+
+; r10 = JSAMPARRAY sample_data
+; r11d = JDIMENSION start_col
+; r12 = FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)
+
+EXTN(jsimd_convsamp_float_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 3
+    push        rbx
+
+    pcmpeqw     xmm7, xmm7
+    psllw       xmm7, 7
+    packsswb    xmm7, xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+    mov         rsi, r10
+    mov         eax, r11d
+    mov         rdi, r12
+    mov         rcx, DCTSIZE/2
+.convloop:
+    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
+    movq        xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
+
+    psubb       xmm0, xmm7              ; xmm0=(01234567)
+    psubb       xmm1, xmm7              ; xmm1=(89ABCDEF)
+
+    punpcklbw   xmm0, xmm0              ; xmm0=(*0*1*2*3*4*5*6*7)
+    punpcklbw   xmm1, xmm1              ; xmm1=(*8*9*A*B*C*D*E*F)
+
+    punpcklwd   xmm2, xmm0              ; xmm2=(***0***1***2***3)
+    punpckhwd   xmm0, xmm0              ; xmm0=(***4***5***6***7)
+    punpcklwd   xmm3, xmm1              ; xmm3=(***8***9***A***B)
+    punpckhwd   xmm1, xmm1              ; xmm1=(***C***D***E***F)
+
+    psrad       xmm2, (DWORD_BIT-BYTE_BIT)  ; xmm2=(0123)
+    psrad       xmm0, (DWORD_BIT-BYTE_BIT)  ; xmm0=(4567)
+    cvtdq2ps    xmm2, xmm2                  ; xmm2=(0123)
+    cvtdq2ps    xmm0, xmm0                  ; xmm0=(4567)
+    psrad       xmm3, (DWORD_BIT-BYTE_BIT)  ; xmm3=(89AB)
+    psrad       xmm1, (DWORD_BIT-BYTE_BIT)  ; xmm1=(CDEF)
+    cvtdq2ps    xmm3, xmm3                  ; xmm3=(89AB)
+    cvtdq2ps    xmm1, xmm1                  ; xmm1=(CDEF)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+
+    add         rsi, byte 2*SIZEOF_JSAMPROW
+    add         rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+    dec         rcx
+    jnz         short .convloop
+
+    pop         rbx
+    uncollect_args 3
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+;                           FAST_FLOAT *workspace);
+;
+
+; r10 = JCOEFPTR coef_block
+; r11 = FAST_FLOAT *divisors
+; r12 = FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_float_sse2)
+
+EXTN(jsimd_quantize_float_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 3
+
+    mov         rsi, r12
+    mov         rdx, r11
+    mov         rdi, r10
+    mov         rax, DCTSIZE2/16
+.quantloop:
+    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
+    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+    cvtps2dq    xmm0, xmm0
+    cvtps2dq    xmm1, xmm1
+    cvtps2dq    xmm2, xmm2
+    cvtps2dq    xmm3, xmm3
+
+    packssdw    xmm0, xmm1
+    packssdw    xmm2, xmm3
+
+    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
+    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
+
+    add         rsi, byte 16*SIZEOF_FAST_FLOAT
+    add         rdx, byte 16*SIZEOF_FAST_FLOAT
+    add         rdi, byte 16*SIZEOF_JCOEF
+    dec         rax
+    jnz         short .quantloop
+
+    uncollect_args 3
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jquanti-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jquanti-avx2.asm
new file mode 100644
index 0000000000..70fe81139c
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jquanti-avx2.asm
@@ -0,0 +1,163 @@
+;
+; jquanti.asm - sample data conversion and quantization (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, D. R. Commander.
+; Copyright (C) 2016, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                     DCTELEM *workspace);
+;
+
+; r10 = JSAMPARRAY sample_data
+; r11d = JDIMENSION start_col
+; r12 = DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_avx2)
+
+EXTN(jsimd_convsamp_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 3
+
+    mov         eax, r11d
+
+    mov         rsip, JSAMPROW [r10+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdip, JSAMPROW [r10+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm0, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
+    pinsrq      xmm0, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
+
+    mov         rsip, JSAMPROW [r10+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdip, JSAMPROW [r10+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm1, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
+    pinsrq      xmm1, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
+
+    mov         rsip, JSAMPROW [r10+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdip, JSAMPROW [r10+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm2, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
+    pinsrq      xmm2, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
+
+    mov         rsip, JSAMPROW [r10+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdip, JSAMPROW [r10+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm3, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
+    pinsrq      xmm3, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
+
+    vpmovzxbw   ymm0, xmm0              ; ymm0=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    vpmovzxbw   ymm1, xmm1              ; ymm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+    vpmovzxbw   ymm2, xmm2              ; ymm2=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+    vpmovzxbw   ymm3, xmm3              ; ymm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+    vpcmpeqw    ymm7, ymm7, ymm7
+    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm1, ymm1, ymm7
+    vpaddw      ymm2, ymm2, ymm7
+    vpaddw      ymm3, ymm3, ymm7
+
+    vmovdqu     YMMWORD [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)], ymm0
+    vmovdqu     YMMWORD [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)], ymm1
+    vmovdqu     YMMWORD [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)], ymm2
+    vmovdqu     YMMWORD [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)], ymm3
+
+    vzeroupper
+    uncollect_args 3
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
+;                     DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m, n, b) \
+  YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+  YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+  YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+; r10 = JCOEFPTR coef_block
+; r11 = DCTELEM *divisors
+; r12 = DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_avx2)
+
+EXTN(jsimd_quantize_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 3
+
+    vmovdqu     ymm4, [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)]
+    vmovdqu     ymm5, [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)]
+    vmovdqu     ymm6, [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)]
+    vmovdqu     ymm7, [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)]
+    vpabsw      ymm0, ymm4
+    vpabsw      ymm1, ymm5
+    vpabsw      ymm2, ymm6
+    vpabsw      ymm3, ymm7
+
+    vpaddw      ymm0, YMMWORD [CORRECTION(0,0,r11)]  ; correction + roundfactor
+    vpaddw      ymm1, YMMWORD [CORRECTION(2,0,r11)]
+    vpaddw      ymm2, YMMWORD [CORRECTION(4,0,r11)]
+    vpaddw      ymm3, YMMWORD [CORRECTION(6,0,r11)]
+    vpmulhuw    ymm0, YMMWORD [RECIPROCAL(0,0,r11)]  ; reciprocal
+    vpmulhuw    ymm1, YMMWORD [RECIPROCAL(2,0,r11)]
+    vpmulhuw    ymm2, YMMWORD [RECIPROCAL(4,0,r11)]
+    vpmulhuw    ymm3, YMMWORD [RECIPROCAL(6,0,r11)]
+    vpmulhuw    ymm0, YMMWORD [SCALE(0,0,r11)]       ; scale
+    vpmulhuw    ymm1, YMMWORD [SCALE(2,0,r11)]
+    vpmulhuw    ymm2, YMMWORD [SCALE(4,0,r11)]
+    vpmulhuw    ymm3, YMMWORD [SCALE(6,0,r11)]
+
+    vpsignw     ymm0, ymm0, ymm4
+    vpsignw     ymm1, ymm1, ymm5
+    vpsignw     ymm2, ymm2, ymm6
+    vpsignw     ymm3, ymm3, ymm7
+
+    vmovdqu     [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm0
+    vmovdqu     [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm1
+    vmovdqu     [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm2
+    vmovdqu     [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm3
+
+    vzeroupper
+    uncollect_args 3
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jquanti-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jquanti-sse2.asm
new file mode 100644
index 0000000000..3ee442027a
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jquanti-sse2.asm
@@ -0,0 +1,188 @@
+;
+; jquanti.asm - sample data conversion and quantization (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                     DCTELEM *workspace);
+;
+
+; r10 = JSAMPARRAY sample_data
+; r11d = JDIMENSION start_col
+; r12 = DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_sse2)
+
+EXTN(jsimd_convsamp_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 3
+    push        rbx
+
+    pxor        xmm6, xmm6              ; xmm6=(all 0's)
+    pcmpeqw     xmm7, xmm7
+    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    mov         rsi, r10
+    mov         eax, r11d
+    mov         rdi, r12
+    mov         rcx, DCTSIZE/4
+.convloop:
+    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm0=(01234567)
+    movq        xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm1=(89ABCDEF)
+
+    mov         rbxp, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdxp, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm2=(GHIJKLMN)
+    movq        xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm3=(OPQRSTUV)
+
+    punpcklbw   xmm0, xmm6              ; xmm0=(01234567)
+    punpcklbw   xmm1, xmm6              ; xmm1=(89ABCDEF)
+    paddw       xmm0, xmm7
+    paddw       xmm1, xmm7
+    punpcklbw   xmm2, xmm6              ; xmm2=(GHIJKLMN)
+    punpcklbw   xmm3, xmm6              ; xmm3=(OPQRSTUV)
+    paddw       xmm2, xmm7
+    paddw       xmm3, xmm7
+
+    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
+    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+    movdqa      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+    movdqa      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+    add         rsi, byte 4*SIZEOF_JSAMPROW
+    add         rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+    dec         rcx
+    jnz         short .convloop
+
+    pop         rbx
+    uncollect_args 3
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
+;                     DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m, n, b) \
+  XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+  XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+  XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+; r10 = JCOEFPTR coef_block
+; r11 = DCTELEM *divisors
+; r12 = DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_sse2)
+
+EXTN(jsimd_quantize_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 3
+
+    mov         rsi, r12
+    mov         rdx, r11
+    mov         rdi, r10
+    mov         rax, DCTSIZE2/32
+.quantloop:
+    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
+    movdqa      xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
+    movdqa      xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
+    movdqa      xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
+    movdqa      xmm0, xmm4
+    movdqa      xmm1, xmm5
+    movdqa      xmm2, xmm6
+    movdqa      xmm3, xmm7
+    psraw       xmm4, (WORD_BIT-1)
+    psraw       xmm5, (WORD_BIT-1)
+    psraw       xmm6, (WORD_BIT-1)
+    psraw       xmm7, (WORD_BIT-1)
+    pxor        xmm0, xmm4
+    pxor        xmm1, xmm5
+    pxor        xmm2, xmm6
+    pxor        xmm3, xmm7
+    psubw       xmm0, xmm4              ; if (xmm0 < 0) xmm0 = -xmm0;
+    psubw       xmm1, xmm5              ; if (xmm1 < 0) xmm1 = -xmm1;
+    psubw       xmm2, xmm6              ; if (xmm2 < 0) xmm2 = -xmm2;
+    psubw       xmm3, xmm7              ; if (xmm3 < 0) xmm3 = -xmm3;
+
+    paddw       xmm0, XMMWORD [CORRECTION(0,0,rdx)]  ; correction + roundfactor
+    paddw       xmm1, XMMWORD [CORRECTION(1,0,rdx)]
+    paddw       xmm2, XMMWORD [CORRECTION(2,0,rdx)]
+    paddw       xmm3, XMMWORD [CORRECTION(3,0,rdx)]
+    pmulhuw     xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]  ; reciprocal
+    pmulhuw     xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
+    pmulhuw     xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
+    pmulhuw     xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
+    pmulhuw     xmm0, XMMWORD [SCALE(0,0,rdx)]       ; scale
+    pmulhuw     xmm1, XMMWORD [SCALE(1,0,rdx)]
+    pmulhuw     xmm2, XMMWORD [SCALE(2,0,rdx)]
+    pmulhuw     xmm3, XMMWORD [SCALE(3,0,rdx)]
+
+    pxor        xmm0, xmm4
+    pxor        xmm1, xmm5
+    pxor        xmm2, xmm6
+    pxor        xmm3, xmm7
+    psubw       xmm0, xmm4
+    psubw       xmm1, xmm5
+    psubw       xmm2, xmm6
+    psubw       xmm3, xmm7
+    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
+    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+    movdqa      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+    movdqa      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+    add         rsi, byte 32*SIZEOF_DCTELEM
+    add         rdx, byte 32*SIZEOF_DCTELEM
+    add         rdi, byte 32*SIZEOF_JCOEF
+    dec         rax
+    jnz         near .quantloop
+
+    uncollect_args 3
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jsimd.c b/3rdparty/libjpeg-turbo/src/simd/x86_64/jsimd.c
new file mode 100644
index 0000000000..584a010ad3
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jsimd.c
@@ -0,0 +1,1068 @@
+/*
+ * jsimd_x86_64.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 64-bit x86 architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "jconfigint.h"
+
+/*
+ * In the PIC cases, we have no guarantee that constants will keep
+ * their alignment. This macro allows us to verify it at runtime.
+ */
+#define IS_ALIGNED(ptr, order)  (((size_t)ptr & ((1 << order) - 1)) == 0)
+
+#define IS_ALIGNED_SSE(ptr)  (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
+#define IS_ALIGNED_AVX(ptr)  (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */
+
+static unsigned int simd_support = (unsigned int)(~0);
+static unsigned int simd_huffman = 1;
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char env[2] = { 0 };
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = jpeg_simd_cpu_support();
+
+#ifndef NO_GETENV
+  /* Force different settings through environment variables */
+  if (!GETENV_S(env, 2, "JSIMD_FORCESSE2") && !strcmp(env, "1"))
+    simd_support &= JSIMD_SSE2;
+  if (!GETENV_S(env, 2, "JSIMD_FORCEAVX2") && !strcmp(env, "1"))
+    simd_support &= JSIMD_AVX2;
+  if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
+    simd_support = 0;
+  if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
+    simd_huffman = 0;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_rgb_ycc_convert_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_rgb_gray_convert_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_ycc_rgb_convert_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_extrgb_ycc_convert_avx2;
+    sse2fct = jsimd_extrgb_ycc_convert_sse2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_extrgbx_ycc_convert_avx2;
+    sse2fct = jsimd_extrgbx_ycc_convert_sse2;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_extbgr_ycc_convert_avx2;
+    sse2fct = jsimd_extbgr_ycc_convert_sse2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_extbgrx_ycc_convert_avx2;
+    sse2fct = jsimd_extbgrx_ycc_convert_sse2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_extxbgr_ycc_convert_avx2;
+    sse2fct = jsimd_extxbgr_ycc_convert_sse2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_extxrgb_ycc_convert_avx2;
+    sse2fct = jsimd_extxrgb_ycc_convert_sse2;
+    break;
+  default:
+    avx2fct = jsimd_rgb_ycc_convert_avx2;
+    sse2fct = jsimd_rgb_ycc_convert_sse2;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else
+    sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_extrgb_gray_convert_avx2;
+    sse2fct = jsimd_extrgb_gray_convert_sse2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_extrgbx_gray_convert_avx2;
+    sse2fct = jsimd_extrgbx_gray_convert_sse2;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_extbgr_gray_convert_avx2;
+    sse2fct = jsimd_extbgr_gray_convert_sse2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_extbgrx_gray_convert_avx2;
+    sse2fct = jsimd_extbgrx_gray_convert_sse2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_extxbgr_gray_convert_avx2;
+    sse2fct = jsimd_extxbgr_gray_convert_sse2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_extxrgb_gray_convert_avx2;
+    sse2fct = jsimd_extxrgb_gray_convert_sse2;
+    break;
+  default:
+    avx2fct = jsimd_rgb_gray_convert_avx2;
+    sse2fct = jsimd_rgb_gray_convert_sse2;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else
+    sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_ycc_extrgb_convert_avx2;
+    sse2fct = jsimd_ycc_extrgb_convert_sse2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_ycc_extrgbx_convert_avx2;
+    sse2fct = jsimd_ycc_extrgbx_convert_sse2;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_ycc_extbgr_convert_avx2;
+    sse2fct = jsimd_ycc_extbgr_convert_sse2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_ycc_extbgrx_convert_avx2;
+    sse2fct = jsimd_ycc_extbgrx_convert_sse2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_ycc_extxbgr_convert_avx2;
+    sse2fct = jsimd_ycc_extxbgr_convert_sse2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_ycc_extxrgb_convert_avx2;
+    sse2fct = jsimd_ycc_extxrgb_convert_sse2;
+    break;
+  default:
+    avx2fct = jsimd_ycc_rgb_convert_avx2;
+    sse2fct = jsimd_ycc_rgb_convert_sse2;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+  else
+    sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+  else
+    jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+  else
+    jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+  else
+    jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+  else
+    jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else
+    jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else
+    jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extrgb_merged_upsample_sse2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_h2v2_extrgbx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extrgbx_merged_upsample_sse2;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_h2v2_extbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extbgr_merged_upsample_sse2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_h2v2_extbgrx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extbgrx_merged_upsample_sse2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_h2v2_extxbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extxbgr_merged_upsample_sse2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_h2v2_extxrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extxrgb_merged_upsample_sse2;
+    break;
+  default:
+    avx2fct = jsimd_h2v2_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_merged_upsample_sse2;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else
+    sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extrgb_merged_upsample_sse2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_h2v1_extrgbx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extrgbx_merged_upsample_sse2;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_h2v1_extbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extbgr_merged_upsample_sse2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_h2v1_extbgrx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extbgrx_merged_upsample_sse2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_h2v1_extxbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extxbgr_merged_upsample_sse2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_h2v1_extxrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extxrgb_merged_upsample_sse2;
+    break;
+  default:
+    avx2fct = jsimd_h2v1_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_merged_upsample_sse2;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else
+    sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_convsamp_avx2(sample_data, start_col, workspace);
+  else
+    jsimd_convsamp_sse2(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+  jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_fdct_islow_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_fdct_islow_avx2(data);
+  else
+    jsimd_fdct_islow_sse2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_sse2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+  jsimd_fdct_float_sse(data);
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_quantize_avx2(coef_block, divisors, workspace);
+  else
+    jsimd_quantize_sse2(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+  jsimd_quantize_float_sse2(coef_block, divisors, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_idct_islow_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+  if (sizeof(FLOAT_MULT_TYPE) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+  else
+    jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && simd_huffman &&
+      IS_ALIGNED_SSE(jconst_huff_encode_one_block))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
+                                          dctbl, actbl);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, JCOEF *values, size_t *zerobits)
+{
+  jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start,
+                                         Sl, Al, values, zerobits);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, JCOEF *absvalues, size_t *bits)
+{
+  return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
+                                                 jpeg_natural_order_start,
+                                                 Sl, Al, absvalues, bits);
+}
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jsimdcpu.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jsimdcpu.asm
new file mode 100644
index 0000000000..705f813d7d
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jsimdcpu.asm
@@ -0,0 +1,86 @@
+;
+; jsimdcpu.asm - SIMD instruction support check
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Check if the CPU supports SIMD instructions
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_cpu_support(void)
+;
+
+    align       32
+    GLOBAL_FUNCTION(jpeg_simd_cpu_support)
+
+EXTN(jpeg_simd_cpu_support):
+    push        rbx
+    push        rdi
+
+    xor         rdi, rdi                ; simd support flag
+
+    ; Assume that all x86-64 processors support SSE & SSE2 instructions
+    or          rdi, JSIMD_SSE2
+    or          rdi, JSIMD_SSE
+
+    ; Check whether CPUID leaf 07H is supported
+    ; (leaf 07H is used to check for AVX2 instruction support)
+    mov         rax, 0
+    cpuid
+    cmp         rax, 7
+    jl          short .return           ; Maximum leaf < 07H
+
+    ; Check for AVX2 instruction support
+    mov         rax, 7
+    xor         rcx, rcx
+    cpuid
+    mov         rax, rbx                ; rax = Extended feature flags
+
+    test        rax, 1<<5               ; bit5:AVX2
+    jz          short .return
+
+    ; Check for AVX2 O/S support
+    mov         rax, 1
+    xor         rcx, rcx
+    cpuid
+    test        rcx, 1<<27
+    jz          short .return           ; O/S does not support XSAVE
+    test        rcx, 1<<28
+    jz          short .return           ; CPU does not support AVX2
+
+    xor         rcx, rcx
+    xgetbv
+    and         rax, 6
+    cmp         rax, 6                  ; O/S does not manage XMM/YMM state
+                                        ; using XSAVE
+    jnz         short .return
+
+    or          rdi, JSIMD_AVX2
+
+.return:
+    mov         rax, rdi
+
+    pop         rdi
+    pop         rbx
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32

From 44b2f9637a7cac9069de276076db95a0d433a142 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Sun, 7 Aug 2022 15:43:10 +0300
Subject: [PATCH 168/178] Revert "suppress warning on GCC 7 and later"

This reverts commit a630ad73cb5dde7f8c69e86094bd4a1070ff0249.
---
 3rdparty/openexr/CMakeLists.txt   | 4 ----
 3rdparty/protobuf/CMakeLists.txt  | 4 ----
 modules/calib3d/CMakeLists.txt    | 4 ----
 modules/core/CMakeLists.txt       | 4 ----
 modules/dnn/CMakeLists.txt        | 4 ----
 modules/features2d/CMakeLists.txt | 4 ----
 modules/flann/CMakeLists.txt      | 4 ----
 modules/imgproc/CMakeLists.txt    | 4 ----
 modules/ml/CMakeLists.txt         | 4 ----
 modules/objdetect/CMakeLists.txt  | 4 ----
 modules/shape/CMakeLists.txt      | 4 ----
 modules/stitching/CMakeLists.txt  | 4 ----
 modules/ts/CMakeLists.txt         | 4 ----
 samples/cpp/CMakeLists.txt        | 4 ----
 14 files changed, 56 deletions(-)

diff --git a/3rdparty/openexr/CMakeLists.txt b/3rdparty/openexr/CMakeLists.txt
index 1fbfa4f857..88f60b23c0 100644
--- a/3rdparty/openexr/CMakeLists.txt
+++ b/3rdparty/openexr/CMakeLists.txt
@@ -110,10 +110,6 @@ ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow -Wunused -Wsign-compare -Wundef -W
                                      -Wreorder
                                      -Wunused-result
 )
-if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-  # suppress warnings from GCC only on 7.1 and later
-  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
-endif()
 if(CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8.0)
   ocv_warnings_disable(CMAKE_CXX_FLAGS -Wclass-memaccess)
 endif()
diff --git a/3rdparty/protobuf/CMakeLists.txt b/3rdparty/protobuf/CMakeLists.txt
index f8372dc1ad..f249d2dcc3 100644
--- a/3rdparty/protobuf/CMakeLists.txt
+++ b/3rdparty/protobuf/CMakeLists.txt
@@ -25,10 +25,6 @@ else()
                                        -Warray-bounds  # GCC 9+
   )
 endif()
-if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-  # suppress warnings from GCC only on 7.1 and later
-  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
-endif()
 if(CV_ICC)
   ocv_warnings_disable(CMAKE_CXX_FLAGS
       -wd265 -wd858 -wd873 -wd2196
diff --git a/modules/calib3d/CMakeLists.txt b/modules/calib3d/CMakeLists.txt
index 1c31ad1903..a38fdf18d8 100644
--- a/modules/calib3d/CMakeLists.txt
+++ b/modules/calib3d/CMakeLists.txt
@@ -3,10 +3,6 @@ set(debug_modules "")
 if(DEBUG_opencv_calib3d)
   list(APPEND debug_modules opencv_highgui)
 endif()
-if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-  # suppress warnings from GCC only on 7.1 and later
-  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
-endif()
 ocv_define_module(calib3d opencv_imgproc opencv_features2d ${debug_modules}
     WRAP java python js
 )
diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index 568a8afef1..beba9f804e 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -38,10 +38,6 @@ endif()
 if(HAVE_CUDA)
   ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wenum-compare -Wunused-function -Wshadow)
 endif()
-if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-  # suppress warnings from GCC only on 7.1 and later
-  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
-endif()
 
 if(CV_TRACE AND HAVE_ITT)
   add_definitions(-DOPENCV_WITH_ITT=1)
diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt
index 680be360ba..4019356392 100644
--- a/modules/dnn/CMakeLists.txt
+++ b/modules/dnn/CMakeLists.txt
@@ -35,10 +35,6 @@ else()
                                        -Wunused-parameter -Wsign-compare
   )
 endif()
-if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-  # suppress warnings from GCC only on 7.1 and later
-  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
-endif()
 if(HAVE_CUDA)
   ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
 endif()
diff --git a/modules/features2d/CMakeLists.txt b/modules/features2d/CMakeLists.txt
index 28af58565b..1d29320a14 100644
--- a/modules/features2d/CMakeLists.txt
+++ b/modules/features2d/CMakeLists.txt
@@ -1,10 +1,6 @@
 set(the_description "2D Features Framework")
 
 ocv_add_dispatched_file(sift SSE4_1 AVX2 AVX512_SKX)
-if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-  # suppress warnings from GCC only on 7.1 and later
-  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
-endif()
 
 set(debug_modules "")
 if(DEBUG_opencv_features2d)
diff --git a/modules/flann/CMakeLists.txt b/modules/flann/CMakeLists.txt
index 5935f679d4..78e041129b 100644
--- a/modules/flann/CMakeLists.txt
+++ b/modules/flann/CMakeLists.txt
@@ -1,6 +1,2 @@
 set(the_description "Clustering and Search in Multi-Dimensional Spaces")
-if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-  # suppress warnings from GCC only on 7.1 and later
-  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
-endif()
 ocv_define_module(flann opencv_core WRAP python)
diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt
index 5787ec502c..3b45482481 100644
--- a/modules/imgproc/CMakeLists.txt
+++ b/modules/imgproc/CMakeLists.txt
@@ -12,10 +12,6 @@ ocv_add_dispatched_file(smooth SSE2 SSE4_1 AVX2)
 ocv_add_dispatched_file(sumpixels SSE2 AVX2 AVX512_SKX)
 ocv_add_dispatched_file(undistort SSE2 AVX2)
 ocv_define_module(imgproc opencv_core WRAP java python js)
-if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-  # suppress warnings from GCC only on 7.1 and later
-  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
-endif()
 
 ocv_check_environment_variables(OPENCV_IPP_GAUSSIAN_BLUR)
 option(OPENCV_IPP_GAUSSIAN_BLUR "Enable IPP optimizations for GaussianBlur (+8Mb in binary size)" OFF)
diff --git a/modules/ml/CMakeLists.txt b/modules/ml/CMakeLists.txt
index 8b76e0c8ad..1b64cc4f17 100644
--- a/modules/ml/CMakeLists.txt
+++ b/modules/ml/CMakeLists.txt
@@ -1,6 +1,2 @@
 set(the_description "Machine Learning")
-if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-  # suppress warnings from GCC only on 7.1 and later
-  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
-endif()
 ocv_define_module(ml opencv_core WRAP java python)
diff --git a/modules/objdetect/CMakeLists.txt b/modules/objdetect/CMakeLists.txt
index 34b77eab6b..414e578099 100644
--- a/modules/objdetect/CMakeLists.txt
+++ b/modules/objdetect/CMakeLists.txt
@@ -1,8 +1,4 @@
 set(the_description "Object Detection")
-if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-  # suppress warnings from GCC only on 7.1 and later
-  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
-endif()
 ocv_define_module(objdetect opencv_core opencv_imgproc opencv_calib3d WRAP java python js)
 
 if(HAVE_QUIRC)
diff --git a/modules/shape/CMakeLists.txt b/modules/shape/CMakeLists.txt
index 209eb11f34..527a0c2f71 100644
--- a/modules/shape/CMakeLists.txt
+++ b/modules/shape/CMakeLists.txt
@@ -1,6 +1,2 @@
 set(the_description "Shape descriptors and matchers")
-if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-  # suppress warnings from GCC only on 7.1 and later
-  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
-endif()
 ocv_define_module(shape opencv_core opencv_imgproc opencv_video WRAP python)
diff --git a/modules/stitching/CMakeLists.txt b/modules/stitching/CMakeLists.txt
index 7ea34a1608..44f35eb59b 100644
--- a/modules/stitching/CMakeLists.txt
+++ b/modules/stitching/CMakeLists.txt
@@ -3,10 +3,6 @@ set(the_description "Images stitching")
 if(HAVE_CUDA)
   ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations -Wshadow -Wstrict-aliasing)
 endif()
-if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-  # suppress warnings from GCC only on 7.1 and later
-  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
-endif()
 
 set(STITCHING_CONTRIB_DEPS "opencv_xfeatures2d")
 if(BUILD_SHARED_LIBS AND BUILD_opencv_world AND OPENCV_WORLD_EXCLUDE_EXTRA_MODULES)
diff --git a/modules/ts/CMakeLists.txt b/modules/ts/CMakeLists.txt
index fd6e008333..f95bed0793 100644
--- a/modules/ts/CMakeLists.txt
+++ b/modules/ts/CMakeLists.txt
@@ -15,10 +15,6 @@ if(WINRT)
 endif()
 
 ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
-if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-  # suppress warnings from GCC only on 7.1 and later
-  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
-endif()
 
 ocv_add_module(ts INTERNAL opencv_core opencv_imgproc opencv_imgcodecs opencv_videoio opencv_highgui)
 
diff --git a/samples/cpp/CMakeLists.txt b/samples/cpp/CMakeLists.txt
index 8cb11c0222..6ae9586fd4 100644
--- a/samples/cpp/CMakeLists.txt
+++ b/samples/cpp/CMakeLists.txt
@@ -41,10 +41,6 @@ foreach(sample_filename ${cpp_samples})
       ocv_warnings_disable(CMAKE_CXX_FLAGS -Winconsistent-missing-override -Wsuggest-override)
     endif()
   endif()
-  if(ARM AND CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-    # suppress warnings from GCC only on 7.1 and later
-    ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-psabi)
-  endif()
   set(package "cpp")
   if(sample_filename MATCHES "tutorial_code")
     set(package "tutorial")

From 75bb6aa9a1571875257742d2bf5c5593c5cca6d2 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Sun, 7 Aug 2022 15:46:44 +0300
Subject: [PATCH 169/178] build: disable psabi warning with GCC 7.0 and ARM

---
 cmake/OpenCVCompilerOptions.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake
index bbfd889690..db7b0e1b8c 100644
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@@ -134,7 +134,7 @@ if(CV_GCC OR CV_CLANG)
   add_extra_compiler_option(-Wshadow)
   add_extra_compiler_option(-Wsign-promo)
   add_extra_compiler_option(-Wuninitialized)
-  if(CV_GCC AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 6.0) AND (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0))
+  if(CV_GCC AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 6.0) AND (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0 OR ARM))
     add_extra_compiler_option(-Wno-psabi)
   endif()
   if(HAVE_CXX11)

From 496eed950f6d0e7fd92619d47e3cec8f06e96ace Mon Sep 17 00:00:00 2001
From: Jochen Sprickerhof <jspricke@debian.org>
Date: Mon, 8 Aug 2022 21:10:13 +0200
Subject: [PATCH 170/178] Add missing header for LIBAVCODEC_VERSION_INT

---
 modules/videoio/src/ffmpeg_codecs.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/videoio/src/ffmpeg_codecs.hpp b/modules/videoio/src/ffmpeg_codecs.hpp
index 61788e0345..faad2596ed 100644
--- a/modules/videoio/src/ffmpeg_codecs.hpp
+++ b/modules/videoio/src/ffmpeg_codecs.hpp
@@ -60,6 +60,7 @@ extern "C" {
 #include <errno.h>
 #endif
 
+#include <libavcodec/version.h>
 #include <libavformat/avformat.h>
 
 #ifdef __cplusplus

From 1bab920cf568c1420c37c5c688c27a59db0eca02 Mon Sep 17 00:00:00 2001
From: Lukas Weber <32765578+lukasalexanderweber@users.noreply.github.com>
Date: Thu, 11 Aug 2022 18:04:28 +0200
Subject: [PATCH 171/178] remove static con_thresh for adjuster

---
 samples/python/stitching_detailed.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/python/stitching_detailed.py b/samples/python/stitching_detailed.py
index 56d6965733..f8f580dd56 100644
--- a/samples/python/stitching_detailed.py
+++ b/samples/python/stitching_detailed.py
@@ -368,7 +368,7 @@ def main():
         cam.R = cam.R.astype(np.float32)
 
     adjuster = BA_COST_CHOICES[args.ba]()
-    adjuster.setConfThresh(1)
+    adjuster.setConfThresh(conf_thresh)
     refine_mask = np.zeros((3, 3), np.uint8)
     if ba_refine_mask[0] == 'x':
         refine_mask[0, 0] = 1

From 3d8614cb476341267ecd195d77f4aa6ab9792275 Mon Sep 17 00:00:00 2001
From: catree <catree.catreus@outlook.com>
Date: Fri, 12 Aug 2022 10:37:09 +0200
Subject: [PATCH 172/178] Fix documentation about depth combinations for
 imgproc functions.

---
 modules/imgproc/include/opencv2/imgproc.hpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index c04b5b67a9..2ea2ec969e 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -78,11 +78,15 @@ Input depth (src.depth()) | Output depth (ddepth)
 --------------------------|----------------------
 CV_8U                     | -1/CV_16S/CV_32F/CV_64F
 CV_16U/CV_16S             | -1/CV_32F/CV_64F
-CV_32F                    | -1/CV_32F/CV_64F
+CV_32F                    | -1/CV_32F
 CV_64F                    | -1/CV_64F
 
 @note when ddepth=-1, the output image will have the same depth as the source.
 
+@note if you need double floating-point accuracy and using single floating-point input data
+(CV_32F input and CV_64F output depth combination), you can use @ref Mat.convertTo to convert
+the input data to the desired precision.
+
     @defgroup imgproc_transform Geometric Image Transformations
 
 The functions in this section perform various geometrical transformations of 2D images. They do not
@@ -1753,7 +1757,7 @@ with the following \f$3 \times 3\f$ aperture:
 
 @param src Source image.
 @param dst Destination image of the same size and the same number of channels as src .
-@param ddepth Desired depth of the destination image.
+@param ddepth Desired depth of the destination image, see @ref filter_depths "combinations".
 @param ksize Aperture size used to compute the second-derivative filters. See #getDerivKernels for
 details. The size must be positive and odd.
 @param scale Optional scale factor for the computed Laplacian values. By default, no scaling is

From c2c539e3cc650214404e86183c296d8ff9d95ecd Mon Sep 17 00:00:00 2001
From: Biswapriyo Nath <nathbappai@gmail.com>
Date: Sun, 14 Aug 2022 19:26:08 +0530
Subject: [PATCH 173/178] videoio: Include missing condition_variable header

This fixes the following error with mingw toolchain:
opencv/modules/videoio/src/cap_obsensor/obsensor_stream_channel_msmf.hpp:160:10: error: 'condition_variable' in namespace 'std' does not name a type
  160 |     std::condition_variable streamStateCv_;
      |          ^~~~~~~~~~~~~~~~~~
---
 .../videoio/src/cap_obsensor/obsensor_stream_channel_msmf.hpp   | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/videoio/src/cap_obsensor/obsensor_stream_channel_msmf.hpp b/modules/videoio/src/cap_obsensor/obsensor_stream_channel_msmf.hpp
index aa40fe53be..2891ff488a 100644
--- a/modules/videoio/src/cap_obsensor/obsensor_stream_channel_msmf.hpp
+++ b/modules/videoio/src/cap_obsensor/obsensor_stream_channel_msmf.hpp
@@ -25,6 +25,8 @@
 
 #include "obsensor_uvc_stream_channel.hpp"
 
+#include <condition_variable>
+
 #include <windows.h>
 #include <guiddef.h>
 #include <mfapi.h>

From d3dcef4b8b823f2cd0dc2b694058165f4cd61110 Mon Sep 17 00:00:00 2001
From: Zihao Mu <zihaomu@outlook.com>
Date: Mon, 15 Aug 2022 16:44:25 +0800
Subject: [PATCH 174/178] add new check for vidcap.h

---
 modules/videoio/cmake/detect_obsensor.cmake | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/modules/videoio/cmake/detect_obsensor.cmake b/modules/videoio/cmake/detect_obsensor.cmake
index 73eb638b07..140e9a2e50 100644
--- a/modules/videoio/cmake/detect_obsensor.cmake
+++ b/modules/videoio/cmake/detect_obsensor.cmake
@@ -2,10 +2,20 @@
 if(NOT HAVE_OBSENSOR)
   if(WIN32)
     check_include_file(mfapi.h HAVE_MFAPI)
-    if(HAVE_MFAPI)
+    check_include_file(vidcap.h HAVE_VIDCAP)
+    if(HAVE_MFAPI AND HAVE_VIDCAP)
       set(HAVE_OBSENSOR TRUE)
       set(HAVE_OBSENSOR_MSMF TRUE)
       ocv_add_external_target(obsensor "" "" "HAVE_OBSENSOR;HAVE_OBSENSOR_MSMF")
+    else()
+      set(HAVE_OBSENSOR OFF)
+      set(HAVE_OBSENSOR_MSMF OFF)
+      if(NOT HAVE_MFAPI)
+        MESSAGE(STATUS "Could not find mfapi.h. Turning HAVE_OBSENSOR OFF")
+      endif()
+      if(NOT HAVE_VIDCAP)
+        MESSAGE(STATUS "Could not find vidcap.h. Turning HAVE_OBSENSOR OFF")
+      endif()
     endif()
   elseif(UNIX)
     check_include_file(linux/videodev2.h HAVE_CAMV4L2)

From 11c5a6bb4df0bd668afaa8bfe6781f1fbf4d89f6 Mon Sep 17 00:00:00 2001
From: hzcyf <yufeng@orbbec.com>
Date: Thu, 18 Aug 2022 03:34:46 +0800
Subject: [PATCH 175/178] Fix infinite loop in obsensor

---
 .../src/cap_obsensor/obsensor_stream_channel_msmf.cpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/modules/videoio/src/cap_obsensor/obsensor_stream_channel_msmf.cpp b/modules/videoio/src/cap_obsensor/obsensor_stream_channel_msmf.cpp
index 5ac204e504..65d73497ea 100644
--- a/modules/videoio/src/cap_obsensor/obsensor_stream_channel_msmf.cpp
+++ b/modules/videoio/src/cap_obsensor/obsensor_stream_channel_msmf.cpp
@@ -25,6 +25,7 @@
 #include "obsensor_stream_channel_msmf.hpp"
 
 #include <shlwapi.h> // QISearch
+#include <Mferror.h>
 
 #pragma warning(disable : 4503)
 #pragma comment(lib, "mfplat")
@@ -98,7 +99,7 @@ bool parseUvcDeviceSymbolicLink(const std::string& symbolicLink, uint16_t& vid,
     std::string lowerStr = symbolicLink;
     for (size_t i = 0; i < lowerStr.length(); i++)
     {
-        lowerStr[i] = (char)tolower(lowerStr[i]); 
+        lowerStr[i] = (char)tolower(lowerStr[i]);
     }
     auto tokens = stringSplit(lowerStr, '#');
     if (tokens.size() < 1 || (tokens[0] != R"(\\?\usb)" && tokens[0] != R"(\\?\hid)"))
@@ -317,8 +318,12 @@ void MSMFStreamChannel::start(const StreamProfile& profile, FrameCallback frameC
     {
         for (uint32_t k = 0;; k++)
         {
-            HR_FAILED_EXEC(streamReader_->GetNativeMediaType(index, k, &mediaType), { continue; })
-                GUID subtype;
+            auto hr = streamReader_->GetNativeMediaType(index, k, &mediaType);
+            if(hr == MF_E_INVALIDSTREAMNUMBER || hr == MF_E_NO_MORE_TYPES){
+                break;
+            }
+            HR_FAILED_EXEC(hr, { continue; })
+            GUID subtype;
             HR_FAILED_RETURN(mediaType->GetGUID(MF_MT_SUBTYPE, &subtype));
             HR_FAILED_RETURN(MFGetAttributeSize(mediaType.Get(), MF_MT_FRAME_SIZE, &width, &height));
             HR_FAILED_RETURN(MFGetAttributeRatio(mediaType.Get(), MF_MT_FRAME_RATE_RANGE_MIN, &frameRateMin.numerator, &frameRateMin.denominator));

From 03130548eaa89e5491f669bf96af324890a26374 Mon Sep 17 00:00:00 2001
From: Dmtiry Matveev <dmitry.matveev@intel.com>
Date: Thu, 18 Aug 2022 07:50:22 +0300
Subject: [PATCH 176/178] G-API: Bump ADE to version 0.1.2

---
 modules/gapi/cmake/DownloadADE.cmake | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/gapi/cmake/DownloadADE.cmake b/modules/gapi/cmake/DownloadADE.cmake
index aa24e949ea..7a54f62cae 100644
--- a/modules/gapi/cmake/DownloadADE.cmake
+++ b/modules/gapi/cmake/DownloadADE.cmake
@@ -1,7 +1,7 @@
 set(ade_src_dir "${OpenCV_BINARY_DIR}/3rdparty/ade")
-set(ade_filename "v0.1.1f.zip")
-set(ade_subdir "ade-0.1.1f")
-set(ade_md5 "b624b995ec9c439cbc2e9e6ee940d3a2")
+set(ade_filename "v0.1.2.zip")
+set(ade_subdir "ade-0.1.2")
+set(ade_md5 "561c1e28ccf27ad0557a18e251c22226")
 ocv_download(FILENAME ${ade_filename}
              HASH ${ade_md5}
              URL

From 984e42b0bc7f3c02d7ea257a0f7f409a27b1b8b3 Mon Sep 17 00:00:00 2001
From: Biswapriyo Nath <nathbappai@gmail.com>
Date: Fri, 19 Aug 2022 13:14:57 +0530
Subject: [PATCH 177/178] videoio: Include missing locale header for
 std::wstring_convert

This fixes the following error with mingw toolchain:
opencv/modules/videoio/src/cap_msmf.cpp:1020: error: 'wstring_convert' is not a member of 'std'
 1020 |                                     std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> conv;
      |                                          ^~~~~~~~~~~~~~~
opencv/modules/videoio/src/cap_ffmpeg_hw.hpp:230:26: error: 'wstring_convert' is not a member of 'std'
  230 |                     std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> conv;
      |                          ^~~~~~~~~~~~~~~

The locale header is required according to C++ standard.
See https://en.cppreference.com/w/cpp/locale/wstring_convert
---
 modules/videoio/src/cap_ffmpeg_hw.hpp | 1 +
 modules/videoio/src/cap_msmf.cpp      | 1 +
 2 files changed, 2 insertions(+)

diff --git a/modules/videoio/src/cap_ffmpeg_hw.hpp b/modules/videoio/src/cap_ffmpeg_hw.hpp
index 31db5fdd5d..0998ee9f48 100644
--- a/modules/videoio/src/cap_ffmpeg_hw.hpp
+++ b/modules/videoio/src/cap_ffmpeg_hw.hpp
@@ -21,6 +21,7 @@
 #define D3D11_NO_HELPERS
 #include <d3d11.h>
 #include <codecvt>
+#include <locale>
 #include "opencv2/core/directx.hpp"
 #ifdef HAVE_OPENCL
 #include <CL/cl_d3d11.h>
diff --git a/modules/videoio/src/cap_msmf.cpp b/modules/videoio/src/cap_msmf.cpp
index d3002a5151..f28c0bbdb3 100644
--- a/modules/videoio/src/cap_msmf.cpp
+++ b/modules/videoio/src/cap_msmf.cpp
@@ -29,6 +29,7 @@
 #ifdef HAVE_MSMF_DXVA
 #include <d3d11.h>
 #include <d3d11_4.h>
+#include <locale>
 #endif
 #include <new>
 #include <map>

From 925ff6241ffe46e9e2687d5c527196e99f55282e Mon Sep 17 00:00:00 2001
From: Anatoliy Talamanov <anatoliy.talamanov@intel.com>
Date: Sat, 20 Aug 2022 18:24:41 +0100
Subject: [PATCH 178/178] Merge pull request #22393 from
 TolyaTalamanov:at/fix-gapi-vpl-windows-build

* Fix G-API OneVPL compilation

* Fix macro

__WIN32__ -> _WIN32
---
 .../gapi/src/streaming/onevpl/cfg_param_device_selector.cpp | 6 +++---
 modules/gapi/test/streaming/gapi_streaming_tests.cpp        | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.cpp b/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.cpp
index 68d4fce1b9..26abe3cfb4 100644
--- a/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.cpp
+++ b/modules/gapi/src/streaming/onevpl/cfg_param_device_selector.cpp
@@ -250,7 +250,7 @@ CfgParamDeviceSelector::CfgParamDeviceSelector(const CfgParams& cfg_params) :
             GAPI_Assert(false && "VPLVAAPIAccelerationPolicy unavailable in current linux configuration");
 #endif // defined(HAVE_VA) || defined(HAVE_VA_INTEL)
 #else // #ifdef __linux__
-            GAPI_Assert(false && "MFX_IMPL_VIA_VAAPI is supported on linux only")
+            GAPI_Assert(false && "MFX_IMPL_VIA_VAAPI is supported on linux only");
 #endif // #ifdef __linux__
             break;
         }
@@ -341,7 +341,7 @@ CfgParamDeviceSelector::CfgParamDeviceSelector(Device::Ptr device_ptr,
             GAPI_Assert(false && "VPLVAAPIAccelerationPolicy unavailable in current linux configuration");
 #endif // defined(HAVE_VA) || defined(HAVE_VA_INTEL)
 #else // #ifdef __linux__
-            GAPI_Assert(false && "MFX_IMPL_VIA_VAAPI is supported on linux only")
+            GAPI_Assert(false && "MFX_IMPL_VIA_VAAPI is supported on linux only");
 #endif // #ifdef __linux__
             break;
         }
@@ -400,7 +400,7 @@ CfgParamDeviceSelector::CfgParamDeviceSelector(const Device &device,
             GAPI_Assert(false && "VPLVAAPIAccelerationPolicy unavailable in current linux configuration");
 #endif // defined(HAVE_VA) || defined(HAVE_VA_INTEL)
 #else // #ifdef __linux__
-            GAPI_Assert(false && "MFX_IMPL_VIA_VAAPI is supported on linux only")
+            GAPI_Assert(false && "MFX_IMPL_VIA_VAAPI is supported on linux only");
 #endif // #ifdef __linux__
             break;
         case AccelType::HOST:
diff --git a/modules/gapi/test/streaming/gapi_streaming_tests.cpp b/modules/gapi/test/streaming/gapi_streaming_tests.cpp
index 5cac4df845..111f43bd19 100644
--- a/modules/gapi/test/streaming/gapi_streaming_tests.cpp
+++ b/modules/gapi/test/streaming/gapi_streaming_tests.cpp
@@ -2440,7 +2440,7 @@ TEST(OneVPL_Source, Init)
 
     std::vector<CfgParam> src_params;
     src_params.push_back(CfgParam::create_implementation(MFX_IMPL_TYPE_HARDWARE));
-#ifdef __WIN32
+#ifdef _WIN32
     src_params.push_back(CfgParam::create_acceleration_mode(MFX_ACCEL_MODE_VIA_D3D11));
 #elif defined(__linux__)
     src_params.push_back(CfgParam::create_acceleration_mode(MFX_ACCEL_MODE_VIA_VAAPI));