Merge remote-tracking branch 'upstream/3.4' into merge-3.4

OpenCV FFmpeg wrapper download links are preserved from ffmpeg/master branch
pull/11756/head
Alexander Alekhin 7 years ago
commit 0d6518aaa0
Files changed (number of changed lines in parentheses):
  1. .gitattributes (2)
  2. 3rdparty/ffmpeg/ffmpeg-download.ps1.in (63)
  3. 3rdparty/ffmpeg/ffmpeg.cmake (5)
  4. 3rdparty/libwebp/src/dec/frame_dec.c (12)
  5. 3rdparty/libwebp/src/dec/vp8_dec.c (2)
  6. 3rdparty/libwebp/src/dec/vp8i_dec.h (6)
  7. 3rdparty/libwebp/src/dec/vp8l_dec.c (10)
  8. 3rdparty/libwebp/src/demux/demux.c (6)
  9. 3rdparty/libwebp/src/dsp/alpha_processing.c (29)
  10. 3rdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c (46)
  11. 3rdparty/libwebp/src/dsp/argb.c (68)
  12. 3rdparty/libwebp/src/dsp/argb_mips_dsp_r2.c (110)
  13. 3rdparty/libwebp/src/dsp/argb_sse2.c (70)
  14. 3rdparty/libwebp/src/dsp/common_sse2.h (14)
  15. 3rdparty/libwebp/src/dsp/common_sse41.h (132)
  16. 3rdparty/libwebp/src/dsp/cost.c (9)
  17. 3rdparty/libwebp/src/dsp/dec.c (9)
  18. 3rdparty/libwebp/src/dsp/dsp.h (52)
  19. 3rdparty/libwebp/src/dsp/enc.c (9)
  20. 3rdparty/libwebp/src/dsp/filters.c (9)
  21. 3rdparty/libwebp/src/dsp/lossless.c (9)
  22. 3rdparty/libwebp/src/dsp/lossless.h (4)
  23. 3rdparty/libwebp/src/dsp/lossless_enc.c (9)
  24. 3rdparty/libwebp/src/dsp/lossless_enc_sse2.c (27)
  25. 3rdparty/libwebp/src/dsp/lossless_enc_sse41.c (94)
  26. 3rdparty/libwebp/src/dsp/lossless_sse2.c (19)
  27. 3rdparty/libwebp/src/dsp/rescaler.c (7)
  28. 3rdparty/libwebp/src/dsp/rescaler_sse2.c (20)
  29. 3rdparty/libwebp/src/dsp/ssim.c (9)
  30. 3rdparty/libwebp/src/dsp/upsampling.c (30)
  31. 3rdparty/libwebp/src/dsp/upsampling_msa.c (6)
  32. 3rdparty/libwebp/src/dsp/upsampling_sse2.c (32)
  33. 3rdparty/libwebp/src/dsp/upsampling_sse41.c (239)
  34. 3rdparty/libwebp/src/dsp/yuv.c (29)
  35. 3rdparty/libwebp/src/dsp/yuv.h (13)
  36. 3rdparty/libwebp/src/dsp/yuv_sse2.c (4)
  37. 3rdparty/libwebp/src/dsp/yuv_sse41.c (613)
  38. 3rdparty/libwebp/src/enc/alpha_enc.c (5)
  39. 3rdparty/libwebp/src/enc/analysis_enc.c (6)
  40. 3rdparty/libwebp/src/enc/delta_palettization_enc.c (455)
  41. 3rdparty/libwebp/src/enc/delta_palettization_enc.h (25)
  42. 3rdparty/libwebp/src/enc/frame_enc.c (26)
  43. 3rdparty/libwebp/src/enc/histogram_enc.c (9)
  44. 3rdparty/libwebp/src/enc/histogram_enc.h (5)
  45. 3rdparty/libwebp/src/enc/iterator_enc.c (8)
  46. 3rdparty/libwebp/src/enc/near_lossless_enc.c (2)
  47. 3rdparty/libwebp/src/enc/picture_csp_enc.c (148)
  48. 3rdparty/libwebp/src/enc/picture_psnr_enc.c (15)
  49. 3rdparty/libwebp/src/enc/quant_enc.c (87)
  50. 3rdparty/libwebp/src/enc/vp8i_enc.h (16)
  51. 3rdparty/libwebp/src/enc/vp8l_enc.c (79)
  52. 3rdparty/libwebp/src/enc/webp_enc.c (9)
  53. 3rdparty/libwebp/src/mux/muxi.h (6)
  54. 3rdparty/libwebp/src/utils/endian_inl_utils.h (7)
  55. apps/CMakeLists.txt (25)
  56. apps/interactive-calibration/main.cpp (2)
  57. apps/version/CMakeLists.txt (33)
  58. apps/version/opencv_version.cpp (50)
  59. cmake/OpenCVDetectPython.cmake (21)
  60. doc/py_tutorials/py_calib3d/py_calibration/py_calibration.markdown (94)
  61. doc/py_tutorials/py_objdetect/py_face_detection/py_face_detection.markdown (6)
  62. doc/py_tutorials/py_photo/py_hdr/images/ldr_debevec.jpg (0)
  63. doc/py_tutorials/py_photo/py_hdr/py_hdr.markdown (34)
  64. doc/tutorials/imgproc/shapedescriptors/bounding_rects_circles/bounding_rects_circles.markdown (164)
  65. doc/tutorials/imgproc/shapedescriptors/bounding_rotated_ellipses/bounding_rotated_ellipses.markdown (14)
  66. doc/tutorials/imgproc/shapedescriptors/find_contours/find_contours.markdown (14)
  67. doc/tutorials/imgproc/shapedescriptors/hull/hull.markdown (15)
  68. doc/tutorials/imgproc/shapedescriptors/moments/moments.markdown (14)
  69. doc/tutorials/imgproc/shapedescriptors/point_polygon_test/point_polygon_test.markdown (14)
  70. doc/tutorials/imgproc/table_of_content_imgproc.markdown (12)
  71. doc/tutorials/introduction/java_eclipse/java_eclipse.markdown (13)
  72. doc/tutorials/objdetect/cascade_classifier/cascade_classifier.markdown (24)
  73. doc/tutorials/objdetect/table_of_content_objdetect.markdown (2)
  74. doc/tutorials/photo/hdr_imaging/hdr_imaging.markdown (190)
  75. doc/tutorials/photo/table_of_content_photo.markdown (2)
  76. modules/core/misc/java/test/CoreTest.java (10)
  77. modules/core/src/copy.cpp (7)
  78. modules/core/src/logger.cpp (8)
  79. modules/core/src/system.cpp (1)
  80. modules/core/test/test_mat.cpp (26)
  81. modules/dnn/include/opencv2/dnn/all_layers.hpp (8)
  82. modules/dnn/include/opencv2/dnn/dnn.hpp (19)
  83. modules/dnn/perf/perf_net.cpp (76)
  84. modules/dnn/src/darknet/darknet_io.cpp (3)
  85. modules/dnn/src/dnn.cpp (62)
  86. modules/dnn/src/init.cpp (2)
  87. modules/dnn/src/layers/batch_norm_layer.cpp (42)
  88. modules/dnn/src/layers/blank_layer.cpp (2)
  89. modules/dnn/src/layers/concat_layer.cpp (2)
  90. modules/dnn/src/layers/convolution_layer.cpp (45)
  91. modules/dnn/src/layers/crop_and_resize_layer.cpp (2)
  92. modules/dnn/src/layers/detection_output_layer.cpp (2)
  93. modules/dnn/src/layers/elementwise_layers.cpp (61)
  94. modules/dnn/src/layers/eltwise_layer.cpp (2)
  95. modules/dnn/src/layers/flatten_layer.cpp (2)
  96. modules/dnn/src/layers/fully_connected_layer.cpp (2)
  97. modules/dnn/src/layers/layers_common.simd.hpp (10)
  98. modules/dnn/src/layers/lrn_layer.cpp (2)
  99. modules/dnn/src/layers/max_unpooling_layer.cpp (2)
  100. modules/dnn/src/layers/normalize_bbox_layer.cpp (2)
Some files were not shown because too many files have changed in this diff.

.gitattributes

@ -81,6 +81,8 @@ org.eclipse.jdt.core.prefs -text whitespace=cr-at-eol merge=union
*.cmd text eol=crlf
*.cmd.tmpl text eol=crlf
*.dsp text eol=crlf -whitespace
*.ps1 text eol=crlf
*.ps1.in text eol=crlf
*.sln text eol=crlf -whitespace
*.vcproj text eol=crlf -whitespace merge=union
*.vcxproj text eol=crlf -whitespace merge=union

3rdparty/ffmpeg/ffmpeg-download.ps1.in (new file)
@ -0,0 +1,63 @@
$url = "https://raw.githubusercontent.com/opencv/opencv_3rdparty/@FFMPEG_BINARIES_COMMIT@/ffmpeg/opencv_ffmpeg_64.dll"
$expected_md5 = "@FFMPEG_FILE_HASH_BIN64@"
$output = "$PSScriptRoot\@OPENCV_BIN_INSTALL_PATH@\opencv_ffmpeg@OPENCV_DLLVERSION@_64.dll"
Write-Output ("=" * 120)
try {
Get-content -Path "$PSScriptRoot\etc\licenses\ffmpeg-readme.txt" -ErrorAction 'Stop'
} catch {
Write-Output "Refer to OpenCV FFmpeg wrapper readme notes about library usage / licensing details."
}
Write-Output ("=" * 120)
Write-Output ""
if(![System.IO.File]::Exists($output)) {
try {
Write-Output ("Downloading: " + $output)
Import-Module BitsTransfer
$start_time = Get-Date
Start-BitsTransfer -Source $url -Destination $output -ErrorAction 'Stop'
Write-Output "Downloaded in $((Get-Date).Subtract($start_time).Seconds) seconds"
} catch {
$_ # Dump error
try {
Write-Output ("Downloading (second attempt): " + $output)
$start_time = Get-Date
Invoke-WebRequest -Uri $url -OutFile $output
Write-Output "Downloaded in $((Get-Date).Subtract($start_time).Seconds) seconds"
} catch {
Write-Output ("Can't download file: " + $output)
Write-Output ("URL: " + $url)
Write-Output "You need to download this file manually. Stop"
Pause
Exit
}
}
} else {
Write-Output ("File exists: " + $output)
Write-Output ("Downloading is skipped. Remove this file and re-run this script to force downloading.")
}
if(![System.IO.File]::Exists($output)) {
Write-Output ("Destination file not found: " + $output)
Write-Output "Stop"
Pause
Exit
}
try {
$hash = Get-FileHash $output -Algorithm MD5 -ErrorAction 'Stop'
if($hash.Hash -eq $expected_md5) {
Write-Output "MD5 check passed"
} else {
Write-Output ("MD5 : " + $hash.Hash.toLower())
Write-Output ("Expected: " + $expected_md5)
Write-Output "MD5 hash mismatch"
}
} catch {
$_ # Dump error
Write-Output "Can't check MD5 hash (requires PowerShell 4+)"
}
Pause
Write-Output "Exit"

3rdparty/ffmpeg/ffmpeg.cmake
@ -35,3 +35,8 @@ function(download_win_ffmpeg script_var)
set(${script_var} "${FFMPEG_DOWNLOAD_DIR}/ffmpeg_version.cmake" PARENT_SCOPE)
endif()
endfunction()
if(OPENCV_INSTALL_FFMPEG_DOWNLOAD_SCRIPT)
configure_file("${CMAKE_CURRENT_LIST_DIR}/ffmpeg-download.ps1.in" "${CMAKE_BINARY_DIR}/win-install/ffmpeg-download.ps1" @ONLY)
install(FILES "${CMAKE_BINARY_DIR}/win-install/ffmpeg-download.ps1" DESTINATION "." COMPONENT libs)
endif()

3rdparty/libwebp/src/dec/frame_dec.c
@ -400,7 +400,9 @@ static void DitherRow(VP8Decoder* const dec) {
#define MACROBLOCK_VPOS(mb_y) ((mb_y) * 16) // vertical position of a MB
// Finalize and transmit a complete row. Return false in case of user-abort.
static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
static int FinishRow(void* arg1, void* arg2) {
VP8Decoder* const dec = (VP8Decoder*)arg1;
VP8Io* const io = (VP8Io*)arg2;
int ok = 1;
const VP8ThreadContext* const ctx = &dec->thread_ctx_;
const int cache_id = ctx->id_;
@ -448,10 +450,9 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
if (y_end > io->crop_bottom) {
y_end = io->crop_bottom; // make sure we don't overflow on last row.
}
// If dec->alpha_data_ is not NULL, we have some alpha plane present.
io->a = NULL;
if (dec->alpha_data_ != NULL && y_start < y_end) {
// TODO(skal): testing presence of alpha with dec->alpha_data_ is not a
// good idea.
io->a = VP8DecompressAlphaRows(dec, io, y_start, y_end - y_start);
if (io->a == NULL) {
return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
@ -558,7 +559,6 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
if (io->bypass_filtering) {
dec->filter_type_ = 0;
}
// TODO(skal): filter type / strength / sharpness forcing
// Define the area where we can skip in-loop filtering, in case of cropping.
//
@ -569,8 +569,6 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
// Means: there's a dependency chain that goes all the way up to the
// top-left corner of the picture (MB #0). We must filter all the previous
// macroblocks.
// TODO(skal): add an 'approximate_decoding' option, that won't produce
// a 1:1 bit-exactness for complex filtering?
{
const int extra_pixels = kFilterExtraRows[dec->filter_type_];
if (dec->filter_type_ == 2) {
@ -651,7 +649,7 @@ static int InitThreadContext(VP8Decoder* const dec) {
}
worker->data1 = dec;
worker->data2 = (void*)&dec->thread_ctx_.io_;
worker->hook = (WebPWorkerHook)FinishRow;
worker->hook = FinishRow;
dec->num_caches_ =
(dec->filter_type_ > 0) ? MT_CACHE_LINES : MT_CACHE_LINES - 1;
} else {

3rdparty/libwebp/src/dec/vp8_dec.c
@ -491,7 +491,7 @@ static int GetCoeffsAlt(VP8BitReader* const br,
return 16;
}
WEBP_TSAN_IGNORE_FUNCTION static void InitGetCoeffs(void) {
static WEBP_TSAN_IGNORE_FUNCTION void InitGetCoeffs(void) {
if (GetCoeffs == NULL) {
if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSlowSSSE3)) {
GetCoeffs = GetCoeffsAlt;

3rdparty/libwebp/src/dec/vp8i_dec.h
@ -30,9 +30,9 @@ extern "C" {
// Various defines and enums
// version numbers
#define DEC_MAJ_VERSION 0
#define DEC_MIN_VERSION 6
#define DEC_REV_VERSION 1
#define DEC_MAJ_VERSION 1
#define DEC_MIN_VERSION 0
#define DEC_REV_VERSION 0
// YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
// Constraints are: We need to store one 16x16 block of luma samples (y),

3rdparty/libwebp/src/dec/vp8l_dec.c
@ -1643,17 +1643,17 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
#if !defined(WEBP_REDUCE_SIZE)
if (io->use_scaling && !AllocateAndInitRescaler(dec, io)) goto Err;
if (io->use_scaling || WebPIsPremultipliedMode(dec->output_->colorspace)) {
// need the alpha-multiply functions for premultiplied output or rescaling
WebPInitAlphaProcessing();
}
#else
if (io->use_scaling) {
dec->status_ = VP8_STATUS_INVALID_PARAM;
goto Err;
}
#endif
if (io->use_scaling || WebPIsPremultipliedMode(dec->output_->colorspace)) {
// need the alpha-multiply functions for premultiplied output or rescaling
WebPInitAlphaProcessing();
}
if (!WebPIsRGBMode(dec->output_->colorspace)) {
WebPInitConvertARGBToYUV();
if (dec->output_->u.YUVA.a != NULL) WebPInitAlphaProcessing();

3rdparty/libwebp/src/demux/demux.c
@ -23,9 +23,9 @@
#include "src/webp/demux.h"
#include "src/webp/format_constants.h"
#define DMUX_MAJ_VERSION 0
#define DMUX_MIN_VERSION 3
#define DMUX_REV_VERSION 3
#define DMUX_MAJ_VERSION 1
#define DMUX_MIN_VERSION 0
#define DMUX_REV_VERSION 0
typedef struct {
size_t start_; // start location of the data

3rdparty/libwebp/src/dsp/alpha_processing.c
@ -366,6 +366,16 @@ static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
}
#ifdef WORDS_BIGENDIAN
static void PackARGB_C(const uint8_t* a, const uint8_t* r, const uint8_t* g,
const uint8_t* b, int len, uint32_t* out) {
int i;
for (i = 0; i < len; ++i) {
out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
}
}
#endif
static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out) {
int i, offset = 0;
@ -381,6 +391,10 @@ int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
#ifdef WORDS_BIGENDIAN
void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r, const uint8_t* g,
const uint8_t* b, int, uint32_t*);
#endif
void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out);
@ -395,16 +409,14 @@ extern void WebPInitAlphaProcessingSSE2(void);
extern void WebPInitAlphaProcessingSSE41(void);
extern void WebPInitAlphaProcessingNEON(void);
static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used =
(VP8CPUInfo)&alpha_processing_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
if (alpha_processing_last_cpuinfo_used == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
WebPMultARGBRow = WebPMultARGBRow_C;
WebPMultRow = WebPMultRow_C;
WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b_C;
#ifdef WORDS_BIGENDIAN
WebPPackARGB = PackARGB_C;
#endif
WebPPackRGB = PackRGB_C;
#if !WEBP_NEON_OMIT_C_CODE
WebPApplyAlphaMultiply = ApplyAlphaMultiply_C;
@ -451,9 +463,10 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
assert(WebPDispatchAlphaToGreen != NULL);
assert(WebPExtractAlpha != NULL);
assert(WebPExtractGreen != NULL);
#ifdef WORDS_BIGENDIAN
assert(WebPPackARGB != NULL);
#endif
assert(WebPPackRGB != NULL);
assert(WebPHasAlpha8b != NULL);
assert(WebPHasAlpha32b != NULL);
alpha_processing_last_cpuinfo_used = VP8GetCPUInfo;
}

3rdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c
@ -125,6 +125,49 @@ static void MultARGBRow_MIPSdspR2(uint32_t* const ptr, int width,
}
}
#ifdef WORDS_BIGENDIAN
static void PackARGB_MIPSdspR2(const uint8_t* a, const uint8_t* r,
const uint8_t* g, const uint8_t* b, int len,
uint32_t* out) {
int temp0, temp1, temp2, temp3, offset;
const int rest = len & 1;
const uint32_t* const loop_end = out + len - rest;
const int step = 4;
__asm__ volatile (
"xor %[offset], %[offset], %[offset] \n\t"
"beq %[loop_end], %[out], 0f \n\t"
"2: \n\t"
"lbux %[temp0], %[offset](%[a]) \n\t"
"lbux %[temp1], %[offset](%[r]) \n\t"
"lbux %[temp2], %[offset](%[g]) \n\t"
"lbux %[temp3], %[offset](%[b]) \n\t"
"ins %[temp1], %[temp0], 16, 16 \n\t"
"ins %[temp3], %[temp2], 16, 16 \n\t"
"addiu %[out], %[out], 4 \n\t"
"precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t"
"sw %[temp0], -4(%[out]) \n\t"
"addu %[offset], %[offset], %[step] \n\t"
"bne %[loop_end], %[out], 2b \n\t"
"0: \n\t"
"beq %[rest], $zero, 1f \n\t"
"lbux %[temp0], %[offset](%[a]) \n\t"
"lbux %[temp1], %[offset](%[r]) \n\t"
"lbux %[temp2], %[offset](%[g]) \n\t"
"lbux %[temp3], %[offset](%[b]) \n\t"
"ins %[temp1], %[temp0], 16, 16 \n\t"
"ins %[temp3], %[temp2], 16, 16 \n\t"
"precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t"
"sw %[temp0], 0(%[out]) \n\t"
"1: \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [offset]"=&r"(offset), [out]"+&r"(out)
: [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
[loop_end]"r"(loop_end), [rest]"r"(rest)
: "memory"
);
}
#endif // WORDS_BIGENDIAN
static void PackRGB_MIPSdspR2(const uint8_t* r, const uint8_t* g,
const uint8_t* b, int len, int step,
uint32_t* out) {
@ -172,6 +215,9 @@ extern void WebPInitAlphaProcessingMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingMIPSdspR2(void) {
WebPDispatchAlpha = DispatchAlpha_MIPSdspR2;
WebPMultARGBRow = MultARGBRow_MIPSdspR2;
#ifdef WORDS_BIGENDIAN
WebPPackARGB = PackARGB_MIPSdspR2;
#endif
WebPPackRGB = PackRGB_MIPSdspR2;
}

3rdparty/libwebp/src/dsp/argb.c (deleted)
@ -1,68 +0,0 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARGB making functions.
//
// Author: Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
}
static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
const uint8_t* b, int len, uint32_t* out) {
int i;
for (i = 0; i < len; ++i) {
out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
}
}
static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out) {
int i, offset = 0;
for (i = 0; i < len; ++i) {
out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
offset += step;
}
}
void (*VP8PackARGB)(const uint8_t*, const uint8_t*, const uint8_t*,
const uint8_t*, int, uint32_t*);
void (*VP8PackRGB)(const uint8_t*, const uint8_t*, const uint8_t*,
int, int, uint32_t*);
extern void VP8EncDspARGBInitMIPSdspR2(void);
extern void VP8EncDspARGBInitSSE2(void);
static volatile VP8CPUInfo argb_last_cpuinfo_used =
(VP8CPUInfo)&argb_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInit(void) {
if (argb_last_cpuinfo_used == VP8GetCPUInfo) return;
VP8PackARGB = PackARGB;
VP8PackRGB = PackRGB;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8EncDspARGBInitSSE2();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
VP8EncDspARGBInitMIPSdspR2();
}
#endif
}
argb_last_cpuinfo_used = VP8GetCPUInfo;
}

3rdparty/libwebp/src/dsp/argb_mips_dsp_r2.c (deleted)
@ -1,110 +0,0 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARGB making functions (mips version).
//
// Author: Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
const uint8_t* b, int len, uint32_t* out) {
int temp0, temp1, temp2, temp3, offset;
const int rest = len & 1;
const uint32_t* const loop_end = out + len - rest;
const int step = 4;
__asm__ volatile (
"xor %[offset], %[offset], %[offset] \n\t"
"beq %[loop_end], %[out], 0f \n\t"
"2: \n\t"
"lbux %[temp0], %[offset](%[a]) \n\t"
"lbux %[temp1], %[offset](%[r]) \n\t"
"lbux %[temp2], %[offset](%[g]) \n\t"
"lbux %[temp3], %[offset](%[b]) \n\t"
"ins %[temp1], %[temp0], 16, 16 \n\t"
"ins %[temp3], %[temp2], 16, 16 \n\t"
"addiu %[out], %[out], 4 \n\t"
"precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t"
"sw %[temp0], -4(%[out]) \n\t"
"addu %[offset], %[offset], %[step] \n\t"
"bne %[loop_end], %[out], 2b \n\t"
"0: \n\t"
"beq %[rest], $zero, 1f \n\t"
"lbux %[temp0], %[offset](%[a]) \n\t"
"lbux %[temp1], %[offset](%[r]) \n\t"
"lbux %[temp2], %[offset](%[g]) \n\t"
"lbux %[temp3], %[offset](%[b]) \n\t"
"ins %[temp1], %[temp0], 16, 16 \n\t"
"ins %[temp3], %[temp2], 16, 16 \n\t"
"precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t"
"sw %[temp0], 0(%[out]) \n\t"
"1: \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [offset]"=&r"(offset), [out]"+&r"(out)
: [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
[loop_end]"r"(loop_end), [rest]"r"(rest)
: "memory"
);
}
static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out) {
int temp0, temp1, temp2, offset;
const int rest = len & 1;
const int a = 0xff;
const uint32_t* const loop_end = out + len - rest;
__asm__ volatile (
"xor %[offset], %[offset], %[offset] \n\t"
"beq %[loop_end], %[out], 0f \n\t"
"2: \n\t"
"lbux %[temp0], %[offset](%[r]) \n\t"
"lbux %[temp1], %[offset](%[g]) \n\t"
"lbux %[temp2], %[offset](%[b]) \n\t"
"ins %[temp0], %[a], 16, 16 \n\t"
"ins %[temp2], %[temp1], 16, 16 \n\t"
"addiu %[out], %[out], 4 \n\t"
"precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
"sw %[temp0], -4(%[out]) \n\t"
"addu %[offset], %[offset], %[step] \n\t"
"bne %[loop_end], %[out], 2b \n\t"
"0: \n\t"
"beq %[rest], $zero, 1f \n\t"
"lbux %[temp0], %[offset](%[r]) \n\t"
"lbux %[temp1], %[offset](%[g]) \n\t"
"lbux %[temp2], %[offset](%[b]) \n\t"
"ins %[temp0], %[a], 16, 16 \n\t"
"ins %[temp2], %[temp1], 16, 16 \n\t"
"precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
"sw %[temp0], 0(%[out]) \n\t"
"1: \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[offset]"=&r"(offset), [out]"+&r"(out)
: [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
[loop_end]"r"(loop_end), [rest]"r"(rest)
: "memory"
);
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspARGBInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitMIPSdspR2(void) {
VP8PackARGB = PackARGB;
VP8PackRGB = PackRGB;
}
#else // !WEBP_USE_MIPS_DSP_R2
WEBP_DSP_INIT_STUB(VP8EncDspARGBInitMIPSdspR2)
#endif // WEBP_USE_MIPS_DSP_R2

3rdparty/libwebp/src/dsp/argb_sse2.c (deleted)
@ -1,70 +0,0 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARGB making functions (SSE2 version).
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <emmintrin.h>
#include <string.h>
static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
}
static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
const uint8_t* b, int len, uint32_t* out) {
if (g == r + 1) { // RGBA input order. Need to swap R and B.
int i = 0;
const int len_max = len & ~3; // max length processed in main loop
const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
assert(b == r + 2);
assert(a == r + 3);
for (; i < len_max; i += 4) {
const __m128i A = _mm_loadu_si128((const __m128i*)(r + 4 * i));
const __m128i B = _mm_and_si128(A, red_blue_mask); // R 0 B 0
const __m128i C = _mm_andnot_si128(red_blue_mask, A); // 0 G 0 A
const __m128i D = _mm_shufflelo_epi16(B, _MM_SHUFFLE(2, 3, 0, 1));
const __m128i E = _mm_shufflehi_epi16(D, _MM_SHUFFLE(2, 3, 0, 1));
const __m128i F = _mm_or_si128(E, C);
_mm_storeu_si128((__m128i*)(out + i), F);
}
for (; i < len; ++i) {
out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
}
} else {
assert(g == b + 1);
assert(r == b + 2);
assert(a == b + 3);
memcpy(out, b, len * 4);
}
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspARGBInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitSSE2(void) {
extern void (*VP8PackARGB)(const uint8_t*, const uint8_t*, const uint8_t*,
const uint8_t*, int, uint32_t*);
VP8PackARGB = PackARGB;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(VP8EncDspARGBInitSSE2)
#endif // WEBP_USE_SSE2

3rdparty/libwebp/src/dsp/common_sse2.h
@ -128,9 +128,9 @@ static WEBP_INLINE void VP8Transpose_2_4x4_16b(
// Pack the planar buffers
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
static WEBP_INLINE void VP8PlanarTo24b(__m128i* const in0, __m128i* const in1,
__m128i* const in2, __m128i* const in3,
__m128i* const in4, __m128i* const in5) {
static WEBP_INLINE void VP8PlanarTo24b_SSE2(
__m128i* const in0, __m128i* const in1, __m128i* const in2,
__m128i* const in3, __m128i* const in4, __m128i* const in5) {
// The input is 6 registers of sixteen 8b but for the sake of explanation,
// let's take 6 registers of four 8b values.
// To pack, we will keep taking one every two 8b integer and move it
@ -159,10 +159,10 @@ static WEBP_INLINE void VP8PlanarTo24b(__m128i* const in0, __m128i* const in1,
// Convert four packed four-channel buffers like argbargbargbargb... into the
// split channels aaaaa ... rrrr ... gggg .... bbbbb ......
static WEBP_INLINE void VP8L32bToPlanar(__m128i* const in0,
__m128i* const in1,
__m128i* const in2,
__m128i* const in3) {
static WEBP_INLINE void VP8L32bToPlanar_SSE2(__m128i* const in0,
__m128i* const in1,
__m128i* const in2,
__m128i* const in3) {
// Column-wise transpose.
const __m128i A0 = _mm_unpacklo_epi8(*in0, *in1);
const __m128i A1 = _mm_unpackhi_epi8(*in0, *in1);

3rdparty/libwebp/src/dsp/common_sse41.h (new file)
@ -0,0 +1,132 @@
// Copyright 2016 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE4 code common to several files.
//
// Author: Vincent Rabaud (vrabaud@google.com)
#ifndef WEBP_DSP_COMMON_SSE41_H_
#define WEBP_DSP_COMMON_SSE41_H_
#ifdef __cplusplus
extern "C" {
#endif
#if defined(WEBP_USE_SSE41)
#include <smmintrin.h>
//------------------------------------------------------------------------------
// Channel mixing.
// Shuffles the input buffer as A0 0 0 A1 0 0 A2 ...
#define WEBP_SSE41_SHUFF(OUT, IN0, IN1) \
OUT##0 = _mm_shuffle_epi8(*IN0, shuff0); \
OUT##1 = _mm_shuffle_epi8(*IN0, shuff1); \
OUT##2 = _mm_shuffle_epi8(*IN0, shuff2); \
OUT##3 = _mm_shuffle_epi8(*IN1, shuff0); \
OUT##4 = _mm_shuffle_epi8(*IN1, shuff1); \
OUT##5 = _mm_shuffle_epi8(*IN1, shuff2);
// Pack the planar buffers
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
static WEBP_INLINE void VP8PlanarTo24b_SSE41(
__m128i* const in0, __m128i* const in1, __m128i* const in2,
__m128i* const in3, __m128i* const in4, __m128i* const in5) {
__m128i R0, R1, R2, R3, R4, R5;
__m128i G0, G1, G2, G3, G4, G5;
__m128i B0, B1, B2, B3, B4, B5;
// Process R.
{
const __m128i shuff0 = _mm_set_epi8(
5, -1, -1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0);
const __m128i shuff1 = _mm_set_epi8(
-1, 10, -1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1);
const __m128i shuff2 = _mm_set_epi8(
-1, -1, 15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1);
WEBP_SSE41_SHUFF(R, in0, in1)
}
// Process G.
{
// Same as before, just shifted to the left by one and including the right
// padding.
const __m128i shuff0 = _mm_set_epi8(
-1, -1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0, -1);
const __m128i shuff1 = _mm_set_epi8(
10, -1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1, 5);
const __m128i shuff2 = _mm_set_epi8(
-1, 15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1, -1);
WEBP_SSE41_SHUFF(G, in2, in3)
}
// Process B.
{
const __m128i shuff0 = _mm_set_epi8(
-1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0, -1, -1);
const __m128i shuff1 = _mm_set_epi8(
-1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1, 5, -1);
const __m128i shuff2 = _mm_set_epi8(
15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1, -1, 10);
WEBP_SSE41_SHUFF(B, in4, in5)
}
// OR the different channels.
{
const __m128i RG0 = _mm_or_si128(R0, G0);
const __m128i RG1 = _mm_or_si128(R1, G1);
const __m128i RG2 = _mm_or_si128(R2, G2);
const __m128i RG3 = _mm_or_si128(R3, G3);
const __m128i RG4 = _mm_or_si128(R4, G4);
const __m128i RG5 = _mm_or_si128(R5, G5);
*in0 = _mm_or_si128(RG0, B0);
*in1 = _mm_or_si128(RG1, B1);
*in2 = _mm_or_si128(RG2, B2);
*in3 = _mm_or_si128(RG3, B3);
*in4 = _mm_or_si128(RG4, B4);
*in5 = _mm_or_si128(RG5, B5);
}
}
#undef WEBP_SSE41_SHUFF
// Convert four packed four-channel buffers like argbargbargbargb... into the
// split channels aaaaa ... rrrr ... gggg .... bbbbb ......
static WEBP_INLINE void VP8L32bToPlanar_SSE41(__m128i* const in0,
__m128i* const in1,
__m128i* const in2,
__m128i* const in3) {
// aaaarrrrggggbbbb
const __m128i shuff0 =
_mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
const __m128i A0 = _mm_shuffle_epi8(*in0, shuff0);
const __m128i A1 = _mm_shuffle_epi8(*in1, shuff0);
const __m128i A2 = _mm_shuffle_epi8(*in2, shuff0);
const __m128i A3 = _mm_shuffle_epi8(*in3, shuff0);
// A0A1R0R1
// G0G1B0B1
// A2A3R2R3
// G0G1B0B1
const __m128i B0 = _mm_unpacklo_epi32(A0, A1);
const __m128i B1 = _mm_unpackhi_epi32(A0, A1);
const __m128i B2 = _mm_unpacklo_epi32(A2, A3);
const __m128i B3 = _mm_unpackhi_epi32(A2, A3);
*in3 = _mm_unpacklo_epi64(B0, B2);
*in2 = _mm_unpackhi_epi64(B0, B2);
*in1 = _mm_unpacklo_epi64(B1, B3);
*in0 = _mm_unpackhi_epi64(B1, B3);
}
#endif // WEBP_USE_SSE41
#ifdef __cplusplus
} // extern "C"
#endif
#endif // WEBP_DSP_COMMON_SSE41_H_

3rdparty/libwebp/src/dsp/cost.c
@ -378,12 +378,7 @@ extern void VP8EncDspCostInitMIPS32(void);
extern void VP8EncDspCostInitMIPSdspR2(void);
extern void VP8EncDspCostInitSSE2(void);
static volatile VP8CPUInfo cost_last_cpuinfo_used =
(VP8CPUInfo)&cost_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInit(void) {
if (cost_last_cpuinfo_used == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(VP8EncDspCostInit) {
VP8GetResidualCost = GetResidualCost_C;
VP8SetResidualCoeffs = SetResidualCoeffs_C;
@ -405,8 +400,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInit(void) {
}
#endif
}
cost_last_cpuinfo_used = VP8GetCPUInfo;
}
//------------------------------------------------------------------------------

3rdparty/libwebp/src/dsp/dec.c
@ -741,12 +741,7 @@ extern void VP8DspInitMIPS32(void);
extern void VP8DspInitMIPSdspR2(void);
extern void VP8DspInitMSA(void);
static volatile VP8CPUInfo dec_last_cpuinfo_used =
(VP8CPUInfo)&dec_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
if (dec_last_cpuinfo_used == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(VP8DspInit) {
VP8InitClipTables();
#if !WEBP_NEON_OMIT_C_CODE
@ -889,6 +884,4 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
assert(VP8PredChroma8[5] != NULL);
assert(VP8PredChroma8[6] != NULL);
assert(VP8DitherCombine8x8 != NULL);
dec_last_cpuinfo_used = VP8GetCPUInfo;
}

3rdparty/libwebp/src/dsp/dsp.h
@ -141,6 +141,42 @@ extern "C" {
#endif
#endif
#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
#include <pthread.h> // NOLINT
#define WEBP_DSP_INIT(func) do { \
static volatile VP8CPUInfo func ## _last_cpuinfo_used = \
(VP8CPUInfo)&func ## _last_cpuinfo_used; \
static pthread_mutex_t func ## _lock = PTHREAD_MUTEX_INITIALIZER; \
if (pthread_mutex_lock(&func ## _lock)) break; \
if (func ## _last_cpuinfo_used != VP8GetCPUInfo) func(); \
func ## _last_cpuinfo_used = VP8GetCPUInfo; \
(void)pthread_mutex_unlock(&func ## _lock); \
} while (0)
#else // !(defined(WEBP_USE_THREAD) && !defined(_WIN32))
#define WEBP_DSP_INIT(func) do { \
static volatile VP8CPUInfo func ## _last_cpuinfo_used = \
(VP8CPUInfo)&func ## _last_cpuinfo_used; \
if (func ## _last_cpuinfo_used == VP8GetCPUInfo) break; \
func(); \
func ## _last_cpuinfo_used = VP8GetCPUInfo; \
} while (0)
#endif // defined(WEBP_USE_THREAD) && !defined(_WIN32)
// Defines an Init + helper function that control multiple initialization of
// function pointers / tables.
/* Usage:
WEBP_DSP_INIT_FUNC(InitFunc) {
...function body
}
*/
#define WEBP_DSP_INIT_FUNC(name) \
static WEBP_TSAN_IGNORE_FUNCTION void name ## _body(void); \
WEBP_TSAN_IGNORE_FUNCTION void name(void) { \
WEBP_DSP_INIT(name ## _body); \
} \
static WEBP_TSAN_IGNORE_FUNCTION void name ## _body(void)
#define WEBP_UBSAN_IGNORE_UNDEF
#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
#if defined(__clang__) && defined(__has_attribute)
@ -166,6 +202,13 @@ extern "C" {
#define WEBP_SWAP_16BIT_CSP 0
#endif
// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
#if !defined(WORDS_BIGENDIAN) && \
(defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
(defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
#define WORDS_BIGENDIAN
#endif
typedef enum {
kSSE2,
kSSE3,
@ -189,7 +232,7 @@ WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo;
// avoiding a compiler warning.
#define WEBP_DSP_INIT_STUB(func) \
extern void func(void); \
WEBP_TSAN_IGNORE_FUNCTION void func(void) {}
void func(void) {}
//------------------------------------------------------------------------------
// Encoding
@ -578,6 +621,13 @@ void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
int width, int inverse);
void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse);
#ifdef WORDS_BIGENDIAN
// ARGB packing function: a/r/g/b input is rgba or bgra order.
extern void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r,
const uint8_t* g, const uint8_t* b, int len,
uint32_t* out);
#endif
// RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
extern void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out);
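
The dsp.h hunks above add WEBP_DSP_INIT and WEBP_DSP_INIT_FUNC, which the later hunks use to replace each module's hand-rolled "<name>_last_cpuinfo_used" guard. A small self-contained sketch of the same run-once idea in plain C (not libwebp code; Init, InitBody and GetCPUInfo are placeholder names), compiled with -pthread:

#include <pthread.h>
#include <stdio.h>

typedef int (*CPUInfoFn)(void);
static int FakeCPUInfo(void) { return 1; }   /* stand-in for VP8GetCPUInfo */
static CPUInfoFn GetCPUInfo = FakeCPUInfo;

static void InitBody(void) { puts("function pointers initialized"); }

/* Same idea as WEBP_DSP_INIT(InitBody) on pthread builds: remember which
 * CPU-info hook the tables were built for and rerun the body only when it
 * changes; the mutex serializes a racy first call.  libwebp seeds the guard
 * with its own address instead of NULL so even a NULL hook differs on the
 * first call. */
static void Init(void) {
  static CPUInfoFn last_cpuinfo_used = NULL;
  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  if (pthread_mutex_lock(&lock)) return;
  if (last_cpuinfo_used != GetCPUInfo) InitBody();
  last_cpuinfo_used = GetCPUInfo;
  (void)pthread_mutex_unlock(&lock);
}

int main(void) {
  Init();   /* prints once */
  Init();   /* guard already matches GetCPUInfo, body is skipped */
  return 0;
}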

3rdparty/libwebp/src/dsp/enc.c
@ -740,12 +740,7 @@ extern void VP8EncDspInitMIPS32(void);
extern void VP8EncDspInitMIPSdspR2(void);
extern void VP8EncDspInitMSA(void);
static volatile VP8CPUInfo enc_last_cpuinfo_used =
(VP8CPUInfo)&enc_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
if (enc_last_cpuinfo_used == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
VP8DspInit(); // common inverse transforms
InitTables();
@ -838,6 +833,4 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
assert(VP8EncQuantizeBlockWHT != NULL);
assert(VP8Copy4x4 != NULL);
assert(VP8Copy16x8 != NULL);
enc_last_cpuinfo_used = VP8GetCPUInfo;
}

3rdparty/libwebp/src/dsp/filters.c
@ -238,12 +238,7 @@ extern void VP8FiltersInitMSA(void);
extern void VP8FiltersInitNEON(void);
extern void VP8FiltersInitSSE2(void);
static volatile VP8CPUInfo filters_last_cpuinfo_used =
(VP8CPUInfo)&filters_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
if (filters_last_cpuinfo_used == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
WebPUnfilters[WEBP_FILTER_NONE] = NULL;
#if !WEBP_NEON_OMIT_C_CODE
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_C;
@ -289,6 +284,4 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
assert(WebPFilters[WEBP_FILTER_HORIZONTAL] != NULL);
assert(WebPFilters[WEBP_FILTER_VERTICAL] != NULL);
assert(WebPFilters[WEBP_FILTER_GRADIENT] != NULL);
filters_last_cpuinfo_used = VP8GetCPUInfo;
}

3rdparty/libwebp/src/dsp/lossless.c
@ -577,9 +577,6 @@ extern void VP8LDspInitNEON(void);
extern void VP8LDspInitMIPSdspR2(void);
extern void VP8LDspInitMSA(void);
static volatile VP8CPUInfo lossless_last_cpuinfo_used =
(VP8CPUInfo)&lossless_last_cpuinfo_used;
#define COPY_PREDICTOR_ARRAY(IN, OUT) do { \
(OUT)[0] = IN##0_C; \
(OUT)[1] = IN##1_C; \
@ -599,9 +596,7 @@ static volatile VP8CPUInfo lossless_last_cpuinfo_used =
(OUT)[15] = IN##0_C; \
} while (0);
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
if (lossless_last_cpuinfo_used == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(VP8LDspInit) {
COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors)
COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors_C)
COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd)
@ -658,8 +653,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
assert(VP8LConvertBGRAToRGB565 != NULL);
assert(VP8LMapColor32b != NULL);
assert(VP8LMapColor8b != NULL);
lossless_last_cpuinfo_used = VP8GetCPUInfo;
}
#undef COPY_PREDICTOR_ARRAY

3rdparty/libwebp/src/dsp/lossless.h
@ -25,10 +25,6 @@
extern "C" {
#endif
#ifdef WEBP_EXPERIMENTAL_FEATURES
#include "src/enc/delta_palettization_enc.h"
#endif // WEBP_EXPERIMENTAL_FEATURES
//------------------------------------------------------------------------------
// Decoding

3rdparty/libwebp/src/dsp/lossless_enc.c
@ -863,12 +863,7 @@ extern void VP8LEncDspInitMIPS32(void);
extern void VP8LEncDspInitMIPSdspR2(void);
extern void VP8LEncDspInitMSA(void);
static volatile VP8CPUInfo lossless_enc_last_cpuinfo_used =
(VP8CPUInfo)&lossless_enc_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
if (lossless_enc_last_cpuinfo_used == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(VP8LEncDspInit) {
VP8LDspInit();
#if !WEBP_NEON_OMIT_C_CODE
@ -1011,8 +1006,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
assert(VP8LPredictorsSub_C[13] != NULL);
assert(VP8LPredictorsSub_C[14] != NULL);
assert(VP8LPredictorsSub_C[15] != NULL);
lossless_enc_last_cpuinfo_used = VP8GetCPUInfo;
}
//------------------------------------------------------------------------------

3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
@ -46,16 +46,14 @@ static void SubtractGreenFromBlueAndRed_SSE2(uint32_t* argb_data,
//------------------------------------------------------------------------------
// Color Transform
#define MK_CST_16(HI, LO) \
_mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
static void TransformColor_SSE2(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels) {
const __m128i mults_rb = _mm_set_epi16(
CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_));
const __m128i mults_b2 = _mm_set_epi16(
CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0,
CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0);
const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red_),
CST_5b(m->green_to_blue_));
const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0);
const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks
const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff); // red-blue masks
int i;
@ -85,12 +83,8 @@ static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
int histo[]) {
const __m128i mults_r = _mm_set_epi16(
CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0,
CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0);
const __m128i mults_g = _mm_set_epi16(
0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue),
0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue));
const __m128i mults_r = MK_CST_16(CST_5b(red_to_blue), 0);
const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_blue));
const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
const __m128i mask_b = _mm_set1_epi32(0x0000ff); // blue mask
int y;
@ -135,9 +129,7 @@ static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]) {
const __m128i mults_g = _mm_set_epi16(
0, CST_5b(green_to_red), 0, CST_5b(green_to_red),
0, CST_5b(green_to_red), 0, CST_5b(green_to_red));
const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_red));
const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
const __m128i mask = _mm_set1_epi32(0xff);
@ -174,6 +166,7 @@ static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
}
}
#undef SPAN
#undef MK_CST_16
//------------------------------------------------------------------------------
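
The MK_CST_16 shorthand introduced above builds a single 32-bit constant and broadcasts it, instead of spelling out all eight 16-bit lanes. A minimal standalone check of why the two forms load the same register (plain C with SSE2 intrinsics; the sample constants 3 and -5 are arbitrary, not values from the diff):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MK_CST_16(HI, LO) \
  _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))

int main(void) {
  const short hi = 3, lo = -5;  /* arbitrary stand-ins for the CST_5b(...) values */
  /* old form: list the eight 16-bit lanes explicitly, hi/lo interleaved */
  const __m128i a = _mm_set_epi16(hi, lo, hi, lo, hi, lo, hi, lo);
  /* new form: pack hi into the upper and lo into the lower half of one
   * 32-bit word, then broadcast it to all four words */
  const __m128i b = MK_CST_16(hi, lo);
  printf("%s\n", memcmp(&a, &b, sizeof(a)) == 0 ? "identical" : "different");
  return 0;
}

On 32-bit x86 this needs -msse2; on x86-64, SSE2 is always available.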

3rdparty/libwebp/src/dsp/lossless_enc_sse41.c
@ -18,6 +18,9 @@
#include <smmintrin.h>
#include "src/dsp/lossless.h"
// For sign-extended multiplying constants, pre-shifted by 5:
#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5)
//------------------------------------------------------------------------------
// Subtract-Green Transform
@ -38,6 +41,95 @@ static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
}
}
//------------------------------------------------------------------------------
// Color Transform
#define SPAN 8
static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
int histo[]) {
const __m128i mults_r = _mm_set1_epi16(CST_5b(red_to_blue));
const __m128i mults_g = _mm_set1_epi16(CST_5b(green_to_blue));
const __m128i mask_g = _mm_set1_epi16(0xff00); // green mask
const __m128i mask_gb = _mm_set1_epi32(0xffff); // green/blue mask
const __m128i mask_b = _mm_set1_epi16(0x00ff); // blue mask
const __m128i shuffler_lo = _mm_setr_epi8(-1, 2, -1, 6, -1, 10, -1, 14, -1,
-1, -1, -1, -1, -1, -1, -1);
const __m128i shuffler_hi = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1,
2, -1, 6, -1, 10, -1, 14);
int y;
for (y = 0; y < tile_height; ++y) {
const uint32_t* const src = argb + y * stride;
int i, x;
for (x = 0; x + SPAN <= tile_width; x += SPAN) {
uint16_t values[SPAN];
const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
const __m128i r0 = _mm_shuffle_epi8(in0, shuffler_lo);
const __m128i r1 = _mm_shuffle_epi8(in1, shuffler_hi);
const __m128i r = _mm_or_si128(r0, r1); // r 0
const __m128i gb0 = _mm_and_si128(in0, mask_gb);
const __m128i gb1 = _mm_and_si128(in1, mask_gb);
const __m128i gb = _mm_packus_epi32(gb0, gb1); // g b
const __m128i g = _mm_and_si128(gb, mask_g); // g 0
const __m128i A = _mm_mulhi_epi16(r, mults_r); // x dbr
const __m128i B = _mm_mulhi_epi16(g, mults_g); // x dbg
const __m128i C = _mm_sub_epi8(gb, B); // x b'
const __m128i D = _mm_sub_epi8(C, A); // x b''
const __m128i E = _mm_and_si128(D, mask_b); // 0 b''
_mm_storeu_si128((__m128i*)values, E);
for (i = 0; i < SPAN; ++i) ++histo[values[i]];
}
}
{
const int left_over = tile_width & (SPAN - 1);
if (left_over > 0) {
VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
left_over, tile_height,
green_to_blue, red_to_blue, histo);
}
}
}
static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]) {
const __m128i mults_g = _mm_set1_epi16(CST_5b(green_to_red));
const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
const __m128i mask = _mm_set1_epi16(0xff);
int y;
for (y = 0; y < tile_height; ++y) {
const uint32_t* const src = argb + y * stride;
int i, x;
for (x = 0; x + SPAN <= tile_width; x += SPAN) {
uint16_t values[SPAN];
const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
const __m128i g0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0
const __m128i g1 = _mm_and_si128(in1, mask_g);
const __m128i g = _mm_packus_epi32(g0, g1); // g 0
const __m128i A0 = _mm_srli_epi32(in0, 16); // 0 0 | x r
const __m128i A1 = _mm_srli_epi32(in1, 16);
const __m128i A = _mm_packus_epi32(A0, A1); // x r
const __m128i B = _mm_mulhi_epi16(g, mults_g); // x dr
const __m128i C = _mm_sub_epi8(A, B); // x r'
const __m128i D = _mm_and_si128(C, mask); // 0 r'
_mm_storeu_si128((__m128i*)values, D);
for (i = 0; i < SPAN; ++i) ++histo[values[i]];
}
}
{
const int left_over = tile_width & (SPAN - 1);
if (left_over > 0) {
VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
left_over, tile_height, green_to_red,
histo);
}
}
}
//------------------------------------------------------------------------------
// Entry point
@ -45,6 +137,8 @@ extern void VP8LEncDspInitSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41;
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE41;
VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE41;
}
#else // !WEBP_USE_SSE41

3rdparty/libwebp/src/dsp/lossless_sse2.c
@ -453,14 +453,11 @@ static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend
const __m128i mults_rb = _mm_set_epi16(
CST(green_to_red_), CST(green_to_blue_),
CST(green_to_red_), CST(green_to_blue_),
CST(green_to_red_), CST(green_to_blue_),
CST(green_to_red_), CST(green_to_blue_));
const __m128i mults_b2 = _mm_set_epi16(
CST(red_to_blue_), 0, CST(red_to_blue_), 0,
CST(red_to_blue_), 0, CST(red_to_blue_), 0);
#define MK_CST_16(HI, LO) \
_mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
const __m128i mults_rb = MK_CST_16(CST(green_to_red_), CST(green_to_blue_));
const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0);
#undef MK_CST_16
#undef CST
const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks
int i;
@ -503,11 +500,11 @@ static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,
__m128i in5 = _mm_loadu_si128(in + 5);
__m128i in6 = _mm_loadu_si128(in + 6);
__m128i in7 = _mm_loadu_si128(in + 7);
VP8L32bToPlanar(&in0, &in1, &in2, &in3);
VP8L32bToPlanar(&in4, &in5, &in6, &in7);
VP8L32bToPlanar_SSE2(&in0, &in1, &in2, &in3);
VP8L32bToPlanar_SSE2(&in4, &in5, &in6, &in7);
// At this points, in1/in5 contains red only, in2/in6 green only ...
// Pack the colors in 24b RGB.
VP8PlanarTo24b(&in1, &in5, &in2, &in6, &in3, &in7);
VP8PlanarTo24b_SSE2(&in1, &in5, &in2, &in6, &in3, &in7);
_mm_storeu_si128(out + 0, in1);
_mm_storeu_si128(out + 1, in5);
_mm_storeu_si128(out + 2, in2);

3rdparty/libwebp/src/dsp/rescaler.c
@ -204,11 +204,7 @@ extern void WebPRescalerDspInitMIPSdspR2(void);
extern void WebPRescalerDspInitMSA(void);
extern void WebPRescalerDspInitNEON(void);
static volatile VP8CPUInfo rescaler_last_cpuinfo_used =
(VP8CPUInfo)&rescaler_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
if (rescaler_last_cpuinfo_used == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
#if !defined(WEBP_REDUCE_SIZE)
#if !WEBP_NEON_OMIT_C_CODE
WebPRescalerExportRowExpand = WebPRescalerExportRowExpand_C;
@ -253,5 +249,4 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
assert(WebPRescalerImportRowExpand != NULL);
assert(WebPRescalerImportRowShrink != NULL);
#endif // WEBP_REDUCE_SIZE
rescaler_last_cpuinfo_used = VP8GetCPUInfo;
}

3rdparty/libwebp/src/dsp/rescaler_sse2.c
@ -36,7 +36,7 @@ static void LoadTwoPixels_SSE2(const uint8_t* const src, __m128i* out) {
}
// input: 8 bytes ABCDEFGH -> output: A0B0C0D0E0F0G0H0
static void LoadHeightPixels_SSE2(const uint8_t* const src, __m128i* out) {
static void LoadEightPixels_SSE2(const uint8_t* const src, __m128i* out) {
const __m128i zero = _mm_setzero_si128();
const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH
*out = _mm_unpacklo_epi8(A, zero);
@ -50,13 +50,15 @@ static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
int accum = x_add;
__m128i cur_pixels;
// SSE2 implementation only works with 16b signed arithmetic at max.
if (wrk->src_width < 8 || accum >= (1 << 15)) {
WebPRescalerImportRowExpand_C(wrk, src);
return;
}
assert(!WebPRescalerInputDone(wrk));
assert(wrk->x_expand);
if (wrk->num_channels == 4) {
if (wrk->src_width < 2) {
WebPRescalerImportRowExpand_C(wrk, src);
return;
}
LoadTwoPixels_SSE2(src, &cur_pixels);
src += 4;
while (1) {
@ -75,11 +77,7 @@ static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
} else {
int left;
const uint8_t* const src_limit = src + wrk->src_width - 8;
if (wrk->src_width < 8) {
WebPRescalerImportRowExpand_C(wrk, src);
return;
}
LoadHeightPixels_SSE2(src, &cur_pixels);
LoadEightPixels_SSE2(src, &cur_pixels);
src += 7;
left = 7;
while (1) {
@ -94,7 +92,7 @@ static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
if (--left) {
cur_pixels = _mm_srli_si128(cur_pixels, 2);
} else if (src <= src_limit) {
LoadHeightPixels_SSE2(src, &cur_pixels);
LoadEightPixels_SSE2(src, &cur_pixels);
src += 7;
left = 7;
} else { // tail

3rdparty/libwebp/src/dsp/ssim.c
@ -139,12 +139,7 @@ VP8AccumulateSSEFunc VP8AccumulateSSE;
extern void VP8SSIMDspInitSSE2(void);
static volatile VP8CPUInfo ssim_last_cpuinfo_used =
(VP8CPUInfo)&ssim_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(VP8SSIMDspInit) {
#if !defined(WEBP_REDUCE_SIZE)
VP8SSIMGetClipped = SSIMGetClipped_C;
VP8SSIMGet = SSIMGet_C;
@ -161,6 +156,4 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
}
#endif
}
ssim_last_cpuinfo_used = VP8GetCPUInfo;
}

3rdparty/libwebp/src/dsp/upsampling.c
@ -217,13 +217,9 @@ WebPYUV444Converter WebPYUV444Converters[MODE_LAST];
extern void WebPInitYUV444ConvertersMIPSdspR2(void);
extern void WebPInitYUV444ConvertersSSE2(void);
extern void WebPInitYUV444ConvertersSSE41(void);
static volatile VP8CPUInfo upsampling_last_cpuinfo_used1 =
(VP8CPUInfo)&upsampling_last_cpuinfo_used1;
WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) {
if (upsampling_last_cpuinfo_used1 == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(WebPInitYUV444Converters) {
WebPYUV444Converters[MODE_RGBA] = WebPYuv444ToRgba_C;
WebPYUV444Converters[MODE_BGRA] = WebPYuv444ToBgra_C;
WebPYUV444Converters[MODE_RGB] = WebPYuv444ToRgb_C;
@ -242,29 +238,29 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) {
WebPInitYUV444ConvertersSSE2();
}
#endif
#if defined(WEBP_USE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitYUV444ConvertersSSE41();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
WebPInitYUV444ConvertersMIPSdspR2();
}
#endif
}
upsampling_last_cpuinfo_used1 = VP8GetCPUInfo;
}
//------------------------------------------------------------------------------
// Main calls
extern void WebPInitUpsamplersSSE2(void);
extern void WebPInitUpsamplersSSE41(void);
extern void WebPInitUpsamplersNEON(void);
extern void WebPInitUpsamplersMIPSdspR2(void);
extern void WebPInitUpsamplersMSA(void);
static volatile VP8CPUInfo upsampling_last_cpuinfo_used2 =
(VP8CPUInfo)&upsampling_last_cpuinfo_used2;
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
if (upsampling_last_cpuinfo_used2 == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
#ifdef FANCY_UPSAMPLING
#if !WEBP_NEON_OMIT_C_CODE
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_C;
@ -287,6 +283,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
WebPInitUpsamplersSSE2();
}
#endif
#if defined(WEBP_USE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitUpsamplersSSE41();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
WebPInitUpsamplersMIPSdspR2();
@ -310,6 +311,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
assert(WebPUpsamplers[MODE_BGRA] != NULL);
assert(WebPUpsamplers[MODE_rgbA] != NULL);
assert(WebPUpsamplers[MODE_bgrA] != NULL);
#if !defined(WEBP_REDUCE_CSP) || !WEBP_NEON_OMIT_C_CODE
assert(WebPUpsamplers[MODE_RGB] != NULL);
assert(WebPUpsamplers[MODE_BGR] != NULL);
assert(WebPUpsamplers[MODE_ARGB] != NULL);
@ -317,9 +319,9 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
assert(WebPUpsamplers[MODE_RGB_565] != NULL);
assert(WebPUpsamplers[MODE_Argb] != NULL);
assert(WebPUpsamplers[MODE_rgbA_4444] != NULL);
#endif
#endif // FANCY_UPSAMPLING
upsampling_last_cpuinfo_used2 = VP8GetCPUInfo;
}
//------------------------------------------------------------------------------

3rdparty/libwebp/src/dsp/upsampling_msa.c
@ -264,6 +264,7 @@ static void YuvToBgr(int y, int u, int v, uint8_t* const bgr) {
bgr[2] = Clip8(r1 >> 6);
}
#if !defined(WEBP_REDUCE_CSP)
static void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
const int y1 = MultHi(y, 19077);
const int r1 = y1 + MultHi(v, 26149) - 14234;
@ -306,6 +307,7 @@ static void YuvToArgb(uint8_t y, uint8_t u, uint8_t v, uint8_t* const argb) {
argb[0] = 0xff;
YuvToRgb(y, u, v, argb + 1);
}
#endif // WEBP_REDUCE_CSP
static void YuvToBgra(uint8_t y, uint8_t u, uint8_t v, uint8_t* const bgra) {
YuvToBgr(y, u, v, bgra);
@ -317,6 +319,7 @@ static void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, uint8_t* const rgba) {
rgba[3] = 0xff;
}
#if !defined(WEBP_REDUCE_CSP)
static void YuvToRgbLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B;
@ -370,6 +373,7 @@ static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
memcpy(dst, temp, length * 3 * sizeof(*dst));
}
}
#endif // WEBP_REDUCE_CSP
static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
@ -427,6 +431,7 @@ static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
}
}
#if !defined(WEBP_REDUCE_CSP)
static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B;
@ -526,6 +531,7 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
memcpy(dst, temp, length * 2 * sizeof(*dst));
}
}
#endif // WEBP_REDUCE_CSP
#define UPSAMPLE_32PIXELS(a, b, c, d) do { \
v16u8 s = __msa_aver_u_b(a, d); \

3rdparty/libwebp/src/dsp/upsampling_sse2.c
@ -104,21 +104,6 @@ static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[],
Upsample32Pixels_SSE2(r1, r2, out); \
}
#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, \
top_dst, bottom_dst, cur_x, num_pixels) { \
int n; \
for (n = 0; n < (num_pixels); ++n) { \
FUNC((top_y)[(cur_x) + n], r_u[n], r_v[n], \
(top_dst) + ((cur_x) + n) * (XSTEP)); \
} \
if ((bottom_y) != NULL) { \
for (n = 0; n < (num_pixels); ++n) { \
FUNC((bottom_y)[(cur_x) + n], r_u[64 + n], r_v[64 + n], \
(bottom_dst) + ((cur_x) + n) * (XSTEP)); \
} \
} \
}
#define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, \
top_dst, bottom_dst, cur_x) do { \
FUNC##32_SSE2((top_y) + (cur_x), r_u, r_v, (top_dst) + (cur_x) * (XSTEP)); \
@ -135,7 +120,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
int uv_pos, pos; \
/* 16byte-aligned array to cache reconstructed u and v */ \
uint8_t uv_buf[4 * 32 + 15]; \
uint8_t uv_buf[14 * 32 + 15] = { 0 }; \
uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
uint8_t* const r_v = r_u + 32; \
\
@ -160,11 +145,22 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
} \
if (len > 1) { \
const int left_over = ((len + 1) >> 1) - (pos >> 1); \
uint8_t* const tmp_top_dst = r_u + 4 * 32; \
uint8_t* const tmp_bottom_dst = tmp_top_dst + 4 * 32; \
uint8_t* const tmp_top = tmp_bottom_dst + 4 * 32; \
uint8_t* const tmp_bottom = (bottom_y == NULL) ? NULL : tmp_top + 32; \
assert(left_over > 0); \
UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u); \
UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v); \
CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, \
pos, len - pos); \
memcpy(tmp_top, top_y + pos, len - pos); \
if (bottom_y != NULL) memcpy(tmp_bottom, bottom_y + pos, len - pos); \
CONVERT2RGB_32(FUNC, XSTEP, tmp_top, tmp_bottom, tmp_top_dst, \
tmp_bottom_dst, 0); \
memcpy(top_dst + pos * (XSTEP), tmp_top_dst, (len - pos) * (XSTEP)); \
if (bottom_y != NULL) { \
memcpy(bottom_dst + pos * (XSTEP), tmp_bottom_dst, \
(len - pos) * (XSTEP)); \
} \
} \
}

3rdparty/libwebp/src/dsp/upsampling_sse41.c (new file)
@ -0,0 +1,239 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE41 version of YUV to RGB upsampling functions.
//
// Author: somnath@google.com (Somnath Banerjee)
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE41)
#include <assert.h>
#include <smmintrin.h>
#include <string.h>
#include "src/dsp/yuv.h"
#ifdef FANCY_UPSAMPLING
#if !defined(WEBP_REDUCE_CSP)
// We compute (9*a + 3*b + 3*c + d + 8) / 16 as follows
// u = (9*a + 3*b + 3*c + d + 8) / 16
// = (a + (a + 3*b + 3*c + d) / 8 + 1) / 2
// = (a + m + 1) / 2
// where m = (a + 3*b + 3*c + d) / 8
// = ((a + b + c + d) / 2 + b + c) / 4
//
// Let's say k = (a + b + c + d) / 4.
// We can compute k as
// k = (s + t + 1) / 2 - ((a^d) | (b^c) | (s^t)) & 1
// where s = (a + d + 1) / 2 and t = (b + c + 1) / 2
//
// Then m can be written as
// m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1
// Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1
#define GET_M(ij, in, out) do { \
const __m128i tmp0 = _mm_avg_epu8(k, (in)); /* (k + in + 1) / 2 */ \
const __m128i tmp1 = _mm_and_si128((ij), st); /* (ij) & (s^t) */ \
const __m128i tmp2 = _mm_xor_si128(k, (in)); /* (k^in) */ \
const __m128i tmp3 = _mm_or_si128(tmp1, tmp2); /* ((ij) & (s^t)) | (k^in) */\
const __m128i tmp4 = _mm_and_si128(tmp3, one); /* & 1 -> lsb_correction */ \
(out) = _mm_sub_epi8(tmp0, tmp4); /* (k + in + 1) / 2 - lsb_correction */ \
} while (0)
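// Scalar sketch of the scheme above (illustration only, not used by the
// SIMD code): given four neighboring chroma samples a, b, c, d in [0, 255],
// it reproduces the two diagonal interpolations with the same avg() and
// lsb-correction rounding as GET_M / UPSAMPLE_32PIXELS.
static WEBP_INLINE void UpsamplePairScalarSketch(int a, int b, int c, int d,
                                                 int* const out1,
                                                 int* const out2) {
  const int s = (a + d + 1) >> 1, t = (b + c + 1) >> 1;    // pair averages
  const int k = ((s + t + 1) >> 1) -
                (((a ^ d) | (b ^ c) | (s ^ t)) & 1);       // (a+b+c+d) / 4
  const int m1 = ((k + t + 1) >> 1) -
                 ((((b ^ c) & (s ^ t)) | (k ^ t)) & 1);    // (a+3b+3c+d) / 8
  const int m2 = ((k + s + 1) >> 1) -
                 ((((a ^ d) & (s ^ t)) | (k ^ s)) & 1);    // (3a+b+c+3d) / 8
  *out1 = (a + m1 + 1) >> 1;   // (9a + 3b + 3c + d + 8) / 16
  *out2 = (b + m2 + 1) >> 1;   // (3a + 9b + c + 3d + 8) / 16
}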
// pack and store two alternating pixel rows
#define PACK_AND_STORE(a, b, da, db, out) do { \
const __m128i t_a = _mm_avg_epu8(a, da); /* (9a + 3b + 3c + d + 8) / 16 */ \
const __m128i t_b = _mm_avg_epu8(b, db); /* (3a + 9b + c + 3d + 8) / 16 */ \
const __m128i t_1 = _mm_unpacklo_epi8(t_a, t_b); \
const __m128i t_2 = _mm_unpackhi_epi8(t_a, t_b); \
_mm_store_si128(((__m128i*)(out)) + 0, t_1); \
_mm_store_si128(((__m128i*)(out)) + 1, t_2); \
} while (0)
// Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
#define UPSAMPLE_32PIXELS(r1, r2, out) { \
const __m128i one = _mm_set1_epi8(1); \
const __m128i a = _mm_loadu_si128((const __m128i*)&(r1)[0]); \
const __m128i b = _mm_loadu_si128((const __m128i*)&(r1)[1]); \
const __m128i c = _mm_loadu_si128((const __m128i*)&(r2)[0]); \
const __m128i d = _mm_loadu_si128((const __m128i*)&(r2)[1]); \
\
const __m128i s = _mm_avg_epu8(a, d); /* s = (a + d + 1) / 2 */ \
const __m128i t = _mm_avg_epu8(b, c); /* t = (b + c + 1) / 2 */ \
const __m128i st = _mm_xor_si128(s, t); /* st = s^t */ \
\
const __m128i ad = _mm_xor_si128(a, d); /* ad = a^d */ \
const __m128i bc = _mm_xor_si128(b, c); /* bc = b^c */ \
\
const __m128i t1 = _mm_or_si128(ad, bc); /* (a^d) | (b^c) */ \
const __m128i t2 = _mm_or_si128(t1, st); /* (a^d) | (b^c) | (s^t) */ \
const __m128i t3 = _mm_and_si128(t2, one); /* (a^d) | (b^c) | (s^t) & 1 */ \
const __m128i t4 = _mm_avg_epu8(s, t); \
const __m128i k = _mm_sub_epi8(t4, t3); /* k = (a + b + c + d) / 4 */ \
__m128i diag1, diag2; \
\
GET_M(bc, t, diag1); /* diag1 = (a + 3b + 3c + d) / 8 */ \
GET_M(ad, s, diag2); /* diag2 = (3a + b + c + 3d) / 8 */ \
\
/* pack the alternate pixels */ \
PACK_AND_STORE(a, b, diag1, diag2, (out) + 0); /* store top */ \
PACK_AND_STORE(c, d, diag2, diag1, (out) + 2 * 32); /* store bottom */ \
}
// Turn the macro into a function to reduce code-size when non-critical
static void Upsample32Pixels_SSE41(const uint8_t r1[], const uint8_t r2[],
uint8_t* const out) {
UPSAMPLE_32PIXELS(r1, r2, out);
}
#define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) { \
uint8_t r1[17], r2[17]; \
memcpy(r1, (tb), (num_pixels)); \
memcpy(r2, (bb), (num_pixels)); \
/* replicate last byte */ \
memset(r1 + (num_pixels), r1[(num_pixels) - 1], 17 - (num_pixels)); \
memset(r2 + (num_pixels), r2[(num_pixels) - 1], 17 - (num_pixels)); \
/* using the shared function instead of the macro saves ~3k code size */ \
Upsample32Pixels_SSE41(r1, r2, out); \
}
#define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, \
top_dst, bottom_dst, cur_x) do { \
FUNC##32_SSE41((top_y) + (cur_x), r_u, r_v, (top_dst) + (cur_x) * (XSTEP)); \
if ((bottom_y) != NULL) { \
FUNC##32_SSE41((bottom_y) + (cur_x), r_u + 64, r_v + 64, \
(bottom_dst) + (cur_x) * (XSTEP)); \
} \
} while (0)
#define SSE4_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
int uv_pos, pos; \
/* 16byte-aligned array to cache reconstructed u and v */ \
uint8_t uv_buf[14 * 32 + 15] = { 0 }; \
uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
uint8_t* const r_v = r_u + 32; \
\
assert(top_y != NULL); \
{ /* Treat the first pixel in regular way */ \
const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \
const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \
const int u0_t = (top_u[0] + u_diag) >> 1; \
const int v0_t = (top_v[0] + v_diag) >> 1; \
FUNC(top_y[0], u0_t, v0_t, top_dst); \
if (bottom_y != NULL) { \
const int u0_b = (cur_u[0] + u_diag) >> 1; \
const int v0_b = (cur_v[0] + v_diag) >> 1; \
FUNC(bottom_y[0], u0_b, v0_b, bottom_dst); \
} \
} \
/* For UPSAMPLE_32PIXELS, 17 u/v values must be readable for each block */ \
for (pos = 1, uv_pos = 0; pos + 32 + 1 <= len; pos += 32, uv_pos += 16) { \
UPSAMPLE_32PIXELS(top_u + uv_pos, cur_u + uv_pos, r_u); \
UPSAMPLE_32PIXELS(top_v + uv_pos, cur_v + uv_pos, r_v); \
CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, pos); \
} \
if (len > 1) { \
const int left_over = ((len + 1) >> 1) - (pos >> 1); \
uint8_t* const tmp_top_dst = r_u + 4 * 32; \
uint8_t* const tmp_bottom_dst = tmp_top_dst + 4 * 32; \
uint8_t* const tmp_top = tmp_bottom_dst + 4 * 32; \
uint8_t* const tmp_bottom = (bottom_y == NULL) ? NULL : tmp_top + 32; \
assert(left_over > 0); \
UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u); \
UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v); \
memcpy(tmp_top, top_y + pos, len - pos); \
if (bottom_y != NULL) memcpy(tmp_bottom, bottom_y + pos, len - pos); \
CONVERT2RGB_32(FUNC, XSTEP, tmp_top, tmp_bottom, tmp_top_dst, \
tmp_bottom_dst, 0); \
memcpy(top_dst + pos * (XSTEP), tmp_top_dst, (len - pos) * (XSTEP)); \
if (bottom_y != NULL) { \
memcpy(bottom_dst + pos * (XSTEP), tmp_bottom_dst, \
(len - pos) * (XSTEP)); \
} \
} \
}
// SSE4 variants of the fancy upsampler.
SSE4_UPSAMPLE_FUNC(UpsampleRgbLinePair_SSE41, VP8YuvToRgb, 3)
SSE4_UPSAMPLE_FUNC(UpsampleBgrLinePair_SSE41, VP8YuvToBgr, 3)
#undef GET_M
#undef PACK_AND_STORE
#undef UPSAMPLE_32PIXELS
#undef UPSAMPLE_LAST_BLOCK
#undef CONVERT2RGB
#undef CONVERT2RGB_32
#undef SSE4_UPSAMPLE_FUNC
#endif // WEBP_REDUCE_CSP
//------------------------------------------------------------------------------
// Entry point
extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
extern void WebPInitUpsamplersSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE41(void) {
#if !defined(WEBP_REDUCE_CSP)
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_SSE41;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_SSE41;
#endif // WEBP_REDUCE_CSP
}
#endif // FANCY_UPSAMPLING
//------------------------------------------------------------------------------
extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
extern void WebPInitYUV444ConvertersSSE41(void);
#define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \
extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len); \
static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
int i; \
const int max_len = len & ~31; \
for (i = 0; i < max_len; i += 32) { \
CALL(y + i, u + i, v + i, dst + i * (XSTEP)); \
} \
if (i < len) { /* C-fallback */ \
CALL_C(y + i, u + i, v + i, dst + i * (XSTEP), len - i); \
} \
}
#if !defined(WEBP_REDUCE_CSP)
YUV444_FUNC(Yuv444ToRgb_SSE41, VP8YuvToRgb32_SSE41, WebPYuv444ToRgb_C, 3);
YUV444_FUNC(Yuv444ToBgr_SSE41, VP8YuvToBgr32_SSE41, WebPYuv444ToBgr_C, 3);
#endif // WEBP_REDUCE_CSP
WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersSSE41(void) {
#if !defined(WEBP_REDUCE_CSP)
WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb_SSE41;
WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr_SSE41;
#endif // WEBP_REDUCE_CSP
}
#else
WEBP_DSP_INIT_STUB(WebPInitYUV444ConvertersSSE41)
#endif // WEBP_USE_SSE41
#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_SSE41))
WEBP_DSP_INIT_STUB(WebPInitUpsamplersSSE41)
#endif

@ -71,15 +71,11 @@ void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
WebPSamplerRowFunc WebPSamplers[MODE_LAST];
extern void WebPInitSamplersSSE2(void);
extern void WebPInitSamplersSSE41(void);
extern void WebPInitSamplersMIPS32(void);
extern void WebPInitSamplersMIPSdspR2(void);
static volatile VP8CPUInfo yuv_last_cpuinfo_used =
(VP8CPUInfo)&yuv_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) {
if (yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(WebPInitSamplers) {
WebPSamplers[MODE_RGB] = YuvToRgbRow;
WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
WebPSamplers[MODE_BGR] = YuvToBgrRow;
@ -99,6 +95,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) {
WebPInitSamplersSSE2();
}
#endif // WEBP_USE_SSE2
#if defined(WEBP_USE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitSamplersSSE41();
}
#endif // WEBP_USE_SSE41
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
WebPInitSamplersMIPS32();
@ -110,7 +111,6 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) {
}
#endif // WEBP_USE_MIPS_DSP_R2
}
yuv_last_cpuinfo_used = VP8GetCPUInfo;
}
//-----------------------------------------------------------------------------
@ -254,17 +254,13 @@ void (*WebPSharpYUVUpdateRGB)(const int16_t* ref, const int16_t* src,
void (*WebPSharpYUVFilterRow)(const int16_t* A, const int16_t* B, int len,
const uint16_t* best_y, uint16_t* out);
static volatile VP8CPUInfo rgba_to_yuv_last_cpuinfo_used =
(VP8CPUInfo)&rgba_to_yuv_last_cpuinfo_used;
extern void WebPInitConvertARGBToYUVSSE2(void);
extern void WebPInitConvertARGBToYUVSSE41(void);
extern void WebPInitConvertARGBToYUVNEON(void);
extern void WebPInitSharpYUVSSE2(void);
extern void WebPInitSharpYUVNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
if (rgba_to_yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
WebPConvertARGBToY = ConvertARGBToY_C;
WebPConvertARGBToUV = WebPConvertARGBToUV_C;
@ -286,6 +282,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
WebPInitSharpYUVSSE2();
}
#endif // WEBP_USE_SSE2
#if defined(WEBP_USE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitConvertARGBToYUVSSE41();
}
#endif // WEBP_USE_SSE41
}
#if defined(WEBP_USE_NEON)
@ -304,6 +305,4 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
assert(WebPSharpYUVUpdateY != NULL);
assert(WebPSharpYUVUpdateRGB != NULL);
assert(WebPSharpYUVFilterRow != NULL);
rgba_to_yuv_last_cpuinfo_used = VP8GetCPUInfo;
}

@ -166,6 +166,19 @@ void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
#endif // WEBP_USE_SSE2
//-----------------------------------------------------------------------------
// SSE41 extra functions (mostly for upsampling_sse41.c)
#if defined(WEBP_USE_SSE41)
// Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
#endif // WEBP_USE_SSE41
//------------------------------------------------------------------------------
// RGB -> YUV conversion

@ -180,7 +180,7 @@ static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
// Repeat the same permutations twice more:
// r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
// r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
VP8PlanarTo24b(in0, in1, in2, in3, in4, in5);
VP8PlanarTo24b_SSE2(in0, in1, in2, in3, in4, in5);
_mm_storeu_si128((__m128i*)(rgb + 0), *in0);
_mm_storeu_si128((__m128i*)(rgb + 16), *in1);
@ -492,7 +492,7 @@ static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb,
__m128i a1 = LOAD_16(argb + 4);
__m128i a2 = LOAD_16(argb + 8);
__m128i a3 = LOAD_16(argb + 12);
VP8L32bToPlanar(&a0, &a1, &a2, &a3);
VP8L32bToPlanar_SSE2(&a0, &a1, &a2, &a3);
rgb[0] = _mm_unpacklo_epi8(a1, zero);
rgb[1] = _mm_unpackhi_epi8(a1, zero);
rgb[2] = _mm_unpacklo_epi8(a2, zero);

@ -0,0 +1,613 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// YUV->RGB conversion functions
//
// Author: Skal (pascal.massimino@gmail.com)
#include "src/dsp/yuv.h"
#if defined(WEBP_USE_SSE41)
#include "src/dsp/common_sse41.h"
#include <stdlib.h>
#include <smmintrin.h>
//-----------------------------------------------------------------------------
// Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
// These constants are 14b fixed-point version of ITU-R BT.601 constants.
// R = (19077 * y + 26149 * v - 14234) >> 6
// G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6
// B = (19077 * y + 33050 * u - 17685) >> 6
static void ConvertYUV444ToRGB_SSE41(const __m128i* const Y0,
const __m128i* const U0,
const __m128i* const V0,
__m128i* const R,
__m128i* const G,
__m128i* const B) {
const __m128i k19077 = _mm_set1_epi16(19077);
const __m128i k26149 = _mm_set1_epi16(26149);
const __m128i k14234 = _mm_set1_epi16(14234);
// 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
const __m128i k33050 = _mm_set1_epi16((short)33050);
const __m128i k17685 = _mm_set1_epi16(17685);
const __m128i k6419 = _mm_set1_epi16(6419);
const __m128i k13320 = _mm_set1_epi16(13320);
const __m128i k8708 = _mm_set1_epi16(8708);
const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077);
const __m128i R0 = _mm_mulhi_epu16(*V0, k26149);
const __m128i R1 = _mm_sub_epi16(Y1, k14234);
const __m128i R2 = _mm_add_epi16(R1, R0);
const __m128i G0 = _mm_mulhi_epu16(*U0, k6419);
const __m128i G1 = _mm_mulhi_epu16(*V0, k13320);
const __m128i G2 = _mm_add_epi16(Y1, k8708);
const __m128i G3 = _mm_add_epi16(G0, G1);
const __m128i G4 = _mm_sub_epi16(G2, G3);
// be careful with the saturated *unsigned* arithmetic here!
const __m128i B0 = _mm_mulhi_epu16(*U0, k33050);
const __m128i B1 = _mm_adds_epu16(B0, Y1);
const __m128i B2 = _mm_subs_epu16(B1, k17685);
// use logical shift for B2, which can be larger than 32767
*R = _mm_srai_epi16(R2, 6); // range: [-14234, 30815]
*G = _mm_srai_epi16(G4, 6); // range: [-10953, 27710]
*B = _mm_srli_epi16(B2, 6); // range: [0, 34238]
}
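// Scalar sketch of the same fixed-point math (illustration only): the
// constants are roughly round(c * (1 << 14)) for the BT.601 coefficients
// c = 1.164, 1.596, 0.392, 0.813 and 2.017, and the inputs are pre-shifted
// left by 8, so _mm_mulhi_epu16(x << 8, k) equals (x * k) >> 8.
static WEBP_INLINE void YuvToRgbScalarSketch(int y, int u, int v,
                                             int* const r, int* const g,
                                             int* const b) {
  const int y1 = (y * 19077) >> 8;                          // ~1.164 * y
  *r = (y1 + ((v * 26149) >> 8) - 14234) >> 6;
  *g = (y1 - ((u * 6419) >> 8) - ((v * 13320) >> 8) + 8708) >> 6;
  *b = (y1 + ((u * 33050) >> 8) - 17685) >> 6;
  // The vector code additionally clamps the results to [0, 255] through
  // saturated arithmetic and _mm_packus_epi16; this sketch leaves them
  // unclamped.
}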
// Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
static WEBP_INLINE __m128i Load_HI_16_SSE41(const uint8_t* src) {
const __m128i zero = _mm_setzero_si128();
return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
}
// Load and replicate the U/V samples
static WEBP_INLINE __m128i Load_UV_HI_8_SSE41(const uint8_t* src) {
const __m128i zero = _mm_setzero_si128();
const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src);
const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
return _mm_unpacklo_epi16(tmp1, tmp1); // replicate samples
}
// Convert 32 samples of YUV444 to R/G/B
static void YUV444ToRGB_SSE41(const uint8_t* const y,
const uint8_t* const u,
const uint8_t* const v,
__m128i* const R, __m128i* const G,
__m128i* const B) {
const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_HI_16_SSE41(u),
V0 = Load_HI_16_SSE41(v);
ConvertYUV444ToRGB_SSE41(&Y0, &U0, &V0, R, G, B);
}
// Convert 32 samples of YUV420 to R/G/B
static void YUV420ToRGB_SSE41(const uint8_t* const y,
const uint8_t* const u,
const uint8_t* const v,
__m128i* const R, __m128i* const G,
__m128i* const B) {
const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_UV_HI_8_SSE41(u),
V0 = Load_UV_HI_8_SSE41(v);
ConvertYUV444ToRGB_SSE41(&Y0, &U0, &V0, R, G, B);
}
// Pack the planar buffers
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
static WEBP_INLINE void PlanarTo24b_SSE41(
__m128i* const in0, __m128i* const in1, __m128i* const in2,
__m128i* const in3, __m128i* const in4, __m128i* const in5,
uint8_t* const rgb) {
// The input is 6 registers of sixteen 8b values, but for the sake of explanation,
// let's take 6 registers of four 8b values.
// To pack, we will keep taking one every two 8b integer and move it
// around as follows:
// Input:
// r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7
// Split the 6 registers in two sets of 3 registers: the first set as the even
// 8b bytes, the second the odd ones:
// r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7
// Repeat the same permutations twice more:
// r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
// r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
VP8PlanarTo24b_SSE41(in0, in1, in2, in3, in4, in5);
_mm_storeu_si128((__m128i*)(rgb + 0), *in0);
_mm_storeu_si128((__m128i*)(rgb + 16), *in1);
_mm_storeu_si128((__m128i*)(rgb + 32), *in2);
_mm_storeu_si128((__m128i*)(rgb + 48), *in3);
_mm_storeu_si128((__m128i*)(rgb + 64), *in4);
_mm_storeu_si128((__m128i*)(rgb + 80), *in5);
}
void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
YUV444ToRGB_SSE41(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV444ToRGB_SSE41(y + 8, u + 8, v + 8, &R1, &G1, &B1);
YUV444ToRGB_SSE41(y + 16, u + 16, v + 16, &R2, &G2, &B2);
YUV444ToRGB_SSE41(y + 24, u + 24, v + 24, &R3, &G3, &B3);
// Cast to 8b and store as RRRRGGGGBBBB.
rgb0 = _mm_packus_epi16(R0, R1);
rgb1 = _mm_packus_epi16(R2, R3);
rgb2 = _mm_packus_epi16(G0, G1);
rgb3 = _mm_packus_epi16(G2, G3);
rgb4 = _mm_packus_epi16(B0, B1);
rgb5 = _mm_packus_epi16(B2, B3);
// Pack as RGBRGBRGBRGB.
PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
}
void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
YUV444ToRGB_SSE41(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV444ToRGB_SSE41(y + 8, u + 8, v + 8, &R1, &G1, &B1);
YUV444ToRGB_SSE41(y + 16, u + 16, v + 16, &R2, &G2, &B2);
YUV444ToRGB_SSE41(y + 24, u + 24, v + 24, &R3, &G3, &B3);
// Cast to 8b and store as BBBBGGGGRRRR.
bgr0 = _mm_packus_epi16(B0, B1);
bgr1 = _mm_packus_epi16(B2, B3);
bgr2 = _mm_packus_epi16(G0, G1);
bgr3 = _mm_packus_epi16(G2, G3);
bgr4 = _mm_packus_epi16(R0, R1);
bgr5 = _mm_packus_epi16(R2, R3);
// Pack as BGRBGRBGRBGR.
PlanarTo24b_SSE41(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
}
//-----------------------------------------------------------------------------
// Arbitrary-length row conversion functions
static void YuvToRgbRow_SSE41(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
int n;
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
YUV420ToRGB_SSE41(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV420ToRGB_SSE41(y + 8, u + 4, v + 4, &R1, &G1, &B1);
YUV420ToRGB_SSE41(y + 16, u + 8, v + 8, &R2, &G2, &B2);
YUV420ToRGB_SSE41(y + 24, u + 12, v + 12, &R3, &G3, &B3);
// Cast to 8b and store as RRRRGGGGBBBB.
rgb0 = _mm_packus_epi16(R0, R1);
rgb1 = _mm_packus_epi16(R2, R3);
rgb2 = _mm_packus_epi16(G0, G1);
rgb3 = _mm_packus_epi16(G2, G3);
rgb4 = _mm_packus_epi16(B0, B1);
rgb5 = _mm_packus_epi16(B2, B3);
// Pack as RGBRGBRGBRGB.
PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
y += 32;
u += 16;
v += 16;
}
for (; n < len; ++n) { // Finish off
VP8YuvToRgb(y[0], u[0], v[0], dst);
dst += 3;
y += 1;
u += (n & 1);
v += (n & 1);
}
}
static void YuvToBgrRow_SSE41(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
int n;
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
YUV420ToRGB_SSE41(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV420ToRGB_SSE41(y + 8, u + 4, v + 4, &R1, &G1, &B1);
YUV420ToRGB_SSE41(y + 16, u + 8, v + 8, &R2, &G2, &B2);
YUV420ToRGB_SSE41(y + 24, u + 12, v + 12, &R3, &G3, &B3);
// Cast to 8b and store as BBBBGGGGRRRR.
bgr0 = _mm_packus_epi16(B0, B1);
bgr1 = _mm_packus_epi16(B2, B3);
bgr2 = _mm_packus_epi16(G0, G1);
bgr3 = _mm_packus_epi16(G2, G3);
bgr4 = _mm_packus_epi16(R0, R1);
bgr5 = _mm_packus_epi16(R2, R3);
// Pack as BGRBGRBGRBGR.
PlanarTo24b_SSE41(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
y += 32;
u += 16;
v += 16;
}
for (; n < len; ++n) { // Finish off
VP8YuvToBgr(y[0], u[0], v[0], dst);
dst += 3;
y += 1;
u += (n & 1);
v += (n & 1);
}
}
//------------------------------------------------------------------------------
// Entry point
extern void WebPInitSamplersSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE41(void) {
WebPSamplers[MODE_RGB] = YuvToRgbRow_SSE41;
WebPSamplers[MODE_BGR] = YuvToBgrRow_SSE41;
}
//------------------------------------------------------------------------------
// RGB24/32 -> YUV converters
// Load eight 16b-words from *src.
#define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src))
// Store eight 16b-words into *dst.
#define STORE_16(V, dst) _mm_storeu_si128((__m128i*)(dst), (V))
#define WEBP_SSE41_SHUFF(OUT) do { \
const __m128i tmp0 = _mm_shuffle_epi8(A0, shuff0); \
const __m128i tmp1 = _mm_shuffle_epi8(A1, shuff1); \
const __m128i tmp2 = _mm_shuffle_epi8(A2, shuff2); \
const __m128i tmp3 = _mm_shuffle_epi8(A3, shuff0); \
const __m128i tmp4 = _mm_shuffle_epi8(A4, shuff1); \
const __m128i tmp5 = _mm_shuffle_epi8(A5, shuff2); \
\
/* OR everything to get one channel */ \
const __m128i tmp6 = _mm_or_si128(tmp0, tmp1); \
const __m128i tmp7 = _mm_or_si128(tmp3, tmp4); \
out[OUT + 0] = _mm_or_si128(tmp6, tmp2); \
out[OUT + 1] = _mm_or_si128(tmp7, tmp5); \
} while (0);
// Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// Similar to PlanarTo24bHelper(), but in reverse order.
static WEBP_INLINE void RGB24PackedToPlanar_SSE41(
const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
const __m128i A0 = _mm_loadu_si128((const __m128i*)(rgb + 0));
const __m128i A1 = _mm_loadu_si128((const __m128i*)(rgb + 16));
const __m128i A2 = _mm_loadu_si128((const __m128i*)(rgb + 32));
const __m128i A3 = _mm_loadu_si128((const __m128i*)(rgb + 48));
const __m128i A4 = _mm_loadu_si128((const __m128i*)(rgb + 64));
const __m128i A5 = _mm_loadu_si128((const __m128i*)(rgb + 80));
// Compute RR.
{
const __m128i shuff0 = _mm_set_epi8(
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0);
const __m128i shuff1 = _mm_set_epi8(
-1, -1, -1, -1, -1, 14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1);
const __m128i shuff2 = _mm_set_epi8(
13, 10, 7, 4, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
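// shuff0 keeps bytes 0,3,...,15 of A0 (r0..r5), shuff1 bytes 2,5,...,14 of
// A1 (r6..r10) and shuff2 bytes 1,4,...,13 of A2 (r11..r15); OR-ing the
// three shuffled registers yields 16 consecutive R values (A3..A5 feed the
// second output register with r16..r31 the same way).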
WEBP_SSE41_SHUFF(0)
}
// Compute GG.
{
const __m128i shuff0 = _mm_set_epi8(
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1);
const __m128i shuff1 = _mm_set_epi8(
-1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1);
const __m128i shuff2 = _mm_set_epi8(
14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
WEBP_SSE41_SHUFF(2)
}
// Compute BB.
{
const __m128i shuff0 = _mm_set_epi8(
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 14, 11, 8, 5, 2);
const __m128i shuff1 = _mm_set_epi8(
-1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1, -1, -1, -1, -1, -1);
const __m128i shuff2 = _mm_set_epi8(
15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
WEBP_SSE41_SHUFF(4)
}
}
#undef WEBP_SSE41_SHUFF
// Convert 8 packed ARGB to r[], g[], b[]
static WEBP_INLINE void RGB32PackedToPlanar_SSE41(
const uint32_t* const argb, __m128i* const rgb /*in[6]*/) {
const __m128i zero = _mm_setzero_si128();
__m128i a0 = LOAD_16(argb + 0);
__m128i a1 = LOAD_16(argb + 4);
__m128i a2 = LOAD_16(argb + 8);
__m128i a3 = LOAD_16(argb + 12);
VP8L32bToPlanar_SSE41(&a0, &a1, &a2, &a3);
rgb[0] = _mm_unpacklo_epi8(a1, zero);
rgb[1] = _mm_unpackhi_epi8(a1, zero);
rgb[2] = _mm_unpacklo_epi8(a2, zero);
rgb[3] = _mm_unpackhi_epi8(a2, zero);
rgb[4] = _mm_unpacklo_epi8(a3, zero);
rgb[5] = _mm_unpackhi_epi8(a3, zero);
}
// This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX
// It's a macro and not a function because we need to pass immediate values
// (e.g. DESCALE_FIX) to srai_epi32.
#define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB, \
ROUNDER, DESCALE_FIX, OUT) do { \
const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG); \
const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG); \
const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB); \
const __m128i V1_hi = _mm_madd_epi16(GB_HI, MULT_GB); \
const __m128i V2_lo = _mm_add_epi32(V0_lo, V1_lo); \
const __m128i V2_hi = _mm_add_epi32(V0_hi, V1_hi); \
const __m128i V3_lo = _mm_add_epi32(V2_lo, ROUNDER); \
const __m128i V3_hi = _mm_add_epi32(V2_hi, ROUNDER); \
const __m128i V5_lo = _mm_srai_epi32(V3_lo, DESCALE_FIX); \
const __m128i V5_hi = _mm_srai_epi32(V3_hi, DESCALE_FIX); \
(OUT) = _mm_packs_epi32(V5_lo, V5_hi); \
} while (0)
#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
static WEBP_INLINE void ConvertRGBToY_SSE41(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
__m128i* const Y) {
const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
const __m128i kGB_y = MK_CST_16(16384, 6420);
const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);
const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
}
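// In scalar terms: Y = (16839 * R + 33059 * G + 6420 * B + (16 << YUV_FIX) +
// YUV_HALF) >> YUV_FIX, i.e. the usual BT.601 luma 0.257 R + 0.504 G +
// 0.098 B + 16. The G coefficient is split as (33059 - 16384) + 16384 across
// the two madd constants so that each 16-bit multiplicand stays within the
// signed range expected by _mm_madd_epi16.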
static WEBP_INLINE void ConvertRGBToUV_SSE41(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
__m128i* const U,
__m128i* const V) {
const __m128i kRG_u = MK_CST_16(-9719, -19081);
const __m128i kGB_u = MK_CST_16(0, 28800);
const __m128i kRG_v = MK_CST_16(28800, 0);
const __m128i kGB_v = MK_CST_16(-24116, -4684);
const __m128i kHALF_UV = _mm_set1_epi32(((128 << YUV_FIX) + YUV_HALF) << 2);
const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u,
kHALF_UV, YUV_FIX + 2, *U);
TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_v, kGB_v,
kHALF_UV, YUV_FIX + 2, *V);
}
#undef MK_CST_16
#undef TRANSFORM
static void ConvertRGB24ToY_SSE41(const uint8_t* rgb, uint8_t* y, int width) {
const int max_width = width & ~31;
int i;
for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
__m128i rgb_plane[6];
int j;
RGB24PackedToPlanar_SSE41(rgb, rgb_plane);
for (j = 0; j < 2; ++j, i += 16) {
const __m128i zero = _mm_setzero_si128();
__m128i r, g, b, Y0, Y1;
// Convert to 16-bit Y.
r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero);
g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero);
ConvertRGBToY_SSE41(&r, &g, &b, &Y0);
// Convert to 16-bit Y.
r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero);
g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero);
ConvertRGBToY_SSE41(&r, &g, &b, &Y1);
// Cast to 8-bit and store.
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
}
}
for (; i < width; ++i, rgb += 3) { // left-over
y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
}
}
static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) {
const int max_width = width & ~31;
int i;
for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
__m128i bgr_plane[6];
int j;
RGB24PackedToPlanar_SSE41(bgr, bgr_plane);
for (j = 0; j < 2; ++j, i += 16) {
const __m128i zero = _mm_setzero_si128();
__m128i r, g, b, Y0, Y1;
// Convert to 16-bit Y.
b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero);
g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero);
r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero);
ConvertRGBToY_SSE41(&r, &g, &b, &Y0);
// Convert to 16-bit Y.
b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero);
g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero);
r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero);
ConvertRGBToY_SSE41(&r, &g, &b, &Y1);
// Cast to 8-bit and store.
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
}
}
for (; i < width; ++i, bgr += 3) { // left-over
y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
}
}
static void ConvertARGBToY_SSE41(const uint32_t* argb, uint8_t* y, int width) {
const int max_width = width & ~15;
int i;
for (i = 0; i < max_width; i += 16) {
__m128i Y0, Y1, rgb[6];
RGB32PackedToPlanar_SSE41(&argb[i], rgb);
ConvertRGBToY_SSE41(&rgb[0], &rgb[2], &rgb[4], &Y0);
ConvertRGBToY_SSE41(&rgb[1], &rgb[3], &rgb[5], &Y1);
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
}
for (; i < width; ++i) { // left-over
const uint32_t p = argb[i];
y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,
YUV_HALF);
}
}
// Horizontal add (doubled) of two 16b values, result is 16b.
// in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
static void HorizontalAddPack_SSE41(const __m128i* const A,
const __m128i* const B,
__m128i* const out) {
const __m128i k2 = _mm_set1_epi16(2);
const __m128i C = _mm_madd_epi16(*A, k2);
const __m128i D = _mm_madd_epi16(*B, k2);
*out = _mm_packs_epi32(C, D);
}
static void ConvertARGBToUV_SSE41(const uint32_t* argb,
uint8_t* u, uint8_t* v,
int src_width, int do_store) {
const int max_width = src_width & ~31;
int i;
for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
__m128i rgb[6], U0, V0, U1, V1;
RGB32PackedToPlanar_SSE41(&argb[i], rgb);
HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]);
HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]);
HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]);
ConvertRGBToUV_SSE41(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
RGB32PackedToPlanar_SSE41(&argb[i + 16], rgb);
HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]);
HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]);
HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]);
ConvertRGBToUV_SSE41(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
U0 = _mm_packus_epi16(U0, U1);
V0 = _mm_packus_epi16(V0, V1);
if (!do_store) {
const __m128i prev_u = LOAD_16(u);
const __m128i prev_v = LOAD_16(v);
U0 = _mm_avg_epu8(U0, prev_u);
V0 = _mm_avg_epu8(V0, prev_v);
}
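// When do_store is 0 the result is averaged with the chroma written on the
// previous row's pass, i.e. the vertical half of the 4:2:0 downsampling.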
STORE_16(U0, u);
STORE_16(V0, v);
}
if (i < src_width) { // left-over
WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
}
}
// Convert 16 packed ARGB 16b-values to r[], g[], b[]
static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41(
const uint16_t* const rgbx,
__m128i* const r, __m128i* const g, __m128i* const b) {
const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x
const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x
const __m128i in2 = LOAD_16(rgbx + 16); // r4 | ...
const __m128i in3 = LOAD_16(rgbx + 24); // r6 | ...
// aarrggbb as 16-bit.
const __m128i shuff0 =
_mm_set_epi8(-1, -1, -1, -1, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
const __m128i shuff1 =
_mm_set_epi8(13, 12, 5, 4, -1, -1, -1, -1, 11, 10, 3, 2, 9, 8, 1, 0);
const __m128i A0 = _mm_shuffle_epi8(in0, shuff0);
const __m128i A1 = _mm_shuffle_epi8(in1, shuff1);
const __m128i A2 = _mm_shuffle_epi8(in2, shuff0);
const __m128i A3 = _mm_shuffle_epi8(in3, shuff1);
// R0R1G0G1
// B0B1****
// R2R3G2G3
// B2B3****
// (OR is used to free port 5 for the unpack)
const __m128i B0 = _mm_unpacklo_epi32(A0, A1);
const __m128i B1 = _mm_or_si128(A0, A1);
const __m128i B2 = _mm_unpacklo_epi32(A2, A3);
const __m128i B3 = _mm_or_si128(A2, A3);
// Gather the channels.
*r = _mm_unpacklo_epi64(B0, B2);
*g = _mm_unpackhi_epi64(B0, B2);
*b = _mm_unpackhi_epi64(B1, B3);
}
static void ConvertRGBA32ToUV_SSE41(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width) {
const int max_width = width & ~15;
const uint16_t* const last_rgb = rgb + 4 * max_width;
while (rgb < last_rgb) {
__m128i r, g, b, U0, V0, U1, V1;
RGBA32PackedToPlanar_16b_SSE41(rgb + 0, &r, &g, &b);
ConvertRGBToUV_SSE41(&r, &g, &b, &U0, &V0);
RGBA32PackedToPlanar_16b_SSE41(rgb + 32, &r, &g, &b);
ConvertRGBToUV_SSE41(&r, &g, &b, &U1, &V1);
STORE_16(_mm_packus_epi16(U0, U1), u);
STORE_16(_mm_packus_epi16(V0, V1), v);
u += 16;
v += 16;
rgb += 2 * 32;
}
if (max_width < width) { // left-over
WebPConvertRGBA32ToUV_C(rgb, u, v, width - max_width);
}
}
//------------------------------------------------------------------------------
extern void WebPInitConvertARGBToYUVSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE41(void) {
WebPConvertARGBToY = ConvertARGBToY_SSE41;
WebPConvertARGBToUV = ConvertARGBToUV_SSE41;
WebPConvertRGB24ToY = ConvertRGB24ToY_SSE41;
WebPConvertBGR24ToY = ConvertBGR24ToY_SSE41;
WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE41;
}
//------------------------------------------------------------------------------
#else // !WEBP_USE_SSE41
WEBP_DSP_INIT_STUB(WebPInitSamplersSSE41)
WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE41)
#endif // WEBP_USE_SSE41

@ -361,7 +361,8 @@ static int EncodeAlpha(VP8Encoder* const enc,
//------------------------------------------------------------------------------
// Main calls
static int CompressAlphaJob(VP8Encoder* const enc, void* dummy) {
static int CompressAlphaJob(void* arg1, void* dummy) {
VP8Encoder* const enc = (VP8Encoder*)arg1;
const WebPConfig* config = enc->config_;
uint8_t* alpha_data = NULL;
size_t alpha_size = 0;
@ -394,7 +395,7 @@ void VP8EncInitAlpha(VP8Encoder* const enc) {
WebPGetWorkerInterface()->Init(worker);
worker->data1 = enc;
worker->data2 = NULL;
worker->hook = (WebPWorkerHook)CompressAlphaJob;
worker->hook = CompressAlphaJob;
}
}

@ -434,7 +434,9 @@ typedef struct {
} SegmentJob;
// main work call
static int DoSegmentsJob(SegmentJob* const job, VP8EncIterator* const it) {
static int DoSegmentsJob(void* arg1, void* arg2) {
SegmentJob* const job = (SegmentJob*)arg1;
VP8EncIterator* const it = (VP8EncIterator*)arg2;
int ok = 1;
if (!VP8IteratorIsDone(it)) {
uint8_t tmp[32 + WEBP_ALIGN_CST];
@ -462,7 +464,7 @@ static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job,
WebPGetWorkerInterface()->Init(&job->worker);
job->worker.data1 = job;
job->worker.data2 = &job->it;
job->worker.hook = (WebPWorkerHook)DoSegmentsJob;
job->worker.hook = DoSegmentsJob;
VP8IteratorInit(enc, &job->it);
VP8IteratorSetRow(&job->it, start_row);
VP8IteratorSetCountDown(&job->it, (end_row - start_row) * enc->mb_w_);

@ -1,455 +0,0 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Author: Mislav Bradac (mislavm@google.com)
//
#include "src/enc/delta_palettization_enc.h"
#ifdef WEBP_EXPERIMENTAL_FEATURES
#include "src/webp/types.h"
#include "src/dsp/lossless.h"
#define MK_COL(r, g, b) (((r) << 16) + ((g) << 8) + (b))
// Format allows palette up to 256 entries, but more palette entries produce
// bigger entropy. In the future it will probably be useful to add more entries
// that are far from the origin of the palette or choose remaining entries
// dynamically.
#define DELTA_PALETTE_SIZE 226
// Palette used for delta_palettization. Entries are roughly sorted by distance
// of their signed equivalents from the origin.
static const uint32_t kDeltaPalette[DELTA_PALETTE_SIZE] = {
MK_COL(0u, 0u, 0u),
MK_COL(255u, 255u, 255u),
MK_COL(1u, 1u, 1u),
MK_COL(254u, 254u, 254u),
MK_COL(2u, 2u, 2u),
MK_COL(4u, 4u, 4u),
MK_COL(252u, 252u, 252u),
MK_COL(250u, 0u, 0u),
MK_COL(0u, 250u, 0u),
MK_COL(0u, 0u, 250u),
MK_COL(6u, 0u, 0u),
MK_COL(0u, 6u, 0u),
MK_COL(0u, 0u, 6u),
MK_COL(0u, 0u, 248u),
MK_COL(0u, 0u, 8u),
MK_COL(0u, 248u, 0u),
MK_COL(0u, 248u, 248u),
MK_COL(0u, 248u, 8u),
MK_COL(0u, 8u, 0u),
MK_COL(0u, 8u, 248u),
MK_COL(0u, 8u, 8u),
MK_COL(8u, 8u, 8u),
MK_COL(248u, 0u, 0u),
MK_COL(248u, 0u, 248u),
MK_COL(248u, 0u, 8u),
MK_COL(248u, 248u, 0u),
MK_COL(248u, 8u, 0u),
MK_COL(8u, 0u, 0u),
MK_COL(8u, 0u, 248u),
MK_COL(8u, 0u, 8u),
MK_COL(8u, 248u, 0u),
MK_COL(8u, 8u, 0u),
MK_COL(23u, 23u, 23u),
MK_COL(13u, 13u, 13u),
MK_COL(232u, 232u, 232u),
MK_COL(244u, 244u, 244u),
MK_COL(245u, 245u, 250u),
MK_COL(50u, 50u, 50u),
MK_COL(204u, 204u, 204u),
MK_COL(236u, 236u, 236u),
MK_COL(16u, 16u, 16u),
MK_COL(240u, 16u, 16u),
MK_COL(16u, 240u, 16u),
MK_COL(240u, 240u, 16u),
MK_COL(16u, 16u, 240u),
MK_COL(240u, 16u, 240u),
MK_COL(16u, 240u, 240u),
MK_COL(240u, 240u, 240u),
MK_COL(0u, 0u, 232u),
MK_COL(0u, 232u, 0u),
MK_COL(232u, 0u, 0u),
MK_COL(0u, 0u, 24u),
MK_COL(0u, 24u, 0u),
MK_COL(24u, 0u, 0u),
MK_COL(32u, 32u, 32u),
MK_COL(224u, 32u, 32u),
MK_COL(32u, 224u, 32u),
MK_COL(224u, 224u, 32u),
MK_COL(32u, 32u, 224u),
MK_COL(224u, 32u, 224u),
MK_COL(32u, 224u, 224u),
MK_COL(224u, 224u, 224u),
MK_COL(0u, 0u, 176u),
MK_COL(0u, 0u, 80u),
MK_COL(0u, 176u, 0u),
MK_COL(0u, 176u, 176u),
MK_COL(0u, 176u, 80u),
MK_COL(0u, 80u, 0u),
MK_COL(0u, 80u, 176u),
MK_COL(0u, 80u, 80u),
MK_COL(176u, 0u, 0u),
MK_COL(176u, 0u, 176u),
MK_COL(176u, 0u, 80u),
MK_COL(176u, 176u, 0u),
MK_COL(176u, 80u, 0u),
MK_COL(80u, 0u, 0u),
MK_COL(80u, 0u, 176u),
MK_COL(80u, 0u, 80u),
MK_COL(80u, 176u, 0u),
MK_COL(80u, 80u, 0u),
MK_COL(0u, 0u, 152u),
MK_COL(0u, 0u, 104u),
MK_COL(0u, 152u, 0u),
MK_COL(0u, 152u, 152u),
MK_COL(0u, 152u, 104u),
MK_COL(0u, 104u, 0u),
MK_COL(0u, 104u, 152u),
MK_COL(0u, 104u, 104u),
MK_COL(152u, 0u, 0u),
MK_COL(152u, 0u, 152u),
MK_COL(152u, 0u, 104u),
MK_COL(152u, 152u, 0u),
MK_COL(152u, 104u, 0u),
MK_COL(104u, 0u, 0u),
MK_COL(104u, 0u, 152u),
MK_COL(104u, 0u, 104u),
MK_COL(104u, 152u, 0u),
MK_COL(104u, 104u, 0u),
MK_COL(216u, 216u, 216u),
MK_COL(216u, 216u, 40u),
MK_COL(216u, 216u, 176u),
MK_COL(216u, 216u, 80u),
MK_COL(216u, 40u, 216u),
MK_COL(216u, 40u, 40u),
MK_COL(216u, 40u, 176u),
MK_COL(216u, 40u, 80u),
MK_COL(216u, 176u, 216u),
MK_COL(216u, 176u, 40u),
MK_COL(216u, 176u, 176u),
MK_COL(216u, 176u, 80u),
MK_COL(216u, 80u, 216u),
MK_COL(216u, 80u, 40u),
MK_COL(216u, 80u, 176u),
MK_COL(216u, 80u, 80u),
MK_COL(40u, 216u, 216u),
MK_COL(40u, 216u, 40u),
MK_COL(40u, 216u, 176u),
MK_COL(40u, 216u, 80u),
MK_COL(40u, 40u, 216u),
MK_COL(40u, 40u, 40u),
MK_COL(40u, 40u, 176u),
MK_COL(40u, 40u, 80u),
MK_COL(40u, 176u, 216u),
MK_COL(40u, 176u, 40u),
MK_COL(40u, 176u, 176u),
MK_COL(40u, 176u, 80u),
MK_COL(40u, 80u, 216u),
MK_COL(40u, 80u, 40u),
MK_COL(40u, 80u, 176u),
MK_COL(40u, 80u, 80u),
MK_COL(80u, 216u, 216u),
MK_COL(80u, 216u, 40u),
MK_COL(80u, 216u, 176u),
MK_COL(80u, 216u, 80u),
MK_COL(80u, 40u, 216u),
MK_COL(80u, 40u, 40u),
MK_COL(80u, 40u, 176u),
MK_COL(80u, 40u, 80u),
MK_COL(80u, 176u, 216u),
MK_COL(80u, 176u, 40u),
MK_COL(80u, 176u, 176u),
MK_COL(80u, 176u, 80u),
MK_COL(80u, 80u, 216u),
MK_COL(80u, 80u, 40u),
MK_COL(80u, 80u, 176u),
MK_COL(80u, 80u, 80u),
MK_COL(0u, 0u, 192u),
MK_COL(0u, 0u, 64u),
MK_COL(0u, 0u, 128u),
MK_COL(0u, 192u, 0u),
MK_COL(0u, 192u, 192u),
MK_COL(0u, 192u, 64u),
MK_COL(0u, 192u, 128u),
MK_COL(0u, 64u, 0u),
MK_COL(0u, 64u, 192u),
MK_COL(0u, 64u, 64u),
MK_COL(0u, 64u, 128u),
MK_COL(0u, 128u, 0u),
MK_COL(0u, 128u, 192u),
MK_COL(0u, 128u, 64u),
MK_COL(0u, 128u, 128u),
MK_COL(176u, 216u, 216u),
MK_COL(176u, 216u, 40u),
MK_COL(176u, 216u, 176u),
MK_COL(176u, 216u, 80u),
MK_COL(176u, 40u, 216u),
MK_COL(176u, 40u, 40u),
MK_COL(176u, 40u, 176u),
MK_COL(176u, 40u, 80u),
MK_COL(176u, 176u, 216u),
MK_COL(176u, 176u, 40u),
MK_COL(176u, 176u, 176u),
MK_COL(176u, 176u, 80u),
MK_COL(176u, 80u, 216u),
MK_COL(176u, 80u, 40u),
MK_COL(176u, 80u, 176u),
MK_COL(176u, 80u, 80u),
MK_COL(192u, 0u, 0u),
MK_COL(192u, 0u, 192u),
MK_COL(192u, 0u, 64u),
MK_COL(192u, 0u, 128u),
MK_COL(192u, 192u, 0u),
MK_COL(192u, 192u, 192u),
MK_COL(192u, 192u, 64u),
MK_COL(192u, 192u, 128u),
MK_COL(192u, 64u, 0u),
MK_COL(192u, 64u, 192u),
MK_COL(192u, 64u, 64u),
MK_COL(192u, 64u, 128u),
MK_COL(192u, 128u, 0u),
MK_COL(192u, 128u, 192u),
MK_COL(192u, 128u, 64u),
MK_COL(192u, 128u, 128u),
MK_COL(64u, 0u, 0u),
MK_COL(64u, 0u, 192u),
MK_COL(64u, 0u, 64u),
MK_COL(64u, 0u, 128u),
MK_COL(64u, 192u, 0u),
MK_COL(64u, 192u, 192u),
MK_COL(64u, 192u, 64u),
MK_COL(64u, 192u, 128u),
MK_COL(64u, 64u, 0u),
MK_COL(64u, 64u, 192u),
MK_COL(64u, 64u, 64u),
MK_COL(64u, 64u, 128u),
MK_COL(64u, 128u, 0u),
MK_COL(64u, 128u, 192u),
MK_COL(64u, 128u, 64u),
MK_COL(64u, 128u, 128u),
MK_COL(128u, 0u, 0u),
MK_COL(128u, 0u, 192u),
MK_COL(128u, 0u, 64u),
MK_COL(128u, 0u, 128u),
MK_COL(128u, 192u, 0u),
MK_COL(128u, 192u, 192u),
MK_COL(128u, 192u, 64u),
MK_COL(128u, 192u, 128u),
MK_COL(128u, 64u, 0u),
MK_COL(128u, 64u, 192u),
MK_COL(128u, 64u, 64u),
MK_COL(128u, 64u, 128u),
MK_COL(128u, 128u, 0u),
MK_COL(128u, 128u, 192u),
MK_COL(128u, 128u, 64u),
MK_COL(128u, 128u, 128u),
};
#undef MK_COL
//------------------------------------------------------------------------------
// TODO(skal): move the functions to dsp/lossless.c when the correct
// granularity is found. For now, we'll just copy-paste some useful bits
// here instead.
// In-place sum of each component with mod 256.
static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) {
const uint32_t alpha_and_green = (*a & 0xff00ff00u) + (b & 0xff00ff00u);
const uint32_t red_and_blue = (*a & 0x00ff00ffu) + (b & 0x00ff00ffu);
*a = (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
}
static WEBP_INLINE uint32_t Clip255(uint32_t a) {
if (a < 256) {
return a;
}
// return 0, when a is a negative integer.
// return 255, when a is positive.
return ~a >> 24;
}
// Delta palettization functions.
static WEBP_INLINE int Square(int x) {
return x * x;
}
static WEBP_INLINE uint32_t Intensity(uint32_t a) {
return
30 * ((a >> 16) & 0xff) +
59 * ((a >> 8) & 0xff) +
11 * ((a >> 0) & 0xff);
}
static uint32_t CalcDist(uint32_t predicted_value, uint32_t actual_value,
uint32_t palette_entry) {
int i;
uint32_t distance = 0;
AddPixelsEq(&predicted_value, palette_entry);
for (i = 0; i < 32; i += 8) {
const int32_t av = (actual_value >> i) & 0xff;
const int32_t pv = (predicted_value >> i) & 0xff;
distance += Square(pv - av);
}
// We sum square of intensity difference with factor 10, but because Intensity
// returns 100 times real intensity we need to multiply differences of colors
// by 1000.
distance *= 1000u;
distance += Square(Intensity(predicted_value)
- Intensity(actual_value));
return distance;
}
static uint32_t Predict(int x, int y, uint32_t* image) {
const uint32_t t = (y == 0) ? ARGB_BLACK : image[x];
const uint32_t l = (x == 0) ? ARGB_BLACK : image[x - 1];
const uint32_t p =
(((((t >> 24) & 0xff) + ((l >> 24) & 0xff)) / 2) << 24) +
(((((t >> 16) & 0xff) + ((l >> 16) & 0xff)) / 2) << 16) +
(((((t >> 8) & 0xff) + ((l >> 8) & 0xff)) / 2) << 8) +
(((((t >> 0) & 0xff) + ((l >> 0) & 0xff)) / 2) << 0);
if (x == 0 && y == 0) return ARGB_BLACK;
if (x == 0) return t;
if (y == 0) return l;
return p;
}
static WEBP_INLINE int AddSubtractComponentFullWithCoefficient(
int a, int b, int c) {
return Clip255(a + ((b - c) >> 2));
}
static WEBP_INLINE uint32_t ClampedAddSubtractFullWithCoefficient(
uint32_t c0, uint32_t c1, uint32_t c2) {
const int a = AddSubtractComponentFullWithCoefficient(
c0 >> 24, c1 >> 24, c2 >> 24);
const int r = AddSubtractComponentFullWithCoefficient((c0 >> 16) & 0xff,
(c1 >> 16) & 0xff,
(c2 >> 16) & 0xff);
const int g = AddSubtractComponentFullWithCoefficient((c0 >> 8) & 0xff,
(c1 >> 8) & 0xff,
(c2 >> 8) & 0xff);
const int b = AddSubtractComponentFullWithCoefficient(
c0 & 0xff, c1 & 0xff, c2 & 0xff);
return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
}
//------------------------------------------------------------------------------
// Find palette entry with minimum error from difference of actual pixel value
// and predicted pixel value. Propagate error of pixel to its top and left pixel
// in src array. Write predicted_value + palette_entry to new_image. Return
// index of best palette entry.
static int FindBestPaletteEntry(uint32_t src, uint32_t predicted_value,
const uint32_t palette[], int palette_size) {
int i;
int idx = 0;
uint32_t best_distance = CalcDist(predicted_value, src, palette[0]);
for (i = 1; i < palette_size; ++i) {
const uint32_t distance = CalcDist(predicted_value, src, palette[i]);
if (distance < best_distance) {
best_distance = distance;
idx = i;
}
}
return idx;
}
static void ApplyBestPaletteEntry(int x, int y,
uint32_t new_value, uint32_t palette_value,
uint32_t* src, int src_stride,
uint32_t* new_image) {
AddPixelsEq(&new_value, palette_value);
if (x > 0) {
src[x - 1] = ClampedAddSubtractFullWithCoefficient(src[x - 1],
new_value, src[x]);
}
if (y > 0) {
src[x - src_stride] =
ClampedAddSubtractFullWithCoefficient(src[x - src_stride],
new_value, src[x]);
}
new_image[x] = new_value;
}
//------------------------------------------------------------------------------
// Main entry point
static WebPEncodingError ApplyDeltaPalette(uint32_t* src, uint32_t* dst,
uint32_t src_stride,
uint32_t dst_stride,
const uint32_t* palette,
int palette_size,
int width, int height,
int num_passes) {
int x, y;
WebPEncodingError err = VP8_ENC_OK;
uint32_t* new_image = (uint32_t*)WebPSafeMalloc(width, sizeof(*new_image));
uint8_t* const tmp_row = (uint8_t*)WebPSafeMalloc(width, sizeof(*tmp_row));
if (new_image == NULL || tmp_row == NULL) {
err = VP8_ENC_ERROR_OUT_OF_MEMORY;
goto Error;
}
while (num_passes--) {
uint32_t* cur_src = src;
uint32_t* cur_dst = dst;
for (y = 0; y < height; ++y) {
for (x = 0; x < width; ++x) {
const uint32_t predicted_value = Predict(x, y, new_image);
tmp_row[x] = FindBestPaletteEntry(cur_src[x], predicted_value,
palette, palette_size);
ApplyBestPaletteEntry(x, y, predicted_value, palette[tmp_row[x]],
cur_src, src_stride, new_image);
}
for (x = 0; x < width; ++x) {
cur_dst[x] = palette[tmp_row[x]];
}
cur_src += src_stride;
cur_dst += dst_stride;
}
}
Error:
WebPSafeFree(new_image);
WebPSafeFree(tmp_row);
return err;
}
// replaces enc->argb_ by a palettizable approximation of it,
// and generates optimal enc->palette_[]
WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc) {
const WebPPicture* const pic = enc->pic_;
uint32_t* src = pic->argb;
uint32_t* dst = enc->argb_;
const int width = pic->width;
const int height = pic->height;
WebPEncodingError err = VP8_ENC_OK;
memcpy(enc->palette_, kDeltaPalette, sizeof(kDeltaPalette));
enc->palette_[DELTA_PALETTE_SIZE - 1] = src[0] - 0xff000000u;
enc->palette_size_ = DELTA_PALETTE_SIZE;
err = ApplyDeltaPalette(src, dst, pic->argb_stride, enc->current_width_,
enc->palette_, enc->palette_size_,
width, height, 2);
if (err != VP8_ENC_OK) goto Error;
Error:
return err;
}
#else // !WEBP_EXPERIMENTAL_FEATURES
WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc) {
(void)enc;
return VP8_ENC_ERROR_INVALID_CONFIGURATION;
}
#endif // WEBP_EXPERIMENTAL_FEATURES

@ -1,25 +0,0 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Author: Mislav Bradac (mislavm@google.com)
//
#ifndef WEBP_ENC_DELTA_PALETTIZATION_ENC_H_
#define WEBP_ENC_DELTA_PALETTIZATION_ENC_H_
#include "src/webp/encode.h"
#include "src/enc/vp8li_enc.h"
// Replaces enc->argb_[] input by a palettizable approximation of it,
// and generates optimal enc->palette_[].
// This function can revert enc->use_palette_ / enc->use_predict_ flag
// if delta-palettization is not producing expected saving.
WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc);
#endif // WEBP_ENC_DELTA_PALETTIZATION_ENC_H_

@ -198,7 +198,7 @@ static void SetSegmentProbas(VP8Encoder* const enc) {
for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
const VP8MBInfo* const mb = &enc->mb_info_[n];
p[mb->segment_]++;
++p[mb->segment_];
}
#if !defined(WEBP_DISABLE_STATS)
if (enc->pic_->stats != NULL) {
@ -520,6 +520,14 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
#endif
}
static void ResetSideInfo(const VP8EncIterator* const it) {
VP8Encoder* const enc = it->enc_;
WebPPicture* const pic = enc->pic_;
if (pic->stats != NULL) {
memset(enc->block_count_, 0, sizeof(enc->block_count_));
}
ResetSSE(enc);
}
#else // defined(WEBP_DISABLE_STATS)
static void ResetSSE(VP8Encoder* const enc) {
(void)enc;
@ -528,10 +536,16 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
VP8Encoder* const enc = it->enc_;
WebPPicture* const pic = enc->pic_;
if (pic->extra_info != NULL) {
memset(pic->extra_info, 0,
enc->mb_w_ * enc->mb_h_ * sizeof(*pic->extra_info));
if (it->x_ == 0 && it->y_ == 0) { // only do it once, at start
memset(pic->extra_info, 0,
enc->mb_w_ * enc->mb_h_ * sizeof(*pic->extra_info));
}
}
}
static void ResetSideInfo(const VP8EncIterator* const it) {
(void)it;
}
#endif // !defined(WEBP_DISABLE_STATS)
static double GetPSNR(uint64_t mse, uint64_t size) {
@ -570,7 +584,7 @@ static uint64_t OneStatPass(VP8Encoder* const enc, VP8RDLevel rd_opt,
VP8IteratorImport(&it, NULL);
if (VP8Decimate(&it, &info, rd_opt)) {
// Just record the number of skips and act like skip_proba is not used.
enc->proba_.nb_skip_++;
++enc->proba_.nb_skip_;
}
RecordResiduals(&it, &info);
size += info.R + info.H;
@ -841,6 +855,9 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) {
++num_pass_left;
enc->max_i4_header_bits_ >>= 1; // strengthen header bit limitation...
if (is_last_pass) {
ResetSideInfo(&it);
}
continue; // ...and start over
}
if (is_last_pass) {
@ -871,4 +888,3 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
#endif // DISABLE_TOKEN_BUFFER
//------------------------------------------------------------------------------

@ -200,14 +200,9 @@ static WEBP_INLINE double BitsEntropyRefine(const VP8LBitEntropy* entropy) {
}
}
double VP8LBitsEntropy(const uint32_t* const array, int n,
uint32_t* const trivial_symbol) {
double VP8LBitsEntropy(const uint32_t* const array, int n) {
VP8LBitEntropy entropy;
VP8LBitsEntropyUnrefined(array, n, &entropy);
if (trivial_symbol != NULL) {
*trivial_symbol =
(entropy.nonzeros == 1) ? entropy.nonzero_code : VP8L_NON_TRIVIAL_SYM;
}
return BitsEntropyRefine(&entropy);
}
@ -1031,7 +1026,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
}
}
// TODO(vikasa): Optimize HistogramRemap for low-effort compression mode also.
// TODO(vrabaud): Optimize HistogramRemap for low-effort compression mode.
// Find the optimal map from original histograms to the final ones.
HistogramRemap(orig_histo, image_histo, histogram_symbols);

@ -109,10 +109,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
uint16_t* const histogram_symbols);
// Returns the entropy for the symbols in the input array.
// Also sets trivial_symbol to the code value, if the array has only one code
// value. Otherwise, set it to VP8L_NON_TRIVIAL_SYM.
double VP8LBitsEntropy(const uint32_t* const array, int n,
uint32_t* const trivial_symbol);
double VP8LBitsEntropy(const uint32_t* const array, int n);
// Estimate how many bits the combined entropy of literals and distance
// approximately maps to.

@ -26,6 +26,9 @@ static void InitLeft(VP8EncIterator* const it) {
memset(it->u_left_, 129, 8);
memset(it->v_left_, 129, 8);
it->left_nz_[8] = 0;
if (it->top_derr_ != NULL) {
memset(&it->left_derr_, 0, sizeof(it->left_derr_));
}
}
static void InitTop(VP8EncIterator* const it) {
@ -33,6 +36,9 @@ static void InitTop(VP8EncIterator* const it) {
const size_t top_size = enc->mb_w_ * 16;
memset(enc->y_top_, 127, 2 * top_size);
memset(enc->nz_, 0, enc->mb_w_ * sizeof(*enc->nz_));
if (enc->top_derr_ != NULL) {
memset(enc->top_derr_, 0, enc->mb_w_ * sizeof(*enc->top_derr_));
}
}
void VP8IteratorSetRow(VP8EncIterator* const it, int y) {
@ -76,6 +82,7 @@ void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
it->y_left_ = (uint8_t*)WEBP_ALIGN(it->yuv_left_mem_ + 1);
it->u_left_ = it->y_left_ + 16 + 16;
it->v_left_ = it->u_left_ + 16;
it->top_derr_ = enc->top_derr_;
VP8IteratorReset(it);
}
@ -450,4 +457,3 @@ int VP8IteratorRotateI4(VP8EncIterator* const it,
}
//------------------------------------------------------------------------------

@ -146,6 +146,6 @@ int VP8ApplyNearLossless(const WebPPicture* const picture, int quality,
// Define a stub to suppress compiler warnings.
extern void VP8LNearLosslessStub(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LNearLosslessStub(void) {}
void VP8LNearLosslessStub(void) {}
#endif // (WEBP_NEAR_LOSSLESS == 1)

@ -28,11 +28,11 @@
// If defined, use table to compute x / alpha.
#define USE_INVERSE_ALPHA_TABLE
static const union {
uint32_t argb;
uint8_t bytes[4];
} test_endian = { 0xff000000u };
#define ALPHA_IS_LAST (test_endian.bytes[3] == 0xff)
#ifdef WORDS_BIGENDIAN
#define ALPHA_OFFSET 0 // uint32_t 0xff000000 is 0xff,00,00,00 in memory
#else
#define ALPHA_OFFSET 3 // uint32_t 0xff000000 is 0x00,00,00,ff in memory
#endif
//------------------------------------------------------------------------------
// Detection of non-trivial transparency
@ -61,7 +61,7 @@ int WebPPictureHasTransparency(const WebPPicture* picture) {
return CheckNonOpaque(picture->a, picture->width, picture->height,
1, picture->a_stride);
} else {
const int alpha_offset = ALPHA_IS_LAST ? 3 : 0;
const int alpha_offset = ALPHA_OFFSET;
return CheckNonOpaque((const uint8_t*)picture->argb + alpha_offset,
picture->width, picture->height,
4, picture->argb_stride * sizeof(*picture->argb));
@ -126,7 +126,7 @@ static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
#else
static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTables(void) {}
static void InitGammaTables(void) {}
static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) { return v; }
static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
return (int)(base_value << shift);
@ -170,29 +170,33 @@ typedef uint16_t fixed_y_t; // unsigned type with extra SFIX precision for W
#if defined(USE_GAMMA_COMPRESSION)
// float variant of gamma-correction
// We use tables of different size and precision for the Rec709 / BT2020
// transfer function.
#define kGammaF (1./0.45)
static float kGammaToLinearTabF[MAX_Y_T + 1]; // size scales with Y_FIX
static float kLinearToGammaTabF[kGammaTabSize + 2];
static volatile int kGammaTablesFOk = 0;
static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {
if (!kGammaTablesFOk) {
static uint32_t kLinearToGammaTabS[kGammaTabSize + 2];
#define GAMMA_TO_LINEAR_BITS 14
static uint32_t kGammaToLinearTabS[MAX_Y_T + 1]; // size scales with Y_FIX
static volatile int kGammaTablesSOk = 0;
static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesS(void) {
assert(2 * GAMMA_TO_LINEAR_BITS < 32); // we use uint32_t intermediate values
if (!kGammaTablesSOk) {
int v;
const double norm = 1. / MAX_Y_T;
const double scale = 1. / kGammaTabSize;
const double a = 0.09929682680944;
const double thresh = 0.018053968510807;
const double final_scale = 1 << GAMMA_TO_LINEAR_BITS;
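// These are the extended-precision Rec.709 / BT.2020 constants: the encoded
// value is 4.5 * L below the threshold and (1 + a) * pow(L, 0.45) - a above
// it. The first loop below tabulates the inverse (gamma -> linear) mapping
// and the second one the forward mapping, both as fixed-point integers.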
for (v = 0; v <= MAX_Y_T; ++v) {
const double g = norm * v;
double value;
if (g <= thresh * 4.5) {
kGammaToLinearTabF[v] = (float)(g / 4.5);
value = g / 4.5;
} else {
const double a_rec = 1. / (1. + a);
kGammaToLinearTabF[v] = (float)pow(a_rec * (g + a), kGammaF);
value = pow(a_rec * (g + a), kGammaF);
}
kGammaToLinearTabS[v] = (uint32_t)(value * final_scale + .5);
}
for (v = 0; v <= kGammaTabSize; ++v) {
const double g = scale * v;
@ -202,37 +206,44 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {
} else {
value = (1. + a) * pow(g, 1. / kGammaF) - a;
}
kLinearToGammaTabF[v] = (float)(MAX_Y_T * value);
// we already incorporate the 1/2 rounding constant here
kLinearToGammaTabS[v] =
(uint32_t)(MAX_Y_T * value) + (1 << GAMMA_TO_LINEAR_BITS >> 1);
}
// to prevent small rounding errors from causing a read overflow:
kLinearToGammaTabF[kGammaTabSize + 1] = kLinearToGammaTabF[kGammaTabSize];
kGammaTablesFOk = 1;
kLinearToGammaTabS[kGammaTabSize + 1] = kLinearToGammaTabS[kGammaTabSize];
kGammaTablesSOk = 1;
}
}
static WEBP_INLINE float GammaToLinearF(int v) {
return kGammaToLinearTabF[v];
// return value has a fixed-point precision of GAMMA_TO_LINEAR_BITS
static WEBP_INLINE uint32_t GammaToLinearS(int v) {
return kGammaToLinearTabS[v];
}
static WEBP_INLINE int LinearToGammaF(float value) {
const float v = value * kGammaTabSize;
const int tab_pos = (int)v;
const float x = v - (float)tab_pos; // fractional part
const float v0 = kLinearToGammaTabF[tab_pos + 0];
const float v1 = kLinearToGammaTabF[tab_pos + 1];
const float y = v1 * x + v0 * (1.f - x); // interpolate
return (int)(y + .5);
static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) {
// 'value' is in GAMMA_TO_LINEAR_BITS fractional precision
const uint32_t v = value * kGammaTabSize;
const uint32_t tab_pos = v >> GAMMA_TO_LINEAR_BITS;
// fractional part, in GAMMA_TO_LINEAR_BITS fixed-point precision
const uint32_t x = v - (tab_pos << GAMMA_TO_LINEAR_BITS); // fractional part
// v0 / v1 are in GAMMA_TO_LINEAR_BITS fixed-point precision (range [0..1])
const uint32_t v0 = kLinearToGammaTabS[tab_pos + 0];
const uint32_t v1 = kLinearToGammaTabS[tab_pos + 1];
// Final interpolation. Note that rounding is already included.
const uint32_t v2 = (v1 - v0) * x; // note: v1 >= v0.
const uint32_t result = v0 + (v2 >> GAMMA_TO_LINEAR_BITS);
return result;
}
#else
static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {}
static WEBP_INLINE float GammaToLinearF(int v) {
const float norm = 1.f / MAX_Y_T;
return norm * v;
static void InitGammaTablesS(void) {}
static WEBP_INLINE uint32_t GammaToLinearS(int v) {
return (v << GAMMA_TO_LINEAR_BITS) / MAX_Y_T;
}
static WEBP_INLINE int LinearToGammaF(float value) {
return (int)(MAX_Y_T * value + .5);
static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) {
return (MAX_Y_T * value) >> GAMMA_TO_LINEAR_BITS;
}
#endif // USE_GAMMA_COMPRESSION
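A rough Python sketch of the fixed-point interpolation performed by LinearToGammaS above (the table size and contents are placeholders, not taken from the patch):
@code{.py}
# Sketch only: mirrors the fixed-point interpolation of LinearToGammaS.
GAMMA_TO_LINEAR_BITS = 14
GAMMA_TAB_SIZE = 32              # placeholder; the real kGammaTabSize may differ

def linear_to_gamma_fixed(value, tab):
    # 'value' carries GAMMA_TO_LINEAR_BITS fractional bits; 'tab' has
    # GAMMA_TAB_SIZE + 2 entries with the 1/2 rounding constant baked in.
    v = value * GAMMA_TAB_SIZE
    tab_pos = v >> GAMMA_TO_LINEAR_BITS             # integer table index
    x = v - (tab_pos << GAMMA_TO_LINEAR_BITS)       # fractional part
    v0, v1 = tab[tab_pos], tab[tab_pos + 1]         # note: v1 >= v0
    return v0 + (((v1 - v0) * x) >> GAMMA_TO_LINEAR_BITS)
@endcode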
@ -254,26 +265,22 @@ static int RGBToGray(int r, int g, int b) {
return (luma >> YUV_FIX);
}
static float RGBToGrayF(float r, float g, float b) {
return (float)(0.2126 * r + 0.7152 * g + 0.0722 * b);
}
static int ScaleDown(int a, int b, int c, int d) {
const float A = GammaToLinearF(a);
const float B = GammaToLinearF(b);
const float C = GammaToLinearF(c);
const float D = GammaToLinearF(d);
return LinearToGammaF(0.25f * (A + B + C + D));
static uint32_t ScaleDown(int a, int b, int c, int d) {
const uint32_t A = GammaToLinearS(a);
const uint32_t B = GammaToLinearS(b);
const uint32_t C = GammaToLinearS(c);
const uint32_t D = GammaToLinearS(d);
return LinearToGammaS((A + B + C + D + 2) >> 2);
}
static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) {
int i;
for (i = 0; i < w; ++i) {
const float R = GammaToLinearF(src[0 * w + i]);
const float G = GammaToLinearF(src[1 * w + i]);
const float B = GammaToLinearF(src[2 * w + i]);
const float Y = RGBToGrayF(R, G, B);
dst[i] = (fixed_y_t)LinearToGammaF(Y);
const uint32_t R = GammaToLinearS(src[0 * w + i]);
const uint32_t G = GammaToLinearS(src[1 * w + i]);
const uint32_t B = GammaToLinearS(src[2 * w + i]);
const uint32_t Y = RGBToGray(R, G, B);
dst[i] = (fixed_y_t)LinearToGammaS(Y);
}
}
@ -863,7 +870,7 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr,
}
if (use_iterative_conversion) {
InitGammaTablesF();
InitGammaTablesS();
if (!PreprocessARGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, picture)) {
return 0;
}
@ -990,10 +997,10 @@ static int PictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace,
return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
} else {
const uint8_t* const argb = (const uint8_t*)picture->argb;
const uint8_t* const r = ALPHA_IS_LAST ? argb + 2 : argb + 1;
const uint8_t* const g = ALPHA_IS_LAST ? argb + 1 : argb + 2;
const uint8_t* const b = ALPHA_IS_LAST ? argb + 0 : argb + 3;
const uint8_t* const a = ALPHA_IS_LAST ? argb + 3 : argb + 0;
const uint8_t* const a = argb + (0 ^ ALPHA_OFFSET);
const uint8_t* const r = argb + (1 ^ ALPHA_OFFSET);
const uint8_t* const g = argb + (2 ^ ALPHA_OFFSET);
const uint8_t* const b = argb + (3 ^ ALPHA_OFFSET);
picture->colorspace = WEBP_YUV420;
return ImportYUVAFromRGBA(r, g, b, a, 4, 4 * picture->argb_stride,
@ -1044,7 +1051,8 @@ int WebPPictureYUVAToARGB(WebPPicture* picture) {
const int argb_stride = 4 * picture->argb_stride;
uint8_t* dst = (uint8_t*)picture->argb;
const uint8_t *cur_u = picture->u, *cur_v = picture->v, *cur_y = picture->y;
WebPUpsampleLinePairFunc upsample = WebPGetLinePairConverter(ALPHA_IS_LAST);
WebPUpsampleLinePairFunc upsample =
WebPGetLinePairConverter(ALPHA_OFFSET > 0);
// First row, with replicated top samples.
upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, width);
@ -1087,6 +1095,7 @@ static int Import(WebPPicture* const picture,
const uint8_t* rgb, int rgb_stride,
int step, int swap_rb, int import_alpha) {
int y;
// swap_rb -> b,g,r,a , !swap_rb -> r,g,b,a
const uint8_t* r_ptr = rgb + (swap_rb ? 2 : 0);
const uint8_t* g_ptr = rgb + 1;
const uint8_t* b_ptr = rgb + (swap_rb ? 0 : 2);
@ -1104,19 +1113,32 @@ static int Import(WebPPicture* const picture,
WebPInitAlphaProcessing();
if (import_alpha) {
// dst[] byte order is {a,r,g,b} for big-endian, {b,g,r,a} for little endian
uint32_t* dst = picture->argb;
const int do_copy =
(!swap_rb && !ALPHA_IS_LAST) || (swap_rb && ALPHA_IS_LAST);
const int do_copy = (ALPHA_OFFSET == 3) && swap_rb;
assert(step == 4);
for (y = 0; y < height; ++y) {
if (do_copy) {
if (do_copy) {
for (y = 0; y < height; ++y) {
memcpy(dst, rgb, width * 4);
} else {
rgb += rgb_stride;
dst += picture->argb_stride;
}
} else {
for (y = 0; y < height; ++y) {
#ifdef WORDS_BIGENDIAN
// BGRA or RGBA input order.
const uint8_t* a_ptr = rgb + 3;
WebPPackARGB(a_ptr, r_ptr, g_ptr, b_ptr, width, dst);
r_ptr += rgb_stride;
g_ptr += rgb_stride;
b_ptr += rgb_stride;
#else
// RGBA input order. Need to swap R and B.
VP8LConvertBGRAToRGBA((const uint32_t*)rgb, width, (uint8_t*)dst);
#endif
rgb += rgb_stride;
dst += picture->argb_stride;
}
rgb += rgb_stride;
dst += picture->argb_stride;
}
} else {
uint32_t* dst = picture->argb;

@ -18,6 +18,7 @@
#include <math.h>
#include <stdlib.h>
#include "src/dsp/dsp.h"
#include "src/enc/vp8i_enc.h"
#include "src/utils/utils.h"
@ -169,6 +170,12 @@ int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
return 1;
}
#ifdef WORDS_BIGENDIAN
#define BLUE_OFFSET 3 // uint32_t 0x000000ff is 0x00,00,00,ff in memory
#else
#define BLUE_OFFSET 0 // uint32_t 0x000000ff is 0xff,00,00,00 in memory
#endif
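The BLUE_OFFSET comment can be double-checked with a tiny stand-alone snippet (illustration only, not part of the patch):
@code{.py}
import struct
# Memory layout of a uint32_t holding 0x000000ff:
print(struct.pack('<I', 0x000000ff))   # little-endian: b'\xff\x00\x00\x00' (blue byte at offset 0)
print(struct.pack('>I', 0x000000ff))   # big-endian:    b'\x00\x00\x00\xff' (blue byte at offset 3)
@endcode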
int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
int type, float results[5]) {
int w, h, c;
@ -195,8 +202,10 @@ int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
float distortion;
const size_t stride0 = 4 * (size_t)p0.argb_stride;
const size_t stride1 = 4 * (size_t)p1.argb_stride;
if (!WebPPlaneDistortion((const uint8_t*)p0.argb + c, stride0,
(const uint8_t*)p1.argb + c, stride1,
// results are reported as BGRA
const int offset = c ^ BLUE_OFFSET;
if (!WebPPlaneDistortion((const uint8_t*)p0.argb + offset, stride0,
(const uint8_t*)p1.argb + offset, stride1,
w, h, 4, type, &distortion, results + c)) {
goto Error;
}
@ -214,6 +223,8 @@ int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
return ok;
}
#undef BLUE_OFFSET
#else // defined(WEBP_DISABLE_STATS)
int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
const uint8_t* ref, size_t ref_stride,

@ -826,6 +826,85 @@ static int ReconstructIntra4(VP8EncIterator* const it,
return nz;
}
//------------------------------------------------------------------------------
// DC-error diffusion
// Diffusion weights. We under-correct a bit (15/16th of the error is actually
// diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
#define C1 7 // fraction of error sent to the 4x4 block below
#define C2 8 // fraction of error sent to the 4x4 block on the right
#define DSHIFT 4
#define DSCALE 1 // storage descaling, needed to make the error fit int8_t
// Quantize as usual, but also compute and return the quantization error.
// Error is already divided by DSHIFT.
static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
int V = *v;
const int sign = (V < 0);
if (sign) V = -V;
if (V > (int)mtx->zthresh_[0]) {
const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
const int err = (V - qV);
*v = sign ? -qV : qV;
return (sign ? -err : err) >> DSCALE;
}
*v = 0;
return (sign ? -V : V) >> DSCALE;
}
static void CorrectDCValues(const VP8EncIterator* const it,
const VP8Matrix* const mtx,
int16_t tmp[][16], VP8ModeScore* const rd) {
// | top[0] | top[1]
// --------+--------+---------
// left[0] | tmp[0] tmp[1] <-> err0 err1
// left[1] | tmp[2] tmp[3] err2 err3
//
// Final errors {err1,err2,err3} are preserved and later restored
// as top[]/left[] on the next block.
int ch;
for (ch = 0; ch <= 1; ++ch) {
const int8_t* const top = it->top_derr_[it->x_][ch];
const int8_t* const left = it->left_derr_[ch];
int16_t (* const c)[16] = &tmp[ch * 4];
int err0, err1, err2, err3;
c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
err0 = QuantizeSingle(&c[0][0], mtx);
c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
err1 = QuantizeSingle(&c[1][0], mtx);
c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
err2 = QuantizeSingle(&c[2][0], mtx);
c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
err3 = QuantizeSingle(&c[3][0], mtx);
// error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
// err >> DSCALE will fit in an int8_t type if DSCALE>=1.
assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
rd->derr[ch][0] = (int8_t)err1;
rd->derr[ch][1] = (int8_t)err2;
rd->derr[ch][2] = (int8_t)err3;
}
}
static void StoreDiffusionErrors(VP8EncIterator* const it,
const VP8ModeScore* const rd) {
int ch;
for (ch = 0; ch <= 1; ++ch) {
int8_t* const top = it->top_derr_[it->x_][ch];
int8_t* const left = it->left_derr_[ch];
left[0] = rd->derr[ch][0]; // restore err1
left[1] = 3 * rd->derr[ch][2] >> 2; // ... 3/4th of err3
top[0] = rd->derr[ch][1]; // ... err2
top[1] = rd->derr[ch][2] - left[1]; // ... 1/4th of err3.
}
}
#undef C1
#undef C2
#undef DSHIFT
#undef DSCALE
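The per-channel arithmetic of CorrectDCValues can be sketched as follows (a simplified Python sketch, not taken from the patch; `quantize` stands in for QuantizeSingle and returns the quantized DC plus its scaled-down error):
@code{.py}
C1, C2 = 7, 8          # fractions (out of 16) of error sent down / right
DSHIFT, DSCALE = 4, 1

def diffuse_dc(dc, top, left, quantize):
    # dc: DC coefficients of the four 4x4 blocks, in raster order.
    dc[0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE)
    dc[0], err0 = quantize(dc[0])
    dc[1] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE)
    dc[1], err1 = quantize(dc[1])
    dc[2] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE)
    dc[2], err2 = quantize(dc[2])
    dc[3] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE)
    dc[3], err3 = quantize(dc[3])
    return err1, err2, err3   # stored and re-used as top/left on the next block
@endcode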
//------------------------------------------------------------------------------
static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
uint8_t* const yuv_out, int mode) {
const VP8Encoder* const enc = it->enc_;
@ -839,6 +918,8 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
for (n = 0; n < 8; n += 2) {
VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
}
if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);
if (DO_TRELLIS_UV && it->do_trellis_) {
int ch, x, y;
for (ch = 0, n = 0; ch <= 2; ch += 2) {
@ -1101,6 +1182,9 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
CopyScore(&rd_best, &rd_uv);
rd->mode_uv = mode;
memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
if (it->top_derr_ != NULL) {
memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
}
SwapPtr(&dst, &tmp_dst);
}
}
@ -1109,6 +1193,9 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
if (dst != dst0) { // copy 16x8 block if needed
VP8Copy16x8(dst, dst0);
}
if (it->top_derr_ != NULL) { // store diffusion errors for next block
StoreDiffusionErrors(it, rd);
}
}
//------------------------------------------------------------------------------

@ -30,9 +30,9 @@ extern "C" {
// Various defines and enums
// version numbers
#define ENC_MAJ_VERSION 0
#define ENC_MIN_VERSION 6
#define ENC_REV_VERSION 1
#define ENC_MAJ_VERSION 1
#define ENC_MIN_VERSION 0
#define ENC_REV_VERSION 0
enum { MAX_LF_LEVELS = 64, // Maximum loop filter level
MAX_VARIABLE_LEVEL = 67, // last (inclusive) level with variable cost
@ -120,6 +120,9 @@ static WEBP_INLINE int QUANTDIV(uint32_t n, uint32_t iQ, uint32_t B) {
// Uncomment the following to remove token-buffer code:
// #define DISABLE_TOKEN_BUFFER
// quality below which error-diffusion is enabled
#define ERROR_DIFFUSION_QUALITY 98
//------------------------------------------------------------------------------
// Headers
@ -201,6 +204,8 @@ typedef struct {
score_t i4_penalty_; // penalty for using Intra4
} VP8SegmentInfo;
typedef int8_t DError[2 /* u/v */][2 /* top or left */];
// Handy transient struct to accumulate score and info during RD-optimization
// and mode evaluation.
typedef struct {
@ -213,6 +218,7 @@ typedef struct {
uint8_t modes_i4[16]; // mode numbers for intra4 predictions
int mode_uv; // mode number of chroma prediction
uint32_t nz; // non-zero blocks
int8_t derr[2][3]; // DC diffusion errors for U/V for blocks #1/2/3
} VP8ModeScore;
// Iterator structure to iterate through macroblocks, pointing to the
@ -242,6 +248,9 @@ typedef struct {
int count_down0_; // starting counter value (for progress)
int percent0_; // saved initial progress percent
DError left_derr_; // left error diffusion (u/v)
DError *top_derr_; // top diffusion error - NULL if disabled
uint8_t* y_left_; // left luma samples (addressable from index -1 to 15).
uint8_t* u_left_; // left u samples (addressable from index -1 to 7)
uint8_t* v_left_; // left v samples (addressable from index -1 to 7)
@ -401,6 +410,7 @@ struct VP8Encoder {
uint8_t* uv_top_; // top u/v samples.
// U and V are packed into 16 bytes (8 U + 8 V)
LFStats* lf_stats_; // autofilter stats (if NULL, autofilter is off)
DError* top_derr_; // diffusion error (NULL if disabled)
};
//------------------------------------------------------------------------------

@ -26,8 +26,6 @@
#include "src/utils/utils.h"
#include "src/webp/format_constants.h"
#include "src/enc/delta_palettization_enc.h"
// Maximum number of histogram images (sub-blocks).
#define MAX_HUFF_IMAGE_SIZE 2600
@ -259,7 +257,7 @@ static int AnalyzeEntropy(const uint32_t* argb,
++histo[kHistoAlphaPred * 256];
for (j = 0; j < kHistoTotal; ++j) {
entropy_comp[j] = VP8LBitsEntropy(&histo[j * 256], 256, NULL);
entropy_comp[j] = VP8LBitsEntropy(&histo[j * 256], 256);
}
entropy[kDirect] = entropy_comp[kHistoAlpha] +
entropy_comp[kHistoRed] +
@ -384,8 +382,7 @@ static int EncoderAnalyze(VP8LEncoder* const enc,
AnalyzeAndCreatePalette(pic, low_effort,
enc->palette_, &enc->palette_size_);
// TODO(jyrki): replace the decision to be based on an actual estimate
// of entropy, or even spatial variance of entropy.
// Empirical bit sizes.
enc->histo_bits_ = GetHistoBits(method, use_palette,
pic->width, pic->height);
enc->transform_bits_ = GetTransformBits(method, enc->histo_bits_);
@ -756,7 +753,6 @@ static WebPEncodingError StoreImageToBitMask(
// Don't write the distance with the extra bits code since
// the distance can be up to 18 bits of extra bits, and the prefix
// 15 bits, totaling to 33, and our PutBits only supports up to 32 bits.
// TODO(jyrki): optimize this further.
VP8LPrefixEncode(distance, &code, &n_bits, &bits);
WriteHuffmanCode(bw, codes + 4, code);
VP8LPutBits(bw, bits, n_bits);
@ -1464,49 +1460,6 @@ static WebPEncodingError EncodePalette(VP8LBitWriter* const bw, int low_effort,
20 /* quality */, low_effort);
}
#ifdef WEBP_EXPERIMENTAL_FEATURES
static WebPEncodingError EncodeDeltaPalettePredictorImage(
VP8LBitWriter* const bw, VP8LEncoder* const enc, int quality,
int low_effort) {
const WebPPicture* const pic = enc->pic_;
const int width = pic->width;
const int height = pic->height;
const int pred_bits = 5;
const int transform_width = VP8LSubSampleSize(width, pred_bits);
const int transform_height = VP8LSubSampleSize(height, pred_bits);
const int pred = 7; // default is Predictor7 (Top/Left Average)
const int tiles_per_row = VP8LSubSampleSize(width, pred_bits);
const int tiles_per_col = VP8LSubSampleSize(height, pred_bits);
uint32_t* predictors;
int tile_x, tile_y;
WebPEncodingError err = VP8_ENC_OK;
predictors = (uint32_t*)WebPSafeMalloc(tiles_per_col * tiles_per_row,
sizeof(*predictors));
if (predictors == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
predictors[tile_y * tiles_per_row + tile_x] = 0xff000000u | (pred << 8);
}
}
VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
VP8LPutBits(bw, PREDICTOR_TRANSFORM, 2);
VP8LPutBits(bw, pred_bits - 2, 3);
err = EncodeImageNoHuffman(
bw, predictors, &enc->hash_chain_,
(VP8LBackwardRefs*)&enc->refs_[0], // cast const away
(VP8LBackwardRefs*)&enc->refs_[1],
transform_width, transform_height, quality, low_effort);
WebPSafeFree(predictors);
return err;
}
#endif // WEBP_EXPERIMENTAL_FEATURES
// -----------------------------------------------------------------------------
// VP8LEncoder
@ -1568,7 +1521,7 @@ static int EncodeStreamHook(void* input, void* data2) {
WebPEncodingError err = VP8_ENC_OK;
const int quality = (int)config->quality;
const int low_effort = (config->method == 0);
#if (WEBP_NEAR_LOSSLESS == 1) || defined(WEBP_EXPERIMENTAL_FEATURES)
#if (WEBP_NEAR_LOSSLESS == 1)
const int width = picture->width;
#endif
const int height = picture->height;
@ -1627,29 +1580,6 @@ static int EncodeStreamHook(void* input, void* data2) {
enc->argb_content_ = kEncoderNone;
#endif
#ifdef WEBP_EXPERIMENTAL_FEATURES
if (config->use_delta_palette) {
enc->use_predict_ = 1;
enc->use_cross_color_ = 0;
enc->use_subtract_green_ = 0;
enc->use_palette_ = 1;
if (enc->argb_content_ != kEncoderNearLossless &&
enc->argb_content_ != kEncoderPalette) {
err = MakeInputImageCopy(enc);
if (err != VP8_ENC_OK) goto Error;
}
err = WebPSearchOptimalDeltaPalette(enc);
if (err != VP8_ENC_OK) goto Error;
if (enc->use_palette_) {
err = AllocateTransformBuffer(enc, width, height);
if (err != VP8_ENC_OK) goto Error;
err = EncodeDeltaPalettePredictorImage(bw, enc, quality, low_effort);
if (err != VP8_ENC_OK) goto Error;
use_delta_palette = 1;
}
}
#endif // WEBP_EXPERIMENTAL_FEATURES
// Encode palette
if (enc->use_palette_) {
err = EncodePalette(bw, low_effort, enc);
@ -1822,7 +1752,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
worker_interface->Init(worker);
worker->data1 = param;
worker->data2 = NULL;
worker->hook = (WebPWorkerHook)EncodeStreamHook;
worker->hook = EncodeStreamHook;
}
}
@ -1944,7 +1874,6 @@ int VP8LEncodeImage(const WebPConfig* const config,
err = VP8LEncodeStream(config, picture, &bw, 1 /*use_cache*/);
if (err != VP8_ENC_OK) goto Error;
// TODO(skal): have a fine-grained progress report in VP8LEncodeStream().
if (!WebPReportProgress(picture, 90, &percent)) goto UserAbort;
// Finish the RIFF chunk.

@ -159,12 +159,16 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
+ WEBP_ALIGN_CST; // align all
const size_t lf_stats_size =
config->autofilter ? sizeof(*enc->lf_stats_) + WEBP_ALIGN_CST : 0;
const size_t top_derr_size =
(config->quality <= ERROR_DIFFUSION_QUALITY || config->pass > 1) ?
mb_w * sizeof(*enc->top_derr_) : 0;
uint8_t* mem;
const uint64_t size = (uint64_t)sizeof(*enc) // main struct
+ WEBP_ALIGN_CST // cache alignment
+ info_size // modes info
+ preds_size // prediction modes
+ samples_size // top/left samples
+ top_derr_size // top diffusion error
+ nz_size // coeff context bits
+ lf_stats_size; // autofilter stats
@ -175,11 +179,12 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
" info: %ld\n"
" preds: %ld\n"
" top samples: %ld\n"
" top diffusion: %ld\n"
" non-zero: %ld\n"
" lf-stats: %ld\n"
" total: %ld\n",
sizeof(*enc) + WEBP_ALIGN_CST, info_size,
preds_size, samples_size, nz_size, lf_stats_size, size);
preds_size, samples_size, top_derr_size, nz_size, lf_stats_size, size);
printf("Transient object sizes:\n"
" VP8EncIterator: %ld\n"
" VP8ModeScore: %ld\n"
@ -219,6 +224,8 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
enc->y_top_ = mem;
enc->uv_top_ = enc->y_top_ + top_stride;
mem += 2 * top_stride;
enc->top_derr_ = top_derr_size ? (DError*)mem : NULL;
mem += top_derr_size;
assert(mem <= (uint8_t*)enc + size);
enc->config_ = config;

@ -26,9 +26,9 @@ extern "C" {
//------------------------------------------------------------------------------
// Defines and constants.
#define MUX_MAJ_VERSION 0
#define MUX_MIN_VERSION 4
#define MUX_REV_VERSION 1
#define MUX_MAJ_VERSION 1
#define MUX_MIN_VERSION 0
#define MUX_REV_VERSION 0
// Chunk object.
typedef struct WebPChunk WebPChunk;

@ -19,13 +19,6 @@
#include "src/dsp/dsp.h"
#include "src/webp/types.h"
// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
#if !defined(WORDS_BIGENDIAN) && \
(defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
(defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
#define WORDS_BIGENDIAN
#endif
#if defined(WORDS_BIGENDIAN)
#define HToLE32 BSwap32
#define HToLE16 BSwap16

@ -3,9 +3,22 @@ add_definitions(-D__OPENCV_APPS=1)
link_libraries(${OPENCV_LINKER_LIBS})
add_subdirectory(traincascade)
add_subdirectory(createsamples)
add_subdirectory(annotation)
add_subdirectory(visualisation)
add_subdirectory(interactive-calibration)
add_subdirectory(version)
macro(ocv_add_app directory)
if(DEFINED BUILD_APPS_LIST)
list(FIND BUILD_APPS_LIST ${directory} _index)
if (${_index} GREATER -1)
add_subdirectory(${directory})
else()
message(STATUS "Skip OpenCV app: ${directory}")
endif()
else()
add_subdirectory(${directory})
endif()
endmacro()
ocv_add_app(traincascade)
ocv_add_app(createsamples)
ocv_add_app(annotation)
ocv_add_app(visualisation)
ocv_add_app(interactive-calibration)
ocv_add_app(version)

@ -217,7 +217,7 @@ int main(int argc, char** argv)
(*it)->resetState();
}
}
catch (std::runtime_error exp) {
catch (const std::runtime_error& exp) {
std::cout << exp.what() << std::endl;
}

@ -1,19 +1,13 @@
SET(OPENCV_APPLICATION_DEPS opencv_core opencv_highgui opencv_imgproc opencv_imgcodecs opencv_videoio)
set(OPENCV_APPLICATION_DEPS opencv_core)
ocv_check_dependencies(${OPENCV_APPLICATION_DEPS})
if(NOT OCV_DEPENDENCIES_FOUND)
return()
endif()
project(opencv_version)
set(the_target opencv_version)
ocv_target_include_directories(${the_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${OpenCV_SOURCE_DIR}/include/opencv")
ocv_target_include_modules_recurse(${the_target} ${OPENCV_APPLICATION_DEPS})
file(GLOB SRCS *.cpp)
ocv_add_executable(${the_target} ${SRCS})
ocv_add_executable(${the_target} opencv_version.cpp)
ocv_target_link_libraries(${the_target} ${OPENCV_APPLICATION_DEPS})
set_target_properties(${the_target} PROPERTIES
@ -30,3 +24,26 @@ if(INSTALL_CREATE_DISTRIB)
else()
install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT libs)
endif()
if(WIN32)
project(opencv_version_win32)
set(the_target opencv_version_win32)
ocv_target_include_modules_recurse(${the_target} ${OPENCV_APPLICATION_DEPS})
ocv_add_executable(${the_target} opencv_version.cpp)
ocv_target_link_libraries(${the_target} ${OPENCV_APPLICATION_DEPS})
target_compile_definitions(${the_target} PRIVATE "OPENCV_WIN32_API=1")
set_target_properties(${the_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
OUTPUT_NAME "opencv_version_win32")
set_target_properties(${the_target} PROPERTIES FOLDER "applications")
if(INSTALL_CREATE_DISTRIB)
if(BUILD_SHARED_LIBS)
install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT libs)
endif()
else()
install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT libs)
endif()
endif()

@ -9,6 +9,31 @@
#include <opencv2/core/opencl/opencl_info.hpp>
#ifdef OPENCV_WIN32_API
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif
static void dumpHWFeatures(bool showAll = false)
{
std::cout << "OpenCV's HW features list:" << std::endl;
int count = 0;
for (int i = 0; i < CV_HARDWARE_MAX_FEATURE; i++)
{
cv::String name = cv::getHardwareFeatureName(i);
if (name.empty())
continue;
bool enabled = cv::checkHardwareSupport(i);
if (enabled)
count++;
if (enabled || showAll)
{
printf(" ID=%3d (%s) -> %s\n", i, name.c_str(), enabled ? "ON" : "N/A");
}
}
std::cout << "Total available: " << count << std::endl;
}
int main(int argc, const char** argv)
{
CV_TRACE_FUNCTION();
@ -16,6 +41,7 @@ int main(int argc, const char** argv)
CV_TRACE_ARG_VALUE(argv0, "argv0", argv[0]);
CV_TRACE_ARG_VALUE(argv1, "argv1", argv[1]);
#ifndef OPENCV_WIN32_API
cv::CommandLineParser parser(argc, argv,
"{ help h usage ? | | show this help message }"
"{ verbose v | | show build configuration log }"
@ -45,24 +71,14 @@ int main(int argc, const char** argv)
if (parser.has("hw"))
{
bool showAll = parser.get<bool>("hw");
std::cout << "OpenCV's HW features list:" << std::endl;
int count = 0;
for (int i = 0; i < CV_HARDWARE_MAX_FEATURE; i++)
{
cv::String name = cv::getHardwareFeatureName(i);
if (name.empty())
continue;
bool enabled = cv::checkHardwareSupport(i);
if (enabled)
count++;
if (enabled || showAll)
{
printf(" ID=%3d (%s) -> %s\n", i, name.c_str(), enabled ? "ON" : "N/A");
}
}
std::cout << "Total available: " << count << std::endl;
dumpHWFeatures(parser.get<bool>("hw"));
}
#else
std::cout << cv::getBuildInformation().c_str() << std::endl;
cv::dumpOpenCLInformation();
dumpHWFeatures();
MessageBoxA(NULL, "Check console window output", "OpenCV(" CV_VERSION ")", MB_ICONINFORMATION | MB_OK);
#endif
return 0;
}

@ -27,6 +27,12 @@ function(find_python preferred_version min_version library_env include_dir_env
debug_library include_path include_dir include_dir2 packages_path
numpy_include_dirs numpy_version)
if(NOT ${found})
if(" ${executable}" STREQUAL " PYTHON_EXECUTABLE")
set(__update_python_vars 0)
else()
set(__update_python_vars 1)
endif()
ocv_check_environment_variables(${executable})
if(${executable})
set(PYTHON_EXECUTABLE "${${executable}}")
@ -47,7 +53,7 @@ if(NOT ${found})
endforeach()
endif()
string(REGEX MATCH "^[0-9]+" _preferred_version_major ${preferred_version})
string(REGEX MATCH "^[0-9]+" _preferred_version_major "${preferred_version}")
find_host_package(PythonInterp "${preferred_version}")
if(NOT PYTHONINTERP_FOUND)
@ -56,7 +62,7 @@ if(NOT ${found})
if(PYTHONINTERP_FOUND)
# Check if python major version is correct
if(${_preferred_version_major} EQUAL ${PYTHON_VERSION_MAJOR})
if("${_preferred_version_major}" STREQUAL "" OR "${_preferred_version_major}" STREQUAL "${PYTHON_VERSION_MAJOR}")
# Copy outputs
set(_found ${PYTHONINTERP_FOUND})
set(_executable ${PYTHON_EXECUTABLE})
@ -65,7 +71,9 @@ if(NOT ${found})
set(_version_minor ${PYTHON_VERSION_MINOR})
set(_version_patch ${PYTHON_VERSION_PATCH})
endif()
endif()
if(__update_python_vars)
# Clear find_host_package side effects
unset(PYTHONINTERP_FOUND)
unset(PYTHON_EXECUTABLE CACHE)
@ -109,7 +117,8 @@ if(NOT ${found})
set(_library_release ${PYTHON_LIBRARY_RELEASE})
set(_include_dir ${PYTHON_INCLUDE_DIR})
set(_include_dir2 ${PYTHON_INCLUDE_DIR2})
endif()
if(__update_python_vars)
# Clear find_package side effects
unset(PYTHONLIBS_FOUND)
unset(PYTHON_LIBRARIES)
@ -160,7 +169,7 @@ if(NOT ${found})
unset(_path)
endif()
set(_numpy_include_dirs ${${numpy_include_dirs}})
set(_numpy_include_dirs "${${numpy_include_dirs}}")
if(NOT _numpy_include_dirs)
if(CMAKE_CROSSCOMPILING)
@ -222,6 +231,10 @@ if(NOT ${found})
endif()
endfunction(find_python)
if(OPENCV_PYTHON_SKIP_DETECTION)
return()
endif()
find_python(2.7 "${MIN_VER_PYTHON2}" PYTHON2_LIBRARY PYTHON2_INCLUDE_DIR
PYTHON2INTERP_FOUND PYTHON2_EXECUTABLE PYTHON2_VERSION_STRING
PYTHON2_VERSION_MAJOR PYTHON2_VERSION_MINOR PYTHON2LIBS_FOUND

@ -4,32 +4,34 @@ Camera Calibration {#tutorial_py_calibration}
Goal
----
In this section,
- We will learn about distortions in camera, intrinsic and extrinsic parameters of camera etc.
- We will learn to find these parameters, undistort images etc.
In this section, we will learn about
* types of distortion caused by cameras
* how to find the intrinsic and extrinsic properties of a camera
* how to undistort images based on these properties
Basics
------
Today's cheap pinhole cameras introduces a lot of distortion to images. Two major distortions are
Some pinhole cameras introduce significant distortion to images. Two major kinds of distortion are
radial distortion and tangential distortion.
Due to radial distortion, straight lines will appear curved. Its effect is more as we move away from
the center of image. For example, one image is shown below, where two edges of a chess board are
marked with red lines. But you can see that border is not a straight line and doesn't match with the
Radial distortion causes straight lines to appear curved. Radial distortion becomes larger the farther points are from
the center of the image. For example, one image is shown below in which two edges of a chess board are
marked with red lines. But, you can see that the border of the chess board is not a straight line and doesn't match with the
red line. All the expected straight lines are bulged out. Visit [Distortion
(optics)](http://en.wikipedia.org/wiki/Distortion_%28optics%29) for more details.
![image](images/calib_radial.jpg)
This distortion is represented as follows:
Radial distortion can be represented as follows:
\f[x_{distorted} = x( 1 + k_1 r^2 + k_2 r^4 + k_3 r^6) \\
y_{distorted} = y( 1 + k_1 r^2 + k_2 r^4 + k_3 r^6)\f]
Similarly, another distortion is the tangential distortion which occurs because image taking lense
is not aligned perfectly parallel to the imaging plane. So some areas in image may look nearer than
expected. It is represented as below:
Similarly, tangential distortion occurs because the image-taking lens
is not aligned perfectly parallel to the imaging plane. So, some areas in the image may look nearer than
expected. The amount of tangential distortion can be represented as below:
\f[x_{distorted} = x + [ 2p_1xy + p_2(r^2+2x^2)] \\
y_{distorted} = y + [ p_1(r^2+ 2y^2)+ 2p_2xy]\f]
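As a quick numeric illustration of the two formulas above (the coefficient values below are arbitrary and used only for demonstration):
@code{.py}
# Apply the radial and tangential distortion model to one normalized point.
k1, k2, p1, p2, k3 = -0.28, 0.07, 0.001, -0.0005, 0.0   # made-up coefficients
x, y = 0.3, 0.2                                          # undistorted point
r2 = x * x + y * y
radial = 1 + k1 * r2 + k2 * r2**2 + k3 * r2**3
x_dist = x * radial + (2 * p1 * x * y + p2 * (r2 + 2 * x * x))
y_dist = y * radial + (p1 * (r2 + 2 * y * y) + 2 * p2 * x * y)
print(x_dist, y_dist)
@endcode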
@ -38,10 +40,9 @@ In short, we need to find five parameters, known as distortion coefficients give
\f[Distortion \; coefficients=(k_1 \hspace{10pt} k_2 \hspace{10pt} p_1 \hspace{10pt} p_2 \hspace{10pt} k_3)\f]
In addition to this, we need to find a few more information, like intrinsic and extrinsic parameters
of a camera. Intrinsic parameters are specific to a camera. It includes information like focal
length (\f$f_x,f_y\f$), optical centers (\f$c_x, c_y\f$) etc. It is also called camera matrix. It depends on
the camera only, so once calculated, it can be stored for future purposes. It is expressed as a 3x3
In addition to this, we need some other information, like the intrinsic and extrinsic parameters
of the camera. Intrinsic parameters are specific to a camera. They include information like focal
length (\f$f_x,f_y\f$) and optical centers (\f$c_x, c_y\f$). The focal length and optical centers can be used to create a camera matrix, which can be used to remove distortion due to the lenses of a specific camera. The camera matrix is unique to a specific camera, so once calculated, it can be reused on other images taken by the same camera. It is expressed as a 3x3
matrix:
\f[camera \; matrix = \left [ \begin{matrix} f_x & 0 & c_x \\ 0 & f_y & c_y \\ 0 & 0 & 1 \end{matrix} \right ]\f]
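For example, with made-up focal lengths and an arbitrary optical center (in pixels), such a camera matrix could look like this:
@code{.py}
import numpy as np
fx, fy, cx, cy = 800.0, 800.0, 320.0, 240.0   # illustrative values only
camera_matrix = np.array([[fx, 0.0, cx],
                          [0.0, fy, cy],
                          [0.0, 0.0, 1.0]], dtype=np.float32)
@endcode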
@ -49,20 +50,16 @@ matrix:
Extrinsic parameters correspond to rotation and translation vectors which translate the coordinates
of a 3D point to a coordinate system.
For stereo applications, these distortions need to be corrected first. To find all these parameters,
what we have to do is to provide some sample images of a well defined pattern (eg, chess board). We
find some specific points in it ( square corners in chess board). We know its coordinates in real
world space and we know its coordinates in image. With these data, some mathematical problem is
solved in background to get the distortion coefficients. That is the summary of the whole story. For
better results, we need atleast 10 test patterns.
For stereo applications, these distortions need to be corrected first. To find these parameters,
we must provide some sample images of a well defined pattern (e.g. a chess board). We
find some specific points of which we already know the relative positions (e.g. square corners in the chess board). We know the coordinates of these points in real world space and we know the coordinates in the image, so we can solve for the distortion coefficients. For better results, we need at least 10 test patterns.
Code
----
As mentioned above, we need atleast 10 test patterns for camera calibration. OpenCV comes with some
images of chess board (see samples/cpp/left01.jpg -- left14.jpg), so we will utilize it. For sake of
understanding, consider just one image of a chess board. Important input datas needed for camera
calibration is a set of 3D real world points and its corresponding 2D image points. 2D image points
As mentioned above, we need at least 10 test patterns for camera calibration. OpenCV comes with some
images of a chess board (see samples/data/left01.jpg -- left14.jpg), so we will utilize these. Consider an image of a chess board. The important input data needed for calibration of the camera
is the set of 3D real world points and the corresponding 2D coordinates of these points in the image. 2D image points
are easy to find in the image. (These image points are locations where two black
squares touch each other on the chess board.)
@ -72,7 +69,7 @@ values. But for simplicity, we can say chess board was kept stationary at XY pla
and camera was moved accordingly. This consideration helps us to find only X,Y values. Now for X,Y
values, we can simply pass the points as (0,0), (1,0), (2,0), ... which denotes the location of
points. In this case, the results we get will be in the scale of size of chess board square. But if
we know the square size, (say 30 mm), and we can pass the values as (0,0),(30,0),(60,0),..., we get
we know the square size, (say 30 mm), we can pass the values as (0,0), (30,0), (60,0), ... . Thus, we get
the results in mm. (In this case, we don't know square size since we didn't take those images, so we
pass in terms of square size).
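For instance, a 7x6 grid of such object points (in units of one square, with Z fixed at 0) can be generated as shown below; multiply by the square size (e.g. 30 mm) to work in millimetres:
@code{.py}
import numpy as np
# (X, Y, 0) coordinates for a 7x6 corner grid, one unit per square.
objp = np.zeros((7 * 6, 3), np.float32)
objp[:, :2] = np.mgrid[0:7, 0:6].T.reshape(-1, 2)
@endcode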
@ -80,23 +77,22 @@ pass in terms of square size).
### Setup
So to find pattern in chess board, we use the function, **cv.findChessboardCorners()**. We also
need to pass what kind of pattern we are looking, like 8x8 grid, 5x5 grid etc. In this example, we
So to find the pattern in the chess board, we can use the function **cv.findChessboardCorners()**. We also
need to pass what kind of pattern we are looking for, like an 8x8 grid, 5x5 grid etc. In this example, we
use a 7x6 grid. (Normally a chess board has 8x8 squares and 7x7 internal corners.) It returns the
corner points and retval which will be True if pattern is obtained. These corners will be placed in
an order (from left-to-right, top-to-bottom)
@sa This function may not be able to find the required pattern in all the images. So one good option
@sa This function may not be able to find the required pattern in all the images. So, one good option
is to write the code such that it starts the camera and checks each frame for the required pattern. Once
pattern is obtained, find the corners and store it in a list. Also provides some interval before
the pattern is obtained, find the corners and store it in a list. Also, provide some interval before
reading the next frame so that we can adjust our chess board in a different direction. Continue this
process until required number of good patterns are obtained. Even in the example provided here, we
are not sure out of 14 images given, how many are good. So we read all the images and take the good
process until the required number of good patterns are obtained. Even in the example provided here, we
are not sure how many images out of the 14 given are good. Thus, we must read all the images and take only the good
ones.
@sa Instead of chess board, we can use some circular grid, but then use the function
**cv.findCirclesGrid()** to find the pattern. It is said that less number of images are enough when
using circular grid.
@sa Instead of chess board, we can alternatively use a circular grid. In this case, we must use the function
**cv.findCirclesGrid()** to find the pattern. Fewer images are sufficient to perform camera calibration using a circular grid.
Once we find the corners, we can increase their accuracy using **cv.cornerSubPix()**. We can also
draw the pattern using **cv.drawChessboardCorners()**. All these steps are included in the code below:
@ -146,22 +142,23 @@ One image with pattern drawn on it is shown below:
### Calibration
So now we have our object points and image points we are ready to go for calibration. For that we
use the function, **cv.calibrateCamera()**. It returns the camera matrix, distortion coefficients,
Now that we have our object points and image points, we are ready to go for calibration. We can
use the function, **cv.calibrateCamera()** which returns the camera matrix, distortion coefficients,
rotation and translation vectors etc.
@code{.py}
ret, mtx, dist, rvecs, tvecs = cv.calibrateCamera(objpoints, imgpoints, gray.shape[::-1], None, None)
@endcode
### Undistortion
We have got what we were trying. Now we can take an image and undistort it. OpenCV comes with two
methods, we will see both. But before that, we can refine the camera matrix based on a free scaling
Now, we can take an image and undistort it. OpenCV comes with two
methods for doing this. However, before doing that, we can refine the camera matrix based on a free scaling
parameter using **cv.getOptimalNewCameraMatrix()**. If the scaling parameter alpha=0, it returns
undistorted image with minimum unwanted pixels. So it may even remove some pixels at image corners.
If alpha=1, all pixels are retained with some extra black images. It also returns an image ROI which
If alpha=1, all pixels are retained, along with some extra black pixels. This function also returns an image ROI which
can be used to crop the result.
So we take a new image (left12.jpg in this case. That is the first image in this chapter)
So, we take a new image (left12.jpg in this case. That is the first image in this chapter)
@code{.py}
img = cv.imread('left12.jpg')
h, w = img.shape[:2]
@ -169,7 +166,7 @@ newcameramtx, roi = cv.getOptimalNewCameraMatrix(mtx, dist, (w,h), 1, (w,h))
@endcode
#### 1. Using **cv.undistort()**
This is the shortest path. Just call the function and use ROI obtained above to crop the result.
This is the easiest way. Just call the function and use the ROI obtained above to crop the result.
@code{.py}
# undistort
dst = cv.undistort(img, mtx, dist, None, newcameramtx)
@ -181,7 +178,7 @@ cv.imwrite('calibresult.png', dst)
@endcode
#### 2. Using **remapping**
This is curved path. First find a mapping function from distorted image to undistorted image. Then
This way is a little bit more difficult. First, find a mapping function from the distorted image to the undistorted image. Then
use the remap function.
@code{.py}
# undistort
@ -193,23 +190,22 @@ x, y, w, h = roi
dst = dst[y:y+h, x:x+w]
cv.imwrite('calibresult.png', dst)
@endcode
Both the methods give the same result. See the result below:
Still, both the methods give the same result. See the result below:
![image](images/calib_result.jpg)
You can see in the result that all the edges are straight.
Now you can store the camera matrix and distortion coefficients using write functions in Numpy
Now you can store the camera matrix and distortion coefficients using write functions in NumPy
(np.savez, np.savetxt etc) for future uses.
Re-projection Error
-------------------
Re-projection error gives a good estimation of just how exact is the found parameters. This should
be as close to zero as possible. Given the intrinsic, distortion, rotation and translation matrices,
we first transform the object point to image point using **cv.projectPoints()**. Then we calculate
Re-projection error gives a good estimation of just how exact the found parameters are. The closer the re-projection error is to zero, the more accurate the parameters we found are. Given the intrinsic, distortion, rotation and translation matrices,
we must first transform the object point to image point using **cv.projectPoints()**. Then, we can calculate
the absolute norm between what we got with our transformation and the corner finding algorithm. To
find the average error we calculate the arithmetical mean of the errors calculate for all the
find the average error, we calculate the arithmetical mean of the errors calculated for all the
calibration images.
@code{.py}
mean_error = 0

@ -126,9 +126,9 @@ Result looks like below:
Additional Resources
--------------------
-# Video Lecture on [Face Detection and Tracking](http://www.youtube.com/watch?v=WfdYYNamHZ8)
2. An interesting interview regarding Face Detection by [Adam
Harvey](http://www.makematics.com/research/viola-jones/)
-# Video Lecture on [Face Detection and Tracking](https://www.youtube.com/watch?v=WfdYYNamHZ8)
-# An interesting interview regarding Face Detection by [Adam
Harvey](https://web.archive.org/web/20171204220159/http://www.makematics.com/research/viola-jones/)
Exercises
---------

@ -27,7 +27,7 @@ merged, it has to be converted back to 8-bit to view it on usual displays. This
tonemapping. Additional complexities arise when objects of the scene or camera move between shots,
since images with different exposures should be registered and aligned.
In this tutorial we show 2 algorithms (Debvec, Robertson) to generate and display HDR image from an
In this tutorial we show 2 algorithms (Debevec, Robertson) to generate and display an HDR image from an
exposure sequence, and demonstrate an alternative approach called exposure fusion (Mertens), that
produces a low dynamic range image and does not need the exposure times data.
Furthermore, we estimate the camera response function (CRF) which is of great value for many computer
@ -65,14 +65,14 @@ exposure_times = np.array([15.0, 2.5, 0.25, 0.0333], dtype=np.float32)
### 2. Merge exposures into HDR image
In this stage we merge the exposure sequence into one HDR image, showing 2 possibilities
which we have in OpenCV. The first method is Debvec and the second one is Robertson.
which we have in OpenCV. The first method is Debevec and the second one is Robertson.
Notice that the HDR image is of type float32, and not uint8, as it contains the
full dynamic range of all exposure images.
@code{.py}
# Merge exposures to HDR image
merge_debvec = cv.createMergeDebevec()
hdr_debvec = merge_debvec.process(img_list, times=exposure_times.copy())
merge_debevec = cv.createMergeDebevec()
hdr_debevec = merge_debevec.process(img_list, times=exposure_times.copy())
merge_robertson = cv.createMergeRobertson()
hdr_robertson = merge_robertson.process(img_list, times=exposure_times.copy())
@endcode
@ -86,7 +86,7 @@ we will later have to clip the data in order to avoid overflow.
@code{.py}
# Tonemap HDR image
tonemap1 = cv.createTonemapDurand(gamma=2.2)
res_debvec = tonemap1.process(hdr_debvec.copy())
res_debevec = tonemap1.process(hdr_debevec.copy())
tonemap2 = cv.createTonemapDurand(gamma=1.3)
res_robertson = tonemap2.process(hdr_robertson.copy())
@endcode
@ -111,11 +111,11 @@ integers in the range of [0..255].
@code{.py}
# Convert datatype to 8-bit and save
res_debvec_8bit = np.clip(res_debvec*255, 0, 255).astype('uint8')
res_debevec_8bit = np.clip(res_debevec*255, 0, 255).astype('uint8')
res_robertson_8bit = np.clip(res_robertson*255, 0, 255).astype('uint8')
res_mertens_8bit = np.clip(res_mertens*255, 0, 255).astype('uint8')
cv.imwrite("ldr_debvec.jpg", res_debvec_8bit)
cv.imwrite("ldr_debevec.jpg", res_debevec_8bit)
cv.imwrite("ldr_robertson.jpg", res_robertson_8bit)
cv.imwrite("fusion_mertens.jpg", res_mertens_8bit)
@endcode
You can see the different results but consider that each algorithm has additional
extra parameters that you should fit to get your desired outcome. Best practice is
to try the different methods and see which one performs best for your scene.
### Debvec:
### Debevec:
![image](images/ldr_debvec.jpg)
![image](images/ldr_debevec.jpg)
### Robertson:
@ -150,9 +150,9 @@ function and use it for the HDR merge.
@code{.py}
# Estimate camera response function (CRF)
cal_debvec = cv.createCalibrateDebevec()
crf_debvec = cal_debvec.process(img_list, times=exposure_times)
hdr_debvec = merge_debvec.process(img_list, times=exposure_times.copy(), response=crf_debvec.copy())
cal_debevec = cv.createCalibrateDebevec()
crf_debevec = cal_debevec.process(img_list, times=exposure_times)
hdr_debevec = merge_debevec.process(img_list, times=exposure_times.copy(), response=crf_debevec.copy())
cal_robertson = cv.createCalibrateRobertson()
crf_robertson = cal_robertson.process(img_list, times=exposure_times)
hdr_robertson = merge_robertson.process(img_list, times=exposure_times.copy(), response=crf_robertson.copy())
@ -166,12 +166,12 @@ For this sequence we got the following estimation:
Additional Resources
--------------------
1. Paul E Debevec and Jitendra Malik. Recovering high dynamic range radiance maps from photographs. In ACM SIGGRAPH 2008 classes, page 31. ACM, 2008.
2. Mark A Robertson, Sean Borman, and Robert L Stevenson. Dynamic range improvement through multiple exposures. In Image Processing, 1999. ICIP 99. Proceedings. 1999 International Conference on, volume 3, pages 159–163. IEEE, 1999.
3. Tom Mertens, Jan Kautz, and Frank Van Reeth. Exposure fusion. In Computer Graphics and Applications, 2007. PG'07. 15th Pacific Conference on, pages 382–390. IEEE, 2007.
1. Paul E Debevec and Jitendra Malik. Recovering high dynamic range radiance maps from photographs. In ACM SIGGRAPH 2008 classes, page 31. ACM, 2008. @cite DM97
2. Mark A Robertson, Sean Borman, and Robert L Stevenson. Dynamic range improvement through multiple exposures. In Image Processing, 1999. ICIP 99. Proceedings. 1999 International Conference on, volume 3, pages 159–163. IEEE, 1999. @cite RB99
3. Tom Mertens, Jan Kautz, and Frank Van Reeth. Exposure fusion. In Computer Graphics and Applications, 2007. PG'07. 15th Pacific Conference on, pages 382–390. IEEE, 2007. @cite MK07
4. Images from [Wikipedia-HDR](https://en.wikipedia.org/wiki/High-dynamic-range_imaging)
Exercises
---------
1. Try all tonemap algorithms: [Drago](http://docs.opencv.org/master/da/d53/classcv_1_1TonemapDrago.html), [Durand](http://docs.opencv.org/master/da/d3d/classcv_1_1TonemapDurand.html), [Mantiuk](http://docs.opencv.org/master/de/d76/classcv_1_1TonemapMantiuk.html) and [Reinhard](http://docs.opencv.org/master/d0/dec/classcv_1_1TonemapReinhard.html).
2. Try changing the parameters in the HDR calibration and tonemap methods.
1. Try all tonemap algorithms: cv::TonemapDrago, cv::TonemapDurand, cv::TonemapMantiuk and cv::TonemapReinhard
2. Try changing the parameters in the HDR calibration and tonemap methods.

@ -15,55 +15,167 @@ Theory
Code
----
@add_toggle_cpp
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp)
@include samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp
@end_toggle
@add_toggle_java
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ShapeDescriptors/bounding_rects_circles/GeneralContoursDemo1.java)
@include samples/java/tutorial_code/ShapeDescriptors/bounding_rects_circles/GeneralContoursDemo1.java
@end_toggle
@add_toggle_python
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py)
@include samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py
@end_toggle
Explanation
-----------
The main function is rather simple. As the comments indicate, we do the following:
-# Open the image, convert it into grayscale and blur it to get rid of the noise.
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp setup
-# Create a window with header "Source" and display the source file in it.
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp createWindow
-# Create a trackbar on the source_window and assign a callback function to it
- Open the image, convert it into grayscale and blur it to get rid of the noise.
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp setup
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ShapeDescriptors/bounding_rects_circles/GeneralContoursDemo1.java setup
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py setup
@end_toggle
- Create a window with header "Source" and display the source file in it.
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp createWindow
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ShapeDescriptors/bounding_rects_circles/GeneralContoursDemo1.java createWindow
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py createWindow
@end_toggle
- Create a trackbar on the `source_window` and assign a callback function to it.
In general, callback functions are used to react to some kind of signal; in our
case it is the trackbar's state change.
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp taskbar
-# Explicit one-time call of `thresh_callback` is necessary to display
Explicit one-time call of `thresh_callback` is necessary to display
the "Contours" window simultaniously with the "Source" window.
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp callback00
-# Wait for user to close the windows.
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp waitForIt
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp trackbar
@end_toggle
The callback function `thresh_callback` does all the interesting job.
@add_toggle_java
@snippet samples/java/tutorial_code/ShapeDescriptors/bounding_rects_circles/GeneralContoursDemo1.java trackbar
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py trackbar
@end_toggle
-# Writes to `threshold_output` the threshold of the grayscale picture (you can check out about thresholding @ref tutorial_threshold "here").
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp threshold
-# Finds contours and saves them to the vectors `contour` and `hierarchy`.
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp findContours
-# For every found contour we now apply approximation to polygons
with accuracy +-3 and stating that the curve must me closed.
The callback function does all the interesting work.
After that we find a bounding rect for every polygon and save it to `boundRect`.
- Use @ref cv::Canny to detect edges in the images.
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp Canny
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ShapeDescriptors/bounding_rects_circles/GeneralContoursDemo1.java Canny
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py Canny
@end_toggle
- Finds contours and saves them to the vectors `contour` and `hierarchy`.
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp findContours
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ShapeDescriptors/bounding_rects_circles/GeneralContoursDemo1.java findContours
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py findContours
@end_toggle
- For every found contour we now apply approximation to polygons
with accuracy +-3 and stating that the curve must be closed.
After that we find a bounding rect for every polygon and save it to `boundRect`.
At last we find a minimum enclosing circle for every polygon and
save it to `center` and `radius` vectors.
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp allthework
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp allthework
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ShapeDescriptors/bounding_rects_circles/GeneralContoursDemo1.java allthework
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py allthework
@end_toggle
We found everything we need; all we have to do now is draw.
-# Create new Mat of unsigned 8-bit chars, filled with zeros.
- Create new Mat of unsigned 8-bit chars, filled with zeros.
It will contain all the drawings we are going to make (rects and circles).
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp zeroMat
-# For every contour: pick a random color, draw the contour, the bounding rectangle and
the minimal enclosing circle with it,
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp forContour
-# Display the results: create a new window "Contours" and show everything we added to drawings on it.
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp showDrawings
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp zeroMat
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ShapeDescriptors/bounding_rects_circles/GeneralContoursDemo1.java zeroMat
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py zeroMat
@end_toggle
- For every contour: pick a random color, draw the contour, the bounding rectangle and
the minimal enclosing circle with it.
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp forContour
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ShapeDescriptors/bounding_rects_circles/GeneralContoursDemo1.java forContour
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py forContour
@end_toggle
- Display the results: create a new window "Contours" and show everything we added to drawings on it.
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp showDrawings
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ShapeDescriptors/bounding_rects_circles/GeneralContoursDemo1.java showDrawings
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py showDrawings
@end_toggle
Result
------

@ -15,9 +15,23 @@ Theory
Code
----
@add_toggle_cpp
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo2.cpp)
@include samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo2.cpp
@end_toggle
@add_toggle_java
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ShapeDescriptors/bounding_rotated_ellipses/GeneralContoursDemo2.java)
@include samples/java/tutorial_code/ShapeDescriptors/bounding_rotated_ellipses/GeneralContoursDemo2.java
@end_toggle
@add_toggle_python
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/ShapeDescriptors/bounding_rotated_ellipses/generalContours_demo2.py)
@include samples/python/tutorial_code/ShapeDescriptors/bounding_rotated_ellipses/generalContours_demo2.py
@end_toggle
Explanation
-----------

@ -15,9 +15,23 @@ Theory
Code
----
@add_toggle_cpp
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ShapeDescriptors/findContours_demo.cpp)
@include samples/cpp/tutorial_code/ShapeDescriptors/findContours_demo.cpp
@end_toggle
@add_toggle_java
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ShapeDescriptors/find_contours/FindContoursDemo.java)
@include samples/java/tutorial_code/ShapeDescriptors/find_contours/FindContoursDemo.java
@end_toggle
@add_toggle_python
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/ShapeDescriptors/find_contours/findContours_demo.py)
@include samples/python/tutorial_code/ShapeDescriptors/find_contours/findContours_demo.py
@end_toggle
Explanation
-----------

@ -14,10 +14,23 @@ Theory
Code
----
@add_toggle_cpp
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ShapeDescriptors/hull_demo.cpp)
@include samples/cpp/tutorial_code/ShapeDescriptors/hull_demo.cpp
@end_toggle
@add_toggle_java
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ShapeDescriptors/hull/HullDemo.java)
@include samples/java/tutorial_code/ShapeDescriptors/hull/HullDemo.java
@end_toggle
@add_toggle_python
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/ShapeDescriptors/hull/hull_demo.py)
@include samples/python/tutorial_code/ShapeDescriptors/hull/hull_demo.py
@end_toggle
Explanation
-----------

@ -16,9 +16,23 @@ Theory
Code
----
@add_toggle_cpp
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ShapeDescriptors/moments_demo.cpp)
@include samples/cpp/tutorial_code/ShapeDescriptors/moments_demo.cpp
@end_toggle
@add_toggle_java
The tutorial code is shown below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ShapeDescriptors/moments/MomentsDemo.java)
@include samples/java/tutorial_code/ShapeDescriptors/moments/MomentsDemo.java
@end_toggle
@add_toggle_python
The tutorial code is shown below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/ShapeDescriptors/moments/moments_demo.py)
@include samples/python/tutorial_code/ShapeDescriptors/moments/moments_demo.py
@end_toggle
Explanation
-----------

@ -14,9 +14,23 @@ Theory
Code
----
@add_toggle_cpp
The tutorial code is shown below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ShapeDescriptors/pointPolygonTest_demo.cpp)
@include samples/cpp/tutorial_code/ShapeDescriptors/pointPolygonTest_demo.cpp
@end_toggle
@add_toggle_java
The tutorial code is shown below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ShapeDescriptors/point_polygon_test/PointPolygonTestDemo.java)
@include samples/java/tutorial_code/ShapeDescriptors/point_polygon_test/PointPolygonTestDemo.java
@end_toggle
@add_toggle_python
The tutorial code is shown below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/ShapeDescriptors/point_polygon_test/pointPolygonTest_demo.py)
@include samples/python/tutorial_code/ShapeDescriptors/point_polygon_test/pointPolygonTest_demo.py
@end_toggle
Explanation
-----------

@ -225,6 +225,8 @@ In this section you will learn about the image processing (manipulation) functio
- @subpage tutorial_find_contours
*Languages:* C++, Java, Python
*Compatibility:* \> OpenCV 2.0
*Author:* Ana Huamán
@ -233,6 +235,8 @@ In this section you will learn about the image processing (manipulation) functio
- @subpage tutorial_hull
*Languages:* C++, Java, Python
*Compatibility:* \> OpenCV 2.0
*Author:* Ana Huamán
@ -241,6 +245,8 @@ In this section you will learn about the image processing (manipulation) functio
- @subpage tutorial_bounding_rects_circles
*Languages:* C++, Java, Python
*Compatibility:* \> OpenCV 2.0
*Author:* Ana Huamán
@ -249,6 +255,8 @@ In this section you will learn about the image processing (manipulation) functio
- @subpage tutorial_bounding_rotated_ellipses
*Languages:* C++, Java, Python
*Compatibility:* \> OpenCV 2.0
*Author:* Ana Huamán
@ -257,6 +265,8 @@ In this section you will learn about the image processing (manipulation) functio
- @subpage tutorial_moments
*Languages:* C++, Java, Python
*Compatibility:* \> OpenCV 2.0
*Author:* Ana Huamán
@ -265,6 +275,8 @@ In this section you will learn about the image processing (manipulation) functio
- @subpage tutorial_point_polygon_test
*Languages:* C++, Java, Python
*Compatibility:* \> OpenCV 2.0
*Author:* Ana Huamán

@ -86,3 +86,16 @@ When you run the code you should see 3x3 identity matrix as output.
That is it: whenever you start a new project, just add the OpenCV user library that you have defined
to your project and you are good to go. Enjoy your powerful, less painful development environment :)
Running Java code with OpenCV and MKL dependency
------------------------------------------------
You may get the following error (e.g. on Ubuntu) when running Java code that calls OpenCV functions
which use Intel MKL, if OpenCV was built with the MKL library:
> Intel MKL FATAL ERROR: Cannot load libmkl_avx2.so or libmkl_def.so.
One way to solve this on Linux is to preload the Intel MKL libraries (either run the command in a terminal or add it to your `.bashrc` file).
Your command should look similar to the following (prepend `$LD_PRELOAD:` if you have already set the `LD_PRELOAD` variable):
> export LD_PRELOAD=/opt/intel/mkl/lib/intel64/libmkl_core.so:/opt/intel/mkl/lib/intel64/libmkl_sequential.so
Then run the Eclipse IDE from a terminal that has this environment variable set (check with `echo $LD_PRELOAD`) and the error should disappear.

@ -17,9 +17,23 @@ Theory
Code
----
@add_toggle_cpp
The tutorial code is shown below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/objectDetection/objectDetection.cpp)
@include samples/cpp/tutorial_code/objectDetection/objectDetection.cpp
@end_toggle
@add_toggle_java
The tutorial code is shown below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/objectDetection/cascade_classifier/ObjectDetectionDemo.java)
@include samples/java/tutorial_code/objectDetection/cascade_classifier/ObjectDetectionDemo.java
@end_toggle
@add_toggle_python
The tutorial code is shown below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/objectDetection/cascade_classifier/objectDetection.py)
@include samples/python/tutorial_code/objectDetection/cascade_classifier/objectDetection.py
@end_toggle
Explanation
-----------
@ -40,3 +54,13 @@ Result
detection. For the eyes we keep using the file used in the tutorial.
![](images/Cascade_Classifier_Tutorial_Result_LBP.jpg)
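As a rough illustration of how the two cascades fit together (a hedged sketch, not the tutorial sample: the cascade file names are assumed to come from the `opencv/data` folder and the input image path is made up), the detection loop in Python might look like this:
@code{.py}
import cv2 as cv

# Assumed paths: cascade files from the opencv/data folder, any test photo as input.
face_cascade = cv.CascadeClassifier('lbpcascade_frontalface.xml')
eyes_cascade = cv.CascadeClassifier('haarcascade_eye_tree_eyeglasses.xml')

img = cv.imread('people.jpg')
gray = cv.equalizeHist(cv.cvtColor(img, cv.COLOR_BGR2GRAY))

# Detect faces on the whole frame, then search for eyes inside each face region.
for (x, y, w, h) in face_cascade.detectMultiScale(gray):
    cv.ellipse(img, (x + w // 2, y + h // 2), (w // 2, h // 2), 0, 0, 360, (255, 0, 255), 4)
    face_roi = gray[y:y + h, x:x + w]
    for (ex, ey, ew, eh) in eyes_cascade.detectMultiScale(face_roi):
        cv.circle(img, (x + ex + ew // 2, y + ey + eh // 2), (ew + eh) // 4, (255, 0, 0), 4)

cv.imshow('Capture - Face detection', img)
cv.waitKey(0)
@endcode
Swapping `lbpcascade_frontalface.xml` for a Haar face model is the only change needed to compare the two detectors.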
Additional Resources
--------------------
-# Paul Viola and Michael J. Jones. Robust real-time face detection. International Journal of Computer Vision, 57(2):137–154, 2004. @cite Viola04
-# Rainer Lienhart and Jochen Maydt. An extended set of haar-like features for rapid object detection. In Image Processing. 2002. Proceedings. 2002 International Conference on, volume 1, pages I–900. IEEE, 2002. @cite Lienhart02
-# Video Lecture on [Face Detection and Tracking](https://www.youtube.com/watch?v=WfdYYNamHZ8)
-# An interesting interview regarding Face Detection by [Adam
Harvey](https://web.archive.org/web/20171204220159/http://www.makematics.com/research/viola-jones/)
-# [OpenCV Face Detection: Visualized](https://vimeo.com/12774628) on Vimeo by Adam Harvey

@ -5,6 +5,8 @@ Ever wondered how your digital camera detects peoples and faces? Look here to fi
- @subpage tutorial_cascade_classifier
*Languages:* C++, Java, Python
*Compatibility:* \> OpenCV 2.0
*Author:* Ana Huamán

@ -31,21 +31,51 @@ Exposure sequence
Source Code
-----------
@include cpp/tutorial_code/photo/hdr_imaging/hdr_imaging.cpp
@add_toggle_cpp
The tutorial code is shown below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/photo/hdr_imaging/hdr_imaging.cpp)
@include samples/cpp/tutorial_code/photo/hdr_imaging/hdr_imaging.cpp
@end_toggle
@add_toggle_java
The tutorial code is shown below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/photo/hdr_imaging/HDRImagingDemo.java)
@include samples/java/tutorial_code/photo/hdr_imaging/HDRImagingDemo.java
@end_toggle
@add_toggle_python
The tutorial code is shown below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/photo/hdr_imaging/hdr_imaging.py)
@include samples/python/tutorial_code/photo/hdr_imaging/hdr_imaging.py
@end_toggle
Sample images
-------------
The data directory that contains the images, exposure times and the `list.txt` file can be downloaded from
[here](https://github.com/opencv/opencv_extra/tree/master/testdata/cv/hdr/exposures).
Explanation
-----------
-# **Load images and exposure times**
@code{.cpp}
vector<Mat> images;
vector<float> times;
loadExposureSeq(argv[1], images, times);
@endcode
First we load the input images and exposure times from a user-defined folder. The folder should
contain the images and a *list.txt* file that lists the file names and inverse exposure times.
- **Load images and exposure times**
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/photo/hdr_imaging/hdr_imaging.cpp Load images and exposure times
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/photo/hdr_imaging/HDRImagingDemo.java Load images and exposure times
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/photo/hdr_imaging/hdr_imaging.py Load images and exposure times
@end_toggle
First we load the input images and exposure times from a user-defined folder. The folder should
contain the images and a *list.txt* file that lists the file names and inverse exposure times.
For our image sequence the list is as follows:
@code{.none}
memorial00.png 0.03125
memorial01.png 0.0625
@ -53,53 +83,96 @@ Explanation
memorial15.png 1024
@endcode
-# **Estimate camera response**
@code{.cpp}
Mat response;
Ptr<CalibrateDebevec> calibrate = createCalibrateDebevec();
calibrate->process(images, response, times);
@endcode
Many HDR construction algorithms require knowledge of the camera response function (CRF).
We use one of the calibration algorithms to estimate the inverse CRF for all 256 pixel values.
-# **Make HDR image**
@code{.cpp}
Mat hdr;
Ptr<MergeDebevec> merge_debevec = createMergeDebevec();
merge_debevec->process(images, hdr, times, response);
@endcode
- **Estimate camera response**
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/photo/hdr_imaging/hdr_imaging.cpp Estimate camera response
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/photo/hdr_imaging/HDRImagingDemo.java Estimate camera response
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/photo/hdr_imaging/hdr_imaging.py Estimate camera response
@end_toggle
Many HDR construction algorithms require knowledge of the camera response function (CRF).
We use one of the calibration algorithms to estimate the inverse CRF for all 256 pixel values.
- **Make HDR image**
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/photo/hdr_imaging/hdr_imaging.cpp Make HDR image
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/photo/hdr_imaging/HDRImagingDemo.java Make HDR image
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/photo/hdr_imaging/hdr_imaging.py Make HDR image
@end_toggle
We use Debevec's weighting scheme to construct the HDR image, using the response calculated in the
previous step.
-# **Tonemap HDR image**
@code{.cpp}
Mat ldr;
Ptr<TonemapDurand> tonemap = createTonemapDurand(2.2f);
tonemap->process(hdr, ldr);
@endcode
Since we want to see our results on a common LDR display, we have to map the HDR image to an 8-bit
range while preserving most details. That is the main goal of tonemapping methods. We use a tonemapper
with bilateral filtering and set 2.2 as the gamma correction value.
-# **Perform exposure fusion**
@code{.cpp}
Mat fusion;
Ptr<MergeMertens> merge_mertens = createMergeMertens();
merge_mertens->process(images, fusion);
@endcode
There is an alternative way to merge our exposures when we don't need an HDR image. This
process is called exposure fusion and produces an LDR image that doesn't require gamma correction. It
also doesn't use the exposure values of the photographs.
-# **Write results**
@code{.cpp}
imwrite("fusion.png", fusion * 255);
imwrite("ldr.png", ldr * 255);
imwrite("hdr.hdr", hdr);
@endcode
Now it's time to look at the results. Note that an HDR image can't be stored in one of the common image
formats, so we save it as a Radiance image (.hdr). Also, all HDR imaging functions return results in the
[0, 1] range, so we should multiply the result by 255.
- **Tonemap HDR image**
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/photo/hdr_imaging/hdr_imaging.cpp Tonemap HDR image
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/photo/hdr_imaging/HDRImagingDemo.java Tonemap HDR image
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/photo/hdr_imaging/hdr_imaging.py Tonemap HDR image
@end_toggle
Since we want to see our results on a common LDR display, we have to map the HDR image to an 8-bit
range while preserving most details. That is the main goal of tonemapping methods. We use a tonemapper
with bilateral filtering and set 2.2 as the gamma correction value.
- **Perform exposure fusion**
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/photo/hdr_imaging/hdr_imaging.cpp Perform exposure fusion
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/photo/hdr_imaging/HDRImagingDemo.java Perform exposure fusion
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/photo/hdr_imaging/hdr_imaging.py Perform exposure fusion
@end_toggle
There is an alternative way to merge our exposures when we don't need an HDR image. This
process is called exposure fusion and produces an LDR image that doesn't require gamma correction. It
also doesn't use the exposure values of the photographs.
- **Write results**
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/photo/hdr_imaging/hdr_imaging.cpp Write results
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/photo/hdr_imaging/HDRImagingDemo.java Write results
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/photo/hdr_imaging/hdr_imaging.py Write results
@end_toggle
Now it's time to look at the results. Note that an HDR image can't be stored in one of the common image
formats, so we save it as a Radiance image (.hdr). Also, all HDR imaging functions return results in the
[0, 1] range, so we should multiply the result by 255.
You can try other tonemap algorithms: cv::TonemapDrago, cv::TonemapDurand, cv::TonemapMantiuk and cv::TonemapReinhard.
You can also adjust the parameters in the HDR calibration and tonemap methods for your own photos; a minimal end-to-end sketch of the steps above is shown below.
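For orientation, here is a rough end-to-end Python sketch of the steps above. It is not the tutorial sample itself: the `exposures/` directory name and the `load_exposure_seq` helper are assumptions, and cv::TonemapDurand may live in opencv_contrib (xphoto) in newer builds.
@code{.py}
import os
import cv2 as cv
import numpy as np

def load_exposure_seq(path):
    # list.txt stores "<file name> <inverse exposure time>" per line.
    images, times = [], []
    with open(os.path.join(path, 'list.txt')) as f:
        for line in f:
            name, inv_time = line.split()
            images.append(cv.imread(os.path.join(path, name)))
            times.append(1.0 / float(inv_time))
    return images, np.asarray(times, dtype=np.float32)

images, times = load_exposure_seq('exposures')

# Estimate the inverse camera response function (CRF).
calibrate = cv.createCalibrateDebevec()
response = calibrate.process(images, times)

# Merge the exposures into one HDR image using the estimated response.
merge_debevec = cv.createMergeDebevec()
hdr = merge_debevec.process(images, times, response)

# Tonemap the HDR image down to a displayable 8-bit range.
tonemap = cv.createTonemapDurand(2.2)
ldr = tonemap.process(hdr)

# Exposure fusion: an LDR result that needs neither the CRF nor the exposure times.
merge_mertens = cv.createMergeMertens()
fusion = merge_mertens.process(images)

# Results are in the [0, 1] range, so scale to [0, 255] before saving.
cv.imwrite('fusion.png', np.clip(fusion * 255, 0, 255).astype('uint8'))
cv.imwrite('ldr.png', np.clip(ldr * 255, 0, 255).astype('uint8'))
cv.imwrite('hdr.hdr', hdr)
@endcode
Replacing `createTonemapDurand` with, for example, `createTonemapReinhard` is enough to compare tonemapping operators on the same HDR image.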
Results
-------
@ -111,3 +184,12 @@ Results
### Exposure fusion
![](images/fusion.png)
Additional Resources
--------------------
1. Paul E Debevec and Jitendra Malik. Recovering high dynamic range radiance maps from photographs. In ACM SIGGRAPH 2008 classes, page 31. ACM, 2008. @cite DM97
2. Mark A Robertson, Sean Borman, and Robert L Stevenson. Dynamic range improvement through multiple exposures. In Image Processing, 1999. ICIP 99. Proceedings. 1999 International Conference on, volume 3, pages 159–163. IEEE, 1999. @cite RB99
3. Tom Mertens, Jan Kautz, and Frank Van Reeth. Exposure fusion. In Computer Graphics and Applications, 2007. PG'07. 15th Pacific Conference on, pages 382–390. IEEE, 2007. @cite MK07
4. [Wikipedia-HDR](https://en.wikipedia.org/wiki/High-dynamic-range_imaging)
5. [Recovering High Dynamic Range Radiance Maps from Photographs (webpage)](http://www.pauldebevec.com/Research/HDR/)

@ -5,6 +5,8 @@ Use OpenCV for advanced photo processing.
- @subpage tutorial_hdr_imaging
*Languages:* C++, Java, Python
*Compatibility:* \> OpenCV 3.0
*Author:* Fedor Morozov

@ -947,12 +947,16 @@ public class CoreTest extends OpenCVTestCase {
}
public void testMahalanobis() {
Mat src = new Mat(matSize, matSize, CvType.CV_32F);
Core.randu(src, -128, 128);
Mat covar = new Mat(matSize, matSize, CvType.CV_32F);
Mat mean = new Mat(1, matSize, CvType.CV_32F);
Core.calcCovarMatrix(grayRnd_32f, covar, mean, Core.COVAR_ROWS | Core.COVAR_NORMAL, CvType.CV_32F);
Core.calcCovarMatrix(src, covar, mean, Core.COVAR_ROWS | Core.COVAR_NORMAL, CvType.CV_32F);
covar = covar.inv();
Mat line1 = grayRnd_32f.row(0);
Mat line2 = grayRnd_32f.row(1);
Mat line1 = src.row(0);
Mat line2 = src.row(1);
double d = Core.Mahalanobis(line1, line1, covar);

@ -463,9 +463,14 @@ static bool ipp_Mat_setTo_Mat(Mat &dst, Mat &_val, Mat &mask)
return false;
if (dst.depth() == CV_32F)
{
for (int i = 0; i < (int)(_val.total()); i++)
if (_val.at<double>(i) < iwTypeGetMin(ipp32f) || _val.at<double>(i) > iwTypeGetMax(ipp32f))
{
float v = (float)(_val.at<double>(i)); // cast to float
if (cvIsNaN(v) || cvIsInf(v)) // accept finite numbers only
return false;
}
}
if(dst.dims <= 2)
{

@ -21,7 +21,13 @@ namespace logging {
static LogLevel parseLogLevelConfiguration()
{
static cv::String param_log_level = utils::getConfigurationParameterString("OPENCV_LOG_LEVEL", "INFO");
static cv::String param_log_level = utils::getConfigurationParameterString("OPENCV_LOG_LEVEL",
#if defined NDEBUG
"WARNING"
#else
"INFO"
#endif
);
if (param_log_level == "DISABLED" || param_log_level == "disabled" ||
param_log_level == "0" || param_log_level == "OFF" || param_log_level == "off")
return LOG_LEVEL_SILENT;

@ -736,7 +736,6 @@ int64 getCPUTickCount(void)
int64 getCPUTickCount(void)
{
int64 result = 0;
unsigned upper, lower, tmp;
__asm__ volatile(
"0: \n"

@ -1608,6 +1608,32 @@ TEST(Mat, regression_7873_mat_vector_initialize)
ASSERT_EQ(2, sub_mat.size[2]);
}
TEST(Mat, regression_10507_mat_setTo)
{
Size sz(6, 4);
Mat test_mask(sz, CV_8UC1, cv::Scalar::all(255));
test_mask.at<uchar>(1,0) = 0;
test_mask.at<uchar>(0,1) = 0;
for (int cn = 1; cn <= 4; cn++)
{
cv::Mat A(sz, CV_MAKE_TYPE(CV_32F, cn), cv::Scalar::all(5));
A.setTo(cv::Scalar::all(std::numeric_limits<float>::quiet_NaN()), test_mask);
int nans = 0;
for (int y = 0; y < A.rows; y++)
{
for (int x = 0; x < A.cols; x++)
{
for (int c = 0; c < cn; c++)
{
float v = A.ptr<float>(y, x)[c];
nans += (v == v) ? 0 : 1;
}
}
}
EXPECT_EQ(nans, cn * (sz.area() - 2)) << "A=" << A << std::endl << "mask=" << test_mask << std::endl;
}
}
TEST(Core_Mat_array, outputArray_create_getMat)
{
cv::Mat_<uchar> src_base(5, 1);

@ -565,14 +565,14 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
};
/**
* @brief Resize input 4-dimensional blob by nearest neighbor strategy.
* @brief Resize input 4-dimensional blob by nearest neighbor or bilinear strategy.
*
* Layer is used to support TensorFlow's resize_nearest_neighbor op.
* Layer is used to support TensorFlow's resize_nearest_neighbor and resize_bilinear ops.
*/
class CV_EXPORTS ResizeNearestNeighborLayer : public Layer
class CV_EXPORTS ResizeLayer : public Layer
{
public:
static Ptr<ResizeNearestNeighborLayer> create(const LayerParams& params);
static Ptr<ResizeLayer> create(const LayerParams& params);
};
class CV_EXPORTS ProposalLayer : public Layer

@ -66,16 +66,22 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
/**
* @brief Enum of computation backends supported by layers.
* @see Net::setPreferableBackend
*/
enum Backend
{
//! DNN_BACKEND_DEFAULT equals DNN_BACKEND_INFERENCE_ENGINE if
//! OpenCV is built with Intel's Inference Engine library, or
//! DNN_BACKEND_OPENCV otherwise.
DNN_BACKEND_DEFAULT,
DNN_BACKEND_HALIDE,
DNN_BACKEND_INFERENCE_ENGINE
DNN_BACKEND_INFERENCE_ENGINE,
DNN_BACKEND_OPENCV
};
/**
* @brief Enum of target devices for computations.
* @see Net::setPreferableTarget
*/
enum Target
{
@ -460,6 +466,9 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
* @brief Ask the network to use a specific computation backend where it is supported.
* @param[in] backendId backend identifier.
* @see Backend
*
* If OpenCV is compiled with Intel's Inference Engine library, DNN_BACKEND_DEFAULT
* means DNN_BACKEND_INFERENCE_ENGINE. Otherwise it equals DNN_BACKEND_OPENCV.
*/
CV_WRAP void setPreferableBackend(int backendId);
@ -467,6 +476,14 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
* @brief Ask the network to make computations on a specific target device.
* @param[in] targetId target identifier.
* @see Target
*
* List of supported backend / target combinations:
* | | DNN_BACKEND_OPENCV | DNN_BACKEND_INFERENCE_ENGINE | DNN_BACKEND_HALIDE |
* |------------------------|--------------------|------------------------------|--------------------|
* | DNN_TARGET_CPU | + | + | + |
* | DNN_TARGET_OPENCL | + | + | + |
* | DNN_TARGET_OPENCL_FP16 | + | + | |
* | DNN_TARGET_MYRIAD | | + | |
*/
CV_WRAP void setPreferableTarget(int targetId);
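For illustration only (this sketch is not part of the patch, and the model and image file names below are placeholders), selecting one of the supported combinations from the table above looks like this in the Python bindings:
@code{.py}
import cv2 as cv

# Placeholder model files; any reader from cv.dnn works the same way.
net = cv.dnn.readNetFromCaffe('deploy.prototxt', 'weights.caffemodel')

# One of the supported pairs from the table: the built-in OpenCV backend on an OpenCL device.
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
net.setPreferableTarget(cv.dnn.DNN_TARGET_OPENCL)

blob = cv.dnn.blobFromImage(cv.imread('input.jpg'), size=(224, 224))
net.setInput(blob)
out = net.forward()
@endcode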

@ -10,9 +10,11 @@
#include "opencv2/dnn/shape_utils.hpp"
#include "../test/test_common.hpp"
namespace opencv_test {
CV_ENUM(DNNBackend, DNN_BACKEND_DEFAULT, DNN_BACKEND_HALIDE, DNN_BACKEND_INFERENCE_ENGINE)
CV_ENUM(DNNBackend, DNN_BACKEND_DEFAULT, DNN_BACKEND_HALIDE, DNN_BACKEND_INFERENCE_ENGINE, DNN_BACKEND_OPENCV)
CV_ENUM(DNNTarget, DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16, DNN_TARGET_MYRIAD)
class DNNTestNetwork : public ::perf::TestBaseWithParam< tuple<DNNBackend, DNNTarget> >
@ -29,32 +31,10 @@ public:
target = (dnn::Target)(int)get<1>(GetParam());
}
static bool checkMyriadTarget()
{
#ifndef HAVE_INF_ENGINE
return false;
#endif
cv::dnn::Net net;
cv::dnn::LayerParams lp;
net.addLayerToPrev("testLayer", "Identity", lp);
net.setPreferableBackend(cv::dnn::DNN_BACKEND_INFERENCE_ENGINE);
net.setPreferableTarget(cv::dnn::DNN_TARGET_MYRIAD);
net.setInput(cv::Mat::zeros(1, 1, CV_32FC1));
try
{
net.forward();
}
catch(...)
{
return false;
}
return true;
}
void processNet(std::string weights, std::string proto, std::string halide_scheduler,
const Mat& input, const std::string& outputLayer = "")
{
if (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL)
if (backend == DNN_BACKEND_OPENCV && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16))
{
#if defined(HAVE_OPENCL)
if (!cv::ocl::useOpenCL())
@ -149,7 +129,7 @@ PERF_TEST_P_(DNNTestNetwork, Inception_5h)
PERF_TEST_P_(DNNTestNetwork, ENet)
{
if ((backend == DNN_BACKEND_INFERENCE_ENGINE) ||
(backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16))
(backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
throw SkipTestException("");
processNet("dnn/Enet-model-best.net", "", "enet.yml",
Mat(cv::Size(512, 256), CV_32FC3));
@ -164,7 +144,8 @@ PERF_TEST_P_(DNNTestNetwork, SSD)
PERF_TEST_P_(DNNTestNetwork, OpenFace)
{
if (backend == DNN_BACKEND_HALIDE ||
backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU)
(backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16) ||
(backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD))
throw SkipTestException("");
processNet("dnn/openface_nn4.small2.v1.t7", "", "",
Mat(cv::Size(96, 96), CV_32FC3));
@ -178,13 +159,19 @@ PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_Caffe)
Mat(cv::Size(300, 300), CV_32FC3));
}
// TODO: update MobileNet model.
PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_TensorFlow)
PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow)
{
if (backend == DNN_BACKEND_HALIDE ||
backend == DNN_BACKEND_INFERENCE_ENGINE)
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
processNet("dnn/ssd_mobilenet_v1_coco.pb", "ssd_mobilenet_v1_coco.pbtxt", "",
processNet("dnn/ssd_mobilenet_v1_coco_2017_11_17.pb", "ssd_mobilenet_v1_coco_2017_11_17.pbtxt", "",
Mat(cv::Size(300, 300), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
processNet("dnn/ssd_mobilenet_v2_coco_2018_03_29.pb", "ssd_mobilenet_v2_coco_2018_03_29.pbtxt", "",
Mat(cv::Size(300, 300), CV_32FC3));
}
@ -237,9 +224,7 @@ PERF_TEST_P_(DNNTestNetwork, opencv_face_detector)
PERF_TEST_P_(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
{
if (backend == DNN_BACKEND_HALIDE ||
(backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL) ||
(backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16))
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "ssd_inception_v2_coco_2017_11_17.pbtxt", "",
Mat(cv::Size(300, 300), CV_32FC3));
@ -256,6 +241,23 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv3)
processNet("dnn/yolov3.cfg", "dnn/yolov3.weights", "", inp / 255);
}
PERF_TEST_P_(DNNTestNetwork, EAST_text_detection)
{
if (backend == DNN_BACKEND_HALIDE ||
backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
throw SkipTestException("");
processNet("dnn/frozen_east_text_detection.pb", "", "", Mat(cv::Size(320, 320), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, FastNeuralStyle_eccv16)
{
if (backend == DNN_BACKEND_HALIDE ||
(backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16) ||
(backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD))
throw SkipTestException("");
processNet("dnn/fast_neural_style_eccv16_starry_night.t7", "", "", Mat(cv::Size(320, 240), CV_32FC3));
}
const tuple<DNNBackend, DNNTarget> testCases[] = {
#ifdef HAVE_HALIDE
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_HALIDE, DNN_TARGET_CPU),
@ -267,9 +269,9 @@ const tuple<DNNBackend, DNNTarget> testCases[] = {
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD),
#endif
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_CPU),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL_FP16)
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_OPENCV, DNN_TARGET_CPU),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL_FP16)
};
INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, testing::ValuesIn(testCases));

@ -395,9 +395,10 @@ namespace cv {
{
cv::dnn::LayerParams param;
param.name = "Upsample-name";
param.type = "ResizeNearestNeighbor";
param.type = "Resize";
param.set<int>("zoom_factor", scaleFactor);
param.set<String>("interpolation", "nearest");
darknet::LayerParameter lp;
std::string layer_name = cv::format("upsample_%d", layer_id);

@ -225,7 +225,7 @@ void imagesFromBlob(const cv::Mat& blob_, OutputArrayOfArrays images_)
class OpenCLBackendWrapper : public BackendWrapper
{
public:
OpenCLBackendWrapper(Mat& m) : BackendWrapper(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL)
OpenCLBackendWrapper(Mat& m) : BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL)
{
m.copyTo(umat);
host = &m;
@ -233,7 +233,7 @@ public:
}
OpenCLBackendWrapper(const Ptr<BackendWrapper>& baseBuffer, Mat& m)
: BackendWrapper(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL)
: BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL)
{
Ptr<OpenCLBackendWrapper> base = baseBuffer.dynamicCast<OpenCLBackendWrapper>();
CV_Assert(!base.empty());
@ -654,7 +654,7 @@ private:
static Ptr<BackendWrapper> wrapMat(int backendId, int targetId, cv::Mat& m)
{
if (backendId == DNN_BACKEND_DEFAULT)
if (backendId == DNN_BACKEND_OPENCV)
{
if (targetId == DNN_TARGET_CPU)
return Ptr<BackendWrapper>();
@ -727,7 +727,7 @@ struct Net::Impl
Ptr<BackendWrapper> wrap(Mat& host)
{
if (preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_CPU)
if (preferableBackend == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_CPU)
return Ptr<BackendWrapper>();
MatShape shape(host.dims);
@ -738,7 +738,7 @@ struct Net::Impl
if (backendWrappers.find(data) != backendWrappers.end())
{
Ptr<BackendWrapper> baseBuffer = backendWrappers[data];
if (preferableBackend == DNN_BACKEND_DEFAULT)
if (preferableBackend == DNN_BACKEND_OPENCV)
{
CV_Assert(IS_DNN_OPENCL_TARGET(preferableTarget));
return OpenCLBackendWrapper::create(baseBuffer, host);
@ -850,9 +850,27 @@ struct Net::Impl
{
CV_TRACE_FUNCTION();
if (preferableBackend == DNN_BACKEND_DEFAULT)
#ifdef HAVE_INF_ENGINE
preferableBackend = DNN_BACKEND_INFERENCE_ENGINE;
#else
preferableBackend = DNN_BACKEND_OPENCV;
#endif
CV_Assert(preferableBackend != DNN_BACKEND_OPENCV ||
preferableTarget == DNN_TARGET_CPU ||
preferableTarget == DNN_TARGET_OPENCL ||
preferableTarget == DNN_TARGET_OPENCL_FP16);
CV_Assert(preferableBackend != DNN_BACKEND_HALIDE ||
preferableTarget == DNN_TARGET_CPU ||
preferableTarget == DNN_TARGET_OPENCL);
CV_Assert(preferableBackend != DNN_BACKEND_INFERENCE_ENGINE ||
preferableTarget == DNN_TARGET_CPU ||
preferableTarget == DNN_TARGET_OPENCL ||
preferableTarget == DNN_TARGET_OPENCL_FP16 ||
preferableTarget == DNN_TARGET_MYRIAD);
if (!netWasAllocated || this->blobsToKeep != blobsToKeep_)
{
if (preferableBackend == DNN_BACKEND_DEFAULT && IS_DNN_OPENCL_TARGET(preferableTarget))
if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
#ifndef HAVE_OPENCL
{
CV_LOG_WARNING(NULL, "DNN: OpenCL target is not available in this OpenCV build, switching to CPU.");
@ -1036,7 +1054,7 @@ struct Net::Impl
void initBackend()
{
CV_TRACE_FUNCTION();
if (preferableBackend == DNN_BACKEND_DEFAULT)
if (preferableBackend == DNN_BACKEND_OPENCV)
CV_Assert(preferableTarget == DNN_TARGET_CPU || IS_DNN_OPENCL_TARGET(preferableTarget));
else if (preferableBackend == DNN_BACKEND_HALIDE)
initHalideBackend();
@ -1375,7 +1393,7 @@ struct Net::Impl
std::vector<LayerPin> pinsForInternalBlobs;
blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs,
preferableBackend == DNN_BACKEND_INFERENCE_ENGINE,
preferableBackend == DNN_BACKEND_DEFAULT &&
preferableBackend == DNN_BACKEND_OPENCV &&
preferableTarget == DNN_TARGET_OPENCL_FP16);
ld.outputBlobsWrappers.resize(ld.outputBlobs.size());
for (int i = 0; i < ld.outputBlobs.size(); ++i)
@ -1418,7 +1436,7 @@ struct Net::Impl
void fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
{
if( !fusion || preferableBackend != DNN_BACKEND_DEFAULT &&
if( !fusion || preferableBackend != DNN_BACKEND_OPENCV &&
preferableBackend != DNN_BACKEND_INFERENCE_ENGINE)
return;
@ -1446,7 +1464,7 @@ struct Net::Impl
// some other layers.
// TODO: OpenCL target support more fusion styles.
if ( preferableBackend == DNN_BACKEND_DEFAULT && IS_DNN_OPENCL_TARGET(preferableTarget) &&
if ( preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget) &&
(!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" &&
ld.layerInstance->type != "MVN")) )
continue;
@ -1481,7 +1499,7 @@ struct Net::Impl
break;
}
if (preferableBackend != DNN_BACKEND_DEFAULT)
if (preferableBackend != DNN_BACKEND_OPENCV)
continue; // Go to the next layer.
// For now, OpenCL target support fusion with activation of ReLU/ChannelsPReLU/Power/Tanh
@ -1624,7 +1642,7 @@ struct Net::Impl
}
}
if (preferableBackend != DNN_BACKEND_DEFAULT)
if (preferableBackend != DNN_BACKEND_OPENCV)
continue; // Go to the next layer.
// the optimization #2. if there is no layer that takes max pooling layer's computed
@ -1735,7 +1753,7 @@ struct Net::Impl
{
CV_Assert(layers[0].outputBlobs[i].total());
if (layers[0].outputBlobs[i].depth() == CV_32F &&
preferableBackend == DNN_BACKEND_DEFAULT &&
preferableBackend == DNN_BACKEND_OPENCV &&
preferableTarget == DNN_TARGET_OPENCL_FP16)
{
Mat mat = layers[0].outputBlobs[i].clone();
@ -1781,12 +1799,12 @@ struct Net::Impl
TickMeter tm;
tm.start();
if (preferableBackend == DNN_BACKEND_DEFAULT ||
if (preferableBackend == DNN_BACKEND_OPENCV ||
!layer->supportBackend(preferableBackend))
{
if( !ld.skip )
{
if (preferableBackend == DNN_BACKEND_DEFAULT && IS_DNN_OPENCL_TARGET(preferableTarget))
if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
{
std::vector<UMat> umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
layer->forward(OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers),
@ -2132,7 +2150,7 @@ void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName)
{
std::vector<UMat> & outputvec = *(std::vector<UMat> *)outputBlobs.getObj();
if (impl->preferableBackend == DNN_BACKEND_DEFAULT &&
if (impl->preferableBackend == DNN_BACKEND_OPENCV &&
IS_DNN_OPENCL_TARGET(impl->preferableTarget))
{
if (impl->preferableTarget == DNN_TARGET_OPENCL)
@ -2234,7 +2252,13 @@ void Net::setPreferableTarget(int targetId)
if (IS_DNN_OPENCL_TARGET(targetId))
{
#ifndef HAVE_OPENCL
impl->preferableTarget = DNN_TARGET_CPU;
#ifdef HAVE_INF_ENGINE
if (impl->preferableBackend == DNN_BACKEND_OPENCV)
#else
if (impl->preferableBackend == DNN_BACKEND_DEFAULT ||
impl->preferableBackend == DNN_BACKEND_OPENCV)
#endif // HAVE_INF_ENGINE
impl->preferableTarget = DNN_TARGET_CPU;
#else
bool fp16 = ocl::Device::getDefault().isExtensionSupported("cl_khr_fp16");
if (!fp16 && targetId == DNN_TARGET_OPENCL_FP16)
@ -2270,7 +2294,7 @@ void Net::setInput(InputArray blob, const String& name)
ld.outputBlobsWrappers.resize(ld.outputBlobs.size());
MatShape prevShape = shape(ld.outputBlobs[pin.oid]);
Mat blob_;
if (impl->preferableBackend == DNN_BACKEND_DEFAULT &&
if (impl->preferableBackend == DNN_BACKEND_OPENCV &&
impl->preferableTarget == DNN_TARGET_OPENCL_FP16)
{
Mat blob_mat = blob.getMat();
@ -2664,7 +2688,7 @@ int Layer::outputNameToIndex(const String&)
bool Layer::supportBackend(int backendId)
{
return backendId == DNN_BACKEND_DEFAULT;
return backendId == DNN_BACKEND_OPENCV;
}
Ptr<BackendNode> Layer::initHalide(const std::vector<Ptr<BackendWrapper> > &)

@ -83,7 +83,7 @@ void initializeLayerFactory()
CV_DNN_REGISTER_LAYER_CLASS(Concat, ConcatLayer);
CV_DNN_REGISTER_LAYER_CLASS(Reshape, ReshapeLayer);
CV_DNN_REGISTER_LAYER_CLASS(Flatten, FlattenLayer);
CV_DNN_REGISTER_LAYER_CLASS(ResizeNearestNeighbor, ResizeNearestNeighborLayer);
CV_DNN_REGISTER_LAYER_CLASS(Resize, ResizeLayer);
CV_DNN_REGISTER_LAYER_CLASS(CropAndResize, CropAndResizeLayer);
CV_DNN_REGISTER_LAYER_CLASS(Convolution, ConvolutionLayer);

@ -96,6 +96,46 @@ public:
shift = bias_;
}
virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
{
Mat w, b;
top->getScaleShift(w, b);
if (w.empty() && b.empty())
return false;
const int numChannels = weights_.total();
const int numFusedWeights = w.total();
const int numFusedBias = b.total();
if ((numFusedWeights != numChannels && numFusedWeights != 1 && !w.empty()) ||
(numFusedBias != numChannels && numFusedBias != 1 && !b.empty()))
return false;
if (!w.empty())
{
w = w.reshape(1, 1);
if (numFusedWeights == 1)
{
multiply(weights_, w.at<float>(0), weights_);
multiply(bias_, w.at<float>(0), bias_);
}
else
{
multiply(weights_, w, weights_);
multiply(bias_, w, bias_);
}
}
if (!b.empty())
{
b = b.reshape(1, 1);
if (numFusedBias == 1)
add(bias_, b.at<float>(0), bias_);
else
add(bias_, b.reshape(1, 1), bias_);
}
return true;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
@ -109,7 +149,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_HALIDE && haveHalide() ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine();
}

@ -56,7 +56,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine();
}

@ -103,7 +103,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1 && !padding || // By channels
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && !padding;
}

@ -81,9 +81,10 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
backendId == DNN_BACKEND_HALIDE && haveHalide() ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine();
if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
return preferableTarget != DNN_TARGET_MYRIAD || type != "Deconvolution" || adjustPad == Size();
else
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE;
}
void finalize(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs) CV_OVERRIDE
@ -737,8 +738,9 @@ public:
if( relu )
{
r0 = relu[i];
r1 = relu[i+1];
r0 = relu[i]; r1 = relu[i+1];
if( i+1 >= outCn )
r1 = r0;
}
int j = 0;
@ -1568,6 +1570,39 @@ public:
return Ptr<BackendNode>();
}
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> > &) CV_OVERRIDE
{
#ifdef HAVE_INF_ENGINE
const int outGroupCn = blobs[0].size[1]; // Weights are in IOHW layout
const int group = numOutput / outGroupCn;
InferenceEngine::LayerParams lp;
lp.name = name;
lp.type = "Deconvolution";
lp.precision = InferenceEngine::Precision::FP32;
std::shared_ptr<InferenceEngine::DeconvolutionLayer> ieLayer(new InferenceEngine::DeconvolutionLayer(lp));
ieLayer->_kernel_x = kernel.width;
ieLayer->_kernel_y = kernel.height;
ieLayer->_stride_x = stride.width;
ieLayer->_stride_y = stride.height;
ieLayer->_out_depth = numOutput;
ieLayer->_padding_x = pad.width;
ieLayer->_padding_y = pad.height;
ieLayer->_dilation_x = dilation.width;
ieLayer->_dilation_y = dilation.height;
ieLayer->_group = group;
ieLayer->_weights = wrapToInfEngineBlob(blobs[0], InferenceEngine::Layout::OIHW);
if (hasBias())
{
ieLayer->_biases = wrapToInfEngineBlob(blobs[1], {(size_t)numOutput}, InferenceEngine::Layout::C);
}
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
#endif // HAVE_INF_ENGINE
return Ptr<BackendNode>();
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{

@ -68,7 +68,7 @@ public:
{
float input_y = top * (inpHeight - 1) + y * heightScale;
int y0 = static_cast<int>(input_y);
const float* inpData_row0 = (float*)inp.data + y0 * inpWidth;
const float* inpData_row0 = inp.ptr<float>(0, 0, y0);
const float* inpData_row1 = (y0 + 1 < inpHeight) ? (inpData_row0 + inpWidth) : inpData_row0;
for (int x = 0; x < outWidth; ++x)
{

@ -195,7 +195,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && !_locPredTransposed;
}

@ -115,9 +115,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
backendId == DNN_BACKEND_HALIDE && haveHalide() ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine();
return func.supportBackend(backendId, this->preferableTarget);
}
virtual Ptr<BackendNode> tryAttach(const Ptr<BackendNode>& node) CV_OVERRIDE
@ -238,6 +236,12 @@ struct ReLUFunctor
explicit ReLUFunctor(float slope_=1.f) : slope(slope_) {}
bool supportBackend(int backendId, int)
{
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE ||
backendId == DNN_BACKEND_INFERENCE_ENGINE;
}
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
float s = slope;
@ -353,6 +357,12 @@ struct ReLU6Functor
CV_Assert(minValue <= maxValue);
}
bool supportBackend(int backendId, int)
{
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE ||
backendId == DNN_BACKEND_INFERENCE_ENGINE;
}
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
@ -445,6 +455,12 @@ struct TanHFunctor
{
typedef TanHLayer Layer;
bool supportBackend(int backendId, int)
{
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE ||
backendId == DNN_BACKEND_INFERENCE_ENGINE;
}
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
@ -496,8 +512,9 @@ struct TanHFunctor
#ifdef HAVE_INF_ENGINE
InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp)
{
CV_Error(Error::StsNotImplemented, "TanH");
return InferenceEngine::CNNLayerPtr();
lp.type = "TanH";
std::shared_ptr<InferenceEngine::CNNLayer> ieLayer(new InferenceEngine::CNNLayer(lp));
return ieLayer;
}
#endif // HAVE_INF_ENGINE
@ -508,6 +525,12 @@ struct SigmoidFunctor
{
typedef SigmoidLayer Layer;
bool supportBackend(int backendId, int)
{
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE ||
backendId == DNN_BACKEND_INFERENCE_ENGINE;
}
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
@ -574,6 +597,11 @@ struct ELUFunctor
explicit ELUFunctor() {}
bool supportBackend(int backendId, int)
{
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE;
}
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
@ -637,6 +665,11 @@ struct AbsValFunctor
{
typedef AbsLayer Layer;
bool supportBackend(int backendId, int)
{
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE;
}
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
@ -700,6 +733,11 @@ struct BNLLFunctor
{
typedef BNLLLayer Layer;
bool supportBackend(int backendId, int)
{
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE;
}
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
@ -750,6 +788,14 @@ struct PowerFunctor
explicit PowerFunctor(float power_ = 1.f, float scale_ = 1.f, float shift_ = 0.f)
: power(power_), scale(scale_), shift(shift_) {}
bool supportBackend(int backendId, int targetId)
{
if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
return (targetId != DNN_TARGET_OPENCL && targetId != DNN_TARGET_OPENCL_FP16) || power == 1.0;
else
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE;
}
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
float a = scale, b = shift, p = power;
@ -852,6 +898,11 @@ struct ChannelsPReLUFunctor
scale_umat = scale.getUMat(ACCESS_READ);
}
bool supportBackend(int backendId, int)
{
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE;
}
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
CV_Assert(scale.isContinuous() && scale.type() == CV_32F);

@ -96,7 +96,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_HALIDE && haveHalide() ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine();
}

@ -64,7 +64,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine();
}

@ -128,7 +128,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1 ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && axis == 1;
}

@ -101,9 +101,13 @@ void fastConv( const float* weights, size_t wstep, const float* bias,
if( relu )
{
r0 = relu[i];
r1 = relu[i+1];
r2 = relu[i+2];
r0 = relu[i]; r1 = relu[i+1]; r2 = relu[i+2];
if( i+2 >= outCn )
{
r2 = r1;
if( i+1 >= outCn )
r2 = r1 = r0;
}
vr0 = _mm_set1_ps(r0);
vr1 = _mm_set1_ps(r1);
vr2 = _mm_set1_ps(r2);

@ -90,7 +90,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_HALIDE && haveHalide() ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine();
}

@ -34,7 +34,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_HALIDE && haveHalide() &&
!poolPad.width && !poolPad.height;
}

@ -63,7 +63,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() &&
pnorm == 2 && !blobs.empty();
}

Some files were not shown because too many files have changed in this diff.
