Merge remote-tracking branch 'upstream/3.4' into merge-3.4

6 years ago · a74fe2ec01
parent a052567db8 eabbe38001
commit a74fe2ec01
36 changed files with 4716 additions and 60 deletions
--- a/3rdparty/libpng/CMakeLists.txt
+++ b/3rdparty/libpng/CMakeLists.txt
@ -46,6 +46,15 @@ if(";${CPU_BASELINE_FINAL};" MATCHES "SSE2"
  add_definitions(-DPNG_INTEL_SSE)
 endif()
 # Set definitions and sources for MIPS.
 # When "MSA" appears in CPU_BASELINE_FINAL, the two MSA sources are appended to
 # lib_srcs and PNG_MIPS_MSA_OPT is defined as 2; otherwise no MIPS sources are
 # added and PNG_MIPS_MSA_OPT is defined as 0.
 # NOTE(review): the value 2 presumably selects unconditional use of the MSA
 # code (no run-time CPU check) - confirm against pngpriv.h.
 if(";${CPU_BASELINE_FINAL};" MATCHES "MSA")
    list(APPEND lib_srcs mips/mips_init.c mips/filter_msa_intrinsics.c)
    add_definitions(-DPNG_MIPS_MSA_OPT=2)
    # NOTE(review): -Wshadow is silenced for the C flags here, presumably
    # because of shadowing locals in the MSA sources added above - confirm
    # before removing.
    ocv_warnings_disable(CMAKE_C_FLAGS -Wshadow)
 else()
    add_definitions(-DPNG_MIPS_MSA_OPT=0)
 endif()
 if(PPC64LE OR PPC64)
  # VSX3 features are backwards compatible
  if(";${CPU_BASELINE_FINAL};" MATCHES "VSX.*"
--- a/3rdparty/libpng/mips/filter_msa_intrinsics.c
+++ b/3rdparty/libpng/mips/filter_msa_intrinsics.c
@ -0,0 +1,808 @@
 /* filter_msa_intrinsics.c - MSA optimised filter functions
 *
 * Copyright (c) 2018 Cosmin Truta
 * Copyright (c) 2016 Glenn Randers-Pehrson
 * Written by Mandar Sahastrabuddhe, August 2016.
 *
 * This code is released under the libpng license.
 * For conditions of distribution and use, see the disclaimer
 * and license in png.h
 */
 #include <stdio.h>
 #include <stdint.h>
 #include "../pngpriv.h"
 #ifdef PNG_READ_SUPPORTED
 /* This code requires -mmsa on the command line: */
 #if PNG_MIPS_MSA_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */
 #include <msa.h>
 /* libpng row pointers are not necessarily aligned to any particular boundary,
 * however this code will only work with appropriate alignment. mips/mips_init.c
 * checks for this (it refuses to compile unless PNG_ALIGNED_MEMORY_SUPPORTED is
 * defined). This code uses variants of png_aligncast to avoid compiler
 * warnings.
 *
 * png_ptr(type,pointer)  - 'pointer' converted to 'type *'.
 * png_ptrc(type,pointer) - 'pointer' converted to 'const type *'.
 */
 #define png_ptr(type,pointer) png_aligncast(type *,pointer)
 #define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer)
 /* The following relies on a variable 'temp_pointer' being declared with type
 * 'type *' in the calling scope: the macro stores the converted pointer in it
 * and then yields the value it points at.  This is written this way just to
 * hide the GCC strict aliasing warning; note that the code is safe because
 * there never is an alias between the input and output pointers.
 */
 #define png_ldr(type,pointer)\
    (temp_pointer = png_ptr(type,pointer), *temp_pointer)
 #if PNG_MIPS_MSA_OPT > 0
 /* Scalar helper macros used by the MSA row filters in this file.
 *
 * MSA_SRLI_B(a, b) - shift each byte of vector 'a' right by 'b' bits; clang
 *                    builds use the __msa_srli_b intrinsic, other compilers
 *                    the vector '>>' operator.
 * LW(psrc)         - yield the 32-bit word stored at psrc.
 * SH/SW/SD         - store a 16/32/64-bit value at pdst.
 * SW_ZERO(pdst)    - store a zero word at pdst (pre-R6, non-clang only).
 *
 * Clang builds and MIPS ISA revision 6 or later use the plain lw/sh/sw
 * instructions, and a single sd for SD when __mips == 64.  Every other
 * configuration uses the ulw/ush/usw assembler macros and splits SD into two
 * SW stores (low word first).
 */
 #ifdef CLANG_BUILD
   #define MSA_SRLI_B(a, b)   __msa_srli_b((v16i8) a, b)
 #else
   #define MSA_SRLI_B(a, b)   (a >> b)
 #endif
 #if defined(CLANG_BUILD) || (__mips_isa_rev >= 6)
   #define LW(psrc)                              \
   ( {                                           \
       uint8_t *psrc_lw_m = (uint8_t *) (psrc);  \
       uint32_t val_m;                           \
                                                 \
       asm volatile (                            \
           "lw  %[val_m],  %[psrc_lw_m]  \n\t"   \
                                                 \
           : [val_m] "=r" (val_m)                \
           : [psrc_lw_m] "m" (*psrc_lw_m)        \
       );                                        \
                                                 \
       val_m;                                    \
   } )
   #define SH(val, pdst)                         \
   {                                             \
       uint8_t *pdst_sh_m = (uint8_t *) (pdst);  \
       uint16_t val_m = (val);                   \
                                                 \
       asm volatile (                            \
           "sh  %[val_m],  %[pdst_sh_m]  \n\t"   \
                                                 \
           : [pdst_sh_m] "=m" (*pdst_sh_m)       \
           : [val_m] "r" (val_m)                 \
       );                                        \
   }
   #define SW(val, pdst)                         \
   {                                             \
       uint8_t *pdst_sw_m = (uint8_t *) (pdst);  \
       uint32_t val_m = (val);                   \
                                                 \
       asm volatile (                            \
           "sw  %[val_m],  %[pdst_sw_m]  \n\t"   \
                                                 \
           : [pdst_sw_m] "=m" (*pdst_sw_m)       \
           : [val_m] "r" (val_m)                 \
       );                                        \
   }
 #else  /* !CLANG_BUILD && !(__mips_isa_rev >= 6) */
   #define LW(psrc)                              \
   ( {                                           \
       uint8_t *psrc_lw_m = (uint8_t *) (psrc);  \
       uint32_t val_m;                           \
                                                 \
       asm volatile (                            \
           "ulw  %[val_m],  %[psrc_lw_m]  \n\t"  \
                                                 \
           : [val_m] "=r" (val_m)                \
           : [psrc_lw_m] "m" (*psrc_lw_m)        \
       );                                        \
                                                 \
       val_m;                                    \
   } )
   #define SH(val, pdst)                         \
   {                                             \
       uint8_t *pdst_sh_m = (uint8_t *) (pdst);  \
       uint16_t val_m = (val);                   \
                                                 \
       asm volatile (                            \
           "ush  %[val_m],  %[pdst_sh_m]  \n\t"  \
                                                 \
           : [pdst_sh_m] "=m" (*pdst_sh_m)       \
           : [val_m] "r" (val_m)                 \
       );                                        \
   }
   #define SW(val, pdst)                         \
   {                                             \
       uint8_t *pdst_sw_m = (uint8_t *) (pdst);  \
       uint32_t val_m = (val);                   \
                                                 \
       asm volatile (                            \
           "usw  %[val_m],  %[pdst_sw_m]  \n\t"  \
                                                 \
           : [pdst_sw_m] "=m" (*pdst_sw_m)       \
           : [val_m] "r" (val_m)                 \
       );                                        \
   }
   #define SW_ZERO(pdst)                      \
   {                                          \
       uint8_t *pdst_m = (uint8_t *) (pdst);  \
                                              \
       asm volatile (                         \
           "usw  $0,  %[pdst_m]  \n\t"        \
                                              \
           : [pdst_m] "=m" (*pdst_m)          \
           :                                  \
       );                                     \
   }
 #endif  /* CLANG_BUILD || (__mips_isa_rev >= 6) */
 #if (defined(CLANG_BUILD) || (__mips_isa_rev >= 6)) && (__mips == 64)
   /* One 64-bit store. */
   #define SD(val, pdst)                         \
   {                                             \
       uint8_t *pdst_sd_m = (uint8_t *) (pdst);  \
       uint64_t val_m = (val);                   \
                                                 \
       asm volatile (                            \
           "sd  %[val_m],  %[pdst_sd_m]  \n\t"   \
                                                 \
           : [pdst_sd_m] "=m" (*pdst_sd_m)       \
           : [val_m] "r" (val_m)                 \
       );                                        \
   }
 #else
   /* Two 32-bit stores: bits 0-31 at pdst, bits 32-63 at pdst + 4. */
   #define SD(val, pdst)                                          \
   {                                                              \
       uint8_t *pdst_sd_m = (uint8_t *) (pdst);                   \
       uint32_t val0_m, val1_m;                                   \
                                                                  \
       val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF);          \
       val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF);  \
                                                                  \
       SW(val0_m, pdst_sd_m);                                     \
       SW(val1_m, pdst_sd_m + 4);                                 \
   }
 #endif
 /* Vector load/store helpers.
 *
 * LD_B(RTYPE, psrc)     - read one RTYPE vector through psrc cast to RTYPE *.
 * LD_B2 / LD_B4         - read 2 / 4 vectors from psrc, psrc + stride, ...
 * ST_B(RTYPE, in, pdst) - write one RTYPE vector through pdst cast to RTYPE *.
 * ST_B2 / ST_B4         - write 2 / 4 vectors to pdst, pdst + stride, ...
 * The *_UB variants fix RTYPE to v16u8.
 *
 * 'stride' is added to the pointer as passed in, so its unit is the pointee
 * size of that pointer (the callers in this file pass byte pointers and 16).
 * LD_B and ST_B expand to unparenthesised expressions; the multi-vector forms
 * expand to brace blocks and are therefore statements, not expressions.
 */
 #define LD_B(RTYPE, psrc) *((RTYPE *) (psrc))
 #define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
 #define LD_B2(RTYPE, psrc, stride, out0, out1)  \
 {                                               \
    out0 = LD_B(RTYPE, (psrc));                 \
    out1 = LD_B(RTYPE, (psrc) + stride);        \
 }
 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3)   \
 {                                                            \
    LD_B2(RTYPE, (psrc), stride, out0, out1);                \
    LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3);  \
 }
 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
 #define ST_B(RTYPE, in, pdst) *((RTYPE *) (pdst)) = (in)
 #define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
 #define ST_B2(RTYPE, in0, in1, pdst, stride)  \
 {                                             \
    ST_B(RTYPE, in0, (pdst));                 \
    ST_B(RTYPE, in1, (pdst) + stride);        \
 }
 #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
 #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
 {                                                         \
    ST_B2(RTYPE, in0, in1, (pdst), stride);               \
    ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
 }
 #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
 /* Pairwise additions: the inputs are consumed two at a time and each pair's
 * sum is assigned to the matching output (out0 = in0 + in1, out1 = in2 + in3,
 * and so on).  ADD3 and ADD4 are built on ADD2.  The arguments are pasted into
 * the expansion without parentheses, so pass simple operands.
 */
 #define ADD2(in0, in1, in2, in3, out0, out1)  \
 {                                             \
    out0 = in0 + in1;                         \
    out1 = in2 + in3;                         \
 }
 #define ADD3(in0, in1, in2, in3, in4, in5,  \
             out0, out1, out2)              \
 {                                           \
    ADD2(in0, in1, in2, in3, out0, out1);   \
    out2 = in4 + in5;                       \
 }
 #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3)                  \
 {                                                     \
    ADD2(in0, in1, in2, in3, out0, out1);             \
    ADD2(in4, in5, in6, in7, out2, out3);             \
 }
 /* ILVR_B2: two independent __msa_ilvr_b byte interleaves; out0 is built from
 * (in0, in1) and out1 from (in2, in3), each result cast to RTYPE.
 * NOTE(review): presumably ilvr.b interleaves the low eight bytes of its two
 * operands - confirm against the MSA reference.
 */
 #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
 {                                                           \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3);  \
 }
 #define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
 /* HSUB_UB2: applies __msa_hsub_u_h to each input, passing the same vector as
 * both operands, so every halfword of the result is a difference between two
 * neighbouring bytes of that one input.
 * NOTE(review): presumably (odd byte) - (even byte) - confirm against the MSA
 * reference.
 */
 #define HSUB_UB2(RTYPE, in0, in1, out0, out1)                 \
 {                                                             \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1);  \
 }
 #define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
 /* SLDI_B2_0: runs __msa_sldi_b on in0 and in1 with an all-zero vector as the
 * first operand and the constant slide_val as the byte count.
 * NOTE(review): presumably this moves each input down by slide_val bytes and
 * fills the vacated top bytes with zero - confirm against the MSA reference.
 * The macro declares a block-local 'zero_m'.
 */
 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)                 \
 {                                                                         \
    v16i8 zero_m = { 0 };                                                 \
    out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val);  \
    out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val);  \
 }
 #define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
 /* SLDI_B3_0: three-input form of SLDI_B2_0.  in0 and in1 are handled by
 * SLDI_B2_0; in2 is slid here with the same all-zero first operand and the
 * same slide_val.  Results are cast to RTYPE.
 *
 * The zero vector used for in2 is named 'zero3_m' so that the 'zero_m'
 * declared inside the nested SLDI_B2_0 expansion does not shadow it.
 */
 #define SLDI_B3_0(RTYPE, in0, in1, in2, out0, out1, out2,  slide_val)      \
 {                                                                          \
    v16i8 zero3_m = { 0 };                                                 \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);                     \
    out2 = (RTYPE) __msa_sldi_b((v16i8) zero3_m, (v16i8) in2, slide_val);  \
 }
 #define SLDI_B3_0_UB(...) SLDI_B3_0(v16u8, __VA_ARGS__)
 /* ILVEV_W2: two independent __msa_ilvev_w word interleaves.  Note the operand
 * order: the intrinsic receives (in1, in0) for out0 and (in3, in2) for out1.
 * NOTE(review): presumably this places the even-numbered 32-bit words of in0
 * and in1 (resp. in2 and in3) alternately in the result - confirm against the
 * MSA reference.
 */
 #define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
 {                                                            \
    out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0);  \
    out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2);  \
 }
 #define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
 /* ADD_ABS_H3: feeds each of in0..in2 to __msa_add_a_h together with an
 * all-zero vector and assigns the results to out0..out2.
 * NOTE(review): add_a.h presumably adds the absolute values of its operands,
 * which with a zero operand yields |inN| per halfword - confirm against the
 * MSA reference.
 *
 * The temporary is named 'zero_m' (the suffix the other helper macros use for
 * their locals) so that it cannot shadow a caller's own 'zero' variable.
 */
 #define ADD_ABS_H3(RTYPE, in0, in1, in2, out0, out1, out2)  \
 {                                                           \
    RTYPE zero_m = { 0 };                                   \
                                                            \
    out0 = __msa_add_a_h((v8i16) zero_m, in0);              \
    out1 = __msa_add_a_h((v8i16) zero_m, in1);              \
    out2 = __msa_add_a_h((v8i16) zero_m, in2);              \
 }
 #define ADD_ABS_H3_SH(...) ADD_ABS_H3(v8i16, __VA_ARGS__)
 /* VSHF_B2: two independent __msa_vshf_b byte shuffles.  out0 is selected from
 * (in1, in0) under control vector mask0, out1 from (in3, in2) under mask1.
 * NOTE(review): presumably control indices 0-15 pick bytes of in0/in2 and
 * 16-31 pick bytes of in1/in3 - confirm against the MSA reference.
 */
 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
 {                                                                          \
    out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
 }
 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
 /* CMP_AND_SELECT: two-stage compare-and-select over three halfword vectors
 * (inp0, inp1, inp2) and three byte vectors (inp3, inp4, inp5); the selected
 * bytes are added to out0.
 *
 * Stage 1: where inp1 < inp0 (unsigned halfword compare) inp0 is replaced by
 *          inp1, and inp4 is kept; elsewhere inp4 is replaced by inp3.
 * Stage 2: where inp2 < (updated) inp0, inp4 is replaced by inp5.
 * Finally out0 += inp4.
 *
 * The halfword comparison masks are narrowed to byte masks with __msa_pckev_b
 * before they are applied to the byte vectors.  inp0 and inp4 are overwritten
 * as a side effect.
 * NOTE(review): the description of __msa_bmnz_v as "take bits of the second
 * operand where the mask is set" follows the MSA definition of bmnz.v -
 * confirm against the MSA reference.
 */
 #define CMP_AND_SELECT(inp0, inp1, inp2, inp3, inp4, inp5, out0)              \
 {                                                                             \
   v8i16 _sel_h0, _sel_h1;                                                    \
   v16u8 _sel_b0, _sel_b1;                                                    \
   _sel_h0 = (v8i16) __msa_clt_u_h((v8u16) inp1, (v8u16) inp0);               \
   _sel_b0 = (v16u8) __msa_pckev_b((v16i8) _sel_h0, (v16i8) _sel_h0);         \
   inp0 = (v8i16) __msa_bmnz_v((v16u8) inp0, (v16u8) inp1, (v16u8) _sel_h0);  \
   inp4 = (v16u8) __msa_bmnz_v(inp3, inp4, _sel_b0);                          \
   _sel_h1 = (v8i16) __msa_clt_u_h((v8u16) inp2, (v8u16) inp0);               \
   _sel_b1 = (v16u8) __msa_pckev_b((v16i8) _sel_h1, (v16i8) _sel_h1);         \
   inp4 = (v16u8) __msa_bmnz_v(inp4, inp5, _sel_b1);                          \
   out0 += inp4;                                                              \
 }
 /* Reverse the PNG "Up" filter in place: every byte of 'row' has the byte at
 * the same offset in 'prev_row' added to it.
 *
 * row_info - supplies rowbytes, the number of bytes in the row.
 * row      - filtered bytes on entry, reconstructed bytes on return.
 * prev_row - the row above (read only).
 *
 * Whole 64-byte blocks are handled four vectors at a time.  A tail of 1..63
 * bytes is rounded up to a whole number of 16-byte vectors, so up to 15 bytes
 * beyond rowbytes are read from both rows and written to 'row'.
 * NOTE(review): presumably the row buffers are padded for this - confirm
 * against the row allocator.
 */
 void png_read_filter_row_up_msa(png_row_infop row_info, png_bytep row,
                                 png_const_bytep prev_row)
 {
   const size_t rowbytes = row_info->rowbytes;
   const size_t tail = rowbytes & 0x3F;
   size_t blocks = rowbytes >> 6;
   png_bytep cur = row;
   png_const_bytep above = prev_row;
   v16u8 cur0, cur1, cur2, cur3, up0, up1, up2, up3;
   /* Full 64-byte blocks. */
   while (blocks != 0)
   {
      LD_UB4(cur, 16, cur0, cur1, cur2, cur3);
      LD_UB4(above, 16, up0, up1, up2, up3);
      ADD4(cur0, up0, cur1, up1, cur2, up2, cur3, up3,
           cur0, cur1, cur2, cur3);
      ST_UB4(cur0, cur1, cur2, cur3, cur, 16);
      cur += 64;
      above += 64;
      blocks--;
   }
   /* Tail: number of 16-byte vectors needed to cover the remaining bytes. */
   switch ((tail + 15) >> 4)
   {
      case 4:
         LD_UB4(cur, 16, cur0, cur1, cur2, cur3);
         LD_UB4(above, 16, up0, up1, up2, up3);
         ADD4(cur0, up0, cur1, up1, cur2, up2, cur3, up3,
              cur0, cur1, cur2, cur3);
         ST_UB4(cur0, cur1, cur2, cur3, cur, 16);
         break;
      case 3:
         LD_UB2(cur, 16, cur0, cur1);
         LD_UB2(above, 16, up0, up1);
         cur2 = LD_UB(cur + 32);
         up2 = LD_UB(above + 32);
         ADD3(cur0, up0, cur1, up1, cur2, up2, cur0, cur1, cur2);
         ST_UB2(cur0, cur1, cur, 16);
         ST_UB(cur2, cur + 32);
         break;
      case 2:
         LD_UB2(cur, 16, cur0, cur1);
         LD_UB2(above, 16, up0, up1);
         ADD2(cur0, up0, cur1, up1, cur0, cur1);
         ST_UB2(cur0, cur1, cur, 16);
         break;
      case 1:
         cur0 = LD_UB(cur);
         up0 = LD_UB(above);
         cur0 += up0;
         ST_UB(cur0, cur);
         break;
      default: /* rowbytes is a multiple of 64: nothing left */
         break;
   }
 }
 /* Reverse the PNG "Sub" filter in place for rows with 4 bytes per pixel:
 * every pixel after the first has the reconstructed pixel to its left added
 * to it; the first pixel is left as it is.
 *
 * row_info - supplies rowbytes, the number of bytes in the row.
 * row      - filtered bytes on entry, reconstructed bytes on return.
 * prev_row - not used by this filter.
 *
 * Each loop pass loads 16 bytes and stores 16 bytes, and passes continue until
 * at least rowbytes - 4 bytes are covered, so the last pass can touch bytes
 * beyond rowbytes.
 * NOTE(review): presumably the row buffer is padded for this - confirm
 * against the row allocator.
 */
 void png_read_filter_row_sub4_msa(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
 {
   size_t count;
   size_t istop = row_info->rowbytes;
   png_bytep src = row;
   png_bytep nxt = row + 4;
   int32_t inp0;
   v16u8 src0, src1, src2, src3, src4;
   v16u8 dst0, dst1;
   v16u8 zero = { 0 };
   /* The Sub filter never looks at the previous row. */
   (void) prev_row;
   istop -= 4;
   /* src0 carries the most recently reconstructed pixel in its low word;
    * it starts out as the (unfiltered) first pixel.
    */
   inp0 = LW(src);
   src += 4;
   src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0);
   for (count = 0; count < istop; count += 16)
   {
      /* src1..src4: the next four filtered pixels, each moved into the low
       * word of its own vector.
       */
      src1 = LD_UB(src);
      src += 16;
      src2 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 4);
      src3 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 8);
      src4 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 12);
      /* Running sum: each pixel adds the reconstructed pixel before it. */
      src1 += src0;
      src2 += src1;
      src3 += src2;
      src4 += src3;
      src0 = src4;
      /* Gather the four low words into one vector and store it. */
      ILVEV_W2_UB(src1, src2, src3, src4, dst0, dst1);
      dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
      ST_UB(dst0, nxt);
      nxt += 16;
   }
 }
 /* Reverse the PNG "Sub" filter in place for rows with 3 bytes per pixel:
 * every pixel after the first has the reconstructed pixel to its left added
 * to it; the first pixel is left as it is.
 *
 * row_info - supplies rowbytes, the number of bytes in the row.
 * row      - filtered bytes on entry, reconstructed bytes on return.
 * prev_row - not used by this filter.
 *
 * Each loop pass loads 16 bytes, reconstructs four pixels and stores 12 bytes
 * (one SD plus one SW).  Passes continue until at least rowbytes - 3 bytes are
 * covered, so the last pass can touch bytes beyond rowbytes.
 * NOTE(review): presumably the row buffer is padded for this - confirm
 * against the row allocator.
 */
 void png_read_filter_row_sub3_msa(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
 {
   size_t count;
   size_t istop = row_info->rowbytes;
   png_bytep src = row;
   png_bytep nxt = row + 3;
   int64_t out0;
   int32_t inp0, out1;
   v16u8 src0, src1, src2, src3, src4, dst0, dst1;
   v16u8 zero = { 0 };
   /* mask0 packs bytes 0-2 of two vectors side by side (6 bytes); mask1 then
    * joins two such 6-byte groups into 12 consecutive bytes.
    */
   v16i8 mask0 = { 0, 1, 2, 16, 17, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
   v16i8 mask1 = { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 0, 0, 0, 0 };
   /* The Sub filter never looks at the previous row. */
   (void) prev_row;
   istop -= 3;
   /* src0 carries the most recently reconstructed pixel in its low three
    * bytes; it starts out as the (unfiltered) first pixel.  LW reads four
    * bytes, the fourth is never selected by mask0.
    */
   inp0 = LW(src);
   src += 3;
   src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0);
   for (count = 0; count < istop; count += 12)
   {
      /* src1..src4: the next four filtered pixels, each moved to the bottom
       * of its own vector.
       */
      src1 = LD_UB(src);
      src += 12;
      src2 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 3);
      src3 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 6);
      src4 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 9);
      /* Running sum: each pixel adds the reconstructed pixel before it. */
      src1 += src0;
      src2 += src1;
      src3 += src2;
      src4 += src3;
      src0 = src4;
      /* Pack the four 3-byte results into 12 bytes and store 8 + 4. */
      VSHF_B2_UB(src1, src2, src3, src4, mask0, mask0, dst0, dst1);
      dst0 = (v16u8) __msa_vshf_b(mask1, (v16i8) dst1, (v16i8) dst0);
      out0 = __msa_copy_s_d((v2i64) dst0, 0);
      out1 = __msa_copy_s_w((v4i32) dst0, 2);
      SD(out0, nxt);
      nxt += 8;
      SW(out1, nxt);
      nxt += 4;
   }
 }
 /* Reverse the PNG "Average" filter in place for rows with 4 bytes per pixel.
 *
 * row_info - supplies rowbytes, the number of bytes in the row.
 * row      - filtered bytes on entry, reconstructed bytes on return.
 * prev_row - the row above (read only).
 *
 * The first pixel has no left neighbour and becomes row + (prev_row >> 1).
 * The loop then rebuilds four pixels per pass, 16 bytes at a time, until at
 * least rowbytes - 4 bytes are covered; the last pass can therefore read and
 * write beyond rowbytes.
 * NOTE(review): presumably the row buffers are padded for this - confirm
 * against the row allocator.
 */
 void png_read_filter_row_avg4_msa(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
 {
   size_t i;
   png_bytep src = row;
   png_bytep nxt = row;
   png_const_bytep pp = prev_row;
   size_t istop = row_info->rowbytes - 4;
   int32_t inp0, inp1, out0;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, dst0, dst1;
   v16u8 zero = { 0 };
   /* First pixel: halve the pixel above and add it to the filtered pixel. */
   inp0 = LW(pp);
   pp += 4;
   inp1 = LW(src);
   src += 4;
   src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0);
   src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1);
   src0 = (v16u8) MSA_SRLI_B(src0, 1);
   src1 += src0;
   out0 = __msa_copy_s_w((v4i32) src1, 0);
   SW(out0, nxt);
   nxt += 4;
   /* From here on src1 holds the most recently reconstructed pixel. */
   for (i = 0; i < istop; i += 16)
   {
      /* src2..src5: four pixels of the previous row, src6..src9: the four
       * filtered pixels below them, each slid to the bottom of its vector.
       */
      src2 = LD_UB(pp);
      pp += 16;
      src6 = LD_UB(src);
      src += 16;
      SLDI_B2_0_UB(src2, src6, src3, src7, 4);
      SLDI_B2_0_UB(src2, src6, src4, src8, 8);
      SLDI_B2_0_UB(src2, src6, src5, src9, 12);
      /* Serial chain: average (above, left) with __msa_ave_u_b, add it to the
       * filtered pixel; the result is the "left" input of the next pixel.
       */
      src2 = __msa_ave_u_b(src2, src1);
      src6 += src2;
      src3 = __msa_ave_u_b(src3, src6);
      src7 += src3;
      src4 = __msa_ave_u_b(src4, src7);
      src8 += src4;
      src5 = __msa_ave_u_b(src5, src8);
      src9 += src5;
      src1 = src9;
      /* Gather the low word of each result vector and store 16 bytes. */
      ILVEV_W2_UB(src6, src7, src8, src9, dst0, dst1);
      dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
      ST_UB(dst0, nxt);
      nxt += 16;
   }
 }
 /* Reverse the PNG "Average" filter in place for rows with 3 bytes per pixel.
 *
 * row_info - supplies rowbytes, the number of bytes in the row.
 * row      - filtered bytes on entry, reconstructed bytes on return.
 * prev_row - the row above (read only).
 *
 * The first pixel has no left neighbour and becomes row + (prev_row >> 1).
 * The loop then rebuilds four pixels per pass: it loads 16 bytes from each
 * row and stores 12.  Passes continue until at least rowbytes - 3 bytes are
 * covered, so the last pass can read and write beyond rowbytes.
 * NOTE(review): presumably the row buffers are padded for this - confirm
 * against the row allocator.
 */
 void png_read_filter_row_avg3_msa(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
 {
   size_t i;
   png_bytep src = row;
   png_bytep nxt = row;
   png_const_bytep pp = prev_row;
   size_t istop = row_info->rowbytes - 3;
   int64_t out0;
   int32_t inp0, inp1, out1;
   int16_t out2;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, dst0, dst1;
   v16u8 zero = { 0 };
   /* mask0 packs bytes 0-2 of two vectors side by side (6 bytes); mask1 then
    * joins two such 6-byte groups into 12 consecutive bytes.
    */
   v16i8 mask0 = { 0, 1, 2, 16, 17, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
   v16i8 mask1 = { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 0, 0, 0, 0 };
   /* First pixel: halve the pixel above and add it to the filtered pixel.
    * LW reads four bytes although only three belong to the pixel.
    */
   inp0 = LW(pp);
   pp += 3;
   inp1 = LW(src);
   src += 3;
   src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0);
   src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1);
   src0 = (v16u8) MSA_SRLI_B(src0, 1);
   src1 += src0;
   /* Write exactly three bytes: one halfword store plus one byte. */
   out2 = __msa_copy_s_h((v8i16) src1, 0);
   SH(out2, nxt);
   nxt += 2;
   nxt[0] = src1[2];
   nxt++;
   /* From here on src1 holds the most recently reconstructed pixel. */
   for (i = 0; i < istop; i += 12)
   {
      /* src2..src5: four pixels of the previous row, src6..src9: the four
       * filtered pixels below them, each slid to the bottom of its vector.
       */
      src2 = LD_UB(pp);
      pp += 12;
      src6 = LD_UB(src);
      src += 12;
      SLDI_B2_0_UB(src2, src6, src3, src7, 3);
      SLDI_B2_0_UB(src2, src6, src4, src8, 6);
      SLDI_B2_0_UB(src2, src6, src5, src9, 9);
      /* Serial chain: average (above, left) with __msa_ave_u_b, add it to the
       * filtered pixel; the result is the "left" input of the next pixel.
       */
      src2 = __msa_ave_u_b(src2, src1);
      src6 += src2;
      src3 = __msa_ave_u_b(src3, src6);
      src7 += src3;
      src4 = __msa_ave_u_b(src4, src7);
      src8 += src4;
      src5 = __msa_ave_u_b(src5, src8);
      src9 += src5;
      src1 = src9;
      /* Pack the four 3-byte results into 12 bytes and store 8 + 4. */
      VSHF_B2_UB(src6, src7, src8, src9, mask0, mask0, dst0, dst1);
      dst0 = (v16u8) __msa_vshf_b(mask1, (v16i8) dst1, (v16i8) dst0);
      out0 = __msa_copy_s_d((v2i64) dst0, 0);
      out1 = __msa_copy_s_w((v4i32) dst0, 2);
      SD(out0, nxt);
      nxt += 8;
      SW(out1, nxt);
      nxt += 4;
   }
 }
 /* Reverse the PNG "Paeth" filter in place for rows with 4 bytes per pixel.
 *
 * row_info - supplies rowbytes, the number of bytes in the row.
 * row      - filtered bytes on entry, reconstructed bytes on return.
 * prev_row - the row above (read only).
 *
 * The first pixel has no left or upper-left neighbour, so the pixel above is
 * simply added to it.  The loop then rebuilds four pixels per pass, 16 bytes
 * at a time, until at least rowbytes - 4 bytes are covered; the last pass can
 * therefore read and write beyond rowbytes.
 * NOTE(review): presumably the row buffers are padded for this - confirm
 * against the row allocator.
 */
 void png_read_filter_row_paeth4_msa(png_row_infop row_info,
                                    png_bytep row,
                                    png_const_bytep prev_row)
 {
   /* size_t, like the byte counters of the sub/avg filters in this file: a
    * 32-bit signed counter cannot hold every value of row_info->rowbytes.
    */
   size_t count, rp_end;
   png_bytep nxt;
   png_const_bytep prev_nxt;
   int32_t inp0, inp1, res0;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
   v16u8 src10, src11, src12, src13, dst0, dst1;
   v8i16 vec0, vec1, vec2;
   v16u8 zero = { 0 };
   nxt = row;
   prev_nxt = prev_row;
   /* First pixel: filtered pixel + pixel above. */
   inp0 = LW(nxt);
   inp1 = LW(prev_nxt);
   prev_nxt += 4;
   src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0);
   src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1);
   src1 += src0;
   res0 = __msa_copy_s_w((v4i32) src1, 0);
   SW(res0, nxt);
   nxt += 4;
   /* Remainder */
   rp_end = row_info->rowbytes - 4;
   for (count = 0; count < rp_end; count += 16)
   {
      /* src2..src5:   pixels above (prev_row, one pixel ahead of prev_row),
       * src6..src9:   pixels above-left,
       * src10..src13: filtered pixels to rebuild,
       * src1:         most recently reconstructed pixel (left neighbour).
       */
      src2 = LD_UB(prev_nxt);
      prev_nxt += 16;
      src6 = LD_UB(prev_row);
      prev_row += 16;
      src10 = LD_UB(nxt);
      SLDI_B3_0_UB(src2, src6, src10, src3, src7, src11, 4);
      SLDI_B3_0_UB(src2, src6, src10, src4, src8, src12, 8);
      SLDI_B3_0_UB(src2, src6, src10, src5, src9, src13, 12);
      /* Per pixel: vec0/vec1 are byte differences against the above-left
       * pixel, vec2 their sum; after ADD_ABS_H3_SH the three serve as the
       * distances CMP_AND_SELECT compares to pick left, above or above-left,
       * which it then adds to the filtered pixel.
       */
      ILVR_B2_SH(src2, src6, src1, src6, vec0, vec1);
      HSUB_UB2_SH(vec0, vec1, vec0, vec1);
      vec2 = vec0 + vec1;
      ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
      CMP_AND_SELECT(vec0, vec1, vec2, src1, src2, src6, src10);
      ILVR_B2_SH(src3, src7, src10, src7, vec0, vec1);
      HSUB_UB2_SH(vec0, vec1, vec0, vec1);
      vec2 = vec0 + vec1;
      ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
      CMP_AND_SELECT(vec0, vec1, vec2, src10, src3, src7, src11);
      ILVR_B2_SH(src4, src8, src11, src8, vec0, vec1);
      HSUB_UB2_SH(vec0, vec1, vec0, vec1);
      vec2 = vec0 + vec1;
      ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
      CMP_AND_SELECT(vec0, vec1, vec2, src11, src4, src8, src12);
      ILVR_B2_SH(src5, src9, src12, src9, vec0, vec1);
      HSUB_UB2_SH(vec0, vec1, vec0, vec1);
      vec2 = vec0 + vec1;
      ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
      CMP_AND_SELECT(vec0, vec1, vec2, src12, src5, src9, src13);
      src1 = src13;
      /* Gather the low word of each result vector and store 16 bytes. */
      ILVEV_W2_UB(src10, src11, src12, src1, dst0, dst1);
      dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
      ST_UB(dst0, nxt);
      nxt += 16;
   }
 }
 /* Reverse the PNG "Paeth" filter in place for rows with 3 bytes per pixel.
 *
 * row_info - supplies rowbytes, the number of bytes in the row.
 * row      - filtered bytes on entry, reconstructed bytes on return.
 * prev_row - the row above (read only).
 *
 * The first pixel has no left or upper-left neighbour, so the pixel above is
 * simply added to it.  The loop then rebuilds four pixels per pass: it loads
 * 16 bytes per vector and stores 12.  Passes continue until at least
 * rowbytes - 3 bytes are covered, so the last pass can read and write beyond
 * rowbytes.
 * NOTE(review): presumably the row buffers are padded for this - confirm
 * against the row allocator.
 */
 void png_read_filter_row_paeth3_msa(png_row_infop row_info,
                                    png_bytep row,
                                    png_const_bytep prev_row)
 {
   /* size_t, like the byte counters of the sub/avg filters in this file: a
    * 32-bit signed counter cannot hold every value of row_info->rowbytes.
    */
   size_t count, rp_end;
   png_bytep nxt;
   png_const_bytep prev_nxt;
   int64_t out0;
   int32_t inp0, inp1, out1;
   int16_t out2;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, dst0, dst1;
   v16u8 src10, src11, src12, src13;
   v8i16 vec0, vec1, vec2;
   v16u8 zero = { 0 };
   /* mask0 packs bytes 0-2 of two vectors side by side (6 bytes); mask1 then
    * joins two such 6-byte groups into 12 consecutive bytes.
    */
   v16i8 mask0 = { 0, 1, 2, 16, 17, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
   v16i8 mask1 = { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 0, 0, 0, 0 };
   nxt = row;
   prev_nxt = prev_row;
   /* First pixel: filtered pixel + pixel above, written as exactly three
    * bytes (one halfword store plus one byte).
    */
   inp0 = LW(nxt);
   inp1 = LW(prev_nxt);
   prev_nxt += 3;
   src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0);
   src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1);
   src1 += src0;
   out2 = __msa_copy_s_h((v8i16) src1, 0);
   SH(out2, nxt);
   nxt += 2;
   nxt[0] = src1[2];
   nxt++;
   /* Remainder */
   rp_end = row_info->rowbytes - 3;
   for (count = 0; count < rp_end; count += 12)
   {
      /* src2..src5:   pixels above (prev_row, one pixel ahead of prev_row),
       * src6..src9:   pixels above-left,
       * src10..src13: filtered pixels to rebuild,
       * src1:         most recently reconstructed pixel (left neighbour).
       */
      src2 = LD_UB(prev_nxt);
      prev_nxt += 12;
      src6 = LD_UB(prev_row);
      prev_row += 12;
      src10 = LD_UB(nxt);
      SLDI_B3_0_UB(src2, src6, src10, src3, src7, src11, 3);
      SLDI_B3_0_UB(src2, src6, src10, src4, src8, src12, 6);
      SLDI_B3_0_UB(src2, src6, src10, src5, src9, src13, 9);
      /* Per pixel: vec0/vec1 are byte differences against the above-left
       * pixel, vec2 their sum; after ADD_ABS_H3_SH the three serve as the
       * distances CMP_AND_SELECT compares to pick left, above or above-left,
       * which it then adds to the filtered pixel.
       */
      ILVR_B2_SH(src2, src6, src1, src6, vec0, vec1);
      HSUB_UB2_SH(vec0, vec1, vec0, vec1);
      vec2 = vec0 + vec1;
      ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
      CMP_AND_SELECT(vec0, vec1, vec2, src1, src2, src6, src10);
      ILVR_B2_SH(src3, src7, src10, src7, vec0, vec1);
      HSUB_UB2_SH(vec0, vec1, vec0, vec1);
      vec2 = vec0 + vec1;
      ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
      CMP_AND_SELECT(vec0, vec1, vec2, src10, src3, src7, src11);
      ILVR_B2_SH(src4, src8, src11, src8, vec0, vec1);
      HSUB_UB2_SH(vec0, vec1, vec0, vec1);
      vec2 = vec0 + vec1;
      ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
      CMP_AND_SELECT(vec0, vec1, vec2, src11, src4, src8, src12);
      ILVR_B2_SH(src5, src9, src12, src9, vec0, vec1);
      HSUB_UB2_SH(vec0, vec1, vec0, vec1);
      vec2 = vec0 + vec1;
      ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
      CMP_AND_SELECT(vec0, vec1, vec2, src12, src5, src9, src13);
      src1 = src13;
      /* Pack the four 3-byte results into 12 bytes and store 8 + 4. */
      VSHF_B2_UB(src10, src11, src12, src13, mask0, mask0, dst0, dst1);
      dst0 = (v16u8) __msa_vshf_b(mask1, (v16i8) dst1, (v16i8) dst0);
      out0 = __msa_copy_s_d((v2i64) dst0, 0);
      out1 = __msa_copy_s_w((v4i32) dst0, 2);
      SD(out0, nxt);
      nxt += 8;
      SW(out1, nxt);
      nxt += 4;
   }
 }
 #endif /* PNG_MIPS_MSA_OPT > 0 */
 #endif /* PNG_MIPS_MSA_IMPLEMENTATION == 1 (intrinsics) */
 #endif /* READ */
--- a/3rdparty/libpng/mips/mips_init.c
+++ b/3rdparty/libpng/mips/mips_init.c
@ -0,0 +1,127 @@
 /* mips_init.c - MSA optimised filter functions
 *
 * Copyright (c) 2018 Cosmin Truta
 * Copyright (c) 2016 Glenn Randers-Pehrson
 * Written by Mandar Sahastrabuddhe, 2016.
 *
 * This code is released under the libpng license.
 * For conditions of distribution and use, see the disclaimer
 * and license in png.h
 */
 /* Below, after checking __linux__, various non-C90 POSIX 1003.1 functions are
 * called.
 */
 #define _POSIX_SOURCE 1
 #include <stdio.h>
 #include "../pngpriv.h"
 #ifdef PNG_READ_SUPPORTED
 #if PNG_MIPS_MSA_OPT > 0
 #ifdef PNG_MIPS_MSA_CHECK_SUPPORTED /* Do run-time checks */
 /* WARNING: it is strongly recommended that you do not build libpng with
 * run-time checks for CPU features if at all possible.  In the case of the MIPS
 * MSA instructions there is no processor-specific way of detecting the
 * presence of the required support, therefore run-time detection is extremely
 * OS specific.
 *
 * You may set the macro PNG_MIPS_MSA_FILE to the file name of a file containing
 * a fragment of C source code which defines the png_have_msa function.  There
 * are a number of implementations in contrib/mips-msa, but the only one that
 * has partial support is contrib/mips-msa/linux.c - a generic Linux
 * implementation which reads /proc/cpuinfo.
 */
 #ifndef PNG_MIPS_MSA_FILE
 #  ifdef __linux__
 #     define PNG_MIPS_MSA_FILE "contrib/mips-msa/linux.c"
 #  endif
 #endif
 #ifdef PNG_MIPS_MSA_FILE
 #include <signal.h> /* for sig_atomic_t */
 static int png_have_msa(png_structp png_ptr);
 #include PNG_MIPS_MSA_FILE
 #else  /* PNG_MIPS_MSA_FILE */
 #  error "PNG_MIPS_MSA_FILE undefined: no support for run-time MIPS MSA checks"
 #endif /* PNG_MIPS_MSA_FILE */
 #endif /* PNG_MIPS_MSA_CHECK_SUPPORTED */
 #ifndef PNG_ALIGNED_MEMORY_SUPPORTED
 #  error "ALIGNED_MEMORY is required; set: -DPNG_ALIGNED_MEMORY_SUPPORTED"
 #endif
 /* Install the MSA-optimised row un-filter routines into the read struct.
  *
  * pp:  read structure whose 'options' field is consulted and whose
  *      'read_filter' table is overwritten.
  * bpp: pixel size selector (presumably bytes per pixel - confirm against the
  *      caller); only the values 3 and 4 get MSA sub/avg/paeth filters.
  *
  * When PNG_MIPS_MSA_API_SUPPORTED is not defined the whole body is compiled
  * out and the function does nothing.
  */
 void
 png_init_filter_functions_msa(png_structp pp, unsigned int bpp)
 {
   /* The switch statement is compiled in for MIPS_MSA_API, the call to
    * png_have_msa is compiled in for MIPS_MSA_CHECK. If both are defined
    * the check is only performed if the API has not set the MSA option on
    * or off explicitly. In this case the check controls what happens.
    */
 #ifdef PNG_MIPS_MSA_API_SUPPORTED
   /* Two option bits starting at bit PNG_MIPS_MSA select the MSA state. */
   switch ((pp->options >> PNG_MIPS_MSA) & 3)
   {
      case PNG_OPTION_UNSET:
         /* Allow the run-time check to execute if it has been enabled -
          * thus both API and CHECK can be turned on.  If the check is not
          * compiled in, this case simply breaks out of the switch and the
          * MSA filters are installed below.
          */
 #ifdef PNG_MIPS_MSA_CHECK_SUPPORTED
         {
            /* Cached result of the probe: -1 = not yet checked, 0 = MSA
             * available, non-zero = MSA unavailable.
             */
            static volatile sig_atomic_t no_msa = -1; /* not checked */
            if (no_msa < 0)
               no_msa = !png_have_msa(pp);
            if (no_msa)
               return;
         }
 #endif /* PNG_MIPS_MSA_CHECK_SUPPORTED */
         break;
      default: /* OFF or INVALID */
         return;
      case PNG_OPTION_ON:
         /* Option turned on */
         break;
   }
   /* IMPORTANT: any new external functions used here must be declared using
    * PNG_INTERNAL_FUNCTION in ../pngpriv.h.  This is required so that the
    * 'prefix' option to configure works:
    *
    *    ./configure --with-libpng-prefix=foobar_
    *
    * Verify you have got this right by running the above command, doing a build
    * and examining pngprefix.h; it must contain a #define for every external
    * function you add.  (Notice that this happens automatically for the
    * initialization function.)
    */
   /* The 'up' filter is installed regardless of bpp. */
   pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up_msa;
   if (bpp == 3)
   {
      pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_msa;
      pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_msa;
      pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth3_msa;
   }
   else if (bpp == 4)
   {
      pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub4_msa;
      pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_msa;
      pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth4_msa;
   }
   /* Any other bpp keeps whatever sub/avg/paeth entries were already set. */
 #else
   /* API support compiled out: silence unused-parameter warnings. */
   (void)pp;
   (void)bpp;
 #endif /* PNG_MIPS_MSA_API_SUPPORTED */
 }
 #endif /* PNG_MIPS_MSA_OPT > 0 */
 #endif /* READ */
--- a/3rdparty/libpng/patches/20190910-msa-patch.diff
+++ b/3rdparty/libpng/patches/20190910-msa-patch.diff
@ -0,0 +1,53 @@
 diff --git a/3rdparty/libpng/mips/mips_init.c b/3rdparty/libpng/mips/mips_init.c
 index 8dd283deef..6a061cccfa 100644
 --- a/3rdparty/libpng/mips/mips_init.c
 +++ b/3rdparty/libpng/mips/mips_init.c
@@ -73,7 +73,6 @@ png_init_filter_functions_msa(png_structp pp, unsigned int bpp)
           * this case will fall through to the 'default' below, which just
           * returns.
           */
 -#endif /* PNG_MIPS_MSA_API_SUPPORTED */
 #ifdef PNG_MIPS_MSA_CHECK_SUPPORTED
          {
             static volatile sig_atomic_t no_msa = -1; /* not checked */
@@ -84,12 +83,9 @@ png_init_filter_functions_msa(png_structp pp, unsigned int bpp)
             if (no_msa)
                return;
          }
 -#ifdef PNG_MIPS_MSA_API_SUPPORTED
 -         break;
 -#endif
 #endif /* PNG_MIPS_MSA_CHECK_SUPPORTED */
 +         break;
 -#ifdef PNG_MIPS_MSA_API_SUPPORTED
       default: /* OFF or INVALID */
          return;
@@ -97,8 +93,6 @@ png_init_filter_functions_msa(png_structp pp, unsigned int bpp)
          /* Option turned on */
          break;
    }
 -#endif
 -
    /* IMPORTANT: any new external functions used here must be declared using
     * PNG_INTERNAL_FUNCTION in ../pngpriv.h.  This is required so that the
     * 'prefix' option to configure works:
@@ -118,13 +112,16 @@ png_init_filter_functions_msa(png_structp pp, unsigned int bpp)
       pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_msa;
       pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth3_msa;
    }
 -
    else if (bpp == 4)
    {
       pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub4_msa;
       pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_msa;
       pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth4_msa;
    }
 +#else
 +   (void)pp;
 +   (void)bpp;
 +#endif /* PNG_MIPS_MSA_API_SUPPORTED */
 }
 #endif /* PNG_MIPS_MSA_OPT > 0 */
 #endif /* READ */
--- a/3rdparty/libwebp/patches/20190910-msa-asm-patch.diff
+++ b/3rdparty/libwebp/patches/20190910-msa-asm-patch.diff
@ -0,0 +1,22 @@
 diff --git a/3rdparty/libwebp/src/dsp/msa_macro.h b/3rdparty/libwebp/src/dsp/msa_macro.h
 index de026a1d9e..a16c0bb300 100644
 --- a/3rdparty/libwebp/src/dsp/msa_macro.h
 +++ b/3rdparty/libwebp/src/dsp/msa_macro.h
@@ -73,7 +73,7 @@
   static inline TYPE FUNC_NAME(const void* const psrc) {  \
     const uint8_t* const psrc_m = (const uint8_t*)psrc;   \
     TYPE val_m;                                           \
 -    asm volatile (                                        \
 +    __asm__ volatile (                                        \
       "" #INSTR " %[val_m], %[psrc_m]  \n\t"              \
       : [val_m] "=r" (val_m)                              \
       : [psrc_m] "m" (*psrc_m));                          \
@@ -86,7 +86,7 @@
   static inline void FUNC_NAME(TYPE val, void* const pdst) { \
     uint8_t* const pdst_m = (uint8_t*)pdst;                  \
     TYPE val_m = val;                                        \
 -    asm volatile (                                           \
 +    __asm__ volatile (                                           \
       " " #INSTR "  %[val_m],  %[pdst_m]  \n\t"              \
       : [pdst_m] "=m" (*pdst_m)                              \
       : [val_m] "r" (val_m));                                \
--- a/3rdparty/libwebp/src/dsp/msa_macro.h
+++ b/3rdparty/libwebp/src/dsp/msa_macro.h
@ -73,7 +73,7 @@
  static inline TYPE FUNC_NAME(const void* const psrc) {  \
    const uint8_t* const psrc_m = (const uint8_t*)psrc;   \
    TYPE val_m;                                           \
-    asm volatile (                                        \
+    __asm__ volatile (                                        \
      "" #INSTR " %[val_m], %[psrc_m]  \n\t"              \
      : [val_m] "=r" (val_m)                              \
      : [psrc_m] "m" (*psrc_m));                          \
@ -86,7 +86,7 @@
  static inline void FUNC_NAME(TYPE val, void* const pdst) { \
    uint8_t* const pdst_m = (uint8_t*)pdst;                  \
    TYPE val_m = val;                                        \
-    asm volatile (                                           \
+    __asm__ volatile (                                           \
      " " #INSTR "  %[val_m],  %[pdst_m]  \n\t"              \
      : [pdst_m] "=m" (*pdst_m)                              \
      : [val_m] "r" (val_m));                                \
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1519,7 +1519,7 @@ if(FLAKE8_FOUND AND FLAKE8_EXECUTABLE)
 endif()
 # ========================== java ==========================
-if(BUILD_JAVA OR BUILD_opencv_java)
+if(BUILD_JAVA)
  status("")
  status("  Java:"            BUILD_FAT_JAVA_LIB  THEN "export all functions"                                      ELSE "")
  status("    ant:"           ANT_EXECUTABLE      THEN "${ANT_EXECUTABLE} (ver ${ANT_VERSION})"                    ELSE NO)
--- a/cmake/OpenCVCompilerOptimizations.cmake
+++ b/cmake/OpenCVCompilerOptimizations.cmake
@ -45,6 +45,7 @@
 set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3;AVX_512F")
 list(APPEND CPU_ALL_OPTIMIZATIONS "AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SKX;AVX512_CNL;AVX512_CEL;AVX512_ICL")
 list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16)
 list(APPEND CPU_ALL_OPTIMIZATIONS MSA)
 list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3)
 list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS)
@ -339,6 +340,11 @@ elseif(ARM OR AARCH64)
    ocv_update(CPU_FP16_IMPLIES "NEON")
    set(CPU_BASELINE "NEON;FP16" CACHE STRING "${HELP_CPU_BASELINE}")
  endif()
 elseif(MIPS)
  ocv_update(CPU_MSA_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_msa.cpp")
  ocv_update(CPU_KNOWN_OPTIMIZATIONS "MSA")
  ocv_update(CPU_MSA_FLAGS_ON "-mmsa")
  set(CPU_BASELINE "MSA" CACHE STRING "${HELP_CPU_BASELINE}")
 elseif(PPC64LE)
  ocv_update(CPU_KNOWN_OPTIMIZATIONS "VSX;VSX3")
  ocv_update(CPU_VSX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_vsx.cpp")
--- a/cmake/OpenCVDetectCXXCompiler.cmake
+++ b/cmake/OpenCVDetectCXXCompiler.cmake
@ -100,6 +100,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64le")
  set(PPC64LE 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64")
  set(PPC64 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(mips.*|MIPS.*)")
  set(MIPS 1)
 endif()
 # Workaround for 32-bit operating systems on x86_64/aarch64 processor
--- a/cmake/OpenCVGenConfig.cmake
+++ b/cmake/OpenCVGenConfig.cmake
@ -122,8 +122,10 @@ endif()
 #  Part 3/3: ${BIN_DIR}/win-install/OpenCVConfig.cmake  -> For use within binary installers/packages
 # --------------------------------------------------------------------------------------------
 if(WIN32)
-  if(CMAKE_HOST_SYSTEM_NAME MATCHES Windows)
+  if(CMAKE_HOST_SYSTEM_NAME MATCHES Windows AND NOT OPENCV_SKIP_CMAKE_ROOT_CONFIG)
-    ocv_gen_config("${CMAKE_BINARY_DIR}/win-install" "${OPENCV_LIB_INSTALL_PATH}" "OpenCVConfig.root-WIN32.cmake.in")
+    ocv_gen_config("${CMAKE_BINARY_DIR}/win-install"
                   "${OPENCV_INSTALL_BINARIES_PREFIX}${OPENCV_INSTALL_BINARIES_SUFFIX}"
                   "OpenCVConfig.root-WIN32.cmake.in")
  else()
    ocv_gen_config("${CMAKE_BINARY_DIR}/win-install" "" "")
  endif()
--- a/cmake/OpenCVInstallLayout.cmake
+++ b/cmake/OpenCVInstallLayout.cmake
@ -23,15 +23,15 @@ if(ANDROID)
 elseif(WIN32 AND CMAKE_HOST_SYSTEM_NAME MATCHES Windows)
  if(DEFINED OpenCV_RUNTIME AND DEFINED OpenCV_ARCH)
-    set(_prefix "${OpenCV_ARCH}/${OpenCV_RUNTIME}/")
+    ocv_update(OPENCV_INSTALL_BINARIES_PREFIX "${OpenCV_ARCH}/${OpenCV_RUNTIME}/")
  else()
    message(STATUS "Can't detect runtime and/or arch")
-    set(_prefix "")
+    ocv_update(OPENCV_INSTALL_BINARIES_PREFIX "")
  endif()
  if(OpenCV_STATIC)
-    set(_suffix "staticlib")
+    ocv_update(OPENCV_INSTALL_BINARIES_SUFFIX "staticlib")
  else()
-    set(_suffix "lib")
+    ocv_update(OPENCV_INSTALL_BINARIES_SUFFIX "lib")
  endif()
  if(INSTALL_CREATE_DISTRIB)
    set(_jni_suffix "/${OpenCV_ARCH}")
@ -39,12 +39,12 @@ elseif(WIN32 AND CMAKE_HOST_SYSTEM_NAME MATCHES Windows)
    set(_jni_suffix "")
  endif()
-  ocv_update(OPENCV_BIN_INSTALL_PATH           "${_prefix}bin")
+  ocv_update(OPENCV_BIN_INSTALL_PATH           "${OPENCV_INSTALL_BINARIES_PREFIX}bin")
  ocv_update(OPENCV_TEST_INSTALL_PATH          "${OPENCV_BIN_INSTALL_PATH}")
-  ocv_update(OPENCV_SAMPLES_BIN_INSTALL_PATH   "${_prefix}samples")
+  ocv_update(OPENCV_SAMPLES_BIN_INSTALL_PATH   "${OPENCV_INSTALL_BINARIES_PREFIX}samples")
-  ocv_update(OPENCV_LIB_INSTALL_PATH           "${_prefix}${_suffix}")
+  ocv_update(OPENCV_LIB_INSTALL_PATH           "${OPENCV_INSTALL_BINARIES_PREFIX}${OPENCV_INSTALL_BINARIES_SUFFIX}")
  ocv_update(OPENCV_LIB_ARCHIVE_INSTALL_PATH   "${OPENCV_LIB_INSTALL_PATH}")
-  ocv_update(OPENCV_3P_LIB_INSTALL_PATH        "${OPENCV_LIB_INSTALL_PATH}")
+  ocv_update(OPENCV_3P_LIB_INSTALL_PATH        "${OPENCV_INSTALL_BINARIES_PREFIX}staticlib")
  ocv_update(OPENCV_CONFIG_INSTALL_PATH        ".")
  ocv_update(OPENCV_INCLUDE_INSTALL_PATH       "include")
  ocv_update(OPENCV_OTHER_INSTALL_PATH         "etc")
--- a/cmake/checks/atomic_check.cpp
+++ b/cmake/checks/atomic_check.cpp
@ -2,7 +2,7 @@
 static int test()
 {
-    std::atomic<int> x;
+    std::atomic<long long> x;
    return x;
 }
--- a/cmake/checks/cpu_msa.cpp
+++ b/cmake/checks/cpu_msa.cpp
@ -0,0 +1,23 @@
 // Compile-time probe for MIPS MSA support.  The build system references this
 // file as the MSA test file; it must fail to compile when MSA intrinsics are
 // not available.
 #include <stdio.h>
 // __mips_msa is presumably predefined by the compiler when MSA code
 // generation is enabled (e.g. via -mmsa) - confirm against the toolchain docs.
 #if defined(__mips_msa)
 #  include <msa.h>
 #  define CV_MSA 1
 #endif
 #if defined CV_MSA
 // Exercises a vector load, a float-to-int conversion and an element extract so
 // that the header and the intrinsics are all actually used.
 int test()
 {
    const float src[] = { 0.0f, 0.0f, 0.0f, 0.0f };
    v4f32 val = (v4f32)__msa_ld_w((const float*)(src), 0);
    return __msa_copy_s_w(__builtin_msa_ftint_s_w (val), 0);
 }
 #else
 // Deliberate hard failure: this is how the probe reports "no MSA".
 #error "MSA is not supported"
 #endif
 // Prints the probe result so the call to test() cannot be discarded.
 int main()
 {
  printf("%d\n", test());
  return 0;
 }
--- a/doc/py_tutorials/py_gui/py_image_display/py_image_display.markdown
+++ b/doc/py_tutorials/py_gui/py_image_display/py_image_display.markdown
@ -4,7 +4,7 @@ Getting Started with Images {#tutorial_py_image_display}
 Goals
 -----
-   Here, you will learn how to read an image, how to display it and how to save it back
+-   Here, you will learn how to read an image, how to display it, and how to save it back
 -   You will learn these functions : **cv.imread()**, **cv.imshow()** , **cv.imwrite()**
 -   Optionally, you will learn how to display images with Matplotlib
@ -30,7 +30,7 @@ See the code below:
 import numpy as np
 import cv2 as cv
-# Load an color image in grayscale
+# Load a color image in grayscale
 img = cv.imread('messi5.jpg',0)
@endcode
@ -43,7 +43,7 @@ Even if the image path is wrong, it won't throw any error, but `print img` will
 Use the function **cv.imshow()** to display an image in a window. The window automatically fits to
 the image size.
-First argument is a window name which is a string. second argument is our image. You can create as
+First argument is a window name which is a string. Second argument is our image. You can create as
 many windows as you wish, but with different window names.
@code{.py}
 cv.imshow('image',img)
@ -66,11 +66,11 @@ MUST use it to actually display the image.
 specific window, use the function **cv.destroyWindow()** where you pass the exact window name as
 the argument.
-@note There is a special case where you can already create a window and load image to it later. In
+@note There is a special case where you can create an empty window and load an image to it later. In
-that case, you can specify whether window is resizable or not. It is done with the function
+that case, you can specify whether the window is resizable or not. It is done with the function
-**cv.namedWindow()**. By default, the flag is cv.WINDOW_AUTOSIZE. But if you specify flag to be
+**cv.namedWindow()**. By default, the flag is cv.WINDOW_AUTOSIZE. But if you specify the flag to be
-cv.WINDOW_NORMAL, you can resize window. It will be helpful when image is too large in dimension
+cv.WINDOW_NORMAL, you can resize window. It will be helpful when an image is too large in dimension
-and adding track bar to windows.
+and when adding track bars to windows.
 See the code below:
@code{.py}
@ -91,8 +91,8 @@ This will save the image in PNG format in the working directory.
 ### Sum it up
-Below program loads an image in grayscale, displays it, save the image if you press 's' and exit, or
+Below program loads an image in grayscale, displays it, saves the image if you press 's' and exit, or
-simply exit without saving if you press ESC key.
+simply exits without saving if you press ESC key.
@code{.py}
 import numpy as np
 import cv2 as cv
@ -117,7 +117,7 @@ Using Matplotlib
 Matplotlib is a plotting library for Python which gives you wide variety of plotting methods. You
 will see them in coming articles. Here, you will learn how to display image with Matplotlib. You can
-zoom images, save it etc using Matplotlib.
+zoom images, save them, etc, using Matplotlib.
@code{.py}
 import numpy as np
 import cv2 as cv
--- a/doc/tutorials/introduction/building_tegra_cuda/building_tegra_cuda.markdown
+++ b/doc/tutorials/introduction/building_tegra_cuda/building_tegra_cuda.markdown
@ -213,7 +213,7 @@ Supported platform: Drive PX 2
        -DBUILD_JASPER=OFF \
        -DBUILD_ZLIB=OFF \
        -DBUILD_EXAMPLES=ON \
-        -DBUILD_opencv_java=OFF \
+        -DBUILD_JAVA=OFF \
        -DBUILD_opencv_python2=ON \
        -DBUILD_opencv_python3=OFF \
        -DENABLE_NEON=ON \
@ -263,7 +263,7 @@ Configuration is slightly different for the Jetson TK1 and the Jetson TX1 system
        -DBUILD_JASPER=OFF \
        -DBUILD_ZLIB=OFF \
        -DBUILD_EXAMPLES=ON \
-        -DBUILD_opencv_java=OFF \
+        -DBUILD_JAVA=OFF \
        -DBUILD_opencv_python2=ON \
        -DBUILD_opencv_python3=OFF \
        -DENABLE_NEON=ON \
@ -300,7 +300,7 @@ __Note:__ This uses CUDA 6.5, not 8.0.
        -DBUILD_JASPER=OFF \
        -DBUILD_ZLIB=OFF \
        -DBUILD_EXAMPLES=ON \
-        -DBUILD_opencv_java=OFF \
+        -DBUILD_JAVA=OFF \
        -DBUILD_opencv_python2=ON \
        -DBUILD_opencv_python3=OFF \
        -DENABLE_PRECOMPILED_HEADERS=OFF \
@ -345,7 +345,7 @@ The configuration options given to `cmake` below are targeted towards the functi
        -DBUILD_JASPER=OFF \
        -DBUILD_ZLIB=OFF \
        -DBUILD_EXAMPLES=ON \
-        -DBUILD_opencv_java=OFF \
+        -DBUILD_JAVA=OFF \
        -DBUILD_opencv_python2=ON \
        -DBUILD_opencv_python3=OFF \
        -DWITH_OPENCL=OFF \
@ -476,7 +476,7 @@ For DRIVE PX 2:
        -DBUILD_JASPER=OFF \
        -DBUILD_ZLIB=OFF \
        -DBUILD_EXAMPLES=ON \
-        -DBUILD_opencv_java=OFF \
+        -DBUILD_JAVA=OFF \
        -DBUILD_opencv_nonfree=OFF \
        -DBUILD_opencv_python=ON \
        -DENABLE_NEON=ON \
@ -513,7 +513,7 @@ For Jetson TK1:
        -DBUILD_JASPER=OFF \
        -DBUILD_ZLIB=OFF \
        -DBUILD_EXAMPLES=ON \
-        -DBUILD_opencv_java=OFF \
+        -DBUILD_JAVA=OFF \
        -DBUILD_opencv_nonfree=OFF \
        -DBUILD_opencv_python=ON \
        -DENABLE_NEON=ON \
@ -548,7 +548,7 @@ For Jetson TX1:
        -DBUILD_JASPER=OFF \
        -DBUILD_ZLIB=OFF \
        -DBUILD_EXAMPLES=ON \
-        -DBUILD_opencv_java=OFF \
+        -DBUILD_JAVA=OFF \
        -DBUILD_opencv_nonfree=OFF \
        -DBUILD_opencv_python=ON \
        -DENABLE_PRECOMPILED_HEADERS=OFF \
@ -585,7 +585,7 @@ For both 14.04 LTS and 16.04 LTS:
        -DBUILD_JASPER=OFF \
        -DBUILD_ZLIB=OFF \
        -DBUILD_EXAMPLES=ON \
-        -DBUILD_opencv_java=OFF \
+        -DBUILD_JAVA=OFF \
        -DBUILD_opencv_nonfree=OFF \
        -DBUILD_opencv_python=ON \
        -DWITH_OPENCL=OFF \
@ -626,7 +626,7 @@ The following is a table of all the parameters passed to CMake in the recommende
 |BUILD_TBB|OFF|As above, for `tbb`| |
 |BUILD_TIFF|OFF|As above, for `libtiff`| |
 |BUILD_ZLIB|OFF|As above, for `zlib`| |
-|BUILD_opencv_java|OFF|Controls the building of the Java bindings for OpenCV|Building the Java bindings requires OpenCV libraries be built for static linking only|
+|BUILD_JAVA|OFF|Controls the building of the Java bindings for OpenCV|Building the Java bindings requires OpenCV libraries be built for static linking only|
 |BUILD_opencv_nonfree|OFF|Controls the building of non-free (non-open-source) elements|Used only for building 2.4.X|
 |BUILD_opencv_python|ON|Controls the building of the Python 2 bindings in OpenCV 2.4.X|Used only for building 2.4.X|
 |BUILD_opencv_python2|ON|Controls the building of the Python 2 bindings in OpenCV 3.1.0|Not used in 2.4.X|
--- a/modules/calib3d/perf/perf_stereosgbm.cpp
+++ b/modules/calib3d/perf/perf_stereosgbm.cpp
@ -41,20 +41,18 @@ namespace opencv_test
 using namespace perf;
 using namespace testing;
-static void MakeArtificialExample(RNG rng, Mat& dst_left_view, Mat& dst_view);
+static void MakeArtificialExample(Mat& dst_left_view, Mat& dst_view);
 CV_ENUM(SGBMModes, StereoSGBM::MODE_SGBM, StereoSGBM::MODE_SGBM_3WAY, StereoSGBM::MODE_HH4);
 typedef tuple<Size, int, SGBMModes> SGBMParams;
-typedef TestBaseWithParam<SGBMParams> TestStereoCorresp;
+typedef TestBaseWithParam<SGBMParams> TestStereoCorrespSGBM;
 #ifndef _DEBUG
-PERF_TEST_P( TestStereoCorresp, SGBM, Combine(Values(Size(1280,720),Size(640,480)), Values(256,128), SGBMModes::all()) )
+PERF_TEST_P( TestStereoCorrespSGBM, SGBM, Combine(Values(Size(1280,720),Size(640,480)), Values(256,128), SGBMModes::all()) )
 #else
-PERF_TEST_P( TestStereoCorresp, DISABLED_TooLongInDebug_SGBM, Combine(Values(Size(1280,720),Size(640,480)), Values(256,128), SGBMModes::all()) )
+PERF_TEST_P( TestStereoCorrespSGBM, DISABLED_TooLongInDebug_SGBM, Combine(Values(Size(1280,720),Size(640,480)), Values(256,128), SGBMModes::all()) )
 #endif
 {
    RNG rng(0);
    SGBMParams params = GetParam();
    Size sz              = get<0>(params);
@ -65,7 +63,7 @@ PERF_TEST_P( TestStereoCorresp, DISABLED_TooLongInDebug_SGBM, Combine(Values(Siz
    Mat src_right(sz, CV_8UC3);
    Mat dst(sz, CV_16S);
-    MakeArtificialExample(rng,src_left,src_right);
+    MakeArtificialExample(src_left,src_right);
    int wsize = 3;
    int P1 = 8*src_left.channels()*wsize*wsize;
@ -78,8 +76,34 @@ PERF_TEST_P( TestStereoCorresp, DISABLED_TooLongInDebug_SGBM, Combine(Values(Siz
    SANITY_CHECK(dst, .01, ERROR_RELATIVE);
 }
-void MakeArtificialExample(RNG rng, Mat& dst_left_view, Mat& dst_right_view)
+typedef tuple<Size, int> BMParams;
 typedef TestBaseWithParam<BMParams> TestStereoCorrespBM;
 // Performance test for StereoBM over two frame sizes and two disparity ranges.
 // Input views are single-channel 8-bit images filled by MakeArtificialExample.
 PERF_TEST_P(TestStereoCorrespBM, BM, Combine(Values(Size(1280, 720), Size(640, 480)), Values(256, 128)))
 {
    // Unpack the (frame size, number of disparities) parameter tuple.
    BMParams params = GetParam();
    Size sz = get<0>(params);
    int num_disparities = get<1>(params);
    Mat src_left(sz, CV_8UC1);
    Mat src_right(sz, CV_8UC1);
    Mat dst(sz, CV_16S);
    MakeArtificialExample(src_left, src_right);
    // Block-matching window size passed to StereoBM::create.
    int wsize = 21;
    // Note: matcher construction is inside the timed cycle, so it is measured
    // together with compute().
    TEST_CYCLE()
    {
        Ptr<StereoBM> bm = StereoBM::create(num_disparities, wsize);
        bm->compute(src_left, src_right, dst);
    }
    SANITY_CHECK(dst, .01, ERROR_RELATIVE);
 }
 void MakeArtificialExample(Mat& dst_left_view, Mat& dst_right_view)
 {
    RNG rng(0);
    int w = dst_left_view.cols;
    int h = dst_left_view.rows;
--- a/modules/calib3d/src/five-point.cpp
+++ b/modules/calib3d/src/five-point.cpp
@ -571,7 +571,8 @@ int cv::recoverPose( InputArray E, InputArray _points1, InputArray _points2,
    if (!_mask.empty())
    {
        Mat mask = _mask.getMat();
-        CV_Assert(mask.size() == mask1.size());
+        CV_Assert(npoints == mask.checkVector(1));
        mask = mask.reshape(1, npoints);
        bitwise_and(mask, mask1, mask1);
        bitwise_and(mask, mask2, mask2);
        bitwise_and(mask, mask3, mask3);
--- a/modules/calib3d/src/undistort.simd.hpp
+++ b/modules/calib3d/src/undistort.simd.hpp
@ -153,19 +153,24 @@ public:
                m_2 = vx_setall_f64(k5);
                m_0 /= v_muladd(v_muladd(v_muladd(m_3, r2_0, m_2), r2_0, vx_setall_f64(k4)), r2_0, v_one);
                m_1 /= v_muladd(v_muladd(v_muladd(m_3, r2_1, m_2), r2_1, vx_setall_f64(k4)), r2_1, v_one);
                m_3 = vx_setall_f64(2.0);
                xd_0 = v_muladd(m_3, xd_0, r2_0);
                yd_0 = v_muladd(m_3, yd_0, r2_0);
                xd_1 = v_muladd(m_3, xd_1, r2_1);
                yd_1 = v_muladd(m_3, yd_1, r2_1);
                m_2 = x_0 * y_0 * m_3;
                m_3 = x_1 * y_1 * m_3;
                x_0 *= m_0; y_0 *= m_0; x_1 *= m_1; y_1 *= m_1;
                m_0 = vx_setall_f64(p1);
                m_1 = vx_setall_f64(p2);
-                m_2 = vx_setall_f64(2.0);
+                xd_0 = v_muladd(xd_0, m_1, x_0);
-                xd_0 = v_muladd(v_muladd(m_2, xd_0, r2_0), m_1, x_0);
+                yd_0 = v_muladd(yd_0, m_0, y_0);
-                yd_0 = v_muladd(v_muladd(m_2, yd_0, r2_0), m_0, y_0);
+                xd_1 = v_muladd(xd_1, m_1, x_1);
-                xd_1 = v_muladd(v_muladd(m_2, xd_1, r2_1), m_1, x_1);
+                yd_1 = v_muladd(yd_1, m_0, y_1);
                yd_1 = v_muladd(v_muladd(m_2, yd_1, r2_1), m_0, y_1);
                m_0 *= m_2; m_1 *= m_2;
                m_2 = x_0 * y_0;
                m_3 = x_1 * y_1;
                xd_0 = v_muladd(m_0, m_2, xd_0);
                yd_0 = v_muladd(m_1, m_2, yd_0);
                xd_1 = v_muladd(m_0, m_3, xd_1);
--- a/modules/calib3d/test/test_cameracalibration.cpp
+++ b/modules/calib3d/test/test_cameracalibration.cpp
@ -2196,4 +2196,102 @@ TEST(Calib3d_Triangulate, accuracy)
    }
 }
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Regression test for recoverPose() mask handling.  Runs findEssentialMat()
 // followed by recoverPose() three times: with std::vector inputs, with Mat
 // inputs and a column (N x 1) mask, and with Mat inputs and a row (1 x N)
 // mask.  In every case the point at index 13 must be rejected by
 // findEssentialMat(), the point at index 12 must be rejected by recoverPose(),
 // and the reported inlier count must equal point_count - 2.
 TEST(CV_RecoverPoseTest, regression_15341)
 {
    // initialize test data
    // Two points are expected to end up as outliers: index 13 (invalid in the
    // input data) and index 12 (corrupted below before recoverPose is called).
    const int invalid_point_count = 2;
    const float _points1_[] = {
        1537.7f, 166.8f,
        1599.1f, 179.6f,
        1288.0f, 207.5f,
        1507.1f, 193.2f,
        1742.7f, 210.0f,
        1041.6f, 271.7f,
        1591.8f, 247.2f,
        1524.0f, 261.3f,
        1330.3f, 285.0f,
        1403.1f, 284.0f,
        1506.6f, 342.9f,
        1502.8f, 347.3f,
        1344.9f, 364.9f,
        0.0f, 0.0f  // last point is invalid from the start
    };
    const float _points2_[] = {
        1533.4f, 532.9f,
        1596.6f, 552.4f,
        1277.0f, 556.4f,
        1502.1f, 557.6f,
        1744.4f, 601.3f,
        1023.0f, 612.6f,
        1589.2f, 621.6f,
        1519.4f, 629.0f,
        1320.3f, 637.3f,
        1395.2f, 642.2f,
        1501.5f, 710.3f,
        1497.6f, 714.2f,
        1335.1f, 719.61f,
        1000.0f, 1000.0f  // last point is invalid from the start
    };
    // Wrap the raw arrays as 14 x 1 two-channel Mats and copy them into vectors.
    vector<Point2f> _points1; Mat(14, 1, CV_32FC2, (void*)_points1_).copyTo(_points1);
    vector<Point2f> _points2; Mat(14, 1, CV_32FC2, (void*)_points2_).copyTo(_points2);
    const int point_count = (int) _points1.size();
    CV_Assert(point_count == (int) _points2.size());
    // camera matrix with both focal lengths = 1, and principal point = (0, 0)
    const Mat cameraMatrix = Mat::eye(3, 3, CV_64F);
    int Inliers = 0;
    const int ntests = 3;
    for (int testcase = 1; testcase <= ntests; ++testcase)
    {
        if (testcase == 1) // testcase with vector input data
        {
            // init temporary test data
            vector<unsigned char> mask(point_count);
            vector<Point2f> points1(_points1);
            vector<Point2f> points2(_points2);
            // Estimate the essential matrix using the RANSAC algorithm
            Mat E, R, t;
            E = findEssentialMat(points1, points2, cameraMatrix, RANSAC, 0.999, 1.0, mask);
            EXPECT_EQ(0, (int)mask[13]) << "Detecting outliers in function findEssentialMat failed, testcase " << testcase;
            points2[12] = Point2f(0.0f, 0.0f); // provoke another outlier detection for recover Pose
            Inliers = recoverPose(E, points1, points2, cameraMatrix, R, t, mask);
            EXPECT_EQ(0, (int)mask[12]) << "Detecting outliers in function failed, testcase " << testcase;
        }
        else // testcase with mat input data
        {
            // Deep copies, so corrupting points2 below does not leak into the
            // next iteration.
            Mat points1(_points1, true);
            Mat points2(_points2, true);
            Mat mask;
            if (testcase == 2)
            {
                // init temporary testdata
                // column-vector mask (point_count x 1)
                mask = Mat::zeros(point_count, 1, CV_8UC1);
            }
            else // testcase == 3 - with transposed mask
            {
                // row-vector mask (1 x point_count)
                mask = Mat::zeros(1, point_count, CV_8UC1);
            }
            // Estimate the essential matrix using the RANSAC algorithm
            Mat E, R, t;
            E = findEssentialMat(points1, points2, cameraMatrix, RANSAC, 0.999, 1.0, mask);
            EXPECT_EQ(0, (int)mask.at<unsigned char>(13)) << "Detecting outliers in function findEssentialMat failed, testcase " << testcase;
            points2.at<Point2f>(12) = Point2f(0.0f, 0.0f); // provoke an outlier detection
            Inliers = recoverPose(E, points1, points2, cameraMatrix, R, t, mask);
            EXPECT_EQ(0, (int)mask.at<unsigned char>(12)) << "Detecting outliers in function failed, testcase " << testcase;
        }
        // Checked for every testcase: all points except the two outliers are inliers.
        EXPECT_EQ(Inliers, point_count - invalid_point_count) <<
            "Number of inliers differs from expected number of inliers, testcase " << testcase;
    }
 }
 }} // namespace
--- a/modules/calib3d/test/test_undistort.cpp
+++ b/modules/calib3d/test/test_undistort.cpp
@ -1469,6 +1469,44 @@ TEST(Calib3d_UndistortPoints, outputShape)
    }
 }
 // Regression check for undistort(): a 512x512 black image with four white
 // pixels is undistorted with a fixed camera matrix and distortion vector, and
 // the result must match a hard-coded reference image exactly.
 TEST(Imgproc_undistort, regression_15286)
 {
    // 3x3 camera matrix and 5-element distortion coefficients, both CV_64F.
    double intrinsics[9] = { 3217, 0, 1592, 0, 3217, 1201, 0, 0, 1 };
    double distortion[5] = { 0.04, -0.4, -0.01, 0.04, 0.7 };
    Mat camera_matrix(3, 3, CV_64F, intrinsics);
    Mat distortion_vec(5, 1, CV_64F, distortion);

    // Source image: white pixels at every (row, col) combination of 128 and 384.
    static const int marks[2] = { 128, 384 };
    Mat source = Mat::zeros(512, 512, CV_8UC1);
    for (int r = 0; r < 2; ++r)
    {
        for (int c = 0; c < 2; ++c)
        {
            source.at<uchar>(marks[r], marks[c]) = 255;
        }
    }

    // Expected non-zero pixels of the undistorted image, as (x, y, value).
    static const struct { int x; int y; int value; } expected_pixels[] = {
        {  24,  98,  78 }, {  24,  99, 114 }, {  25,  98,  36 }, {  25,  99,  60 },
        {  27, 361,   6 }, {  28, 361, 188 }, {  28, 362,  49 }, {  29, 361,  44 },
        {  29, 362,  16 },
        { 317, 366, 134 }, { 317, 367,  78 }, { 318, 366,  40 }, { 318, 367,  29 },
        { 310, 104, 106 }, { 310, 105,  30 }, { 311, 104, 112 }, { 311, 105,  38 }
    };
    const size_t expected_count = sizeof(expected_pixels) / sizeof(expected_pixels[0]);
    Mat expected = Mat::zeros(512, 512, CV_8UC1);
    for (size_t i = 0; i < expected_count; ++i)
    {
        const Point where(expected_pixels[i].x, expected_pixels[i].y);
        expected.at<uchar>(where) = static_cast<uchar>(expected_pixels[i].value);
    }

    Mat undistorted;
    undistort(source, undistorted, camera_matrix, distortion_vec);
    // The largest per-pixel difference must be exactly zero.
    ASSERT_EQ(0.0, cvtest::norm(undistorted, expected, cv::NORM_INF));
 }
 TEST(Calib3d_initUndistortRectifyMap, regression_14467)
 {
    Size size_w_h(512 + 3, 512);
--- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h
+++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
@ -152,6 +152,11 @@
 #  define CV_VSX3 1
 #endif
 #ifdef CV_CPU_COMPILE_MSA
 #  include "hal/msa_macros.h"
 #  define CV_MSA 1
 #endif
 #endif // CV_ENABLE_INTRINSICS && !CV_DISABLE_OPTIMIZATION && !__CUDACC__
 #if defined CV_CPU_COMPILE_AVX && !defined CV_CPU_BASELINE_COMPILE_AVX
@ -319,3 +324,7 @@ struct VZeroUpperGuard {
 #ifndef CV_VSX3
 #  define CV_VSX3 0
 #endif
 #ifndef CV_MSA
 #  define CV_MSA 0
 #endif
--- a/modules/core/include/opencv2/core/cv_cpu_helper.h
+++ b/modules/core/include/opencv2/core/cv_cpu_helper.h
@ -420,6 +420,27 @@
 #endif
 #define __CV_CPU_DISPATCH_CHAIN_NEON(fn, args, mode, ...)  CV_CPU_CALL_NEON(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
 // MSA dispatch helpers. Three configurations are distinguished (all require
 // CV_ENABLE_INTRINSICS and no CV_DISABLE_OPTIMIZATION for the first two):
 //  - CV_CPU_COMPILE_MSA defined: MSA is forced (CV_CPU_FORCE_MSA is 1), support is
 //    the constant 1 and the call macros return unconditionally. Note that
 //    CV_CPU_CALL_MSA targets cpu_baseline:: while CV_CPU_CALL_MSA_ targets opt_MSA::.
 //  - CV_CPU_DISPATCH_COMPILE_MSA defined: support is queried at run time through
 //    cv::checkHardwareSupport(CV_CPU_MSA) and both call macros return the
 //    opt_MSA:: result only when that check succeeds.
 //  - otherwise: MSA is disabled and both call macros expand to nothing.
 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_MSA
 #  define CV_TRY_MSA 1
 #  define CV_CPU_FORCE_MSA 1
 #  define CV_CPU_HAS_SUPPORT_MSA 1
 #  define CV_CPU_CALL_MSA(fn, args) return (cpu_baseline::fn args)
 #  define CV_CPU_CALL_MSA_(fn, args) return (opt_MSA::fn args)
 #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_MSA
 #  define CV_TRY_MSA 1
 #  define CV_CPU_FORCE_MSA 0
 #  define CV_CPU_HAS_SUPPORT_MSA (cv::checkHardwareSupport(CV_CPU_MSA))
 #  define CV_CPU_CALL_MSA(fn, args) if (CV_CPU_HAS_SUPPORT_MSA) return (opt_MSA::fn args)
 #  define CV_CPU_CALL_MSA_(fn, args) if (CV_CPU_HAS_SUPPORT_MSA) return (opt_MSA::fn args)
 #else
 #  define CV_TRY_MSA 0
 #  define CV_CPU_FORCE_MSA 0
 #  define CV_CPU_HAS_SUPPORT_MSA 0
 #  define CV_CPU_CALL_MSA(fn, args)
 #  define CV_CPU_CALL_MSA_(fn, args)
 #endif
 // Dispatch-chain link for MSA: emit the MSA call first, then expand the chain
 // entry named by `mode` with the remaining variadic arguments.
 #define __CV_CPU_DISPATCH_CHAIN_MSA(fn, args, mode, ...)  CV_CPU_CALL_MSA(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_VSX
 #  define CV_TRY_VSX 1
 #  define CV_CPU_FORCE_VSX 1
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@ -244,6 +244,8 @@ namespace cv { namespace debug_build_guard { } using namespace debug_build_guard
 #define CV_CPU_NEON             100
 #define CV_CPU_MSA              150
 #define CV_CPU_VSX              200
 #define CV_CPU_VSX3             201
@ -294,6 +296,8 @@ enum CpuFeatures {
    CPU_NEON            = 100,
    CPU_MSA             = 150,
    CPU_VSX             = 200,
    CPU_VSX3            = 201,
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@ -165,9 +165,10 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
 #   undef CV_NEON
 #   undef CV_VSX
 #   undef CV_FP16
 #   undef CV_MSA
 #endif
-#if CV_SSE2 || CV_NEON || CV_VSX
+#if CV_SSE2 || CV_NEON || CV_VSX || CV_MSA
 #define CV__SIMD_FORWARD 128
 #include "opencv2/core/hal/intrin_forward.hpp"
 #endif
@ -185,6 +186,10 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
 #include "opencv2/core/hal/intrin_vsx.hpp"
 #elif CV_MSA
 #include "opencv2/core/hal/intrin_msa.hpp"
 #else
 #define CV_SIMD128_CPP 1
--- a/modules/core/include/opencv2/core/hal/intrin_msa.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_msa.hpp
--- a/modules/core/include/opencv2/core/hal/msa_macros.h
+++ b/modules/core/include/opencv2/core/hal/msa_macros.h
--- a/modules/core/src/arithm.simd.hpp
+++ b/modules/core/src/arithm.simd.hpp
@ -409,13 +409,13 @@ static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
        int x = 0;
    #if CV_SIMD
-        #if !CV_NEON
+        #if !CV_NEON && !CV_MSA
        if (is_aligned(src1, src2, dst))
        {
            for (; x <= width - wide_step_l; x += wide_step_l)
            {
                ldr::la(src1 + x, src2 + x, dst + x);
-                #if !CV_NEON && CV_SIMD_WIDTH == 16
+                #if CV_SIMD_WIDTH == 16
                ldr::la(src1 + x + wide_step, src2 + x + wide_step, dst + x + wide_step);
                #endif
            }
--- a/modules/core/src/matmul.simd.hpp
+++ b/modules/core/src/matmul.simd.hpp
@ -2476,6 +2476,45 @@ double dotProd_8s(const schar* src1, const schar* src2, int len)
            i += blockSize;
        }
    }
 #elif CV_MSA
    // MSA path of the signed 8-bit dot product. `i` and `r` are declared earlier
    // in the function (outside this hunk); the elements left over after this
    // branch (len - i) are handled by the dotProd_ call that follows the #endif.
    // len0 is len rounded down to a multiple of 8, the smallest step used below.
    // NOTE(review): the 1 << 14 block cap presumably bounds the per-lane 32-bit
    // accumulation before it is flushed into r -- confirm.
    int len0 = len & -8, blockSize0 = (1 << 14), blockSize;
    v4i32 v_zero = msa_dupq_n_s32(0);
    CV_DECL_ALIGNED(16) int buf[4];
    while( i < len0 )
    {
        // Work on at most blockSize0 elements, restarting the vector accumulator each block.
        blockSize = std::min(len0 - i, blockSize0);
        v4i32 v_sum = v_zero;
        int j = 0;
        // 16 elements per iteration: the low and high halves of each 16-byte load
        // are converted to v8i16 and fed to msa_mlal_s16, which presumably does a
        // widening multiply-accumulate into the 32-bit lanes of v_sum (see
        // hal/msa_macros.h -- confirm).
        for( ; j <= blockSize - 16; j += 16 )
        {
            v16i8 v_src1 = msa_ld1q_s8(src1 + j), v_src2 = msa_ld1q_s8(src2 + j);
            v8i16 v_src10 = msa_movl_s8(msa_get_low_s8(v_src1)), v_src20 = msa_movl_s8(msa_get_low_s8(v_src2));
            v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src10), msa_get_low_s16(v_src20));
            v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src10), msa_get_high_s16(v_src20));
            v_src10 = msa_movl_s8(msa_get_high_s8(v_src1));
            v_src20 = msa_movl_s8(msa_get_high_s8(v_src2));
            v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src10), msa_get_low_s16(v_src20));
            v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src10), msa_get_high_s16(v_src20));
        }
        // Same accumulation for a remaining group of 8 elements in this block.
        for( ; j <= blockSize - 8; j += 8 )
        {
            v8i16 v_src1 = msa_movl_s8(msa_ld1_s8(src1 + j)), v_src2 = msa_movl_s8(msa_ld1_s8(src2 + j));
            v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src1), msa_get_low_s16(v_src2));
            v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src1), msa_get_high_s16(v_src2));
        }
        // Spill the four 32-bit lanes and add them to the running result.
        msa_st1q_s32(buf, v_sum);
        r += buf[0] + buf[1] + buf[2] + buf[3];
        // Advance both inputs and the global position past the processed block.
        src1 += blockSize;
        src2 += blockSize;
        i += blockSize;
    }
 #endif
    return r + dotProd_(src1, src2, len - i);
--- a/modules/core/src/parallel_impl.cpp
+++ b/modules/core/src/parallel_impl.cpp
@ -47,6 +47,8 @@ DECLARE_CV_PAUSE
 #   define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("yield" ::: "memory"); } } while (0)
 # elif defined __GNUC__ && defined __arm__
 #   define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("" ::: "memory"); } } while (0)
 # elif defined __GNUC__ && defined __mips__ && __mips_isa_rev >= 2
 #   define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("pause" ::: "memory"); } } while (0)
 # elif defined __GNUC__ && defined __PPC64__
 #   define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("or 27,27,27" ::: "memory"); } } while (0)
 # else
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@ -368,6 +368,8 @@ struct HWFeatures
        g_hwFeatureNames[CPU_VSX] = "VSX";
        g_hwFeatureNames[CPU_VSX3] = "VSX3";
        // NOTE(review): the neighbouring names omit the "CPU_" prefix ("VSX", "VSX3");
        // "CPU_MSA" looks inconsistent with them -- confirm whether "MSA" was intended.
        g_hwFeatureNames[CPU_MSA] = "CPU_MSA";
        g_hwFeatureNames[CPU_AVX512_SKX] = "AVX512-SKX";
        g_hwFeatureNames[CPU_AVX512_KNL] = "AVX512-KNL";
        g_hwFeatureNames[CPU_AVX512_KNM] = "AVX512-KNM";
@ -557,6 +559,9 @@ struct HWFeatures
    #if defined _ARM_ && (defined(_WIN32_WCE) && _WIN32_WCE >= 0x800)
        have[CV_CPU_NEON] = true;
    #endif
    #ifdef __mips_msa
        have[CV_CPU_MSA] = true;
    #endif
    // there's no need to check VSX availability in runtime since it's always available on ppc64le CPUs
    have[CV_CPU_VSX] = (CV_VSX);
    // TODO: Check VSX3 availability in runtime for other platforms
--- a/modules/videoio/src/cap_avfoundation_mac.mm
+++ b/modules/videoio/src/cap_avfoundation_mac.mm
@ -339,10 +339,28 @@ int CvCaptureCAM::startCaptureDevice(int cameraNum) {
        }
        else if (status != AVAuthorizationStatusAuthorized)
        {
-            fprintf(stderr, "OpenCV: not authorized to capture video (status %ld), requesting...\n", status);
+            if (!cv::utils::getConfigurationParameterBool("OPENCV_AVFOUNDATION_SKIP_AUTH", false))
-            // TODO: doesn't work via ssh
+            {
-            [AVCaptureDevice requestAccessForMediaType:AVMediaTypeVideo completionHandler:^(BOOL) { /* we don't care */}];
+                fprintf(stderr, "OpenCV: not authorized to capture video (status %ld), requesting...\n", status);
-            // we do not wait for completion
+                [AVCaptureDevice requestAccessForMediaType:AVMediaTypeVideo completionHandler:^(BOOL) { /* we don't care */}];
                if ([NSThread isMainThread])
                {
                    // we run the main loop for 0.1 sec to show the message
                    [[NSRunLoop mainRunLoop] runUntilDate:[NSDate dateWithTimeIntervalSinceNow:0.1]];
                }
                else
                {
                    fprintf(stderr, "OpenCV: can not spin main run loop from other thread, set "
                                    "OPENCV_AVFOUNDATION_SKIP_AUTH=1 to disable authorization request "
                                    "and perform it in your application.\n");
                }
            }
            else
            {
                fprintf(stderr, "OpenCV: not authorized to capture video (status %ld), set "
                                "OPENCV_AVFOUNDATION_SKIP_AUTH=0 to enable authorization request or "
                                "perform it in your application.\n", status);
            }
            [localpool drain];
            return 0;
        }
--- a/platforms/linux/mips.toolchain.cmake
+++ b/platforms/linux/mips.toolchain.cmake
@ -0,0 +1,80 @@
 # ----------------------------------------------------------------------------------------------
 #  The MIPS toolchain can be downloaded from https://www.mips.com/develop/tools/codescape-mips-sdk/ .
 #  Toolchains with 'mti' in the name (and install directory) are for MIPS R2-R5 instruction sets.
 #  Toolchains with 'img' in the name are for MIPS R6 instruction sets.
 #  It is recommended to use the cmake-gui application for build script configuration and generation:
 #  1. Run cmake-gui
 #  2. Specify the toolchain file for cross-compiling; mips32r5el-gnu.toolchain.cmake or
 #     mips64r6el-gnu.toolchain.cmake can be selected.
 #  3. Configure and Generate makefiles.
 #  4. make -j4 && make install
 # ----------------------------------------------------------------------------------------------
 # Common MIPS cross-compilation toolchain logic.
 # Bail out when toolchain_save_config() already exists as a command.
 if(COMMAND toolchain_save_config)
  return() # prevent recursive call
 endif()
 # Describe the target system.
 set(CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_SYSTEM_VERSION 1)
 # Fall back to a generic "mips" processor when the includer did not choose one.
 if(NOT DEFINED CMAKE_SYSTEM_PROCESSOR)
  set(CMAKE_SYSTEM_PROCESSOR mips)
 endif()
 # Shared GNU toolchain helpers (presumably the source of toolchain_save_config(),
 # which is called at the end of this file -- confirm).
 include("${CMAKE_CURRENT_LIST_DIR}/gnu.toolchain.cmake")
 # Generic "mips" gets an empty float-ABI suffix unless MIPS_IGNORE_FP is set.
 if(CMAKE_SYSTEM_PROCESSOR STREQUAL mips AND NOT MIPS_IGNORE_FP)
  set(FLOAT_ABI_SUFFIX "")
 endif()
 # Optional "-<version>" suffix for tool names when GCC_COMPILER_VERSION is non-empty.
 if(NOT "x${GCC_COMPILER_VERSION}" STREQUAL "x")
  set(__GCC_VER_SUFFIX "-${GCC_COMPILER_VERSION}")
 endif()
 # Locate the cross tools as <GNU_MACHINE><FLOAT_ABI_SUFFIX>-<tool>[-<version>],
 # unless the corresponding CMAKE_* variable is already defined.
 if(NOT DEFINED CMAKE_C_COMPILER)
  find_program(CMAKE_C_COMPILER NAMES ${GNU_MACHINE}${FLOAT_ABI_SUFFIX}-gcc${__GCC_VER_SUFFIX})
 endif()
 if(NOT DEFINED CMAKE_CXX_COMPILER)
  find_program(CMAKE_CXX_COMPILER NAMES ${GNU_MACHINE}${FLOAT_ABI_SUFFIX}-g++${__GCC_VER_SUFFIX})
 endif()
 # For ld and ar the unversioned tool name is accepted as a second candidate.
 if(NOT DEFINED CMAKE_LINKER)
  find_program(CMAKE_LINKER NAMES ${GNU_MACHINE}${FLOAT_ABI_SUFFIX}-ld${__GCC_VER_SUFFIX} ${GNU_MACHINE}${FLOAT_ABI_SUFFIX}-ld)
 endif()
 if(NOT DEFINED CMAKE_AR)
  find_program(CMAKE_AR NAMES ${GNU_MACHINE}${FLOAT_ABI_SUFFIX}-ar${__GCC_VER_SUFFIX} ${GNU_MACHINE}${FLOAT_ABI_SUFFIX}-ar)
 endif()
 # NOTE(review): /usr/bin is an unusual default for a sysroot; it is appended to
 # CMAKE_FIND_ROOT_PATH below -- confirm this is the intended location.
 if(NOT DEFINED MIPS_LINUX_SYSROOT AND DEFINED GNU_MACHINE)
  set(MIPS_LINUX_SYSROOT /usr/bin)
 endif()
 # Install default compiler and linker flags only when CMAKE_CXX_FLAGS is not defined yet.
 if(NOT DEFINED CMAKE_CXX_FLAGS)
  # Flags for the 32-bit little-endian R5 target (MSA enabled, tuned for p5600).
  if(CMAKE_SYSTEM_PROCESSOR MATCHES "mips32r5el")
    set(CMAKE_C_FLAGS             "-march=mips32r5 -EL -mmsa -mhard-float -mfp64 -mnan=2008 -mabs=2008 -O3 -ffp-contract=off -mtune=p5600" CACHE INTERNAL "")
    set(CMAKE_SHARED_LINKER_FLAGS "" CACHE INTERNAL "")
    set(CMAKE_CXX_FLAGS           "-march=mips32r5 -EL -mmsa -mhard-float -mfp64 -mnan=2008 -mabs=2008 -O3 -ffp-contract=off -mtune=p5600" CACHE INTERNAL "")
    set(CMAKE_MODULE_LINKER_FLAGS "" CACHE INTERNAL "")
    set(CMAKE_EXE_LINKER_FLAGS    "-lpthread -lrt -ldl -latomic" CACHE INTERNAL "Added for mips cross build error")
    # Extra options appended to the flag values set above.
    set(CMAKE_CXX_FLAGS           "${CMAKE_CXX_FLAGS} -fdata-sections  -Wa,--noexecstack -fsigned-char -Wno-psabi")
    set(CMAKE_C_FLAGS             "${CMAKE_C_FLAGS} -fdata-sections  -Wa,--noexecstack -fsigned-char -Wno-psabi")
  # Flags for the 64-bit little-endian R6 target (MSA enabled, -march=i6500, 64-bit ABI).
  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64r6el")
    set(CMAKE_C_FLAGS             "-O3 -march=i6500 -EL -mmsa -mabi=64 -mhard-float -mfp64 -mnan=2008" CACHE INTERNAL "")
    set(CMAKE_SHARED_LINKER_FLAGS "" CACHE INTERNAL "")
    set(CMAKE_CXX_FLAGS           "-O3 -march=i6500 -EL -mmsa -mabi=64 -mhard-float -mfp64 -mnan=2008" CACHE INTERNAL "")
    set(CMAKE_MODULE_LINKER_FLAGS "" CACHE INTERNAL "")
    set(CMAKE_EXE_LINKER_FLAGS    "-lpthread -lrt -ldl" CACHE INTERNAL "Added for mips cross build error")
    # Extra options appended to the flag values set above.
    set(CMAKE_CXX_FLAGS           "${CMAKE_CXX_FLAGS} -fdata-sections -Wa,--noexecstack -fsigned-char -Wno-psabi")
    set(CMAKE_C_FLAGS             "${CMAKE_C_FLAGS} -fdata-sections -Wa,--noexecstack -fsigned-char -Wno-psabi")
  endif()
  # Prepend MIPS_LINKER_FLAGS to every linker flag set.
  # NOTE(review): MIPS_LINKER_FLAGS is not assigned anywhere in this file; presumably user-supplied.
  set(CMAKE_SHARED_LINKER_FLAGS "${MIPS_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}")
  set(CMAKE_MODULE_LINKER_FLAGS "${MIPS_LINKER_FLAGS} ${CMAKE_MODULE_LINKER_FLAGS}")
  set(CMAKE_EXE_LINKER_FLAGS    "${MIPS_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}")
 endif()
 # Add the sysroot to the find root path.
 set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${MIPS_LINUX_SYSROOT})
 # Register MIPS_LINUX_SYSROOT in the list of toolchain configuration variables
 # before calling toolchain_save_config().
 set(TOOLCHAIN_CONFIG_VARS ${TOOLCHAIN_CONFIG_VARS}
    MIPS_LINUX_SYSROOT
 )
 toolchain_save_config()
--- a/platforms/linux/mips32r5el-gnu.toolchain.cmake
+++ b/platforms/linux/mips32r5el-gnu.toolchain.cmake
@ -0,0 +1,14 @@
 # ----------------------------------------------------------------------------------------------
 #  The MIPS toolchain can be downloaded from https://www.mips.com/develop/tools/codescape-mips-sdk/ .
 #  Toolchains with 'mti' in the name (and install directory) are for MIPS R2-R5 instruction sets.
 #  Toolchains with 'img' in the name are for MIPS R6 instruction sets.
 #  It is recommended to use cmake-gui for build script configuration and generation:
 #  1. Run cmake-gui
 #  2. Specify the toolchain file mips32r5el-gnu.toolchain.cmake for cross-compiling.
 #  3. Configure and Generate makefiles.
 #  4. make -j4 && make install
 # ----------------------------------------------------------------------------------------------
 # Select the 32-bit little-endian R5 target; this must be set before the common
 # file is included because that file only assigns a processor when none is defined.
 set(CMAKE_SYSTEM_PROCESSOR mips32r5el)
 # User-overridable cache entries: optional GCC version suffix and the tool name prefix.
 set(GCC_COMPILER_VERSION "" CACHE STRING "GCC Compiler version")
 set(GNU_MACHINE "mips-mti-linux-gnu" CACHE STRING "GNU compiler triple")
 # Hand over to the common MIPS toolchain logic located next to this file.
 include("${CMAKE_CURRENT_LIST_DIR}/mips.toolchain.cmake")
--- a/platforms/linux/mips64r6el-gnu.toolchain.cmake
+++ b/platforms/linux/mips64r6el-gnu.toolchain.cmake
@ -0,0 +1,14 @@
 # ----------------------------------------------------------------------------------------------
 #  The MIPS toolchain can be downloaded from https://www.mips.com/develop/tools/codescape-mips-sdk/ .
 #  Toolchains with 'mti' in the name (and install directory) are for MIPS R2-R5 instruction sets.
 #  Toolchains with 'img' in the name are for MIPS R6 instruction sets.
 #  It is recommended to use cmake-gui for build script configuration and generation:
 #  1. Run cmake-gui
 #  2. Specify the toolchain file mips64r6el-gnu.toolchain.cmake for cross-compiling.
 #  3. Configure and Generate makefiles.
 #  4. make -j4 && make install
 # ----------------------------------------------------------------------------------------------
 # Select the 64-bit little-endian R6 target; this must be set before the common
 # file is included because that file only assigns a processor when none is defined.
 set(CMAKE_SYSTEM_PROCESSOR mips64r6el)
 # User-overridable cache entries: optional GCC version suffix and the tool name prefix.
 set(GCC_COMPILER_VERSION "" CACHE STRING "GCC Compiler version")
 set(GNU_MACHINE "mips-img-linux-gnu" CACHE STRING "GNU compiler triple")
 # Hand over to the common MIPS toolchain logic located next to this file.
 include("${CMAKE_CURRENT_LIST_DIR}/mips.toolchain.cmake")
--- a/samples/dnn/object_detection.cpp
+++ b/samples/dnn/object_detection.cpp
@ -347,7 +347,7 @@ void postprocess(Mat& frame, const std::vector<Mat>& outs, Net& net)
                    int bottom = (int)data[i + 6];
                    int width  = right - left + 1;
                    int height = bottom - top + 1;
-                    if (width * height <= 1)
+                    if (width <= 2 || height <= 2)
                    {
                        left   = (int)(data[i + 3] * frame.cols);
                        top    = (int)(data[i + 4] * frame.rows);
--- a/samples/dnn/object_detection.py
+++ b/samples/dnn/object_detection.py
@ -126,7 +126,7 @@ def postprocess(frame, outs):
                    bottom = int(detection[6])
                    width = right - left + 1
                    height = bottom - top + 1
-                    if width * height <= 1:
+                    if width <= 2 or height <= 2:
                        left = int(detection[3] * frameWidth)
                        top = int(detection[4] * frameHeight)
                        right = int(detection[5] * frameWidth)