mirror of https://github.com/opencv/opencv.git
Zlib-ng is a zlib replacement with optimizations for "next generation" systems. Its optimizations may benefit the decode and encode speed of image libraries such as libpng. In our tests, using the zlib-ng + libpng combination on an x86_64 machine with AVX2, the time spent in `imdecode` and `imencode` drops by roughly 20%. This patch enables zlib-ng's optimizations when `CV_DISABLE_OPTIMIZATION` is OFF. Since zlib-ng can dispatch intrinsics on the fly, the porting work is much easier. Related discussion: https://github.com/opencv/opencv/issues/22573
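The ~20% figure is workload-dependent. A minimal round-trip timing sketch at the zlib API level, assuming a zlib-compat build of zlib-ng is linked in place of zlib (the buffer size and fill pattern are illustrative, not OpenCV's benchmark):

```c
/* Rough zlib round-trip timing; link against zlib-ng built with ZLIB_COMPAT. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <zlib.h>

int main(void) {
    const uLong n = 16UL * 1024 * 1024;
    unsigned char *src  = malloc(n);
    uLongf cap = compressBound(n);
    unsigned char *comp = malloc(cap);
    unsigned char *back = malloc(n);
    if (!src || !comp || !back) return 1;
    for (uLong i = 0; i < n; i++)              /* mildly compressible filler */
        src[i] = (unsigned char)(i * 31 % 251);

    clock_t t0 = clock();
    uLongf clen = cap;
    if (compress2(comp, &clen, src, n, Z_DEFAULT_COMPRESSION) != Z_OK) return 1;
    uLongf dlen = n;
    if (uncompress(back, &dlen, comp, clen) != Z_OK) return 1;
    clock_t t1 = clock();

    printf("round trip: %.3f s (%lu -> %lu bytes)\n",
           (double)(t1 - t0) / CLOCKS_PER_SEC, (unsigned long)n, (unsigned long)clen);
    return memcmp(src, back, n) != 0;
}
```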
parent
e80b7940ef
commit
0de26fd78e
129 changed files with 31910 additions and 13 deletions
@@ -0,0 +1,796 @@
||||
project(${ZLIB_LIBRARY} LANGUAGES C) |
||||
|
||||
if("c_std_11" IN_LIST CMAKE_C_COMPILE_FEATURES) |
||||
set(CMAKE_C_STANDARD 11) # The C standard whose features are requested to build this target |
||||
else() |
||||
set(CMAKE_C_STANDARD 99) |
||||
endif() |
||||
set(CMAKE_C_STANDARD_REQUIRED ON) # Boolean describing whether the value of C_STANDARD is a requirement |
||||
set(CMAKE_C_EXTENSIONS OFF) # Boolean specifying whether compiler specific extensions are requested |
||||
|
||||
include(CheckTypeSize) |
||||
include(CheckSymbolExists) |
||||
include(CheckFunctionExists) |
||||
include(CheckIncludeFile) |
||||
include(CheckCSourceCompiles) |
||||
include(CheckCSourceRuns) |
||||
include(CheckCCompilerFlag) |
||||
include(CMakeDependentOption) |
||||
|
||||
if(X86_64 OR X86) |
||||
set(BASEARCH_X86_FOUND TRUE) |
||||
endif() |
||||
if(AARCH64 OR ARM) |
||||
set(BASEARCH_ARM_FOUND TRUE) |
||||
endif() |
||||
if(PPC64LE OR PPC64) |
||||
set(BASEARCH_PPC_FOUND TRUE) |
||||
endif() |
||||
if(RISCV) |
||||
set(BASEARCH_RISCV_FOUND TRUE) |
||||
endif() |
||||
|
||||
include(cmake/detect-intrinsics.cmake) |
||||
include(cmake/fallback-macros.cmake) |
||||
|
||||
set(ZLIB_SYMBOL_PREFIX "") |
||||
|
||||
if(BASEARCH_X86_FOUND) |
||||
set(WITH_AVX2 ON) |
||||
set(WITH_AVX512 ON) |
||||
set(WITH_AVX512VNNI ON) |
||||
set(WITH_SSE2 ON) |
||||
set(WITH_SSSE3 ON) |
||||
set(WITH_SSE42 ON) |
||||
set(WITH_PCLMULQDQ ON) |
||||
set(WITH_VPCLMULQDQ ON) |
||||
endif() |
||||
if(BASEARCH_ARM_FOUND) |
||||
set(WITH_ACLE ON) |
||||
set(WITH_NEON ON) |
||||
if(ARM) |
||||
set(WITH_ARMV6 ON) |
||||
else() |
||||
set(WITH_ARMV6 OFF) |
||||
endif() |
||||
endif() |
||||
if(BASEARCH_PPC_FOUND) |
||||
set(WITH_ALTIVEC ON) |
||||
set(WITH_POWER8 ON) |
||||
set(WITH_POWER9 ON) |
||||
endif() |
||||
if(BASEARCH_RISCV_FOUND) |
||||
set(WITH_RVV ON) |
||||
endif() |
||||
|
||||
|
||||
add_definitions(-DZLIB_COMPAT) |
||||
|
||||
add_definitions(-DWITH_GZFILEOP) |
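# ZLIB_COMPAT builds zlib-ng with the classic zlib API so it can serve as a
# drop-in zlib; WITH_GZFILEOP keeps the gzFile-related functions available.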
||||
|
||||
if(CMAKE_C_COMPILER_ID MATCHES "^Intel") |
||||
set(WARNFLAGS_DISABLE) |
||||
elseif(MSVC) |
||||
# Minimum supported MSVC version is 1800 = Visual Studio 12.0/2013 |
||||
# See also https://cmake.org/cmake/help/latest/variable/MSVC_VERSION.html |
||||
if(MSVC_VERSION VERSION_LESS 1800) |
||||
message(SEND_ERROR "Unsupported Visual Studio compiler version (requires 2013 or later).") |
||||
endif() |
||||
# TODO. ICC can be used through MSVC. I'm not sure if we'd ever see that combination |
||||
# (who'd use cmake from an IDE...) but checking for ICC before checking for MSVC should |
||||
# avoid mistakes. |
||||
# /Oi ? |
||||
set(WARNFLAGS_DISABLE) |
||||
if(BASEARCH_ARM_FOUND) |
||||
add_definitions(-D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE) |
||||
if(NOT "${ARCH}" MATCHES "aarch64") |
||||
set(NEONFLAG "/arch:VFPv4") |
||||
endif() |
||||
endif() |
||||
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") |
||||
set(WARNFLAGS_DISABLE) |
||||
# Check whether -fno-lto is available |
||||
set(CMAKE_REQUIRED_FLAGS "-fno-lto") |
||||
check_c_source_compiles( |
||||
"int main() { return 0; }" |
||||
FNO_LTO_AVAILABLE FAIL_REGEX "not supported") |
||||
set(CMAKE_REQUIRED_FLAGS) |
||||
if(FNO_LTO_AVAILABLE) |
||||
set(ZNOLTOFLAG "-fno-lto") |
||||
endif() |
||||
if(BASEARCH_ARM_FOUND) |
||||
if(ARM AND NOT CMAKE_C_FLAGS MATCHES "-mfloat-abi") |
||||
# Auto-detect support for ARM floating point ABI |
||||
check_include_file(features.h HAVE_FEATURES_H) |
||||
if(HAVE_FEATURES_H) |
||||
set(CMAKE_REQUIRED_FLAGS -mfloat-abi=softfp) |
||||
check_c_source_compiles( |
||||
"#include <features.h> |
||||
int main() { return 0; }" |
||||
HAVE_FLOATABI_SOFTFP) |
||||
if(HAVE_FLOATABI_SOFTFP) |
||||
set(FLOATABI -mfloat-abi=softfp) |
||||
else() |
||||
set(CMAKE_REQUIRED_FLAGS -mfloat-abi=hard) |
||||
check_c_source_compiles( |
||||
"#include <features.h> |
||||
int main() { return 0; }" |
||||
HAVE_FLOATABI_HARD) |
||||
if(HAVE_FLOATABI_HARD) |
||||
set(FLOATABI -mfloat-abi=hard) |
||||
endif() |
||||
endif() |
||||
set(CMAKE_REQUIRED_FLAGS) |
||||
endif() |
||||
if(FLOATABI) |
||||
message(STATUS "${ZLIB_LIBRARY} ARM floating point arch: ${FLOATABI}") |
||||
add_compile_options(${FLOATABI}) |
||||
else() |
||||
message(STATUS "${ZLIB_LIBRARY} ARM floating point arch not auto-detected") |
||||
endif() |
||||
endif() |
||||
endif() |
||||
if(FNO_LTO_AVAILABLE) |
||||
set(NOLTOFLAG ${ZNOLTOFLAG}) |
||||
endif() |
||||
if(MINGW) |
||||
# Add `-Wno-pedantic-ms-format` only if the toolchain supports it |
||||
check_c_compiler_flag(-Wno-pedantic-ms-format HAVE_NO_PEDANTIC_MS_FORMAT) |
||||
if(HAVE_NO_PEDANTIC_MS_FORMAT) |
||||
list(APPEND WARNFLAGS_DISABLE -Wno-pedantic-ms-format) |
||||
endif() |
||||
endif() |
||||
endif() |
||||
|
||||
# Force disable LTO |
||||
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF) |
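# (The arch-specific sources below are compiled with per-file ISA flags; under
# LTO such code could be inlined into generic paths and fault on older CPUs.)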
||||
|
||||
# Apply warning compiler flags |
||||
add_compile_options(${WARNFLAGS_DISABLE}) |
||||
|
||||
# Replace optimization level 3 added by default with level 2 |
||||
if(NOT MSVC AND NOT CMAKE_C_FLAGS MATCHES "([\\/\\-]O)3") |
||||
string(REGEX REPLACE "([\\/\\-]O)3" "\\12" |
||||
CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}") |
||||
endif() |
||||
|
||||
# |
||||
# Check for standard/system includes |
||||
# |
||||
check_include_file(arm_acle.h HAVE_ARM_ACLE_H) |
||||
if(HAVE_ARM_ACLE_H) |
||||
add_definitions(-DHAVE_ARM_ACLE_H) |
||||
endif() |
||||
check_include_file(sys/auxv.h HAVE_SYS_AUXV_H) |
||||
if(HAVE_SYS_AUXV_H) |
||||
add_definitions(-DHAVE_SYS_AUXV_H) |
||||
endif() |
||||
check_include_file(sys/sdt.h HAVE_SYS_SDT_H) |
||||
if(HAVE_SYS_SDT_H) |
||||
add_definitions(-DHAVE_SYS_SDT_H) |
||||
endif() |
||||
check_include_file(unistd.h HAVE_UNISTD_H) |
||||
|
||||
# |
||||
# Check to see if we have large file support |
||||
# |
||||
set(CMAKE_REQUIRED_DEFINITIONS -D_LARGEFILE64_SOURCE=1 -D__USE_LARGEFILE64) |
||||
check_type_size(off64_t OFF64_T) |
||||
if(HAVE_OFF64_T) |
||||
add_definitions(-D_LARGEFILE64_SOURCE=1 -D__USE_LARGEFILE64) |
||||
else() |
||||
check_type_size(_off64_t _OFF64_T) |
||||
if(HAVE__OFF64_T) |
||||
add_definitions(-D_LARGEFILE64_SOURCE=1 -D__USE_LARGEFILE64) |
||||
else() |
||||
check_type_size(__off64_t __OFF64_T) |
||||
endif() |
||||
endif() |
||||
set(CMAKE_REQUIRED_DEFINITIONS) # clear variable |
||||
|
||||
# |
||||
# Check for fseeko and other optional functions |
||||
# |
||||
check_function_exists(fseeko HAVE_FSEEKO) |
||||
if(NOT HAVE_FSEEKO) |
||||
add_definitions(-DNO_FSEEKO) |
||||
endif() |
||||
|
||||
check_function_exists(strerror HAVE_STRERROR) |
||||
if(NOT HAVE_STRERROR) |
||||
add_definitions(-DNO_STRERROR) |
||||
endif() |
||||
|
||||
set(CMAKE_REQUIRED_DEFINITIONS -D_POSIX_C_SOURCE=200112L) |
||||
check_symbol_exists(posix_memalign stdlib.h HAVE_POSIX_MEMALIGN) |
||||
if(HAVE_POSIX_MEMALIGN) |
||||
add_definitions(-DHAVE_POSIX_MEMALIGN) |
||||
endif() |
||||
set(CMAKE_REQUIRED_DEFINITIONS) |
||||
|
||||
set(CMAKE_REQUIRED_DEFINITIONS -D_ISOC11_SOURCE=1) |
||||
check_symbol_exists(aligned_alloc stdlib.h HAVE_ALIGNED_ALLOC) |
||||
if(HAVE_ALIGNED_ALLOC) |
||||
add_definitions(-DHAVE_ALIGNED_ALLOC) |
||||
endif() |
||||
set(CMAKE_REQUIRED_DEFINITIONS) |
||||
|
||||
# |
||||
# Check if we can hide zlib internal symbols that are linked between separate source files using the "hidden" visibility attribute
||||
# |
||||
check_c_source_compiles( |
||||
"#define Z_INTERNAL __attribute__((visibility (\"hidden\"))) |
||||
int Z_INTERNAL foo; |
||||
int main() { |
||||
return 0; |
||||
}" |
||||
HAVE_ATTRIBUTE_VISIBILITY_HIDDEN FAIL_REGEX "visibility") |
||||
if(HAVE_ATTRIBUTE_VISIBILITY_HIDDEN) |
||||
add_definitions(-DHAVE_VISIBILITY_HIDDEN) |
||||
endif() |
||||
|
||||
# |
||||
# Check if we can hide zlib internal symbols that are linked between separate source files using the "internal" visibility attribute
||||
# |
||||
check_c_source_compiles( |
||||
"#define Z_INTERNAL __attribute__((visibility (\"internal\"))) |
||||
int Z_INTERNAL foo; |
||||
int main() { |
||||
return 0; |
||||
}" |
||||
HAVE_ATTRIBUTE_VISIBILITY_INTERNAL FAIL_REGEX "visibility") |
||||
if(HAVE_ATTRIBUTE_VISIBILITY_INTERNAL) |
||||
add_definitions(-DHAVE_VISIBILITY_INTERNAL) |
||||
endif() |
||||
|
||||
# |
||||
# Check for __attribute__((aligned(x))) support in the compiler |
||||
# |
||||
check_c_source_compiles( |
||||
"int main(void) { |
||||
__attribute__((aligned(8))) int test = 0; |
||||
(void)test; |
||||
return 0; |
||||
}" |
||||
HAVE_ATTRIBUTE_ALIGNED FAIL_REGEX "aligned") |
||||
if(HAVE_ATTRIBUTE_ALIGNED) |
||||
add_definitions(-DHAVE_ATTRIBUTE_ALIGNED) |
||||
endif() |
||||
|
||||
# |
||||
# check for __builtin_ctz() support in the compiler |
||||
# |
||||
check_c_source_compiles( |
||||
"int main(void) { |
||||
unsigned int zero = 0; |
||||
long test = __builtin_ctz(zero); |
||||
(void)test; |
||||
return 0; |
||||
}" |
||||
HAVE_BUILTIN_CTZ |
||||
) |
||||
if(HAVE_BUILTIN_CTZ) |
||||
add_definitions(-DHAVE_BUILTIN_CTZ) |
||||
endif() |
||||
|
||||
# |
||||
# check for __builtin_ctzll() support in the compiler |
||||
# |
||||
check_c_source_compiles( |
||||
"int main(void) { |
||||
unsigned int zero = 0; |
||||
long test = __builtin_ctzll(zero); |
||||
(void)test; |
||||
return 0; |
||||
}" |
||||
HAVE_BUILTIN_CTZLL |
||||
) |
||||
if(HAVE_BUILTIN_CTZLL) |
||||
add_definitions(-DHAVE_BUILTIN_CTZLL) |
||||
endif() |
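Where these builtins are missing (notably MSVC, which instead gets `fallback_builtins.h`, added for x86 later in this file), a count-trailing-zeros shim can be built from compiler intrinsics. A minimal sketch assuming MSVC's `_BitScanForward`/`_BitScanForward64`; zlib-ng's actual fallback header may differ:

```c
/* Sketch of __builtin_ctz/__builtin_ctzll fallbacks for MSVC; the input
 * must be nonzero, matching the GCC/Clang builtins' precondition. */
#if defined(_MSC_VER) && !defined(HAVE_BUILTIN_CTZ)
#include <intrin.h>

static __forceinline unsigned ctz_fallback(unsigned int x) {
    unsigned long idx;
    _BitScanForward(&idx, x);      /* index of lowest set bit */
    return (unsigned)idx;
}

#ifdef _M_X64
static __forceinline unsigned ctzll_fallback(unsigned long long x) {
    unsigned long idx;
    _BitScanForward64(&idx, x);    /* 64-bit variant, x64 only */
    return (unsigned)idx;
}
#endif
#endif
```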
||||
|
||||
# |
||||
# check for ptrdiff_t support |
||||
# |
||||
check_c_source_compiles( |
||||
"#include <stddef.h> |
||||
int main() { |
||||
ptrdiff_t *a; |
||||
(void)a; |
||||
return 0; |
||||
}" |
||||
HAVE_PTRDIFF_T |
||||
) |
||||
if(NOT HAVE_PTRDIFF_T) |
||||
set(NEED_PTRDIFF_T 1) |
||||
|
||||
check_type_size("void *" SIZEOF_DATA_PTR) |
||||
message(STATUS "sizeof(void *) is ${SIZEOF_DATA_PTR} bytes") |
||||
|
||||
if(${SIZEOF_DATA_PTR} MATCHES "4") |
||||
set(PTRDIFF_TYPE "uint32_t") |
||||
elseif(${SIZEOF_DATA_PTR} MATCHES "8") |
||||
set(PTRDIFF_TYPE "uint64_t") |
||||
else() |
||||
message(FATAL_ERROR "sizeof(void *) is neither 32 nor 64 bit") |
||||
endif() |
||||
endif() |
||||
|
||||
if(MSVC) |
||||
add_definitions(-D_CRT_SECURE_NO_DEPRECATE) |
||||
add_definitions(-D_CRT_NONSTDC_NO_DEPRECATE) |
||||
endif() |
||||
|
||||
set(ZLIB_ARCH_SRCS) |
||||
set(ZLIB_ARCH_HDRS) |
||||
set(ARCHDIR "arch/generic") |
||||
if(BASEARCH_X86_FOUND) |
||||
set(ARCHDIR "arch/x86") |
||||
endif() |
||||
if(BASEARCH_ARM_FOUND) |
||||
set(ARCHDIR "arch/arm") |
||||
endif() |
||||
if(BASEARCH_PPC_FOUND) |
||||
set(ARCHDIR "arch/power") |
||||
endif() |
||||
if(BASEARCH_RISCV_FOUND) |
||||
set(ARCHDIR "arch/riscv") |
||||
endif() |
||||
|
||||
if(NOT CV_DISABLE_OPTIMIZATION) |
||||
if(BASEARCH_ARM_FOUND) |
||||
add_definitions(-DARM_FEATURES) |
||||
if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") |
||||
if("${ARCH}" MATCHES "aarch64") |
||||
check_c_source_compiles( |
||||
"#include <sys/auxv.h> |
||||
int main() { |
||||
return (getauxval(AT_HWCAP) & HWCAP_CRC32); |
||||
}" |
||||
ARM_AUXV_HAS_CRC32 |
||||
) |
||||
if(ARM_AUXV_HAS_CRC32) |
||||
add_definitions(-DARM_AUXV_HAS_CRC32) |
||||
else() |
||||
message(STATUS "HWCAP_CRC32 not present in sys/auxv.h; cannot detect support at runtime.") |
||||
endif() |
||||
else() |
||||
check_c_source_compiles( |
||||
"#include <sys/auxv.h> |
||||
int main() { |
||||
return (getauxval(AT_HWCAP2) & HWCAP2_CRC32); |
||||
}" |
||||
ARM_AUXV_HAS_CRC32 |
||||
) |
||||
if(ARM_AUXV_HAS_CRC32) |
||||
add_definitions(-DARM_AUXV_HAS_CRC32) |
||||
else() |
||||
check_c_source_compiles( |
||||
"#include <sys/auxv.h> |
||||
#include <asm/hwcap.h> |
||||
int main() { |
||||
return (getauxval(AT_HWCAP2) & HWCAP2_CRC32); |
||||
}" |
||||
ARM_HWCAP_HAS_CRC32 |
||||
) |
||||
if(ARM_HWCAP_HAS_CRC32) |
||||
add_definitions(-DARM_AUXV_HAS_CRC32 -DARM_ASM_HWCAP) |
||||
else() |
||||
message(STATUS "HWCAP2_CRC32 not present in sys/auxv.h; cannot detect support at runtime.") |
||||
endif() |
||||
endif() |
||||
check_c_source_compiles( |
||||
"#include <sys/auxv.h> |
||||
int main() { |
||||
return (getauxval(AT_HWCAP) & HWCAP_ARM_NEON); |
||||
}" |
||||
ARM_AUXV_HAS_NEON |
||||
) |
||||
if(ARM_AUXV_HAS_NEON) |
||||
add_definitions(-DARM_AUXV_HAS_NEON) |
||||
else() |
||||
check_c_source_compiles( |
||||
"#include <sys/auxv.h> |
||||
int main() { |
||||
return (getauxval(AT_HWCAP) & HWCAP_NEON); |
||||
}" |
||||
ARM_AUXV_HAS_NEON |
||||
) |
||||
if (ARM_AUXV_HAS_NEON) |
||||
add_definitions(-DARM_AUXV_HAS_NEON) |
||||
else() |
||||
message(STATUS "Neither HWCAP_ARM_NEON or HWCAP_NEON present in sys/auxv.h; cannot detect support at runtime.") |
||||
endif() |
||||
endif() |
||||
endif() |
||||
endif() |
||||
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/arm_features.h) |
||||
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/arm_features.c) |
||||
if(WITH_ACLE) |
||||
check_acle_compiler_flag() |
||||
if(HAVE_ACLE_FLAG) |
||||
add_definitions(-DARM_ACLE) |
||||
set(ACLE_SRCS ${ARCHDIR}/crc32_acle.c ${ARCHDIR}/insert_string_acle.c) |
||||
set_property(SOURCE ${ACLE_SRCS} PROPERTY COMPILE_FLAGS "${ACLEFLAG} ${NOLTOFLAG}") |
||||
list(APPEND ZLIB_ARCH_SRCS ${ACLE_SRCS}) |
||||
else() |
||||
set(WITH_ACLE OFF) |
||||
endif() |
||||
else() |
||||
set(WITH_ACLE OFF) |
||||
endif() |
||||
if(WITH_NEON) |
||||
check_neon_compiler_flag() |
||||
if(NEON_AVAILABLE) |
||||
add_definitions(-DARM_NEON) |
||||
set(NEON_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/chunkset_neon.c |
||||
${ARCHDIR}/compare256_neon.c ${ARCHDIR}/slide_hash_neon.c) |
||||
list(APPEND ZLIB_ARCH_SRCS ${NEON_SRCS}) |
||||
set_property(SOURCE ${NEON_SRCS} PROPERTY COMPILE_FLAGS "${NEONFLAG} ${NOLTOFLAG}") |
||||
if(MSVC) |
||||
add_definitions(-D__ARM_NEON__) |
||||
endif() |
||||
check_neon_ld4_intrinsics() |
||||
if(NEON_HAS_LD4) |
||||
add_definitions(-DARM_NEON_HASLD4) |
||||
endif() |
||||
else() |
||||
set(WITH_NEON OFF) |
||||
endif() |
||||
endif() |
||||
if(WITH_ARMV6) |
||||
check_armv6_compiler_flag() |
||||
if(HAVE_ARMV6_INLINE_ASM OR HAVE_ARMV6_INTRIN) |
||||
add_definitions(-DARM_SIMD) |
||||
set(ARMV6_SRCS ${ARCHDIR}/slide_hash_armv6.c) |
||||
set_property(SOURCE ${ARMV6_SRCS} PROPERTY COMPILE_FLAGS "${ARMV6FLAG} ${NOLTOFLAG}") |
||||
list(APPEND ZLIB_ARCH_SRCS ${ARMV6_SRCS}) |
||||
if(HAVE_ARMV6_INTRIN) |
||||
add_definitions(-DARM_SIMD_INTRIN) |
||||
endif() |
||||
else() |
||||
set(WITH_ARMV6 OFF) |
||||
endif() |
||||
else() |
||||
set(WITH_ARMV6 OFF) |
||||
endif() |
||||
endif() |
||||
if(BASEARCH_PPC_FOUND) |
||||
# Common arch detection code |
||||
if(WITH_ALTIVEC) |
||||
check_ppc_intrinsics() |
||||
endif() |
||||
if(WITH_POWER8) |
||||
check_power8_intrinsics() |
||||
endif() |
||||
if(WITH_POWER9) |
||||
check_power9_intrinsics() |
||||
endif() |
||||
if(HAVE_VMX OR HAVE_POWER8_INTRIN OR HAVE_POWER9_INTRIN) |
||||
add_definitions(-DPOWER_FEATURES) |
||||
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/power_features.h) |
||||
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/power_features.c) |
||||
endif() |
||||
# VMX specific options and files |
||||
if(WITH_ALTIVEC) |
||||
if(HAVE_VMX) |
||||
add_definitions(-DPPC_FEATURES) |
||||
if(HAVE_ALTIVEC) |
||||
add_definitions(-DPPC_VMX) |
||||
set(PPC_SRCS ${ARCHDIR}/adler32_vmx.c ${ARCHDIR}/slide_hash_vmx.c) |
||||
list(APPEND ZLIB_ARCH_SRCS ${PPC_SRCS}) |
||||
set_property(SOURCE ${PPC_SRCS} PROPERTY COMPILE_FLAGS "${PPCFLAGS}") |
||||
else() |
||||
set(WITH_ALTIVEC OFF) |
||||
endif() |
||||
endif() |
||||
endif() |
||||
# Power8 specific options and files |
||||
if(WITH_POWER8) |
||||
if(HAVE_POWER8_INTRIN) |
||||
add_definitions(-DPOWER8_VSX) |
||||
set(POWER8_SRCS ${ARCHDIR}/adler32_power8.c ${ARCHDIR}/chunkset_power8.c ${ARCHDIR}/slide_hash_power8.c) |
||||
if("${ARCH}" MATCHES "powerpc64(le)?") |
||||
add_definitions(-DPOWER8_VSX_CRC32) |
||||
list(APPEND POWER8_SRCS ${ARCHDIR}/crc32_power8.c) |
||||
endif() |
||||
list(APPEND ZLIB_ARCH_SRCS ${POWER8_SRCS}) |
||||
set_property(SOURCE ${POWER8_SRCS} PROPERTY COMPILE_FLAGS "${POWER8FLAG} ${NOLTOFLAG}") |
||||
else() |
||||
set(WITH_POWER8 OFF) |
||||
endif() |
||||
endif() |
||||
# Power9 specific options and files |
||||
if(WITH_POWER9) |
||||
if(HAVE_POWER9_INTRIN) |
||||
add_definitions(-DPOWER9) |
||||
set(POWER9_SRCS ${ARCHDIR}/compare256_power9.c) |
||||
list(APPEND ZLIB_ARCH_SRCS ${POWER9_SRCS}) |
||||
set_property(SOURCE ${POWER9_SRCS} PROPERTY COMPILE_FLAGS "${POWER9FLAG} ${NOLTOFLAG}") |
||||
else() |
||||
set(WITH_POWER9 OFF) |
||||
endif() |
||||
endif() |
||||
endif() |
||||
if(BASEARCH_RISCV_FOUND) |
||||
if(WITH_RVV) |
||||
check_rvv_intrinsics() |
||||
if(HAVE_RVV_INTRIN) |
||||
add_definitions(-DRISCV_FEATURES) |
||||
add_definitions(-DRISCV_RVV) |
||||
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/riscv_features.h) |
||||
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/riscv_features.c) |
||||
# FIXME: we will not set compile flags for riscv_features.c when |
||||
# the kernels update hwcap or hwprobe for riscv |
||||
set(RVV_SRCS ${ARCHDIR}/riscv_features.c ${ARCHDIR}/adler32_rvv.c ${ARCHDIR}/chunkset_rvv.c ${ARCHDIR}/compare256_rvv.c ${ARCHDIR}/slide_hash_rvv.c) |
||||
list(APPEND ZLIB_ARCH_SRCS ${RVV_SRCS}) |
||||
set_property(SOURCE ${RVV_SRCS} PROPERTY COMPILE_FLAGS "${RISCVFLAG} ${NOLTOFLAG}") |
||||
else() |
||||
set(WITH_RVV OFF) |
||||
endif() |
||||
endif() |
||||
endif() |
||||
if(BASEARCH_X86_FOUND) |
||||
add_definitions(-DX86_FEATURES) |
||||
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/x86_features.h) |
||||
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/x86_features.c) |
||||
if(MSVC) |
||||
list(APPEND ZLIB_ARCH_HDRS fallback_builtins.h) |
||||
endif() |
||||
if(WITH_AVX2) |
||||
check_avx2_intrinsics() |
||||
if(HAVE_AVX2_INTRIN) |
||||
add_definitions(-DX86_AVX2) |
||||
set(AVX2_SRCS ${ARCHDIR}/slide_hash_avx2.c) |
||||
list(APPEND AVX2_SRCS ${ARCHDIR}/chunkset_avx2.c) |
||||
list(APPEND AVX2_SRCS ${ARCHDIR}/compare256_avx2.c) |
||||
list(APPEND AVX2_SRCS ${ARCHDIR}/adler32_avx2.c) |
||||
list(APPEND ZLIB_ARCH_SRCS ${AVX2_SRCS}) |
||||
set_property(SOURCE ${AVX2_SRCS} PROPERTY COMPILE_FLAGS "${AVX2FLAG} ${NOLTOFLAG}") |
||||
else() |
||||
set(WITH_AVX2 OFF) |
||||
endif() |
||||
endif() |
||||
if(WITH_AVX512) |
||||
check_avx512_intrinsics() |
||||
if(HAVE_AVX512_INTRIN) |
||||
add_definitions(-DX86_AVX512) |
||||
list(APPEND AVX512_SRCS ${ARCHDIR}/adler32_avx512.c) |
||||
list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS}) |
||||
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/adler32_avx512_p.h) |
||||
if(HAVE_MASK_INTRIN) |
||||
add_definitions(-DX86_MASK_INTRIN) |
||||
endif() |
||||
set_property(SOURCE ${AVX512_SRCS} PROPERTY COMPILE_FLAGS "${AVX512FLAG} ${NOLTOFLAG}") |
||||
else() |
||||
set(WITH_AVX512 OFF) |
||||
endif() |
||||
endif() |
||||
if(WITH_AVX512VNNI) |
||||
check_avx512vnni_intrinsics() |
||||
if(HAVE_AVX512VNNI_INTRIN) |
||||
add_definitions(-DX86_AVX512VNNI) |
||||
list(APPEND AVX512VNNI_SRCS ${ARCHDIR}/adler32_avx512_vnni.c) |
||||
list(APPEND ZLIB_ARCH_SRCS ${AVX512VNNI_SRCS}) |
||||
set_property(SOURCE ${AVX512VNNI_SRCS} PROPERTY COMPILE_FLAGS "${AVX512VNNIFLAG} ${NOLTOFLAG}") |
||||
else() |
||||
set(WITH_AVX512VNNI OFF) |
||||
endif() |
||||
endif() |
||||
if(WITH_SSE42) |
||||
check_sse42_intrinsics() |
||||
if(HAVE_SSE42_INTRIN) |
||||
add_definitions(-DX86_SSE42) |
||||
set(SSE42_SRCS ${ARCHDIR}/adler32_sse42.c ${ARCHDIR}/insert_string_sse42.c) |
||||
list(APPEND ZLIB_ARCH_SRCS ${SSE42_SRCS}) |
||||
set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${NOLTOFLAG}") |
||||
else() |
||||
set(WITH_SSE42 OFF) |
||||
endif() |
||||
endif() |
||||
if(WITH_SSE2) |
||||
check_sse2_intrinsics() |
||||
if(HAVE_SSE2_INTRIN) |
||||
add_definitions(-DX86_SSE2) |
||||
set(SSE2_SRCS ${ARCHDIR}/chunkset_sse2.c ${ARCHDIR}/compare256_sse2.c ${ARCHDIR}/slide_hash_sse2.c) |
||||
list(APPEND ZLIB_ARCH_SRCS ${SSE2_SRCS}) |
||||
if(NOT ${ARCH} MATCHES "x86_64") |
||||
set_property(SOURCE ${SSE2_SRCS} PROPERTY COMPILE_FLAGS "${SSE2FLAG} ${NOLTOFLAG}") |
||||
add_definitions(-DX86_NOCHECK_SSE2) |
||||
endif() |
||||
else() |
||||
set(WITH_SSE2 OFF) |
||||
endif() |
||||
endif() |
||||
if(WITH_SSSE3) |
||||
check_ssse3_intrinsics() |
||||
if(HAVE_SSSE3_INTRIN) |
||||
add_definitions(-DX86_SSSE3) |
||||
set(SSSE3_SRCS ${ARCHDIR}/adler32_ssse3.c ${ARCHDIR}/chunkset_ssse3.c) |
||||
list(APPEND ZLIB_ARCH_SRCS ${SSSE3_SRCS}) |
||||
set_property(SOURCE ${SSSE3_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${NOLTOFLAG}") |
||||
else() |
||||
set(WITH_SSSE3 OFF) |
||||
endif() |
||||
endif() |
||||
if(WITH_PCLMULQDQ AND WITH_SSSE3 AND WITH_SSE42) |
||||
check_pclmulqdq_intrinsics() |
||||
if(HAVE_PCLMULQDQ_INTRIN AND HAVE_SSSE3_INTRIN) |
||||
add_definitions(-DX86_PCLMULQDQ_CRC) |
||||
set(PCLMULQDQ_SRCS ${ARCHDIR}/crc32_pclmulqdq.c) |
||||
list(APPEND ZLIB_ARCH_SRCS ${PCLMULQDQ_SRCS}) |
||||
set_property(SOURCE ${PCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG} ${NOLTOFLAG}") |
||||
|
||||
if(WITH_VPCLMULQDQ AND WITH_AVX512) |
||||
check_vpclmulqdq_intrinsics() |
||||
if(HAVE_VPCLMULQDQ_INTRIN AND HAVE_AVX512_INTRIN) |
||||
add_definitions(-DX86_VPCLMULQDQ_CRC) |
||||
set(VPCLMULQDQ_SRCS ${ARCHDIR}/crc32_vpclmulqdq.c) |
||||
list(APPEND ZLIB_ARCH_SRCS ${VPCLMULQDQ_SRCS}) |
||||
set_property(SOURCE ${VPCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG} ${VPCLMULFLAG} ${AVX512FLAG} ${NOLTOFLAG}") |
||||
else() |
||||
set(WITH_VPCLMULQDQ OFF) |
||||
endif() |
||||
else() |
||||
set(WITH_VPCLMULQDQ OFF) |
||||
endif() |
||||
else() |
||||
set(WITH_PCLMULQDQ OFF) |
||||
set(WITH_VPCLMULQDQ OFF) |
||||
endif() |
||||
else() |
||||
set(WITH_PCLMULQDQ OFF) |
||||
set(WITH_VPCLMULQDQ OFF) |
||||
endif() |
||||
check_xsave_intrinsics() |
||||
if(HAVE_XSAVE_INTRIN) |
||||
set_property(SOURCE ${ARCHDIR}/x86_features.c PROPERTY COMPILE_FLAGS "${XSAVEFLAG}") |
||||
endif() |
||||
endif() |
||||
endif() |
||||
|
||||
#============================================================================ |
||||
# zconf.h |
||||
#============================================================================ |
||||
|
||||
macro(generate_cmakein input output) |
||||
file(REMOVE ${output}) |
||||
file(STRINGS ${input} _lines) |
||||
foreach(_line IN LISTS _lines) |
||||
string(REGEX REPLACE "#ifdef HAVE_UNISTD_H.*" "@ZCONF_UNISTD_LINE@" _line "${_line}") |
||||
string(REGEX REPLACE "#ifdef NEED_PTRDIFF_T.*" "@ZCONF_PTRDIFF_LINE@" _line "${_line}") |
||||
if(NEED_PTRDIFF_T) |
||||
string(REGEX REPLACE "typedef PTRDIFF_TYPE" "typedef @PTRDIFF_TYPE@" _line "${_line}") |
||||
endif() |
||||
file(APPEND ${output} "${_line}\n") |
||||
endforeach() |
||||
endmacro(generate_cmakein) |
||||
|
||||
generate_cmakein( ${CMAKE_CURRENT_SOURCE_DIR}/zconf.h.in ${CMAKE_CURRENT_BINARY_DIR}/zconf.h.cmakein ) |
||||
|
||||
#============================================================================ |
||||
# zlib |
||||
#============================================================================ |
||||
|
||||
set(ZLIB_PUBLIC_HDRS |
||||
${CMAKE_CURRENT_BINARY_DIR}/zconf.h |
||||
${CMAKE_CURRENT_BINARY_DIR}/zlib_name_mangling.h |
||||
${CMAKE_CURRENT_BINARY_DIR}/zlib.h |
||||
) |
||||
set(ZLIB_PRIVATE_HDRS |
||||
adler32_p.h |
||||
chunkset_tpl.h |
||||
compare256_rle.h |
||||
cpu_features.h |
||||
crc32_braid_p.h |
||||
crc32_braid_comb_p.h |
||||
crc32_braid_tbl.h |
||||
crc32_fold.h |
||||
deflate.h |
||||
deflate_p.h |
||||
functable.h |
||||
inffast_tpl.h |
||||
inffixed_tbl.h |
||||
inflate.h |
||||
inflate_p.h |
||||
inftrees.h |
||||
insert_string_tpl.h |
||||
match_tpl.h |
||||
trees.h |
||||
trees_emit.h |
||||
trees_tbl.h |
||||
zbuild.h |
||||
zendian.h |
||||
zutil.h |
||||
) |
||||
set(ZLIB_SRCS |
||||
adler32.c |
||||
adler32_fold.c |
||||
chunkset.c |
||||
compare256.c |
||||
compress.c |
||||
cpu_features.c |
||||
crc32_braid.c |
||||
crc32_braid_comb.c |
||||
crc32_fold.c |
||||
deflate.c |
||||
deflate_fast.c |
||||
deflate_huff.c |
||||
deflate_medium.c |
||||
deflate_quick.c |
||||
deflate_rle.c |
||||
deflate_slow.c |
||||
deflate_stored.c |
||||
functable.c |
||||
infback.c |
||||
inflate.c |
||||
inftrees.c |
||||
insert_string.c |
||||
insert_string_roll.c |
||||
slide_hash.c |
||||
trees.c |
||||
uncompr.c |
||||
zutil.c |
||||
) |
||||
|
||||
set(ZLIB_GZFILE_PRIVATE_HDRS |
||||
gzguts.h |
||||
) |
||||
set(ZLIB_GZFILE_SRCS |
||||
gzlib.c |
||||
${CMAKE_CURRENT_BINARY_DIR}/gzread.c |
||||
gzwrite.c |
||||
) |
||||
|
||||
set(ZLIB_ALL_SRCS ${ZLIB_SRCS} ${ZLIB_ARCH_HDRS} ${ZLIB_ARCH_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) |
||||
list(APPEND ZLIB_ALL_SRCS ${ZLIB_GZFILE_PRIVATE_HDRS} ${ZLIB_GZFILE_SRCS}) |
||||
|
||||
add_library(zlib STATIC ${ZLIB_ALL_SRCS}) |
||||
|
||||
target_include_directories(zlib PUBLIC |
||||
"$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR};${CMAKE_CURRENT_SOURCE_DIR}>" |
||||
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>") |
||||
|
||||
if(HAVE_UNISTD_H) |
||||
SET(ZCONF_UNISTD_LINE "#if 1 /* was set to #if 1 by configure/cmake/etc */") |
||||
else() |
||||
SET(ZCONF_UNISTD_LINE "#if 0 /* was set to #if 0 by configure/cmake/etc */") |
||||
endif() |
||||
if(NEED_PTRDIFF_T) |
||||
SET(ZCONF_PTRDIFF_LINE "#if 1 /* was set to #if 1 by configure/cmake/etc */") |
||||
else() |
||||
SET(ZCONF_PTRDIFF_LINE "#ifdef NEED_PTRDIFF_T /* may be set to #if 1 by configure/cmake/etc */") |
||||
endif() |
||||
|
||||
configure_file(${CMAKE_CURRENT_BINARY_DIR}/zconf.h.cmakein |
||||
${CMAKE_CURRENT_BINARY_DIR}/zconf.h @ONLY) |
||||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib.h.in |
||||
${CMAKE_CURRENT_BINARY_DIR}/zlib.h @ONLY) |
||||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/gzread.c.in |
||||
${CMAKE_CURRENT_BINARY_DIR}/gzread.c @ONLY) |
||||
|
||||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib_name_mangling.h.empty |
||||
${CMAKE_CURRENT_BINARY_DIR}/zlib_name_mangling${SUFFIX}.h COPYONLY) |
||||
|
||||
ocv_warnings_disable(CMAKE_C_FLAGS -Wmissing-prototypes |
||||
-Wundef |
||||
-Wmissing-declarations |
||||
) |
||||
|
||||
set_target_properties(${ZLIB_LIBRARY} PROPERTIES |
||||
OUTPUT_NAME ${ZLIB_LIBRARY} |
||||
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}" |
||||
COMPILE_PDB_NAME ${ZLIB_LIBRARY} |
||||
COMPILE_PDB_NAME_DEBUG "${ZLIB_LIBRARY}${OPENCV_DEBUG_POSTFIX}" |
||||
ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH} |
||||
) |
||||
|
||||
if(ENABLE_SOLUTION_FOLDERS) |
||||
set_target_properties(${ZLIB_LIBRARY} PROPERTIES FOLDER "3rdparty") |
||||
endif() |
||||
|
||||
if(NOT BUILD_SHARED_LIBS) |
||||
ocv_install_target(${ZLIB_LIBRARY} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev) |
||||
endif() |
||||
|
||||
ocv_install_3rdparty_licenses(${ZLIB_LIBRARY} LICENSE.md) |
@@ -0,0 +1,19 @@
||||
(C) 1995-2013 Jean-loup Gailly and Mark Adler |
||||
|
||||
This software is provided 'as-is', without any express or implied |
||||
warranty. In no event will the authors be held liable for any damages |
||||
arising from the use of this software. |
||||
|
||||
Permission is granted to anyone to use this software for any purpose, |
||||
including commercial applications, and to alter it and redistribute it |
||||
freely, subject to the following restrictions: |
||||
|
||||
1. The origin of this software must not be misrepresented; you must not |
||||
claim that you wrote the original software. If you use this software |
||||
in a product, an acknowledgment in the product documentation would be |
||||
appreciated but is not required. |
||||
|
||||
2. Altered source versions must be plainly marked as such, and must not be |
||||
misrepresented as being the original software. |
||||
|
||||
3. This notice may not be removed or altered from any source distribution. |
@@ -0,0 +1,229 @@
||||
| CI | Stable | Develop | |
||||
|:---|:-------|:--------| |
||||
| GitHub Actions | [![Stable CMake](https://github.com/zlib-ng/zlib-ng/actions/workflows/cmake.yml/badge.svg?branch=stable)](https://github.com/zlib-ng/zlib-ng/actions/workflows/cmake.yml?query=branch%3Astable) <br> [![Stable Configure](https://github.com/zlib-ng/zlib-ng/actions/workflows/configure.yml/badge.svg?branch=stable)](https://github.com/zlib-ng/zlib-ng/actions/workflows/configure.yml?query=branch%3Astable) <br> [![Stable NMake](https://github.com/zlib-ng/zlib-ng/actions/workflows/nmake.yml/badge.svg?branch=stable)](https://github.com/zlib-ng/zlib-ng/actions/workflows/nmake.yml?query=branch%3Astable) | [![Develop CMake](https://github.com/zlib-ng/zlib-ng/actions/workflows/cmake.yml/badge.svg?branch=develop)](https://github.com/zlib-ng/zlib-ng/actions/workflows/cmake.yml?query=branch%3Adevelop) <br> [![Develop Configure](https://github.com/zlib-ng/zlib-ng/actions/workflows/configure.yml/badge.svg?branch=develop)](https://github.com/zlib-ng/zlib-ng/actions/workflows/configure.yml?query=branch%3Adevelop) <br> [![Develop NMake](https://github.com/zlib-ng/zlib-ng/actions/workflows/nmake.yml/badge.svg?branch=develop)](https://github.com/zlib-ng/zlib-ng/actions/workflows/nmake.yml?query=branch%3Adevelop) | |
||||
| CodeFactor | [![CodeFactor](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/badge/stable)](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/overview/stable) | [![CodeFactor](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/badge/develop)](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/overview/develop) | |
||||
| OSS-Fuzz | [![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/zlib-ng.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:zlib-ng) | [![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/zlib-ng.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:zlib-ng) | |
||||
| Codecov | [![codecov](https://codecov.io/github/zlib-ng/zlib-ng/branch/stable/graph/badge.svg?token=uKsgK9LIuC)](https://codecov.io/github/zlib-ng/zlib-ng/tree/stable) | [![codecov](https://codecov.io/github/zlib-ng/zlib-ng/branch/develop/graph/badge.svg?token=uKsgK9LIuC)](https://codecov.io/github/zlib-ng/zlib-ng/tree/develop) | |
||||
|
||||
## zlib-ng |
||||
*zlib data compression library for the next generation systems* |
||||
|
||||
Maintained by Hans Kristian Rosbach |
||||
aka Dead2 (zlib-ng àt circlestorm dót org) |
||||
|
||||
Features |
||||
-------- |
||||
|
||||
* Zlib compatible API with support for dual-linking |
||||
* Modernized native API based on zlib API for ease of porting |
||||
* Modern C11 syntax and a clean code layout |
||||
* Deflate medium and quick algorithms based on Intel’s zlib fork |
||||
* Support for CPU intrinsics when available |
||||
* Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX |
||||
* CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z |
||||
* Hash table implementation using CRC32-C intrinsics on x86 and ARM |
||||
* Slide hash implementations using SSE2, AVX2, ARMv6, Neon, VMX & VSX |
||||
* Compare256 implementations using SSE2, AVX2, Neon, POWER9 & RVV |
||||
* Inflate chunk copying using SSE2, SSSE3, AVX, Neon & VSX |
||||
* Support for hardware-accelerated deflate using IBM Z DFLTCC |
||||
* Unaligned memory read/writes and large bit buffer improvements |
||||
* Includes improvements from Cloudflare and Intel forks |
||||
* Configure, CMake, and NMake build system support |
||||
* Comprehensive set of CMake unit tests |
||||
* Code sanitizers, fuzzing, and coverage |
||||
* GitHub Actions continuous integration on Windows, macOS, and Linux |
||||
* Emulated CI for ARM, AARCH64, PPC, PPC64, RISCV, SPARC64, S390x using qemu |
||||
|
||||
|
||||
History |
||||
------- |
||||
|
||||
The motivation for this fork was seeing several 3rd party contributions with new optimizations not getting |
||||
implemented into the official zlib repository. |
||||
|
||||
Mark Adler has been maintaining zlib for a very long time, and he has done a great job and hopefully he will continue |
||||
for a long time yet. The idea of zlib-ng is not to replace zlib, but to co-exist as a drop-in replacement with a |
||||
lower threshold for code change. |
||||
|
||||
zlib has a long history and is incredibly portable, even supporting many systems that predate the Internet.<br> |
||||
That is great, but it can complicate further development and maintainability. The zlib code contains many workarounds |
||||
for really old compilers or to accommodate systems with limitations such as operating in a 16-bit environment. |
||||
|
||||
Many of these workarounds are only maintenance burdens, and some of them are pretty big code-wise. With so many workarounds
cluttered throughout the code, it becomes harder for new programmers with an idea or interest in zlib to contribute.
||||
|
||||
I decided to make a fork, merge all the Intel optimizations and some of the Cloudflare optimizations, plus a couple of other
smaller patches. Then I started cleaning out workarounds, various dead code, and all the contrib and example code.<br>
The result is a better-performing and easier-to-maintain zlib-ng.
||||
|
||||
A lot of improvements have gone into zlib-ng since its start, and numerous people and companies have contributed both |
||||
small and big improvements, or valuable testing. |
||||
|
||||
|
||||
Build |
||||
----- |
||||
<sup>Please read LICENSE.md, it is very simple and very liberal.</sup> |
||||
|
||||
There are two ways to build zlib-ng: |
||||
|
||||
### CMake

To build zlib-ng using the cross-platform build-system generator CMake:
||||
|
||||
``` |
||||
cmake . |
||||
cmake --build . --config Release |
||||
ctest --verbose -C Release |
||||
``` |
||||
|
||||
Alternatively, you can use the cmake configuration GUI tool ccmake: |
||||
|
||||
``` |
||||
ccmake . |
||||
``` |
||||
|
||||
### Configure |
||||
|
||||
To build zlib-ng using the bash configure script: |
||||
|
||||
``` |
||||
./configure |
||||
make |
||||
make test |
||||
``` |
||||
|
||||
Build Options |
||||
------------- |
||||
|
||||
| CMake | configure | Description | Default | |
||||
|:-------------------------|:-------------------------|:--------------------------------------------------------------------------------------|---------| |
||||
| ZLIB_COMPAT | --zlib-compat | Compile with zlib compatible API | OFF | |
||||
| ZLIB_ENABLE_TESTS | | Build test binaries | ON | |
||||
| WITH_GZFILEOP | --without-gzfileops | Compile with support for gzFile related functions | ON | |
||||
| WITH_OPTIM | --without-optimizations | Build with optimisations | ON | |
||||
| WITH_NEW_STRATEGIES | --without-new-strategies | Use new strategies | ON | |
||||
| WITH_NATIVE_INSTRUCTIONS | | Compiles with full instruction set supported on this host (gcc/clang -march=native) | OFF | |
||||
| WITH_SANITIZER | | Build with sanitizer (memory, address, undefined) | OFF | |
||||
| WITH_GTEST | | Build gtest_zlib | ON | |
||||
| WITH_FUZZERS | | Build test/fuzz | OFF | |
||||
| WITH_BENCHMARKS | | Build test/benchmarks | OFF | |
||||
| WITH_MAINTAINER_WARNINGS | | Build with project maintainer warnings | OFF | |
||||
| WITH_CODE_COVERAGE | | Enable code coverage reporting | OFF | |
||||
|
||||
|
||||
Install |
||||
------- |
||||
|
||||
WARNING: We do not recommend manually installing unless you really know what you are doing, because this can |
||||
potentially override the system default zlib library, and any incompatibility or wrong configuration of zlib-ng |
||||
can make the whole system unusable, requiring recovery or reinstall. |
||||
If you still want a manual install, we recommend using the /opt/ path prefix. |
||||
|
||||
For Linux distros, an alternative way to use zlib-ng (if compiled in zlib-compat mode) instead of zlib is through
||||
the use of the _LD_PRELOAD_ environment variable. If the program is dynamically linked with zlib, then the program |
||||
will temporarily attempt to use zlib-ng instead, without risking system-wide instability. |
||||
|
||||
``` |
||||
LD_PRELOAD=/opt/zlib-ng/libz.so.1.2.13.zlib-ng /usr/bin/program |
||||
``` |
||||
|
||||
### CMake
||||
|
||||
To install zlib-ng system-wide using cmake: |
||||
|
||||
```sh
||||
cmake --build . --target install |
||||
``` |
||||
|
||||
### Configure |
||||
|
||||
To install zlib-ng system-wide using the configure script: |
||||
|
||||
```sh |
||||
make install |
||||
``` |
||||
|
||||
### CPack |
||||
|
||||
After building with cmake, an installation package can be created using cpack. By default a tgz package is created, |
||||
but you can append `-G <format>` to each command to generate alternative package types (TGZ, ZIP, RPM, DEB). To easily
create an RPM or DEB package, you would use `-G RPM` or `-G DEB`, respectively.
||||
|
||||
```sh
||||
cd build |
||||
cpack --config CPackConfig.cmake |
||||
cpack --config CPackSourceConfig.cmake |
||||
``` |
||||
|
||||
### Vcpkg |
||||
|
||||
Alternatively, you can build and install zlib-ng using the [vcpkg](https://github.com/Microsoft/vcpkg/) dependency manager: |
||||
|
||||
```sh
||||
git clone https://github.com/Microsoft/vcpkg.git |
||||
cd vcpkg |
||||
./bootstrap-vcpkg.sh # "./bootstrap-vcpkg.bat" for powershell |
||||
./vcpkg integrate install |
||||
./vcpkg install zlib-ng |
||||
``` |
||||
|
||||
The zlib-ng port in vcpkg is kept up to date by Microsoft team members and community contributors. |
||||
If the version is out of date, please [create an issue or pull request](https://github.com/Microsoft/vcpkg) on the vcpkg repository. |
||||
|
||||
Contributing |
||||
------------ |
||||
|
||||
Zlib-ng aims to be open to contributions, and we would be delighted to receive pull requests on GitHub.
Help with testing, reviewing pull requests, and similar tasks is also very much appreciated.
||||
|
||||
Please check the Wiki for more info: [Contributing](https://github.com/zlib-ng/zlib-ng/wiki/Contributing) |
||||
|
||||
Acknowledgments |
||||
---------------- |
||||
|
||||
Thanks go out to all the people and companies who have taken the time to contribute |
||||
code reviews, testing and/or patches. Zlib-ng would not have been nearly as good without you. |
||||
|
||||
The deflate format used by zlib was defined by Phil Katz.<br> |
||||
The deflate and zlib specifications were written by L. Peter Deutsch. |
||||
|
||||
zlib was originally created by Jean-loup Gailly (compression) and Mark Adler (decompression). |
||||
|
||||
|
||||
Advanced Build Options |
||||
---------------------- |
||||
|
||||
| CMake | configure | Description | Default | |
||||
|:--------------------------------|:----------------------|:--------------------------------------------------------------------|------------------------| |
||||
| FORCE_SSE2 | --force-sse2 | Skip runtime check for SSE2 instructions (Always on for x86_64) | OFF (x86) | |
||||
| WITH_AVX2 | | Build with AVX2 intrinsics | ON | |
||||
| WITH_AVX512 | | Build with AVX512 intrinsics | ON | |
||||
| WITH_AVX512VNNI | | Build with AVX512VNNI intrinsics | ON | |
||||
| WITH_SSE2 | | Build with SSE2 intrinsics | ON | |
||||
| WITH_SSSE3 | | Build with SSSE3 intrinsics | ON | |
||||
| WITH_SSE42 | | Build with SSE42 intrinsics | ON | |
||||
| WITH_PCLMULQDQ | | Build with PCLMULQDQ intrinsics | ON | |
||||
| WITH_VPCLMULQDQ | --without-vpclmulqdq | Build with VPCLMULQDQ intrinsics | ON | |
||||
| WITH_ACLE | --without-acle | Build with ACLE intrinsics | ON | |
||||
| WITH_NEON | --without-neon | Build with NEON intrinsics | ON | |
||||
| WITH_ARMV6 | --without-armv6 | Build with ARMv6 intrinsics | ON | |
||||
| WITH_ALTIVEC | --without-altivec | Build with AltiVec (VMX) intrinsics | ON | |
||||
| WITH_POWER8 | --without-power8 | Build with POWER8 optimisations | ON | |
||||
| WITH_RVV | | Build with RVV intrinsics | ON | |
||||
| WITH_CRC32_VX | --without-crc32-vx | Build with vectorized CRC32 on IBM Z | ON | |
||||
| WITH_DFLTCC_DEFLATE | --with-dfltcc-deflate | Build with DFLTCC intrinsics for compression on IBM Z | OFF | |
||||
| WITH_DFLTCC_INFLATE | --with-dfltcc-inflate | Build with DFLTCC intrinsics for decompression on IBM Z | OFF | |
||||
| WITH_UNALIGNED | --without-unaligned | Allow optimizations that use unaligned reads if safe on current arch | ON |
||||
| WITH_INFLATE_STRICT | | Build with strict inflate distance checking | OFF | |
||||
| WITH_INFLATE_ALLOW_INVALID_DIST | | Build with zero fill for inflate invalid distances | OFF | |
||||
| INSTALL_UTILS | | Copy minigzip and minideflate during install | OFF | |
||||
| ZLIBNG_ENABLE_TESTS | | Test zlib-ng specific API | ON | |
||||
|
||||
|
||||
Related Projects |
||||
---------------- |
||||
|
||||
* Fork of the popular minizip https://github.com/zlib-ng/minizip-ng |
||||
* Python tool to benchmark minigzip/minideflate https://github.com/zlib-ng/deflatebench |
||||
* Python tool to benchmark pigz https://github.com/zlib-ng/pigzbench |
||||
* 3rd party patches for zlib-ng compatibility https://github.com/zlib-ng/patches |
@@ -0,0 +1,115 @@
||||
/* adler32.c -- compute the Adler-32 checksum of a data stream
|
||||
* Copyright (C) 1995-2011, 2016 Mark Adler |
||||
* For conditions of distribution and use, see copyright notice in zlib.h |
||||
*/ |
||||
|
||||
#include "zbuild.h" |
||||
#include "functable.h" |
||||
#include "adler32_p.h" |
||||
|
||||
/* ========================================================================= */ |
||||
Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) { |
||||
uint32_t sum2; |
||||
unsigned n; |
||||
|
||||
/* split Adler-32 into component sums */ |
||||
sum2 = (adler >> 16) & 0xffff; |
||||
adler &= 0xffff; |
||||
|
||||
/* in case user likes doing a byte at a time, keep it fast */ |
||||
if (UNLIKELY(len == 1)) |
||||
return adler32_len_1(adler, buf, sum2); |
||||
|
||||
/* initial Adler-32 value (deferred check for len == 1 speed) */ |
||||
if (UNLIKELY(buf == NULL)) |
||||
return 1L; |
||||
|
||||
/* in case short lengths are provided, keep it somewhat fast */ |
||||
if (UNLIKELY(len < 16)) |
||||
return adler32_len_16(adler, buf, len, sum2); |
||||
|
||||
/* do length NMAX blocks -- requires just one modulo operation */ |
||||
while (len >= NMAX) { |
||||
len -= NMAX; |
||||
#ifdef UNROLL_MORE |
||||
n = NMAX / 16; /* NMAX is divisible by 16 */ |
||||
#else |
||||
n = NMAX / 8; /* NMAX is divisible by 8 */ |
||||
#endif |
||||
do { |
||||
#ifdef UNROLL_MORE |
||||
DO16(adler, sum2, buf); /* 16 sums unrolled */ |
||||
buf += 16; |
||||
#else |
||||
DO8(adler, sum2, buf, 0); /* 8 sums unrolled */ |
||||
buf += 8; |
||||
#endif |
||||
} while (--n); |
||||
adler %= BASE; |
||||
sum2 %= BASE; |
||||
} |
||||
|
||||
/* do remaining bytes (less than NMAX, still just one modulo) */ |
||||
return adler32_len_64(adler, buf, len, sum2); |
||||
} |
||||
|
||||
#ifdef ZLIB_COMPAT |
||||
unsigned long Z_EXPORT PREFIX(adler32_z)(unsigned long adler, const unsigned char *buf, size_t len) { |
||||
return (unsigned long)functable.adler32((uint32_t)adler, buf, len); |
||||
} |
||||
#else |
||||
uint32_t Z_EXPORT PREFIX(adler32_z)(uint32_t adler, const unsigned char *buf, size_t len) { |
||||
return functable.adler32(adler, buf, len); |
||||
} |
||||
#endif |
||||
|
||||
/* ========================================================================= */ |
||||
#ifdef ZLIB_COMPAT |
||||
unsigned long Z_EXPORT PREFIX(adler32)(unsigned long adler, const unsigned char *buf, unsigned int len) { |
||||
return (unsigned long)functable.adler32((uint32_t)adler, buf, len); |
||||
} |
||||
#else |
||||
uint32_t Z_EXPORT PREFIX(adler32)(uint32_t adler, const unsigned char *buf, uint32_t len) { |
||||
return functable.adler32(adler, buf, len); |
||||
} |
||||
#endif |
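Both compat and native wrappers route through `functable.adler32`, zlib-ng's runtime dispatch table. A minimal sketch of the pattern, with hypothetical names (`adler32_simd`, `cpu_has_simd`); zlib-ng's real table lives in `functable.c` and covers many more entry points:

```c
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Hypothetical stand-ins for a generic and a SIMD implementation. */
static uint32_t adler32_generic(uint32_t a, const uint8_t *b, size_t n) {
    (void)b; (void)n; puts("generic"); return a;
}
static uint32_t adler32_simd(uint32_t a, const uint8_t *b, size_t n) {
    (void)b; (void)n; puts("simd"); return a;
}
static int cpu_has_simd(void) { return 1; }   /* pretend CPUID/auxv probe */

struct functable_s { uint32_t (*adler32)(uint32_t, const uint8_t *, size_t); };

static uint32_t adler32_stub(uint32_t a, const uint8_t *b, size_t n);
static struct functable_s functable = { adler32_stub };

/* First call resolves the slot for the running CPU, then forwards;
 * every later call goes straight to the chosen implementation. */
static uint32_t adler32_stub(uint32_t a, const uint8_t *b, size_t n) {
    functable.adler32 = cpu_has_simd() ? adler32_simd : adler32_generic;
    return functable.adler32(a, b, n);
}

int main(void) {
    uint8_t buf[4] = {0};
    functable.adler32(1, buf, sizeof buf);  /* resolves, prints "simd" */
    functable.adler32(1, buf, sizeof buf);  /* direct call, prints "simd" */
    return 0;
}
```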
||||
|
||||
/* ========================================================================= */ |
||||
static uint32_t adler32_combine_(uint32_t adler1, uint32_t adler2, z_off64_t len2) { |
||||
uint32_t sum1; |
||||
uint32_t sum2; |
||||
unsigned rem; |
||||
|
||||
/* for negative len, return invalid adler32 as a clue for debugging */ |
||||
if (len2 < 0) |
||||
return 0xffffffff; |
||||
|
||||
/* the derivation of this formula is left as an exercise for the reader */ |
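/* Sketch of the derivation, writing s1/s2 for the two 16-bit halves and
 * A||B for the concatenation of the inputs behind adler1 and adler2:
 *   s1(A||B) = s1(A) + s1(B) - 1                  (s1 starts at 1)
 *   s2(A||B) = s2(A) + s2(B) + len2 * (s1(A) - 1) (each of B's len2
 *              running sums is shifted by s1(A) - 1)
 * all taken mod BASE; the code below adds multiples of BASE so that no
 * intermediate value can go negative. */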
||||
len2 %= BASE; /* assumes len2 >= 0 */ |
||||
rem = (unsigned)len2; |
||||
sum1 = adler1 & 0xffff; |
||||
sum2 = rem * sum1; |
||||
sum2 %= BASE; |
||||
sum1 += (adler2 & 0xffff) + BASE - 1; |
||||
sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem; |
||||
if (sum1 >= BASE) sum1 -= BASE; |
||||
if (sum1 >= BASE) sum1 -= BASE; |
||||
if (sum2 >= ((unsigned long)BASE << 1)) sum2 -= ((unsigned long)BASE << 1); |
||||
if (sum2 >= BASE) sum2 -= BASE; |
||||
return sum1 | (sum2 << 16); |
||||
} |
||||
|
||||
/* ========================================================================= */ |
||||
#ifdef ZLIB_COMPAT |
||||
unsigned long Z_EXPORT PREFIX(adler32_combine)(unsigned long adler1, unsigned long adler2, z_off_t len2) { |
||||
return (unsigned long)adler32_combine_((uint32_t)adler1, (uint32_t)adler2, len2); |
||||
} |
||||
|
||||
unsigned long Z_EXPORT PREFIX4(adler32_combine)(unsigned long adler1, unsigned long adler2, z_off64_t len2) { |
||||
return (unsigned long)adler32_combine_((uint32_t)adler1, (uint32_t)adler2, len2); |
||||
} |
||||
#else |
||||
uint32_t Z_EXPORT PREFIX4(adler32_combine)(uint32_t adler1, uint32_t adler2, z_off64_t len2) { |
||||
return adler32_combine_(adler1, adler2, len2); |
||||
} |
||||
#endif |
@@ -0,0 +1,16 @@
||||
/* adler32_fold.c -- adler32 folding interface
|
||||
* Copyright (C) 2022 Adam Stylinski |
||||
* For conditions of distribution and use, see copyright notice in zlib.h |
||||
*/ |
||||
|
||||
#include "zbuild.h" |
||||
#include "functable.h" |
||||
#include "adler32_fold.h" |
||||
|
||||
#include <limits.h> |
||||
|
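/* Fused checksum + copy: callers that must both checksum and move data
 * (e.g. inflate filling a user buffer) can do it in a single pass. */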
||||
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { |
||||
adler = functable.adler32(adler, src, len); |
||||
memcpy(dst, src, len); |
||||
return adler; |
||||
} |
@@ -0,0 +1,11 @@
||||
/* adler32_fold.h -- adler32 folding interface
|
||||
* Copyright (C) 2022 Adam Stylinski |
||||
* For conditions of distribution and use, see copyright notice in zlib.h |
||||
*/ |
||||
|
||||
#ifndef ADLER32_FOLD_H_ |
||||
#define ADLER32_FOLD_H_ |
||||
|
||||
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); |
||||
|
||||
#endif |
@@ -0,0 +1,70 @@
||||
/* adler32_p.h -- Private inline functions and macros shared with
|
||||
* different computation of the Adler-32 checksum |
||||
* of a data stream. |
||||
* Copyright (C) 1995-2011, 2016 Mark Adler |
||||
* For conditions of distribution and use, see copyright notice in zlib.h |
||||
*/ |
||||
|
||||
#ifndef ADLER32_P_H |
||||
#define ADLER32_P_H |
||||
|
||||
#define BASE 65521U /* largest prime smaller than 65536 */ |
||||
#define NMAX 5552 |
||||
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */ |
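/* Check for n = 5552: 255*5552*5553/2 + 5553*(BASE-1)
 *   = 3930857640 + 363832560 = 4294690200 <= 4294967295 = 2^32-1,
 * while n = 5553 yields 4296171735 and would overflow. */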
||||
|
||||
#define DO1(sum1, sum2, buf, i) {(sum1) += buf[(i)]; (sum2) += (sum1);} |
||||
#define DO2(sum1, sum2, buf, i) {DO1(sum1, sum2, buf, i); DO1(sum1, sum2, buf, i+1);} |
||||
#define DO4(sum1, sum2, buf, i) {DO2(sum1, sum2, buf, i); DO2(sum1, sum2, buf, i+2);} |
||||
#define DO8(sum1, sum2, buf, i) {DO4(sum1, sum2, buf, i); DO4(sum1, sum2, buf, i+4);} |
||||
#define DO16(sum1, sum2, buf) {DO8(sum1, sum2, buf, 0); DO8(sum1, sum2, buf, 8);} |
||||
|
||||
static inline uint32_t adler32_len_1(uint32_t adler, const uint8_t *buf, uint32_t sum2) { |
||||
adler += buf[0]; |
||||
adler %= BASE; |
||||
sum2 += adler; |
||||
sum2 %= BASE; |
||||
return adler | (sum2 << 16); |
||||
} |
||||
|
||||
static inline uint32_t adler32_len_16(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) { |
||||
while (len) { |
||||
--len; |
||||
adler += *buf++; |
||||
sum2 += adler; |
||||
} |
||||
adler %= BASE; |
||||
sum2 %= BASE; /* only added so many BASE's */ |
||||
/* return recombined sums */ |
||||
return adler | (sum2 << 16); |
||||
} |
||||
|
||||
static inline uint32_t adler32_copy_len_16(uint32_t adler, const uint8_t *buf, uint8_t *dst, size_t len, uint32_t sum2) { |
||||
while (len--) { |
||||
*dst = *buf++; |
||||
adler += *dst++; |
||||
sum2 += adler; |
||||
} |
||||
adler %= BASE; |
||||
sum2 %= BASE; /* only added so many BASE's */ |
||||
/* return recombined sums */ |
||||
return adler | (sum2 << 16); |
||||
} |
||||
|
||||
static inline uint32_t adler32_len_64(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) { |
||||
#ifdef UNROLL_MORE |
||||
while (len >= 16) { |
||||
len -= 16; |
||||
DO16(adler, sum2, buf); |
||||
buf += 16; |
||||
#else |
||||
while (len >= 8) { |
||||
len -= 8; |
||||
DO8(adler, sum2, buf, 0); |
||||
buf += 8; |
||||
#endif |
||||
} |
||||
/* Process tail (len < 16). */ |
||||
return adler32_len_16(adler, buf, len, sum2); |
||||
} |
||||
|
||||
#endif /* ADLER32_P_H */ |
@@ -0,0 +1,2 @@
||||
# ignore Makefiles; they're all automatically generated |
||||
Makefile |
@@ -0,0 +1,85 @@
||||
# Makefile for zlib
|
||||
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
|
||||
# For conditions of distribution and use, see copyright notice in zlib.h
|
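# Note: *.o objects are compiled with CFLAGS for the static library, while
# *.lo objects are compiled with SFLAGS (typically PIC) for the shared library.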
||||
|
||||
CC=
|
||||
CFLAGS=
|
||||
SFLAGS=
|
||||
INCLUDES=
|
||||
SUFFIX=
|
||||
|
||||
ACLEFLAG=
|
||||
NEONFLAG=
|
||||
ARMV6FLAG=
|
||||
NOLTOFLAG=
|
||||
|
||||
SRCDIR=.
|
||||
SRCTOP=../..
|
||||
TOPDIR=$(SRCTOP)
|
||||
|
||||
all: \
|
||||
adler32_neon.o adler32_neon.lo \
|
||||
arm_features.o arm_features.lo \
|
||||
chunkset_neon.o chunkset_neon.lo \
|
||||
compare256_neon.o compare256_neon.lo \
|
||||
crc32_acle.o crc32_acle.lo \
|
||||
slide_hash_neon.o slide_hash_neon.lo \
|
||||
slide_hash_armv6.o slide_hash_armv6.lo \
|
||||
insert_string_acle.o insert_string_acle.lo
|
||||
|
||||
adler32_neon.o: |
||||
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
|
||||
|
||||
adler32_neon.lo: |
||||
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
|
||||
|
||||
arm_features.o: |
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
|
||||
|
||||
arm_features.lo: |
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
|
||||
|
||||
chunkset_neon.o: |
||||
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
|
||||
|
||||
chunkset_neon.lo: |
||||
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
|
||||
|
||||
compare256_neon.o: |
||||
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
|
||||
|
||||
compare256_neon.lo: |
||||
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
|
||||
|
||||
crc32_acle.o: |
||||
$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
|
||||
|
||||
crc32_acle.lo: |
||||
$(CC) $(SFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
|
||||
|
||||
slide_hash_neon.o: |
||||
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
|
||||
|
||||
slide_hash_neon.lo: |
||||
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
|
||||
|
||||
slide_hash_armv6.o: |
||||
$(CC) $(CFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
|
||||
|
||||
slide_hash_armv6.lo: |
||||
$(CC) $(SFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
|
||||
|
||||
insert_string_acle.o: |
||||
$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
|
||||
|
||||
insert_string_acle.lo: |
||||
$(CC) $(SFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
|
||||
|
||||
mostlyclean: clean |
||||
clean: |
||||
rm -f *.o *.lo *~
|
||||
rm -rf objs
|
||||
rm -f *.gcda *.gcno *.gcov
|
||||
|
||||
distclean: clean |
||||
rm -f Makefile
|
@@ -0,0 +1,35 @@
||||
#ifndef ARM_ACLE_INTRINS_H |
||||
#define ARM_ACLE_INTRINS_H |
||||
|
||||
#include <stdint.h> |
||||
#ifdef _MSC_VER |
||||
# include <intrin.h> |
||||
#elif defined(HAVE_ARM_ACLE_H) |
||||
# include <arm_acle.h> |
||||
#endif |
||||
|
||||
#ifdef ARM_ACLE |
||||
#if defined(__aarch64__) |
||||
# define Z_TARGET_CRC Z_TARGET("+crc") |
||||
#else |
||||
# define Z_TARGET_CRC |
||||
#endif |
||||
#endif |
||||
|
||||
#ifdef ARM_SIMD |
||||
#ifdef _MSC_VER |
||||
typedef uint32_t uint16x2_t; |
||||
|
||||
#define __uqsub16 _arm_uqsub16 |
||||
#elif !defined(ARM_SIMD_INTRIN) |
||||
typedef uint32_t uint16x2_t; |
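/* uint16x2_t packs two uint16 lanes into one 32-bit register; uqsub16 does a
 * lane-wise unsigned saturating subtract (clamping at 0), which is what the
 * ARMv6 slide-hash kernel uses to rebase hash-chain entries. */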
||||
|
||||
static inline uint16x2_t __uqsub16(uint16x2_t __a, uint16x2_t __b) { |
||||
uint16x2_t __c; |
||||
__asm__ __volatile__("uqsub16 %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b)); |
||||
return __c; |
||||
} |
||||
#endif |
||||
#endif |
||||
|
||||
#endif // include guard ARM_ACLE_INTRINS_H
|
@@ -0,0 +1,215 @@
||||
/* Copyright (C) 1995-2011, 2016 Mark Adler
|
||||
* Copyright (C) 2017 ARM Holdings Inc. |
||||
* Authors: |
||||
* Adenilson Cavalcanti <adenilson.cavalcanti@arm.com> |
||||
* Adam Stylinski <kungfujesus06@gmail.com> |
||||
* For conditions of distribution and use, see copyright notice in zlib.h |
||||
*/ |
||||
#ifdef ARM_NEON |
||||
#include "neon_intrins.h" |
||||
#include "../../zbuild.h" |
||||
#include "../../adler32_p.h" |
||||
|
||||
static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) { |
||||
static const uint16_t ALIGNED_(16) taps[64] = { |
||||
64, 63, 62, 61, 60, 59, 58, 57, |
||||
56, 55, 54, 53, 52, 51, 50, 49, |
||||
48, 47, 46, 45, 44, 43, 42, 41, |
||||
40, 39, 38, 37, 36, 35, 34, 33, |
||||
32, 31, 30, 29, 28, 27, 26, 25, |
||||
24, 23, 22, 21, 20, 19, 18, 17, |
||||
16, 15, 14, 13, 12, 11, 10, 9, |
||||
8, 7, 6, 5, 4, 3, 2, 1 }; |
||||
|
||||
uint32x4_t adacc = vdupq_n_u32(0); |
||||
uint32x4_t s2acc = vdupq_n_u32(0); |
||||
uint32x4_t s2acc_0 = vdupq_n_u32(0); |
||||
uint32x4_t s2acc_1 = vdupq_n_u32(0); |
||||
uint32x4_t s2acc_2 = vdupq_n_u32(0); |
||||
|
||||
adacc = vsetq_lane_u32(s[0], adacc, 0); |
||||
s2acc = vsetq_lane_u32(s[1], s2acc, 0); |
||||
|
||||
uint32x4_t s3acc = vdupq_n_u32(0); |
||||
uint32x4_t adacc_prev = adacc; |
||||
|
||||
uint16x8_t s2_0, s2_1, s2_2, s2_3; |
||||
s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0); |
||||
|
||||
uint16x8_t s2_4, s2_5, s2_6, s2_7; |
||||
s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0); |
||||
|
||||
size_t num_iter = len >> 2; |
||||
int rem = len & 3; |
||||
|
||||
for (size_t i = 0; i < num_iter; ++i) { |
||||
uint8x16x4_t d0_d3 = vld1q_u8_x4(buf); |
||||
|
||||
/* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
|
||||
* bit instruction, so we'll have to make do with summing to 16 bits first */
||||
uint16x8x2_t hsum, hsum_fold; |
||||
hsum.val[0] = vpaddlq_u8(d0_d3.val[0]); |
||||
hsum.val[1] = vpaddlq_u8(d0_d3.val[1]); |
||||
|
||||
hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d0_d3.val[2]); |
||||
hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d0_d3.val[3]); |
||||
|
||||
adacc = vpadalq_u16(adacc, hsum_fold.val[0]); |
||||
s3acc = vaddq_u32(s3acc, adacc_prev); |
||||
adacc = vpadalq_u16(adacc, hsum_fold.val[1]); |
||||
|
||||
/* If we do straight widening additions to the 16 bit values, we don't incur
|
||||
* the usual penalties of a pairwise add. We can defer the multiplications |
||||
* until the very end. These will not overflow because we are incurring at |
||||
* most 408 loop iterations (NMAX / 64), and a given lane is only going to be |
||||
* summed into once. This means for the maximum input size, the largest value |
||||
* we will see is 255 * 102 = 26010, safely under uint16 max */ |
||||
s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0])); |
||||
s2_1 = vaddw_high_u8(s2_1, d0_d3.val[0]); |
||||
s2_2 = vaddw_u8(s2_2, vget_low_u8(d0_d3.val[1])); |
||||
s2_3 = vaddw_high_u8(s2_3, d0_d3.val[1]); |
||||
s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2])); |
||||
s2_5 = vaddw_high_u8(s2_5, d0_d3.val[2]); |
||||
s2_6 = vaddw_u8(s2_6, vget_low_u8(d0_d3.val[3])); |
||||
s2_7 = vaddw_high_u8(s2_7, d0_d3.val[3]); |
||||
|
||||
adacc_prev = adacc; |
||||
buf += 64; |
||||
} |
||||
|
||||
s3acc = vshlq_n_u32(s3acc, 6); |
||||
|
||||
if (rem) { |
||||
uint32x4_t s3acc_0 = vdupq_n_u32(0); |
||||
while (rem--) { |
||||
uint8x16_t d0 = vld1q_u8(buf); |
||||
uint16x8_t adler; |
||||
adler = vpaddlq_u8(d0); |
||||
s2_6 = vaddw_u8(s2_6, vget_low_u8(d0)); |
||||
s2_7 = vaddw_high_u8(s2_7, d0); |
||||
adacc = vpadalq_u16(adacc, adler); |
||||
s3acc_0 = vaddq_u32(s3acc_0, adacc_prev); |
||||
adacc_prev = adacc; |
||||
buf += 16; |
||||
} |
||||
|
||||
s3acc_0 = vshlq_n_u32(s3acc_0, 4); |
||||
s3acc = vaddq_u32(s3acc_0, s3acc); |
||||
} |
||||
|
||||
uint16x8x4_t t0_t3 = vld1q_u16_x4(taps); |
||||
uint16x8x4_t t4_t7 = vld1q_u16_x4(taps + 32); |
||||
|
||||
s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0); |
||||
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0)); |
||||
s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1); |
||||
s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1)); |
||||
|
||||
s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2); |
||||
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2)); |
||||
s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3); |
||||
s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3)); |
||||
|
||||
s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4); |
||||
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4)); |
||||
s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5); |
||||
s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5)); |
||||
|
||||
s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6); |
||||
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6)); |
||||
s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7); |
||||
s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7)); |
||||
|
||||
s2acc = vaddq_u32(s2acc_0, s2acc); |
||||
s2acc_2 = vaddq_u32(s2acc_1, s2acc_2); |
||||
s2acc = vaddq_u32(s2acc, s2acc_2); |
||||
|
||||
uint32x2_t adacc2, s2acc2, as; |
||||
s2acc = vaddq_u32(s2acc, s3acc); |
||||
adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc)); |
||||
s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc)); |
||||
as = vpadd_u32(adacc2, s2acc2); |
||||
s[0] = vget_lane_u32(as, 0); |
||||
s[1] = vget_lane_u32(as, 1); |
||||
} |
||||
|
||||
static void NEON_handle_tail(uint32_t *pair, const uint8_t *buf, size_t len) { |
||||
unsigned int i; |
||||
for (i = 0; i < len; ++i) { |
||||
pair[0] += buf[i]; |
||||
pair[1] += pair[0]; |
||||
} |
||||
} |
||||
|
||||
Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len) { |
||||
/* split Adler-32 into component sums */ |
||||
uint32_t sum2 = (adler >> 16) & 0xffff; |
||||
adler &= 0xffff; |
||||
|
||||
/* in case user likes doing a byte at a time, keep it fast */ |
||||
if (len == 1) |
||||
return adler32_len_1(adler, buf, sum2); |
||||
|
||||
/* initial Adler-32 value (deferred check for len == 1 speed) */ |
||||
if (buf == NULL) |
||||
return 1L; |
||||
|
||||
/* in case short lengths are provided, keep it somewhat fast */ |
||||
if (len < 16) |
||||
return adler32_len_16(adler, buf, len, sum2); |
||||
|
||||
uint32_t pair[2]; |
||||
int n = NMAX; |
||||
unsigned int done = 0; |
||||
|
||||
/* Split Adler-32 into component sums, it can be supplied by
|
||||
* the caller sites (e.g. in a PNG file). |
||||
*/ |
||||
pair[0] = adler; |
||||
pair[1] = sum2; |
||||
|
||||
/* If memory is not SIMD aligned, do scalar sums to an aligned
|
||||
* offset, provided that doing so doesn't completely eliminate |
||||
* SIMD operation. Aligned loads are still faster on ARM, even |
||||
* though there's no explicit aligned load instruction */ |
||||
unsigned int align_offset = ((uintptr_t)buf & 15); |
||||
unsigned int align_adj = (align_offset) ? 16 - align_offset : 0; |
||||
|
||||
if (align_offset && len >= (16 + align_adj)) { |
||||
NEON_handle_tail(pair, buf, align_adj); |
||||
n -= align_adj; |
||||
done += align_adj; |
||||
|
||||
} else { |
||||
/* If here, we failed the len criteria test, it wouldn't be
|
||||
* worthwhile to do scalar aligning sums */ |
||||
align_adj = 0; |
||||
} |
||||
|
||||
while (done < len) { |
||||
int remaining = (int)(len - done); |
||||
n = MIN(remaining, (done == align_adj) ? n : NMAX); |
||||
|
||||
if (n < 16) |
||||
break; |
||||
|
||||
NEON_accum32(pair, buf + done, n >> 4); |
||||
pair[0] %= BASE; |
||||
pair[1] %= BASE; |
||||
|
||||
int actual_nsums = (n >> 4) << 4; |
||||
done += actual_nsums; |
||||
} |
||||
|
||||
/* Handle the tail elements. */ |
||||
if (done < len) { |
||||
NEON_handle_tail(pair, (buf + done), len - done); |
||||
pair[0] %= BASE; |
||||
pair[1] %= BASE; |
||||
} |
||||
|
||||
/* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */ |
||||
return (pair[1] << 16) | pair[0]; |
||||
} |
||||
|
||||
#endif |
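For reference, the per-block recurrence NEON_accum32 vectorizes, as a scalar sketch (illustrative only; scalar_accum64 is a hypothetical name, and the 64*s1 term is what s3acc carries via the shift by 6):

#include <stdint.h>

/* One 64-byte block: s2 grows by 64 times the old s1 plus the tap-weighted
 * byte sum (taps[i] == 64 - i), then s1 grows by the plain byte sum. */
static void scalar_accum64(uint32_t *s, const uint8_t *buf) {
    uint32_t s1 = s[0], s2 = s[1];
    uint32_t bytes = 0, weighted = 0;
    for (int i = 0; i < 64; ++i) {
        bytes    += buf[i];
        weighted += (uint32_t)(64 - i) * buf[i];
    }
    s2 += 64 * s1 + weighted;
    s1 += bytes;
    s[0] = s1;
    s[1] = s2;
}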
@ -0,0 +1,100 @@
#include "../../zbuild.h"
#include "arm_features.h"

#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
#  include <sys/auxv.h>
#  ifdef ARM_ASM_HWCAP
#    include <asm/hwcap.h>
#  endif
#elif defined(__FreeBSD__) && defined(__aarch64__)
#  include <machine/armreg.h>
#  ifndef ID_AA64ISAR0_CRC32_VAL
#    define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32
#  endif
#elif defined(__APPLE__)
#  if !defined(_DARWIN_C_SOURCE)
#    define _DARWIN_C_SOURCE /* enable types aliases (eg u_int) */
#  endif
#  include <sys/sysctl.h>
#elif defined(_WIN32)
#  include <windows.h>
#endif

static int arm_has_crc32() {
#if defined(__linux__) && defined(ARM_AUXV_HAS_CRC32)
#  ifdef HWCAP_CRC32
    return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0 ? 1 : 0;
#  else
    return (getauxval(AT_HWCAP2) & HWCAP2_CRC32) != 0 ? 1 : 0;
#  endif
#elif defined(__FreeBSD__) && defined(__aarch64__)
    return getenv("QEMU_EMULATING") == NULL
      && ID_AA64ISAR0_CRC32_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_CRC32_BASE;
#elif defined(__APPLE__)
    int hascrc32;
    size_t size = sizeof(hascrc32);
    return sysctlbyname("hw.optional.armv8_crc32", &hascrc32, &size, NULL, 0) == 0
      && hascrc32 == 1;
#elif defined(_WIN32)
    return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);
#elif defined(ARM_NOCHECK_ACLE)
    return 1;
#else
    return 0;
#endif
}

/* AArch64 has neon. */
#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
static inline int arm_has_neon() {
#if defined(__linux__) && defined(ARM_AUXV_HAS_NEON)
#  ifdef HWCAP_ARM_NEON
    return (getauxval(AT_HWCAP) & HWCAP_ARM_NEON) != 0 ? 1 : 0;
#  else
    return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0 ? 1 : 0;
#  endif
#elif defined(__APPLE__)
    int hasneon;
    size_t size = sizeof(hasneon);
    return sysctlbyname("hw.optional.neon", &hasneon, &size, NULL, 0) == 0
      && hasneon == 1;
#elif defined(_M_ARM) && defined(WINAPI_FAMILY_PARTITION)
#  if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
    return 1; /* Always supported */
#  endif
#endif

#if defined(ARM_NOCHECK_NEON)
    return 1;
#else
    return 0;
#endif
}
#endif

/* AArch64 does not have ARMv6 SIMD. */
#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
static inline int arm_has_simd() {
#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
    const char *platform = (const char *)getauxval(AT_PLATFORM);
    return strncmp(platform, "v6l", 3) == 0
        || strncmp(platform, "v7l", 3) == 0
        || strncmp(platform, "v8l", 3) == 0;
#elif defined(ARM_NOCHECK_SIMD)
    return 1;
#else
    return 0;
#endif
}
#endif

void Z_INTERNAL arm_check_features(struct arm_cpu_features *features) {
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
    features->has_simd = 0; /* never available */
    features->has_neon = 1; /* always available */
#else
    features->has_simd = arm_has_simd();
    features->has_neon = arm_has_neon();
#endif
    features->has_crc32 = arm_has_crc32();
}
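A minimal sketch of consuming this detection API (the main() wrapper is illustrative, not part of the patch; zlib-ng's real callers cache the result in a dispatch table at init time):

#include <stdio.h>
#include "zbuild.h"        /* for Z_INTERNAL, as in the sources above */
#include "arm_features.h"

int main(void) {
    struct arm_cpu_features f;
    arm_check_features(&f);
    printf("simd=%d neon=%d crc32=%d\n", f.has_simd, f.has_neon, f.has_crc32);
    return 0;
}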
@ -0,0 +1,16 @@
/* arm_features.h -- check for ARM features.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef ARM_H_
#define ARM_H_

struct arm_cpu_features {
    int has_simd;
    int has_neon;
    int has_crc32;
};

void Z_INTERNAL arm_check_features(struct arm_cpu_features *features);

#endif /* ARM_H_ */
@ -0,0 +1,99 @@
/* chunkset_neon.c -- NEON inline functions to copy small data chunks.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef ARM_NEON
#include "neon_intrins.h"
#include "../../zbuild.h"
#include "../generic/chunk_permute_table.h"

typedef uint8x16_t chunk_t;

#define CHUNK_SIZE 16

#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNK_MAG

static const lut_rem_pair perm_idx_lut[13] = {
    {0, 1},       /* 3 */
    {0, 0},       /* don't care */
    {1 * 32, 1},  /* 5 */
    {2 * 32, 4},  /* 6 */
    {3 * 32, 2},  /* 7 */
    {0 * 32, 0},  /* don't care */
    {4 * 32, 7},  /* 9 */
    {5 * 32, 6},  /* 10 */
    {6 * 32, 5},  /* 11 */
    {7 * 32, 4},  /* 12 */
    {8 * 32, 3},  /* 13 */
    {9 * 32, 2},  /* 14 */
    {10 * 32, 1}, /* 15 */
};

static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    uint16_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = vreinterpretq_u8_u16(vdupq_n_u16(tmp));
}

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    uint32_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = vreinterpretq_u8_u32(vdupq_n_u32(tmp));
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    uint64_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = vreinterpretq_u8_u64(vdupq_n_u64(tmp));
}

#define CHUNKSIZE        chunksize_neon
#define CHUNKCOPY        chunkcopy_neon
#define CHUNKUNROLL      chunkunroll_neon
#define CHUNKMEMSET      chunkmemset_neon
#define CHUNKMEMSET_SAFE chunkmemset_safe_neon

static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    *chunk = vld1q_u8(s);
}

static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    vst1q_u8(out, *chunk);
}

static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
    *chunk_rem = lut_rem.remval;

    /* See note in chunkset_ssse3.c for why this is ok */
    __msan_unpoison(buf + dist, 16 - dist);

    /* This version of table is only available on aarch64 */
#if defined(_M_ARM64) || defined(_M_ARM64EC) || defined(__aarch64__)
    uint8x16_t ret_vec = vld1q_u8(buf);

    uint8x16_t perm_vec = vld1q_u8(permute_table + lut_rem.idx);
    return vqtbl1q_u8(ret_vec, perm_vec);
#else
    uint8x8_t ret0, ret1, a, b, perm_vec0, perm_vec1;
    perm_vec0 = vld1_u8(permute_table + lut_rem.idx);
    perm_vec1 = vld1_u8(permute_table + lut_rem.idx + 8);
    a = vld1_u8(buf);
    b = vld1_u8(buf + 8);
    ret0 = vtbl1_u8(a, perm_vec0);
    uint8x8x2_t ab = {{a, b}};
    ret1 = vtbl2_u8(ab, perm_vec1);
    return vcombine_u8(ret0, ret1);
#endif
}

#include "chunkset_tpl.h"

#define INFLATE_FAST inflate_fast_neon

#include "inffast_tpl.h"

#endif
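A scalar model of the GET_CHUNK_MAG contract above (illustrative; chunk_mag_scalar is a hypothetical helper): the chunk becomes the dist-byte pattern repeated, and chunk_rem is 16 % dist, which is exactly the remval column of perm_idx_lut (e.g. dist 3 -> 1, dist 6 -> 4, dist 9 -> 7):

#include <stdint.h>

static uint32_t chunk_mag_scalar(uint8_t out[16], const uint8_t *buf, uint32_t dist) {
    for (uint32_t i = 0; i < 16; ++i)
        out[i] = buf[i % dist];  /* repeat the dist-byte pattern */
    return 16 % dist;            /* leftover pattern position == chunk_rem */
}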
@ -0,0 +1,59 @@
/* compare256_neon.c - NEON version of compare256
 * Copyright (C) 2022 Nathan Moinvaziri
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"

#include "fallback_builtins.h"

#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
#include "neon_intrins.h"

static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;

    do {
        uint8x16_t a, b, cmp;
        uint64_t lane;

        a = vld1q_u8(src0);
        b = vld1q_u8(src1);

        cmp = veorq_u8(a, b);

        lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 0);
        if (lane) {
            uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
            return len + match_byte;
        }
        len += 8;
        lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 1);
        if (lane) {
            uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
            return len + match_byte;
        }
        len += 8;

        src0 += 16, src1 += 16;
    } while (len < 256);

    return 256;
}

Z_INTERNAL uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1) {
    return compare256_neon_static(src0, src1);
}

#define LONGEST_MATCH       longest_match_neon
#define COMPARE256          compare256_neon_static

#include "match_tpl.h"

#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH       longest_match_slow_neon
#define COMPARE256          compare256_neon_static

#include "match_tpl.h"

#endif
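The contract being vectorized, as a scalar reference (a sketch with a hypothetical name, assuming the usual 256-byte window semantics):

#include <stdint.h>

/* Length of the common prefix of two 256-byte windows; compare256_neon
 * above computes the same result 16 bytes per iteration. */
static uint32_t compare256_scalar(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;
    while (len < 256 && src0[len] == src1[len])
        len++;
    return len;
}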
@ -0,0 +1,78 @@
/* crc32_acle.c -- compute the CRC-32 of a data stream
 * Copyright (C) 1995-2006, 2010, 2011, 2012 Mark Adler
 * Copyright (C) 2016 Yang Zhang
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 */

#ifdef ARM_ACLE
#include "acle_intrins.h"
#include "../../zbuild.h"

Z_INTERNAL Z_TARGET_CRC uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len) {
    Z_REGISTER uint32_t c;
    Z_REGISTER const uint16_t *buf2;
    Z_REGISTER const uint32_t *buf4;
    Z_REGISTER const uint64_t *buf8;

    c = ~crc;

    if (UNLIKELY(len == 1)) {
        c = __crc32b(c, *buf);
        c = ~c;
        return c;
    }

    if ((ptrdiff_t)buf & (sizeof(uint64_t) - 1)) {
        if (len && ((ptrdiff_t)buf & 1)) {
            c = __crc32b(c, *buf++);
            len--;
        }

        if ((len >= sizeof(uint16_t)) && ((ptrdiff_t)buf & sizeof(uint16_t))) {
            buf2 = (const uint16_t *) buf;
            c = __crc32h(c, *buf2++);
            len -= sizeof(uint16_t);
            buf4 = (const uint32_t *) buf2;
        } else {
            buf4 = (const uint32_t *) buf;
        }

        if ((len >= sizeof(uint32_t)) && ((ptrdiff_t)buf & sizeof(uint32_t))) {
            c = __crc32w(c, *buf4++);
            len -= sizeof(uint32_t);
        }

        buf8 = (const uint64_t *) buf4;
    } else {
        buf8 = (const uint64_t *) buf;
    }

    while (len >= sizeof(uint64_t)) {
        c = __crc32d(c, *buf8++);
        len -= sizeof(uint64_t);
    }

    if (len >= sizeof(uint32_t)) {
        buf4 = (const uint32_t *) buf8;
        c = __crc32w(c, *buf4++);
        len -= sizeof(uint32_t);
        buf2 = (const uint16_t *) buf4;
    } else {
        buf2 = (const uint16_t *) buf8;
    }

    if (len >= sizeof(uint16_t)) {
        c = __crc32h(c, *buf2++);
        len -= sizeof(uint16_t);
    }

    buf = (const unsigned char *) buf2;
    if (len) {
        c = __crc32b(c, *buf);
    }

    c = ~c;
    return c;
}
#endif
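Call it like zlib's crc32(): seed with 0 (or chain a previous result) and pass the byte count. A minimal usage sketch (checksum_hello is hypothetical; the declaration simply mirrors the definition above, minus the target attributes):

#include <stddef.h>
#include <stdint.h>

uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len);

static uint32_t checksum_hello(void) {
    static const uint8_t msg[] = "hello";
    return crc32_acle(0, msg, sizeof(msg) - 1);
}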
@ -0,0 +1,24 @@
/* insert_string_acle.c -- insert_string integer hash variant using ACLE's CRC instructions
 *
 * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 */

#ifdef ARM_ACLE
#include "acle_intrins.h"
#include "../../zbuild.h"
#include "../../deflate.h"

#define HASH_CALC(s, h, val) \
    h = __crc32w(0, val)

#define HASH_CALC_VAR       h
#define HASH_CALC_VAR_INIT  uint32_t h = 0

#define UPDATE_HASH         Z_TARGET_CRC update_hash_acle
#define INSERT_STRING       Z_TARGET_CRC insert_string_acle
#define QUICK_INSERT_STRING Z_TARGET_CRC quick_insert_string_acle

#include "../../insert_string_tpl.h"
#endif
@ -0,0 +1,58 @@ |
||||
#ifndef ARM_NEON_INTRINS_H |
||||
#define ARM_NEON_INTRINS_H |
||||
|
||||
#if defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC)) |
||||
/* arm64_neon.h is MSVC specific */ |
||||
# include <arm64_neon.h> |
||||
#else |
||||
# include <arm_neon.h> |
||||
#endif |
||||
|
||||
#if defined(ARM_NEON) && !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC) |
||||
/* Compatibility shim for the _high family of functions */ |
||||
#define vmull_high_u8(a, b) vmull_u8(vget_high_u8(a), vget_high_u8(b)) |
||||
#define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c)) |
||||
#define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c)) |
||||
#define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b)) |
||||
#endif |
||||
|
||||
#ifdef ARM_NEON |
||||
|
||||
#define vqsubq_u16_x4_x1(out, a, b) do { \ |
||||
out.val[0] = vqsubq_u16(a.val[0], b); \
|
||||
out.val[1] = vqsubq_u16(a.val[1], b); \
|
||||
out.val[2] = vqsubq_u16(a.val[2], b); \
|
||||
out.val[3] = vqsubq_u16(a.val[3], b); \
|
||||
} while (0) |
||||
|
||||
|
||||
# ifndef ARM_NEON_HASLD4 |
||||
|
||||
static inline uint16x8x4_t vld1q_u16_x4(uint16_t const *a) { |
||||
uint16x8x4_t ret = (uint16x8x4_t) {{ |
||||
vld1q_u16(a), |
||||
vld1q_u16(a+8), |
||||
vld1q_u16(a+16), |
||||
vld1q_u16(a+24)}}; |
||||
return ret; |
||||
} |
||||
|
||||
static inline uint8x16x4_t vld1q_u8_x4(uint8_t const *a) { |
||||
uint8x16x4_t ret = (uint8x16x4_t) {{ |
||||
vld1q_u8(a), |
||||
vld1q_u8(a+16), |
||||
vld1q_u8(a+32), |
||||
vld1q_u8(a+48)}}; |
||||
return ret; |
||||
} |
||||
|
||||
static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) { |
||||
vst1q_u16(p, a.val[0]); |
||||
vst1q_u16(p + 8, a.val[1]); |
||||
vst1q_u16(p + 16, a.val[2]); |
||||
vst1q_u16(p + 24, a.val[3]); |
||||
} |
||||
# endif // HASLD4 check
|
||||
#endif |
||||
|
||||
#endif // include guard ARM_NEON_INTRINS_H
|
@ -0,0 +1,47 @@
/* slide_hash_armv6.c -- Optimized hash table shifting for ARMv6 with support for SIMD instructions
 * Copyright (C) 2023 Cameron Cawley
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#if defined(ARM_SIMD)
#include "acle_intrins.h"
#include "../../zbuild.h"
#include "../../deflate.h"

/* SIMD version of hash_chain rebase */
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
    Z_REGISTER uint16x2_t v;
    uint16x2_t p0, p1, p2, p3;
    Z_REGISTER size_t n;

    size_t size = entries*sizeof(table[0]);
    Assert((size % (sizeof(uint16x2_t) * 4) == 0), "hash table size err");

    Assert(sizeof(Pos) == 2, "Wrong Pos size");
    v = wsize | (wsize << 16);

    n = size / (sizeof(uint16x2_t) * 4);
    do {
        p0 = *((const uint16x2_t *)(table));
        p1 = *((const uint16x2_t *)(table+2));
        p2 = *((const uint16x2_t *)(table+4));
        p3 = *((const uint16x2_t *)(table+6));
        p0 = __uqsub16(p0, v);
        p1 = __uqsub16(p1, v);
        p2 = __uqsub16(p2, v);
        p3 = __uqsub16(p3, v);
        *((uint16x2_t *)(table)) = p0;
        *((uint16x2_t *)(table+2)) = p1;
        *((uint16x2_t *)(table+4)) = p2;
        *((uint16x2_t *)(table+6)) = p3;
        table += 8;
    } while (--n);
}

Z_INTERNAL void slide_hash_armv6(deflate_state *s) {
    unsigned int wsize = s->w_size;

    slide_hash_chain(s->head, HASH_SIZE, wsize);
    slide_hash_chain(s->prev, wsize, wsize);
}
#endif
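What the saturating UQSUB16 lanes compute, as a scalar reference (hypothetical helper; the same contract applies to the NEON variant in the next file): each 16-bit position is rebased by wsize and clamps at 0, so entries that point before the slid window collapse to NIL.

#include <stddef.h>
#include <stdint.h>

static void slide_hash_scalar(uint16_t *table, size_t entries, uint16_t wsize) {
    for (size_t i = 0; i < entries; ++i)
        table[i] = (uint16_t)((table[i] >= wsize) ? table[i] - wsize : 0);
}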
@ -0,0 +1,46 @@
/* slide_hash_neon.c -- Optimized hash table shifting for ARM with support for NEON instructions
 * Copyright (C) 2017-2020 Mika T. Lindqvist
 *
 * Authors:
 * Mika T. Lindqvist <postmaster@raasu.org>
 * Jun He <jun.he@arm.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef ARM_NEON
#include "neon_intrins.h"
#include "../../zbuild.h"
#include "../../deflate.h"

/* SIMD version of hash_chain rebase */
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
    Z_REGISTER uint16x8_t v;
    uint16x8x4_t p0, p1;
    Z_REGISTER size_t n;

    size_t size = entries*sizeof(table[0]);
    Assert((size % (sizeof(uint16x8_t) * 8) == 0), "hash table size err");

    Assert(sizeof(Pos) == 2, "Wrong Pos size");
    v = vdupq_n_u16(wsize);

    n = size / (sizeof(uint16x8_t) * 8);
    do {
        p0 = vld1q_u16_x4(table);
        p1 = vld1q_u16_x4(table+32);
        vqsubq_u16_x4_x1(p0, p0, v);
        vqsubq_u16_x4_x1(p1, p1, v);
        vst1q_u16_x4(table, p0);
        vst1q_u16_x4(table+32, p1);
        table += 64;
    } while (--n);
}

Z_INTERNAL void slide_hash_neon(deflate_state *s) {
    unsigned int wsize = s->w_size;

    slide_hash_chain(s->head, HASH_SIZE, wsize);
    slide_hash_chain(s->prev, wsize, wsize);
}
#endif
@ -0,0 +1,24 @@ |
||||
# Makefile for zlib
|
||||
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
|
||||
# For conditions of distribution and use, see copyright notice in zlib.h
|
||||
|
||||
CC=
|
||||
CFLAGS=
|
||||
SFLAGS=
|
||||
INCLUDES=
|
||||
|
||||
SRCDIR=.
|
||||
SRCTOP=../..
|
||||
TOPDIR=$(SRCTOP)
|
||||
|
||||
all: |
||||
|
||||
|
||||
mostlyclean: clean |
||||
clean: |
||||
rm -f *.o *.lo *~ \
|
||||
rm -rf objs
|
||||
rm -f *.gcda *.gcno *.gcov
|
||||
|
||||
distclean: clean |
||||
rm -f Makefile
|
@ -0,0 +1,53 @@ |
||||
/* chunk_permute_table.h - shared AVX/SSSE3 permutation table for use with chunkmemset family of functions.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h |
||||
*/ |
||||
|
||||
#ifndef CHUNK_PERMUTE_TABLE_H_ |
||||
#define CHUNK_PERMUTE_TABLE_H_ |
||||
|
||||
#include "zbuild.h" |
||||
|
||||
/* Need entries for all numbers not an even modulus for 1, 2, 4, 8, 16 & 32 */ |
||||
static const ALIGNED_(32) uint8_t permute_table[26*32] = { |
||||
0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, /* dist 3 */ |
||||
0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, /* dist 5 */ |
||||
0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, /* dist 6 */ |
||||
0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, /* dist 7 */ |
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, /* dist 9 */ |
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, /* dist 10 */ |
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 11 */ |
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 12 */ |
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, /* dist 13 */ |
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, /* dist 14 */ |
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, /* dist 15 */ |
||||
|
||||
/* Beyond dists of 15 means we have to permute from a vector > len(m128i). Because AVX couldn't permute
|
||||
* beyond 128 bit lanes until AVX512 for sub 4-byte sequences, we have to do some math here for an eventual |
||||
* blend with a comparison. That means we need to wrap the indices with yet another derived table. For simplicity, |
||||
* we'll use absolute indexing here to derive a blend vector. This is actually a lot simpler with ARM's TBL, but, |
||||
* this is what we're dealt. |
||||
*/ |
||||
|
||||
16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* dist 17 */ |
||||
16, 17, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, /* dist 18 */ |
||||
16, 17, 18, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, /* dist 19 */ |
||||
16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, /* dist 20 */ |
||||
16, 17, 18, 19, 20, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, /* dist 21 */ |
||||
16, 17, 18, 19, 20, 21, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 22 */ |
||||
16, 17, 18, 19, 20, 21, 22, 0, 1, 2, 3, 4, 5, 6, 7, 8, /* dist 23 */ |
||||
16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 24 */ |
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 0, 1, 2, 3, 4, 5, 6, /* dist 25 */ |
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 1, 2, 3, 4, 5, /* dist 26 */ |
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 1, 2, 3, 4, /* dist 27 */ |
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3, /* dist 28 */ |
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 0, 1, 2, /* dist 29 */ |
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 1, /* dist 30 */ |
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, /* dist 31 */ |
||||
}; |
||||
|
||||
typedef struct lut_rem_pair_s { |
||||
uint16_t idx; |
||||
uint16_t remval; |
||||
} lut_rem_pair; |
||||
|
||||
#endif |
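A sketch of the compare-and-blend step the comment above describes for dists beyond 15, assuming SSSE3/SSE4.1 and a hypothetical function name (this is not the exact zlib-ng kernel): table entries >= 16 must be served from the second 16-byte load, and comparing the index row against 15 yields the blend mask.

#include <stdint.h>
#include <smmintrin.h>

static __m128i permute_dist_gt16(__m128i lo, __m128i hi, const uint8_t *row) {
    __m128i idx     = _mm_loadu_si128((const __m128i *)row);
    __m128i from_lo = _mm_shuffle_epi8(lo, idx);   /* entries < 16 index lo */
    __m128i from_hi = _mm_shuffle_epi8(hi, _mm_sub_epi8(idx, _mm_set1_epi8(16)));
    __m128i use_hi  = _mm_cmpgt_epi8(idx, _mm_set1_epi8(15));  /* entries 16..31 */
    return _mm_blendv_epi8(from_lo, from_hi, use_hi);
}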
@ -0,0 +1,93 @@ |
||||
# Makefile for POWER-specific files
|
||||
# Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
|
||||
# Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
|
||||
# For conditions of distribution and use, see copyright notice in zlib.h
|
||||
|
||||
CC=
|
||||
CFLAGS=
|
||||
SFLAGS=
|
||||
INCLUDES=
|
||||
SUFFIX=
|
||||
|
||||
P8FLAGS=-mcpu=power8
|
||||
P9FLAGS=-mcpu=power9
|
||||
PPCFLAGS=-maltivec
|
||||
NOLTOFLAG=
|
||||
|
||||
SRCDIR=.
|
||||
SRCTOP=../..
|
||||
TOPDIR=$(SRCTOP)
|
||||
|
||||
all: power_features.o \
|
||||
power_features.lo \
|
||||
adler32_power8.o \
|
||||
adler32_power8.lo \
|
||||
adler32_vmx.o \
|
||||
adler32_vmx.lo \
|
||||
chunkset_power8.o \
|
||||
chunkset_power8.lo \
|
||||
compare256_power9.o \
|
||||
compare256_power9.lo \
|
||||
crc32_power8.o \
|
||||
crc32_power8.lo \
|
||||
slide_hash_power8.o \
|
||||
slide_hash_power8.lo \
|
||||
slide_hash_vmx.o \
|
||||
slide_hash_vmx.lo
|
||||
|
||||
power_features.o: |
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c
|
||||
|
||||
power_features.lo: |
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c
|
||||
|
||||
adler32_power8.o: |
||||
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
|
||||
|
||||
adler32_power8.lo: |
||||
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
|
||||
|
||||
adler32_vmx.o: |
||||
$(CC) $(CFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
|
||||
|
||||
adler32_vmx.lo: |
||||
$(CC) $(SFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
|
||||
|
||||
chunkset_power8.o: |
||||
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
|
||||
|
||||
chunkset_power8.lo: |
||||
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
|
||||
|
||||
compare256_power9.o: |
||||
$(CC) $(CFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
|
||||
|
||||
compare256_power9.lo: |
||||
$(CC) $(SFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
|
||||
|
||||
crc32_power8.o: |
||||
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
|
||||
|
||||
crc32_power8.lo: |
||||
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
|
||||
|
||||
slide_hash_power8.o: |
||||
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
|
||||
|
||||
slide_hash_power8.lo: |
||||
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
|
||||
|
||||
slide_hash_vmx.o: |
||||
$(CC) $(CFLAGS) ${PPCFLAGS} $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
|
||||
|
||||
slide_hash_vmx.lo: |
||||
$(CC) $(SFLAGS) ${PPCFLAGS} $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
|
||||
|
||||
mostlyclean: clean |
||||
clean: |
||||
rm -f *.o *.lo *~
|
||||
rm -rf objs
|
||||
rm -f *.gcda *.gcno *.gcov
|
||||
|
||||
distclean: clean |
||||
rm -f Makefile
|
@ -0,0 +1,153 @@
/* Adler32 for POWER8 using VSX instructions.
 * Copyright (C) 2020 IBM Corporation
 * Author: Rogerio Alves <rcardoso@linux.ibm.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 * Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector)
 * instructions.
 *
 * If adler32 processes 1 byte at a time, then s1_0 (_n means iteration n) is
 * the initial value of adler - at start, _0 is 1 unless the initial adler
 * value is different than 1. So s1_1 = s1_0 + c[1] after the first
 * calculation. For the next iteration s1_2 = s1_1 + c[2] and so on. Hence,
 * for iteration N, s1_N = s1_(N-1) + c[N] is the value of s1 after
 * iteration N.
 *
 * Therefore, for s2 and iteration N:
 *
 * s2_N = s2_0 + N*s1_0 + N*c[1] + (N-1)*c[2] + ... + c[N]
 *
 * In a more general way:
 *
 * s1_N = s1_0 + sum(i=1 to N)c[i]
 * s2_N = s2_0 + N*s1_0 + sum(i=1 to N)(N-i+1)*c[i]
 *
 * Where s1_N, s2_N are the values for s1, s2 after N iterations. So if we
 * can process N bytes at a time we can do this at once.
 *
 * Since VSX can operate on 16-byte vectors, we can process 16 bytes at a
 * time; using N = 16 we have:
 *
 * s1 = s1_16 = s1_(16-1) + c[16] = s1_0 + sum(i=1 to 16)c[i]
 * s2 = s2_16 = s2_0 + 16*s1_0 + sum(i=1 to 16)(16-i+1)*c[i]
 *
 * After the first iteration we calculate the adler32 checksum for 16 bytes.
 *
 * For more background about adler32 please check the RFC:
 * https://www.ietf.org/rfc/rfc1950.txt
 */

#ifdef POWER8_VSX

#include <altivec.h>
#include "zbuild.h"
#include "adler32_p.h"

/* Vector across sum unsigned int (saturate). */
static inline vector unsigned int vec_sumsu(vector unsigned int __a, vector unsigned int __b) {
    __b = vec_sld(__a, __a, 8);
    __b = vec_add(__b, __a);
    __a = vec_sld(__b, __b, 4);
    __a = vec_add(__a, __b);

    return __a;
}

Z_INTERNAL uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t s1 = adler & 0xffff;
    uint32_t s2 = (adler >> 16) & 0xffff;

    /* in case user likes doing a byte at a time, keep it fast */
    if (UNLIKELY(len == 1))
        return adler32_len_1(s1, buf, s2);

    /* If buffer is empty or len=0 we need to return adler initial value. */
    if (UNLIKELY(buf == NULL))
        return 1;

    /* This is faster than VSX code for len < 64. */
    if (len < 64)
        return adler32_len_64(s1, buf, len, s2);

    /* Use POWER VSX instructions for len >= 64. */
    const vector unsigned int v_zeros = { 0 };
    const vector unsigned char v_mul = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
                                        6, 5, 4, 3, 2, 1};
    const vector unsigned char vsh = vec_splat_u8(4);
    const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0};
    vector unsigned int vs1 = { 0 };
    vector unsigned int vs2 = { 0 };
    vector unsigned int vs1_save = { 0 };
    vector unsigned int vsum1, vsum2;
    vector unsigned char vbuf;
    int n;

    vs1[0] = s1;
    vs2[0] = s2;

    /* Do length bigger than NMAX in blocks of NMAX size. */
    while (len >= NMAX) {
        len -= NMAX;
        n = NMAX / 16;
        do {
            vbuf = vec_xl(0, (unsigned char *) buf);
            vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
            /* sum(i=1 to 16) buf[i]*(16-i+1). */
            vsum2 = vec_msum(vbuf, v_mul, v_zeros);
            /* Save vs1. */
            vs1_save = vec_add(vs1_save, vs1);
            /* Accumulate the sums. */
            vs1 = vec_add(vsum1, vs1);
            vs2 = vec_add(vsum2, vs2);

            buf += 16;
        } while (--n);
        /* Once each block of NMAX size. */
        vs1 = vec_sumsu(vs1, vsum1);
        vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
        vs2 = vec_add(vs1_save, vs2);
        vs2 = vec_sumsu(vs2, vsum2);

        /* vs1[0] = (s1_i + sum(i=1 to 16)buf[i]) mod 65521. */
        vs1[0] = vs1[0] % BASE;
        /* vs2[0] = s2_i + 16*s1_save +
           sum(i=1 to 16)(16-i+1)*buf[i] mod 65521. */
        vs2[0] = vs2[0] % BASE;

        vs1 = vec_and(vs1, vmask);
        vs2 = vec_and(vs2, vmask);
        vs1_save = v_zeros;
    }

    /* len is less than NMAX, so only one modulo is needed. */
    if (len >= 16) {
        while (len >= 16) {
            len -= 16;

            vbuf = vec_xl(0, (unsigned char *) buf);

            vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
            /* sum(i=1 to 16) buf[i]*(16-i+1). */
            vsum2 = vec_msum(vbuf, v_mul, v_zeros);
            /* Save vs1. */
            vs1_save = vec_add(vs1_save, vs1);
            /* Accumulate the sums. */
            vs1 = vec_add(vsum1, vs1);
            vs2 = vec_add(vsum2, vs2);

            buf += 16;
        }
        /* Since the size will be always less than NMAX we do this once. */
        vs1 = vec_sumsu(vs1, vsum1);
        vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
        vs2 = vec_add(vs1_save, vs2);
        vs2 = vec_sumsu(vs2, vsum2);
    }
    /* Copy result back to s1, s2 (mod 65521). */
    s1 = vs1[0] % BASE;
    s2 = vs2[0] % BASE;

    /* Process tail (len < 16). */
    return adler32_len_16(s1, buf, len, s2);
}

#endif /* POWER8_VSX */
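The header comment's recurrences, restated compactly in LaTeX (same derivation, cleaner notation):

    s_{1,N} = s_{1,0} + \sum_{i=1}^{N} c_i
    s_{2,N} = s_{2,0} + N\,s_{1,0} + \sum_{i=1}^{N} (N - i + 1)\,c_i

With N = 16 these are precisely the two per-load accumulations: vec_sum4s builds the plain byte sum and vec_msum with the 16..1 taps builds the weighted sum.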
@ -0,0 +1,186 @@
/* adler32_vmx.c -- compute the Adler-32 checksum of a data stream
 * Copyright (C) 1995-2011 Mark Adler
 * Copyright (C) 2017-2023 Mika T. Lindqvist <postmaster@raasu.org>
 * Copyright (C) 2021 Adam Stylinski <kungfujesus06@gmail.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef PPC_VMX
#include <altivec.h>
#include "zbuild.h"
#include "zendian.h"
#include "adler32_p.h"

#define vmx_zero()  (vec_splat_u32(0))

static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
    unsigned int i;
    for (i = 0; i < len; ++i) {
        pair[0] += buf[i];
        pair[1] += pair[0];
    }
}

static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
    /* Different taps for the separable components of sums */
    const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
    const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
    const vector unsigned char t2 = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17};
    const vector unsigned char t3 = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
    /* As silly and inefficient as it seems, creating 1 permutation vector to permute
     * a 2 element vector from a single load + a subsequent shift is just barely faster
     * than doing 2 indexed insertions into zero initialized vectors from unaligned memory. */
    const vector unsigned char s0_perm = {0, 1, 2, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    const vector unsigned char shift_vec = vec_sl(vec_splat_u8(8), vec_splat_u8(2));
    vector unsigned int adacc, s2acc;
    vector unsigned int pair_vec = vec_ld(0, s);
    adacc = vec_perm(pair_vec, pair_vec, s0_perm);
#if BYTE_ORDER == LITTLE_ENDIAN
    s2acc = vec_sro(pair_vec, shift_vec);
#else
    s2acc = vec_slo(pair_vec, shift_vec);
#endif

    vector unsigned int zero = vmx_zero();
    vector unsigned int s3acc = zero;
    vector unsigned int s3acc_0 = zero;
    vector unsigned int adacc_prev = adacc;
    vector unsigned int adacc_prev_0 = zero;

    vector unsigned int s2acc_0 = zero;
    vector unsigned int s2acc_1 = zero;
    vector unsigned int s2acc_2 = zero;

    /* Maintain a running sum of a second half, this might help us break yet another
     * data dependency bubble in the sum */
    vector unsigned int adacc_0 = zero;

    int num_iter = len / 4;
    int rem = len & 3;

    for (int i = 0; i < num_iter; ++i) {
        vector unsigned char d0 = vec_ld(0, buf);
        vector unsigned char d1 = vec_ld(16, buf);
        vector unsigned char d2 = vec_ld(32, buf);
        vector unsigned char d3 = vec_ld(48, buf);

        /* The core operation of the loop, basically
         * what is being unrolled below */
        adacc = vec_sum4s(d0, adacc);
        s3acc = vec_add(s3acc, adacc_prev);
        s3acc_0 = vec_add(s3acc_0, adacc_prev_0);
        s2acc = vec_msum(t0, d0, s2acc);

        /* interleave dependent sums in here */
        adacc_0 = vec_sum4s(d1, adacc_0);
        s2acc_0 = vec_msum(t1, d1, s2acc_0);
        adacc = vec_sum4s(d2, adacc);
        s2acc_1 = vec_msum(t2, d2, s2acc_1);
        s2acc_2 = vec_msum(t3, d3, s2acc_2);
        adacc_0 = vec_sum4s(d3, adacc_0);

        adacc_prev = adacc;
        adacc_prev_0 = adacc_0;
        buf += 64;
    }

    adacc = vec_add(adacc, adacc_0);
    s3acc = vec_add(s3acc, s3acc_0);
    s3acc = vec_sl(s3acc, vec_splat_u32(6));

    if (rem) {
        adacc_prev = vec_add(adacc_prev_0, adacc_prev);
        adacc_prev = vec_sl(adacc_prev, vec_splat_u32(4));
        while (rem--) {
            vector unsigned char d0 = vec_ld(0, buf);
            adacc = vec_sum4s(d0, adacc);
            s3acc = vec_add(s3acc, adacc_prev);
            s2acc = vec_msum(t3, d0, s2acc);
            adacc_prev = vec_sl(adacc, vec_splat_u32(4));
            buf += 16;
        }
    }

    /* Sum up independent second sums */
    s2acc = vec_add(s2acc, s2acc_0);
    s2acc_2 = vec_add(s2acc_1, s2acc_2);
    s2acc = vec_add(s2acc, s2acc_2);

    s2acc = vec_add(s2acc, s3acc);

    adacc = vec_add(adacc, vec_sld(adacc, adacc, 8));
    s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 8));
    adacc = vec_add(adacc, vec_sld(adacc, adacc, 4));
    s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 4));

    vec_ste(adacc, 0, s);
    vec_ste(s2acc, 0, s+1);
}

Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t sum2;
    uint32_t pair[16] ALIGNED_(16);
    memset(&pair[2], 0, 14);
    int n = NMAX;
    unsigned int done = 0, i;

    /* Split Adler-32 into component sums, it can be supplied by
     * the caller sites (e.g. in a PNG file).
     */
    sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;
    pair[0] = adler;
    pair[1] = sum2;

    /* in case user likes doing a byte at a time, keep it fast */
    if (UNLIKELY(len == 1))
        return adler32_len_1(adler, buf, sum2);

    /* initial Adler-32 value (deferred check for len == 1 speed) */
    if (UNLIKELY(buf == NULL))
        return 1L;

    /* in case short lengths are provided, keep it somewhat fast */
    if (UNLIKELY(len < 16))
        return adler32_len_16(adler, buf, len, sum2);

    // Align buffer
    unsigned int al = 0;
    if ((uintptr_t)buf & 0xf) {
        al = 16-((uintptr_t)buf & 0xf);
        if (al > len) {
            al = len;
        }
        vmx_handle_head_or_tail(pair, buf, al);

        done += al;
        /* Rather than rebasing, we can reduce the max sums for the
         * first round only */
        n -= al;
    }
    for (i = al; i < len; i += n) {
        int remaining = (int)(len-i);
        n = MIN(remaining, (i == al) ? n : NMAX);

        if (n < 16)
            break;

        vmx_accum32(pair, buf + i, n / 16);
        pair[0] %= BASE;
        pair[1] %= BASE;

        done += (n / 16) * 16;
    }

    /* Handle the tail elements. */
    if (done < len) {
        vmx_handle_head_or_tail(pair, (buf + done), len - done);
        pair[0] %= BASE;
        pair[1] %= BASE;
    }

    /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
    return (pair[1] << 16) | pair[0];
}
#endif
@ -0,0 +1,55 @@ |
||||
/* chunkset_power8.c -- VSX inline functions to copy small data chunks.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h |
||||
*/ |
||||
|
||||
#ifdef POWER8_VSX |
||||
#include <altivec.h> |
||||
#include "../../zbuild.h" |
||||
|
||||
typedef vector unsigned char chunk_t; |
||||
|
||||
#define CHUNK_SIZE 16 |
||||
|
||||
#define HAVE_CHUNKMEMSET_2 |
||||
#define HAVE_CHUNKMEMSET_4 |
||||
#define HAVE_CHUNKMEMSET_8 |
||||
|
||||
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { |
||||
uint16_t tmp; |
||||
memcpy(&tmp, from, sizeof(tmp)); |
||||
*chunk = (vector unsigned char)vec_splats(tmp); |
||||
} |
||||
|
||||
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { |
||||
uint32_t tmp; |
||||
memcpy(&tmp, from, sizeof(tmp)); |
||||
*chunk = (vector unsigned char)vec_splats(tmp); |
||||
} |
||||
|
||||
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { |
||||
uint64_t tmp; |
||||
memcpy(&tmp, from, sizeof(tmp)); |
||||
*chunk = (vector unsigned char)vec_splats((unsigned long long)tmp); |
||||
} |
||||
|
||||
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { |
||||
*chunk = vec_xl(0, s); |
||||
} |
||||
|
||||
static inline void storechunk(uint8_t *out, chunk_t *chunk) { |
||||
vec_xst(*chunk, 0, out); |
||||
} |
||||
|
||||
#define CHUNKSIZE chunksize_power8 |
||||
#define CHUNKCOPY chunkcopy_power8 |
||||
#define CHUNKUNROLL chunkunroll_power8 |
||||
#define CHUNKMEMSET chunkmemset_power8 |
||||
#define CHUNKMEMSET_SAFE chunkmemset_safe_power8 |
||||
|
||||
#include "chunkset_tpl.h" |
||||
|
||||
#define INFLATE_FAST inflate_fast_power8 |
||||
|
||||
#include "inffast_tpl.h" |
||||
|
||||
#endif |
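A scalar view of the broadcast helpers, assuming a 16-byte chunk_t (chunkmemset_4_scalar is a hypothetical name):

#include <stdint.h>
#include <string.h>

/* Equivalent to the vec_splats broadcast above: tile the 4-byte pattern
 * across the whole chunk. */
static void chunkmemset_4_scalar(const uint8_t *from, uint8_t chunk[16]) {
    for (int i = 0; i < 16; i += 4)
        memcpy(chunk + i, from, 4);
}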
@ -0,0 +1,64 @@ |
||||
/* compare256_power9.c - Power9 version of compare256
|
||||
* Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM |
||||
* For conditions of distribution and use, see copyright notice in zlib.h |
||||
*/ |
||||
|
||||
#ifdef POWER9 |
||||
#include <altivec.h> |
||||
#include "../../zbuild.h" |
||||
#include "../../zendian.h" |
||||
|
||||
/* Older versions of GCC misimplemented semantics for these bit counting builtins.
|
||||
* https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac */
|
||||
#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 12) |
||||
#if BYTE_ORDER == LITTLE_ENDIAN |
||||
# define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vctzlsbb(vc) |
||||
#else |
||||
# define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vclzlsbb(vc) |
||||
#endif |
||||
#else |
||||
# define zng_vec_vctzlsbb(vc, len) len = vec_cntlz_lsbb(vc) |
||||
#endif |
||||
|
||||
static inline uint32_t compare256_power9_static(const uint8_t *src0, const uint8_t *src1) { |
||||
uint32_t len = 0, cmplen; |
||||
|
||||
do { |
||||
vector unsigned char vsrc0, vsrc1, vc; |
||||
|
||||
vsrc0 = *((vector unsigned char *)src0); |
||||
vsrc1 = *((vector unsigned char *)src1); |
||||
|
||||
/* Compare 16 bytes at a time. Each byte of vc will be either
|
||||
* all ones or all zeroes, depending on the result of the comparison. */ |
||||
vc = (vector unsigned char)vec_cmpne(vsrc0, vsrc1); |
||||
|
||||
/* Since the index of matching bytes will contain only zeroes
|
||||
* on vc (since we used cmpne), counting the number of consecutive |
||||
* bytes where LSB == 0 is the same as counting the length of the match. */ |
||||
zng_vec_vctzlsbb(vc, cmplen); |
||||
if (cmplen != 16) |
||||
return len + cmplen; |
||||
|
||||
src0 += 16, src1 += 16, len += 16; |
||||
} while (len < 256); |
||||
|
||||
return 256; |
||||
} |
||||
|
||||
Z_INTERNAL uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1) { |
||||
return compare256_power9_static(src0, src1); |
||||
} |
||||
|
||||
#define LONGEST_MATCH longest_match_power9 |
||||
#define COMPARE256 compare256_power9_static |
||||
|
||||
#include "match_tpl.h" |
||||
|
||||
#define LONGEST_MATCH_SLOW |
||||
#define LONGEST_MATCH longest_match_slow_power9 |
||||
#define COMPARE256 compare256_power9_static |
||||
|
||||
#include "match_tpl.h" |
||||
|
||||
#endif |
File diff suppressed because it is too large
@ -0,0 +1,589 @@
/* crc32 for POWER8 using VSX instructions
 * Copyright (C) 2021 IBM Corporation
 *
 * Author: Rogerio Alves <rogealve@br.ibm.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 * Calculate the checksum of data that is 16 byte aligned and a multiple of
 * 16 bytes.
 *
 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
 * chunks in order to mask the latency of the vpmsum instructions. If we
 * have more than 32 kB of data to checksum we repeat this step multiple
 * times, passing in the previous 1024 bits.
 *
 * The next step is to reduce the 1024 bits to 64 bits. This step adds
 * 32 bits of 0s to the end - this matches what a CRC does. We just
 * calculate constants that land the data in this 32 bits.
 *
 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
 * for n = CRC using POWER8 instructions. We use x = 32.
 *
 * http://en.wikipedia.org/wiki/Barrett_reduction
 *
 * This code uses gcc vector builtins instead of using assembly directly.
 */

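/* A sketch of the Barrett step referenced above, assuming g(x) is the
 * degree-32 CRC polynomial and mu(x) = floor(x^64 / g(x)) is precomputed:
 *
 *   q(x) = floor((floor(a(x) / x^32) * mu(x)) / x^32)
 *   a(x) mod g(x) = a(x) xor (q(x) * g(x)), keeping the low 32 bits
 *
 * so two carry-less multiplies (vpmsum) and an xor stand in for a
 * polynomial division. */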
#include <altivec.h>
#include "zendian.h"
#include "zbuild.h"

#include "crc32_constants.h"
#include "crc32_braid_tbl.h"

#if defined (__clang__)
#include "fallback_builtins.h"
#endif

#define MAX_SIZE    32768
#define VMX_ALIGN   16
#define VMX_ALIGN_MASK  (VMX_ALIGN-1)

static unsigned int crc32_align(unsigned int crc, const unsigned char *p, unsigned long len) {
    while (len--)
        crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8);
    return crc;
}

static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len);

Z_INTERNAL uint32_t crc32_power8(uint32_t crc, const unsigned char *p, size_t _len) {
    unsigned int prealign;
    unsigned int tail;

    unsigned long len = (unsigned long) _len;

    if (p == (const unsigned char *) 0x0)
        return 0;

    crc ^= 0xffffffff;

    if (len < VMX_ALIGN + VMX_ALIGN_MASK) {
        crc = crc32_align(crc, p, len);
        goto out;
    }

    if ((unsigned long)p & VMX_ALIGN_MASK) {
        prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
        crc = crc32_align(crc, p, prealign);
        len -= prealign;
        p += prealign;
    }

    crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);

    tail = len & VMX_ALIGN_MASK;
    if (tail) {
        p += len & ~VMX_ALIGN_MASK;
        crc = crc32_align(crc, p, tail);
    }

out:
    crc ^= 0xffffffff;

    return crc;
}

/* When we have a load-store in a single-dispatch group and address overlap
 * such that forwarding is not allowed (load-hit-store) the group must be flushed.
 * A group-ending NOP prevents the flush.
 */
#define GROUP_ENDING_NOP __asm__("ori 2,2,0" ::: "memory")

#if BYTE_ORDER == BIG_ENDIAN
#define BYTESWAP_DATA
#endif

#ifdef BYTESWAP_DATA
#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb, (__vector unsigned char) vc)
#if BYTE_ORDER == LITTLE_ENDIAN
/* Byte reverse permute constant LE. */
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x08090A0B0C0D0E0FUL, 0x0001020304050607UL };
#else
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x0F0E0D0C0B0A0908UL, 0x0706050403020100UL };
#endif
#else
#define VEC_PERM(vr, va, vb, vc)
#endif

static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) {

    const __vector unsigned long long vzero = {0,0};
    const __vector unsigned long long vones = {0xffffffffffffffffUL, 0xffffffffffffffffUL};

    const __vector unsigned long long vmask_32bit =
        (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 4);

    const __vector unsigned long long vmask_64bit =
        (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 8);

    __vector unsigned long long vcrc;

    __vector unsigned long long vconst1, vconst2;

    /* vdata0-vdata7 will contain our data (p). */
    __vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, vdata5, vdata6, vdata7;

    /* v0-v7 will contain our checksums */
    __vector unsigned long long v0 = {0,0};
    __vector unsigned long long v1 = {0,0};
    __vector unsigned long long v2 = {0,0};
    __vector unsigned long long v3 = {0,0};
    __vector unsigned long long v4 = {0,0};
    __vector unsigned long long v5 = {0,0};
    __vector unsigned long long v6 = {0,0};
    __vector unsigned long long v7 = {0,0};

    /* Vector auxiliary variables. */
    __vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7;

    unsigned int offset; /* Constant table offset. */

    unsigned long i; /* Counter. */
    unsigned long chunks;

    unsigned long block_size;
    int next_block = 0;

    /* Align by 128 bits. The last 128 bit block will be processed at end. */
    unsigned long length = len & 0xFFFFFFFFFFFFFF80UL;

    vcrc = (__vector unsigned long long)__builtin_pack_vector_int128(0UL, crc);

    /* Short version. */
    if (len < 256) {
        /* Calculate where in the constant table we need to start. */
        offset = 256 - len;

        vconst1 = vec_ld(offset, vcrc_short_const);
        vdata0 = vec_ld(0, (__vector unsigned long long*) p);
        VEC_PERM(vdata0, vdata0, vconst1, vperm_const);

        /* xor initial value */
        vdata0 = vec_xor(vdata0, vcrc);

        vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
        v0 = vec_xor(v0, vdata0);

        for (i = 16; i < len; i += 16) {
            vconst1 = vec_ld(offset + i, vcrc_short_const);
            vdata0 = vec_ld(i, (__vector unsigned long long*) p);
            VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
            vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
                (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
            v0 = vec_xor(v0, vdata0);
        }
    } else {

        /* Load initial values. */
        vdata0 = vec_ld(0, (__vector unsigned long long*) p);
        vdata1 = vec_ld(16, (__vector unsigned long long*) p);

        VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
        VEC_PERM(vdata1, vdata1, vdata1, vperm_const);

        vdata2 = vec_ld(32, (__vector unsigned long long*) p);
        vdata3 = vec_ld(48, (__vector unsigned long long*) p);

        VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
        VEC_PERM(vdata3, vdata3, vdata3, vperm_const);

        vdata4 = vec_ld(64, (__vector unsigned long long*) p);
        vdata5 = vec_ld(80, (__vector unsigned long long*) p);

        VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
        VEC_PERM(vdata5, vdata5, vdata5, vperm_const);

        vdata6 = vec_ld(96, (__vector unsigned long long*) p);
        vdata7 = vec_ld(112, (__vector unsigned long long*) p);

        VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
        VEC_PERM(vdata7, vdata7, vdata7, vperm_const);

        /* xor in initial value */
        vdata0 = vec_xor(vdata0, vcrc);

        p = (char *)p + 128;

        do {
            /* Checksum in blocks of MAX_SIZE. */
            block_size = length;
            if (block_size > MAX_SIZE) {
                block_size = MAX_SIZE;
            }

            length = length - block_size;

            /*
             * Work out the offset into the constants table to start at. Each
             * constant is 16 bytes, and it is used against 128 bytes of input
             * data - 128 / 16 = 8
             */
            offset = (MAX_SIZE/8) - (block_size/8);
            /* We reduce our final 128 bytes in a separate step */
            chunks = (block_size/128)-1;

            vconst1 = vec_ld(offset, vcrc_const);

            va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
                    (__vector unsigned long long)vconst1);
            va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
                    (__vector unsigned long long)vconst1);
            va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
                    (__vector unsigned long long)vconst1);
            va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
                    (__vector unsigned long long)vconst1);
            va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
                    (__vector unsigned long long)vconst1);
            va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
                    (__vector unsigned long long)vconst1);
            va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
                    (__vector unsigned long long)vconst1);
            va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
                    (__vector unsigned long long)vconst1);

            if (chunks > 1) {
                offset += 16;
                vconst2 = vec_ld(offset, vcrc_const);
                GROUP_ENDING_NOP;

                vdata0 = vec_ld(0, (__vector unsigned long long*) p);
                VEC_PERM(vdata0, vdata0, vdata0, vperm_const);

                vdata1 = vec_ld(16, (__vector unsigned long long*) p);
                VEC_PERM(vdata1, vdata1, vdata1, vperm_const);

                vdata2 = vec_ld(32, (__vector unsigned long long*) p);
                VEC_PERM(vdata2, vdata2, vdata2, vperm_const);

                vdata3 = vec_ld(48, (__vector unsigned long long*) p);
                VEC_PERM(vdata3, vdata3, vdata3, vperm_const);

                vdata4 = vec_ld(64, (__vector unsigned long long*) p);
                VEC_PERM(vdata4, vdata4, vdata4, vperm_const);

                vdata5 = vec_ld(80, (__vector unsigned long long*) p);
                VEC_PERM(vdata5, vdata5, vdata5, vperm_const);

                vdata6 = vec_ld(96, (__vector unsigned long long*) p);
                VEC_PERM(vdata6, vdata6, vdata6, vperm_const);

                vdata7 = vec_ld(112, (__vector unsigned long long*) p);
                VEC_PERM(vdata7, vdata7, vdata7, vperm_const);

                p = (char *)p + 128;

                /*
                 * main loop. Each iteration calculates the CRC for a 128-byte
                 * block.
                 */
                for (i = 0; i < chunks-2; i++) {
                    vconst1 = vec_ld(offset, vcrc_const);
                    offset += 16;
                    GROUP_ENDING_NOP;

                    v0 = vec_xor(v0, va0);
                    va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
                            (__vector unsigned long long)vconst2);
                    vdata0 = vec_ld(0, (__vector unsigned long long*) p);
                    VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
                    GROUP_ENDING_NOP;

                    v1 = vec_xor(v1, va1);
                    va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
                            (__vector unsigned long long)vconst2);
                    vdata1 = vec_ld(16, (__vector unsigned long long*) p);
                    VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
                    GROUP_ENDING_NOP;

                    v2 = vec_xor(v2, va2);
                    va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
                            (__vector unsigned long long)vconst2);
                    vdata2 = vec_ld(32, (__vector unsigned long long*) p);
                    VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
                    GROUP_ENDING_NOP;

                    v3 = vec_xor(v3, va3);
                    va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
                            (__vector unsigned long long)vconst2);
                    vdata3 = vec_ld(48, (__vector unsigned long long*) p);
                    VEC_PERM(vdata3, vdata3, vdata3, vperm_const);

                    vconst2 = vec_ld(offset, vcrc_const);
                    GROUP_ENDING_NOP;

                    v4 = vec_xor(v4, va4);
                    va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
                            (__vector unsigned long long)vconst1);
                    vdata4 = vec_ld(64, (__vector unsigned long long*) p);
                    VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
                    GROUP_ENDING_NOP;

                    v5 = vec_xor(v5, va5);
                    va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
                            (__vector unsigned long long)vconst1);
                    vdata5 = vec_ld(80, (__vector unsigned long long*) p);
                    VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
                    GROUP_ENDING_NOP;

                    v6 = vec_xor(v6, va6);
                    va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
                            (__vector unsigned long long)vconst1);
                    vdata6 = vec_ld(96, (__vector unsigned long long*) p);
                    VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
                    GROUP_ENDING_NOP;

                    v7 = vec_xor(v7, va7);
                    va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
                            (__vector unsigned long long)vconst1);
                    vdata7 = vec_ld(112, (__vector unsigned long long*) p);
                    VEC_PERM(vdata7, vdata7, vdata7, vperm_const);

                    p = (char *)p + 128;
                }

                /* First cool down */
                vconst1 = vec_ld(offset, vcrc_const);
                offset += 16;

                v0 = vec_xor(v0, va0);
                va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
                        (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v1 = vec_xor(v1, va1);
                va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
                        (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v2 = vec_xor(v2, va2);
                va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
                        (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v3 = vec_xor(v3, va3);
                va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
                        (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v4 = vec_xor(v4, va4);
                va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
                        (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v5 = vec_xor(v5, va5);
                va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
                        (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v6 = vec_xor(v6, va6);
                va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
                        (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v7 = vec_xor(v7, va7);
                va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
                        (__vector unsigned long long)vconst1);
||||
}/* else */ |
||||
|
||||
/* Second cool down. */ |
||||
v0 = vec_xor(v0, va0); |
||||
v1 = vec_xor(v1, va1); |
||||
v2 = vec_xor(v2, va2); |
||||
v3 = vec_xor(v3, va3); |
||||
v4 = vec_xor(v4, va4); |
||||
v5 = vec_xor(v5, va5); |
||||
v6 = vec_xor(v6, va6); |
||||
v7 = vec_xor(v7, va7); |
||||
|
||||
/*
|
||||
* vpmsumd produces a 96 bit result in the least significant bits |
||||
* of the register. Since we are bit reflected we have to shift it |
||||
* left 32 bits so it occupies the least significant bits in the |
||||
* bit reflected domain. |
||||
*/ |
||||
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, |
||||
(__vector unsigned char)vzero, 4); |
||||
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1, |
||||
(__vector unsigned char)vzero, 4); |
||||
v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2, |
||||
(__vector unsigned char)vzero, 4); |
||||
v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3, |
||||
(__vector unsigned char)vzero, 4); |
||||
v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4, |
||||
(__vector unsigned char)vzero, 4); |
||||
v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5, |
||||
(__vector unsigned char)vzero, 4); |
||||
v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6, |
||||
(__vector unsigned char)vzero, 4); |
||||
v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7, |
||||
(__vector unsigned char)vzero, 4); |
||||
|
||||
/* xor with the last 1024 bits. */ |
||||
va0 = vec_ld(0, (__vector unsigned long long*) p); |
||||
VEC_PERM(va0, va0, va0, vperm_const); |
||||
|
||||
va1 = vec_ld(16, (__vector unsigned long long*) p); |
||||
VEC_PERM(va1, va1, va1, vperm_const); |
||||
|
||||
va2 = vec_ld(32, (__vector unsigned long long*) p); |
||||
VEC_PERM(va2, va2, va2, vperm_const); |
||||
|
||||
va3 = vec_ld(48, (__vector unsigned long long*) p); |
||||
VEC_PERM(va3, va3, va3, vperm_const); |
||||
|
||||
va4 = vec_ld(64, (__vector unsigned long long*) p); |
||||
VEC_PERM(va4, va4, va4, vperm_const); |
||||
|
||||
va5 = vec_ld(80, (__vector unsigned long long*) p); |
||||
VEC_PERM(va5, va5, va5, vperm_const); |
||||
|
||||
va6 = vec_ld(96, (__vector unsigned long long*) p); |
||||
VEC_PERM(va6, va6, va6, vperm_const); |
||||
|
||||
va7 = vec_ld(112, (__vector unsigned long long*) p); |
||||
VEC_PERM(va7, va7, va7, vperm_const); |
||||
|
||||
p = (char *)p + 128; |
||||
|
||||
vdata0 = vec_xor(v0, va0); |
||||
vdata1 = vec_xor(v1, va1); |
||||
vdata2 = vec_xor(v2, va2); |
||||
vdata3 = vec_xor(v3, va3); |
||||
vdata4 = vec_xor(v4, va4); |
||||
vdata5 = vec_xor(v5, va5); |
||||
vdata6 = vec_xor(v6, va6); |
||||
vdata7 = vec_xor(v7, va7); |
||||
|
||||
/* Check if we have more blocks to process */ |
||||
next_block = 0; |
||||
if (length != 0) { |
||||
next_block = 1; |
||||
|
||||
/* zero v0-v7 */ |
||||
v0 = vec_xor(v0, v0); |
||||
v1 = vec_xor(v1, v1); |
||||
v2 = vec_xor(v2, v2); |
||||
v3 = vec_xor(v3, v3); |
||||
v4 = vec_xor(v4, v4); |
||||
v5 = vec_xor(v5, v5); |
||||
v6 = vec_xor(v6, v6); |
||||
v7 = vec_xor(v7, v7); |
||||
} |
||||
length = length + 128; |
||||
|
||||
} while (next_block); |
||||
|
||||
/* Calculate how many bytes we have left. */ |
||||
length = (len & 127); |
||||
|
||||
/* Calculate where in (short) constant table we need to start. */ |
||||
offset = 128 - length; |
||||
|
||||
v0 = vec_ld(offset, vcrc_short_const); |
||||
v1 = vec_ld(offset + 16, vcrc_short_const); |
||||
v2 = vec_ld(offset + 32, vcrc_short_const); |
||||
v3 = vec_ld(offset + 48, vcrc_short_const); |
||||
v4 = vec_ld(offset + 64, vcrc_short_const); |
||||
v5 = vec_ld(offset + 80, vcrc_short_const); |
||||
v6 = vec_ld(offset + 96, vcrc_short_const); |
||||
v7 = vec_ld(offset + 112, vcrc_short_const); |
||||
|
||||
offset += 128; |
||||
|
||||
v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw( |
||||
(__vector unsigned int)vdata0, (__vector unsigned int)v0); |
||||
v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw( |
||||
(__vector unsigned int)vdata1, (__vector unsigned int)v1); |
||||
v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw( |
||||
(__vector unsigned int)vdata2, (__vector unsigned int)v2); |
||||
v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw( |
||||
(__vector unsigned int)vdata3, (__vector unsigned int)v3); |
||||
v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw( |
||||
(__vector unsigned int)vdata4, (__vector unsigned int)v4); |
||||
v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw( |
||||
(__vector unsigned int)vdata5, (__vector unsigned int)v5); |
||||
v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw( |
||||
(__vector unsigned int)vdata6, (__vector unsigned int)v6); |
||||
v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw( |
||||
(__vector unsigned int)vdata7, (__vector unsigned int)v7); |
||||
|
||||
/* Now reduce the tail (0-112 bytes). */ |
||||
for (i = 0; i < length; i+=16) { |
||||
vdata0 = vec_ld(i,(__vector unsigned long long*)p); |
||||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const); |
||||
va0 = vec_ld(offset + i,vcrc_short_const); |
||||
va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw( |
||||
(__vector unsigned int)vdata0, (__vector unsigned int)va0); |
||||
v0 = vec_xor(v0, va0); |
||||
} |
||||
|
||||
/* xor all parallel chunks together. */ |
||||
v0 = vec_xor(v0, v1); |
||||
v2 = vec_xor(v2, v3); |
||||
v4 = vec_xor(v4, v5); |
||||
v6 = vec_xor(v6, v7); |
||||
|
||||
v0 = vec_xor(v0, v2); |
||||
v4 = vec_xor(v4, v6); |
||||
|
||||
v0 = vec_xor(v0, v4); |
||||
} |
||||
|
||||
/* Barrett Reduction */ |
||||
vconst1 = vec_ld(0, v_Barrett_const); |
||||
vconst2 = vec_ld(16, v_Barrett_const); |
||||
|
||||
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, |
||||
(__vector unsigned char)v0, 8); |
||||
v0 = vec_xor(v1,v0); |
||||
|
||||
/* shift left one bit */ |
||||
__vector unsigned char vsht_splat = vec_splat_u8 (1); |
||||
v0 = (__vector unsigned long long)vec_sll((__vector unsigned char)v0, vsht_splat); |
||||
|
||||
v0 = vec_and(v0, vmask_64bit); |
||||
|
||||
/*
|
||||
* The reflected version of Barrett reduction. Instead of bit |
||||
* reflecting our data (which is expensive to do), we bit reflect our |
||||
* constants and our algorithm, which means the intermediate data in |
||||
* our vector registers goes from 0-63 instead of 63-0. We can reflect |
||||
* the algorithm because we don't carry in mod 2 arithmetic. |
||||
*/ |
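
    /*
     * Illustrative restatement of the steps below (not from the original
     * source): with a the folded 64-bit value and vconst1/vconst2 the two
     * Barrett constants,
     *   ma = (a & 0xffffffff) * vconst1    (carry-less multiply)
     *   qn = (ma & 0xffffffff) * vconst2   (carry-less multiply)
     *   result = a ^ qn                    (subtraction is xor in GF(2))
     */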

    /* bottom 32 bits of a */
    v1 = vec_and(v0, vmask_32bit);

    /* ma */
    v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
            (__vector unsigned long long)vconst1);

    /* bottom 32 bits of ma */
    v1 = vec_and(v1, vmask_32bit);
    /* qn */
    v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
            (__vector unsigned long long)vconst2);
    /* a - qn, subtraction is xor in GF(2) */
    v0 = vec_xor(v0, v1);

    /*
     * Since we are bit reflected, the result (ie the low 32 bits) is in
     * the high 32 bits. We just need to shift it left 4 bytes
     * V0 [ 0 1 X 3 ]
     * V0 [ 0 X 2 3 ]
     */

    /* shift result into top 64 bits of the register */
    v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
            (__vector unsigned char)vzero, 4);

#if BYTE_ORDER == BIG_ENDIAN
    return v0[0];
#else
    return v0[1];
#endif
}
@ -0,0 +1,31 @@ |
/* Helper functions to work around issues with clang builtins
 * Copyright (C) 2021 IBM Corporation
 *
 * Authors:
 *   Daniel Black <daniel@linux.vnet.ibm.com>
 *   Rogerio Alves <rogealve@br.ibm.com>
 *   Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef POWER_BUILTINS_H
#define POWER_BUILTINS_H

/*
 * These stubs fix clang incompatibilities with GCC builtins.
 */

#ifndef __builtin_crypto_vpmsumw
#define __builtin_crypto_vpmsumw __builtin_crypto_vpmsumb
#endif
#ifndef __builtin_crypto_vpmsumd
#define __builtin_crypto_vpmsumd __builtin_crypto_vpmsumb
#endif

static inline __vector unsigned long long __attribute__((overloadable))
vec_ld(int __a, const __vector unsigned long long* __b) {
    return (__vector unsigned long long)__builtin_altivec_lvx(__a, __b);
}

#endif
@ -0,0 +1,46 @@ |
/* power_features.c - POWER feature check
 * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
 * Copyright (C) 2021-2022 Mika T. Lindqvist <postmaster@raasu.org>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef HAVE_SYS_AUXV_H
#  include <sys/auxv.h>
#endif
#ifdef __FreeBSD__
#  include <machine/cpu.h>
#endif
#include "../../zbuild.h"
#include "power_features.h"

void Z_INTERNAL power_check_features(struct power_cpu_features *features) {
#ifdef PPC_FEATURES
    unsigned long hwcap;
#ifdef __FreeBSD__
    elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
#else
    hwcap = getauxval(AT_HWCAP);
#endif

    if (hwcap & PPC_FEATURE_HAS_ALTIVEC)
        features->has_altivec = 1;
#endif

#ifdef POWER_FEATURES
    unsigned long hwcap2;
#ifdef __FreeBSD__
    elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
#else
    hwcap2 = getauxval(AT_HWCAP2);
#endif

#ifdef POWER8_VSX
    if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
        features->has_arch_2_07 = 1;
#endif
#ifdef POWER9
    if (hwcap2 & PPC_FEATURE2_ARCH_3_00)
        features->has_arch_3_00 = 1;
#endif
#endif
}
@ -0,0 +1,18 @@ |
/* power_features.h -- check for POWER CPU features
 * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
 * Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef POWER_H_
#define POWER_H_

struct power_cpu_features {
    int has_altivec;
    int has_arch_2_07;
    int has_arch_3_00;
};

void Z_INTERNAL power_check_features(struct power_cpu_features *features);

#endif /* POWER_H_ */
@ -0,0 +1,12 @@ |
/* Optimized slide_hash for POWER processors
 * Copyright (C) 2019-2020 IBM Corporation
 * Author: Matheus Castanho <msc@linux.ibm.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef POWER8_VSX

#define SLIDE_PPC slide_hash_power8
#include "slide_ppc_tpl.h"

#endif /* POWER8_VSX */
@ -0,0 +1,10 @@ |
/* Optimized slide_hash for PowerPC processors with VMX instructions
 * Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#ifdef PPC_VMX

#define SLIDE_PPC slide_hash_vmx
#include "slide_ppc_tpl.h"

#endif /* PPC_VMX */
@ -0,0 +1,31 @@ |
/* Optimized slide_hash for PowerPC processors
 * Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include <altivec.h>
#include "zbuild.h"
#include "deflate.h"

static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
    const vector unsigned short vmx_wsize = vec_splats(wsize);
    Pos *p = table;

    do {
        vector unsigned short value, result;

        value = vec_ld(0, p);
        result = vec_subs(value, vmx_wsize);
        vec_st(result, 0, p);

        p += 8;
        entries -= 8;
    } while (entries > 0);
}

void Z_INTERNAL SLIDE_PPC(deflate_state *s) {
    uint16_t wsize = s->w_size;

    slide_hash_chain(s->head, HASH_SIZE, wsize);
    slide_hash_chain(s->prev, wsize, wsize);
}
@ -0,0 +1,45 @@ |
# Building RISC-V Target with CMake #

> **Warning**
> Runtime RVV detection (using `hwcap`) requires Linux kernel 6.5 or newer.
>
> When running on older kernels, we fall back to compile-time detection. This can potentially cause crashes if RVV is enabled at compile time but not supported by the target CPU.
> Therefore, if older kernel support is needed, RVV should be disabled when the target CPU does not support it.
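As a quick sanity check before enabling RVV (a hedged sketch, not part of the zlib-ng sources; the 6.5 threshold comes from the warning above, and the `isa` line is the usual riscv64 `/proc/cpuinfo` layout):

```bash
# Kernel release: runtime RVV detection needs 6.5 or newer.
uname -r
# ISA string: look for a 'v' in e.g. rv64imafdcv to confirm vector support.
grep -m1 '^isa' /proc/cpuinfo
```
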
## Prerequisite: Build RISC-V Clang Toolchain and QEMU ##

If you don't have a prebuilt clang and riscv64 QEMU, you can refer to the [script](https://github.com/sifive/prepare-riscv-toolchain-qemu/blob/main/prepare_riscv_toolchain_qemu.sh) to get the source. Copy the script to the zlib-ng root directory and run it to download the sources and build them. Modify the content according to your needs (e.g., toolchain version).

```bash
./prepare_riscv_toolchain_qemu.sh
```

After running the script, clang & QEMU are built in `build-toolchain-qemu/riscv-clang/` & `build-toolchain-qemu/riscv-qemu/`.

`build-toolchain-qemu/riscv-clang/` is your `TOOLCHAIN_PATH`.
`build-toolchain-qemu/riscv-qemu/bin/qemu-riscv64` is your `QEMU_PATH`.

You can also download the prebuilt toolchain & QEMU from [the release page](https://github.com/sifive/prepare-riscv-toolchain-qemu/releases).

## Cross-Compile for RISC-V Target ##

```bash
cmake -G Ninja -B ./build-riscv \
  -D CMAKE_TOOLCHAIN_FILE=./cmake/toolchain-riscv.cmake \
  -D CMAKE_INSTALL_PREFIX=./build-riscv/install \
  -D TOOLCHAIN_PATH={TOOLCHAIN_PATH} \
  -D QEMU_PATH={QEMU_PATH} \
  .

cmake --build ./build-riscv
```

Disable the option if there is no RVV support:
```
-D WITH_RVV=OFF
```

## Run Unittests on User Mode QEMU ##

```bash
cd ./build-riscv && ctest --verbose
```
@ -0,0 +1,132 @@ |
/* adler32_rvv.c - RVV version of adler32
 * Copyright (C) 2023 SiFive, Inc. All rights reserved.
 * Contributed by Alex Chiang <alex.chiang@sifive.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef RISCV_RVV

#include <riscv_vector.h>
#include <stdint.h>

#include "../../zbuild.h"
#include "../../adler32_p.h"

static inline uint32_t adler32_rvv_impl(uint32_t adler, uint8_t* restrict dst, const uint8_t *src, size_t len, int COPY) {
    /* split Adler-32 into component sums */
    uint32_t sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;

    /* in case user likes doing a byte at a time, keep it fast */
    if (len == 1) {
        if (COPY) memcpy(dst, src, 1);
        return adler32_len_1(adler, src, sum2);
    }

    /* initial Adler-32 value (deferred check for len == 1 speed) */
    if (src == NULL)
        return 1L;

    /* in case short lengths are provided, keep it somewhat fast */
    if (len < 16) {
        if (COPY) memcpy(dst, src, len);
        return adler32_len_16(adler, src, len, sum2);
    }

    size_t left = len;
    size_t vl = __riscv_vsetvlmax_e8m1();
    vl = vl > 256 ? 256 : vl;
    vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl);
    vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl);
    vuint16m2_t v_buf16_accu;

    /*
     * We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator.
     * However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit
     * accumulators to boost performance.
     *
     * The block_size is the largest multiple of vl that is <= 256, because overflow would occur
     * for larger blocks (255 * 256 <= UINT16_MAX).
     *
     * We accumulate 8-bit data into a 16-bit accumulator and then
     * move the data into the 32-bit accumulator at the last iteration.
     */
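    /*
     * Worked bound (illustrative, not from the original source): each 16-bit
     * lane performs at most block_size / vl <= 256 additions of byte values
     * <= 255, so a lane never exceeds 255 * 256 = 65280 < UINT16_MAX.
     */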
    size_t block_size = (256 / vl) * vl;
    size_t nmax_limit = (NMAX / block_size);
    size_t cnt = 0;
    while (left >= block_size) {
        v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
        size_t subprob = block_size;
        while (subprob > 0) {
            vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl);
            if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl);
            v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
            v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
            src += vl;
            if (COPY) dst += vl;
            subprob -= vl;
        }
        v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
        v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
        left -= block_size;
        /* do modulo once each block of NMAX size */
        if (++cnt >= nmax_limit) {
            v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
            cnt = 0;
        }
    }
    /* the remaining length is <= 256 now, so we can use the 16-bit accumulator safely */
    v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
    size_t res = left;
    while (left >= vl) {
        vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl);
        if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl);
        v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
        v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
        src += vl;
        if (COPY) dst += vl;
        left -= vl;
    }
    v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
    v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
    v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);

    vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);
    vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl);
    vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl);

    v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl);

    vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);
    v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);
    uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum);

    sum2 += (sum2_sum + adler * (len - left));

    vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);
    v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);
    uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum);

    adler += adler_sum;

    while (left--) {
        if (COPY) *dst++ = *src;
        adler += *src++;
        sum2 += adler;
    }

    sum2 %= BASE;
    adler %= BASE;

    return adler | (sum2 << 16);
}

Z_INTERNAL uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    return adler32_rvv_impl(adler, dst, src, len, 1);
}

Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len) {
    return adler32_rvv_impl(adler, NULL, buf, len, 0);
}

#endif // RISCV_RVV
@ -0,0 +1,121 @@ |
/* chunkset_rvv.c - RVV version of chunkset
 * Copyright (C) 2023 SiFive, Inc. All rights reserved.
 * Contributed by Alex Chiang <alex.chiang@sifive.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#include <riscv_vector.h>
#include "zbuild.h"

/*
 * RISC-V glibc can enable an RVV-optimized memcpy at runtime via IFUNC,
 * so we prefer a large chunk size and copying as much memory as possible at once.
 */
#define CHUNK_SIZE 32

#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8

#define CHUNK_MEMSET_RVV_IMPL(elen) \
do { \
    size_t vl, len = CHUNK_SIZE / sizeof(uint##elen##_t); \
    uint##elen##_t val = *(uint##elen##_t*)from; \
    uint##elen##_t* chunk_p = (uint##elen##_t*)chunk; \
    do { \
        vl = __riscv_vsetvl_e##elen##m4(len); \
        vuint##elen##m4_t v_val = __riscv_vmv_v_x_u##elen##m4(val, vl); \
        __riscv_vse##elen##_v_u##elen##m4(chunk_p, v_val, vl); \
        len -= vl; chunk_p += vl; \
    } while (len > 0); \
} while (0)

/* We don't have a 32-byte datatype for the RISC-V arch. */
typedef struct chunk_s {
    uint64_t data[4];
} chunk_t;

static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    CHUNK_MEMSET_RVV_IMPL(16);
}

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    CHUNK_MEMSET_RVV_IMPL(32);
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    CHUNK_MEMSET_RVV_IMPL(64);
}

static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    memcpy(chunk->data, (uint8_t *)s, CHUNK_SIZE);
}

static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    memcpy(out, chunk->data, CHUNK_SIZE);
}

#define CHUNKSIZE chunksize_rvv
#define CHUNKCOPY chunkcopy_rvv
#define CHUNKUNROLL chunkunroll_rvv
#define CHUNKMEMSET chunkmemset_rvv
#define CHUNKMEMSET_SAFE chunkmemset_safe_rvv

#define HAVE_CHUNKCOPY

/*
 * The length is assumed non-zero, and `from` lags `out` by at least
 * sizeof(chunk_t) bytes; see the comments in chunkset_tpl.h.
 *
 * We load/store a single chunk once in `CHUNKCOPY`.
 * However, RISC-V glibc can enable an RVV-optimized memcpy at runtime via IFUNC,
 * so we prefer to copy large blocks at once to make good use of the RVV advantage.
 *
 * To stay aligned with the other platforms, we didn't modify the `CHUNKCOPY` method much,
 * but we still copy as much memory as possible under some conditions.
 *
 * case 1: out - from >= len (no overlap)
 *     We can use memcpy to copy `len` bytes at once
 *     because the memory layout would be the same.
 *
 * case 2: overlap
 *     We copy N chunks using memcpy at once, aiming to achieve our goal:
 *     to copy as much memory as possible.
 *
 *     After using a single memcpy to copy N chunks, we have to use a series of
 *     loadchunk and storechunk calls to ensure the result is correct.
 */
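/*
 * Illustrative trace with assumed values (not from the original source):
 * len = 100, out - from = 40, sizeof(chunk_t) = 32.
 * The head copy uses align = ((100 - 1) % 32) + 1 = 4, leaving len = 96
 * and dist = 40. Since dist < len we are in case 2: dist is rounded down
 * to 32, one 32-byte memcpy runs (len = 64), and two chunk-sized copies
 * finish the remaining bytes.
 */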
static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
    Assert(len > 0, "chunkcopy should never have a length 0");
    int32_t align = ((len - 1) % sizeof(chunk_t)) + 1;
    memcpy(out, from, sizeof(chunk_t));
    out += align;
    from += align;
    len -= align;
    ptrdiff_t dist = out - from;
    if (dist >= len) {
        memcpy(out, from, len);
        out += len;
        from += len;
        return out;
    }
    if (dist >= sizeof(chunk_t)) {
        dist = (dist / sizeof(chunk_t)) * sizeof(chunk_t);
        memcpy(out, from, dist);
        out += dist;
        from += dist;
        len -= dist;
    }
    while (len > 0) {
        memcpy(out, from, sizeof(chunk_t));
        out += sizeof(chunk_t);
        from += sizeof(chunk_t);
        len -= sizeof(chunk_t);
    }
    return out;
}

#include "chunkset_tpl.h"

#define INFLATE_FAST inflate_fast_rvv

#include "inffast_tpl.h"
@ -0,0 +1,47 @@ |
/* compare256_rvv.c - RVV version of compare256
 * Copyright (C) 2023 SiFive, Inc. All rights reserved.
 * Contributed by Alex Chiang <alex.chiang@sifive.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef RISCV_RVV

#include "../../zbuild.h"
#include "fallback_builtins.h"

#include <riscv_vector.h>

static inline uint32_t compare256_rvv_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;
    size_t vl;
    long found_diff;
    do {
        vl = __riscv_vsetvl_e8m4(256 - len);
        vuint8m4_t v_src0 = __riscv_vle8_v_u8m4(src0, vl);
        vuint8m4_t v_src1 = __riscv_vle8_v_u8m4(src1, vl);
        vbool2_t v_mask = __riscv_vmsne_vv_u8m4_b2(v_src0, v_src1, vl);
        found_diff = __riscv_vfirst_m_b2(v_mask, vl);
        if (found_diff >= 0)
            return len + (uint32_t)found_diff;
        src0 += vl, src1 += vl, len += vl;
    } while (len < 256);

    return 256;
}

Z_INTERNAL uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1) {
    return compare256_rvv_static(src0, src1);
}

#define LONGEST_MATCH longest_match_rvv
#define COMPARE256 compare256_rvv_static

#include "match_tpl.h"

#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_rvv
#define COMPARE256 compare256_rvv_static

#include "match_tpl.h"

#endif // RISCV_RVV
@ -0,0 +1,45 @@ |
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/auxv.h>
#include <sys/utsname.h>

#include "../../zbuild.h"
#include "riscv_features.h"

#define ISA_V_HWCAP (1 << ('v' - 'a'))

int Z_INTERNAL is_kernel_version_greater_or_equal_to_6_5(void) {
    struct utsname buffer;
    uname(&buffer);

    int major, minor;
    if (sscanf(buffer.release, "%d.%d", &major, &minor) != 2) {
        // Something went wrong with uname(); assume an old kernel.
        return 0;
    }

    if (major > 6 || (major == 6 && minor >= 5))
        return 1;
    return 0;
}

void Z_INTERNAL riscv_check_features_compile_time(struct riscv_cpu_features *features) {
#if defined(__riscv_v) && defined(__linux__)
    features->has_rvv = 1;
#else
    features->has_rvv = 0;
#endif
}

void Z_INTERNAL riscv_check_features_runtime(struct riscv_cpu_features *features) {
    unsigned long hw_cap = getauxval(AT_HWCAP);
    features->has_rvv = hw_cap & ISA_V_HWCAP;
}

void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features) {
    if (is_kernel_version_greater_or_equal_to_6_5())
        riscv_check_features_runtime(features);
    else
        riscv_check_features_compile_time(features);
}
@ -0,0 +1,18 @@ |
/* riscv_features.h -- check for riscv features.
 *
 * Copyright (C) 2023 SiFive, Inc. All rights reserved.
 * Contributed by Alex Chiang <alex.chiang@sifive.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef RISCV_H_
#define RISCV_H_

struct riscv_cpu_features {
    int has_rvv;
};

void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features);

#endif /* RISCV_H_ */
@ -0,0 +1,34 @@ |
/* slide_hash_rvv.c - RVV version of slide_hash
 * Copyright (C) 2023 SiFive, Inc. All rights reserved.
 * Contributed by Alex Chiang <alex.chiang@sifive.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef RISCV_RVV

#include <riscv_vector.h>

#include "../../zbuild.h"
#include "../../deflate.h"

static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
    size_t vl;
    while (entries > 0) {
        vl = __riscv_vsetvl_e16m4(entries);
        vuint16m4_t v_tab = __riscv_vle16_v_u16m4(table, vl);
        vuint16m4_t v_diff = __riscv_vsub_vx_u16m4(v_tab, wsize, vl);
        vbool4_t mask = __riscv_vmsltu_vx_u16m4_b4(v_tab, wsize, vl);
        v_tab = __riscv_vmerge_vxm_u16m4(v_diff, 0, mask, vl);
        __riscv_vse16_v_u16m4(table, v_tab, vl);
        table += vl, entries -= vl;
    }
}
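/*
 * Illustrative values for slide_hash_chain above (not from the original
 * source): with wsize = 32768, an entry of 40000 becomes 40000 - 32768 =
 * 7232, while an entry of 5 is below wsize, so the masked merge stores 0
 * instead of letting the unsigned subtraction wrap around.
 */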

Z_INTERNAL void slide_hash_rvv(deflate_state *s) {
    uint16_t wsize = (uint16_t)s->w_size;

    slide_hash_chain(s->head, HASH_SIZE, wsize);
    slide_hash_chain(s->prev, wsize, wsize);
}

#endif // RISCV_RVV
@ -0,0 +1,147 @@ |
# Makefile for zlib
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
# For conditions of distribution and use, see copyright notice in zlib.h

CC=
CFLAGS=
SFLAGS=
INCLUDES=
SUFFIX=

AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw
AVX512VNNIFLAG=-mavx512vnni
AVX2FLAG=-mavx2
SSE2FLAG=-msse2
SSSE3FLAG=-mssse3
SSE42FLAG=-msse4.2
PCLMULFLAG=-mpclmul
VPCLMULFLAG=-mvpclmulqdq
XSAVEFLAG=-mxsave
NOLTOFLAG=

SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)

all: \
	x86_features.o x86_features.lo \
	adler32_avx2.o adler32_avx2.lo \
	adler32_avx512.o adler32_avx512.lo \
	adler32_avx512_vnni.o adler32_avx512_vnni.lo \
	adler32_sse42.o adler32_sse42.lo \
	adler32_ssse3.o adler32_ssse3.lo \
	chunkset_avx2.o chunkset_avx2.lo \
	chunkset_sse2.o chunkset_sse2.lo \
	chunkset_ssse3.o chunkset_ssse3.lo \
	compare256_avx2.o compare256_avx2.lo \
	compare256_sse2.o compare256_sse2.lo \
	insert_string_sse42.o insert_string_sse42.lo \
	crc32_pclmulqdq.o crc32_pclmulqdq.lo \
	crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \
	slide_hash_avx2.o slide_hash_avx2.lo \
	slide_hash_sse2.o slide_hash_sse2.lo

x86_features.o:
	$(CC) $(CFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c

x86_features.lo:
	$(CC) $(SFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c

chunkset_avx2.o:
	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c

chunkset_avx2.lo:
	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c

chunkset_sse2.o:
	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c

chunkset_sse2.lo:
	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c

chunkset_ssse3.o:
	$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c

chunkset_ssse3.lo:
	$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c

compare256_avx2.o:
	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c

compare256_avx2.lo:
	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c

compare256_sse2.o:
	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c

compare256_sse2.lo:
	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c

insert_string_sse42.o:
	$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c

insert_string_sse42.lo:
	$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c

crc32_pclmulqdq.o:
	$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c

crc32_pclmulqdq.lo:
	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c

crc32_vpclmulqdq.o:
	$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c

crc32_vpclmulqdq.lo:
	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c

slide_hash_avx2.o:
	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c

slide_hash_avx2.lo:
	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c

slide_hash_sse2.o:
	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c

slide_hash_sse2.lo:
	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c

adler32_avx2.o: $(SRCDIR)/adler32_avx2.c
	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c

adler32_avx2.lo: $(SRCDIR)/adler32_avx2.c
	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c

adler32_avx512.o: $(SRCDIR)/adler32_avx512.c
	$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c

adler32_avx512.lo: $(SRCDIR)/adler32_avx512.c
	$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c

adler32_avx512_vnni.o: $(SRCDIR)/adler32_avx512_vnni.c
	$(CC) $(CFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c

adler32_avx512_vnni.lo: $(SRCDIR)/adler32_avx512_vnni.c
	$(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c

adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
	$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c

adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c
	$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c

adler32_sse42.o: $(SRCDIR)/adler32_sse42.c
	$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c

adler32_sse42.lo: $(SRCDIR)/adler32_sse42.c
	$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c

mostlyclean: clean
clean:
	rm -f *.o *.lo *~
	rm -rf objs
	rm -f *.gcda *.gcno *.gcov

distclean: clean
	rm -f Makefile
@ -0,0 +1,154 @@ |
/* adler32_avx2.c -- compute the Adler-32 checksum of a data stream
 * Copyright (C) 1995-2011 Mark Adler
 * Copyright (C) 2022 Adam Stylinski
 * Authors:
 *   Brian Bockelman <bockelman@gmail.com>
 *   Adam Stylinski <kungfujesus06@gmail.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef X86_AVX2

#include "../../zbuild.h"
#include <immintrin.h>
#include "../../adler32_fold.h"
#include "../../adler32_p.h"
#include "adler32_avx2_p.h"
#include "x86_intrins.h"

#ifdef X86_SSE42
extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);

#define copy_sub32(a, b, c, d) adler32_fold_copy_sse42(a, b, c, d)
#define sub32(a, b, c) adler32_ssse3(a, b, c)
#else
#define copy_sub32(a, b, c, d) adler32_copy_len_16(adler0, c, b, d, adler1)
#define sub32(a, b, c) adler32_len_16(adler0, b, c, adler1)
#endif

static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
    if (src == NULL) return 1L;
    if (len == 0) return adler;

    uint32_t adler0, adler1;
    adler1 = (adler >> 16) & 0xffff;
    adler0 = adler & 0xffff;

rem_peel:
    if (len < 16) {
        if (COPY) {
            return adler32_copy_len_16(adler0, src, dst, len, adler1);
        } else {
            return adler32_len_16(adler0, src, len, adler1);
        }
    } else if (len < 32) {
        if (COPY) {
            return copy_sub32(adler, dst, src, len);
        } else {
            return sub32(adler, src, len);
        }
    }

    __m256i vs1, vs2;

    const __m256i dot2v = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
                                           14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
    const __m256i dot3v = _mm256_set1_epi16(1);
    const __m256i zero = _mm256_setzero_si256();

    while (len >= 32) {
        vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
        vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
        __m256i vs1_0 = vs1;
        __m256i vs3 = _mm256_setzero_si256();

        size_t k = MIN(len, NMAX);
        k -= k % 32;
        len -= k;

        while (k >= 32) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
            */
            __m256i vbuf = _mm256_loadu_si256((__m256i*)src);
            src += 32;
            k -= 32;

            __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); // Sum of abs diff, resulting in 2 x int32's

            if (COPY) {
                _mm256_storeu_si256((__m256i*)dst, vbuf);
                dst += 32;
            }

            vs1 = _mm256_add_epi32(vs1, vs1_sad);
            vs3 = _mm256_add_epi32(vs3, vs1_0);
            __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // sum 32 uint8s to 16 shorts
            __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v);   // sum 16 shorts to 8 uint32s
            vs2 = _mm256_add_epi32(vsum2, vs2);
            vs1_0 = vs1;
        }

        /* Defer the multiplication with 32 to outside of the loop */
        vs3 = _mm256_slli_epi32(vs3, 5);
        vs2 = _mm256_add_epi32(vs2, vs3);

        /* The compiler generates the following sequence for this integer modulus
         * when done the scalar way, in GPRs:

         adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
                 (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);

         mov    $0x80078071,%edi      // move magic constant into 32 bit register %edi
         ...
         vmovd  %xmm1,%esi            // move vector lane 0 to 32 bit register %esi
         mov    %rsi,%rax             // zero-extend this value to 64 bit precision in %rax
         imul   %rdi,%rsi             // do a signed multiplication with magic constant and vector element
         shr    $0x2f,%rsi            // shift right by 47
         imul   $0xfff1,%esi,%esi     // do a signed multiplication with value truncated to 32 bits with 0xfff1
         sub    %esi,%eax             // subtract lower 32 bits of original vector value from modified one above
         ...
         // repeats for each element with vpextract instructions

         This is tricky with AVX2 for a number of reasons:
             1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
             2.) There are ways to extend vectors to 64 bit precision, but no simple way to truncate
                 back down to 32 bit precision later (there is in AVX512)
             3.) Full width integer multiplications aren't cheap

         We can, however, do a relatively cheap sequence for horizontal sums.
         Then, we simply do the integer modulus on the resulting 64 bit GPR, on a scalar value. It was
         previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but
         that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be
         performed on the maximum possible inputs before overflow
         */
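        /* Worked instance of the sequence above (illustrative, not from the
         * original source): for x = 100000 and BASE = 65521 (0xfff1),
         * q = (100000 * 0x80078071) >> 47 = 1 and 100000 - q * 65521 = 34479,
         * which matches 100000 % 65521. */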

        /* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy
         * conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
         * This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
         * what the compiler is doing to avoid integer divisions. */
        adler0 = partial_hsum256(vs1) % BASE;
        adler1 = hsum256(vs2) % BASE;
    }

    adler = adler0 | (adler1 << 16);

    if (len) {
        goto rem_peel;
    }

    return adler;
}

Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) {
    return adler32_fold_copy_impl(adler, NULL, src, len, 0);
}

Z_INTERNAL uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    return adler32_fold_copy_impl(adler, dst, src, len, 1);
}

#endif
@ -0,0 +1,32 @@ |
/* adler32_avx2_p.h -- adler32 avx2 utility functions
 * Copyright (C) 2022 Adam Stylinski
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef ADLER32_AVX2_P_H_
#define ADLER32_AVX2_P_H_

#if defined(X86_AVX2) || defined(X86_AVX512VNNI)

/* 32 bit horizontal sum, adapted from Agner Fog's vector library. */
static inline uint32_t hsum256(__m256i x) {
    __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(x, 1),
                                 _mm256_castsi256_si128(x));
    __m128i sum2 = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1));
    __m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
    return (uint32_t)_mm_cvtsi128_si32(sum3);
}

static inline uint32_t partial_hsum256(__m256i x) {
    /* We need a permutation vector to extract every other integer. The
     * rest are going to be zeros */
    const __m256i perm_vec = _mm256_setr_epi32(0, 2, 4, 6, 1, 1, 1, 1);
    __m256i non_zero = _mm256_permutevar8x32_epi32(x, perm_vec);
    __m128i non_zero_sse = _mm256_castsi256_si128(non_zero);
    __m128i sum2 = _mm_add_epi32(non_zero_sse, _mm_unpackhi_epi64(non_zero_sse, non_zero_sse));
    __m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
    return (uint32_t)_mm_cvtsi128_si32(sum3);
}
#endif

#endif
@ -0,0 +1,115 @@ |
/* adler32_avx512.c -- compute the Adler-32 checksum of a data stream
 * Copyright (C) 1995-2011 Mark Adler
 * Authors:
 *   Adam Stylinski <kungfujesus06@gmail.com>
 *   Brian Bockelman <bockelman@gmail.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef X86_AVX512

#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../adler32_fold.h"
#include "../../cpu_features.h"
#include <immintrin.h>
#include "x86_intrins.h"
#include "adler32_avx512_p.h"

static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
    if (src == NULL) return 1L;
    if (len == 0) return adler;

    uint32_t adler0, adler1;
    adler1 = (adler >> 16) & 0xffff;
    adler0 = adler & 0xffff;

rem_peel:
    if (len < 64) {
        /* This handles the remaining copies, just call normal adler checksum after this */
        if (COPY) {
            __mmask64 storemask = (0xFFFFFFFFFFFFFFFFUL >> (64 - len));
            __m512i copy_vec = _mm512_maskz_loadu_epi8(storemask, src);
            _mm512_mask_storeu_epi8(dst, storemask, copy_vec);
        }

#ifdef X86_AVX2
        return adler32_avx2(adler, src, len);
#elif defined(X86_SSSE3)
        return adler32_ssse3(adler, src, len);
#else
        return adler32_len_16(adler0, src, len, adler1);
#endif
    }

    __m512i vbuf, vs1_0, vs3;

    const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
                                          38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
                                          56, 57, 58, 59, 60, 61, 62, 63, 64);
    const __m512i dot3v = _mm512_set1_epi16(1);
    const __m512i zero = _mm512_setzero_si512();
    size_t k;

    while (len >= 64) {
        __m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
        __m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
        vs1_0 = vs1;
        vs3 = _mm512_setzero_si512();

        k = MIN(len, NMAX);
        k -= k % 64;
        len -= k;

        while (k >= 64) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
            */
            vbuf = _mm512_loadu_si512(src);

            if (COPY) {
                _mm512_storeu_si512(dst, vbuf);
                dst += 64;
            }

            src += 64;
            k -= 64;

            __m512i vs1_sad = _mm512_sad_epu8(vbuf, zero);
            __m512i v_short_sum2 = _mm512_maddubs_epi16(vbuf, dot2v);
            vs1 = _mm512_add_epi32(vs1_sad, vs1);
            vs3 = _mm512_add_epi32(vs3, vs1_0);
            __m512i vsum2 = _mm512_madd_epi16(v_short_sum2, dot3v);
            vs2 = _mm512_add_epi32(vsum2, vs2);
            vs1_0 = vs1;
        }

        vs3 = _mm512_slli_epi32(vs3, 6);
        vs2 = _mm512_add_epi32(vs2, vs3);

        adler0 = partial_hsum(vs1) % BASE;
        adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
    }

    adler = adler0 | (adler1 << 16);

    /* Process tail (len < 64). */
    if (len) {
        goto rem_peel;
    }

    return adler;
}

Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    return adler32_fold_copy_impl(adler, dst, src, len, 1);
}

Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) {
    return adler32_fold_copy_impl(adler, NULL, src, len, 0);
}

#endif
@ -0,0 +1,46 @@ |
#ifndef AVX512_FUNCS_H
#define AVX512_FUNCS_H

#include <immintrin.h>
#include <stdint.h>

/* Written because *_add_epi32(a) sets off ubsan */
static inline uint32_t _mm512_reduce_add_epu32(__m512i x) {
    __m256i a = _mm512_extracti64x4_epi64(x, 1);
    __m256i b = _mm512_extracti64x4_epi64(x, 0);

    __m256i a_plus_b = _mm256_add_epi32(a, b);
    __m128i c = _mm256_extracti128_si256(a_plus_b, 1);
    __m128i d = _mm256_extracti128_si256(a_plus_b, 0);
    __m128i c_plus_d = _mm_add_epi32(c, d);

    __m128i sum1 = _mm_unpackhi_epi64(c_plus_d, c_plus_d);
    __m128i sum2 = _mm_add_epi32(sum1, c_plus_d);
    __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
    __m128i sum4 = _mm_add_epi32(sum2, sum3);

    return _mm_cvtsi128_si32(sum4);
}

static inline uint32_t partial_hsum(__m512i x) {
    /* We need a permutation vector to extract every other integer. The
     * rest are going to be zeros. Marking this const so the compiler stands
     * a better chance of keeping this resident in a register through entire
     * loop execution. We certainly have enough zmm registers (32) */
    const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
                                               1, 1, 1, 1, 1, 1, 1, 1);

    __m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x);

    /* From here, it's a simple 256 bit wide reduction sum */
    __m256i non_zero_avx = _mm512_castsi512_si256(non_zero);

    /* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
     * pretty slow, much slower than the longer instruction sequence below */
    __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
                                 _mm256_castsi256_si128(non_zero_avx));
    __m128i sum2 = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1));
    __m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
    return (uint32_t)_mm_cvtsi128_si32(sum3);
}
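
/*
 * Note (illustrative, not from the original source): the odd 32-bit lanes
 * can be dropped because _mm512_sad_epu8 zero-extends each 16-bit sum into
 * a 64-bit lane, and the subsequent 32-bit adds only ever add zero into
 * those odd lanes, so they remain zero.
 */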

#endif
@ -0,0 +1,225 @@ |
||||
/* adler32_avx512_vnni.c -- compute the Adler-32 checksum of a data stream
|
||||
* Based on Brian Bockelman's AVX2 version |
||||
* Copyright (C) 1995-2011 Mark Adler |
||||
* Authors: |
||||
* Adam Stylinski <kungfujesus06@gmail.com> |
||||
* Brian Bockelman <bockelman@gmail.com> |
||||
* For conditions of distribution and use, see copyright notice in zlib.h |
||||
*/ |
||||
|
||||
#ifdef X86_AVX512VNNI |
||||
|
||||
#include "../../zbuild.h" |
||||
#include "../../adler32_p.h" |
||||
#include "../../cpu_features.h" |
||||
#include <immintrin.h> |
||||
#include "../../adler32_fold.h" |
||||
#include "x86_intrins.h" |
||||
#include "adler32_avx512_p.h" |
||||
#include "adler32_avx2_p.h" |
||||
|
||||
Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) { |
||||
if (src == NULL) return 1L; |
||||
if (len == 0) return adler; |
||||
|
||||
uint32_t adler0, adler1; |
||||
adler1 = (adler >> 16) & 0xffff; |
    adler0 = adler & 0xffff;

rem_peel:
    if (len < 32)
#if defined(X86_SSSE3)
        return adler32_ssse3(adler, src, len);
#else
        return adler32_len_16(adler0, src, len, adler1);
#endif

    if (len < 64)
#ifdef X86_AVX2
        return adler32_avx2(adler, src, len);
#elif defined(X86_SSSE3)
        return adler32_ssse3(adler, src, len);
#else
        return adler32_len_16(adler0, src, len, adler1);
#endif

    const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
                                          38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
                                          56, 57, 58, 59, 60, 61, 62, 63, 64);

    const __m512i zero = _mm512_setzero_si512();
    __m512i vs1, vs2;

    while (len >= 64) {
        vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
        vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
        size_t k = MIN(len, NMAX);
        k -= k % 64;
        len -= k;
        __m512i vs1_0 = vs1;
        __m512i vs3 = _mm512_setzero_si512();
        /* We might get a tad bit more ILP here if we sum to a second register in the loop */
        __m512i vs2_1 = _mm512_setzero_si512();
        __m512i vbuf0, vbuf1;

        /* Remainder peeling */
        if (k % 128) {
            vbuf1 = _mm512_loadu_si512((__m512i*)src);

            src += 64;
            k -= 64;

            __m512i vs1_sad = _mm512_sad_epu8(vbuf1, zero);
            vs1 = _mm512_add_epi32(vs1, vs1_sad);
            vs3 = _mm512_add_epi32(vs3, vs1_0);
            vs2 = _mm512_dpbusd_epi32(vs2, vbuf1, dot2v);
            vs1_0 = vs1;
        }

        /* Manually unrolled this loop by 2 for a decent amount of ILP */
        while (k >= 128) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
            */
            vbuf0 = _mm512_loadu_si512((__m512i*)src);
            vbuf1 = _mm512_loadu_si512((__m512i*)(src + 64));
            src += 128;
            k -= 128;

            __m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero);
            vs1 = _mm512_add_epi32(vs1, vs1_sad);
            vs3 = _mm512_add_epi32(vs3, vs1_0);
            /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp
             * instructions to eliminate them */
            vs2 = _mm512_dpbusd_epi32(vs2, vbuf0, dot2v);

            vs3 = _mm512_add_epi32(vs3, vs1);
            vs1_sad = _mm512_sad_epu8(vbuf1, zero);
            vs1 = _mm512_add_epi32(vs1, vs1_sad);
            vs2_1 = _mm512_dpbusd_epi32(vs2_1, vbuf1, dot2v);
            vs1_0 = vs1;
        }

        vs3 = _mm512_slli_epi32(vs3, 6);
        vs2 = _mm512_add_epi32(vs2, vs3);
        vs2 = _mm512_add_epi32(vs2, vs2_1);

        adler0 = partial_hsum(vs1) % BASE;
        adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
    }

    adler = adler0 | (adler1 << 16);

    /* Process tail (len < 64). */
    if (len) {
        goto rem_peel;
    }

    return adler;
}

Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    if (src == NULL) return 1L;
    if (len == 0) return adler;

    uint32_t adler0, adler1;
    adler1 = (adler >> 16) & 0xffff;
    adler0 = adler & 0xffff;

rem_peel_copy:
    if (len < 32) {
        /* This handles the remaining copies, just call normal adler checksum after this */
        __mmask32 storemask = (0xFFFFFFFFUL >> (32 - len));
        __m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
        _mm256_mask_storeu_epi8(dst, storemask, copy_vec);

#if defined(X86_SSSE3)
        return adler32_ssse3(adler, src, len);
#else
        return adler32_len_16(adler0, src, len, adler1);
#endif
    }

    const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);

    const __m256i zero = _mm256_setzero_si256();
    __m256i vs1, vs2;

    while (len >= 32) {
        vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
        vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
        size_t k = MIN(len, NMAX);
        k -= k % 32;
        len -= k;
        __m256i vs1_0 = vs1;
        __m256i vs3 = _mm256_setzero_si256();
        /* We might get a tad bit more ILP here if we sum to a second register in the loop */
        __m256i vs2_1 = _mm256_setzero_si256();
        __m256i vbuf0, vbuf1;

        /* Remainder peeling */
        if (k % 64) {
            vbuf1 = _mm256_loadu_si256((__m256i*)src);
            _mm256_storeu_si256((__m256i*)dst, vbuf1);
            dst += 32;

            src += 32;
            k -= 32;

            __m256i vs1_sad = _mm256_sad_epu8(vbuf1, zero);
            vs1 = _mm256_add_epi32(vs1, vs1_sad);
            vs3 = _mm256_add_epi32(vs3, vs1_0);
            vs2 = _mm256_dpbusd_epi32(vs2, vbuf1, dot2v);
            vs1_0 = vs1;
        }

        /* Manually unrolled this loop by 2 for a decent amount of ILP */
        while (k >= 64) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
            */
            vbuf0 = _mm256_loadu_si256((__m256i*)src);
            vbuf1 = _mm256_loadu_si256((__m256i*)(src + 32));
            _mm256_storeu_si256((__m256i*)dst, vbuf0);
            _mm256_storeu_si256((__m256i*)(dst + 32), vbuf1);
            dst += 64;
            src += 64;
            k -= 64;

            __m256i vs1_sad = _mm256_sad_epu8(vbuf0, zero);
            vs1 = _mm256_add_epi32(vs1, vs1_sad);
            vs3 = _mm256_add_epi32(vs3, vs1_0);
            /* multiply-add, resulting in 8 ints. Fuse with sum stage from prior versions, as we now have the dp
             * instructions to eliminate them */
            vs2 = _mm256_dpbusd_epi32(vs2, vbuf0, dot2v);

            vs3 = _mm256_add_epi32(vs3, vs1);
            vs1_sad = _mm256_sad_epu8(vbuf1, zero);
            vs1 = _mm256_add_epi32(vs1, vs1_sad);
            vs2_1 = _mm256_dpbusd_epi32(vs2_1, vbuf1, dot2v);
            vs1_0 = vs1;
        }

        vs3 = _mm256_slli_epi32(vs3, 5);
        vs2 = _mm256_add_epi32(vs2, vs3);
        vs2 = _mm256_add_epi32(vs2, vs2_1);

        adler0 = partial_hsum256(vs1) % BASE;
        adler1 = hsum256(vs2) % BASE;
    }

    adler = adler0 | (adler1 << 16);

    /* Process tail (len < 32). */
    if (len) {
        goto rem_peel_copy;
    }

    return adler;
}

#endif
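For reference, the vs1/vs2 recurrences accumulated by the vector code above mirror the scalar definition of Adler-32. A minimal scalar sketch (not part of the patch; BASE is the Adler-32 modulus, as in zlib-ng):

#include <stddef.h>
#include <stdint.h>

#define BASE 65521U  /* largest prime smaller than 65536 */

/* Scalar reference: s1 accumulates the bytes, s2 accumulates the running s1.
 * The vector code computes the same sums 64 bytes at a time via psadbw/vpdpbusd. */
static uint32_t adler32_scalar_ref(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t s1 = adler & 0xffff;
    uint32_t s2 = (adler >> 16) & 0xffff;
    for (size_t i = 0; i < len; i++) {
        s1 = (s1 + buf[i]) % BASE;
        s2 = (s2 + s1) % BASE;
    }
    return s1 | (s2 << 16);
}

The real code defers the expensive modulo to once per NMAX bytes, which is the purpose of the `MIN(len, NMAX)` chunking above.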
@ -0,0 +1,121 @@
/* adler32_sse42.c -- compute the Adler-32 checksum of a data stream
 * Copyright (C) 1995-2011 Mark Adler
 * Authors:
 *   Adam Stylinski <kungfujesus06@gmail.com>
 *   Brian Bockelman <bockelman@gmail.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../adler32_fold.h"
#include "adler32_ssse3_p.h"
#include <immintrin.h>

#ifdef X86_SSE42

Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    uint32_t adler0, adler1;
    adler1 = (adler >> 16) & 0xffff;
    adler0 = adler & 0xffff;

rem_peel:
    if (len < 16) {
        return adler32_copy_len_16(adler0, src, dst, len, adler1);
    }

    __m128i vbuf, vbuf_0;
    __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
            v_sad_sum2, vsum2, vsum2_0;
    __m128i zero = _mm_setzero_si128();
    const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
    const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
    const __m128i dot3v = _mm_set1_epi16(1);
    size_t k;

    while (len >= 16) {

        k = MIN(len, NMAX);
        k -= k % 16;
        len -= k;

        vs1 = _mm_cvtsi32_si128(adler0);
        vs2 = _mm_cvtsi32_si128(adler1);

        vs3 = _mm_setzero_si128();
        vs2_0 = _mm_setzero_si128();
        vs1_0 = vs1;

        while (k >= 32) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
            */
            vbuf = _mm_loadu_si128((__m128i*)src);
            vbuf_0 = _mm_loadu_si128((__m128i*)(src + 16));
            src += 32;
            k -= 32;

            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
            v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
            _mm_storeu_si128((__m128i*)dst, vbuf);
            _mm_storeu_si128((__m128i*)(dst + 16), vbuf_0);
            dst += 32;

            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
            v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);

            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
            vs3 = _mm_add_epi32(vs1_0, vs3);

            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
            vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
            vs1 = _mm_add_epi32(v_sad_sum2, vs1);
            vs2 = _mm_add_epi32(vsum2, vs2);
            vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
            vs1_0 = vs1;
        }

        vs2 = _mm_add_epi32(vs2_0, vs2);
        vs3 = _mm_slli_epi32(vs3, 5);
        vs2 = _mm_add_epi32(vs3, vs2);
        vs3 = _mm_setzero_si128();

        while (k >= 16) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
            */
            vbuf = _mm_loadu_si128((__m128i*)src);
            src += 16;
            k -= 16;

            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);

            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
            vs3 = _mm_add_epi32(vs1_0, vs3);
            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
            vs2 = _mm_add_epi32(vsum2, vs2);
            vs1_0 = vs1;

            _mm_storeu_si128((__m128i*)dst, vbuf);
            dst += 16;
        }

        vs3 = _mm_slli_epi32(vs3, 4);
        vs2 = _mm_add_epi32(vs2, vs3);

        adler0 = partial_hsum(vs1) % BASE;
        adler1 = hsum(vs2) % BASE;
    }

    /* If this is true, there are fewer than 16 elements remaining */
    if (len) {
        goto rem_peel;
    }

    return adler0 | (adler1 << 16);
}

#endif
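The dot2v/dot2v_0 constants above encode the positional weights (32-i) and (16-i) for the weighted sum2 term. The pipeline is: _mm_maddubs_epi16 multiplies unsigned bytes by the weights and pairwise-adds to 16-bit lanes, then _mm_madd_epi16 against a vector of ones widens to 32-bit partial sums. A standalone illustration of this two-step reduction (assumed example, compiled with SSSE3 support, not part of the patch):

#include <stdint.h>
#include <stdio.h>
#include <tmmintrin.h>  /* SSSE3: _mm_maddubs_epi16 (also pulls in SSE2) */

int main(void) {
    uint8_t buf[16];
    for (int i = 0; i < 16; i++) buf[i] = (uint8_t)(i + 1);

    const __m128i weights = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
    const __m128i ones = _mm_set1_epi16(1);

    __m128i v = _mm_loadu_si128((const __m128i *)buf);
    __m128i w16 = _mm_maddubs_epi16(v, weights); /* byte*weight pairs summed to 16 bits */
    __m128i w32 = _mm_madd_epi16(w16, ones);     /* widened to four 32-bit partial sums */

    uint32_t lanes[4];
    _mm_storeu_si128((__m128i *)lanes, w32);
    uint32_t simd = lanes[0] + lanes[1] + lanes[2] + lanes[3];

    uint32_t ref = 0;
    for (int i = 0; i < 16; i++) ref += (uint32_t)(16 - i) * buf[i];

    printf("simd=%u ref=%u\n", simd, ref); /* both print 816 for this input */
    return 0;
}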
@ -0,0 +1,156 @@
/* adler32_ssse3.c -- compute the Adler-32 checksum of a data stream
 * Copyright (C) 1995-2011 Mark Adler
 * Authors:
 *   Adam Stylinski <kungfujesus06@gmail.com>
 *   Brian Bockelman <bockelman@gmail.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "adler32_ssse3_p.h"

#ifdef X86_SSSE3

#include <immintrin.h>

Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t sum2;

    /* split Adler-32 into component sums */
    sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;

    /* in case user likes doing a byte at a time, keep it fast */
    if (UNLIKELY(len == 1))
        return adler32_len_1(adler, buf, sum2);

    /* initial Adler-32 value (deferred check for len == 1 speed) */
    if (UNLIKELY(buf == NULL))
        return 1L;

    /* in case short lengths are provided, keep it somewhat fast */
    if (UNLIKELY(len < 16))
        return adler32_len_16(adler, buf, len, sum2);

    const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
    const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
    const __m128i dot3v = _mm_set1_epi16(1);
    const __m128i zero = _mm_setzero_si128();

    __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
            vbuf_0, v_sad_sum2, vsum2, vsum2_0;

    /* If our buffer is unaligned (likely), determine whether there's enough
     * of a buffer left to make the scalar, aligning additions worthwhile, or
     * whether it's better to just eat the cost of an unaligned load. The test
     * is simple: is len < 16 + (16 - remainder)? */
    size_t max_iters = NMAX;
    size_t rem = (uintptr_t)buf & 15;
    size_t align_offset = 16 - rem;
    size_t k = 0;
    if (rem) {
        if (len < 16 + align_offset) {
            /* Let's eat the cost of this one unaligned load so that
             * we don't completely skip over the vectorization. Doing
             * 16 bytes at a time unaligned is better than 16 + <= 15
             * sums */
            vbuf = _mm_loadu_si128((__m128i*)buf);
            len -= 16;
            buf += 16;
            vs1 = _mm_cvtsi32_si128(adler);
            vs2 = _mm_cvtsi32_si128(sum2);
            vs3 = _mm_setzero_si128();
            vs1_0 = vs1;
            goto unaligned_jmp;
        }

        for (size_t i = 0; i < align_offset; ++i) {
            adler += *(buf++);
            sum2 += adler;
        }

        /* lop off the max number of sums based on the scalar sums done
         * above */
        len -= align_offset;
        max_iters -= align_offset;
    }


    while (len >= 16) {
        vs1 = _mm_cvtsi32_si128(adler);
        vs2 = _mm_cvtsi32_si128(sum2);
        vs3 = _mm_setzero_si128();
        vs2_0 = _mm_setzero_si128();
        vs1_0 = vs1;

        k = (len < max_iters ? len : max_iters);
        k -= k % 16;
        len -= k;

        while (k >= 32) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
            */
            vbuf = _mm_load_si128((__m128i*)buf);
            vbuf_0 = _mm_load_si128((__m128i*)(buf + 16));
            buf += 32;
            k -= 32;

            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
            v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
            vs3 = _mm_add_epi32(vs1_0, vs3);

            vs1 = _mm_add_epi32(v_sad_sum2, vs1);
            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
            v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
            vs2 = _mm_add_epi32(vsum2, vs2);
            vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
            vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
            vs1_0 = vs1;
        }

        vs2 = _mm_add_epi32(vs2_0, vs2);
        vs3 = _mm_slli_epi32(vs3, 5);
        vs2 = _mm_add_epi32(vs3, vs2);
        vs3 = _mm_setzero_si128();

        while (k >= 16) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
            */
            vbuf = _mm_load_si128((__m128i*)buf);
            buf += 16;
            k -= 16;

unaligned_jmp:
            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
            vs3 = _mm_add_epi32(vs1_0, vs3);
            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
            vs2 = _mm_add_epi32(vsum2, vs2);
            vs1_0 = vs1;
        }

        vs3 = _mm_slli_epi32(vs3, 4);
        vs2 = _mm_add_epi32(vs2, vs3);

        /* We don't actually need to do a full horizontal sum, since psadbw is actually doing
         * a partial reduction sum implicitly and only summing to integers in vector positions
         * 0 and 2. This saves us some contention on the shuffle port(s) */
        adler = partial_hsum(vs1) % BASE;
        sum2 = hsum(vs2) % BASE;
        max_iters = NMAX;
    }

    /* Process tail (len < 16). */
    return adler32_len_16(adler, buf, len, sum2);
}

#endif
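The partial_hsum shortcut mentioned in the closing comment relies on _mm_sad_epu8 depositing its two 8-byte sums into 64-bit lanes 0 and 1, i.e. 32-bit positions 0 and 2, with the other positions zero. A small standalone check of that behavior (illustrative, not library code):

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>  /* SSE2: _mm_sad_epu8 */

int main(void) {
    uint8_t buf[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
    __m128i v = _mm_loadu_si128((const __m128i *)buf);
    __m128i sad = _mm_sad_epu8(v, _mm_setzero_si128());

    uint32_t lanes[4];
    _mm_storeu_si128((__m128i *)lanes, sad);
    /* lanes[0] = 1+...+8 = 36, lanes[2] = 9+...+16 = 100, lanes[1] = lanes[3] = 0 */
    printf("%u %u %u %u\n", lanes[0], lanes[1], lanes[2], lanes[3]);
    return 0;
}

Because positions 1 and 3 stay zero, summing lane 0 and lane 2 (one shift and one add) is a complete reduction for vs1.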
@ -0,0 +1,29 @@
/* adler32_ssse3_p.h -- adler32 ssse3 utility functions
 * Copyright (C) 2022 Adam Stylinski
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef ADLER32_SSSE3_P_H_
#define ADLER32_SSSE3_P_H_

#ifdef X86_SSSE3

#include <immintrin.h>
#include <stdint.h>

static inline uint32_t partial_hsum(__m128i x) {
    __m128i second_int = _mm_srli_si128(x, 8);
    __m128i sum = _mm_add_epi32(x, second_int);
    return _mm_cvtsi128_si32(sum);
}

static inline uint32_t hsum(__m128i x) {
    __m128i sum1 = _mm_unpackhi_epi64(x, x);
    __m128i sum2 = _mm_add_epi32(x, sum1);
    __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
    __m128i sum4 = _mm_add_epi32(sum2, sum3);
    return _mm_cvtsi128_si32(sum4);
}
#endif

#endif
@ -0,0 +1,133 @@
/* chunkset_avx2.c -- AVX2 inline functions to copy small data chunks.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#include "zbuild.h"

#ifdef X86_AVX2
#include <immintrin.h>
#include "../generic/chunk_permute_table.h"

typedef __m256i chunk_t;

#define CHUNK_SIZE 32

#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNK_MAG

/* Populate don't cares so that this is a direct lookup (with some indirection into the
 * permute table). Because dist can never be 0, 1, or 2, we start with an offset,
 * subtracting 3 from the input. */
static const lut_rem_pair perm_idx_lut[29] = {
    { 0, 2},                /* 3 */
    { 0, 0},                /* don't care */
    { 1 * 32, 2},           /* 5 */
    { 2 * 32, 2},           /* 6 */
    { 3 * 32, 4},           /* 7 */
    { 0 * 32, 0},           /* don't care */
    { 4 * 32, 5},           /* 9 */
    { 5 * 32, 22},          /* 10 */
    { 6 * 32, 21},          /* 11 */
    { 7 * 32, 20},          /* 12 */
    { 8 * 32, 6},           /* 13 */
    { 9 * 32, 4},           /* 14 */
    {10 * 32, 2},           /* 15 */
    { 0 * 32, 0},           /* don't care */
    {11 * 32, 15},          /* 17 */
    {11 * 32 + 16, 14},     /* 18 */
    {11 * 32 + 16 * 2, 13}, /* 19 */
    {11 * 32 + 16 * 3, 12}, /* 20 */
    {11 * 32 + 16 * 4, 11}, /* 21 */
    {11 * 32 + 16 * 5, 10}, /* 22 */
    {11 * 32 + 16 * 6, 9},  /* 23 */
    {11 * 32 + 16 * 7, 8},  /* 24 */
    {11 * 32 + 16 * 8, 7},  /* 25 */
    {11 * 32 + 16 * 9, 6},  /* 26 */
    {11 * 32 + 16 * 10, 5}, /* 27 */
    {11 * 32 + 16 * 11, 4}, /* 28 */
    {11 * 32 + 16 * 12, 3}, /* 29 */
    {11 * 32 + 16 * 13, 2}, /* 30 */
    {11 * 32 + 16 * 14, 1}  /* 31 */
};

static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    int16_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = _mm256_set1_epi16(tmp);
}

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    int32_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = _mm256_set1_epi32(tmp);
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    int64_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = _mm256_set1_epi64x(tmp);
}

static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    *chunk = _mm256_loadu_si256((__m256i *)s);
}

static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    _mm256_storeu_si256((__m256i *)out, *chunk);
}

static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
    __m256i ret_vec;
    /* While technically we only need to read 4 or 8 bytes into this vector register for a lot of cases, GCC is
     * compiling this to a shared load for all branches, preferring the simpler code. Given that the buf value isn't
     * in GPRs to begin with, the 256 bit load is _probably_ just as inexpensive */
    *chunk_rem = lut_rem.remval;

    /* See note in chunkset_ssse3.c for why this is ok */
    __msan_unpoison(buf + dist, 32 - dist);

    if (dist < 16) {
        /* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after
         * broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate
         * shuffles and combining the halves later */
        const __m256i permute_xform =
            _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                             16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
        __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
        __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
        perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
        ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
        ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
    } else if (dist == 16) {
        __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
        return _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
    } else {
        __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
        __m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16));
        /* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */
        __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
        __m128i xlane_permutes = _mm_cmpgt_epi8(_mm_set1_epi8(16), perm_vec1);
        __m128i xlane_res = _mm_shuffle_epi8(ret_vec0, perm_vec1);
        /* Since we can't wrap twice, we can simply keep the later half exactly how it is instead of having to _also_
         * shuffle those values */
        __m128i latter_half = _mm_blendv_epi8(ret_vec1, xlane_res, xlane_permutes);
        ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
    }

    return ret_vec;
}

#define CHUNKSIZE        chunksize_avx2
#define CHUNKCOPY        chunkcopy_avx2
#define CHUNKUNROLL      chunkunroll_avx2
#define CHUNKMEMSET      chunkmemset_avx2
#define CHUNKMEMSET_SAFE chunkmemset_safe_avx2

#include "chunkset_tpl.h"

#define INFLATE_FAST     inflate_fast_avx2

#include "inffast_tpl.h"

#endif
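What GET_CHUNK_MAG computes is easiest to see scalar-side: for a back-reference distance dist < 32, it builds a 32-byte register whose contents repeat the dist-byte pattern at buf, plus a remainder telling the caller where the next chunk picks up in the pattern. A simplified scalar sketch of the replication (hypothetical helper, not part of the patch; note the real table's remval bookkeeping is biased by a multiple of dist for some distances, so this returns only the plain modulo):

#include <stdint.h>

/* Fill out[0..31] with buf[0..dist-1] repeated; return how far into the
 * pattern the next 32-byte chunk would start. */
static uint32_t chunk_mag_scalar(uint8_t out[32], const uint8_t *buf, uint32_t dist) {
    for (uint32_t i = 0; i < 32; i++)
        out[i] = buf[i % dist];
    return 32 % dist;
}

The vector version replaces the `i % dist` indexing with one or two pshufb-style permutes driven by the precomputed permute_table entries indexed through perm_idx_lut.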
@ -0,0 +1,56 @@
/* chunkset_sse2.c -- SSE2 inline functions to copy small data chunks.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"

#ifdef X86_SSE2
#include <immintrin.h>

typedef __m128i chunk_t;

#define CHUNK_SIZE 16

#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8

static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    int16_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = _mm_set1_epi16(tmp);
}

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    int32_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = _mm_set1_epi32(tmp);
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    int64_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = _mm_set1_epi64x(tmp);
}

static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    *chunk = _mm_loadu_si128((__m128i *)s);
}

static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    _mm_storeu_si128((__m128i *)out, *chunk);
}

#define CHUNKSIZE        chunksize_sse2
#define CHUNKCOPY        chunkcopy_sse2
#define CHUNKUNROLL      chunkunroll_sse2
#define CHUNKMEMSET      chunkmemset_sse2
#define CHUNKMEMSET_SAFE chunkmemset_safe_sse2

#include "chunkset_tpl.h"

#define INFLATE_FAST     inflate_fast_sse2

#include "inffast_tpl.h"

#endif
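The chunkmemset_N helpers broadcast a short repeat period across the whole chunk register, which is how inflate's fast path materializes runs whose distance is 2, 4, or 8 bytes. A scalar equivalent for period 2 (illustrative sketch only):

#include <stdint.h>
#include <string.h>

/* Scalar analogue of chunkmemset_2 followed by storechunk: tile a 2-byte
 * pattern across a 16-byte chunk. */
static void chunkmemset_2_scalar(uint8_t *out, const uint8_t *from) {
    uint8_t pat[2];
    memcpy(pat, from, 2);            /* like the memcpy into tmp */
    for (int i = 0; i < 16; i += 2)  /* like _mm_set1_epi16 + _mm_storeu_si128 */
        memcpy(out + i, pat, 2);
}

The memcpy into a local before broadcasting also sidesteps unaligned-access and strict-aliasing pitfalls, which is why the real code does it the same way.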
@ -0,0 +1,101 @@
/* chunkset_ssse3.c -- SSSE3 inline functions to copy small data chunks.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"

/* This requires SSE2 support. While it's implicit with SSSE3, we can minimize
 * code size by sharing the chunkcopy functions, which will certainly compile
 * to identical machine code */
#if defined(X86_SSSE3) && defined(X86_SSE2)
#include <immintrin.h>
#include "../generic/chunk_permute_table.h"

typedef __m128i chunk_t;

#define CHUNK_SIZE 16

#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNK_MAG
#define HAVE_CHUNKCOPY
#define HAVE_CHUNKUNROLL

static const lut_rem_pair perm_idx_lut[13] = {
    {0, 1},      /* 3 */
    {0, 0},      /* don't care */
    {1 * 32, 1}, /* 5 */
    {2 * 32, 4}, /* 6 */
    {3 * 32, 2}, /* 7 */
    {0 * 32, 0}, /* don't care */
    {4 * 32, 7}, /* 9 */
    {5 * 32, 6}, /* 10 */
    {6 * 32, 5}, /* 11 */
    {7 * 32, 4}, /* 12 */
    {8 * 32, 3}, /* 13 */
    {9 * 32, 2}, /* 14 */
    {10 * 32, 1},/* 15 */
};


static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    int16_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = _mm_set1_epi16(tmp);
}

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    int32_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = _mm_set1_epi32(tmp);
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    int64_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = _mm_set1_epi64x(tmp);
}

static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    *chunk = _mm_loadu_si128((__m128i *)s);
}

static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    _mm_storeu_si128((__m128i *)out, *chunk);
}

static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
    __m128i perm_vec, ret_vec;
    /* Important to note:
     * This is _not_ to subvert the memory sanitizer but to instead unpoison some
     * bytes we willingly and purposefully load uninitialized that we swizzle over
     * in a vector register, anyway. If what we assume is wrong about what is used,
     * the memory sanitizer will still usefully flag it */
    __msan_unpoison(buf + dist, 16 - dist);
    ret_vec = _mm_loadu_si128((__m128i*)buf);
    *chunk_rem = lut_rem.remval;

    perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
    ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);

    return ret_vec;
}

extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);

#define CHUNKSIZE        chunksize_ssse3
#define CHUNKMEMSET      chunkmemset_ssse3
#define CHUNKMEMSET_SAFE chunkmemset_safe_ssse3
#define CHUNKCOPY        chunkcopy_sse2
#define CHUNKUNROLL      chunkunroll_sse2

#include "chunkset_tpl.h"

#define INFLATE_FAST     inflate_fast_ssse3

#include "inffast_tpl.h"

#endif
@ -0,0 +1,63 @@
/* compare256_avx2.c -- AVX2 version of compare256
 * Copyright Mika T. Lindqvist <postmaster@raasu.org>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"

#include "fallback_builtins.h"

#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)

#include <immintrin.h>
#ifdef _MSC_VER
#  include <nmmintrin.h>
#endif

static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;

    do {
        __m256i ymm_src0, ymm_src1, ymm_cmp;
        ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
        ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
        ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
        unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
        if (mask != 0xFFFFFFFF) {
            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */
            return len + match_byte;
        }

        src0 += 32, src1 += 32, len += 32;

        ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
        ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
        ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1);
        mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
        if (mask != 0xFFFFFFFF) {
            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
            return len + match_byte;
        }

        src0 += 32, src1 += 32, len += 32;
    } while (len < 256);

    return 256;
}

Z_INTERNAL uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1) {
    return compare256_avx2_static(src0, src1);
}

#define LONGEST_MATCH       longest_match_avx2
#define COMPARE256          compare256_avx2_static

#include "match_tpl.h"

#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH       longest_match_slow_avx2
#define COMPARE256          compare256_avx2_static

#include "match_tpl.h"

#endif
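A quick way to sanity-check any compare256 variant is to build two 256-byte buffers that first differ at a known index and confirm the return value. A hedged test sketch (assumes compare256_avx2 is linked in and the CPU supports AVX2):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

extern uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);

int main(void) {
    uint8_t a[256], b[256];
    memset(a, 0x5a, sizeof(a));
    memcpy(b, a, sizeof(b));
    b[100] ^= 1;                            /* first mismatch at index 100 */
    printf("%u\n", compare256_avx2(a, b));  /* expected: 100 */
    return 0;
}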
@ -0,0 +1,96 @@
/* compare256_sse2.c -- SSE2 version of compare256
 * Copyright Adam Stylinski <kungfujesus06@gmail.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"

#include "fallback_builtins.h"

#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)

#include <emmintrin.h>

static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;
    int align_offset = ((uintptr_t)src0) & 15;
    const uint8_t *end0 = src0 + 256;
    const uint8_t *end1 = src1 + 256;
    __m128i xmm_src0, xmm_src1, xmm_cmp;

    /* Do the first load unaligned, then for all subsequent iterations we have
     * at least one aligned load. Sadly aligning both loads is probably unrealistic */
    xmm_src0 = _mm_loadu_si128((__m128i*)src0);
    xmm_src1 = _mm_loadu_si128((__m128i*)src1);
    xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);

    unsigned mask = (unsigned)_mm_movemask_epi8(xmm_cmp);

    /* Compiler _may_ turn this branch into a ptest + movemask,
     * since a lot of those uops are shared and fused */
    if (mask != 0xFFFF) {
        uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
        return len + match_byte;
    }

    int align_adv = 16 - align_offset;
    len += align_adv;
    src0 += align_adv;
    src1 += align_adv;

    /* Do a flooring division (should just be a shift right) */
    int num_iter = (256 - len) / 16;

    for (int i = 0; i < num_iter; ++i) {
        xmm_src0 = _mm_load_si128((__m128i*)src0);
        xmm_src1 = _mm_loadu_si128((__m128i*)src1);
        xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);

        mask = (unsigned)_mm_movemask_epi8(xmm_cmp);

        /* Compiler _may_ turn this branch into a ptest + movemask,
         * since a lot of those uops are shared and fused */
        if (mask != 0xFFFF) {
            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
            return len + match_byte;
        }

        len += 16, src0 += 16, src1 += 16;
    }

    if (align_offset) {
        src0 = end0 - 16;
        src1 = end1 - 16;
        len = 256 - 16;

        xmm_src0 = _mm_loadu_si128((__m128i*)src0);
        xmm_src1 = _mm_loadu_si128((__m128i*)src1);
        xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);

        mask = (unsigned)_mm_movemask_epi8(xmm_cmp);

        if (mask != 0xFFFF) {
            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
            return len + match_byte;
        }
    }

    return 256;
}

Z_INTERNAL uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1) {
    return compare256_sse2_static(src0, src1);
}

#define LONGEST_MATCH       longest_match_sse2
#define COMPARE256          compare256_sse2_static

#include "match_tpl.h"

#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH       longest_match_slow_sse2
#define COMPARE256          compare256_sse2_static

#include "match_tpl.h"

#endif
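Note the tail handling above: when the input was unaligned, the function re-reads the final 16 bytes unaligned rather than falling back to a byte loop. Since both buffers are known to span 256 bytes, overlapping the last vector with already-compared bytes is harmless and cheaper. The same overlap idea in scalar form (illustrative sketch, not library code):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Copy len >= 8 bytes in 8-byte blocks; the last block overlaps the previous
 * one instead of degenerating into a byte-at-a-time tail. */
static void copy_overlapped_tail(uint8_t *dst, const uint8_t *src, size_t len) {
    size_t i;
    for (i = 0; i + 8 <= len; i += 8)
        memcpy(dst + i, src + i, 8);
    if (i < len)  /* redo the final, overlapping block */
        memcpy(dst + len - 8, src + len - 8, 8);
}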
@ -0,0 +1,186 @@
/*
 * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
 * instruction.
 *
 * A white paper describing this algorithm can be found at:
 *     doc/crc-pclmulqdq.pdf
 *
 * Copyright (C) 2013 Intel Corporation. All rights reserved.
 * Copyright (C) 2016 Marian Beermann (support for initial value)
 * Authors:
 *     Wajdi Feghali   <wajdi.k.feghali@intel.com>
 *     Jim Guilford    <james.guilford@intel.com>
 *     Vinodh Gopal    <vinodh.gopal@intel.com>
 *     Erdinc Ozturk   <erdinc.ozturk@intel.com>
 *     Jim Kukunas     <james.t.kukunas@linux.intel.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef COPY
Z_INTERNAL void CRC32_FOLD_COPY(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
#else
Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
#endif
    unsigned long algn_diff;
    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
    __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
    __m128i xmm_crc_part = _mm_setzero_si128();
#ifdef COPY
    char ALIGNED_(16) partial_buf[16] = { 0 };
#else
    __m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
    int32_t first = init_crc != 0;

    /* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31
     * bytes of input is needed for the aligning load that occurs. If there's an initial CRC, to
     * carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which
     * by definition can be up to 15 bytes + one full vector load. */
    assert(len >= 31 || first == 0);
#endif
    crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

    if (len < 16) {
#ifdef COPY
        if (len == 0)
            return;

        memcpy(partial_buf, src, len);
        xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
        memcpy(dst, partial_buf, len);
#endif
        goto partial;
    }

    algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
    if (algn_diff) {
        xmm_crc_part = _mm_loadu_si128((__m128i *)src);
#ifdef COPY
        _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
        dst += algn_diff;
#else
        XOR_INITIAL128(xmm_crc_part);

        if (algn_diff < 4 && init_crc != 0) {
            xmm_t0 = xmm_crc_part;
            xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
            fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
            xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
            src += 16;
            len -= 16;
        }
#endif

        partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);

        src += algn_diff;
        len -= algn_diff;
    }

#ifdef X86_VPCLMULQDQ
    if (len >= 256) {
#ifdef COPY
        size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
        dst += n;
#else
        size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len,
            xmm_initial, first);
        first = 0;
#endif
        len -= n;
        src += n;
    }
#endif

    while (len >= 64) {
        len -= 64;
        xmm_t0 = _mm_load_si128((__m128i *)src);
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
        xmm_t3 = _mm_load_si128((__m128i *)src + 3);
        src += 64;

        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
#ifdef COPY
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
        _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
        dst += 64;
#else
        XOR_INITIAL128(xmm_t0);
#endif

        xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
    }

    /*
     * len = num bytes left - 64
     */
    if (len >= 48) {
        len -= 48;

        xmm_t0 = _mm_load_si128((__m128i *)src);
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
        src += 48;
#ifdef COPY
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
        dst += 48;
#else
        XOR_INITIAL128(xmm_t0);
#endif
        fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
    } else if (len >= 32) {
        len -= 32;

        xmm_t0 = _mm_load_si128((__m128i *)src);
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
        src += 32;
#ifdef COPY
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
        dst += 32;
#else
        XOR_INITIAL128(xmm_t0);
#endif
        fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
    } else if (len >= 16) {
        len -= 16;
        xmm_t0 = _mm_load_si128((__m128i *)src);
        src += 16;
#ifdef COPY
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
        dst += 16;
#else
        XOR_INITIAL128(xmm_t0);
#endif
        fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
    }

partial:
    if (len) {
        memcpy(&xmm_crc_part, src, len);
#ifdef COPY
        _mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);
        memcpy(dst, partial_buf, len);
#endif
        partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
    }

    crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
}
@ -0,0 +1,107 @@
/* crc32_fold_vpclmulqdq_tpl.h -- VPCLMULQDQ-based CRC32 folding template.
 * Copyright Wangyang Guo (wangyang.guo@intel.com)
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef COPY
static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
    __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) {
#else
static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
    __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len,
    __m128i init_crc, int32_t first) {
    __m512i zmm_initial = _mm512_zextsi128_si512(init_crc);
#endif
    __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
    __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
    __m512i z0, z1, z2, z3;
    size_t len_tmp = len;
    const __m512i zmm_fold4 = _mm512_set4_epi32(
        0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
    const __m512i zmm_fold16 = _mm512_set4_epi32(
        0x00000001, 0x1542778a, 0x00000001, 0x322d1430);

    // zmm register init
    zmm_crc0 = _mm512_setzero_si512();
    zmm_t0 = _mm512_loadu_si512((__m512i *)src);
#ifndef COPY
    XOR_INITIAL512(zmm_t0);
#endif
    zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
    zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
    zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);

    /* already have intermediate CRC in xmm registers
     * fold4 with 4 xmm_crc to get zmm_crc0
     */
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);

#ifdef COPY
    _mm512_storeu_si512((__m512i *)dst, zmm_t0);
    _mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1);
    _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2);
    _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3);
    dst += 256;
#endif
    len -= 256;
    src += 256;

    // fold-16 loops
    while (len >= 256) {
        zmm_t0 = _mm512_loadu_si512((__m512i *)src);
        zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
        zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
        zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);

        z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
        z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
        z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
        z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);

        zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
        zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
        zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
        zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);

        zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
        zmm_crc1 = _mm512_ternarylogic_epi32(zmm_crc1, z1, zmm_t1, 0x96);
        zmm_crc2 = _mm512_ternarylogic_epi32(zmm_crc2, z2, zmm_t2, 0x96);
        zmm_crc3 = _mm512_ternarylogic_epi32(zmm_crc3, z3, zmm_t3, 0x96);

#ifdef COPY
        _mm512_storeu_si512((__m512i *)dst, zmm_t0);
        _mm512_storeu_si512((__m512i *)dst + 1, zmm_t1);
        _mm512_storeu_si512((__m512i *)dst + 2, zmm_t2);
        _mm512_storeu_si512((__m512i *)dst + 3, zmm_t3);
        dst += 256;
#endif
        len -= 256;
        src += 256;
    }
    // zmm_crc[0,1,2,3] -> zmm_crc0
    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc1, 0x96);

    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc2, 0x96);

    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc3, 0x96);

    // zmm_crc0 -> xmm_crc[0, 1, 2, 3]
    *xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
    *xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
    *xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
    *xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);

    return (len_tmp - len);  // return n bytes processed
}
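The fold loop leans on _mm512_ternarylogic_epi32 with immediate 0x96, which is exactly a three-way XOR (a ^ b ^ c), merging the two carry-less products with the freshly loaded data in one instruction. The truth-table reading of that immediate, as a plain C check (standalone illustration, not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* imm 0x96: for each bit position, the result bit is bit (4a + 2b + c) of
 * the immediate. 0x96 = 0b10010110, the truth table of a ^ b ^ c. */
static uint32_t ternlog_0x96(uint32_t a, uint32_t b, uint32_t c) {
    uint32_t r = 0;
    for (int i = 0; i < 32; i++) {
        unsigned idx = (((a >> i) & 1u) << 2) | (((b >> i) & 1u) << 1) | ((c >> i) & 1u);
        r |= (uint32_t)((0x96u >> idx) & 1u) << i;
    }
    return r;
}

int main(void) {
    uint32_t a = 0x12345678, b = 0x9abcdef0, c = 0x0f0f0f0f;
    printf("%d\n", ternlog_0x96(a, b, c) == (a ^ b ^ c)); /* prints 1 */
    return 0;
}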
@ -0,0 +1,30 @@
/*
 * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
 * instruction.
 *
 * A white paper describing this algorithm can be found at:
 *     doc/crc-pclmulqdq.pdf
 *
 * Copyright (C) 2013 Intel Corporation. All rights reserved.
 * Copyright (C) 2016 Marian Beermann (support for initial value)
 * Authors:
 *     Wajdi Feghali   <wajdi.k.feghali@intel.com>
 *     Jim Guilford    <james.guilford@intel.com>
 *     Vinodh Gopal    <vinodh.gopal@intel.com>
 *     Erdinc Ozturk   <erdinc.ozturk@intel.com>
 *     Jim Kukunas     <james.t.kukunas@linux.intel.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef X86_PCLMULQDQ_CRC

#define CRC32_FOLD_COPY  crc32_fold_pclmulqdq_copy
#define CRC32_FOLD       crc32_fold_pclmulqdq
#define CRC32_FOLD_RESET crc32_fold_pclmulqdq_reset
#define CRC32_FOLD_FINAL crc32_fold_pclmulqdq_final
#define CRC32            crc32_pclmulqdq

#include "crc32_pclmulqdq_tpl.h"

#endif
@ -0,0 +1,363 @@
/*
 * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
 * instruction.
 *
 * A white paper describing this algorithm can be found at:
 *     doc/crc-pclmulqdq.pdf
 *
 * Copyright (C) 2013 Intel Corporation. All rights reserved.
 * Copyright (C) 2016 Marian Beermann (support for initial value)
 * Authors:
 *     Wajdi Feghali   <wajdi.k.feghali@intel.com>
 *     Jim Guilford    <james.guilford@intel.com>
 *     Vinodh Gopal    <vinodh.gopal@intel.com>
 *     Erdinc Ozturk   <erdinc.ozturk@intel.com>
 *     Jim Kukunas     <james.t.kukunas@linux.intel.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"

#include <immintrin.h>
#include <wmmintrin.h>
#include <smmintrin.h> // _mm_extract_epi32
#ifdef X86_VPCLMULQDQ
#  include <immintrin.h>
#endif

#include "../../crc32_fold.h"
#include "../../crc32_braid_p.h"
#include "x86_intrins.h"
#include <assert.h>

#ifdef X86_VPCLMULQDQ
static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
    __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len, __m128i init_crc,
    int32_t first);
static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
    __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len);
#endif

static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
    const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4,
                                            0x00000001, 0xc6e41596);
    __m128i x_tmp3;
    __m128 ps_crc0, ps_crc3, ps_res;

    x_tmp3 = *xmm_crc3;

    *xmm_crc3 = *xmm_crc0;
    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    ps_res = _mm_xor_ps(ps_crc0, ps_crc3);

    *xmm_crc0 = *xmm_crc1;
    *xmm_crc1 = *xmm_crc2;
    *xmm_crc2 = x_tmp3;
    *xmm_crc3 = _mm_castps_si128(ps_res);
}

static void fold_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
    const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4,
                                            0x00000001, 0xc6e41596);
    __m128i x_tmp3, x_tmp2;
    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;

    x_tmp3 = *xmm_crc3;
    x_tmp2 = *xmm_crc2;

    *xmm_crc3 = *xmm_crc1;
    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
    ps_res31 = _mm_xor_ps(ps_crc3, ps_crc1);

    *xmm_crc2 = *xmm_crc0;
    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
    ps_res20 = _mm_xor_ps(ps_crc0, ps_crc2);

    *xmm_crc0 = x_tmp2;
    *xmm_crc1 = x_tmp3;
    *xmm_crc2 = _mm_castps_si128(ps_res20);
    *xmm_crc3 = _mm_castps_si128(ps_res31);
}

static void fold_3(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
    const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4,
                                            0x00000001, 0xc6e41596);
    __m128i x_tmp3;
    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;

    x_tmp3 = *xmm_crc3;

    *xmm_crc3 = *xmm_crc2;
    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);

    *xmm_crc2 = *xmm_crc1;
    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
    ps_res21 = _mm_xor_ps(ps_crc1, ps_crc2);

    *xmm_crc1 = *xmm_crc0;
    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
    ps_res10 = _mm_xor_ps(ps_crc0, ps_crc1);

    *xmm_crc0 = x_tmp3;
    *xmm_crc1 = _mm_castps_si128(ps_res10);
    *xmm_crc2 = _mm_castps_si128(ps_res21);
    *xmm_crc3 = _mm_castps_si128(ps_res32);
}

static void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
    const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4,
                                            0x00000001, 0xc6e41596);
    __m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
    __m128 ps_t0, ps_t1, ps_t2, ps_t3;
    __m128 ps_res0, ps_res1, ps_res2, ps_res3;

    x_tmp0 = *xmm_crc0;
    x_tmp1 = *xmm_crc1;
    x_tmp2 = *xmm_crc2;
    x_tmp3 = *xmm_crc3;

    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
    ps_t0 = _mm_castsi128_ps(x_tmp0);
    ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);

    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
    x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
    ps_t1 = _mm_castsi128_ps(x_tmp1);
    ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);

    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
    x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
    ps_t2 = _mm_castsi128_ps(x_tmp2);
    ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);

    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
    x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    ps_t3 = _mm_castsi128_ps(x_tmp3);
    ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);

    *xmm_crc0 = _mm_castps_si128(ps_res0);
    *xmm_crc1 = _mm_castps_si128(ps_res1);
    *xmm_crc2 = _mm_castps_si128(ps_res2);
    *xmm_crc3 = _mm_castps_si128(ps_res3);
}

static const unsigned ALIGNED_(32) pshufb_shf_table[60] = {
    0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1  */
    0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 2)/shr2  */
    0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f, /* shl 13 (16 - 3)/shr3  */
    0x87868584, 0x8b8a8988, 0x8f8e8d8c, 0x03020100, /* shl 12 (16 - 4)/shr4  */
    0x88878685, 0x8c8b8a89, 0x008f8e8d, 0x04030201, /* shl 11 (16 - 5)/shr5  */
    0x89888786, 0x8d8c8b8a, 0x01008f8e, 0x05040302, /* shl 10 (16 - 6)/shr6  */
    0x8a898887, 0x8e8d8c8b, 0x0201008f, 0x06050403, /* shl  9 (16 - 7)/shr7  */
    0x8b8a8988, 0x8f8e8d8c, 0x03020100, 0x07060504, /* shl  8 (16 - 8)/shr8  */
    0x8c8b8a89, 0x008f8e8d, 0x04030201, 0x08070605, /* shl  7 (16 - 9)/shr9  */
    0x8d8c8b8a, 0x01008f8e, 0x05040302, 0x09080706, /* shl  6 (16 -10)/shr10 */
    0x8e8d8c8b, 0x0201008f, 0x06050403, 0x0a090807, /* shl  5 (16 -11)/shr11 */
    0x8f8e8d8c, 0x03020100, 0x07060504, 0x0b0a0908, /* shl  4 (16 -12)/shr12 */
    0x008f8e8d, 0x04030201, 0x08070605, 0x0c0b0a09, /* shl  3 (16 -13)/shr13 */
    0x01008f8e, 0x05040302, 0x09080706, 0x0d0c0b0a, /* shl  2 (16 -14)/shr14 */
    0x0201008f, 0x06050403, 0x0a090807, 0x0e0d0c0b  /* shl  1 (16 -15)/shr15 */
};

static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2,
                         __m128i *xmm_crc3, __m128i *xmm_crc_part) {
    const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4,
                                            0x00000001, 0xc6e41596);
    const __m128i xmm_mask3 = _mm_set1_epi32((int32_t)0x80808080);

    __m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
    __m128i xmm_a0_0, xmm_a0_1;
    __m128 ps_crc3, psa0_0, psa0_1, ps_res;

    xmm_shl = _mm_load_si128((__m128i *)(pshufb_shf_table + (4 * (len - 1))));
    xmm_shr = xmm_shl;
    xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);

    xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);

    *xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
    xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
    *xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);

    *xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
    xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
    *xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);

    *xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
    xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
    *xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);

    *xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
    *xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
    *xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);

    xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
    xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);

    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    psa0_0 = _mm_castsi128_ps(xmm_a0_0);
    psa0_1 = _mm_castsi128_ps(xmm_a0_1);

    ps_res = _mm_xor_ps(ps_crc3, psa0_0);
    ps_res = _mm_xor_ps(ps_res, psa0_1);

    *xmm_crc3 = _mm_castps_si128(ps_res);
}

static inline void crc32_fold_load(__m128i *fold, __m128i *fold0, __m128i *fold1, __m128i *fold2, __m128i *fold3) {
    *fold0 = _mm_load_si128(fold + 0);
    *fold1 = _mm_load_si128(fold + 1);
    *fold2 = _mm_load_si128(fold + 2);
    *fold3 = _mm_load_si128(fold + 3);
}

static inline void crc32_fold_save(__m128i *fold, const __m128i *fold0, const __m128i *fold1,
                                   const __m128i *fold2, const __m128i *fold3) {
    _mm_storeu_si128(fold + 0, *fold0);
    _mm_storeu_si128(fold + 1, *fold1);
    _mm_storeu_si128(fold + 2, *fold2);
    _mm_storeu_si128(fold + 3, *fold3);
}

Z_INTERNAL uint32_t CRC32_FOLD_RESET(crc32_fold *crc) {
    __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
    __m128i xmm_zero = _mm_setzero_si128();
    crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_zero, &xmm_zero, &xmm_zero);
    return 0;
}

#define ONCE(op) if (first) { first = 0; op; }
#define XOR_INITIAL128(where) ONCE(where = _mm_xor_si128(where, xmm_initial))
#ifdef X86_VPCLMULQDQ
#  define XOR_INITIAL512(where) ONCE(where = _mm512_xor_si512(where, zmm_initial))
#endif

#ifdef X86_VPCLMULQDQ
#  include "crc32_fold_vpclmulqdq_tpl.h"
#endif
#include "crc32_fold_pclmulqdq_tpl.h"
#define COPY
#ifdef X86_VPCLMULQDQ
#  include "crc32_fold_vpclmulqdq_tpl.h"
#endif
#include "crc32_fold_pclmulqdq_tpl.h"

static const unsigned ALIGNED_(16) crc_k[] = {
    0xccaa009e, 0x00000000, /* rk1 */
    0x751997d0, 0x00000001, /* rk2 */
    0xccaa009e, 0x00000000, /* rk5 */
    0x63cd6124, 0x00000001, /* rk6 */
    0xf7011640, 0x00000001, /* rk7 */
    0xdb710640, 0x00000001  /* rk8 */
};

static const unsigned ALIGNED_(16) crc_mask[4] = {
    0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
};

static const unsigned ALIGNED_(16) crc_mask2[4] = {
    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
};

Z_INTERNAL uint32_t CRC32_FOLD_FINAL(crc32_fold *crc) {
    const __m128i xmm_mask  = _mm_load_si128((__m128i *)crc_mask);
    const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
    __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
    __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;

    crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

    /*
     * k1
     */
    crc_fold = _mm_load_si128((__m128i *)crc_k);

    x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
    xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
    xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
    xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);

    x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
    xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
    xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
    xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);

    x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
    xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);

    /*
     * k5
     */
    crc_fold = _mm_load_si128((__m128i *)(crc_k + 4));

    xmm_crc0 = xmm_crc3;
    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
    xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);

    xmm_crc0 = xmm_crc3;
    xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
    xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);

    /*
     * k7
     */
    xmm_crc1 = xmm_crc3;
    xmm_crc2 = xmm_crc3;
    crc_fold = _mm_load_si128((__m128i *)(crc_k + 8));

    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
    xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);

    xmm_crc2 = xmm_crc3;
    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);

    crc->value = ~((uint32_t)_mm_extract_epi32(xmm_crc3, 2));

    return crc->value;
}

Z_INTERNAL uint32_t CRC32(uint32_t crc32, const uint8_t *buf, size_t len) {
    /* For lens < 64, the crc32_braid method is faster. The CRC32 instruction for
     * these short lengths might also prove to be effective */
    if (len < 64)
        return PREFIX(crc32_braid)(crc32, buf, len);

    crc32_fold ALIGNED_(16) crc_state;
    CRC32_FOLD_RESET(&crc_state);
    CRC32_FOLD(&crc_state, buf, len, crc32);
    return CRC32_FOLD_FINAL(&crc_state);
}
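When validating the folded implementation, it helps to have the plain bit-serial CRC-32 (reflected polynomial 0xEDB88320, the same CRC zlib computes) on hand. A minimal reference, written as an assumed standalone check rather than part of the patch:

#include <stddef.h>
#include <stdint.h>

/* Bit-serial CRC-32 (IEEE, reflected). Matches zlib's crc32() output;
 * useful as a ground truth when testing the pclmulqdq path. */
static uint32_t crc32_bitwise(uint32_t crc, const uint8_t *buf, size_t len) {
    crc = ~crc;
    for (size_t i = 0; i < len; i++) {
        crc ^= buf[i];
        for (int k = 0; k < 8; k++)
            crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1u)));
    }
    return ~crc;
}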
@ -0,0 +1,17 @@
/* crc32_vpclmulqdq.c -- VPCLMULQDQ-based CRC32 folding implementation.
 * Copyright Wangyang Guo (wangyang.guo@intel.com)
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)

#define X86_VPCLMULQDQ
#define CRC32_FOLD_COPY  crc32_fold_vpclmulqdq_copy
#define CRC32_FOLD       crc32_fold_vpclmulqdq
#define CRC32_FOLD_RESET crc32_fold_vpclmulqdq_reset
#define CRC32_FOLD_FINAL crc32_fold_vpclmulqdq_final
#define CRC32            crc32_vpclmulqdq

#include "crc32_pclmulqdq_tpl.h"

#endif
@ -0,0 +1,24 @@
/* insert_string_sse42.c -- insert_string integer hash variant using SSE4.2's CRC instructions
 *
 * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 */

#ifdef X86_SSE42
#include "../../zbuild.h"
#include <nmmintrin.h>
#include "../../deflate.h"

#define HASH_CALC(s, h, val)\
    h = _mm_crc32_u32(h, val)

#define HASH_CALC_VAR h
#define HASH_CALC_VAR_INIT uint32_t h = 0

#define UPDATE_HASH update_hash_sse42
#define INSERT_STRING insert_string_sse42
#define QUICK_INSERT_STRING quick_insert_string_sse42

#include "../../insert_string_tpl.h"
#endif
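
/* Illustrative sketch (not part of the template): HASH_CALC above folds the
 * next 4 window bytes into the running hash with a single hardware CRC32
 * step. A hypothetical standalone equivalent would be: */
#if 0
static inline uint32_t hash4_sse42(const uint8_t *p) {
    uint32_t val;
    memcpy(&val, p, sizeof(val));   /* read 4 bytes at the insert position */
    return _mm_crc32_u32(0, val);   /* one SSE4.2 CRC step mixes them */
}
#endif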
@ -0,0 +1,39 @@
/*
 * AVX2 optimized hash slide, based on Intel's slide_sse implementation
 *
 * Copyright (C) 2017 Intel Corporation
 * Authors:
 *   Arjan van de Ven   <arjan@linux.intel.com>
 *   Jim Kukunas        <james.t.kukunas@linux.intel.com>
 *   Mika T. Lindqvist  <postmaster@raasu.org>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#include "../../zbuild.h"
#include "../../deflate.h"

#include <immintrin.h>

static inline void slide_hash_chain(Pos *table, uint32_t entries, const __m256i wsize) {
    table += entries;
    table -= 16;

    do {
        __m256i value, result;

        value = _mm256_loadu_si256((__m256i *)table);
        result = _mm256_subs_epu16(value, wsize);
        _mm256_storeu_si256((__m256i *)table, result);

        table -= 16;
        entries -= 16;
    } while (entries > 0);
}
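
/* Scalar equivalent of the loop above (illustrative): each table entry has
 * wsize subtracted with saturation, so positions that fall out of the sliding
 * window collapse to zero instead of wrapping around:
 *
 *   for (uint32_t i = 0; i < entries; i++)
 *       table[i] = (table[i] > wsize) ? (Pos)(table[i] - wsize) : 0;
 */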

Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
    uint16_t wsize = (uint16_t)s->w_size;
    const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);

    slide_hash_chain(s->head, HASH_SIZE, ymm_wsize);
    slide_hash_chain(s->prev, wsize, ymm_wsize);
}
@ -0,0 +1,62 @@
/*
 * SSE optimized hash slide
 *
 * Copyright (C) 2017 Intel Corporation
 * Authors:
 *   Arjan van de Ven    <arjan@linux.intel.com>
 *   Jim Kukunas         <james.t.kukunas@linux.intel.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#include "../../zbuild.h"
#include "../../deflate.h"

#include <immintrin.h>
#include <assert.h>

static inline void slide_hash_chain(Pos *table0, Pos *table1, uint32_t entries0,
                                    uint32_t entries1, const __m128i wsize) {
    uint32_t entries;
    Pos *table;
    __m128i value0, value1, result0, result1;

    int on_chain = 0;

next_chain:
    table = (on_chain) ? table1 : table0;
    entries = (on_chain) ? entries1 : entries0;

    table += entries;
    table -= 16;

    /* ZALLOC allocates this pointer unless the user chose a custom allocator.
     * Our alloc function is aligned to 64 byte boundaries */
    do {
        value0 = _mm_load_si128((__m128i *)table);
        value1 = _mm_load_si128((__m128i *)(table + 8));
        result0 = _mm_subs_epu16(value0, wsize);
        result1 = _mm_subs_epu16(value1, wsize);
        _mm_store_si128((__m128i *)table, result0);
        _mm_store_si128((__m128i *)(table + 8), result1);

        table -= 16;
        entries -= 16;
    } while (entries > 0);

    ++on_chain;
    if (on_chain > 1) {
        return;
    } else {
        goto next_chain;
    }
}

Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
    uint16_t wsize = (uint16_t)s->w_size;
    const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);

    assert(((uintptr_t)s->head & 15) == 0);
    assert(((uintptr_t)s->prev & 15) == 0);

    slide_hash_chain(s->head, s->prev, HASH_SIZE, wsize, xmm_wsize);
}
@ -0,0 +1,97 @@
/* x86_features.c - x86 feature check
 *
 * Copyright (C) 2013 Intel Corporation. All rights reserved.
 * Author:
 *  Jim Kukunas
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"
#include "x86_features.h"

#ifdef _MSC_VER
#  include <intrin.h>
#else
// Newer versions of GCC and clang come with cpuid.h
#  include <cpuid.h>
#endif

#include <string.h>

static inline void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#ifdef _MSC_VER
    unsigned int registers[4];
    __cpuid((int *)registers, info);

    *eax = registers[0];
    *ebx = registers[1];
    *ecx = registers[2];
    *edx = registers[3];
#else
    __cpuid(info, *eax, *ebx, *ecx, *edx);
#endif
}

static inline void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#ifdef _MSC_VER
    unsigned int registers[4];
    __cpuidex((int *)registers, info, subinfo);

    *eax = registers[0];
    *ebx = registers[1];
    *ecx = registers[2];
    *edx = registers[3];
#else
    __cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx);
#endif
}

static inline uint64_t xgetbv(unsigned int xcr) {
#ifdef _MSC_VER
    return _xgetbv(xcr);
#else
    uint32_t eax, edx;
    __asm__ ( ".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(xcr));
    return (uint64_t)(edx) << 32 | eax;
#endif
}

void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
    unsigned eax, ebx, ecx, edx;
    unsigned maxbasic;

    cpuid(0, &maxbasic, &ebx, &ecx, &edx);
    cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);

    features->has_sse2 = edx & 0x4000000;
    features->has_ssse3 = ecx & 0x200;
    features->has_sse42 = ecx & 0x100000;
    features->has_pclmulqdq = ecx & 0x2;

    if (ecx & 0x08000000) {
        uint64_t xfeature = xgetbv(0);
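        /* XCR0 bit 1 = XMM state, bit 2 = YMM state, bits 5-7 = opmask/ZMM
         * state; 0x06 means the OS saves AVX registers across context
         * switches, 0xe6 that it saves AVX-512 state as well. */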
        features->has_os_save_ymm = ((xfeature & 0x06) == 0x06);
        features->has_os_save_zmm = ((xfeature & 0xe6) == 0xe6);
    }

    if (maxbasic >= 7) {
        cpuidex(7, 0, &eax, &ebx, &ecx, &edx);

        // check VPCLMULQDQ bit (leaf 7, ECX bit 10)
        // Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
        features->has_vpclmulqdq = ecx & 0x400;

        // check AVX2 bit if the OS supports saving YMM registers
        if (features->has_os_save_ymm) {
            features->has_avx2 = ebx & 0x20;
        }

        // check AVX512 bits if the OS supports saving ZMM registers
        if (features->has_os_save_zmm) {
            features->has_avx512 = ebx & 0x00010000;
            features->has_avx512vnni = ecx & 0x800;
        }
    }
}
@ -0,0 +1,24 @@
/* x86_features.h -- check for CPU features
 * Copyright (C) 2013 Intel Corporation Jim Kukunas
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef X86_FEATURES_H_
#define X86_FEATURES_H_

struct x86_cpu_features {
    int has_avx2;
    int has_avx512;
    int has_avx512vnni;
    int has_sse2;
    int has_ssse3;
    int has_sse42;
    int has_pclmulqdq;
    int has_vpclmulqdq;
    int has_os_save_ymm;
    int has_os_save_zmm;
};

void Z_INTERNAL x86_check_features(struct x86_cpu_features *features);

#endif /* X86_FEATURES_H_ */
@ -0,0 +1,87 @@
#ifndef X86_INTRINS_H
#define X86_INTRINS_H

/* Unfortunately GCC didn't support these things until version 10.
 * Similarly, AppleClang didn't support them in Xcode 9.2 but did in 9.3.
 */
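/* Note: the zero-extending casts must clear the upper bits of the wider
 * register, while a plain _mm256_castsi128_si256 leaves them undefined.
 * The fallbacks below route through a VEX-encoded vmovdqa, which is
 * defined to zero-extend. */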
#ifdef __AVX2__
#include <immintrin.h>

#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10) \
    || (defined(__apple_build_version__) && __apple_build_version__ < 9020039)
static inline __m256i _mm256_zextsi128_si256(__m128i a) {
    __m128i r;
    __asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
    return _mm256_castsi128_si256(r);
}

#ifdef __AVX512F__
static inline __m512i _mm512_zextsi128_si512(__m128i a) {
    __m128i r;
    __asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
    return _mm512_castsi128_si512(r);
}
#endif // __AVX512F__
#endif // gcc/AppleClang version test

#endif // __AVX2__

/* GCC <9 is missing some AVX512 intrinsics.
 */
#ifdef __AVX512F__
#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 9)
#include <immintrin.h>

#define PACK(c0, c1, c2, c3) (((int)(unsigned char)(c0) << 24) | ((int)(unsigned char)(c1) << 16) | \
                              ((int)(unsigned char)(c2) << 8) | ((int)(unsigned char)(c3)))

static inline __m512i _mm512_set_epi8(char __q63, char __q62, char __q61, char __q60,
                                      char __q59, char __q58, char __q57, char __q56,
                                      char __q55, char __q54, char __q53, char __q52,
                                      char __q51, char __q50, char __q49, char __q48,
                                      char __q47, char __q46, char __q45, char __q44,
                                      char __q43, char __q42, char __q41, char __q40,
                                      char __q39, char __q38, char __q37, char __q36,
                                      char __q35, char __q34, char __q33, char __q32,
                                      char __q31, char __q30, char __q29, char __q28,
                                      char __q27, char __q26, char __q25, char __q24,
                                      char __q23, char __q22, char __q21, char __q20,
                                      char __q19, char __q18, char __q17, char __q16,
                                      char __q15, char __q14, char __q13, char __q12,
                                      char __q11, char __q10, char __q09, char __q08,
                                      char __q07, char __q06, char __q05, char __q04,
                                      char __q03, char __q02, char __q01, char __q00) {
    return _mm512_set_epi32(PACK(__q63, __q62, __q61, __q60), PACK(__q59, __q58, __q57, __q56),
                            PACK(__q55, __q54, __q53, __q52), PACK(__q51, __q50, __q49, __q48),
                            PACK(__q47, __q46, __q45, __q44), PACK(__q43, __q42, __q41, __q40),
                            PACK(__q39, __q38, __q37, __q36), PACK(__q35, __q34, __q33, __q32),
                            PACK(__q31, __q30, __q29, __q28), PACK(__q27, __q26, __q25, __q24),
                            PACK(__q23, __q22, __q21, __q20), PACK(__q19, __q18, __q17, __q16),
                            PACK(__q15, __q14, __q13, __q12), PACK(__q11, __q10, __q09, __q08),
                            PACK(__q07, __q06, __q05, __q04), PACK(__q03, __q02, __q01, __q00));
}

#undef PACK

#endif // gcc version test
#endif // __AVX512F__

/* Missing zero-extension AVX and AVX512 intrinsics.
 * Fixed in Microsoft Visual Studio 2017 version 15.7
 * https://developercommunity.visualstudio.com/t/missing-zero-extension-avx-and-avx512-intrinsics/175737
 */
#if defined(_MSC_VER) && _MSC_VER < 1914
#ifdef __AVX2__
static inline __m256i _mm256_zextsi128_si256(__m128i a) {
    return _mm256_inserti128_si256(_mm256_setzero_si256(), a, 0);
}
#endif // __AVX2__

#ifdef __AVX512F__
static inline __m512i _mm512_zextsi128_si512(__m128i a) {
    return _mm512_inserti32x4(_mm512_setzero_si512(), a, 0);
}
#endif // __AVX512F__
#endif // defined(_MSC_VER) && _MSC_VER < 1914

#endif // include guard X86_INTRINS_H
@ -0,0 +1,42 @@
/* chunkset.c -- inline functions to copy small data chunks.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"

typedef uint64_t chunk_t;

#define CHUNK_SIZE 8

#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    uint8_t *dest = (uint8_t *)chunk;
    memcpy(dest, from, sizeof(uint32_t));
    memcpy(dest+4, from, sizeof(uint32_t));
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    memcpy(chunk, from, sizeof(uint64_t));
}

static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    memcpy(chunk, (uint8_t *)s, sizeof(uint64_t));
}

static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    memcpy(out, chunk, sizeof(uint64_t));
}

#define CHUNKSIZE chunksize_c
#define CHUNKCOPY chunkcopy_c
#define CHUNKUNROLL chunkunroll_c
#define CHUNKMEMSET chunkmemset_c
#define CHUNKMEMSET_SAFE chunkmemset_safe_c

#include "chunkset_tpl.h"

#define INFLATE_FAST inflate_fast_c

#include "inffast_tpl.h"
@ -0,0 +1,200 @@
/* chunkset_tpl.h -- inline functions to copy small data chunks.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include <stdlib.h>

#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
extern uint8_t* chunkmemset_ssse3(uint8_t *out, unsigned dist, unsigned len);
#endif

/* Returns the chunk size */
Z_INTERNAL uint32_t CHUNKSIZE(void) {
    return sizeof(chunk_t);
}

/* Behave like memcpy, but assume that it's OK to overwrite at least
   chunk_t bytes of output even if the length is shorter than this,
   that the length is non-zero, and that `from` lags `out` by at least
   sizeof chunk_t bytes (or that they don't overlap at all or simply that
   the distance is less than the length of the copy).

   Aside from better memory bus utilisation, this means that short copies
   (chunk_t bytes or fewer) will fall straight through the loop
   without iteration, which will hopefully make the branch prediction more
   reliable. */
#ifndef HAVE_CHUNKCOPY
Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
    Assert(len > 0, "chunkcopy should never have a length 0");
    chunk_t chunk;
    int32_t align = ((len - 1) % sizeof(chunk_t)) + 1;
    loadchunk(from, &chunk);
    storechunk(out, &chunk);
    out += align;
    from += align;
    len -= align;
    while (len > 0) {
        loadchunk(from, &chunk);
        storechunk(out, &chunk);
        out += sizeof(chunk_t);
        from += sizeof(chunk_t);
        len -= sizeof(chunk_t);
    }
    return out;
}
#endif
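
/* Worked example (illustrative): with sizeof(chunk_t) == 8 and len == 5,
 * align == 5, so the single store above writes 8 bytes (the 5 wanted plus
 * 3 bytes of permitted slack) and the while loop is skipped entirely. */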

/* Perform short copies until distance can be rewritten as being at least
   sizeof chunk_t.

   This assumes that it's OK to overwrite at least the first
   2*sizeof(chunk_t) bytes of output even if the copy is shorter than this.
   This assumption holds because inflate_fast() starts every iteration with at
   least 258 bytes of output space available (258 being the maximum length
   output from a single token; see inflate_fast()'s assumptions below). */
#ifndef HAVE_CHUNKUNROLL
Z_INTERNAL uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) {
    unsigned char const *from = out - *dist;
    chunk_t chunk;
    while (*dist < *len && *dist < sizeof(chunk_t)) {
        loadchunk(from, &chunk);
        storechunk(out, &chunk);
        out += *dist;
        *len -= *dist;
        *dist += *dist;
    }
    return out;
}
#endif

#ifndef HAVE_CHUNK_MAG
/* Loads a magazine to feed into memory of the pattern */
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
    /* This code takes a string of length dist from "from" and repeats
     * it for as many times as can fit in a chunk_t (vector register) */
    uint32_t cpy_dist;
    uint32_t bytes_remaining = sizeof(chunk_t);
    chunk_t chunk_load;
    uint8_t *cur_chunk = (uint8_t *)&chunk_load;
    while (bytes_remaining) {
        cpy_dist = MIN(dist, bytes_remaining);
        memcpy(cur_chunk, buf, cpy_dist);
        bytes_remaining -= cpy_dist;
        cur_chunk += cpy_dist;
        /* This allows us to bypass an expensive integer division since we're effectively
         * counting in this loop, anyway */
        *chunk_rem = cpy_dist;
    }

    return chunk_load;
}
#endif
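
/* Worked example (illustrative): dist == 3 with sizeof(chunk_t) == 8 loads
 * the pattern "abcabcab" and leaves *chunk_rem == 2: the size of the final,
 * partial copy of the pattern that spills past the last full repeat. */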

/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST.
   Return OUT + LEN. */
Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
    /* Debug performance related issues when len < sizeof(uint64_t):
       Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
    Assert(dist > 0, "chunkmemset cannot have a distance 0");
    /* Only AVX2 */
#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
    if (len <= 16) {
        return chunkmemset_ssse3(out, dist, len);
    }
#endif

    uint8_t *from = out - dist;

    if (dist == 1) {
        memset(out, *from, len);
        return out + len;
    } else if (dist > sizeof(chunk_t)) {
        return CHUNKCOPY(out, out - dist, len);
    }

    chunk_t chunk_load;
    uint32_t chunk_mod = 0;

    /* TODO: possibly build up a permutation table for this if not an even modulus */
#ifdef HAVE_CHUNKMEMSET_2
    if (dist == 2) {
        chunkmemset_2(from, &chunk_load);
    } else
#endif
#ifdef HAVE_CHUNKMEMSET_4
    if (dist == 4) {
        chunkmemset_4(from, &chunk_load);
    } else
#endif
#ifdef HAVE_CHUNKMEMSET_8
    if (dist == 8) {
        chunkmemset_8(from, &chunk_load);
    } else if (dist == sizeof(chunk_t)) {
        loadchunk(from, &chunk_load);
    } else
#endif
    {
        chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist);
    }

    /* If we're lucky enough and dist happens to be an even modulus of our vector length,
     * we can do two stores per loop iteration, which for most ISAs, especially x86, is beneficial */
    if (chunk_mod == 0) {
        while (len >= (2 * sizeof(chunk_t))) {
            storechunk(out, &chunk_load);
            storechunk(out + sizeof(chunk_t), &chunk_load);
            out += 2 * sizeof(chunk_t);
            len -= 2 * sizeof(chunk_t);
        }
    }

    /* If we don't have a "dist" length that divides evenly into a vector
     * register, we can write the whole vector register but we need only
     * advance by the amount of the whole string that fits in our chunk_t.
     * If we do divide evenly into the vector length, adv_amount = chunk_t size */
    uint32_t adv_amount = sizeof(chunk_t) - chunk_mod;
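    /* Continuing the example above: chunk_mod == 2 gives adv_amount == 6, so
     * each full-width store advances by only the 6 bytes (two whole repeats
     * of the dist == 3 pattern) that it contributes. */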
    while (len >= sizeof(chunk_t)) {
        storechunk(out, &chunk_load);
        len -= adv_amount;
        out += adv_amount;
    }

    if (len) {
        memcpy(out, &chunk_load, len);
        out += len;
    }

    return out;
}

Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
#if !defined(UNALIGNED64_OK)
#  if !defined(UNALIGNED_OK)
    static const uint32_t align_mask = 7;
#  else
    static const uint32_t align_mask = 3;
#  endif
#endif

    len = MIN(len, left);
    uint8_t *from = out - dist;
#if !defined(UNALIGNED64_OK)
    while (((uintptr_t)out & align_mask) && (len > 0)) {
        *out++ = *from++;
        --len;
        --left;
    }
#endif
    if (left < (unsigned)(3 * sizeof(chunk_t))) {
        while (len > 0) {
            *out++ = *from++;
            --len;
        }
        return out;
    }
    if (len)
        return CHUNKMEMSET(out, dist, len);

    return out;
}
@ -0,0 +1,543 @@
# detect-intrinsics.cmake -- Detect compiler intrinsics support
# Licensed under the Zlib license, see LICENSE.md for details

macro(check_acle_compiler_flag)
    if(MSVC)
        # Both ARM and ARM64-targeting msvc support intrinsics, but
        # ARM msvc is missing some intrinsics introduced with ARMv8, e.g. crc32
        if(MSVC_C_ARCHITECTURE_ID STREQUAL "ARM64")
            set(HAVE_ACLE_FLAG TRUE)
        endif()
    else()
        if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
            if(NOT NATIVEFLAG)
                set(ACLEFLAG "-march=armv8-a+crc" CACHE INTERNAL "Compiler option to enable ACLE support")
            endif()
        endif()
        # Check whether compiler supports ACLE flag
        set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
        check_c_source_compiles(
            "int main() { return 0; }"
            HAVE_ACLE_FLAG FAIL_REGEX "not supported")
        if(NOT NATIVEFLAG AND NOT HAVE_ACLE_FLAG)
            set(ACLEFLAG "-march=armv8-a+crc+simd" CACHE INTERNAL "Compiler option to enable ACLE support" FORCE)
            # Check whether compiler supports ACLE flag
            set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG}")
            check_c_source_compiles(
                "int main() { return 0; }"
                HAVE_ACLE_FLAG2 FAIL_REGEX "not supported")
            set(HAVE_ACLE_FLAG ${HAVE_ACLE_FLAG2} CACHE INTERNAL "Have compiler option to enable ACLE intrinsics" FORCE)
            unset(HAVE_ACLE_FLAG2 CACHE) # Don't cache this internal variable
        endif()
        set(CMAKE_REQUIRED_FLAGS)
    endif()
endmacro()

macro(check_armv6_compiler_flag)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            check_c_compiler_flag("-march=armv6" HAVE_MARCH_ARMV6)
            if(HAVE_MARCH_ARMV6)
                set(ARMV6FLAG "-march=armv6" CACHE INTERNAL "Compiler option to enable ARMv6 support")
            endif()
        endif()
    endif()
    # Check whether compiler supports ARMv6 inline asm
    set(CMAKE_REQUIRED_FLAGS "${ARMV6FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "unsigned int f(unsigned int a, unsigned int b) {
            unsigned int c;
            __asm__ __volatile__ ( \"uqsub16 %0, %1, %2\" : \"=r\" (c) : \"r\" (a), \"r\" (b) );
            return (int)c;
        }
        int main(void) { return f(1,2); }"
        HAVE_ARMV6_INLINE_ASM
    )
    # Check whether compiler supports ARMv6 intrinsics
    check_c_source_compiles(
        "#if defined(_MSC_VER)
        #include <intrin.h>
        #else
        #include <arm_acle.h>
        #endif
        unsigned int f(unsigned int a, unsigned int b) {
        #if defined(_MSC_VER)
            return _arm_uqsub16(a, b);
        #else
            return __uqsub16(a, b);
        #endif
        }
        int main(void) { return 0; }"
        HAVE_ARMV6_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_avx512_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
        if(CMAKE_HOST_UNIX OR APPLE)
            set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
        else()
            set(AVX512FLAG "/arch:AVX512")
        endif()
    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            # For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
            # instruction scheduling unless you specify a reasonable -mtune= target
            set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
            if(NOT MSVC)
                check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
                if(HAVE_CASCADE_LAKE)
                    set(AVX512FLAG "${AVX512FLAG} -mtune=cascadelake")
                else()
                    set(AVX512FLAG "${AVX512FLAG} -mtune=skylake-avx512")
                endif()
                unset(HAVE_CASCADE_LAKE)
            endif()
        endif()
    elseif(MSVC)
        set(AVX512FLAG "/arch:AVX512")
    endif()
    # Check whether compiler supports AVX512 intrinsics
    set(CMAKE_REQUIRED_FLAGS "${AVX512FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <immintrin.h>
        __m512i f(__m512i y) {
            __m512i x = _mm512_set1_epi8(2);
            return _mm512_sub_epi8(x, y);
        }
        int main(void) { return 0; }"
        HAVE_AVX512_INTRIN
    )

    # Evidently both GCC and clang were late to implementing these
    check_c_source_compiles(
        "#include <immintrin.h>
        __mmask16 f(__mmask16 x) { return _knot_mask16(x); }
        int main(void) { return 0; }"
        HAVE_MASK_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_avx512vnni_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
        if(CMAKE_HOST_UNIX OR APPLE)
            set(AVX512VNNIFLAG "-mavx512f -mavx512bw -mavx512dq -mavx512vl -mavx512vnni")
        else()
            set(AVX512VNNIFLAG "/arch:AVX512")
        endif()
    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni")
            if(NOT MSVC)
                check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
                if(HAVE_CASCADE_LAKE)
                    set(AVX512VNNIFLAG "${AVX512VNNIFLAG} -mtune=cascadelake")
                else()
                    set(AVX512VNNIFLAG "${AVX512VNNIFLAG} -mtune=skylake-avx512")
                endif()
                unset(HAVE_CASCADE_LAKE)
            endif()
        endif()
    elseif(MSVC)
        set(AVX512VNNIFLAG "/arch:AVX512")
    endif()

    # Check whether compiler supports AVX512vnni intrinsics
    set(CMAKE_REQUIRED_FLAGS "${AVX512VNNIFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <immintrin.h>
        __m512i f(__m512i x, __m512i y) {
            __m512i z = _mm512_setzero_epi32();
            return _mm512_dpbusd_epi32(z, x, y);
        }
        int main(void) { return 0; }"
        HAVE_AVX512VNNI_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_avx2_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
        if(CMAKE_HOST_UNIX OR APPLE)
            set(AVX2FLAG "-mavx2")
        else()
            set(AVX2FLAG "/arch:AVX2")
        endif()
    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            set(AVX2FLAG "-mavx2")
        endif()
    elseif(MSVC)
        set(AVX2FLAG "/arch:AVX2")
    endif()
    # Check whether compiler supports AVX2 intrinsics
    set(CMAKE_REQUIRED_FLAGS "${AVX2FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <immintrin.h>
        __m256i f(__m256i x) {
            const __m256i y = _mm256_set1_epi16(1);
            return _mm256_subs_epu16(x, y);
        }
        int main(void) { return 0; }"
        HAVE_AVX2_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_neon_compiler_flag)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            if("${ARCH}" MATCHES "aarch64")
                set(NEONFLAG "-march=armv8-a+simd")
            else()
                set(NEONFLAG "-mfpu=neon")
            endif()
        endif()
    endif()
    # Check whether compiler supports NEON flag
    set(CMAKE_REQUIRED_FLAGS "${NEONFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#if defined(_M_ARM64) || defined(_M_ARM64EC)
        #  include <arm64_neon.h>
        #else
        #  include <arm_neon.h>
        #endif
        int main() { return 0; }"
        NEON_AVAILABLE FAIL_REGEX "not supported")
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_neon_ld4_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            if("${ARCH}" MATCHES "aarch64")
                set(NEONFLAG "-march=armv8-a+simd")
            else()
                set(NEONFLAG "-mfpu=neon")
            endif()
        endif()
    endif()
    # Check whether compiler supports loading 4 neon vecs into a register range
    set(CMAKE_REQUIRED_FLAGS "${NEONFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#if defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC))
        #  include <arm64_neon.h>
        #else
        #  include <arm_neon.h>
        #endif
        int32x4x4_t f(int var[16]) { return vld1q_s32_x4(var); }
        int main(void) { return 0; }"
        NEON_HAS_LD4)
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_pclmulqdq_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            set(PCLMULFLAG "-mpclmul")
        endif()
    endif()
    # Check whether compiler supports PCLMULQDQ intrinsics
    if(NOT (APPLE AND "${ARCH}" MATCHES "i386"))
        # The pclmul code currently crashes on Mac in 32bit mode. Avoid for now.
        set(CMAKE_REQUIRED_FLAGS "${PCLMULFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
        check_c_source_compiles(
            "#include <immintrin.h>
            #include <wmmintrin.h>
            __m128i f(__m128i a, __m128i b) { return _mm_clmulepi64_si128(a, b, 0x10); }
            int main(void) { return 0; }"
            HAVE_PCLMULQDQ_INTRIN
        )
        set(CMAKE_REQUIRED_FLAGS)
    else()
        set(HAVE_PCLMULQDQ_INTRIN OFF)
    endif()
endmacro()

macro(check_vpclmulqdq_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            set(VPCLMULFLAG "-mvpclmulqdq -mavx512f")
        endif()
    endif()
    # Check whether compiler supports VPCLMULQDQ intrinsics
    if(NOT (APPLE AND "${ARCH}" MATCHES "i386"))
        set(CMAKE_REQUIRED_FLAGS "${VPCLMULFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
        check_c_source_compiles(
            "#include <immintrin.h>
            #include <wmmintrin.h>
            __m512i f(__m512i a) {
                __m512i b = _mm512_setzero_si512();
                return _mm512_clmulepi64_epi128(a, b, 0x10);
            }
            int main(void) { return 0; }"
            HAVE_VPCLMULQDQ_INTRIN
        )
        set(CMAKE_REQUIRED_FLAGS)
    else()
        set(HAVE_VPCLMULQDQ_INTRIN OFF)
    endif()
endmacro()

macro(check_ppc_intrinsics)
    # Check if compiler supports AltiVec
    set(CMAKE_REQUIRED_FLAGS "-maltivec ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <altivec.h>
        int main(void)
        {
            vector int a = vec_splats(0);
            vector int b = vec_splats(0);
            a = vec_add(a, b);
            return 0;
        }"
        HAVE_ALTIVEC
    )
    set(CMAKE_REQUIRED_FLAGS)

    if(HAVE_ALTIVEC)
        set(PPCFLAGS "-maltivec")
    endif()

    set(CMAKE_REQUIRED_FLAGS "-maltivec -mno-vsx ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <altivec.h>
        int main(void)
        {
            vector int a = vec_splats(0);
            vector int b = vec_splats(0);
            a = vec_add(a, b);
            return 0;
        }"
        HAVE_NOVSX
    )
    set(CMAKE_REQUIRED_FLAGS)

    if(HAVE_NOVSX)
        set(PPCFLAGS "${PPCFLAGS} -mno-vsx")
    endif()

    # Check if we have what we need for AltiVec optimizations
    set(CMAKE_REQUIRED_FLAGS "${PPCFLAGS} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <sys/auxv.h>
        #ifdef __FreeBSD__
        #include <machine/cpu.h>
        #endif
        int main() {
        #ifdef __FreeBSD__
            unsigned long hwcap;
            elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
            return (hwcap & PPC_FEATURE_HAS_ALTIVEC);
        #else
            return (getauxval(AT_HWCAP) & PPC_FEATURE_HAS_ALTIVEC);
        #endif
        }"
        HAVE_VMX
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_power8_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            set(POWER8FLAG "-mcpu=power8")
        endif()
    endif()
    # Check if we have what we need for POWER8 optimizations
    set(CMAKE_REQUIRED_FLAGS "${POWER8FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <sys/auxv.h>
        #ifdef __FreeBSD__
        #include <machine/cpu.h>
        #endif
        int main() {
        #ifdef __FreeBSD__
            unsigned long hwcap;
            elf_aux_info(AT_HWCAP2, &hwcap, sizeof(hwcap));
            return (hwcap & PPC_FEATURE2_ARCH_2_07);
        #else
            return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07);
        #endif
        }"
        HAVE_POWER8_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_rvv_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            set(RISCVFLAG "-march=rv64gcv")
        endif()
    endif()
    # Check whether compiler supports RVV
    set(CMAKE_REQUIRED_FLAGS "${RISCVFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <riscv_vector.h>
        int main() {
            return 0;
        }"
        HAVE_RVV_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_s390_intrinsics)
    check_c_source_compiles(
        "#include <sys/auxv.h>
        #ifndef HWCAP_S390_VXRS
        #define HWCAP_S390_VXRS HWCAP_S390_VX
        #endif
        int main() {
            return (getauxval(AT_HWCAP) & HWCAP_S390_VXRS);
        }"
        HAVE_S390_INTRIN
    )
endmacro()

macro(check_power9_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            set(POWER9FLAG "-mcpu=power9")
        endif()
    endif()
    # Check if we have what we need for POWER9 optimizations
    set(CMAKE_REQUIRED_FLAGS "${POWER9FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <sys/auxv.h>
        #ifdef __FreeBSD__
        #include <machine/cpu.h>
        #endif
        int main() {
        #ifdef __FreeBSD__
            unsigned long hwcap;
            elf_aux_info(AT_HWCAP2, &hwcap, sizeof(hwcap));
            return (hwcap & PPC_FEATURE2_ARCH_3_00);
        #else
            return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_3_00);
        #endif
        }"
        HAVE_POWER9_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_sse2_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
        if(CMAKE_HOST_UNIX OR APPLE)
            set(SSE2FLAG "-msse2")
        else()
            set(SSE2FLAG "/arch:SSE2")
        endif()
    elseif(MSVC)
        if(NOT "${ARCH}" MATCHES "x86_64")
            set(SSE2FLAG "/arch:SSE2")
        endif()
    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            set(SSE2FLAG "-msse2")
        endif()
    endif()
    # Check whether compiler supports SSE2 intrinsics
    set(CMAKE_REQUIRED_FLAGS "${SSE2FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <immintrin.h>
        __m128i f(__m128i x, __m128i y) { return _mm_sad_epu8(x, y); }
        int main(void) { return 0; }"
        HAVE_SSE2_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_ssse3_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
        if(CMAKE_HOST_UNIX OR APPLE)
            set(SSSE3FLAG "-mssse3")
        else()
            set(SSSE3FLAG "/arch:SSSE3")
        endif()
    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            set(SSSE3FLAG "-mssse3")
        endif()
    endif()
    # Check whether compiler supports SSSE3 intrinsics
    set(CMAKE_REQUIRED_FLAGS "${SSSE3FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <immintrin.h>
        __m128i f(__m128i u) {
            __m128i v = _mm_set1_epi32(1);
            return _mm_hadd_epi32(u, v);
        }
        int main(void) { return 0; }"
        HAVE_SSSE3_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_sse42_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
        if(CMAKE_HOST_UNIX OR APPLE)
            set(SSE42FLAG "-msse4.2")
        else()
            set(SSE42FLAG "/arch:SSE4.2")
        endif()
    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            set(SSE42FLAG "-msse4.2")
        endif()
    endif()
    # Check whether compiler supports SSE4.2 intrinsics
    set(CMAKE_REQUIRED_FLAGS "${SSE42FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <nmmintrin.h>
        unsigned int f(unsigned int a, unsigned int b) { return _mm_crc32_u32(a, b); }
        int main(void) { return 0; }"
        HAVE_SSE42_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_vgfma_intrinsics)
    if(NOT NATIVEFLAG)
        set(VGFMAFLAG "-march=z13")
        if(CMAKE_C_COMPILER_ID MATCHES "GNU")
            set(VGFMAFLAG "${VGFMAFLAG} -mzarch")
        endif()
        if(CMAKE_C_COMPILER_ID MATCHES "Clang")
            set(VGFMAFLAG "${VGFMAFLAG} -fzvector")
        endif()
    endif()
    # Check whether compiler supports "VECTOR GALOIS FIELD MULTIPLY SUM AND ACCUMULATE" intrinsic
    set(CMAKE_REQUIRED_FLAGS "${VGFMAFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <vecintrin.h>
        int main(void) {
            unsigned long long a __attribute__((vector_size(16))) = { 0 };
            unsigned long long b __attribute__((vector_size(16))) = { 0 };
            unsigned char c __attribute__((vector_size(16))) = { 0 };
            c = vec_gfmsum_accum_128(a, b, c);
            return c[0];
        }"
        HAVE_VGFMA_INTRIN FAIL_REGEX "not supported")
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_xsave_intrinsics)
    if(NOT NATIVEFLAG AND NOT MSVC)
        set(XSAVEFLAG "-mxsave")
    endif()
    set(CMAKE_REQUIRED_FLAGS "${XSAVEFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#ifdef _MSC_VER
        #  include <intrin.h>
        #else
        #  include <x86gprintrin.h>
        #endif
        unsigned int f(unsigned int a) { return (int) _xgetbv(a); }
        int main(void) { return 0; }"
        HAVE_XSAVE_INTRIN FAIL_REGEX "not supported")
    set(CMAKE_REQUIRED_FLAGS)
endmacro()
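
# Usage note (illustrative): callers are expected to invoke these macros and
# then branch on the cached results (e.g. HAVE_AVX2_INTRIN, HAVE_PCLMULQDQ_INTRIN),
# adding the matching arch sources and compile definitions to the zlib-ng
# target; the consuming logic lives in the main CMakeLists, not in this file.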
@ -0,0 +1,19 @@
# fallback-macros.cmake -- CMake fallback macros
# Copyright (C) 2022 Nathan Moinvaziri
# Licensed under the Zlib license, see LICENSE.md for details

# CMake less than version 3.5.2
if(NOT COMMAND add_compile_options)
    macro(add_compile_options options)
        string(APPEND CMAKE_C_FLAGS ${options})
        string(APPEND CMAKE_CXX_FLAGS ${options})
    endmacro()
endif()

# CMake less than version 3.14
if(NOT COMMAND add_link_options)
    macro(add_link_options options)
        string(APPEND CMAKE_EXE_LINKER_FLAGS ${options})
        string(APPEND CMAKE_SHARED_LINKER_FLAGS ${options})
    endmacro()
endif()
@ -0,0 +1,180 @@
/* compare256.c -- 256 byte memory comparison with match length return
 * Copyright (C) 2020 Nathan Moinvaziri
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "zutil_p.h"
#include "fallback_builtins.h"

/* ALIGNED, byte comparison */
static inline uint32_t compare256_c_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;

    do {
        if (*src0 != *src1)
            return len;
        src0 += 1, src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src0 += 1, src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src0 += 1, src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src0 += 1, src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src0 += 1, src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src0 += 1, src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src0 += 1, src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src0 += 1, src1 += 1, len += 1;
    } while (len < 256);

    return 256;
}

Z_INTERNAL uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1) {
    return compare256_c_static(src0, src1);
}

#define LONGEST_MATCH longest_match_c
#define COMPARE256 compare256_c_static

#include "match_tpl.h"

#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_c
#define COMPARE256 compare256_c_static

#include "match_tpl.h"

#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
/* 16-bit unaligned integer comparison */
static inline uint32_t compare256_unaligned_16_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;

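    /* If two 16-bit units differ, the match may still extend one byte into
     * the pair, hence the "+ (*src0 == *src1)" on each early return below. */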
    do {
        if (zng_memcmp_2(src0, src1) != 0)
            return len + (*src0 == *src1);
        src0 += 2, src1 += 2, len += 2;

        if (zng_memcmp_2(src0, src1) != 0)
            return len + (*src0 == *src1);
        src0 += 2, src1 += 2, len += 2;

        if (zng_memcmp_2(src0, src1) != 0)
            return len + (*src0 == *src1);
        src0 += 2, src1 += 2, len += 2;

        if (zng_memcmp_2(src0, src1) != 0)
            return len + (*src0 == *src1);
        src0 += 2, src1 += 2, len += 2;
    } while (len < 256);

    return 256;
}

Z_INTERNAL uint32_t compare256_unaligned_16(const uint8_t *src0, const uint8_t *src1) {
    return compare256_unaligned_16_static(src0, src1);
}

#define LONGEST_MATCH longest_match_unaligned_16
#define COMPARE256 compare256_unaligned_16_static

#include "match_tpl.h"

#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_unaligned_16
#define COMPARE256 compare256_unaligned_16_static

#include "match_tpl.h"

#ifdef HAVE_BUILTIN_CTZ
/* 32-bit unaligned integer comparison */
static inline uint32_t compare256_unaligned_32_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;

    do {
        uint32_t sv, mv, diff;

        memcpy(&sv, src0, sizeof(sv));
        memcpy(&mv, src1, sizeof(mv));

        diff = sv ^ mv;
        if (diff) {
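            /* on little-endian, the lowest set bit of the XOR marks the
             * first differing byte, so ctz/8 yields its byte index */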
            uint32_t match_byte = __builtin_ctz(diff) / 8;
            return len + match_byte;
        }

        src0 += 4, src1 += 4, len += 4;
    } while (len < 256);

    return 256;
}

Z_INTERNAL uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1) {
    return compare256_unaligned_32_static(src0, src1);
}

#define LONGEST_MATCH longest_match_unaligned_32
#define COMPARE256 compare256_unaligned_32_static

#include "match_tpl.h"

#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_unaligned_32
#define COMPARE256 compare256_unaligned_32_static

#include "match_tpl.h"

#endif

#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
/* UNALIGNED64_OK, 64-bit integer comparison */
static inline uint32_t compare256_unaligned_64_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;

    do {
        uint64_t sv, mv, diff;

        memcpy(&sv, src0, sizeof(sv));
        memcpy(&mv, src1, sizeof(mv));

        diff = sv ^ mv;
        if (diff) {
            uint64_t match_byte = __builtin_ctzll(diff) / 8;
            return len + (uint32_t)match_byte;
        }

        src0 += 8, src1 += 8, len += 8;
    } while (len < 256);

    return 256;
}

Z_INTERNAL uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1) {
    return compare256_unaligned_64_static(src0, src1);
}

#define LONGEST_MATCH longest_match_unaligned_64
#define COMPARE256 compare256_unaligned_64_static

#include "match_tpl.h"

#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_unaligned_64
#define COMPARE256 compare256_unaligned_64_static

#include "match_tpl.h"

#endif

#endif
@ -0,0 +1,134 @@
/* compare256_rle.h -- 256 byte run-length encoding comparison
 * Copyright (C) 2022 Nathan Moinvaziri
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "fallback_builtins.h"

typedef uint32_t (*compare256_rle_func)(const uint8_t* src0, const uint8_t* src1);

/* ALIGNED, byte comparison */
static inline uint32_t compare256_rle_c(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;

    do {
        if (*src0 != *src1)
            return len;
        src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src1 += 1, len += 1;
    } while (len < 256);

    return 256;
}

#ifdef UNALIGNED_OK
/* 16-bit unaligned integer comparison */
static inline uint32_t compare256_rle_unaligned_16(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;
    uint16_t src0_cmp, src1_cmp;

    memcpy(&src0_cmp, src0, sizeof(src0_cmp));

    do {
        memcpy(&src1_cmp, src1, sizeof(src1_cmp));
        if (src0_cmp != src1_cmp)
            return len + (*src0 == *src1);
        src1 += 2, len += 2;
        memcpy(&src1_cmp, src1, sizeof(src1_cmp));
        if (src0_cmp != src1_cmp)
            return len + (*src0 == *src1);
        src1 += 2, len += 2;
        memcpy(&src1_cmp, src1, sizeof(src1_cmp));
        if (src0_cmp != src1_cmp)
            return len + (*src0 == *src1);
        src1 += 2, len += 2;
        memcpy(&src1_cmp, src1, sizeof(src1_cmp));
        if (src0_cmp != src1_cmp)
            return len + (*src0 == *src1);
        src1 += 2, len += 2;
    } while (len < 256);

    return 256;
}

#ifdef HAVE_BUILTIN_CTZ
/* 32-bit unaligned integer comparison */
static inline uint32_t compare256_rle_unaligned_32(const uint8_t *src0, const uint8_t *src1) {
    uint32_t sv, len = 0;
    uint16_t src0_cmp;

    memcpy(&src0_cmp, src0, sizeof(src0_cmp));
    sv = ((uint32_t)src0_cmp << 16) | src0_cmp;
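    /* e.g. the two-byte run pattern {0x61, 0x62} loads as 0x6261 on
     * little-endian and replicates to sv == 0x62616261, four bytes of the
     * repeating pattern compared per iteration below */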

    do {
        uint32_t mv, diff;

        memcpy(&mv, src1, sizeof(mv));

        diff = sv ^ mv;
        if (diff) {
            uint32_t match_byte = __builtin_ctz(diff) / 8;
            return len + match_byte;
        }

        src1 += 4, len += 4;
    } while (len < 256);

    return 256;
}

#endif

#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
/* 64-bit unaligned integer comparison */
static inline uint32_t compare256_rle_unaligned_64(const uint8_t *src0, const uint8_t *src1) {
    uint32_t src0_cmp32, len = 0;
    uint16_t src0_cmp;
    uint64_t sv;

    memcpy(&src0_cmp, src0, sizeof(src0_cmp));
    src0_cmp32 = ((uint32_t)src0_cmp << 16) | src0_cmp;
    sv = ((uint64_t)src0_cmp32 << 32) | src0_cmp32;

    do {
        uint64_t mv, diff;

        memcpy(&mv, src1, sizeof(mv));

        diff = sv ^ mv;
        if (diff) {
            uint64_t match_byte = __builtin_ctzll(diff) / 8;
            return len + (uint32_t)match_byte;
        }

        src1 += 8, len += 8;
    } while (len < 256);

    return 256;
}

#endif

#endif

@ -0,0 +1,98 @@
/* compress.c -- compress a memory buffer
 * Copyright (C) 1995-2005, 2014, 2016 Jean-loup Gailly, Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "zutil.h"

/* ===========================================================================
 *  Architecture-specific hooks.
 */
#ifdef S390_DFLTCC_DEFLATE
#  include "arch/s390/dfltcc_common.h"
#else
/* Returns the upper bound on compressed data length based on uncompressed data length, assuming default settings.
 * Zero means that arch-specific deflation code behaves identically to the regular zlib-ng algorithms. */
#  define DEFLATE_BOUND_COMPLEN(source_len) 0
#endif

/* ===========================================================================
     Compresses the source buffer into the destination buffer. The level
   parameter has the same meaning as in deflateInit. sourceLen is the byte
   length of the source buffer. Upon entry, destLen is the total size of the
   destination buffer, which must be at least 0.1% larger than sourceLen plus
   12 bytes. Upon exit, destLen is the actual size of the compressed buffer.

     compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
   memory, Z_BUF_ERROR if there was not enough room in the output buffer,
   Z_STREAM_ERROR if the level parameter is invalid.
*/
int Z_EXPORT PREFIX(compress2)(unsigned char *dest, z_uintmax_t *destLen, const unsigned char *source,
                               z_uintmax_t sourceLen, int level) {
    PREFIX3(stream) stream;
    int err;
    const unsigned int max = (unsigned int)-1;
    z_size_t left;

    left = *destLen;
    *destLen = 0;

    stream.zalloc = NULL;
    stream.zfree = NULL;
    stream.opaque = NULL;

    err = PREFIX(deflateInit)(&stream, level);
    if (err != Z_OK)
        return err;

    stream.next_out = dest;
    stream.avail_out = 0;
    stream.next_in = (z_const unsigned char *)source;
    stream.avail_in = 0;

    do {
        if (stream.avail_out == 0) {
            stream.avail_out = left > (unsigned long)max ? max : (unsigned int)left;
            left -= stream.avail_out;
        }
        if (stream.avail_in == 0) {
            stream.avail_in = sourceLen > (unsigned long)max ? max : (unsigned int)sourceLen;
            sourceLen -= stream.avail_in;
        }
        err = PREFIX(deflate)(&stream, sourceLen ? Z_NO_FLUSH : Z_FINISH);
    } while (err == Z_OK);

    *destLen = stream.total_out;
    PREFIX(deflateEnd)(&stream);
    return err == Z_STREAM_END ? Z_OK : err;
}

/* ===========================================================================
 */
int Z_EXPORT PREFIX(compress)(unsigned char *dest, z_uintmax_t *destLen, const unsigned char *source, z_uintmax_t sourceLen) {
    return PREFIX(compress2)(dest, destLen, source, sourceLen, Z_DEFAULT_COMPRESSION);
}

/* ===========================================================================
     If the default memLevel or windowBits for deflateInit() is changed, then
   this function needs to be updated.
 */
z_uintmax_t Z_EXPORT PREFIX(compressBound)(z_uintmax_t sourceLen) {
    z_uintmax_t complen = DEFLATE_BOUND_COMPLEN(sourceLen);

    if (complen > 0)
        /* Architecture-specific code provided an upper bound. */
        return complen + ZLIB_WRAPLEN;

#ifndef NO_QUICK_STRATEGY
    return sourceLen                       /* The source size itself */
      + (sourceLen == 0 ? 1 : 0)           /* Always at least one byte for any input */
      + (sourceLen < 9 ? 1 : 0)            /* One extra byte for lengths less than 9 */
      + DEFLATE_QUICK_OVERHEAD(sourceLen)  /* Source encoding overhead, padded to next full byte */
      + DEFLATE_BLOCK_OVERHEAD             /* Deflate block overhead bytes */
      + ZLIB_WRAPLEN;                      /* zlib wrapper */
#else
    return sourceLen + (sourceLen >> 4) + 7 + ZLIB_WRAPLEN;
#endif
}
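
/* Usage sketch (illustrative, not part of this file): sizing the destination
 * with compressBound() before calling compress2(). Names follow the
 * ZLIB_COMPAT build, where PREFIX() adds no prefix. */
#if 0
    z_uintmax_t dst_len = compressBound(src_len);
    unsigned char *dst = (unsigned char *)malloc((size_t)dst_len);
    if (dst && compress2(dst, &dst_len, src, src_len, Z_DEFAULT_COMPRESSION) == Z_OK) {
        /* dst_len now holds the compressed size */
    }
#endif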
@ -0,0 +1,23 @@
/* cpu_features.c -- CPU architecture feature check
 * Copyright (C) 2017 Hans Kristian Rosbach
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "cpu_features.h"
#include <string.h>

Z_INTERNAL void cpu_check_features(struct cpu_features *features) {
    memset(features, 0, sizeof(struct cpu_features));
#if defined(X86_FEATURES)
    x86_check_features(&features->x86);
#elif defined(ARM_FEATURES)
    arm_check_features(&features->arm);
#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
    power_check_features(&features->power);
#elif defined(S390_FEATURES)
    s390_check_features(&features->s390);
#elif defined(RISCV_FEATURES)
    riscv_check_features(&features->riscv);
#endif
}
@ -0,0 +1,303 @@ |
||||
/* cpu_features.h -- CPU architecture feature check
|
||||
* Copyright (C) 2017 Hans Kristian Rosbach |
||||
* For conditions of distribution and use, see copyright notice in zlib.h |
||||
*/ |
||||
|
||||
#ifndef CPU_FEATURES_H_ |
||||
#define CPU_FEATURES_H_ |
||||
|
||||
#include "adler32_fold.h" |
||||
#include "crc32_fold.h" |
||||
|
||||
#if defined(X86_FEATURES) |
||||
# include "arch/x86/x86_features.h" |
||||
# include "fallback_builtins.h" |
||||
#elif defined(ARM_FEATURES) |
||||
# include "arch/arm/arm_features.h" |
||||
#elif defined(PPC_FEATURES) || defined(POWER_FEATURES) |
||||
# include "arch/power/power_features.h" |
||||
#elif defined(S390_FEATURES) |
||||
# include "arch/s390/s390_features.h" |
||||
#elif defined(RISCV_FEATURES) |
||||
# include "arch/riscv/riscv_features.h" |
||||
#endif |
||||
|
||||
struct cpu_features { |
||||
#if defined(X86_FEATURES) |
||||
struct x86_cpu_features x86; |
||||
#elif defined(ARM_FEATURES) |
||||
struct arm_cpu_features arm; |
||||
#elif defined(PPC_FEATURES) || defined(POWER_FEATURES) |
||||
struct power_cpu_features power; |
||||
#elif defined(S390_FEATURES) |
||||
struct s390_cpu_features s390; |
||||
#elif defined(RISCV_FEATURES) |
||||
struct riscv_cpu_features riscv; |
||||
#else |
||||
char empty; |
||||
#endif |
||||
}; |
||||
|
||||
extern void cpu_check_features(struct cpu_features *features); |

/* adler32 */
typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len);

extern uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);
#ifdef ARM_NEON
extern uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef PPC_VMX
extern uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef RISCV_RVV
extern uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef X86_SSSE3
extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef X86_AVX2
extern uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef X86_AVX512
extern uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef X86_AVX512VNNI
extern uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef POWER8_VSX
extern uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
#endif

/* adler32 folding */
#ifdef RISCV_RVV
extern uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_SSE42
extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX2
extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX512
extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX512VNNI
extern uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif

/* CRC32 folding */
#ifdef X86_PCLMULQDQ_CRC
extern uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc);
extern void crc32_fold_pclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
extern void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
extern uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc);
extern uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
#endif
#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
extern uint32_t crc32_fold_vpclmulqdq_reset(crc32_fold *crc);
extern void crc32_fold_vpclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
extern void crc32_fold_vpclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
extern uint32_t crc32_fold_vpclmulqdq_final(crc32_fold *crc);
extern uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
#endif

/* memory chunking */
extern uint32_t chunksize_c(void);
extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#ifdef X86_SSE2
extern uint32_t chunksize_sse2(void);
extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef X86_SSSE3
extern uint8_t* chunkmemset_safe_ssse3(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef X86_AVX2
extern uint32_t chunksize_avx2(void);
extern uint8_t* chunkmemset_safe_avx2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef ARM_NEON
extern uint32_t chunksize_neon(void);
extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef POWER8_VSX
extern uint32_t chunksize_power8(void);
extern uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef RISCV_RVV
extern uint32_t chunksize_rvv(void);
extern uint8_t* chunkmemset_safe_rvv(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif

#ifdef ZLIB_COMPAT
typedef struct z_stream_s z_stream;
#else
typedef struct zng_stream_s zng_stream;
#endif

/* inflate fast loop */
extern void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
#ifdef X86_SSE2
extern void inflate_fast_sse2(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef X86_SSSE3
extern void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef X86_AVX2
extern void inflate_fast_avx2(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef ARM_NEON
extern void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef POWER8_VSX
extern void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef RISCV_RVV
extern void inflate_fast_rvv(PREFIX3(stream) *strm, uint32_t start);
#endif

/* CRC32 */
typedef uint32_t (*crc32_func)(uint32_t crc32, const uint8_t *buf, size_t len);

extern uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
#ifdef ARM_ACLE
extern uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len);
#elif defined(POWER8_VSX)
extern uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len);
#elif defined(S390_CRC32_VX)
extern uint32_t crc32_s390_vx(uint32_t crc, const uint8_t *buf, size_t len);
#endif

/* compare256 */
typedef uint32_t (*compare256_func)(const uint8_t *src0, const uint8_t *src1);

extern uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
extern uint32_t compare256_unaligned_16(const uint8_t *src0, const uint8_t *src1);
#ifdef HAVE_BUILTIN_CTZ
extern uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1);
#endif
#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
extern uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1);
#endif
#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
#endif
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
extern uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
#endif
#ifdef POWER9
extern uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1);
#endif
#ifdef RISCV_RVV
extern uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1);
#endif

#ifdef DEFLATE_H_
/* insert_string */
extern void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count);
#ifdef X86_SSE42
extern void insert_string_sse42(deflate_state *const s, const uint32_t str, uint32_t count);
#elif defined(ARM_ACLE)
extern void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count);
#endif

/* longest_match */
extern uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
extern uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
#ifdef HAVE_BUILTIN_CTZ
extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
#endif
#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
#endif
#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match);
#endif
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
extern uint32_t longest_match_neon(deflate_state *const s, Pos cur_match);
#endif
#ifdef POWER9
extern uint32_t longest_match_power9(deflate_state *const s, Pos cur_match);
#endif
#ifdef RISCV_RVV
extern uint32_t longest_match_rvv(deflate_state *const s, Pos cur_match);
#endif

/* longest_match_slow */
extern uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
extern uint32_t longest_match_slow_unaligned_16(deflate_state *const s, Pos cur_match);
extern uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_match);
#ifdef UNALIGNED64_OK
extern uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match);
#endif
#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match);
#endif
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
extern uint32_t longest_match_slow_neon(deflate_state *const s, Pos cur_match);
#endif
#ifdef POWER9
extern uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match);
#endif
#ifdef RISCV_RVV
extern uint32_t longest_match_slow_rvv(deflate_state *const s, Pos cur_match);
#endif

/* quick_insert_string */
extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str);
#ifdef X86_SSE42
extern Pos quick_insert_string_sse42(deflate_state *const s, const uint32_t str);
#elif defined(ARM_ACLE)
extern Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str);
#endif

/* slide_hash */
typedef void (*slide_hash_func)(deflate_state *s);

#ifdef X86_SSE2
extern void slide_hash_sse2(deflate_state *s);
#endif
#if defined(ARM_SIMD)
extern void slide_hash_armv6(deflate_state *s);
#endif
#if defined(ARM_NEON)
extern void slide_hash_neon(deflate_state *s);
#endif
#if defined(PPC_VMX)
extern void slide_hash_vmx(deflate_state *s);
#endif
#if defined(POWER8_VSX)
extern void slide_hash_power8(deflate_state *s);
#endif
#if defined(RISCV_RVV)
extern void slide_hash_rvv(deflate_state *s);
#endif
#ifdef X86_AVX2
extern void slide_hash_avx2(deflate_state *s);
#endif

/* update_hash */
extern uint32_t update_hash_c(deflate_state *const s, uint32_t h, uint32_t val);
#ifdef X86_SSE42
extern uint32_t update_hash_sse42(deflate_state *const s, uint32_t h, uint32_t val);
#elif defined(ARM_ACLE)
extern uint32_t update_hash_acle(deflate_state *const s, uint32_t h, uint32_t val);
#endif
#endif

#endif
@ -0,0 +1,267 @@
/* crc32_braid.c -- compute the CRC-32 of a data stream
 * Copyright (C) 1995-2022 Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 * This interleaved implementation of a CRC makes use of pipelined multiple
 * arithmetic-logic units, commonly found in modern CPU cores. It is due to
 * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
 */

#include "zbuild.h"
#include "zutil.h"
#include "functable.h"
#include "crc32_braid_p.h"
#include "crc32_braid_tbl.h"

/* ========================================================================= */

const uint32_t * Z_EXPORT PREFIX(get_crc_table)(void) {
    return (const uint32_t *)crc_table;
}

#ifdef ZLIB_COMPAT
unsigned long Z_EXPORT PREFIX(crc32_z)(unsigned long crc, const unsigned char *buf, size_t len) {
    if (buf == NULL) return 0;

    return (unsigned long)functable.crc32((uint32_t)crc, buf, len);
}
#else
uint32_t Z_EXPORT PREFIX(crc32_z)(uint32_t crc, const unsigned char *buf, size_t len) {
    if (buf == NULL) return 0;

    return functable.crc32(crc, buf, len);
}
#endif

#ifdef ZLIB_COMPAT
unsigned long Z_EXPORT PREFIX(crc32)(unsigned long crc, const unsigned char *buf, unsigned int len) {
    return (unsigned long)PREFIX(crc32_z)((uint32_t)crc, buf, len);
}
#else
uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t len) {
    return PREFIX(crc32_z)(crc, buf, len);
}
#endif

/* ========================================================================= */

/*
  A CRC of a message is computed on N braids of words in the message, where
  each word consists of W bytes (4 or 8). If N is 3, for example, then three
  running sparse CRCs are calculated respectively on each braid, at these
  indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
  This is done starting at a word boundary, and continues until as many blocks
  of N * W bytes as are available have been processed. The results are combined
  into a single CRC at the end. For this code, N must be in the range 1..6 and
  W must be 4 or 8. The upper limit on N can be increased if desired by adding
  more #if blocks, extending the patterns apparent in the code. In addition,
  the crc32 tables would need to be regenerated if the maximum N value is
  increased.

  N and W are chosen empirically by benchmarking the execution time on a given
  processor. The choices for N and W below were based on testing on Intel Kaby
  Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
  Octeon II processors. The Intel, AMD, and ARM processors were all fastest
  with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
  They were all tested with either gcc or clang, all using the -O3 optimization
  level. Your mileage may vary.
 */
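/* Worked example (added commentary, an illustrative sketch rather than part
   of the upstream sources): with N = 5 and W = 8 each block covers
   N * W = 40 bytes, so a 4096-byte word-aligned buffer yields
   blks = 4096 / 40 = 102 blocks, leaving 16 bytes for the byte-at-a-time
   loops at the end of crc32_braid() below. Braid 0 sees words 0, 5, 10, ...,
   braid 1 sees words 1, 6, 11, ..., and so on; the five running CRCs only
   meet again in the final combination step. */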

/* ========================================================================= */

#if BYTE_ORDER == LITTLE_ENDIAN
#  define ZSWAPWORD(word) (word)
#  define BRAID_TABLE crc_braid_table
#elif BYTE_ORDER == BIG_ENDIAN
#  if W == 8
#    define ZSWAPWORD(word) ZSWAP64(word)
#  elif W == 4
#    define ZSWAPWORD(word) ZSWAP32(word)
#  endif
#  define BRAID_TABLE crc_braid_big_table
#else
#  error "No endian defined"
#endif
#define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8)
#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
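/* Clarifying note (added commentary): DO1 is one step of the classic
   byte-at-a-time, table-driven CRC (the Sarwate algorithm): fold the next
   input byte into the low byte of the running CRC, then look up the
   resulting 8-bit index. DO8 simply unrolls eight of those steps; both are
   used below for the bytes that fall outside the braided blocks. */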

/* ========================================================================= */
#ifdef W
/*
  Return the CRC of the W bytes in the word_t data, taking the
  least-significant byte of the word as the first byte of data, without any pre
  or post conditioning. This is used to combine the CRCs of each braid.
 */
#if BYTE_ORDER == LITTLE_ENDIAN
static uint32_t crc_word(z_word_t data) {
    int k;
    for (k = 0; k < W; k++)
        data = (data >> 8) ^ crc_table[data & 0xff];
    return (uint32_t)data;
}
#elif BYTE_ORDER == BIG_ENDIAN
static z_word_t crc_word(z_word_t data) {
    int k;
    for (k = 0; k < W; k++)
        data = (data << 8) ^
            crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
    return data;
}
#endif /* BYTE_ORDER */

#endif /* W */

/* ========================================================================= */
Z_INTERNAL uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len) {
    Z_REGISTER uint32_t c;

    /* Pre-condition the CRC */
    c = (~crc) & 0xffffffff;

#ifdef W
    /* If provided enough bytes, do a braided CRC calculation. */
    if (len >= N * W + W - 1) {
        size_t blks;
        z_word_t const *words;
        int k;

        /* Compute the CRC up to a z_word_t boundary. */
        while (len && ((uintptr_t)buf & (W - 1)) != 0) {
            len--;
            DO1;
        }

        /* Compute the CRC on as many N z_word_t blocks as are available. */
        blks = len / (N * W);
        len -= blks * N * W;
        words = (z_word_t const *)buf;

        z_word_t crc0, word0, comb;
#if N > 1
        z_word_t crc1, word1;
#if N > 2
        z_word_t crc2, word2;
#if N > 3
        z_word_t crc3, word3;
#if N > 4
        z_word_t crc4, word4;
#if N > 5
        z_word_t crc5, word5;
#endif
#endif
#endif
#endif
#endif
        /* Initialize the CRC for each braid. */
        crc0 = ZSWAPWORD(c);
#if N > 1
        crc1 = 0;
#if N > 2
        crc2 = 0;
#if N > 3
        crc3 = 0;
#if N > 4
        crc4 = 0;
#if N > 5
        crc5 = 0;
#endif
#endif
#endif
#endif
#endif
        /* Process the first blks-1 blocks, computing the CRCs on each braid independently. */
        while (--blks) {
            /* Load the word for each braid into registers. */
            word0 = crc0 ^ words[0];
#if N > 1
            word1 = crc1 ^ words[1];
#if N > 2
            word2 = crc2 ^ words[2];
#if N > 3
            word3 = crc3 ^ words[3];
#if N > 4
            word4 = crc4 ^ words[4];
#if N > 5
            word5 = crc5 ^ words[5];
#endif
#endif
#endif
#endif
#endif
            words += N;

            /* Compute and update the CRC for each word. The loop should get unrolled. */
            crc0 = BRAID_TABLE[0][word0 & 0xff];
#if N > 1
            crc1 = BRAID_TABLE[0][word1 & 0xff];
#if N > 2
            crc2 = BRAID_TABLE[0][word2 & 0xff];
#if N > 3
            crc3 = BRAID_TABLE[0][word3 & 0xff];
#if N > 4
            crc4 = BRAID_TABLE[0][word4 & 0xff];
#if N > 5
            crc5 = BRAID_TABLE[0][word5 & 0xff];
#endif
#endif
#endif
#endif
#endif
            for (k = 1; k < W; k++) {
                crc0 ^= BRAID_TABLE[k][(word0 >> (k << 3)) & 0xff];
#if N > 1
                crc1 ^= BRAID_TABLE[k][(word1 >> (k << 3)) & 0xff];
#if N > 2
                crc2 ^= BRAID_TABLE[k][(word2 >> (k << 3)) & 0xff];
#if N > 3
                crc3 ^= BRAID_TABLE[k][(word3 >> (k << 3)) & 0xff];
#if N > 4
                crc4 ^= BRAID_TABLE[k][(word4 >> (k << 3)) & 0xff];
#if N > 5
                crc5 ^= BRAID_TABLE[k][(word5 >> (k << 3)) & 0xff];
#endif
#endif
#endif
#endif
#endif
            }
        }

        /* Process the last block, combining the CRCs of the N braids at the same time. */
        comb = crc_word(crc0 ^ words[0]);
#if N > 1
        comb = crc_word(crc1 ^ words[1] ^ comb);
#if N > 2
        comb = crc_word(crc2 ^ words[2] ^ comb);
#if N > 3
        comb = crc_word(crc3 ^ words[3] ^ comb);
#if N > 4
        comb = crc_word(crc4 ^ words[4] ^ comb);
#if N > 5
        comb = crc_word(crc5 ^ words[5] ^ comb);
#endif
#endif
#endif
#endif
#endif
        words += N;
        c = ZSWAPWORD(comb);

        /* Update the pointer to the remaining bytes to process. */
        buf = (const unsigned char *)words;
    }

#endif /* W */

    /* Complete the computation of the CRC on any remaining bytes. */
    while (len >= 8) {
        len -= 8;
        DO8;
    }
    while (len) {
        len--;
        DO1;
    }

    /* Return the CRC, post-conditioned. */
    return c ^ 0xffffffff;
}
@ -0,0 +1,57 @@
/* crc32_braid_comb.c -- compute the CRC-32 of a data stream
 * Copyright (C) 1995-2022 Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 * This interleaved implementation of a CRC makes use of pipelined multiple
 * arithmetic-logic units, commonly found in modern CPU cores. It is due to
 * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
 */

#include "zbuild.h"
#include "zutil.h"
#include "crc32_braid_p.h"
#include "crc32_braid_tbl.h"
#include "crc32_braid_comb_p.h"

/* ========================================================================= */
static uint32_t crc32_combine_(uint32_t crc1, uint32_t crc2, z_off64_t len2) {
    return multmodp(x2nmodp(len2, 3), crc1) ^ crc2;
}
static uint32_t crc32_combine_gen_(z_off64_t len2) {
    return x2nmodp(len2, 3);
}
static uint32_t crc32_combine_op_(uint32_t crc1, uint32_t crc2, const uint32_t op) {
    return multmodp(op, crc1) ^ crc2;
}
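/* Usage sketch (added commentary, not part of the upstream sources): the
   combine functions let two independently computed CRCs be merged as if the
   buffers had been hashed in one pass. Given buffers a (len_a bytes) and
   b (len_b bytes):

       uint32_t crc_a  = crc32(0, a, len_a);
       uint32_t crc_b  = crc32(0, b, len_b);
       uint32_t crc_ab = crc32_combine(crc_a, crc_b, len_b);

   crc_ab then equals crc32(crc32(0, a, len_a), b, len_b). crc32_combine_gen
   and crc32_combine_op split the work so the len2-dependent operator can be
   computed once and reused across many combines of same-sized pieces. */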

/* ========================================================================= */

#ifdef ZLIB_COMPAT
unsigned long Z_EXPORT PREFIX(crc32_combine)(unsigned long crc1, unsigned long crc2, z_off_t len2) {
    return (unsigned long)crc32_combine_((uint32_t)crc1, (uint32_t)crc2, len2);
}
unsigned long Z_EXPORT PREFIX4(crc32_combine)(unsigned long crc1, unsigned long crc2, z_off64_t len2) {
    return (unsigned long)crc32_combine_((uint32_t)crc1, (uint32_t)crc2, len2);
}
unsigned long Z_EXPORT PREFIX(crc32_combine_gen)(z_off_t len2) {
    return crc32_combine_gen_(len2);
}
unsigned long Z_EXPORT PREFIX4(crc32_combine_gen)(z_off64_t len2) {
    return crc32_combine_gen_(len2);
}
unsigned long Z_EXPORT PREFIX(crc32_combine_op)(unsigned long crc1, unsigned long crc2, const unsigned long op) {
    return (unsigned long)crc32_combine_op_((uint32_t)crc1, (uint32_t)crc2, (uint32_t)op);
}
#else
uint32_t Z_EXPORT PREFIX4(crc32_combine)(uint32_t crc1, uint32_t crc2, z_off64_t len2) {
    return crc32_combine_(crc1, crc2, len2);
}
uint32_t Z_EXPORT PREFIX(crc32_combine_gen)(z_off64_t len2) {
    return crc32_combine_gen_(len2);
}
uint32_t Z_EXPORT PREFIX(crc32_combine_op)(uint32_t crc1, uint32_t crc2, const uint32_t op) {
    return crc32_combine_op_(crc1, crc2, op);
}
#endif

/* ========================================================================= */
@ -0,0 +1,42 @@
#ifndef CRC32_BRAID_COMB_P_H_
#define CRC32_BRAID_COMB_P_H_

/*
  Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
  reflected. For speed, this requires that a not be zero.
 */
static uint32_t multmodp(uint32_t a, uint32_t b) {
    uint32_t m, p;

    m = (uint32_t)1 << 31;
    p = 0;
    for (;;) {
        if (a & m) {
            p ^= b;
            if ((a & (m - 1)) == 0)
                break;
        }
        m >>= 1;
        b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
    }
    return p;
}
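/* Clarifying note (added commentary): this is schoolbook carry-less
   multiplication in GF(2)[x] modulo p(x), in the bit-reflected representation:
   the loop walks a from its high bit down, while b is repeatedly "multiplied
   by x" (a right shift here, folding in POLY on carry-out). In
   crc32_combine_() above, multmodp(x2nmodp(len2, 3), crc1) multiplies crc1 by
   x^(8*len2), i.e. it advances crc1 past len2 bytes of zeros; the constant 3
   is log2(8) for the 8 bits per byte. */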

/*
  Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
  initialized.
 */
static uint32_t x2nmodp(z_off64_t n, unsigned k) {
    uint32_t p;

    p = (uint32_t)1 << 31;  /* x^0 == 1 */
    while (n) {
        if (n & 1)
            p = multmodp(x2n_table[k & 31], p);
        n >>= 1;
        k++;
    }
    return p;
}

#endif /* CRC32_BRAID_COMB_P_H_ */
@ -0,0 +1,50 @@
#ifndef CRC32_BRAID_P_H_
#define CRC32_BRAID_P_H_

#include "zbuild.h"
#include "zendian.h"

/* Define N */
#ifdef Z_TESTN
#  define N Z_TESTN
#else
#  define N 5
#endif
#if N < 1 || N > 6
#  error N must be in 1..6
#endif

/*
  Define W and the associated z_word_t type. If W is not defined, then a
  braided calculation is not used, and the associated tables and code are not
  compiled.
 */
#ifdef Z_TESTW
#  if Z_TESTW-1 != -1
#    define W Z_TESTW
#  endif
#else
#  ifndef W
#    if defined(__x86_64__) || defined(__aarch64__) || defined(__powerpc64__)
#      define W 8
#    else
#      define W 4
#    endif
#  endif
#endif
#ifdef W
#  if W == 8
typedef uint64_t z_word_t;
#  else
#    undef W
#    define W 4
typedef uint32_t z_word_t;
#  endif
#endif
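/* Clarifying note (added commentary): on the 64-bit targets matched above
   (x86_64, AArch64, ppc64) W is 8 and z_word_t is uint64_t, so each braid
   consumes 8 bytes per step and one block covers N * W = 40 bytes; other
   targets fall back to 4-byte words. Defining Z_TESTW as 0 makes the
   Z_TESTW-1 != -1 check fail and leaves W undefined, which disables the
   braided path entirely and keeps only the byte-at-a-time loops in
   crc32_braid.c. */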

/* CRC polynomial. */
#define POLY 0xedb88320  /* p(x) reflected, with x^32 implied */

extern uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);

#endif /* CRC32_BRAID_P_H_ */
File diff suppressed because it is too large
@ -0,0 +1,33 @@
/* crc32_fold.c -- crc32 folding interface
 * Copyright (C) 2021 Nathan Moinvaziri
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#include "zbuild.h"
#include "functable.h"

#include "crc32_fold.h"

#include <limits.h>

Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) {
    crc->value = CRC32_INITIAL_VALUE;
    return crc->value;
}

Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
    crc->value = functable.crc32(crc->value, src, len);
    memcpy(dst, src, len);
}

Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
    /* Note: while this is basically the same thing as the vanilla CRC function, we still need
     * a functable entry for it so that we can generically dispatch to this function with the
     * same arguments for the versions that _do_ do a folding CRC but we don't want a copy. The
     * init_crc is an unused argument in this context */
    Z_UNUSED(init_crc);
    crc->value = functable.crc32(crc->value, src, len);
}

Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc) {
    return crc->value;
}
@ -0,0 +1,21 @@
/* crc32_fold.h -- crc32 folding interface
 * Copyright (C) 2021 Nathan Moinvaziri
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#ifndef CRC32_FOLD_H_
#define CRC32_FOLD_H_

#define CRC32_FOLD_BUFFER_SIZE (16 * 4)
/* sizeof(__m128i) * (4 folds) */

typedef struct crc32_fold_s {
    uint8_t fold[CRC32_FOLD_BUFFER_SIZE];
    uint32_t value;
} crc32_fold;

Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc);
Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc);

#endif
File diff suppressed because it is too large
@ -0,0 +1,408 @@
#ifndef DEFLATE_H_
#define DEFLATE_H_
/* deflate.h -- internal compression state
 * Copyright (C) 1995-2016 Jean-loup Gailly
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

/* WARNING: this file should *not* be used by applications. It is
   part of the implementation of the compression library and is
   subject to change. Applications should only use zlib.h.
 */

#include "zutil.h"
#include "zendian.h"
#include "adler32_fold.h"
#include "crc32_fold.h"

/* define NO_GZIP when compiling if you want to disable gzip header and
   trailer creation by deflate(). NO_GZIP would be used to avoid linking in
   the crc code when it is not needed. For shared libraries, gzip encoding
   should be left enabled. */
#ifndef NO_GZIP
#  define GZIP
#endif

/* ===========================================================================
 * Internal compression state.
 */

#define LENGTH_CODES 29
/* number of length codes, not counting the special END_BLOCK code */

#define LITERALS 256
/* number of literal bytes 0..255 */

#define L_CODES (LITERALS+1+LENGTH_CODES)
/* number of Literal or Length codes, including the END_BLOCK code */

#define D_CODES 30
/* number of distance codes */

#define BL_CODES 19
/* number of codes used to transfer the bit lengths */

#define HEAP_SIZE (2*L_CODES+1)
/* maximum heap size */

#define BIT_BUF_SIZE 64
/* size of bit buffer in bi_buf */

#define END_BLOCK 256
/* end of block literal code */

#define INIT_STATE      1  /* zlib header -> BUSY_STATE */
#ifdef GZIP
#  define GZIP_STATE    4  /* gzip header -> BUSY_STATE | EXTRA_STATE */
#  define EXTRA_STATE   5  /* gzip extra block -> NAME_STATE */
#  define NAME_STATE    6  /* gzip file name -> COMMENT_STATE */
#  define COMMENT_STATE 7  /* gzip comment -> HCRC_STATE */
#  define HCRC_STATE    8  /* gzip header CRC -> BUSY_STATE */
#endif
#define BUSY_STATE      2  /* deflate -> FINISH_STATE */
#define FINISH_STATE    3  /* stream complete */
#ifdef GZIP
#  define MAX_STATE HCRC_STATE
#else
#  define MAX_STATE FINISH_STATE
#endif
/* Stream status */

#define HASH_BITS 16u              /* log2(HASH_SIZE) */
#ifndef HASH_SIZE
#  define HASH_SIZE 65536u         /* number of elements in hash table */
#endif
#define HASH_MASK (HASH_SIZE - 1u) /* HASH_SIZE-1 */


/* Data structure describing a single value and its code string. */
typedef struct ct_data_s {
    union {
        uint16_t freq;  /* frequency count */
        uint16_t code;  /* bit string */
    } fc;
    union {
        uint16_t dad;   /* father node in Huffman tree */
        uint16_t len;   /* length of bit string */
    } dl;
} ct_data;

#define Freq fc.freq
#define Code fc.code
#define Dad  dl.dad
#define Len  dl.len

typedef struct static_tree_desc_s static_tree_desc;

typedef struct tree_desc_s {
    ct_data *dyn_tree;                  /* the dynamic tree */
    int max_code;                       /* largest code with non zero frequency */
    const static_tree_desc *stat_desc;  /* the corresponding static tree */
} tree_desc;

typedef uint16_t Pos;

/* A Pos is an index in the character window. We use short instead of int to
 * save space in the various tables.
 */
/* Type definitions for hash callbacks */
typedef struct internal_state deflate_state;

typedef uint32_t (* update_hash_cb)        (deflate_state *const s, uint32_t h, uint32_t val);
typedef void     (* insert_string_cb)      (deflate_state *const s, uint32_t str, uint32_t count);
typedef Pos      (* quick_insert_string_cb)(deflate_state *const s, uint32_t str);

struct internal_state {
    PREFIX3(stream) *strm;            /* pointer back to this zlib stream */
    unsigned char *pending_buf;       /* output still pending */
    unsigned char *pending_out;       /* next pending byte to output to the stream */
    uint32_t pending_buf_size;        /* size of pending_buf */
    uint32_t pending;                 /* nb of bytes in the pending buffer */
    int wrap;                         /* bit 0 true for zlib, bit 1 true for gzip */
    uint32_t gzindex;                 /* where in extra, name, or comment */
    PREFIX(gz_headerp) gzhead;        /* gzip header information to write */
    int status;                       /* as the name implies */
    int last_flush;                   /* value of flush param for previous deflate call */
    int reproducible;                 /* Whether reproducible compression results are required. */

    int block_open;
    /* Whether or not a block is currently open for the QUICK deflation scheme.
     * This is set to 1 if there is an active block, or 0 if the block was just closed.
     */

    /* used by deflate.c: */

    unsigned int w_size;              /* LZ77 window size (32K by default) */
    unsigned int w_bits;              /* log2(w_size)  (8..16) */
    unsigned int w_mask;              /* w_size - 1 */
    unsigned int lookahead;           /* number of valid bytes ahead in window */

    unsigned int high_water;
    /* High water mark offset in window for initialized bytes -- bytes above
     * this are set to zero in order to avoid memory check warnings when
     * longest match routines access bytes past the input. This is then
     * updated to the new high water mark.
     */

    unsigned int window_size;
    /* Actual size of window: 2*wSize, except when the user input buffer
     * is directly used as sliding window.
     */

    unsigned char *window;
    /* Sliding window. Input bytes are read into the second half of the window,
     * and move to the first half later to keep a dictionary of at least wSize
     * bytes. With this organization, matches are limited to a distance of
     * wSize-STD_MAX_MATCH bytes, but this ensures that IO is always
     * performed with a length multiple of the block size. Also, it limits
     * the window size to 64K, which is quite useful on MSDOS.
     * To do: use the user input buffer as sliding window.
     */

    Pos *prev;
    /* Link to older string with same hash index. To limit the size of this
     * array to 64K, this link is maintained only for the last 32K strings.
     * An index in this array is thus a window index modulo 32K.
     */

    Pos *head; /* Heads of the hash chains or 0. */

    uint32_t ins_h; /* hash index of string to be inserted */

    int block_start;
    /* Window position at the beginning of the current output block. Gets
     * negative when the window is moved backwards.
     */

    unsigned int match_length;  /* length of best match */
    Pos prev_match;             /* previous match */
    int match_available;        /* set if previous match exists */
    unsigned int strstart;      /* start of string to insert */
    unsigned int match_start;   /* start of matching string */

    unsigned int prev_length;
    /* Length of the best match at previous step. Matches not greater than this
     * are discarded. This is used in the lazy match evaluation.
     */

    unsigned int max_chain_length;
    /* To speed up deflation, hash chains are never searched beyond this length.
     * A higher limit improves compression ratio but degrades the speed.
     */

    unsigned int max_lazy_match;
    /* Attempt to find a better match only when the current match is strictly smaller
     * than this value. This mechanism is used only for compression levels >= 4.
     */
#   define max_insert_length max_lazy_match
    /* Insert new strings in the hash table only if the match length is not
     * greater than this length. This saves time but degrades compression.
     * max_insert_length is used only for compression levels <= 3.
     */

    update_hash_cb update_hash;
    insert_string_cb insert_string;
    quick_insert_string_cb quick_insert_string;
    /* Hash function callbacks that can be configured depending on the deflate
     * algorithm being used */

    int level;    /* compression level (1..9) */
    int strategy; /* favor or force Huffman coding */

    unsigned int good_match;
    /* Use a faster search when the previous match is longer than this */

    int nice_match; /* Stop searching when current match exceeds this */

    struct crc32_fold_s ALIGNED_(16) crc_fold;

    /* used by trees.c: */
    /* Didn't use ct_data typedef below to suppress compiler warning */
    struct ct_data_s dyn_ltree[HEAP_SIZE];   /* literal and length tree */
    struct ct_data_s dyn_dtree[2*D_CODES+1]; /* distance tree */
    struct ct_data_s bl_tree[2*BL_CODES+1];  /* Huffman tree for bit lengths */

    struct tree_desc_s l_desc;  /* desc. for literal tree */
    struct tree_desc_s d_desc;  /* desc. for distance tree */
    struct tree_desc_s bl_desc; /* desc. for bit length tree */

    uint16_t bl_count[MAX_BITS+1];
    /* number of codes at each bit length for an optimal tree */

    int heap[2*L_CODES+1]; /* heap used to build the Huffman trees */
    int heap_len;          /* number of elements in the heap */
    int heap_max;          /* element of largest frequency */
    /* The sons of heap[n] are heap[2*n] and heap[2*n+1]. heap[0] is not used.
     * The same heap array is used to build all trees.
     */

    unsigned char depth[2*L_CODES+1];
    /* Depth of each subtree used as tie breaker for trees of equal frequency
     */

    unsigned int lit_bufsize;
    /* Size of match buffer for literals/lengths. There are 4 reasons for
     * limiting lit_bufsize to 64K:
     *   - frequencies can be kept in 16 bit counters
     *   - if compression is not successful for the first block, all input
     *     data is still in the window so we can still emit a stored block even
     *     when input comes from standard input. (This can also be done for
     *     all blocks if lit_bufsize is not greater than 32K.)
     *   - if compression is not successful for a file smaller than 64K, we can
     *     even emit a stored file instead of a stored block (saving 5 bytes).
     *     This is applicable only for zip (not gzip or zlib).
     *   - creating new Huffman trees less frequently may not provide fast
     *     adaptation to changes in the input data statistics. (Take for
     *     example a binary file with poorly compressible code followed by
     *     a highly compressible string table.) Smaller buffer sizes give
     *     fast adaptation but have of course the overhead of transmitting
     *     trees more frequently.
     *   - I can't count above 4
     */

    unsigned char *sym_buf; /* buffer for distances and literals/lengths */
    unsigned int sym_next;  /* running index in sym_buf */
    unsigned int sym_end;   /* symbol table full when sym_next reaches this */

    unsigned long opt_len;    /* bit length of current block with optimal trees */
    unsigned long static_len; /* bit length of current block with static trees */
    unsigned int matches;     /* number of string matches in current block */
    unsigned int insert;      /* bytes at end of window left to insert */

    /* compressed_len and bits_sent are only used if ZLIB_DEBUG is defined */
    unsigned long compressed_len; /* total bit length of compressed file mod 2^32 */
    unsigned long bits_sent;      /* bit length of compressed data sent mod 2^32 */

    /* Reserved for future use and alignment purposes */
    char *reserved_p;

    uint64_t bi_buf;
    /* Output buffer. bits are inserted starting at the bottom (least significant bits). */

    int32_t bi_valid;
    /* Number of valid bits in bi_buf. All bits above the last valid bit are always zero. */

    /* Reserved for future use and alignment purposes */
    int32_t reserved[11];
} ALIGNED_(8);

typedef enum {
    need_more,      /* block not completed, need more input or more output */
    block_done,     /* block flush performed */
    finish_started, /* finish started, need only more output at next deflate */
    finish_done     /* finish done, accept no more input or output */
} block_state;

/* Output a byte on the stream.
 * IN assertion: there is enough room in pending_buf.
 */
#define put_byte(s, c) { \
    s->pending_buf[s->pending++] = (unsigned char)(c); \
}
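/* Clarifying note (added commentary): the put_short/put_uint32/put_uint64
   helpers below write through memcpy plus an explicit byte swap on
   mismatched-endian hosts, rather than shifting bytes out one at a time.
   memcpy keeps the stores legal on strict-alignment targets (compilers turn
   it into a single unaligned store where that is allowed), and the ZSWAP*
   macros keep the byte order on the wire independent of the host. */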

/* ===========================================================================
 * Output a short LSB first on the stream.
 * IN assertion: there is enough room in pending_buf.
 */
static inline void put_short(deflate_state *s, uint16_t w) {
#if BYTE_ORDER == BIG_ENDIAN
    w = ZSWAP16(w);
#endif
    memcpy(&s->pending_buf[s->pending], &w, sizeof(w));
    s->pending += 2;
}

/* ===========================================================================
 * Output a short MSB first on the stream.
 * IN assertion: there is enough room in pending_buf.
 */
static inline void put_short_msb(deflate_state *s, uint16_t w) {
#if BYTE_ORDER == LITTLE_ENDIAN
    w = ZSWAP16(w);
#endif
    memcpy(&s->pending_buf[s->pending], &w, sizeof(w));
    s->pending += 2;
}

/* ===========================================================================
 * Output a 32-bit unsigned int LSB first on the stream.
 * IN assertion: there is enough room in pending_buf.
 */
static inline void put_uint32(deflate_state *s, uint32_t dw) {
#if BYTE_ORDER == BIG_ENDIAN
    dw = ZSWAP32(dw);
#endif
    memcpy(&s->pending_buf[s->pending], &dw, sizeof(dw));
    s->pending += 4;
}

/* ===========================================================================
 * Output a 32-bit unsigned int MSB first on the stream.
 * IN assertion: there is enough room in pending_buf.
 */
static inline void put_uint32_msb(deflate_state *s, uint32_t dw) {
#if BYTE_ORDER == LITTLE_ENDIAN
    dw = ZSWAP32(dw);
#endif
    memcpy(&s->pending_buf[s->pending], &dw, sizeof(dw));
    s->pending += 4;
}

/* ===========================================================================
 * Output a 64-bit unsigned int LSB first on the stream.
 * IN assertion: there is enough room in pending_buf.
 */
static inline void put_uint64(deflate_state *s, uint64_t lld) {
#if BYTE_ORDER == BIG_ENDIAN
    lld = ZSWAP64(lld);
#endif
    memcpy(&s->pending_buf[s->pending], &lld, sizeof(lld));
    s->pending += 8;
}

#define MIN_LOOKAHEAD (STD_MAX_MATCH + STD_MIN_MATCH + 1)
/* Minimum amount of lookahead, except at the end of the input file.
 * See deflate.c for comments about the STD_MIN_MATCH+1.
 */

#define MAX_DIST(s) ((s)->w_size - MIN_LOOKAHEAD)
/* In order to simplify the code, particularly on 16 bit machines, match
 * distances are limited to MAX_DIST instead of WSIZE.
 */
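/* Worked numbers (added commentary, assuming zlib's usual constants
   STD_MAX_MATCH = 258 and STD_MIN_MATCH = 3): MIN_LOOKAHEAD is
   258 + 3 + 1 = 262, so with the default 32 KiB window (w_size = 32768)
   MAX_DIST(s) is 32768 - 262 = 32506. */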

#define WIN_INIT STD_MAX_MATCH
/* Number of bytes after end of data in window to initialize in order to avoid
   memory checker errors from longest match routines */


void Z_INTERNAL PREFIX(fill_window)(deflate_state *s);
void Z_INTERNAL slide_hash_c(deflate_state *s);

/* in trees.c */
void Z_INTERNAL zng_tr_init(deflate_state *s);
void Z_INTERNAL zng_tr_flush_block(deflate_state *s, char *buf, uint32_t stored_len, int last);
void Z_INTERNAL zng_tr_flush_bits(deflate_state *s);
void Z_INTERNAL zng_tr_align(deflate_state *s);
void Z_INTERNAL zng_tr_stored_block(deflate_state *s, char *buf, uint32_t stored_len, int last);
uint16_t Z_INTERNAL PREFIX(bi_reverse)(unsigned code, int len);
void Z_INTERNAL PREFIX(flush_pending)(PREFIX3(streamp) strm);
#define d_code(dist) ((dist) < 256 ? zng_dist_code[dist] : zng_dist_code[256+((dist)>>7)])
/* Mapping from a distance to a distance code. dist is the distance - 1 and
 * must not have side effects. zng_dist_code[256] and zng_dist_code[257] are never
 * used.
 */
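/* Example (added commentary): for an actual match distance of 300 the macro
   is invoked with dist = 299; since 299 >= 256 it indexes
   zng_dist_code[256 + (299 >> 7)] = zng_dist_code[258], the entry for the
   128-byte bucket containing that distance. Distances 1..256 (dist 0..255)
   are looked up directly. */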

/* Bit buffer and compress bits calculation debugging */
#ifdef ZLIB_DEBUG
#  define cmpr_bits_add(s, len)  s->compressed_len += (len)
#  define cmpr_bits_align(s)     s->compressed_len = (s->compressed_len + 7) & ~7L
#  define sent_bits_add(s, bits) s->bits_sent += (bits)
#  define sent_bits_align(s)     s->bits_sent = (s->bits_sent + 7) & ~7L
#else
#  define cmpr_bits_add(s, len)  Z_UNUSED(len)
#  define cmpr_bits_align(s)
#  define sent_bits_add(s, bits) Z_UNUSED(bits)
#  define sent_bits_align(s)
#endif

#endif /* DEFLATE_H_ */
@ -0,0 +1,102 @@
/* deflate_fast.c -- compress data using the fast strategy of deflation algorithm
 *
 * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"

/* ===========================================================================
 * Compress as much as possible from the input stream, return the current
 * block state.
 * This function does not perform lazy evaluation of matches and inserts
 * new strings in the dictionary only for unmatched strings or for short
 * matches. It is used only for the fast compression options.
 */
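/* Clarifying note (added commentary): zlib-ng distinguishes STD_MIN_MATCH,
   the shortest match the deflate format can encode (3 bytes), from
   WANT_MIN_MATCH, the shortest match this implementation considers worth
   emitting. Matches below WANT_MIN_MATCH fall through to the literal path
   below, which tends to be faster and rarely hurts the compression ratio. */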
Z_INTERNAL block_state deflate_fast(deflate_state *s, int flush) {
    Pos hash_head;          /* head of the hash chain */
    int bflush = 0;         /* set if current block must be flushed */
    int64_t dist;
    uint32_t match_len = 0;

    for (;;) {
        /* Make sure that we always have enough lookahead, except
         * at the end of the input file. We need STD_MAX_MATCH bytes
         * for the next match, plus WANT_MIN_MATCH bytes to insert the
         * string following the next match.
         */
        if (s->lookahead < MIN_LOOKAHEAD) {
            PREFIX(fill_window)(s);
            if (UNLIKELY(s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH)) {
                return need_more;
            }
            if (UNLIKELY(s->lookahead == 0))
                break; /* flush the current block */
        }

        /* Insert the string window[strstart .. strstart+2] in the
         * dictionary, and set hash_head to the head of the hash chain:
         */
        if (s->lookahead >= WANT_MIN_MATCH) {
            hash_head = functable.quick_insert_string(s, s->strstart);
            dist = (int64_t)s->strstart - hash_head;

            /* Find the longest match, discarding those <= prev_length.
             * At this point we have always match length < WANT_MIN_MATCH
             */
            if (dist <= MAX_DIST(s) && dist > 0 && hash_head != 0) {
                /* To simplify the code, we prevent matches with the string
                 * of window index 0 (in particular we have to avoid a match
                 * of the string with itself at the start of the input file).
                 */
                match_len = functable.longest_match(s, hash_head);
                /* longest_match() sets match_start */
            }
        }

        if (match_len >= WANT_MIN_MATCH) {
            check_match(s, s->strstart, s->match_start, match_len);

            bflush = zng_tr_tally_dist(s, s->strstart - s->match_start, match_len - STD_MIN_MATCH);

            s->lookahead -= match_len;

            /* Insert new strings in the hash table only if the match length
             * is not too large. This saves time but degrades compression.
             */
            if (match_len <= s->max_insert_length && s->lookahead >= WANT_MIN_MATCH) {
                match_len--; /* string at strstart already in table */
                s->strstart++;

                functable.insert_string(s, s->strstart, match_len);
                s->strstart += match_len;
            } else {
                s->strstart += match_len;
                functable.quick_insert_string(s, s->strstart + 2 - STD_MIN_MATCH);

                /* If lookahead < STD_MIN_MATCH, ins_h is garbage, but it does not
                 * matter since it will be recomputed at next deflate call.
                 */
            }
            match_len = 0;
        } else {
            /* No match, output a literal byte */
            bflush = zng_tr_tally_lit(s, s->window[s->strstart]);
            s->lookahead--;
            s->strstart++;
        }
        if (UNLIKELY(bflush))
            FLUSH_BLOCK(s, 0);
    }
    s->insert = s->strstart < (STD_MIN_MATCH - 1) ? s->strstart : (STD_MIN_MATCH - 1);
    if (UNLIKELY(flush == Z_FINISH)) {
        FLUSH_BLOCK(s, 1);
        return finish_done;
    }
    if (UNLIKELY(s->sym_next))
        FLUSH_BLOCK(s, 0);
    return block_done;
}
@ -0,0 +1,45 @@
/* deflate_huff.c -- compress data using huffman encoding only strategy
 *
 * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"

/* ===========================================================================
 * For Z_HUFFMAN_ONLY, do not look for matches. Do not maintain a hash table.
 * (It will be regenerated if this run of deflate switches away from Huffman.)
 */
Z_INTERNAL block_state deflate_huff(deflate_state *s, int flush) {
    int bflush = 0; /* set if current block must be flushed */

    for (;;) {
        /* Make sure that we have a literal to write. */
        if (s->lookahead == 0) {
            PREFIX(fill_window)(s);
            if (s->lookahead == 0) {
                if (flush == Z_NO_FLUSH)
                    return need_more;
                break; /* flush the current block */
            }
        }

        /* Output a literal byte */
        bflush = zng_tr_tally_lit(s, s->window[s->strstart]);
        s->lookahead--;
        s->strstart++;
        if (bflush)
            FLUSH_BLOCK(s, 0);
    }
    s->insert = 0;
    if (flush == Z_FINISH) {
        FLUSH_BLOCK(s, 1);
        return finish_done;
    }
    if (s->sym_next)
        FLUSH_BLOCK(s, 0);
    return block_done;
}
@ -0,0 +1,293 @@
/* deflate_medium.c -- The deflate_medium deflate strategy
 *
 * Copyright (C) 2013 Intel Corporation. All rights reserved.
 * Authors:
 *  Arjan van de Ven <arjan@linux.intel.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#ifndef NO_MEDIUM_STRATEGY
#include "zbuild.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"

struct match {
    uint16_t match_start;
    uint16_t match_length;
    uint16_t strstart;
    uint16_t orgstart;
};

static int emit_match(deflate_state *s, struct match match) {
    int bflush = 0;

    /* Matches that are not long enough need to be emitted as literals. */
    if (match.match_length < WANT_MIN_MATCH) {
        while (match.match_length) {
            bflush += zng_tr_tally_lit(s, s->window[match.strstart]);
            s->lookahead--;
            match.strstart++;
            match.match_length--;
        }
        return bflush;
    }

    check_match(s, match.strstart, match.match_start, match.match_length);

    bflush += zng_tr_tally_dist(s, match.strstart - match.match_start, match.match_length - STD_MIN_MATCH);

    s->lookahead -= match.match_length;
    return bflush;
}

static void insert_match(deflate_state *s, struct match match) {
    if (UNLIKELY(s->lookahead <= (unsigned int)(match.match_length + WANT_MIN_MATCH)))
        return;

    /* Matches that are not long enough need to be emitted as literals. */
    if (LIKELY(match.match_length < WANT_MIN_MATCH)) {
        match.strstart++;
        match.match_length--;
        if (UNLIKELY(match.match_length > 0)) {
            if (match.strstart >= match.orgstart) {
                if (match.strstart + match.match_length - 1 >= match.orgstart) {
                    functable.insert_string(s, match.strstart, match.match_length);
                } else {
                    functable.insert_string(s, match.strstart, match.orgstart - match.strstart + 1);
                }
                match.strstart += match.match_length;
                match.match_length = 0;
            }
        }
        return;
    }

    /* Insert new strings in the hash table only if the match length
     * is not too large. This saves time but degrades compression.
     */
    if (match.match_length <= 16 * s->max_insert_length && s->lookahead >= WANT_MIN_MATCH) {
        match.match_length--; /* string at strstart already in table */
        match.strstart++;

        if (LIKELY(match.strstart >= match.orgstart)) {
            if (LIKELY(match.strstart + match.match_length - 1 >= match.orgstart)) {
                functable.insert_string(s, match.strstart, match.match_length);
            } else {
                functable.insert_string(s, match.strstart, match.orgstart - match.strstart + 1);
            }
        } else if (match.orgstart < match.strstart + match.match_length) {
            functable.insert_string(s, match.orgstart, match.strstart + match.match_length - match.orgstart);
        }
        match.strstart += match.match_length;
        match.match_length = 0;
    } else {
        match.strstart += match.match_length;
        match.match_length = 0;

        if (match.strstart >= (STD_MIN_MATCH - 2))
            functable.quick_insert_string(s, match.strstart + 2 - STD_MIN_MATCH);

        /* If lookahead < WANT_MIN_MATCH, ins_h is garbage, but it does not
         * matter since it will be recomputed at next deflate call.
         */
    }
}

static void fizzle_matches(deflate_state *s, struct match *current, struct match *next) {
    Pos limit;
    unsigned char *match, *orig;
    int changed = 0;
    struct match c, n;
    /* step zero: sanity checks */

    if (current->match_length <= 1)
        return;

    if (UNLIKELY(current->match_length > 1 + next->match_start))
        return;

    if (UNLIKELY(current->match_length > 1 + next->strstart))
        return;

    match = s->window - current->match_length + 1 + next->match_start;
    orig  = s->window - current->match_length + 1 + next->strstart;

    /* quick exit check.. if this fails then don't bother with anything else */
    if (LIKELY(*match != *orig))
        return;

    c = *current;
    n = *next;

    /* step one: try to move the "next" match to the left as much as possible */
    limit = next->strstart > MAX_DIST(s) ? next->strstart - (Pos)MAX_DIST(s) : 0;

    match = s->window + n.match_start - 1;
    orig = s->window + n.strstart - 1;

    while (*match == *orig) {
        if (UNLIKELY(c.match_length < 1))
            break;
        if (UNLIKELY(n.strstart <= limit))
            break;
        if (UNLIKELY(n.match_length >= 256))
            break;
        if (UNLIKELY(n.match_start <= 1))
            break;

        n.strstart--;
        n.match_start--;
        n.match_length++;
        c.match_length--;
        match--;
        orig--;
        changed++;
    }

    if (!changed)
        return;

    if (c.match_length <= 1 && n.match_length != 2) {
        n.orgstart++;
        *current = c;
        *next = n;
    } else {
        return;
    }
}
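/* Clarifying note (added commentary): fizzle_matches trades bytes between a
   pair of adjacent matches. Working on local copies c and n, it extends the
   next match leftward one byte at a time (shrinking the current match by the
   same amount) for as long as the byte preceding next's source also matches.
   The trade is committed only when it reduced the current match to a bare
   literal (length <= 1) without leaving next at the awkward length of 2;
   otherwise the copies are discarded and both matches stay as found. */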
||||
|
||||
Z_INTERNAL block_state deflate_medium(deflate_state *s, int flush) { |
||||
/* Align the first struct to start on a new cacheline, this allows us to fit both structs in one cacheline */ |
||||
ALIGNED_(16) struct match current_match; |
||||
struct match next_match; |
||||
|
||||
/* For levels below 5, don't check the next position for a better match */ |
||||
int early_exit = s->level < 5; |
||||
|
||||
memset(¤t_match, 0, sizeof(struct match)); |
||||
memset(&next_match, 0, sizeof(struct match)); |
||||
|
||||
for (;;) { |
||||
Pos hash_head = 0; /* head of the hash chain */ |
||||
int bflush = 0; /* set if current block must be flushed */ |
||||
int64_t dist; |
||||
|
||||
/* Make sure that we always have enough lookahead, except
|
||||
* at the end of the input file. We need STD_MAX_MATCH bytes |
||||
* for the next match, plus WANT_MIN_MATCH bytes to insert the |
||||
* string following the next current_match. |
||||
*/ |
||||
if (s->lookahead < MIN_LOOKAHEAD) { |
||||
PREFIX(fill_window)(s); |
||||
if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) { |
||||
return need_more; |
||||
} |
||||
if (UNLIKELY(s->lookahead == 0)) |
||||
break; /* flush the current block */ |
||||
next_match.match_length = 0; |
||||
} |
||||
|
||||
/* Insert the string window[strstart .. strstart+2] in the
|
||||
* dictionary, and set hash_head to the head of the hash chain: |
||||
*/ |
||||
|
||||
/* If we already have a future match from a previous round, just use that */ |
||||
if (!early_exit && next_match.match_length > 0) { |
||||
current_match = next_match; |
||||
next_match.match_length = 0; |
||||
} else { |
||||
hash_head = 0; |
||||
if (s->lookahead >= WANT_MIN_MATCH) { |
||||
hash_head = functable.quick_insert_string(s, s->strstart); |
||||
} |
||||
|
||||
current_match.strstart = (uint16_t)s->strstart; |
||||
current_match.orgstart = current_match.strstart; |
||||
|
||||
/* Find the longest match, discarding those <= prev_length.
|
||||
* At this point we have always match_length < WANT_MIN_MATCH |
||||
*/ |
||||
|
||||
dist = (int64_t)s->strstart - hash_head; |
||||
if (dist <= MAX_DIST(s) && dist > 0 && hash_head != 0) { |
||||
/* To simplify the code, we prevent matches with the string
|
||||
* of window index 0 (in particular we have to avoid a match |
||||
* of the string with itself at the start of the input file). |
||||
*/ |
||||
current_match.match_length = (uint16_t)functable.longest_match(s, hash_head); |
||||
current_match.match_start = (uint16_t)s->match_start; |
||||
if (UNLIKELY(current_match.match_length < WANT_MIN_MATCH)) |
||||
current_match.match_length = 1; |
||||
if (UNLIKELY(current_match.match_start >= current_match.strstart)) { |
||||
/* this can happen due to some restarts */ |
||||
current_match.match_length = 1; |
||||
} |
||||
} else { |
||||
/* Set up the match to be a 1 byte literal */ |
||||
current_match.match_start = 0; |
||||
current_match.match_length = 1; |
||||
} |
||||
} |
||||
|
||||
insert_match(s, current_match); |
||||
|
||||
/* now, look ahead one */ |
||||
if (LIKELY(!early_exit && s->lookahead > MIN_LOOKAHEAD && (uint32_t)(current_match.strstart + current_match.match_length) < (s->window_size - MIN_LOOKAHEAD))) { |
||||
s->strstart = current_match.strstart + current_match.match_length; |
||||
hash_head = functable.quick_insert_string(s, s->strstart); |
||||
|
||||
next_match.strstart = (uint16_t)s->strstart; |
||||
next_match.orgstart = next_match.strstart; |
||||
|
||||
/* Find the longest match, discarding those <= prev_length.
|
||||
* At this point we have always match_length < WANT_MIN_MATCH |
||||
*/ |
||||
|
||||
dist = (int64_t)s->strstart - hash_head; |
||||
if (dist <= MAX_DIST(s) && dist > 0 && hash_head != 0) { |
||||
/* To simplify the code, we prevent matches with the string
|
||||
* of window index 0 (in particular we have to avoid a match |
||||
* of the string with itself at the start of the input file). |
||||
*/ |
||||
next_match.match_length = (uint16_t)functable.longest_match(s, hash_head); |
||||
next_match.match_start = (uint16_t)s->match_start; |
||||
if (UNLIKELY(next_match.match_start >= next_match.strstart)) { |
||||
/* this can happen due to some restarts */ |
||||
next_match.match_length = 1; |
||||
} |
||||
if (next_match.match_length < WANT_MIN_MATCH) |
||||
next_match.match_length = 1; |
||||
else |
||||
fizzle_matches(s, ¤t_match, &next_match); |
||||
} else { |
||||
/* Set up the match to be a 1 byte literal */ |
||||
next_match.match_start = 0; |
||||
next_match.match_length = 1; |
||||
} |
||||
|
||||
s->strstart = current_match.strstart; |
||||
} else { |
||||
next_match.match_length = 0; |
||||
} |
||||
|
||||
/* now emit the current match */ |
||||
bflush = emit_match(s, current_match); |
||||
|
||||
/* move the "cursor" forward */ |
||||
s->strstart += current_match.match_length; |
||||
|
||||
if (UNLIKELY(bflush)) |
||||
FLUSH_BLOCK(s, 0); |
||||
} |
||||
s->insert = s->strstart < (STD_MIN_MATCH - 1) ? s->strstart : (STD_MIN_MATCH - 1); |
||||
if (flush == Z_FINISH) { |
||||
FLUSH_BLOCK(s, 1); |
||||
return finish_done; |
||||
} |
||||
if (UNLIKELY(s->sym_next)) |
||||
FLUSH_BLOCK(s, 0); |
||||
|
||||
return block_done; |
||||
} |
||||
#endif |
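For orientation, a minimal sketch of reaching this strategy through the public API. That a mid compression level such as 5 routes to deflate_medium follows zlib-ng's configuration table and is assumed here, not shown in this hunk; the helper name is hypothetical.

#include <string.h>
#include "zlib.h"  /* ZLIB_COMPAT build; use "zlib-ng.h" and zng_ prefixes otherwise */

/* Hypothetical one-shot compress at level 5 (assumed to select deflate_medium). */
static int compress_level5(unsigned char *dst, unsigned long *dst_len,
                           const unsigned char *src, unsigned long src_len) {
    z_stream strm;
    memset(&strm, 0, sizeof(strm));
    if (deflateInit2(&strm, 5, Z_DEFLATED, 15, 8, Z_DEFAULT_STRATEGY) != Z_OK)
        return -1;
    strm.next_in = (unsigned char *)src;
    strm.avail_in = (unsigned int)src_len;
    strm.next_out = dst;
    strm.avail_out = (unsigned int)*dst_len;
    int ret = deflate(&strm, Z_FINISH);
    *dst_len = strm.total_out;
    deflateEnd(&strm);
    return ret == Z_STREAM_END ? 0 : -1;
}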
@ -0,0 +1,116 @@
/* deflate_p.h -- Private inline functions and macros shared with more than
 * one deflate method
 *
 * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 */

#ifndef DEFLATE_P_H
#define DEFLATE_P_H

/* Forward declare common non-inlined functions declared in deflate.c */

#ifdef ZLIB_DEBUG
/* ===========================================================================
 * Check that the match at match_start is indeed a match.
 */
static inline void check_match(deflate_state *s, Pos start, Pos match, int length) {
    /* check that the match length is valid */
    if (length < STD_MIN_MATCH || length > STD_MAX_MATCH) {
        fprintf(stderr, " start %u, match %u, length %d\n", start, match, length);
        z_error("invalid match length");
    }
    /* check that the match isn't at the same position as the start string */
    if (match == start) {
        fprintf(stderr, " start %u, match %u, length %d\n", start, match, length);
        z_error("invalid match position");
    }
    /* check that the match is indeed a match */
    if (memcmp(s->window + match, s->window + start, length) != 0) {
        int32_t i = 0;
        fprintf(stderr, " start %u, match %u, length %d\n", start, match, length);
        do {
            fprintf(stderr, "  %03d: match [%02x] start [%02x]\n", i++,
                    s->window[match++], s->window[start++]);
        } while (--length != 0);
        z_error("invalid match");
    }
    if (z_verbose > 1) {
        fprintf(stderr, "\\[%u,%d]", start-match, length);
        do {
            putc(s->window[start++], stderr);
        } while (--length != 0);
    }
}
#else
#define check_match(s, start, match, length)
#endif

Z_INTERNAL void PREFIX(flush_pending)(PREFIX3(stream) *strm);
Z_INTERNAL unsigned PREFIX(read_buf)(PREFIX3(stream) *strm, unsigned char *buf, unsigned size);

/* ===========================================================================
 * Save the match info and tally the frequency counts. Return true if
 * the current block must be flushed.
 */

extern const unsigned char Z_INTERNAL zng_length_code[];
extern const unsigned char Z_INTERNAL zng_dist_code[];

static inline int zng_tr_tally_lit(deflate_state *s, unsigned char c) {
    /* c is the unmatched char */
    s->sym_buf[s->sym_next++] = 0;
    s->sym_buf[s->sym_next++] = 0;
    s->sym_buf[s->sym_next++] = c;
    s->dyn_ltree[c].Freq++;
    Tracevv((stderr, "%c", c));
    Assert(c <= (STD_MAX_MATCH-STD_MIN_MATCH), "zng_tr_tally: bad literal");
    return (s->sym_next == s->sym_end);
}

static inline int zng_tr_tally_dist(deflate_state *s, uint32_t dist, uint32_t len) {
    /* dist: distance of matched string */
    /* len: match length-STD_MIN_MATCH */
    s->sym_buf[s->sym_next++] = (uint8_t)(dist);
    s->sym_buf[s->sym_next++] = (uint8_t)(dist >> 8);
    s->sym_buf[s->sym_next++] = (uint8_t)len;
    s->matches++;
    dist--;
    Assert(dist < MAX_DIST(s) && (uint16_t)d_code(dist) < (uint16_t)D_CODES,
           "zng_tr_tally: bad match");

    s->dyn_ltree[zng_length_code[len]+LITERALS+1].Freq++;
    s->dyn_dtree[d_code(dist)].Freq++;
    return (s->sym_next == s->sym_end);
}
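To make the three-byte sym_buf layout above concrete, a small sketch of the inverse operation; the function name is hypothetical, and only the packing is taken from the tally helpers (the real reader lives on the trees side).

/* Hypothetical decode of one tallied entry, mirroring the packing above:
 * two zero bytes mark a literal, otherwise a (distance, length) pair. */
static void read_sym(const unsigned char *sym_buf, unsigned idx,
                     unsigned *dist, unsigned *lc) {
    *dist = sym_buf[idx] | ((unsigned)sym_buf[idx + 1] << 8);
    *lc = sym_buf[idx + 2];    /* literal byte, or match length - STD_MIN_MATCH */
}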

/* ===========================================================================
 * Flush the current block, with given end-of-file flag.
 * IN assertion: strstart is set to the end of the current match.
 */
#define FLUSH_BLOCK_ONLY(s, last) { \
    zng_tr_flush_block(s, (s->block_start >= 0 ? \
        (char *)&s->window[(unsigned)s->block_start] : \
        NULL), \
        (uint32_t)((int)s->strstart - s->block_start), \
        (last)); \
    s->block_start = (int)s->strstart; \
    PREFIX(flush_pending)(s->strm); \
}

/* Same but force premature exit if necessary. */
#define FLUSH_BLOCK(s, last) { \
    FLUSH_BLOCK_ONLY(s, last); \
    if (s->strm->avail_out == 0) return (last) ? finish_started : need_more; \
}

/* Maximum stored block length in deflate format (not including header). */
#define MAX_STORED 65535

/* Compression function. Returns the block state after the call. */
typedef block_state (*compress_func) (deflate_state *s, int flush);
/* Match function. Returns the longest match. */
typedef uint32_t (*match_func) (deflate_state *const s, Pos cur_match);

#endif
@ -0,0 +1,129 @@
/*
 * The deflate_quick deflate strategy, designed to be used when cycles are
 * at a premium.
 *
 * Copyright (C) 2013 Intel Corporation. All rights reserved.
 * Authors:
 *  Wajdi Feghali   <wajdi.k.feghali@intel.com>
 *  Jim Guilford    <james.guilford@intel.com>
 *  Vinodh Gopal    <vinodh.gopal@intel.com>
 *  Erdinc Ozturk   <erdinc.ozturk@intel.com>
 *  Jim Kukunas     <james.t.kukunas@linux.intel.com>
 *
 * Portions are Copyright (C) 2016 12Sided Technology, LLC.
 * Author:
 *  Phil Vachon     <pvachon@12sidedtech.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "zutil_p.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"
#include "trees_emit.h"

extern const ct_data static_ltree[L_CODES+2];
extern const ct_data static_dtree[D_CODES];

#define QUICK_START_BLOCK(s, last) { \
    zng_tr_emit_tree(s, STATIC_TREES, last); \
    s->block_open = 1 + (int)last; \
    s->block_start = (int)s->strstart; \
}

#define QUICK_END_BLOCK(s, last) { \
    if (s->block_open) { \
        zng_tr_emit_end_block(s, static_ltree, last); \
        s->block_open = 0; \
        s->block_start = (int)s->strstart; \
        PREFIX(flush_pending)(s->strm); \
        if (s->strm->avail_out == 0) \
            return (last) ? finish_started : need_more; \
    } \
}

Z_INTERNAL block_state deflate_quick(deflate_state *s, int flush) {
    Pos hash_head;
    int64_t dist;
    unsigned match_len, last;

    last = (flush == Z_FINISH) ? 1 : 0;
    if (UNLIKELY(last && s->block_open != 2)) {
        /* Emit end of previous block */
        QUICK_END_BLOCK(s, 0);
        /* Emit start of last block */
        QUICK_START_BLOCK(s, last);
    } else if (UNLIKELY(s->block_open == 0 && s->lookahead > 0)) {
        /* Start new block only when we have lookahead data, so that if no
           input data is given an empty block will not be written */
        QUICK_START_BLOCK(s, last);
    }

    for (;;) {
        if (UNLIKELY(s->pending + ((BIT_BUF_SIZE + 7) >> 3) >= s->pending_buf_size)) {
            PREFIX(flush_pending)(s->strm);
            if (s->strm->avail_out == 0) {
                return (last && s->strm->avail_in == 0 && s->bi_valid == 0 && s->block_open == 0) ? finish_started : need_more;
            }
        }

        if (UNLIKELY(s->lookahead < MIN_LOOKAHEAD)) {
            PREFIX(fill_window)(s);
            if (UNLIKELY(s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH)) {
                return need_more;
            }
            if (UNLIKELY(s->lookahead == 0))
                break;

            if (UNLIKELY(s->block_open == 0)) {
                /* Start new block when we have lookahead data, so that if no
                   input data is given an empty block will not be written */
                QUICK_START_BLOCK(s, last);
            }
        }

        if (LIKELY(s->lookahead >= WANT_MIN_MATCH)) {
            hash_head = functable.quick_insert_string(s, s->strstart);
            dist = (int64_t)s->strstart - hash_head;

            if (dist <= MAX_DIST(s) && dist > 0) {
                const uint8_t *str_start = s->window + s->strstart;
                const uint8_t *match_start = s->window + hash_head;

                if (zng_memcmp_2(str_start, match_start) == 0) {
                    match_len = functable.compare256(str_start+2, match_start+2) + 2;

                    if (match_len >= WANT_MIN_MATCH) {
                        if (UNLIKELY(match_len > s->lookahead))
                            match_len = s->lookahead;
                        if (UNLIKELY(match_len > STD_MAX_MATCH))
                            match_len = STD_MAX_MATCH;

                        check_match(s, s->strstart, hash_head, match_len);

                        zng_tr_emit_dist(s, static_ltree, static_dtree, match_len - STD_MIN_MATCH, (uint32_t)dist);
                        s->lookahead -= match_len;
                        s->strstart += match_len;
                        continue;
                    }
                }
            }
        }

        zng_tr_emit_lit(s, static_ltree, s->window[s->strstart]);
        s->strstart++;
        s->lookahead--;
    }

    s->insert = s->strstart < (STD_MIN_MATCH - 1) ? s->strstart : (STD_MIN_MATCH - 1);
    if (UNLIKELY(last)) {
        QUICK_END_BLOCK(s, 1);
        return finish_done;
    }

    QUICK_END_BLOCK(s, 0);
    return block_done;
}
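A short sketch of selecting this path from user code. That level 1 is wired to deflate_quick in zlib-ng's level table is an assumption consistent with the "cycles at a premium" design, not something this file declares; the helper name is hypothetical and the includes match the sketch after deflate_medium.

/* Hypothetical selector (assumes zlib-ng maps level 1 to deflate_quick). */
static int init_quick(z_stream *strm) {
    memset(strm, 0, sizeof(*strm));
    return deflateInit2(strm, 1, Z_DEFLATED, 15, 8, Z_DEFAULT_STRATEGY);
}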
@ -0,0 +1,85 @@
/* deflate_rle.c -- compress data using RLE strategy of deflation algorithm
 *
 * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "compare256_rle.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"

#ifdef UNALIGNED_OK
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
#   define compare256_rle compare256_rle_unaligned_64
# elif defined(HAVE_BUILTIN_CTZ)
#   define compare256_rle compare256_rle_unaligned_32
# else
#   define compare256_rle compare256_rle_unaligned_16
# endif
#else
# define compare256_rle compare256_rle_c
#endif

/* ===========================================================================
 * For Z_RLE, simply look for runs of bytes, generate matches only of distance
 * one. Do not maintain a hash table. (It will be regenerated if this run of
 * deflate switches away from Z_RLE.)
 */
Z_INTERNAL block_state deflate_rle(deflate_state *s, int flush) {
    int bflush = 0;          /* set if current block must be flushed */
    unsigned char *scan;     /* scan goes up to strend for length of run */
    uint32_t match_len = 0;

    for (;;) {
        /* Make sure that we always have enough lookahead, except
         * at the end of the input file. We need STD_MAX_MATCH bytes
         * for the longest run, plus one for the unrolled loop.
         */
        if (s->lookahead <= STD_MAX_MATCH) {
            PREFIX(fill_window)(s);
            if (s->lookahead <= STD_MAX_MATCH && flush == Z_NO_FLUSH)
                return need_more;
            if (s->lookahead == 0)
                break; /* flush the current block */
        }

        /* See how many times the previous byte repeats */
        if (s->lookahead >= STD_MIN_MATCH && s->strstart > 0) {
            scan = s->window + s->strstart - 1;
            if (scan[0] == scan[1] && scan[1] == scan[2]) {
                match_len = compare256_rle(scan, scan+3)+2;
                match_len = MIN(match_len, s->lookahead);
                match_len = MIN(match_len, STD_MAX_MATCH);
            }
            Assert(scan+match_len <= s->window + s->window_size - 1, "wild scan");
        }

        /* Emit match if have run of STD_MIN_MATCH or longer, else emit literal */
        if (match_len >= STD_MIN_MATCH) {
            check_match(s, s->strstart, s->strstart - 1, match_len);

            bflush = zng_tr_tally_dist(s, 1, match_len - STD_MIN_MATCH);

            s->lookahead -= match_len;
            s->strstart += match_len;
            match_len = 0;
        } else {
            /* No match, output a literal byte */
            bflush = zng_tr_tally_lit(s, s->window[s->strstart]);
            s->lookahead--;
            s->strstart++;
        }
        if (bflush)
            FLUSH_BLOCK(s, 0);
    }
    s->insert = 0;
    if (flush == Z_FINISH) {
        FLUSH_BLOCK(s, 1);
        return finish_done;
    }
    if (s->sym_next)
        FLUSH_BLOCK(s, 0);
    return block_done;
}
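Unlike the level-based strategies, this path is selected explicitly through the strategy argument rather than the level; a minimal sketch (helper name hypothetical, includes as in the earlier sketches):

/* Sketch: Z_RLE is reached via the strategy argument; distance-1 matches
 * as produced by deflate_rle() are cheap for e.g. scanline-style data. */
static int init_rle(z_stream *strm) {
    memset(strm, 0, sizeof(*strm));
    return deflateInit2(strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED, 15, 8, Z_RLE);
}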
@ -0,0 +1,143 @@
/* deflate_slow.c -- compress data using the slow strategy of deflation algorithm
 *
 * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"

/* ===========================================================================
 * Same as deflate_medium, but achieves better compression. We use a lazy
 * evaluation for matches: a match is finally adopted only if there is
 * no better match at the next window position.
 */
Z_INTERNAL block_state deflate_slow(deflate_state *s, int flush) {
    Pos hash_head;          /* head of hash chain */
    int bflush;             /* set if current block must be flushed */
    int64_t dist;
    uint32_t match_len;
    match_func *longest_match;

    if (s->max_chain_length <= 1024)
        longest_match = &functable.longest_match;
    else
        longest_match = &functable.longest_match_slow;

    /* Process the input block. */
    for (;;) {
        /* Make sure that we always have enough lookahead, except
         * at the end of the input file. We need STD_MAX_MATCH bytes
         * for the next match, plus WANT_MIN_MATCH bytes to insert the
         * string following the next match.
         */
        if (s->lookahead < MIN_LOOKAHEAD) {
            PREFIX(fill_window)(s);
            if (UNLIKELY(s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH)) {
                return need_more;
            }
            if (UNLIKELY(s->lookahead == 0))
                break; /* flush the current block */
        }

        /* Insert the string window[strstart .. strstart+2] in the
         * dictionary, and set hash_head to the head of the hash chain:
         */
        hash_head = 0;
        if (LIKELY(s->lookahead >= WANT_MIN_MATCH)) {
            hash_head = s->quick_insert_string(s, s->strstart);
        }

        /* Find the longest match, discarding those <= prev_length.
         */
        s->prev_match = (Pos)s->match_start;
        match_len = STD_MIN_MATCH - 1;
        dist = (int64_t)s->strstart - hash_head;

        if (dist <= MAX_DIST(s) && dist > 0 && s->prev_length < s->max_lazy_match && hash_head != 0) {
            /* To simplify the code, we prevent matches with the string
             * of window index 0 (in particular we have to avoid a match
             * of the string with itself at the start of the input file).
             */
            match_len = (*longest_match)(s, hash_head);
            /* longest_match() sets match_start */

            if (match_len <= 5 && (s->strategy == Z_FILTERED)) {
                /* If prev_match is also WANT_MIN_MATCH, match_start is garbage
                 * but we will ignore the current match anyway.
                 */
                match_len = STD_MIN_MATCH - 1;
            }
        }
        /* If there was a match at the previous step and the current
         * match is not better, output the previous match:
         */
        if (s->prev_length >= STD_MIN_MATCH && match_len <= s->prev_length) {
            unsigned int max_insert = s->strstart + s->lookahead - STD_MIN_MATCH;
            /* Do not insert strings in hash table beyond this. */

            check_match(s, s->strstart-1, s->prev_match, s->prev_length);

            bflush = zng_tr_tally_dist(s, s->strstart - 1 - s->prev_match, s->prev_length - STD_MIN_MATCH);

            /* Insert in hash table all strings up to the end of the match.
             * strstart-1 and strstart are already inserted. If there is not
             * enough lookahead, the last two strings are not inserted in
             * the hash table.
             */
            s->prev_length -= 1;
            s->lookahead -= s->prev_length;

            unsigned int mov_fwd = s->prev_length - 1;
            if (max_insert > s->strstart) {
                unsigned int insert_cnt = mov_fwd;
                if (UNLIKELY(insert_cnt > max_insert - s->strstart))
                    insert_cnt = max_insert - s->strstart;
                s->insert_string(s, s->strstart + 1, insert_cnt);
            }
            s->prev_length = 0;
            s->match_available = 0;
            s->strstart += mov_fwd + 1;

            if (UNLIKELY(bflush))
                FLUSH_BLOCK(s, 0);

        } else if (s->match_available) {
            /* If there was no match at the previous position, output a
             * single literal. If there was a match but the current match
             * is longer, truncate the previous match to a single literal.
             */
            bflush = zng_tr_tally_lit(s, s->window[s->strstart-1]);
            if (UNLIKELY(bflush))
                FLUSH_BLOCK_ONLY(s, 0);
            s->prev_length = match_len;
            s->strstart++;
            s->lookahead--;
            if (UNLIKELY(s->strm->avail_out == 0))
                return need_more;
        } else {
            /* There is no previous match to compare with, wait for
             * the next step to decide.
             */
            s->prev_length = match_len;
            s->match_available = 1;
            s->strstart++;
            s->lookahead--;
        }
    }
    Assert(flush != Z_NO_FLUSH, "no flush?");
    if (UNLIKELY(s->match_available)) {
        (void) zng_tr_tally_lit(s, s->window[s->strstart-1]);
        s->match_available = 0;
    }
    s->insert = s->strstart < (STD_MIN_MATCH - 1) ? s->strstart : (STD_MIN_MATCH - 1);
    if (UNLIKELY(flush == Z_FINISH)) {
        FLUSH_BLOCK(s, 1);
        return finish_done;
    }
    if (UNLIKELY(s->sym_next))
        FLUSH_BLOCK(s, 0);
    return block_done;
}
@ -0,0 +1,186 @@
/* deflate_stored.c -- store data without compression using deflation algorithm
 *
 * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"

/* ===========================================================================
 * Copy without compression as much as possible from the input stream, return
 * the current block state.
 *
 * In case deflateParams() is used to later switch to a non-zero compression
 * level, s->matches (otherwise unused when storing) keeps track of the number
 * of hash table slides to perform. If s->matches is 1, then one hash table
 * slide will be done when switching. If s->matches is 2, the maximum value
 * allowed here, then the hash table will be cleared, since two or more slides
 * is the same as a clear.
 *
 * deflate_stored() is written to minimize the number of times an input byte is
 * copied. It is most efficient with large input and output buffers, which
 * maximizes the opportunities to have a single copy from next_in to next_out.
 */
Z_INTERNAL block_state deflate_stored(deflate_state *s, int flush) {
    /* Smallest worthy block size when not flushing or finishing. By default
     * this is 32K. This can be as small as 507 bytes for memLevel == 1. For
     * large input and output buffers, the stored block size will be larger.
     */
    unsigned min_block = MIN(s->pending_buf_size - 5, s->w_size);

    /* Copy as many min_block or larger stored blocks directly to next_out as
     * possible. If flushing, copy the remaining available input to next_out as
     * stored blocks, if there is enough space.
     */
    unsigned len, left, have, last = 0;
    unsigned used = s->strm->avail_in;
    do {
        /* Set len to the maximum size block that we can copy directly with the
         * available input data and output space. Set left to how much of that
         * would be copied from what's left in the window.
         */
        len = MAX_STORED;       /* maximum deflate stored block length */
        have = (s->bi_valid + 42) >> 3;     /* number of header bytes */
        if (s->strm->avail_out < have)      /* need room for header */
            break;
        /* maximum stored block length that will fit in avail_out: */
        have = s->strm->avail_out - have;
        left = (int)s->strstart - s->block_start;   /* bytes left in window */
        if (len > (unsigned long)left + s->strm->avail_in)
            len = left + s->strm->avail_in; /* limit len to the input */
        len = MIN(len, have);               /* limit len to the output */

        /* If the stored block would be less than min_block in length, or if
         * unable to copy all of the available input when flushing, then try
         * copying to the window and the pending buffer instead. Also don't
         * write an empty block when flushing -- deflate() does that.
         */
        if (len < min_block && ((len == 0 && flush != Z_FINISH) || flush == Z_NO_FLUSH || len != left + s->strm->avail_in))
            break;

        /* Make a dummy stored block in pending to get the header bytes,
         * including any pending bits. This also updates the debugging counts.
         */
        last = flush == Z_FINISH && len == left + s->strm->avail_in ? 1 : 0;
        zng_tr_stored_block(s, (char *)0, 0L, last);

        /* Replace the lengths in the dummy stored block with len. */
        s->pending -= 4;
        put_short(s, (uint16_t)len);
        put_short(s, (uint16_t)~len);

        /* Write the stored block header bytes. */
        PREFIX(flush_pending)(s->strm);

        /* Update debugging counts for the data about to be copied. */
        cmpr_bits_add(s, len << 3);
        sent_bits_add(s, len << 3);

        /* Copy uncompressed bytes from the window to next_out. */
        if (left) {
            left = MIN(left, len);
            memcpy(s->strm->next_out, s->window + s->block_start, left);
            s->strm->next_out += left;
            s->strm->avail_out -= left;
            s->strm->total_out += left;
            s->block_start += (int)left;
            len -= left;
        }

        /* Copy uncompressed bytes directly from next_in to next_out, updating
         * the check value.
         */
        if (len) {
            PREFIX(read_buf)(s->strm, s->strm->next_out, len);
            s->strm->next_out += len;
            s->strm->avail_out -= len;
            s->strm->total_out += len;
        }
    } while (last == 0);

    /* Update the sliding window with the last s->w_size bytes of the copied
     * data, or append all of the copied data to the existing window if less
     * than s->w_size bytes were copied. Also update the number of bytes to
     * insert in the hash tables, in the event that deflateParams() switches to
     * a non-zero compression level.
     */
    used -= s->strm->avail_in;  /* number of input bytes directly copied */
    if (used) {
        /* If any input was used, then no unused input remains in the window,
         * therefore s->block_start == s->strstart.
         */
        if (used >= s->w_size) {    /* supplant the previous history */
            s->matches = 2;         /* clear hash */
            memcpy(s->window, s->strm->next_in - s->w_size, s->w_size);
            s->strstart = s->w_size;
            s->insert = s->strstart;
        } else {
            if (s->window_size - s->strstart <= used) {
                /* Slide the window down. */
                s->strstart -= s->w_size;
                memcpy(s->window, s->window + s->w_size, s->strstart);
                if (s->matches < 2)
                    s->matches++;   /* add a pending slide_hash() */
                s->insert = MIN(s->insert, s->strstart);
            }
            memcpy(s->window + s->strstart, s->strm->next_in - used, used);
            s->strstart += used;
            s->insert += MIN(used, s->w_size - s->insert);
        }
        s->block_start = (int)s->strstart;
    }
    s->high_water = MAX(s->high_water, s->strstart);

    /* If the last block was written to next_out, then done. */
    if (last)
        return finish_done;

    /* If flushing and all input has been consumed, then done. */
    if (flush != Z_NO_FLUSH && flush != Z_FINISH && s->strm->avail_in == 0 && (int)s->strstart == s->block_start)
        return block_done;

    /* Fill the window with any remaining input. */
    have = s->window_size - s->strstart;
    if (s->strm->avail_in > have && s->block_start >= (int)s->w_size) {
        /* Slide the window down. */
        s->block_start -= (int)s->w_size;
        s->strstart -= s->w_size;
        memcpy(s->window, s->window + s->w_size, s->strstart);
        if (s->matches < 2)
            s->matches++;           /* add a pending slide_hash() */
        have += s->w_size;          /* more space now */
        s->insert = MIN(s->insert, s->strstart);
    }

    have = MIN(have, s->strm->avail_in);
    if (have) {
        PREFIX(read_buf)(s->strm, s->window + s->strstart, have);
        s->strstart += have;
        s->insert += MIN(have, s->w_size - s->insert);
    }
    s->high_water = MAX(s->high_water, s->strstart);

    /* There was not enough avail_out to write a complete worthy or flushed
     * stored block to next_out. Write a stored block to pending instead, if we
     * have enough input for a worthy block, or if flushing and there is enough
     * room for the remaining input as a stored block in the pending buffer.
     */
    have = (s->bi_valid + 42) >> 3;     /* number of header bytes */
    /* maximum stored block length that will fit in pending: */
    have = MIN(s->pending_buf_size - have, MAX_STORED);
    min_block = MIN(have, s->w_size);
    left = (int)s->strstart - s->block_start;
    if (left >= min_block || ((left || flush == Z_FINISH) && flush != Z_NO_FLUSH && s->strm->avail_in == 0 && left <= have)) {
        len = MIN(left, have);
        last = flush == Z_FINISH && s->strm->avail_in == 0 && len == left ? 1 : 0;
        zng_tr_stored_block(s, (char *)s->window + s->block_start, len, last);
        s->block_start += (int)len;
        PREFIX(flush_pending)(s->strm);
    }

    /* We've done all we can with the available input and output. */
    return last ? finish_started : need_more;
}
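deflate_stored() is the level 0 path; a minimal sketch of reaching it, and of the mid-stream level switch that the s->matches bookkeeping above exists to support (helper name hypothetical):

/* Sketch: level 0 stores; a later deflateParams() switch applies the
 * hash slides/clears tracked in s->matches. */
static int store_then_compress(z_stream *strm) {
    memset(strm, 0, sizeof(*strm));
    if (deflateInit2(strm, Z_NO_COMPRESSION, Z_DEFLATED, 15, 8, Z_DEFAULT_STRATEGY) != Z_OK)
        return -1;
    /* ... deflate() calls here emit stored blocks ... */
    return deflateParams(strm, 6, Z_DEFAULT_STRATEGY);
}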
@ -0,0 +1,50 @@
#ifndef FALLBACK_BUILTINS_H
#define FALLBACK_BUILTINS_H

#if defined(_MSC_VER) && !defined(__clang__)
#if defined(_M_IX86) || defined(_M_AMD64) || defined(_M_IA64) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC)

#include <intrin.h>
#ifdef X86_FEATURES
# include "arch/x86/x86_features.h"
#endif

/* This is not a general purpose replacement for __builtin_ctz. The function expects that value is != 0.
 * Because of that assumption trailing_zero is not initialized and the return value is not checked.
 * Tzcnt and bsf give identical results except when the input value is 0, so a zero input must not be allowed.
 * If the tzcnt instruction is not supported, the cpu will itself execute bsf instead.
 * Performance of tzcnt and bsf is identical on Intel cpus; tzcnt is faster than bsf on AMD cpus.
 */
static __forceinline int __builtin_ctz(unsigned int value) {
    Assert(value != 0, "Invalid input value: 0");
# if defined(X86_FEATURES) && !(_MSC_VER < 1700)
    return (int)_tzcnt_u32(value);
# else
    unsigned long trailing_zero;
    _BitScanForward(&trailing_zero, value);
    return (int)trailing_zero;
# endif
}
#define HAVE_BUILTIN_CTZ

#ifdef _M_AMD64
/* This is not a general purpose replacement for __builtin_ctzll. The function expects that value is != 0.
 * Because of that assumption trailing_zero is not initialized and the return value is not checked.
 */
static __forceinline int __builtin_ctzll(unsigned long long value) {
    Assert(value != 0, "Invalid input value: 0");
# if defined(X86_FEATURES) && !(_MSC_VER < 1700)
    return (int)_tzcnt_u64(value);
# else
    unsigned long trailing_zero;
    _BitScanForward64(&trailing_zero, value);
    return (int)trailing_zero;
# endif
}
#define HAVE_BUILTIN_CTZLL
#endif // Microsoft AMD64

#endif // Microsoft AMD64/IA64/x86/ARM/ARM64 test
#endif // _MSC_VER & !clang

#endif // include guard FALLBACK_BUILTINS_H
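A quick sketch of the contract these fallbacks implement: the argument must be non-zero and the result is the index of the lowest set bit (the helper name below is illustrative only).

/* Illustrative check of the contract (argument must be non-zero): */
static int lowest_set_bit(unsigned int v) {
    return __builtin_ctz(v);    /* lowest_set_bit(0x28) == 3, since 0x28 == 0b101000 */
}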
@ -0,0 +1,403 @@
/* functable.c -- Choose relevant optimized functions at runtime
 * Copyright (C) 2017 Hans Kristian Rosbach
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "zendian.h"
#include "crc32_braid_p.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"
#include "cpu_features.h"

#if defined(_MSC_VER)
# include <intrin.h>
#endif

/* Platform has pointer size atomic store */
#if defined(__GNUC__) || defined(__clang__)
# define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    __atomic_store(&(functable.FUNC_NAME), &(VAR.FUNC_NAME), __ATOMIC_SEQ_CST)
# define FUNCTABLE_BARRIER() __atomic_thread_fence(__ATOMIC_SEQ_CST)
#elif defined(_MSC_VER)
# define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    _InterlockedExchangePointer((void * volatile *)&(functable.FUNC_NAME), (void *)(VAR.FUNC_NAME))
# if defined(_M_ARM) || defined(_M_ARM64)
#   define FUNCTABLE_BARRIER() do { \
    _ReadWriteBarrier(); \
    __dmb(0xB); /* _ARM_BARRIER_ISH */ \
    _ReadWriteBarrier(); \
} while (0)
# else
#   define FUNCTABLE_BARRIER() _ReadWriteBarrier()
# endif
#else
# warning Unable to detect atomic intrinsic support.
# define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    *((void * volatile *)&(functable.FUNC_NAME)) = (void *)(VAR.FUNC_NAME)
# define FUNCTABLE_BARRIER() do { /* Empty */ } while (0)
#endif

static void force_init_empty(void) {
    // empty
}

static void init_functable(void) {
    struct functable_s ft;
    struct cpu_features cf;

    cpu_check_features(&cf);

    // Generic code
    ft.force_init = &force_init_empty;
    ft.adler32 = &adler32_c;
    ft.adler32_fold_copy = &adler32_fold_copy_c;
    ft.chunkmemset_safe = &chunkmemset_safe_c;
    ft.chunksize = &chunksize_c;
    ft.crc32 = &PREFIX(crc32_braid);
    ft.crc32_fold = &crc32_fold_c;
    ft.crc32_fold_copy = &crc32_fold_copy_c;
    ft.crc32_fold_final = &crc32_fold_final_c;
    ft.crc32_fold_reset = &crc32_fold_reset_c;
    ft.inflate_fast = &inflate_fast_c;
    ft.insert_string = &insert_string_c;
    ft.quick_insert_string = &quick_insert_string_c;
    ft.slide_hash = &slide_hash_c;
    ft.update_hash = &update_hash_c;

#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    ft.longest_match = &longest_match_unaligned_64;
    ft.longest_match_slow = &longest_match_slow_unaligned_64;
    ft.compare256 = &compare256_unaligned_64;
# elif defined(HAVE_BUILTIN_CTZ)
    ft.longest_match = &longest_match_unaligned_32;
    ft.longest_match_slow = &longest_match_slow_unaligned_32;
    ft.compare256 = &compare256_unaligned_32;
# else
    ft.longest_match = &longest_match_unaligned_16;
    ft.longest_match_slow = &longest_match_slow_unaligned_16;
    ft.compare256 = &compare256_unaligned_16;
# endif
#else
    ft.longest_match = &longest_match_c;
    ft.longest_match_slow = &longest_match_slow_c;
    ft.compare256 = &compare256_c;
#endif

    // Select arch-optimized functions

    // X86 - SSE2
#ifdef X86_SSE2
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (cf.x86.has_sse2)
# endif
    {
        ft.chunkmemset_safe = &chunkmemset_safe_sse2;
        ft.chunksize = &chunksize_sse2;
        ft.inflate_fast = &inflate_fast_sse2;
        ft.slide_hash = &slide_hash_sse2;
# ifdef HAVE_BUILTIN_CTZ
        ft.compare256 = &compare256_sse2;
        ft.longest_match = &longest_match_sse2;
        ft.longest_match_slow = &longest_match_slow_sse2;
# endif
    }
#endif
    // X86 - SSSE3
#ifdef X86_SSSE3
    if (cf.x86.has_ssse3) {
        ft.adler32 = &adler32_ssse3;
# ifdef X86_SSE2
        ft.chunkmemset_safe = &chunkmemset_safe_ssse3;
        ft.inflate_fast = &inflate_fast_ssse3;
# endif
    }
#endif
    // X86 - SSE4.2
#ifdef X86_SSE42
    if (cf.x86.has_sse42) {
        ft.adler32_fold_copy = &adler32_fold_copy_sse42;
        ft.insert_string = &insert_string_sse42;
        ft.quick_insert_string = &quick_insert_string_sse42;
        ft.update_hash = &update_hash_sse42;
    }
#endif
    // X86 - PCLMUL
#ifdef X86_PCLMULQDQ_CRC
    if (cf.x86.has_pclmulqdq) {
        ft.crc32 = &crc32_pclmulqdq;
        ft.crc32_fold = &crc32_fold_pclmulqdq;
        ft.crc32_fold_copy = &crc32_fold_pclmulqdq_copy;
        ft.crc32_fold_final = &crc32_fold_pclmulqdq_final;
        ft.crc32_fold_reset = &crc32_fold_pclmulqdq_reset;
    }
#endif
    // X86 - AVX
#ifdef X86_AVX2
    if (cf.x86.has_avx2) {
        ft.adler32 = &adler32_avx2;
        ft.adler32_fold_copy = &adler32_fold_copy_avx2;
        ft.chunkmemset_safe = &chunkmemset_safe_avx2;
        ft.chunksize = &chunksize_avx2;
        ft.inflate_fast = &inflate_fast_avx2;
        ft.slide_hash = &slide_hash_avx2;
# ifdef HAVE_BUILTIN_CTZ
        ft.compare256 = &compare256_avx2;
        ft.longest_match = &longest_match_avx2;
        ft.longest_match_slow = &longest_match_slow_avx2;
# endif
    }
#endif
#ifdef X86_AVX512
    if (cf.x86.has_avx512) {
        ft.adler32 = &adler32_avx512;
        ft.adler32_fold_copy = &adler32_fold_copy_avx512;
    }
#endif
#ifdef X86_AVX512VNNI
    if (cf.x86.has_avx512vnni) {
        ft.adler32 = &adler32_avx512_vnni;
        ft.adler32_fold_copy = &adler32_fold_copy_avx512_vnni;
    }
#endif
    // X86 - VPCLMULQDQ
#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
    if (cf.x86.has_pclmulqdq && cf.x86.has_avx512 && cf.x86.has_vpclmulqdq) {
        ft.crc32 = &crc32_vpclmulqdq;
        ft.crc32_fold = &crc32_fold_vpclmulqdq;
        ft.crc32_fold_copy = &crc32_fold_vpclmulqdq_copy;
        ft.crc32_fold_final = &crc32_fold_vpclmulqdq_final;
        ft.crc32_fold_reset = &crc32_fold_vpclmulqdq_reset;
    }
#endif

    // ARM - SIMD
#ifdef ARM_SIMD
# ifndef ARM_NOCHECK_SIMD
    if (cf.arm.has_simd)
# endif
    {
        ft.slide_hash = &slide_hash_armv6;
    }
#endif
    // ARM - NEON
#ifdef ARM_NEON
# ifndef ARM_NOCHECK_NEON
    if (cf.arm.has_neon)
# endif
    {
        ft.adler32 = &adler32_neon;
        ft.chunkmemset_safe = &chunkmemset_safe_neon;
        ft.chunksize = &chunksize_neon;
        ft.inflate_fast = &inflate_fast_neon;
        ft.slide_hash = &slide_hash_neon;
# ifdef HAVE_BUILTIN_CTZLL
        ft.compare256 = &compare256_neon;
        ft.longest_match = &longest_match_neon;
        ft.longest_match_slow = &longest_match_slow_neon;
# endif
    }
#endif
    // ARM - ACLE
#ifdef ARM_ACLE
    if (cf.arm.has_crc32) {
        ft.crc32 = &crc32_acle;
        ft.insert_string = &insert_string_acle;
        ft.quick_insert_string = &quick_insert_string_acle;
        ft.update_hash = &update_hash_acle;
    }
#endif

    // Power - VMX
#ifdef PPC_VMX
    if (cf.power.has_altivec) {
        ft.adler32 = &adler32_vmx;
        ft.slide_hash = &slide_hash_vmx;
    }
#endif
    // Power8 - VSX
#ifdef POWER8_VSX
    if (cf.power.has_arch_2_07) {
        ft.adler32 = &adler32_power8;
        ft.chunkmemset_safe = &chunkmemset_safe_power8;
        ft.chunksize = &chunksize_power8;
        ft.inflate_fast = &inflate_fast_power8;
        ft.slide_hash = &slide_hash_power8;
    }
#endif
#ifdef POWER8_VSX_CRC32
    if (cf.power.has_arch_2_07)
        ft.crc32 = &crc32_power8;
#endif
    // Power9
#ifdef POWER9
    if (cf.power.has_arch_3_00) {
        ft.compare256 = &compare256_power9;
        ft.longest_match = &longest_match_power9;
        ft.longest_match_slow = &longest_match_slow_power9;
    }
#endif

    // RISCV - RVV
#ifdef RISCV_RVV
    if (cf.riscv.has_rvv) {
        ft.adler32 = &adler32_rvv;
        ft.adler32_fold_copy = &adler32_fold_copy_rvv;
        ft.chunkmemset_safe = &chunkmemset_safe_rvv;
        ft.chunksize = &chunksize_rvv;
        ft.compare256 = &compare256_rvv;
        ft.inflate_fast = &inflate_fast_rvv;
        ft.longest_match = &longest_match_rvv;
        ft.longest_match_slow = &longest_match_slow_rvv;
        ft.slide_hash = &slide_hash_rvv;
    }
#endif

    // S390
#ifdef S390_CRC32_VX
    if (cf.s390.has_vx)
        ft.crc32 = crc32_s390_vx;
#endif

    // Assign function pointers individually for atomic operation
    FUNCTABLE_ASSIGN(ft, force_init);
    FUNCTABLE_ASSIGN(ft, adler32);
    FUNCTABLE_ASSIGN(ft, adler32_fold_copy);
    FUNCTABLE_ASSIGN(ft, chunkmemset_safe);
    FUNCTABLE_ASSIGN(ft, chunksize);
    FUNCTABLE_ASSIGN(ft, compare256);
    FUNCTABLE_ASSIGN(ft, crc32);
    FUNCTABLE_ASSIGN(ft, crc32_fold);
    FUNCTABLE_ASSIGN(ft, crc32_fold_copy);
    FUNCTABLE_ASSIGN(ft, crc32_fold_final);
    FUNCTABLE_ASSIGN(ft, crc32_fold_reset);
    FUNCTABLE_ASSIGN(ft, inflate_fast);
    FUNCTABLE_ASSIGN(ft, insert_string);
    FUNCTABLE_ASSIGN(ft, longest_match);
    FUNCTABLE_ASSIGN(ft, longest_match_slow);
    FUNCTABLE_ASSIGN(ft, quick_insert_string);
    FUNCTABLE_ASSIGN(ft, slide_hash);
    FUNCTABLE_ASSIGN(ft, update_hash);

    // Memory barrier for weak memory order CPUs
    FUNCTABLE_BARRIER();
}

/* stub functions */
static void force_init_stub(void) {
    init_functable();
}

static uint32_t adler32_stub(uint32_t adler, const uint8_t* buf, size_t len) {
    init_functable();
    return functable.adler32(adler, buf, len);
}

static uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t* dst, const uint8_t* src, size_t len) {
    init_functable();
    return functable.adler32_fold_copy(adler, dst, src, len);
}

static uint8_t* chunkmemset_safe_stub(uint8_t* out, unsigned dist, unsigned len, unsigned left) {
    init_functable();
    return functable.chunkmemset_safe(out, dist, len, left);
}

static uint32_t chunksize_stub(void) {
    init_functable();
    return functable.chunksize();
}

static uint32_t compare256_stub(const uint8_t* src0, const uint8_t* src1) {
    init_functable();
    return functable.compare256(src0, src1);
}

static uint32_t crc32_stub(uint32_t crc, const uint8_t* buf, size_t len) {
    init_functable();
    return functable.crc32(crc, buf, len);
}

static void crc32_fold_stub(crc32_fold* crc, const uint8_t* src, size_t len, uint32_t init_crc) {
    init_functable();
    functable.crc32_fold(crc, src, len, init_crc);
}

static void crc32_fold_copy_stub(crc32_fold* crc, uint8_t* dst, const uint8_t* src, size_t len) {
    init_functable();
    functable.crc32_fold_copy(crc, dst, src, len);
}

static uint32_t crc32_fold_final_stub(crc32_fold* crc) {
    init_functable();
    return functable.crc32_fold_final(crc);
}

static uint32_t crc32_fold_reset_stub(crc32_fold* crc) {
    init_functable();
    return functable.crc32_fold_reset(crc);
}

static void inflate_fast_stub(PREFIX3(stream) *strm, uint32_t start) {
    init_functable();
    functable.inflate_fast(strm, start);
}

static void insert_string_stub(deflate_state* const s, uint32_t str, uint32_t count) {
    init_functable();
    functable.insert_string(s, str, count);
}

static uint32_t longest_match_stub(deflate_state* const s, Pos cur_match) {
    init_functable();
    return functable.longest_match(s, cur_match);
}

static uint32_t longest_match_slow_stub(deflate_state* const s, Pos cur_match) {
    init_functable();
    return functable.longest_match_slow(s, cur_match);
}

static Pos quick_insert_string_stub(deflate_state* const s, const uint32_t str) {
    init_functable();
    return functable.quick_insert_string(s, str);
}

static void slide_hash_stub(deflate_state* s) {
    init_functable();
    functable.slide_hash(s);
}

static uint32_t update_hash_stub(deflate_state* const s, uint32_t h, uint32_t val) {
    init_functable();
    return functable.update_hash(s, h, val);
}

/* functable init */
Z_INTERNAL struct functable_s functable = {
    force_init_stub,
    adler32_stub,
    adler32_fold_copy_stub,
    chunkmemset_safe_stub,
    chunksize_stub,
    compare256_stub,
    crc32_stub,
    crc32_fold_stub,
    crc32_fold_copy_stub,
    crc32_fold_final_stub,
    crc32_fold_reset_stub,
    inflate_fast_stub,
    insert_string_stub,
    longest_match_stub,
    longest_match_slow_stub,
    quick_insert_string_stub,
    slide_hash_stub,
    update_hash_stub
};
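The net effect is lazy dispatch: the very first call through any functable member lands in a stub, which runs init_functable() and then forwards, while every later call jumps straight to the resolved implementation through the atomically updated pointer. A minimal sketch (buf and len are placeholders):

/* Sketch: first call resolves the table, later calls go direct. */
static uint32_t crc_twice(const uint8_t *buf, size_t len) {
    uint32_t crc = functable.crc32(0, buf, len);   /* crc32_stub -> init_functable() -> real impl */
    return functable.crc32(crc, buf, len);         /* direct through the updated pointer */
}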
@ -0,0 +1,42 @@
/* functable.h -- Struct containing function pointers to optimized functions
 * Copyright (C) 2017 Hans Kristian Rosbach
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef FUNCTABLE_H_
#define FUNCTABLE_H_

#include "deflate.h"
#include "crc32_fold.h"
#include "adler32_fold.h"

#ifdef ZLIB_COMPAT
typedef struct z_stream_s z_stream;
#else
typedef struct zng_stream_s zng_stream;
#endif

struct functable_s {
    void     (* force_init)         (void);
    uint32_t (* adler32)            (uint32_t adler, const uint8_t *buf, size_t len);
    uint32_t (* adler32_fold_copy)  (uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
    uint8_t* (* chunkmemset_safe)   (uint8_t *out, unsigned dist, unsigned len, unsigned left);
    uint32_t (* chunksize)          (void);
    uint32_t (* compare256)         (const uint8_t *src0, const uint8_t *src1);
    uint32_t (* crc32)              (uint32_t crc, const uint8_t *buf, size_t len);
    void     (* crc32_fold)         (struct crc32_fold_s *crc, const uint8_t *src, size_t len, uint32_t init_crc);
    void     (* crc32_fold_copy)    (struct crc32_fold_s *crc, uint8_t *dst, const uint8_t *src, size_t len);
    uint32_t (* crc32_fold_final)   (struct crc32_fold_s *crc);
    uint32_t (* crc32_fold_reset)   (struct crc32_fold_s *crc);
    void     (* inflate_fast)       (PREFIX3(stream) *strm, uint32_t start);
    void     (* insert_string)      (deflate_state *const s, uint32_t str, uint32_t count);
    uint32_t (* longest_match)      (deflate_state *const s, Pos cur_match);
    uint32_t (* longest_match_slow) (deflate_state *const s, Pos cur_match);
    Pos      (* quick_insert_string)(deflate_state *const s, uint32_t str);
    void     (* slide_hash)         (deflate_state *s);
    uint32_t (* update_hash)        (deflate_state *const s, uint32_t h, uint32_t val);
};

Z_INTERNAL extern struct functable_s functable;

#endif
@ -0,0 +1,144 @@
#ifndef GZGUTS_H_
#define GZGUTS_H_
/* gzguts.h -- zlib internal header definitions for gz* operations
 * Copyright (C) 2004-2019 Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef _LARGEFILE64_SOURCE
# ifndef _LARGEFILE_SOURCE
#   define _LARGEFILE_SOURCE 1
# endif
# undef _FILE_OFFSET_BITS
# undef _TIME_BITS
#endif

#if defined(HAVE_VISIBILITY_INTERNAL)
# define Z_INTERNAL __attribute__((visibility ("internal")))
#elif defined(HAVE_VISIBILITY_HIDDEN)
# define Z_INTERNAL __attribute__((visibility ("hidden")))
#else
# define Z_INTERNAL
#endif

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <limits.h>
#include <fcntl.h>

#if defined(ZLIB_COMPAT)
# include "zlib.h"
#else
# include "zlib-ng.h"
#endif

#ifdef _WIN32
# include <stddef.h>
#endif

#if defined(_WIN32)
# include <io.h>
# define WIDECHAR
#endif

#ifdef WINAPI_FAMILY
# define open _open
# define read _read
# define write _write
# define close _close
#endif

/* In Win32, vsnprintf is available as the "non-ANSI" _vsnprintf. */
#if !defined(STDC99) && !defined(__CYGWIN__) && !defined(__MINGW__) && defined(_WIN32)
# if !defined(vsnprintf)
#   if !defined(_MSC_VER) || ( defined(_MSC_VER) && _MSC_VER < 1500 )
#     define vsnprintf _vsnprintf
#   endif
# endif
#endif

/* unlike snprintf (which is required in C99), _snprintf does not guarantee
   null termination of the result -- however this is only used in gzlib.c
   where the result is assured to fit in the space provided */
#if defined(_MSC_VER) && _MSC_VER < 1900
# define snprintf _snprintf
#endif

/* get errno and strerror definition */
#ifndef NO_STRERROR
# include <errno.h>
# define zstrerror() strerror(errno)
#else
# define zstrerror() "stdio error (consult errno)"
#endif

/* default memLevel */
#if MAX_MEM_LEVEL >= 8
# define DEF_MEM_LEVEL 8
#else
# define DEF_MEM_LEVEL MAX_MEM_LEVEL
#endif

/* default i/o buffer size -- double this for output when reading (this and
   twice this must be able to fit in an unsigned type) */
#ifndef GZBUFSIZE
# define GZBUFSIZE 131072
#endif

/* gzip modes, also provide a little integrity check on the passed structure */
#define GZ_NONE 0
#define GZ_READ 7247
#define GZ_WRITE 31153
#define GZ_APPEND 1     /* mode set to GZ_WRITE after the file is opened */

/* values for gz_state how */
#define LOOK 0      /* look for a gzip header */
#define COPY 1      /* copy input directly */
#define GZIP 2      /* decompress a gzip stream */

/* internal gzip file state data structure */
typedef struct {
        /* exposed contents for gzgetc() macro */
    struct gzFile_s x;      /* "x" for exposed */
                            /* x.have: number of bytes available at x.next */
                            /* x.next: next output data to deliver or write */
                            /* x.pos: current position in uncompressed data */
        /* used for both reading and writing */
    int mode;               /* see gzip modes above */
    int fd;                 /* file descriptor */
    char *path;             /* path or fd for error messages */
    unsigned size;          /* buffer size, zero if not allocated yet */
    unsigned want;          /* requested buffer size, default is GZBUFSIZE */
    unsigned char *in;      /* input buffer (double-sized when writing) */
    unsigned char *out;     /* output buffer (double-sized when reading) */
    int direct;             /* 0 if processing gzip, 1 if transparent */
        /* just for reading */
    int how;                /* 0: get header, 1: copy, 2: decompress */
    z_off64_t start;        /* where the gzip data started, for rewinding */
    int eof;                /* true if end of input file reached */
    int past;               /* true if read requested past end */
        /* just for writing */
    int level;              /* compression level */
    int strategy;           /* compression strategy */
    int reset;              /* true if a reset is pending after a Z_FINISH */
        /* seek request */
    z_off64_t skip;         /* amount to skip (already rewound if backwards) */
    int seek;               /* true if seek request pending */
        /* error information */
    int err;                /* error code */
    char *msg;              /* error message */
        /* zlib inflate or deflate stream */
    PREFIX3(stream) strm;   /* stream structure in-place (not a pointer) */
} gz_state;
typedef gz_state *gz_statep;

/* shared functions */
void Z_INTERNAL gz_error(gz_state *, int, const char *);

/* GT_OFF(x), where x is an unsigned value, is true if x > maximum z_off64_t
   value -- needed when comparing unsigned to z_off64_t, which is signed
   (possible z_off64_t types off_t, off64_t, and long are all signed) */
#define GT_OFF(x) (sizeof(int) == sizeof(z_off64_t) && (x) > INT_MAX)

#endif /* GZGUTS_H_ */
@ -0,0 +1,525 @@
/* gzlib.c -- zlib functions common to reading and writing gzip files
 * Copyright (C) 2004-2019 Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "zutil_p.h"
#include "gzguts.h"

#if defined(_WIN32)
# define LSEEK _lseeki64
#else
#if defined(_LARGEFILE64_SOURCE) && _LFS64_LARGEFILE-0
# define LSEEK lseek64
#else
# define LSEEK lseek
#endif
#endif

/* Local functions */
static void gz_reset(gz_state *);
static gzFile gz_open(const void *, int, const char *);

/* Reset gzip file state */
static void gz_reset(gz_state *state) {
    state->x.have = 0;              /* no output data available */
    if (state->mode == GZ_READ) {   /* for reading ... */
        state->eof = 0;             /* not at end of file */
        state->past = 0;            /* have not read past end yet */
        state->how = LOOK;          /* look for gzip header */
    }
    else                            /* for writing ... */
        state->reset = 0;           /* no deflateReset pending */
    state->seek = 0;                /* no seek request pending */
    gz_error(state, Z_OK, NULL);    /* clear error */
    state->x.pos = 0;               /* no uncompressed data yet */
    state->strm.avail_in = 0;       /* no input data yet */
}

/* Open a gzip file either by name or file descriptor. */
static gzFile gz_open(const void *path, int fd, const char *mode) {
    gz_state *state;
    size_t len;
    int oflag;
#ifdef O_CLOEXEC
    int cloexec = 0;
#endif
#ifdef O_EXCL
    int exclusive = 0;
#endif

    /* check input */
    if (path == NULL)
        return NULL;

    /* allocate gzFile structure to return */
    state = (gz_state *)zng_alloc(sizeof(gz_state));
    if (state == NULL)
        return NULL;
    state->size = 0;            /* no buffers allocated yet */
    state->want = GZBUFSIZE;    /* requested buffer size */
    state->msg = NULL;          /* no error message yet */

    /* interpret mode */
    state->mode = GZ_NONE;
    state->level = Z_DEFAULT_COMPRESSION;
    state->strategy = Z_DEFAULT_STRATEGY;
    state->direct = 0;
    while (*mode) {
        if (*mode >= '0' && *mode <= '9') {
            state->level = *mode - '0';
        } else {
            switch (*mode) {
            case 'r':
                state->mode = GZ_READ;
                break;
#ifndef NO_GZCOMPRESS
            case 'w':
                state->mode = GZ_WRITE;
                break;
            case 'a':
                state->mode = GZ_APPEND;
                break;
#endif
            case '+':       /* can't read and write at the same time */
                zng_free(state);
                return NULL;
            case 'b':       /* ignore -- will request binary anyway */
                break;
#ifdef O_CLOEXEC
            case 'e':
                cloexec = 1;
                break;
#endif
#ifdef O_EXCL
            case 'x':
                exclusive = 1;
                break;
#endif
            case 'f':
                state->strategy = Z_FILTERED;
                break;
            case 'h':
                state->strategy = Z_HUFFMAN_ONLY;
                break;
            case 'R':
                state->strategy = Z_RLE;
                break;
            case 'F':
                state->strategy = Z_FIXED;
                break;
            case 'T':
                state->direct = 1;
                break;
            default:        /* could consider as an error, but just ignore */
                {}
            }
        }
        mode++;
    }

    /* must provide an "r", "w", or "a" */
    if (state->mode == GZ_NONE) {
        zng_free(state);
        return NULL;
    }

    /* can't force transparent read */
    if (state->mode == GZ_READ) {
        if (state->direct) {
            zng_free(state);
            return NULL;
        }
        state->direct = 1;      /* for empty file */
    }

    /* save the path name for error messages */
#ifdef WIDECHAR
    if (fd == -2) {
        len = wcstombs(NULL, (const wchar_t *)path, 0);
        if (len == (size_t)-1)
            len = 0;
    } else
#endif
        len = strlen((const char *)path);
    state->path = (char *)malloc(len + 1);
    if (state->path == NULL) {
        zng_free(state);
        return NULL;
    }
#ifdef WIDECHAR
    if (fd == -2)
        if (len) {
            wcstombs(state->path, (const wchar_t *)path, len + 1);
        } else {
            *(state->path) = 0;
        }
    else
#endif
        (void)snprintf(state->path, len + 1, "%s", (const char *)path);

    /* compute the flags for open() */
    oflag =
#ifdef O_LARGEFILE
        O_LARGEFILE |
#endif
#ifdef O_BINARY
        O_BINARY |
#endif
#ifdef O_CLOEXEC
        (cloexec ? O_CLOEXEC : 0) |
#endif
        (state->mode == GZ_READ ?
            O_RDONLY :
            (O_WRONLY | O_CREAT |
#ifdef O_EXCL
                (exclusive ? O_EXCL : 0) |
#endif
                (state->mode == GZ_WRITE ?
                    O_TRUNC :
                    O_APPEND)));

    /* open the file with the appropriate flags (or just use fd) */
    state->fd = fd > -1 ? fd : (
#if defined(_WIN32)
        fd == -2 ? _wopen((const wchar_t *)path, oflag, 0666) :
#elif __CYGWIN__
        fd == -2 ? open(state->path, oflag, 0666) :
#endif
        open((const char *)path, oflag, 0666));
    if (state->fd == -1) {
        free(state->path);
        zng_free(state);
        return NULL;
    }
    if (state->mode == GZ_APPEND) {
        LSEEK(state->fd, 0, SEEK_END);  /* so gzoffset() is correct */
        state->mode = GZ_WRITE;         /* simplify later checks */
    }

    /* save the current position for rewinding (only if reading) */
    if (state->mode == GZ_READ) {
        state->start = LSEEK(state->fd, 0, SEEK_CUR);
        if (state->start == -1) state->start = 0;
    }

    /* initialize stream */
    gz_reset(state);

    /* return stream */
    return (gzFile)state;
}
||||

/* -- see zlib.h -- */
gzFile Z_EXPORT PREFIX(gzopen)(const char *path, const char *mode) {
    return gz_open(path, -1, mode);
}

#ifdef ZLIB_COMPAT
gzFile Z_EXPORT PREFIX4(gzopen)(const char *path, const char *mode) {
    return gz_open(path, -1, mode);
}
#endif

/* -- see zlib.h -- */
gzFile Z_EXPORT PREFIX(gzdopen)(int fd, const char *mode) {
    char *path;         /* identifier for error messages */
    gzFile gz;

    if (fd == -1 || (path = (char *)malloc(7 + 3 * sizeof(int))) == NULL)
        return NULL;
    (void)snprintf(path, 7 + 3 * sizeof(int), "<fd:%d>", fd);  /* for debugging */
    gz = gz_open(path, fd, mode);
    free(path);
    return gz;
}

/* -- see zlib.h -- */
#ifdef WIDECHAR
gzFile Z_EXPORT PREFIX(gzopen_w)(const wchar_t *path, const char *mode) {
    return gz_open(path, -2, mode);
}
#endif
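
/* Editor's note -- illustrative usage sketch, not part of the zlib-ng
   sources. It shows how the mode string parsed by gz_open() above maps
   onto typical gzopen()/gzdopen() calls in a ZLIB_COMPAT build; the file
   name and descriptor are arbitrary and error handling is minimal. */
#if 0
#include "zlib.h"

static void gz_open_example(void) {
    /* "wb9F": write, binary, compression level 9, Z_FIXED strategy */
    gzFile out = gzopen("example.gz", "wb9F");
    if (out != NULL)
        gzclose(out);

    /* wrap an already-open descriptor; "rb" looks for a gzip header */
    gzFile in = gzdopen(0 /* fd */, "rb");
    if (in != NULL)
        gzclose(in);
}
#endif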

int Z_EXPORT PREFIX(gzclose)(gzFile file) {
#ifndef NO_GZCOMPRESS
    gz_state *state;

    if (file == NULL)
        return Z_STREAM_ERROR;
    state = (gz_state *)file;

    return state->mode == GZ_READ ? PREFIX(gzclose_r)(file) : PREFIX(gzclose_w)(file);
#else
    return PREFIX(gzclose_r)(file);
#endif
}

/* -- see zlib.h -- */
int Z_EXPORT PREFIX(gzbuffer)(gzFile file, unsigned size) {
    gz_state *state;

    /* get internal structure and check integrity */
    if (file == NULL)
        return -1;
    state = (gz_state *)file;
    if (state->mode != GZ_READ && state->mode != GZ_WRITE)
        return -1;

    /* make sure we haven't already allocated memory */
    if (state->size != 0)
        return -1;

    /* check and set requested size */
    if ((size << 1) < size)
        return -1;              /* need to be able to double it */
    if (size < 8)
        size = 8;               /* needed to behave well with flushing */
    state->want = size;
    return 0;
}
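
/* Editor's note -- illustrative sketch, not part of the zlib-ng sources.
   gzbuffer() only succeeds before the internal buffers are allocated,
   i.e. right after gzopen() and before the first read or write; the
   256 KiB size below is an arbitrary example value. */
#if 0
#include "zlib.h"

static void gz_buffer_example(void) {
    gzFile f = gzopen("example.gz", "rb");
    if (f == NULL)
        return;
    (void)gzbuffer(f, 256 * 1024);  /* larger buffer for bulk reads */
    /* ... reads happen here ... */
    gzclose(f);
}
#endif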

/* -- see zlib.h -- */
int Z_EXPORT PREFIX(gzrewind)(gzFile file) {
    gz_state *state;

    /* get internal structure */
    if (file == NULL)
        return -1;
    state = (gz_state *)file;

    /* check that we're reading and that there's no error */
    if (state->mode != GZ_READ || (state->err != Z_OK && state->err != Z_BUF_ERROR))
        return -1;

    /* back up and start over */
    if (LSEEK(state->fd, state->start, SEEK_SET) == -1)
        return -1;
    gz_reset(state);
    return 0;
}

/* -- see zlib.h -- */
z_off64_t Z_EXPORT PREFIX4(gzseek)(gzFile file, z_off64_t offset, int whence) {
    unsigned n;
    z_off64_t ret;
    gz_state *state;

    /* get internal structure and check integrity */
    if (file == NULL)
        return -1;
    state = (gz_state *)file;
    if (state->mode != GZ_READ && state->mode != GZ_WRITE)
        return -1;

    /* check that there's no error */
    if (state->err != Z_OK && state->err != Z_BUF_ERROR)
        return -1;

    /* can only seek from start or relative to current position */
    if (whence != SEEK_SET && whence != SEEK_CUR)
        return -1;

    /* normalize offset to a SEEK_CUR specification */
    if (whence == SEEK_SET)
        offset -= state->x.pos;
    else if (state->seek)
        offset += state->skip;
    state->seek = 0;

    /* if within raw area while reading, just go there */
    if (state->mode == GZ_READ && state->how == COPY && state->x.pos + offset >= 0) {
        ret = LSEEK(state->fd, offset - (z_off64_t)state->x.have, SEEK_CUR);
        if (ret == -1)
            return -1;
        state->x.have = 0;
        state->eof = 0;
        state->past = 0;
        state->seek = 0;
        gz_error(state, Z_OK, NULL);
        state->strm.avail_in = 0;
        state->x.pos += offset;
        return state->x.pos;
    }

    /* calculate skip amount, rewinding if needed for back seek when reading */
    if (offset < 0) {
        if (state->mode != GZ_READ)         /* writing -- can't go backwards */
            return -1;
        offset += state->x.pos;
        if (offset < 0)                     /* before start of file! */
            return -1;
        if (PREFIX(gzrewind)(file) == -1)   /* rewind, then skip to offset */
            return -1;
    }

    /* if reading, skip what's in output buffer (one less gzgetc() check) */
    if (state->mode == GZ_READ) {
        n = GT_OFF(state->x.have) || (z_off64_t)state->x.have > offset ? (unsigned)offset : state->x.have;
        state->x.have -= n;
        state->x.next += n;
        state->x.pos += n;
        offset -= n;
    }

    /* request skip (if not zero) */
    if (offset) {
        state->seek = 1;
        state->skip = offset;
    }
    return state->x.pos + offset;
}

/* -- see zlib.h -- */
#ifdef ZLIB_COMPAT
z_off_t Z_EXPORT PREFIX(gzseek)(gzFile file, z_off_t offset, int whence) {
    z_off64_t ret;

    ret = PREFIX4(gzseek)(file, (z_off64_t)offset, whence);
    return ret == (z_off_t)ret ? (z_off_t)ret : -1;
}
#endif

/* -- see zlib.h -- */
z_off64_t Z_EXPORT PREFIX4(gztell)(gzFile file) {
    gz_state *state;

    /* get internal structure and check integrity */
    if (file == NULL)
        return -1;
    state = (gz_state *)file;
    if (state->mode != GZ_READ && state->mode != GZ_WRITE)
        return -1;

    /* return position */
    return state->x.pos + (state->seek ? state->skip : 0);
}

/* -- see zlib.h -- */
#ifdef ZLIB_COMPAT
z_off_t Z_EXPORT PREFIX(gztell)(gzFile file) {
    z_off64_t ret;

    ret = PREFIX4(gztell)(file);
    return ret == (z_off_t)ret ? (z_off_t)ret : -1;
}
#endif
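
/* Editor's note -- illustrative sketch, not part of the zlib-ng sources.
   gzseek()/gztell() work on *uncompressed* offsets; a forward seek on a
   compressed stream is satisfied by decompress-and-discard, so the
   deferred "skip" recorded above only runs on the next read. */
#if 0
#include "zlib.h"

static void gz_seek_example(void) {
    gzFile f = gzopen("example.gz", "rb");
    if (f == NULL)
        return;
    if (gzseek(f, 1024, SEEK_SET) != -1) {  /* skip first 1 KiB of output */
        char byte;
        (void)gzread(f, &byte, 1);          /* pending skip performed here */
        (void)gztell(f);                    /* now reports 1025 */
    }
    gzclose(f);
}
#endif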

/* -- see zlib.h -- */
z_off64_t Z_EXPORT PREFIX4(gzoffset)(gzFile file) {
    z_off64_t offset;
    gz_state *state;

    /* get internal structure and check integrity */
    if (file == NULL)
        return -1;
    state = (gz_state *)file;
    if (state->mode != GZ_READ && state->mode != GZ_WRITE)
        return -1;

    /* compute and return effective offset in file */
    offset = LSEEK(state->fd, 0, SEEK_CUR);
    if (offset == -1)
        return -1;
    if (state->mode == GZ_READ)             /* reading */
        offset -= state->strm.avail_in;     /* don't count buffered input */
    return offset;
}

/* -- see zlib.h -- */
#ifdef ZLIB_COMPAT
z_off_t Z_EXPORT PREFIX(gzoffset)(gzFile file) {
    z_off64_t ret;

    ret = PREFIX4(gzoffset)(file);
    return ret == (z_off_t)ret ? (z_off_t)ret : -1;
}
#endif

/* -- see zlib.h -- */
int Z_EXPORT PREFIX(gzeof)(gzFile file) {
    gz_state *state;

    /* get internal structure and check integrity */
    if (file == NULL)
        return 0;
    state = (gz_state *)file;
    if (state->mode != GZ_READ && state->mode != GZ_WRITE)
        return 0;

    /* return end-of-file state */
    return state->mode == GZ_READ ? state->past : 0;
}

/* -- see zlib.h -- */
const char * Z_EXPORT PREFIX(gzerror)(gzFile file, int *errnum) {
    gz_state *state;

    /* get internal structure and check integrity */
    if (file == NULL)
        return NULL;
    state = (gz_state *)file;
    if (state->mode != GZ_READ && state->mode != GZ_WRITE)
        return NULL;

    /* return error information */
    if (errnum != NULL)
        *errnum = state->err;
    return state->err == Z_MEM_ERROR ? "out of memory" : (state->msg == NULL ? "" : state->msg);
}
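
/* Editor's note -- illustrative sketch, not part of the zlib-ng sources.
   The usual pattern after a short read: distinguish end-of-file from a
   real error with gzeof()/gzerror() before giving up. */
#if 0
#include <stdio.h>
#include "zlib.h"

static void gz_error_example(gzFile f) {
    char buf[4096];
    int n = gzread(f, buf, sizeof(buf));
    if (n < (int)sizeof(buf) && !gzeof(f)) {
        int errnum;
        const char *msg = gzerror(f, &errnum);
        if (errnum != Z_OK && errnum != Z_BUF_ERROR)
            fprintf(stderr, "gzread failed: %s\n", msg);
    }
}
#endif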

/* -- see zlib.h -- */
void Z_EXPORT PREFIX(gzclearerr)(gzFile file) {
    gz_state *state;

    /* get internal structure and check integrity */
    if (file == NULL)
        return;
    state = (gz_state *)file;
    if (state->mode != GZ_READ && state->mode != GZ_WRITE)
        return;

    /* clear error and end-of-file */
    if (state->mode == GZ_READ) {
        state->eof = 0;
        state->past = 0;
    }
    gz_error(state, Z_OK, NULL);
}

/* Create an error message in allocated memory and set state->err and
   state->msg accordingly. Free any previous error message already there. Do
   not try to free or allocate space if the error is Z_MEM_ERROR (out of
   memory). Simply save the error message as a static string. If there is an
   allocation failure constructing the error message, then convert the error to
   out of memory. */
void Z_INTERNAL gz_error(gz_state *state, int err, const char *msg) {
    /* free previously allocated message and clear */
    if (state->msg != NULL) {
        if (state->err != Z_MEM_ERROR)
            free(state->msg);
        state->msg = NULL;
    }

    /* if fatal, set state->x.have to 0 so that the gzgetc() macro fails */
    if (err != Z_OK && err != Z_BUF_ERROR)
        state->x.have = 0;

    /* set error code, and if no message, then done */
    state->err = err;
    if (msg == NULL)
        return;

    /* for an out of memory error, return literal string when requested */
    if (err == Z_MEM_ERROR)
        return;

    /* construct error message with path */
    if ((state->msg = (char *)malloc(strlen(state->path) + strlen(msg) + 3)) == NULL) {
        state->err = Z_MEM_ERROR;
        return;
    }
    (void)snprintf(state->msg, strlen(state->path) + strlen(msg) + 3, "%s%s%s", state->path, ": ", msg);
}
@ -0,0 +1,606 @@
/* gzread.c -- zlib functions for reading gzip files
 * Copyright (C) 2004-2017 Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "zutil_p.h"
#include "gzguts.h"

/* Local functions */
static int gz_load(gz_state *, unsigned char *, unsigned, unsigned *);
static int gz_avail(gz_state *);
static int gz_look(gz_state *);
static int gz_decomp(gz_state *);
static int gz_fetch(gz_state *);
static int gz_skip(gz_state *, z_off64_t);
static size_t gz_read(gz_state *, void *, size_t);

/* Use read() to load a buffer -- return -1 on error, otherwise 0.  Read from
   state->fd, and update state->eof, state->err, and state->msg as appropriate.
   This function needs to loop on read(), since read() is not guaranteed to
   read the number of bytes requested, depending on the type of descriptor. */
static int gz_load(gz_state *state, unsigned char *buf, unsigned len, unsigned *have) {
    ssize_t ret;

    *have = 0;
    do {
        ret = read(state->fd, buf + *have, len - *have);
        if (ret <= 0)
            break;
        *have += (unsigned)ret;
    } while (*have < len);
    if (ret < 0) {
        gz_error(state, Z_ERRNO, zstrerror());
        return -1;
    }
    if (ret == 0)
        state->eof = 1;
    return 0;
}

/* Load up input buffer and set eof flag if last data loaded -- return -1 on
   error, 0 otherwise.  Note that the eof flag is set when the end of the input
   file is reached, even though there may be unused data in the buffer.  Once
   that data has been used, no more attempts will be made to read the file.
   If strm->avail_in != 0, then the current data is moved to the beginning of
   the input buffer, and then the remainder of the buffer is loaded with the
   available data from the input file. */
static int gz_avail(gz_state *state) {
    unsigned got;
    PREFIX3(stream) *strm = &(state->strm);

    if (state->err != Z_OK && state->err != Z_BUF_ERROR)
        return -1;
    if (state->eof == 0) {
        if (strm->avail_in) {       /* copy what's there to the start */
            unsigned char *p = state->in;
            unsigned const char *q = strm->next_in;
            unsigned n = strm->avail_in;
            do {
                *p++ = *q++;
            } while (--n);
        }
        if (gz_load(state, state->in + strm->avail_in, state->size - strm->avail_in, &got) == -1)
            return -1;
        strm->avail_in += got;
        strm->next_in = state->in;
    }
    return 0;
}

/* Look for gzip header, set up for inflate or copy.  state->x.have must be 0.
   If this is the first time in, allocate required memory.  state->how will be
   left unchanged if there is no more input data available, will be set to COPY
   if there is no gzip header and direct copying will be performed, or it will
   be set to GZIP for decompression.  If direct copying, then leftover input
   data from the input buffer will be copied to the output buffer.  In that
   case, all further file reads will be directly to either the output buffer or
   a user buffer.  If decompressing, the inflate state will be initialized.
   gz_look() will return 0 on success or -1 on failure. */
static int gz_look(gz_state *state) {
    PREFIX3(stream) *strm = &(state->strm);

    /* allocate read buffers and inflate memory */
    if (state->size == 0) {
        /* allocate buffers */
        state->in = (unsigned char *)zng_alloc(state->want);
        state->out = (unsigned char *)zng_alloc(state->want << 1);
        if (state->in == NULL || state->out == NULL) {
            zng_free(state->out);
            zng_free(state->in);
            gz_error(state, Z_MEM_ERROR, "out of memory");
            return -1;
        }
        state->size = state->want;

        /* allocate inflate memory */
        state->strm.zalloc = NULL;
        state->strm.zfree = NULL;
        state->strm.opaque = NULL;
        state->strm.avail_in = 0;
        state->strm.next_in = NULL;
        if (PREFIX(inflateInit2)(&(state->strm), MAX_WBITS + 16) != Z_OK) {    /* gunzip */
            zng_free(state->out);
            zng_free(state->in);
            state->size = 0;
            gz_error(state, Z_MEM_ERROR, "out of memory");
            return -1;
        }
    }

    /* get at least the magic bytes in the input buffer */
    if (strm->avail_in < 2) {
        if (gz_avail(state) == -1)
            return -1;
        if (strm->avail_in == 0)
            return 0;
    }

    /* look for gzip magic bytes -- if there, do gzip decoding (note: there is
       a logical dilemma here when considering the case of a partially written
       gzip file, to wit, if a single 31 byte is written, then we cannot tell
       whether this is a single-byte file, or just a partially written gzip
       file -- for here we assume that if a gzip file is being written, then
       the header will be written in a single operation, so that reading a
       single byte is sufficient indication that it is not a gzip file) */
    if (strm->avail_in > 1 &&
            strm->next_in[0] == 31 && strm->next_in[1] == 139) {
        PREFIX(inflateReset)(strm);
        state->how = GZIP;
        state->direct = 0;
        return 0;
    }

    /* no gzip header -- if we were decoding gzip before, then this is trailing
       garbage.  Ignore the trailing garbage and finish. */
    if (state->direct == 0) {
        strm->avail_in = 0;
        state->eof = 1;
        state->x.have = 0;
        return 0;
    }

    /* doing raw i/o, copy any leftover input to output -- this assumes that
       the output buffer is larger than the input buffer, which also assures
       space for gzungetc() */
    state->x.next = state->out;
    memcpy(state->x.next, strm->next_in, strm->avail_in);
    state->x.have = strm->avail_in;
    strm->avail_in = 0;
    state->how = COPY;
    state->direct = 1;
    return 0;
}

/* Decompress from input to the provided next_out and avail_out in the state.
   On return, state->x.have and state->x.next point to the just decompressed
   data.  If the gzip stream completes, state->how is reset to LOOK to look for
   the next gzip stream or raw data, once state->x.have is depleted.  Returns 0
   on success, -1 on failure. */
static int gz_decomp(gz_state *state) {
    int ret = Z_OK;
    unsigned had;
    PREFIX3(stream) *strm = &(state->strm);

    /* fill output buffer up to end of deflate stream */
    had = strm->avail_out;
    do {
        /* get more input for inflate() */
        if (strm->avail_in == 0 && gz_avail(state) == -1)
            return -1;
        if (strm->avail_in == 0) {
            gz_error(state, Z_BUF_ERROR, "unexpected end of file");
            break;
        }

        /* decompress and handle errors */
        ret = PREFIX(inflate)(strm, Z_NO_FLUSH);
        if (ret == Z_STREAM_ERROR || ret == Z_NEED_DICT) {
            gz_error(state, Z_STREAM_ERROR, "internal error: inflate stream corrupt");
            return -1;
        }
        if (ret == Z_MEM_ERROR) {
            gz_error(state, Z_MEM_ERROR, "out of memory");
            return -1;
        }
        if (ret == Z_DATA_ERROR) {              /* deflate stream invalid */
            gz_error(state, Z_DATA_ERROR, strm->msg == NULL ? "compressed data error" : strm->msg);
            return -1;
        }
    } while (strm->avail_out && ret != Z_STREAM_END);

    /* update available output */
    state->x.have = had - strm->avail_out;
    state->x.next = strm->next_out - state->x.have;

    /* if the gzip stream completed successfully, look for another */
    if (ret == Z_STREAM_END)
        state->how = LOOK;

    /* good decompression */
    return 0;
}

/* Fetch data and put it in the output buffer.  Assumes state->x.have is 0.
   Data is either copied from the input file or decompressed from the input
   file depending on state->how.  If state->how is LOOK, then a gzip header is
   looked for to determine whether to copy or decompress.  Returns -1 on error,
   otherwise 0.  gz_fetch() will leave state->how as COPY or GZIP unless the
   end of the input file has been reached and all data has been processed. */
static int gz_fetch(gz_state *state) {
    PREFIX3(stream) *strm = &(state->strm);

    do {
        switch (state->how) {
        case LOOK:      /* -> LOOK, COPY (only if never GZIP), or GZIP */
            if (gz_look(state) == -1)
                return -1;
            if (state->how == LOOK)
                return 0;
            break;
        case COPY:      /* -> COPY */
            if (gz_load(state, state->out, state->size << 1, &(state->x.have)) == -1)
                return -1;
            state->x.next = state->out;
            return 0;
        case GZIP:      /* -> GZIP or LOOK (if end of gzip stream) */
            strm->avail_out = state->size << 1;
            strm->next_out = state->out;
            if (gz_decomp(state) == -1)
                return -1;
        }
    } while (state->x.have == 0 && (!state->eof || strm->avail_in));
    return 0;
}

/* Skip len uncompressed bytes of output.  Return -1 on error, 0 on success. */
static int gz_skip(gz_state *state, z_off64_t len) {
    unsigned n;

    /* skip over len bytes or reach end-of-file, whichever comes first */
    while (len)
        /* skip over whatever is in output buffer */
        if (state->x.have) {
            n = GT_OFF(state->x.have) || (z_off64_t)state->x.have > len ?
                (unsigned)len : state->x.have;
            state->x.have -= n;
            state->x.next += n;
            state->x.pos += n;
            len -= n;
        } else if (state->eof && state->strm.avail_in == 0) {
            /* output buffer empty -- return if we're at the end of the input */
            break;
        } else {
            /* need more data to skip -- load up output buffer */
            /* get more output, looking for header if required */
            if (gz_fetch(state) == -1)
                return -1;
        }
    return 0;
}

/* Read len bytes into buf from file, or less than len up to the end of the
   input.  Return the number of bytes read.  If zero is returned, either the
   end of file was reached, or there was an error.  state->err must be
   consulted in that case to determine which. */
static size_t gz_read(gz_state *state, void *buf, size_t len) {
    size_t got;
    unsigned n;

    /* if len is zero, avoid unnecessary operations */
    if (len == 0)
        return 0;

    /* process a skip request */
    if (state->seek) {
        state->seek = 0;
        if (gz_skip(state, state->skip) == -1)
            return 0;
    }

    /* get len bytes to buf, or less than len if at the end */
    got = 0;
    do {
        /* set n to the maximum amount of len that fits in an unsigned int */
        n = (unsigned)-1;
        if (n > len)
            n = (unsigned)len;

        /* first just try copying data from the output buffer */
        if (state->x.have) {
            if (state->x.have < n)
                n = state->x.have;
            memcpy(buf, state->x.next, n);
            state->x.next += n;
            state->x.have -= n;
        }

        /* output buffer empty -- return if we're at the end of the input */
        else if (state->eof && state->strm.avail_in == 0) {
            state->past = 1;        /* tried to read past end */
            break;
        }

        /* need output data -- for small len or new stream load up our output
           buffer */
        else if (state->how == LOOK || n < (state->size << 1)) {
            /* get more output, looking for header if required */
            if (gz_fetch(state) == -1)
                return 0;
            continue;       /* no progress yet -- go back to copy above */
            /* the copy above assures that we will leave with space in the
               output buffer, allowing at least one gzungetc() to succeed */
        }

        /* large len -- read directly into user buffer */
        else if (state->how == COPY) {      /* read directly */
            if (gz_load(state, (unsigned char *)buf, n, &n) == -1)
                return 0;
        }

        /* large len -- decompress directly into user buffer */
        else {  /* state->how == GZIP */
            state->strm.avail_out = n;
            state->strm.next_out = (unsigned char *)buf;
            if (gz_decomp(state) == -1)
                return 0;
            n = state->x.have;
            state->x.have = 0;
        }

        /* update progress */
        len -= n;
        buf = (char *)buf + n;
        got += n;
        state->x.pos += n;
    } while (len);

    /* return number of bytes read into user buffer */
    return got;
}

/* -- see zlib.h -- */
int Z_EXPORT PREFIX(gzread)(gzFile file, void *buf, unsigned len) {
    gz_state *state;

    /* get internal structure */
    if (file == NULL)
        return -1;
    state = (gz_state *)file;

    /* check that we're reading and that there's no (serious) error */
    if (state->mode != GZ_READ ||
            (state->err != Z_OK && state->err != Z_BUF_ERROR))
        return -1;

    /* since an int is returned, make sure len fits in one, otherwise return
       with an error (this avoids a flaw in the interface) */
    if ((int)len < 0) {
        gz_error(state, Z_STREAM_ERROR, "request does not fit in an int");
        return -1;
    }

    /* read len or fewer bytes to buf */
    len = (unsigned)gz_read(state, buf, len);

    /* check for an error */
    if (len == 0 && state->err != Z_OK && state->err != Z_BUF_ERROR)
        return -1;

    /* return the number of bytes read (this is assured to fit in an int) */
    return (int)len;
}
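
/* Editor's note -- illustrative sketch, not part of the zlib-ng sources.
   The canonical read loop: gzread() may return fewer bytes than requested,
   so loop until 0 (end of file) or -1 (error). */
#if 0
#include <stdio.h>
#include "zlib.h"

static void gz_read_example(void) {
    char buf[8192];
    int n;
    gzFile f = gzopen("example.gz", "rb");
    if (f == NULL)
        return;
    while ((n = gzread(f, buf, sizeof(buf))) > 0)
        fwrite(buf, 1, (size_t)n, stdout);  /* consume decompressed bytes */
    if (n < 0)
        fprintf(stderr, "read error\n");
    gzclose(f);
}
#endif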

/* -- see zlib.h -- */
size_t Z_EXPORT PREFIX(gzfread)(void *buf, size_t size, size_t nitems, gzFile file) {
    size_t len;
    gz_state *state;

    /* Exit early if size is zero, also prevents potential division by zero */
    if (size == 0)
        return 0;

    /* get internal structure */
    if (file == NULL)
        return 0;
    state = (gz_state *)file;

    /* check that we're reading and that there's no (serious) error */
    if (state->mode != GZ_READ ||
            (state->err != Z_OK && state->err != Z_BUF_ERROR))
        return 0;

    /* compute bytes to read -- error on overflow */
    if (size && SIZE_MAX / size < nitems) {
        gz_error(state, Z_STREAM_ERROR, "request does not fit in a size_t");
        return 0;
    }
    len = nitems * size;

    /* read len or fewer bytes to buf, return the number of full items read */
    return len ? gz_read(state, buf, len) / size : 0;
}
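
/* Editor's note -- illustrative sketch, not part of the zlib-ng sources.
   Like fread(), gzfread() returns the number of *complete* items read; a
   trailing partial item is still stored in buf but not counted, so a short
   count alone does not say how many bytes arrived. The record layout here
   is an arbitrary example. */
#if 0
#include "zlib.h"

struct record { int id; double value; };

static size_t gz_fread_example(gzFile f, struct record *recs, size_t count) {
    size_t got = gzfread(recs, sizeof(struct record), count, f);
    /* got < count means EOF or error -- check gzeof()/gzerror() to tell */
    return got;
}
#endif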

/* -- see zlib.h -- */
#undef @ZLIB_SYMBOL_PREFIX@gzgetc
#undef @ZLIB_SYMBOL_PREFIX@zng_gzgetc
int Z_EXPORT PREFIX(gzgetc)(gzFile file) {
    unsigned char buf[1];
    gz_state *state;

    /* get internal structure */
    if (file == NULL)
        return -1;
    state = (gz_state *)file;

    /* check that we're reading and that there's no (serious) error */
    if (state->mode != GZ_READ || (state->err != Z_OK && state->err != Z_BUF_ERROR))
        return -1;

    /* try output buffer (no need to check for skip request) */
    if (state->x.have) {
        state->x.have--;
        state->x.pos++;
        return *(state->x.next)++;
    }

    /* nothing there -- try gz_read() */
    return gz_read(state, buf, 1) < 1 ? -1 : buf[0];
}

#ifdef ZLIB_COMPAT
int Z_EXPORT PREFIX(gzgetc_)(gzFile file) {
    return PREFIX(gzgetc)(file);
}
#endif

/* -- see zlib.h -- */
int Z_EXPORT PREFIX(gzungetc)(int c, gzFile file) {
    gz_state *state;

    /* get internal structure */
    if (file == NULL)
        return -1;
    state = (gz_state *)file;

    /* in case this was just opened, set up the input buffer */
    if (state->mode == GZ_READ && state->how == LOOK && state->x.have == 0)
        (void)gz_look(state);

    /* check that we're reading and that there's no (serious) error */
    if (state->mode != GZ_READ || (state->err != Z_OK && state->err != Z_BUF_ERROR))
        return -1;

    /* process a skip request */
    if (state->seek) {
        state->seek = 0;
        if (gz_skip(state, state->skip) == -1)
            return -1;
    }

    /* can't push EOF */
    if (c < 0)
        return -1;

    /* if output buffer empty, put byte at end (allows more pushing) */
    if (state->x.have == 0) {
        state->x.have = 1;
        state->x.next = state->out + (state->size << 1) - 1;
        state->x.next[0] = (unsigned char)c;
        state->x.pos--;
        state->past = 0;
        return c;
    }

    /* if no room, give up (must have already done a gzungetc()) */
    if (state->x.have == (state->size << 1)) {
        gz_error(state, Z_DATA_ERROR, "out of room to push characters");
        return -1;
    }

    /* slide output data if needed and insert byte before existing data */
    if (state->x.next == state->out) {
        unsigned char *src = state->out + state->x.have;
        unsigned char *dest = state->out + (state->size << 1);
        while (src > state->out)
            *--dest = *--src;
        state->x.next = dest;
    }
    state->x.have++;
    state->x.next--;
    state->x.next[0] = (unsigned char)c;
    state->x.pos--;
    state->past = 0;
    return c;
}
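
/* Editor's note -- illustrative sketch, not part of the zlib-ng sources.
   One-character lookahead with gzgetc()/gzungetc(), e.g. to peek at a
   record tag before dispatching; the pushed-back byte is returned by the
   next read. */
#if 0
#include "zlib.h"

static int gz_peek_example(gzFile f) {
    int c = gzgetc(f);
    if (c != -1)
        (void)gzungetc(c, f);   /* next gzgetc()/gzread() sees c again */
    return c;                   /* -1 on EOF or error */
}
#endif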

/* -- see zlib.h -- */
char * Z_EXPORT PREFIX(gzgets)(gzFile file, char *buf, int len) {
    unsigned left, n;
    char *str;
    unsigned char *eol;
    gz_state *state;

    /* check parameters and get internal structure */
    if (file == NULL || buf == NULL || len < 1)
        return NULL;
    state = (gz_state *)file;

    /* check that we're reading and that there's no (serious) error */
    if (state->mode != GZ_READ || (state->err != Z_OK && state->err != Z_BUF_ERROR))
        return NULL;

    /* process a skip request */
    if (state->seek) {
        state->seek = 0;
        if (gz_skip(state, state->skip) == -1)
            return NULL;
    }

    /* copy output bytes up to new line or len - 1, whichever comes first --
       append a terminating zero to the string (we don't check for a zero in
       the contents, let the user worry about that) */
    str = buf;
    left = (unsigned)len - 1;
    if (left) {
        do {
            /* assure that something is in the output buffer */
            if (state->x.have == 0 && gz_fetch(state) == -1)
                return NULL;            /* error */
            if (state->x.have == 0) {   /* end of file */
                state->past = 1;        /* read past end */
                break;                  /* return what we have */
            }

            /* look for end-of-line in current output buffer */
            n = state->x.have > left ? left : state->x.have;
            eol = (unsigned char *)memchr(state->x.next, '\n', n);
            if (eol != NULL)
                n = (unsigned)(eol - state->x.next) + 1;

            /* copy through end-of-line, or remainder if not found */
            memcpy(buf, state->x.next, n);
            state->x.have -= n;
            state->x.next += n;
            state->x.pos += n;
            left -= n;
            buf += n;
        } while (left && eol == NULL);
    }

    /* return terminated string, or if nothing, end of file */
    if (buf == str)
        return NULL;
    buf[0] = 0;
    return str;
}
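
/* Editor's note -- illustrative sketch, not part of the zlib-ng sources.
   Line-by-line reading of a gzipped text file; as with fgets(), the
   newline is kept in the buffer when it fits, and the file name is an
   arbitrary example. */
#if 0
#include <stdio.h>
#include "zlib.h"

static void gz_gets_example(void) {
    char line[1024];
    gzFile f = gzopen("example.txt.gz", "rb");
    if (f == NULL)
        return;
    while (gzgets(f, line, (int)sizeof(line)) != NULL)
        fputs(line, stdout);    /* line is always zero-terminated */
    gzclose(f);
}
#endif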

/* -- see zlib.h -- */
int Z_EXPORT PREFIX(gzdirect)(gzFile file) {
    gz_state *state;

    /* get internal structure */
    if (file == NULL)
        return 0;
    state = (gz_state *)file;

    /* if the state is not known, but we can find out, then do so (this is
       mainly for right after a gzopen() or gzdopen()) */
    if (state->mode == GZ_READ && state->how == LOOK && state->x.have == 0)
        (void)gz_look(state);

    /* return 1 if transparent, 0 if processing a gzip stream */
    return state->direct;
}

/* -- see zlib.h -- */
int Z_EXPORT PREFIX(gzclose_r)(gzFile file) {
    int ret, err;
    gz_state *state;

    /* get internal structure */
    if (file == NULL)
        return Z_STREAM_ERROR;
    state = (gz_state *)file;

    /* check that we're reading */
    if (state->mode != GZ_READ)
        return Z_STREAM_ERROR;

    /* free memory and close file */
    if (state->size) {
        PREFIX(inflateEnd)(&(state->strm));
        zng_free(state->out);
        zng_free(state->in);
    }
    err = state->err == Z_BUF_ERROR ? Z_BUF_ERROR : Z_OK;
    gz_error(state, Z_OK, NULL);
    free(state->path);
    ret = close(state->fd);
    zng_free(state);
    return ret ? Z_ERRNO : err;
}