Merge pull request #26113 from FantasqueX:zlib-ng-2-2-1

Update zlib-ng to 2.2.1 #26113

Release: https://github.com/zlib-ng/zlib-ng/releases/tag/2.2.1
ARM diagnostics patch: https://github.com/zlib-ng/zlib-ng/pull/1774

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2 months ago · 85923c8f30
parent 7de3a8e960
commit 85923c8f30
132 changed files with 7213 additions and 1750 deletions
--- a/3rdparty/zlib-ng/CMakeLists.txt
+++ b/3rdparty/zlib-ng/CMakeLists.txt
--- a/3rdparty/zlib-ng/LICENSE.md
+++ b/3rdparty/zlib-ng/LICENSE.md
@ -1,4 +1,4 @@
-(C) 1995-2013 Jean-loup Gailly and Mark Adler
+(C) 1995-2024 Jean-loup Gailly and Mark Adler

 This software is provided 'as-is', without any express or implied
 warranty. In no event will the authors be held liable for any damages
--- a/3rdparty/zlib-ng/README.md
+++ b/3rdparty/zlib-ng/README.md
@ -21,7 +21,6 @@ Features
 * Support for CPU intrinsics when available
  * Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX
  * CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z
-  * Hash table implementation using CRC32-C intrinsics on x86 and ARM
  * Slide hash implementations using SSE2, AVX2, ARMv6, Neon, VMX & VSX
  * Compare256 implementations using SSE2, AVX2, Neon, POWER9 & RVV
  * Inflate chunk copying using SSE2, SSSE3, AVX, Neon & VSX
@ -95,20 +94,21 @@ make test
 Build Options
 -------------

-| CMake                    | configure                | Description                                                                           | Default |
-|:-------------------------|:-------------------------|:--------------------------------------------------------------------------------------|---------|
-| ZLIB_COMPAT              | --zlib-compat            | Compile with zlib compatible API                                                      | OFF     |
-| ZLIB_ENABLE_TESTS        |                          | Build test binaries                                                                   | ON      |
-| WITH_GZFILEOP            | --without-gzfileops      | Compile with support for gzFile related functions                                     | ON      |
-| WITH_OPTIM               | --without-optimizations  | Build with optimisations                                                              | ON      |
-| WITH_NEW_STRATEGIES      | --without-new-strategies | Use new strategies                                                                    | ON      |
-| WITH_NATIVE_INSTRUCTIONS |                          | Compiles with full instruction set supported on this host (gcc/clang -march=native)   | OFF     |
-| WITH_SANITIZER           |                          | Build with sanitizer (memory, address, undefined)                                     | OFF     |
-| WITH_GTEST               |                          | Build gtest_zlib                                                                      | ON      |
-| WITH_FUZZERS             |                          | Build test/fuzz                                                                       | OFF     |
-| WITH_BENCHMARKS          |                          | Build test/benchmarks                                                                 | OFF     |
-| WITH_MAINTAINER_WARNINGS |                          | Build with project maintainer warnings                                                | OFF     |
-| WITH_CODE_COVERAGE       |                          | Enable code coverage reporting                                                        | OFF     |
+| CMake                      | configure                | Description                                                                         | Default |
+|:---------------------------|:-------------------------|:------------------------------------------------------------------------------------|---------|
+| ZLIB_COMPAT                | --zlib-compat            | Compile with zlib compatible API                                                    | OFF     |
+| ZLIB_ENABLE_TESTS          |                          | Build test binaries                                                                 | ON      |
+| WITH_GZFILEOP              | --without-gzfileops      | Compile with support for gzFile related functions                                   | ON      |
+| WITH_OPTIM                 | --without-optimizations  | Build with optimisations                                                            | ON      |
+| WITH_NEW_STRATEGIES        | --without-new-strategies | Use new strategies                                                                  | ON      |
+| WITH_NATIVE_INSTRUCTIONS   |                          | Compiles with full instruction set supported on this host (gcc/clang -march=native) | OFF     |
+| WITH_RUNTIME_CPU_DETECTION |                          | Compiles with runtime CPU detection                                                 | ON      |
+| WITH_SANITIZER             |                          | Build with sanitizer (memory, address, undefined)                                   | OFF     |
+| WITH_GTEST                 |                          | Build gtest_zlib                                                                    | ON      |
+| WITH_FUZZERS               |                          | Build test/fuzz                                                                     | OFF     |
+| WITH_BENCHMARKS            |                          | Build test/benchmarks                                                               | OFF     |
+| WITH_MAINTAINER_WARNINGS   |                          | Build with project maintainer warnings                                              | OFF     |
+| WITH_CODE_COVERAGE         |                          | Enable code coverage reporting                                                      | OFF     |


 Install
--- a/3rdparty/zlib-ng/adler32.c
+++ b/3rdparty/zlib-ng/adler32.c
@ -7,70 +7,24 @@
 #include "functable.h"
 #include "adler32_p.h"

-/* ========================================================================= */
-Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
-    uint32_t sum2;
-    unsigned n;
-
-    /* split Adler-32 into component sums */
-    sum2 = (adler >> 16) & 0xffff;
-    adler &= 0xffff;
-
-    /* in case user likes doing a byte at a time, keep it fast */
-    if (UNLIKELY(len == 1))
-        return adler32_len_1(adler, buf, sum2);
-
-    /* initial Adler-32 value (deferred check for len == 1 speed) */
-    if (UNLIKELY(buf == NULL))
-        return 1L;
-
-    /* in case short lengths are provided, keep it somewhat fast */
-    if (UNLIKELY(len < 16))
-        return adler32_len_16(adler, buf, len, sum2);
-
-    /* do length NMAX blocks -- requires just one modulo operation */
-    while (len >= NMAX) {
-        len -= NMAX;
-#ifdef UNROLL_MORE
-        n = NMAX / 16;          /* NMAX is divisible by 16 */
-#else
-        n = NMAX / 8;           /* NMAX is divisible by 8 */
-#endif
-        do {
-#ifdef UNROLL_MORE
-            DO16(adler, sum2, buf);          /* 16 sums unrolled */
-            buf += 16;
-#else
-            DO8(adler, sum2, buf, 0);         /* 8 sums unrolled */
-            buf += 8;
-#endif
-        } while (--n);
-        adler %= BASE;
-        sum2 %= BASE;
-    }
-
-    /* do remaining bytes (less than NMAX, still just one modulo) */
-    return adler32_len_64(adler, buf, len, sum2);
-}
-
 #ifdef ZLIB_COMPAT
 unsigned long Z_EXPORT PREFIX(adler32_z)(unsigned long adler, const unsigned char *buf, size_t len) {
-    return (unsigned long)functable.adler32((uint32_t)adler, buf, len);
+    return (unsigned long)FUNCTABLE_CALL(adler32)((uint32_t)adler, buf, len);
 }
 #else
 uint32_t Z_EXPORT PREFIX(adler32_z)(uint32_t adler, const unsigned char *buf, size_t len) {
-    return functable.adler32(adler, buf, len);
+    return FUNCTABLE_CALL(adler32)(adler, buf, len);
 }
 #endif

 /* ========================================================================= */
 #ifdef ZLIB_COMPAT
 unsigned long Z_EXPORT PREFIX(adler32)(unsigned long adler, const unsigned char *buf, unsigned int len) {
-    return (unsigned long)functable.adler32((uint32_t)adler, buf, len);
+    return (unsigned long)FUNCTABLE_CALL(adler32)((uint32_t)adler, buf, len);
 }
 #else
 uint32_t Z_EXPORT PREFIX(adler32)(uint32_t adler, const unsigned char *buf, uint32_t len) {
-    return functable.adler32(adler, buf, len);
+    return FUNCTABLE_CALL(adler32)(adler, buf, len);
 }
 #endif

--- a/3rdparty/zlib-ng/adler32_fold.h
+++ b/3rdparty/zlib-ng/adler32_fold.h
@ -1,11 +0,0 @@
-/* adler32_fold.h -- adler32 folding interface
- * Copyright (C) 2022 Adam Stylinski
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#ifndef ADLER32_FOLD_H_
-#define ADLER32_FOLD_H_
-
-Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-
-#endif
--- a/3rdparty/zlib-ng/arch/.gitignore
+++ b/3rdparty/zlib-ng/arch/.gitignore
@ -1,2 +0,0 @@
-# ignore Makefiles; they're all automatically generated
-Makefile
--- a/3rdparty/zlib-ng/arch/arm/Makefile.in
+++ b/3rdparty/zlib-ng/arch/arm/Makefile.in
@ -25,7 +25,6 @@ all: \
 	crc32_acle.o crc32_acle.lo \
 	slide_hash_neon.o slide_hash_neon.lo \
 	slide_hash_armv6.o slide_hash_armv6.lo \
-	insert_string_acle.o insert_string_acle.lo

 adler32_neon.o:
 	$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
@ -69,12 +68,6 @@ slide_hash_armv6.o:
 slide_hash_armv6.lo:
 	$(CC) $(SFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c

-insert_string_acle.o:
-	$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
-
-insert_string_acle.lo:
-	$(CC) $(SFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
-
 mostlyclean: clean
 clean:
 	rm -f *.o *.lo *~
--- a/3rdparty/zlib-ng/arch/arm/adler32_neon.c
+++ b/3rdparty/zlib-ng/arch/arm/adler32_neon.c
@ -7,8 +7,8 @@
 */
 #ifdef ARM_NEON
 #include "neon_intrins.h"
-#include "../../zbuild.h"
-#include "../../adler32_p.h"
+#include "zbuild.h"
+#include "adler32_p.h"

 static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
    static const uint16_t ALIGNED_(16) taps[64] = {
--- a/3rdparty/zlib-ng/arch/arm/arm_features.c
+++ b/3rdparty/zlib-ng/arch/arm/arm_features.c
@ -1,4 +1,4 @@
-#include "../../zbuild.h"
+#include "zbuild.h"
 #include "arm_features.h"

 #if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
@ -11,6 +11,11 @@
 #  ifndef ID_AA64ISAR0_CRC32_VAL
 #    define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32
 #  endif
+#elif defined(__OpenBSD__) && defined(__aarch64__)
+#  include <machine/armreg.h>
+#  include <machine/cpu.h>
+#  include <sys/sysctl.h>
+#  include <sys/types.h>
 #elif defined(__APPLE__)
 #  if !defined(_DARWIN_C_SOURCE)
 #    define _DARWIN_C_SOURCE /* enable types aliases (eg u_int) */
@ -30,6 +35,16 @@ static int arm_has_crc32() {
 #elif defined(__FreeBSD__) && defined(__aarch64__)
    return getenv("QEMU_EMULATING") == NULL
      && ID_AA64ISAR0_CRC32_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_CRC32_BASE;
+#elif defined(__OpenBSD__) && defined(__aarch64__)
+    int hascrc32 = 0;
+    int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 };
+    uint64_t isar0 = 0;
+    size_t len = sizeof(isar0);
+    if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) {
+      if (ID_AA64ISAR0_CRC32(isar0) >= ID_AA64ISAR0_CRC32_BASE)
+          hascrc32 = 1;
+    }
+    return hascrc32;
 #elif defined(__APPLE__)
    int hascrc32;
    size_t size = sizeof(hascrc32);
--- a/3rdparty/zlib-ng/arch/arm/arm_features.h
+++ b/3rdparty/zlib-ng/arch/arm/arm_features.h
@ -2,8 +2,8 @@
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

-#ifndef ARM_H_
-#define ARM_H_
+#ifndef ARM_FEATURES_H_
+#define ARM_FEATURES_H_

 struct arm_cpu_features {
    int has_simd;
@ -13,4 +13,4 @@ struct arm_cpu_features {

 void Z_INTERNAL arm_check_features(struct arm_cpu_features *features);

-#endif /* ARM_H_ */
+#endif /* ARM_FEATURES_H_ */
--- a/3rdparty/zlib-ng/arch/arm/arm_functions.h
+++ b/3rdparty/zlib-ng/arch/arm/arm_functions.h
@ -0,0 +1,65 @@
+/* arm_functions.h -- ARM implementations for arch-specific functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ARM_FUNCTIONS_H_
+#define ARM_FUNCTIONS_H_
+
+#ifdef ARM_NEON /* prototypes for the *_neon implementations; visible only when ARM_NEON is defined */
+uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t chunksize_neon(void);
+uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+
+#  ifdef HAVE_BUILTIN_CTZLL /* the next three prototypes additionally require HAVE_BUILTIN_CTZLL */
+uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
+uint32_t longest_match_neon(deflate_state *const s, Pos cur_match);
+uint32_t longest_match_slow_neon(deflate_state *const s, Pos cur_match);
+#  endif /* HAVE_BUILTIN_CTZLL */
+void slide_hash_neon(deflate_state *s);
+void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start);
+#endif /* ARM_NEON */
+
+#ifdef ARM_ACLE /* prototype for crc32_acle; visible only when ARM_ACLE is defined */
+uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len);
+#endif /* ARM_ACLE */
+
+#ifdef ARM_SIMD /* prototype for slide_hash_armv6; visible only when ARM_SIMD is defined */
+void slide_hash_armv6(deflate_state *s);
+#endif /* ARM_SIMD */
+
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// With runtime CPU detection disabled, the native_* names are bound at compile time.
+// Every section #undef's a name before redefining it, so a later section overrides an
+// earlier one (e.g. the NEON slide_hash replaces the ARMv6 slide_hash when both apply).
+// ARM - SIMD
+#  if (defined(ARM_SIMD) && defined(__ARM_FEATURE_SIMD32)) || defined(ARM_NOCHECK_SIMD)
+#    undef native_slide_hash
+#    define native_slide_hash slide_hash_armv6
+#  endif
+// ARM - NEON
+// ARM_NOCHECK_NEON is tested with defined(), exactly like ARM_NOCHECK_SIMD above, so the
+// condition stays well-formed when the macro is undefined or defined without a value.
+#  if (defined(ARM_NEON) && (defined(__ARM_NEON__) || defined(__ARM_NEON))) || defined(ARM_NOCHECK_NEON)
+#    undef native_adler32
+#    define native_adler32 adler32_neon
+#    undef native_chunkmemset_safe
+#    define native_chunkmemset_safe chunkmemset_safe_neon
+#    undef native_chunksize
+#    define native_chunksize chunksize_neon
+#    undef native_inflate_fast
+#    define native_inflate_fast inflate_fast_neon
+#    undef native_slide_hash
+#    define native_slide_hash slide_hash_neon
+// The match/compare NEON variants are only declared when HAVE_BUILTIN_CTZLL is defined.
+#    ifdef HAVE_BUILTIN_CTZLL
+#      undef native_compare256
+#      define native_compare256 compare256_neon
+#      undef native_longest_match
+#      define native_longest_match longest_match_neon
+#      undef native_longest_match_slow
+#      define native_longest_match_slow longest_match_slow_neon
+#    endif
+#  endif
+// ARM - ACLE
+#  if defined(ARM_ACLE) && defined(__ARM_ACLE) && defined(__ARM_FEATURE_CRC32)
+#    undef native_crc32
+#    define native_crc32 crc32_acle
+#  endif
+#endif
+
+#endif /* ARM_FUNCTIONS_H_ */
--- a/3rdparty/zlib-ng/arch/arm/chunkset_neon.c
+++ b/3rdparty/zlib-ng/arch/arm/chunkset_neon.c
@ -4,8 +4,8 @@

 #ifdef ARM_NEON
 #include "neon_intrins.h"
-#include "../../zbuild.h"
-#include "../generic/chunk_permute_table.h"
+#include "zbuild.h"
+#include "arch/generic/chunk_permute_table.h"

 typedef uint8x16_t chunk_t;

--- a/3rdparty/zlib-ng/arch/arm/compare256_neon.c
+++ b/3rdparty/zlib-ng/arch/arm/compare256_neon.c
@ -3,8 +3,9 @@
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

-#include "../../zbuild.h"
-
+#include "zbuild.h"
+#include "zutil_p.h"
+#include "deflate.h"
 #include "fallback_builtins.h"

 #if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
--- a/3rdparty/zlib-ng/arch/arm/crc32_acle.c
+++ b/3rdparty/zlib-ng/arch/arm/crc32_acle.c
@ -7,7 +7,7 @@

 #ifdef ARM_ACLE
 #include "acle_intrins.h"
-#include "../../zbuild.h"
+#include "zbuild.h"

 Z_INTERNAL Z_TARGET_CRC uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len) {
    Z_REGISTER uint32_t c;
--- a/3rdparty/zlib-ng/arch/arm/insert_string_acle.c
+++ b/3rdparty/zlib-ng/arch/arm/insert_string_acle.c
@ -1,24 +0,0 @@
-/* insert_string_acle.c -- insert_string integer hash variant using ACLE's CRC instructions
- *
- * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h
- *
- */
-
-#ifdef ARM_ACLE
-#include "acle_intrins.h"
-#include "../../zbuild.h"
-#include "../../deflate.h"
-
-#define HASH_CALC(s, h, val) \
-    h = __crc32w(0, val)
-
-#define HASH_CALC_VAR       h
-#define HASH_CALC_VAR_INIT  uint32_t h = 0
-
-#define UPDATE_HASH         Z_TARGET_CRC update_hash_acle
-#define INSERT_STRING       Z_TARGET_CRC insert_string_acle
-#define QUICK_INSERT_STRING Z_TARGET_CRC quick_insert_string_acle
-
-#include "../../insert_string_tpl.h"
-#endif
--- a/3rdparty/zlib-ng/arch/arm/neon_intrins.h
+++ b/3rdparty/zlib-ng/arch/arm/neon_intrins.h
@ -25,6 +25,13 @@
    out.val[3] = vqsubq_u16(a.val[3], b); \
 } while (0)

+#  if defined(__clang__) && defined(__arm__) && defined(__ANDROID__)
+/* Clang for 32-bit Android has too strict alignment requirement (:256) for x4 NEON intrinsics */
+#    undef ARM_NEON_HASLD4
+#    undef vld1q_u16_x4
+#    undef vld1q_u8_x4
+#    undef vst1q_u16_x4
+#  endif

 #  ifndef ARM_NEON_HASLD4

--- a/3rdparty/zlib-ng/arch/arm/slide_hash_armv6.c
+++ b/3rdparty/zlib-ng/arch/arm/slide_hash_armv6.c
@ -5,8 +5,8 @@

 #if defined(ARM_SIMD)
 #include "acle_intrins.h"
-#include "../../zbuild.h"
-#include "../../deflate.h"
+#include "zbuild.h"
+#include "deflate.h"

 /* SIMD version of hash_chain rebase */
 static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
--- a/3rdparty/zlib-ng/arch/arm/slide_hash_neon.c
+++ b/3rdparty/zlib-ng/arch/arm/slide_hash_neon.c
@ -10,8 +10,8 @@

 #ifdef ARM_NEON
 #include "neon_intrins.h"
-#include "../../zbuild.h"
-#include "../../deflate.h"
+#include "zbuild.h"
+#include "deflate.h"

 /* SIMD version of hash_chain rebase */
 static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
--- a/3rdparty/zlib-ng/arch/generic/Makefile.in
+++ b/3rdparty/zlib-ng/arch/generic/Makefile.in
@ -1,5 +1,6 @@
-# Makefile for zlib
+# Makefile for zlib-ng
 # Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# Copyright (C) 2024 Hans Kristian Rosbach
 # For conditions of distribution and use, see copyright notice in zlib.h

 CC=
@ -11,12 +12,62 @@ SRCDIR=.
 SRCTOP=../..
 TOPDIR=$(SRCTOP)

-all:
+all: \
+ adler32_c.o adler32_c.lo \
+ adler32_fold_c.o adler32_fold_c.lo \
+ chunkset_c.o chunkset_c.lo \
+ compare256_c.o compare256_c.lo \
+ crc32_braid_c.o crc32_braid_c.lo \
+ crc32_fold_c.o crc32_fold_c.lo \
+ slide_hash_c.o slide_hash_c.lo
+
+
+adler32_c.o: $(SRCDIR)/adler32_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_c.c
+
+adler32_c.lo: $(SRCDIR)/adler32_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_c.c
+
+adler32_fold_c.o: $(SRCDIR)/adler32_fold_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_fold_c.c
+
+adler32_fold_c.lo: $(SRCDIR)/adler32_fold_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_fold_c.c
+
+chunkset_c.o: $(SRCDIR)/chunkset_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
+
+chunkset_c.lo: $(SRCDIR)/chunkset_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
+
+compare256_c.o: $(SRCDIR)/compare256_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/zutil_p.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
+
+compare256_c.lo: $(SRCDIR)/compare256_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/zutil_p.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
+
+crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c
+
+crc32_braid_c.lo: $(SRCDIR)/crc32_braid_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c
+
+crc32_fold_c.o: $(SRCDIR)/crc32_fold_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_c.c
+
+crc32_fold_c.lo: $(SRCDIR)/crc32_fold_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_c.c
+
+slide_hash_c.o: $(SRCDIR)/slide_hash_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_c.c
+
+slide_hash_c.lo: $(SRCDIR)/slide_hash_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_c.c


 mostlyclean: clean
 clean:
-	rm -f *.o *.lo *~ \
+	rm -f *.o *.lo *~
 	rm -rf objs
 	rm -f *.gcda *.gcno *.gcov

--- a/3rdparty/zlib-ng/arch/generic/adler32_c.c
+++ b/3rdparty/zlib-ng/arch/generic/adler32_c.c
@ -0,0 +1,54 @@
+/* adler32_c.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011, 2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "functable.h"
+#include "adler32_p.h"
+
+/* =========================================================================
+ * adler32_c -- update an Adler-32 checksum with len bytes read from buf.
+ *
+ * adler: running checksum; its high and low 16-bit halves are split into
+ *        the two working sums (sum2 and adler) below.
+ * buf:   input bytes; when buf is NULL and len is not 1, the function
+ *        returns 1.
+ * len:   number of bytes to process.
+ *
+ * A length of exactly 1 is handed to adler32_len_1 and lengths below 16 to
+ * adler32_len_16. Longer inputs are consumed in NMAX-byte blocks, and the
+ * remaining bytes are finished by adler32_len_64, whose result is returned.
+ */
+Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
+    uint32_t sum2;
+    unsigned n;
+
+    /* split Adler-32 into component sums: sum2 takes the high 16 bits,
+     * adler keeps the low 16 bits */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (UNLIKELY(len == 1))
+        return adler32_len_1(adler, buf, sum2);
+
+    /* initial Adler-32 value (deferred check for len == 1 speed) */
+    if (UNLIKELY(buf == NULL))
+        return 1L;
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (UNLIKELY(len < 16))
+        return adler32_len_16(adler, buf, len, sum2);
+
+    /* do length NMAX blocks -- requires just one modulo operation
+     * (each sum is reduced modulo BASE once per block, after the unrolled steps;
+     * n counts the DO16 or DO8 steps that make up one NMAX-byte block) */
+    while (len >= NMAX) {
+        len -= NMAX;
+#ifdef UNROLL_MORE
+        n = NMAX / 16;          /* NMAX is divisible by 16 */
+#else
+        n = NMAX / 8;           /* NMAX is divisible by 8 */
+#endif
+        do {
+#ifdef UNROLL_MORE
+            DO16(adler, sum2, buf);          /* 16 sums unrolled */
+            buf += 16;
+#else
+            DO8(adler, sum2, buf, 0);         /* 8 sums unrolled */
+            buf += 8;
+#endif
+        } while (--n);
+        adler %= BASE;
+        sum2 %= BASE;
+    }
+
+    /* do remaining bytes (less than NMAX, still just one modulo) */
+    return adler32_len_64(adler, buf, len, sum2);
+}
--- a/3rdparty/zlib-ng/arch/generic/adler32_fold_c.c
+++ b/3rdparty/zlib-ng/arch/generic/adler32_fold_c.c
@ -5,12 +5,11 @@

 #include "zbuild.h"
 #include "functable.h"
-#include "adler32_fold.h"

 #include <limits.h>

 Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
-    adler = functable.adler32(adler, src, len);
+    adler = FUNCTABLE_CALL(adler32)(adler, src, len);
    memcpy(dst, src, len);
    return adler;
 }
--- a/3rdparty/zlib-ng/arch/generic/chunkset_c.c
+++ b/3rdparty/zlib-ng/arch/generic/chunkset_c.c
--- a/3rdparty/zlib-ng/arch/generic/compare256_c.c
+++ b/3rdparty/zlib-ng/arch/generic/compare256_c.c
@ -5,6 +5,7 @@

 #include "zbuild.h"
 #include "zutil_p.h"
+#include "deflate.h"
 #include "fallback_builtins.h"

 /* ALIGNED, byte comparison */
--- a/3rdparty/zlib-ng/arch/generic/crc32_braid_c.c
+++ b/3rdparty/zlib-ng/arch/generic/crc32_braid_c.c
@ -8,43 +8,9 @@
 */

 #include "zbuild.h"
-#include "zutil.h"
-#include "functable.h"
 #include "crc32_braid_p.h"
 #include "crc32_braid_tbl.h"

-/* ========================================================================= */
-
-const uint32_t * Z_EXPORT PREFIX(get_crc_table)(void) {
-    return (const uint32_t *)crc_table;
-}
-
-#ifdef ZLIB_COMPAT
-unsigned long Z_EXPORT PREFIX(crc32_z)(unsigned long crc, const unsigned char *buf, size_t len) {
-    if (buf == NULL) return 0;
-
-    return (unsigned long)functable.crc32((uint32_t)crc, buf, len);
-}
-#else
-uint32_t Z_EXPORT PREFIX(crc32_z)(uint32_t crc, const unsigned char *buf, size_t len) {
-    if (buf == NULL) return 0;
-
-    return functable.crc32(crc, buf, len);
-}
-#endif
-
-#ifdef ZLIB_COMPAT
-unsigned long Z_EXPORT PREFIX(crc32)(unsigned long crc, const unsigned char *buf, unsigned int len) {
-    return (unsigned long)PREFIX(crc32_z)((uint32_t)crc, buf, len);
-}
-#else
-uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t len) {
-    return PREFIX(crc32_z)(crc, buf, len);
-}
-#endif
-
-/* ========================================================================= */
-
 /*
  A CRC of a message is computed on N braids of words in the message, where
  each word consists of W bytes (4 or 8). If N is 3, for example, then three
@ -66,24 +32,6 @@ uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t
  level. Your mileage may vary.
 */

-/* ========================================================================= */
-
-#if BYTE_ORDER == LITTLE_ENDIAN
-#  define ZSWAPWORD(word) (word)
-#  define BRAID_TABLE crc_braid_table
-#elif BYTE_ORDER == BIG_ENDIAN
-#  if W == 8
-#    define ZSWAPWORD(word) ZSWAP64(word)
-#  elif W == 4
-#    define ZSWAPWORD(word) ZSWAP32(word)
-#  endif
-#  define BRAID_TABLE crc_braid_big_table
-#else
-#  error "No endian defined"
-#endif
-#define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8)
-#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
-
 /* ========================================================================= */
 #ifdef W
 /*
@ -112,7 +60,7 @@ static z_word_t crc_word(z_word_t data) {

 /* ========================================================================= */
 Z_INTERNAL uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len) {
-    Z_REGISTER uint32_t c;
+    uint32_t c;

    /* Pre-condition the CRC */
    c = (~crc) & 0xffffffff;
--- a/3rdparty/zlib-ng/arch/generic/crc32_fold_c.c
+++ b/3rdparty/zlib-ng/arch/generic/crc32_fold_c.c
@ -3,11 +3,9 @@
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
 #include "zbuild.h"
+#include "zutil.h"
 #include "functable.h"
-
-#include "crc32_fold.h"
-
-#include <limits.h>
+#include "crc32.h"

 Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) {
    crc->value = CRC32_INITIAL_VALUE;
@ -15,7 +13,7 @@ Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) {
 }

 Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
-    crc->value = functable.crc32(crc->value, src, len);
+    crc->value = FUNCTABLE_CALL(crc32)(crc->value, src, len);
    memcpy(dst, src, len);
 }

@ -25,7 +23,7 @@ Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, ui
     * same arguments for the versions that _do_ do a folding CRC but we don't want a copy. The
     * init_crc is an unused argument in this context */
    Z_UNUSED(init_crc);
-    crc->value = functable.crc32(crc->value, src, len);
+    crc->value = FUNCTABLE_CALL(crc32)(crc->value, src, len);
 }

 Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc) {
--- a/3rdparty/zlib-ng/arch/generic/generic_functions.h
+++ b/3rdparty/zlib-ng/arch/generic/generic_functions.h
@ -0,0 +1,106 @@
+/* generic_functions.h -- generic C implementations for arch-specific functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef GENERIC_FUNCTIONS_H_
+#define GENERIC_FUNCTIONS_H_
+
+#include "zendian.h"
+
+Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc);
+Z_INTERNAL void     crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
+Z_INTERNAL void     crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
+Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc);
+
+Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+
+
+typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len);
+typedef uint32_t (*compare256_func)(const uint8_t *src0, const uint8_t *src1);
+typedef uint32_t (*crc32_func)(uint32_t crc32, const uint8_t *buf, size_t len);
+
+uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);
+
+uint32_t chunksize_c(void);
+uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+void     inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
+
+uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
+
+uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
+#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
+uint32_t compare256_unaligned_16(const uint8_t *src0, const uint8_t *src1);
+#  ifdef HAVE_BUILTIN_CTZ
+    uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1);
+#  endif
+#  if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
+    uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1);
+#  endif
+#endif
+
+typedef void (*slide_hash_func)(deflate_state *s);
+
+void     slide_hash_c(deflate_state *s);
+
+uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
+#  if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
+    uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
+#    ifdef HAVE_BUILTIN_CTZ
+        uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
+#    endif
+#    if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
+        uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
+#    endif
+#  endif
+
+uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
+#  if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN /* unaligned variants: declared only for little-endian builds with UNALIGNED_OK */
+    uint32_t longest_match_slow_unaligned_16(deflate_state *const s, Pos cur_match);
+    uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_match); /* NOTE(review): not guarded by HAVE_BUILTIN_CTZ, unlike longest_match_unaligned_32 -- confirm this is intended */
+#    ifdef UNALIGNED64_OK /* NOTE(review): longest_match_unaligned_64 also requires HAVE_BUILTIN_CTZLL -- confirm this is intended */
+        uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match);
+#    endif
+#  endif
+
+
+// Select generic implementation for longest_match, longest_match_slow and compare256 functions: the widest unaligned variant the feature macros allow, otherwise the plain C one.
+#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
+#  if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL) // 64-bit unaligned variants
+#    define longest_match_generic longest_match_unaligned_64
+#    define longest_match_slow_generic longest_match_slow_unaligned_64
+#    define compare256_generic compare256_unaligned_64
+#  elif defined(HAVE_BUILTIN_CTZ) // 32-bit unaligned variants
+#    define longest_match_generic longest_match_unaligned_32
+#    define longest_match_slow_generic longest_match_slow_unaligned_32
+#    define compare256_generic compare256_unaligned_32
+#  else // 16-bit unaligned variants
+#    define longest_match_generic longest_match_unaligned_16
+#    define longest_match_slow_generic longest_match_slow_unaligned_16
+#    define compare256_generic compare256_unaligned_16
+#  endif
+#else // UNALIGNED_OK not defined, or not little-endian: plain C variants
+#  define longest_match_generic longest_match_c
+#  define longest_match_slow_generic longest_match_slow_c
+#  define compare256_generic compare256_c
+#endif
+
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// Generic code: with runtime CPU detection disabled, bind every native_* name to its generic C implementation.
+#  define native_adler32 adler32_c
+#  define native_adler32_fold_copy adler32_fold_copy_c
+#  define native_chunkmemset_safe chunkmemset_safe_c
+#  define native_chunksize chunksize_c
+#  define native_crc32 PREFIX(crc32_braid)
+#  define native_crc32_fold crc32_fold_c
+#  define native_crc32_fold_copy crc32_fold_copy_c
+#  define native_crc32_fold_final crc32_fold_final_c
+#  define native_crc32_fold_reset crc32_fold_reset_c
+#  define native_inflate_fast inflate_fast_c
+#  define native_slide_hash slide_hash_c
+#  define native_longest_match longest_match_generic // *_generic names are the variants selected by the feature-macro block above
+#  define native_longest_match_slow longest_match_slow_generic
+#  define native_compare256 compare256_generic
+#endif // DISABLE_RUNTIME_CPU_DETECTION
+
+#endif
--- a/3rdparty/zlib-ng/arch/generic/slide_hash_c.c
+++ b/3rdparty/zlib-ng/arch/generic/slide_hash_c.c
@ -1,6 +1,6 @@
 /* slide_hash.c -- slide hash table C implementation
 *
- * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
+ * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

--- a/3rdparty/zlib-ng/arch/power/chunkset_power8.c
+++ b/3rdparty/zlib-ng/arch/power/chunkset_power8.c
@ -4,7 +4,7 @@

 #ifdef POWER8_VSX
 #include <altivec.h>
-#include "../../zbuild.h"
+#include "zbuild.h"

 typedef vector unsigned char chunk_t;

--- a/3rdparty/zlib-ng/arch/power/compare256_power9.c
+++ b/3rdparty/zlib-ng/arch/power/compare256_power9.c
@ -5,8 +5,10 @@

 #ifdef POWER9
 #include <altivec.h>
-#include "../../zbuild.h"
-#include "../../zendian.h"
+#include "zbuild.h"
+#include "zutil_p.h"
+#include "deflate.h"
+#include "zendian.h"

 /* Older versions of GCC misimplemented semantics for these bit counting builtins.
 * https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac */
--- a/3rdparty/zlib-ng/arch/power/power_features.c
+++ b/3rdparty/zlib-ng/arch/power/power_features.c
@ -1,16 +1,19 @@
 /* power_features.c - POWER feature check
 * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
- * Copyright (C) 2021-2022 Mika T. Lindqvist <postmaster@raasu.org>
+ * Copyright (C) 2021-2024 Mika T. Lindqvist <postmaster@raasu.org>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

 #ifdef HAVE_SYS_AUXV_H
 #  include <sys/auxv.h>
 #endif
+#ifdef POWER_NEED_AUXVEC_H
+#  include <linux/auxvec.h>
+#endif
 #ifdef __FreeBSD__
 #  include <machine/cpu.h>
 #endif
-#include "../../zbuild.h"
+#include "zbuild.h"
 #include "power_features.h"

 void Z_INTERNAL power_check_features(struct power_cpu_features *features) {
--- a/3rdparty/zlib-ng/arch/power/power_features.h
+++ b/3rdparty/zlib-ng/arch/power/power_features.h
@ -4,8 +4,8 @@
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

-#ifndef POWER_H_
-#define POWER_H_
+#ifndef POWER_FEATURES_H_
+#define POWER_FEATURES_H_

 struct power_cpu_features {
    int has_altivec;
@ -15,4 +15,4 @@ struct power_cpu_features {

 void Z_INTERNAL power_check_features(struct power_cpu_features *features);

-#endif /* POWER_H_ */
+#endif /* POWER_FEATURES_H_ */
--- a/3rdparty/zlib-ng/arch/power/power_functions.h
+++ b/3rdparty/zlib-ng/arch/power/power_functions.h
@ -0,0 +1,67 @@
+/* power_functions.h -- POWER implementations for arch-specific functions.
+ * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef POWER_FUNCTIONS_H_
+#define POWER_FUNCTIONS_H_
+
+#ifdef PPC_VMX
+uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
+void slide_hash_vmx(deflate_state *s);
+#endif /* PPC_VMX */
+
+#ifdef POWER8_VSX
+uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t chunksize_power8(void);
+uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len);
+void slide_hash_power8(deflate_state *s);
+void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start);
+#endif /* POWER8_VSX */
+
+#ifdef POWER9
+uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1);
+uint32_t longest_match_power9(deflate_state *const s, Pos cur_match);
+uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match);
+#endif /* POWER9 */
+
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// Power - VMX
+#  if defined(PPC_VMX) && defined(__ALTIVEC__)
+#    undef native_adler32
+#    define native_adler32 adler32_vmx
+#    undef native_slide_hash
+#    define native_slide_hash slide_hash_vmx
+#  endif
+// Power8 - VSX (replaces the VMX adler32/slide_hash bindings above when both blocks apply)
+#  if defined(POWER8_VSX) && defined(_ARCH_PWR8) && defined(__VSX__)
+#    undef native_adler32
+#    define native_adler32 adler32_power8
+#    undef native_chunkmemset_safe
+#    define native_chunkmemset_safe chunkmemset_safe_power8
+#    undef native_chunksize
+#    define native_chunksize chunksize_power8
+#    undef native_inflate_fast
+#    define native_inflate_fast inflate_fast_power8
+#    undef native_slide_hash
+#    define native_slide_hash slide_hash_power8
+#  endif
+#  if defined(POWER8_VSX_CRC32) && defined(_ARCH_PWR8) && defined(__VSX__)
+#    undef native_crc32
+#    define native_crc32 crc32_power8
+#  endif
+// Power9
+#  if defined(POWER9) && defined(_ARCH_PWR9)
+#    undef native_compare256
+#    define native_compare256 compare256_power9
+#    undef native_longest_match
+#    define native_longest_match longest_match_power9
+#    undef native_longest_match_slow
+#    define native_longest_match_slow longest_match_slow_power9
+#  endif
+#endif /* DISABLE_RUNTIME_CPU_DETECTION */
+
+#endif /* POWER_FUNCTIONS_H_ */
--- a/3rdparty/zlib-ng/arch/riscv/adler32_rvv.c
+++ b/3rdparty/zlib-ng/arch/riscv/adler32_rvv.c
@ -9,8 +9,8 @@
 #include <riscv_vector.h>
 #include <stdint.h>

-#include "../../zbuild.h"
-#include "../../adler32_p.h"
+#include "zbuild.h"
+#include "adler32_p.h"

 static inline uint32_t adler32_rvv_impl(uint32_t adler, uint8_t* restrict dst, const uint8_t *src, size_t len, int COPY) {
    /* split Adler-32 into component sums */
--- a/3rdparty/zlib-ng/arch/riscv/compare256_rvv.c
+++ b/3rdparty/zlib-ng/arch/riscv/compare256_rvv.c
@ -6,7 +6,9 @@

 #ifdef RISCV_RVV

-#include "../../zbuild.h"
+#include "zbuild.h"
+#include "zutil_p.h"
+#include "deflate.h"
 #include "fallback_builtins.h"

 #include <riscv_vector.h>
--- a/3rdparty/zlib-ng/arch/riscv/riscv_features.c
+++ b/3rdparty/zlib-ng/arch/riscv/riscv_features.c
@ -1,10 +1,13 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <sys/auxv.h>
 #include <sys/utsname.h>

-#include "../../zbuild.h"
+#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+#  include <sys/auxv.h>
+#endif
+
+#include "zbuild.h"
 #include "riscv_features.h"

 #define ISA_V_HWCAP (1 << ('v' - 'a'))
@ -33,7 +36,11 @@ void Z_INTERNAL riscv_check_features_compile_time(struct riscv_cpu_features *fea
 }

 void Z_INTERNAL riscv_check_features_runtime(struct riscv_cpu_features *features) {
+#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
     unsigned long hw_cap = getauxval(AT_HWCAP);
+#else
+    unsigned long hw_cap = 0;  /* no getauxval() here: has_rvv below ends up 0 */
+#endif
     features->has_rvv = hw_cap & ISA_V_HWCAP;
 }

--- a/3rdparty/zlib-ng/arch/riscv/riscv_features.h
+++ b/3rdparty/zlib-ng/arch/riscv/riscv_features.h
@ -6,8 +6,8 @@
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

-#ifndef RISCV_H_
-#define RISCV_H_
+#ifndef RISCV_FEATURES_H_
+#define RISCV_FEATURES_H_

 struct riscv_cpu_features {
    int has_rvv;
@ -15,4 +15,4 @@ struct riscv_cpu_features {

 void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features);

-#endif /* RISCV_H_ */
+#endif /* RISCV_FEATURES_H_ */
--- a/3rdparty/zlib-ng/arch/riscv/riscv_functions.h
+++ b/3rdparty/zlib-ng/arch/riscv/riscv_functions.h
@ -0,0 +1,49 @@
+/* riscv_functions.h -- RISCV implementations for arch-specific functions.
+ *
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef RISCV_FUNCTIONS_H_
+#define RISCV_FUNCTIONS_H_
+
+#ifdef RISCV_RVV
+uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint32_t chunksize_rvv(void);
+uint8_t* chunkmemset_safe_rvv(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1);
+
+uint32_t longest_match_rvv(deflate_state *const s, Pos cur_match);
+uint32_t longest_match_slow_rvv(deflate_state *const s, Pos cur_match);
+void slide_hash_rvv(deflate_state *s);
+void inflate_fast_rvv(PREFIX3(stream) *strm, uint32_t start);
+#endif /* RISCV_RVV */
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// RISCV - RVV: rebind every native_* entry that has an RVV implementation
+#  if defined(RISCV_RVV) && defined(__riscv_v) && defined(__linux__)
+#    undef native_adler32
+#    define native_adler32 adler32_rvv
+#    undef native_adler32_fold_copy
+#    define native_adler32_fold_copy adler32_fold_copy_rvv
+#    undef native_chunkmemset_safe
+#    define native_chunkmemset_safe chunkmemset_safe_rvv
+#    undef native_chunksize
+#    define native_chunksize chunksize_rvv
+#    undef native_compare256
+#    define native_compare256 compare256_rvv
+#    undef native_inflate_fast
+#    define native_inflate_fast inflate_fast_rvv
+#    undef native_longest_match
+#    define native_longest_match longest_match_rvv
+#    undef native_longest_match_slow
+#    define native_longest_match_slow longest_match_slow_rvv
+#    undef native_slide_hash
+#    define native_slide_hash slide_hash_rvv
+#  endif
+#endif /* DISABLE_RUNTIME_CPU_DETECTION */
+
+#endif /* RISCV_FUNCTIONS_H_ */
--- a/3rdparty/zlib-ng/arch/riscv/slide_hash_rvv.c
+++ b/3rdparty/zlib-ng/arch/riscv/slide_hash_rvv.c
@ -8,18 +8,16 @@

 #include <riscv_vector.h>

-#include "../../zbuild.h"
-#include "../../deflate.h"
+#include "zbuild.h"
+#include "deflate.h"

 static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
    size_t vl;
    while (entries > 0) {
        vl = __riscv_vsetvl_e16m4(entries);
        vuint16m4_t v_tab = __riscv_vle16_v_u16m4(table, vl);
-        vuint16m4_t v_diff = __riscv_vsub_vx_u16m4(v_tab, wsize, vl);
-        vbool4_t mask = __riscv_vmsltu_vx_u16m4_b4(v_tab, wsize, vl);
-        v_tab = __riscv_vmerge_vxm_u16m4(v_diff, 0, mask, vl);
-        __riscv_vse16_v_u16m4(table, v_tab, vl);
+        vuint16m4_t v_diff = __riscv_vssubu_vx_u16m4(v_tab, wsize, vl);
+        __riscv_vse16_v_u16m4(table, v_diff, vl);
        table += vl, entries -= vl;
    }
 }
--- a/3rdparty/zlib-ng/arch/s390/Makefile.in
+++ b/3rdparty/zlib-ng/arch/s390/Makefile.in
@ -0,0 +1,48 @@
+# Makefile for zlib-ng: s390 (SystemZ) arch-specific objects
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+VGFMAFLAG=
+NOLTOFLAG=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+s390_features.o:
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c
+
+s390_features.lo:
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c
+
+dfltcc_deflate.o:
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c
+
+dfltcc_deflate.lo:
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c
+
+dfltcc_inflate.o:
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c
+
+dfltcc_inflate.lo:
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c
+
+crc32-vx.o:
+	$(CC) $(CFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c
+
+crc32-vx.lo:
+	$(CC) $(SFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c
+
+mostlyclean: clean
+clean:
+	rm -f *.o *.lo *~
+	rm -rf objs
+	rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+	rm -f Makefile
--- a/3rdparty/zlib-ng/arch/s390/README.md
+++ b/3rdparty/zlib-ng/arch/s390/README.md
@ -0,0 +1,277 @@
+# Introduction
+
+This directory contains SystemZ deflate hardware acceleration support.
+It can be enabled using the following build commands:
+
+    $ ./configure --with-dfltcc-deflate --with-dfltcc-inflate
+    $ make
+
+or
+
+    $ cmake -DWITH_DFLTCC_DEFLATE=1 -DWITH_DFLTCC_INFLATE=1 .
+    $ make
+
+When built like this, zlib-ng will compress using hardware on level 1,
+and using software on all other levels. Decompression will always happen
+in hardware. In order to enable hardware compression for levels 1-6
+(i.e. to make it used by default) one could add
+`-DDFLTCC_LEVEL_MASK=0x7e` to CFLAGS when building zlib-ng.
+
+SystemZ deflate hardware acceleration is available on [IBM z15](
+https://www.ibm.com/products/z15) and newer machines under the name [
+"Integrated Accelerator for zEnterprise Data Compression"](
+https://www.ibm.com/support/z-content-solutions/compression/). The
+programming interface to it is a machine instruction called DEFLATE
+CONVERSION CALL (DFLTCC). It is documented in Chapter 26 of [Principles
+of Operation](https://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf). Both
+the code and the rest of this document refer to this feature simply as
+"DFLTCC".
+
+# Performance
+
+Performance figures are published [here](
+https://github.com/iii-i/zlib-ng/wiki/Performance-with-dfltcc-patch-applied-and-dfltcc-support-built-on-dfltcc-enabled-machine
+). The compression speed-up can be as high as 110x and the decompression
+speed-up can be as high as 15x.
+
+# Limitations
+
+Two DFLTCC compression calls with identical inputs are not guaranteed to
+produce identical outputs. Therefore care should be taken when using
+hardware compression when reproducible results are desired. In
+particular, the zlib-ng-specific `zng_deflateSetParams` call allows setting
+the `Z_DEFLATE_REPRODUCIBLE` parameter, which disables DFLTCC support for a
+particular stream.
+
+DFLTCC does not support every single zlib-ng feature, in particular:
+
+* `inflate(Z_BLOCK)` and `inflate(Z_TREES)`
+* `inflateMark()`
+* `inflatePrime()`
+* `inflateSyncPoint()`
+
+When used, these functions will either switch to software, or, in case
+this is not possible, gracefully fail.
+
+# Code structure
+
+All SystemZ-specific code lives in `arch/s390` directory and is
+integrated with the rest of zlib-ng using hook macros.
+
+## Hook macros
+
+DFLTCC takes as arguments a parameter block, an input buffer, an output
+buffer, and a window. Parameter blocks are stored alongside zlib states;
+buffers are forwarded from the caller; and the window - which must be
+4k-aligned and is always 64k large - is managed using the `PAD_WINDOW()`,
+`WINDOW_PAD_SIZE`, `HINT_ALIGNED_WINDOW`, `DEFLATE_ADJUST_WINDOW_SIZE()`
+and `INFLATE_ADJUST_WINDOW_SIZE()` hooks.
+
+Software and hardware window formats do not match, therefore,
+`deflateSetDictionary()`, `deflateGetDictionary()`, `inflateSetDictionary()`
+and `inflateGetDictionary()` need special handling, which is triggered using
+`DEFLATE_SET_DICTIONARY_HOOK()`, `DEFLATE_GET_DICTIONARY_HOOK()`,
+`INFLATE_SET_DICTIONARY_HOOK()` and `INFLATE_GET_DICTIONARY_HOOK()` macros.
+
+`deflateResetKeep()` and `inflateResetKeep()` update the DFLTCC
+parameter block using `DEFLATE_RESET_KEEP_HOOK()` and
+`INFLATE_RESET_KEEP_HOOK()` macros.
+
+`INFLATE_PRIME_HOOK()`, `INFLATE_MARK_HOOK()` and
+`INFLATE_SYNC_POINT_HOOK()` macros make the respective unsupported
+calls gracefully fail.
+
+`DEFLATE_PARAMS_HOOK()` implements switching between hardware and
+software compression mid-stream using `deflateParams()`. Switching
+normally entails flushing the current block, which might not be possible
+in low memory situations. `deflateParams()` uses `DEFLATE_DONE()` hook
+in order to detect and gracefully handle such situations.
+
+The algorithm implemented in hardware has a different compression ratio
+than the one implemented in software. `DEFLATE_BOUND_ADJUST_COMPLEN()`
+and `DEFLATE_NEED_CONSERVATIVE_BOUND()` macros make `deflateBound()`
+return the correct results for the hardware implementation.
+
+Actual compression and decompression are handled by `DEFLATE_HOOK()` and
+`INFLATE_TYPEDO_HOOK()` macros. Since inflation with DFLTCC manages the
+window on its own, calling `updatewindow()` is suppressed using
+`INFLATE_NEED_UPDATEWINDOW()` macro.
+
+In addition to compression, DFLTCC computes CRC-32 and Adler-32
+checksums, therefore, whenever it's used, software checksumming is
+suppressed using `DEFLATE_NEED_CHECKSUM()` and `INFLATE_NEED_CHECKSUM()`
+macros.
+
+While software always produces reproducible compression results, this
+is not the case for DFLTCC. Therefore, zlib-ng users are given the
+ability to specify whether or not reproducible compression results
+are required. While it is always possible to specify this setting
+before the compression begins, it is not always possible to do so in
+the middle of a deflate stream - the exact conditions for that are
+determined by `DEFLATE_CAN_SET_REPRODUCIBLE()` macro.
+
+## SystemZ-specific code
+
+When zlib-ng is built with DFLTCC, the hooks described above are
+converted to calls to functions, which are implemented in
+`arch/s390/dfltcc_*` files. The functions can be grouped in three broad
+categories:
+
+* Base DFLTCC support, e.g. wrapping the machine instruction - `dfltcc()`.
+* Translating between software and hardware data formats, e.g.
+  `dfltcc_deflate_set_dictionary()`.
+* Translating between software and hardware state machines, e.g.
+  `dfltcc_deflate()` and `dfltcc_inflate()`.
+
+The functions from the first two categories are fairly simple, however,
+various quirks in both software and hardware state machines make the
+functions from the third category quite complicated.
+
+### `dfltcc_deflate()` function
+
+This function is called by `deflate()` and has the following
+responsibilities:
+
+* Checking whether DFLTCC can be used with the current stream. If this
+  is not the case, then it returns `0`, making `deflate()` use some
+  other function in order to compress in software. Otherwise it returns
+  `1`.
+* Block management and Huffman table generation. DFLTCC ends blocks only
+  when explicitly instructed to do so by the software. Furthermore,
+  whether to use fixed or dynamic Huffman tables must also be determined
+  by the software. Since looking at data in order to gather statistics
+  would negate performance benefits, the following approach is used: the
+  first `DFLTCC_FIRST_FHT_BLOCK_SIZE` bytes are placed into a fixed
+  block, and every next `DFLTCC_BLOCK_SIZE` bytes are placed into
+  dynamic blocks.
+* Writing EOBS. Block Closing Control bit in the parameter block
+  instructs DFLTCC to write EOBS, however, certain conditions need to be
+  met: input data length must be non-zero or Continuation Flag must be
+  set. To put this in simpler terms, DFLTCC will silently refuse to
+  write EOBS if this is the only thing that it is asked to do. Since the
+  code has to be able to emit EOBS in software anyway, in order to avoid
+  tricky corner cases Block Closing Control is never used. Whether to
+  write EOBS is instead controlled by `soft_bcc` variable.
+* Triggering block post-processing. Depending on flush mode, `deflate()`
+  must perform various additional actions when a block or a stream ends.
+  `dfltcc_deflate()` informs `deflate()` about this using
+  `block_state *result` parameter.
+* Converting software state fields into hardware parameter block fields,
+  and vice versa. For example, `wrap` and Check Value Type or `bi_valid`
+  and Sub-Byte Boundary. Certain fields cannot be translated and must
+  persist untouched in the parameter block between calls, for example,
+  Continuation Flag or Continuation State Buffer.
+* Handling flush modes and low-memory situations. These aspects are
+  quite intertwined and pervasive. The general idea here is that the
+  code must not do anything in software - whether explicitly by e.g.
+  calling `send_eobs()`, or implicitly - by returning to `deflate()`
+  with certain return and `*result` values, when Continuation Flag is
+  set.
+* Ending streams. When a new block is started and flush mode is
+  `Z_FINISH`, Block Header Final parameter block bit is used to mark
+  this block as final. However, sometimes an empty final block is
+  needed, and, unfortunately, just like with EOBS, DFLTCC will silently
+  refuse to do this. The general idea of DFLTCC implementation is to
+  rely as much as possible on the existing code. Here in order to do
+  this, the code pretends that it does not support DFLTCC, which makes
+  `deflate()` call a software compression function, which writes an
+  empty final block. Whether this is required is controlled by
+  `need_empty_block` variable.
+* Error handling. This is simply converting
+  Operation-Ending-Supplemental Code to string. Errors can only happen
+  due to things like memory corruption, and therefore they don't affect
+  the `deflate()` return code.
+
+### `dfltcc_inflate()` function
+
+This function is called by `inflate()` from the `TYPEDO` state (that is,
+when all the metadata is parsed and the stream is positioned at the type
+bits of deflate block header) and it's responsible for the following:
+
+* Falling back to software when flush mode is `Z_BLOCK` or `Z_TREES`.
+  Unfortunately, there is no way to ask DFLTCC to stop decompressing on
+  block or tree boundary.
+* `inflate()` decompression loop management. This is controlled using
+  the return value, which can be either `DFLTCC_INFLATE_BREAK` or
+  `DFLTCC_INFLATE_CONTINUE`.
+* Converting software state fields into hardware parameter block fields,
+  and vice versa. For example, `whave` and History Length or `wnext` and
+  History Offset.
+* Ending streams. This instructs `inflate()` to return `Z_STREAM_END`
+  and is controlled by `last` state field.
+* Error handling. Like deflate, error handling comprises
+  Operation-Ending-Supplemental Code to string conversion. Unlike
+  deflate, errors may happen due to bad inputs, therefore they are
+  propagated to `inflate()` by setting `mode` field to `MEM` or `BAD`.
+
+# Testing
+
+Given the complexity of the DFLTCC machine instruction, it is not clear
+whether QEMU TCG will ever support it. At the time of writing, one has to
+have access to an IBM z15+ VM or LPAR in order to test DFLTCC support.
+Since DFLTCC is a non-privileged instruction, neither special VM/LPAR
+configuration nor root access is required.
+
+zlib-ng CI uses an IBM-provided z15 self-hosted builder for the DFLTCC
+testing. There is no official IBM Z GitHub Actions runner, so we build
+one inspired by `anup-kodlekere/gaplib`.
+Future updates to actions-runner might need an updated patch. The .NET
+version number patch has been split out into a separate file to avoid
+the need to constantly change the patch.
+
+## Configuring the builder.
+
+### Install prerequisites.
+
+```
+sudo dnf install podman
+```
+
+### Add actions-runner service.
+
+```
+sudo cp self-hosted-builder/actions-runner.service /etc/systemd/system/
+sudo systemctl daemon-reload
+```
+
+### Create a config file, needs github personal access token.
+
+```
+# Create file /etc/actions-runner
+repo=<owner>/<name>
+access_token=<ghp_***>
+```
+
+Access token should have the repo scope, consult
+https://docs.github.com/en/rest/reference/actions#create-a-registration-token-for-a-repository
+for details.
+
+### Autostart actions-runner.
+
+```
+$ sudo systemctl enable --now actions-runner
+```
+
+## Rebuilding the container
+
+In order to update the `gaplib-actions-runner` podman container, e.g. to get the
+latest OS security fixes, follow these steps:
+```
+# Stop actions-runner service
+sudo systemctl stop actions-runner
+
+# Delete old container
+sudo podman container rm gaplib-actions-runner
+
+# Delete old image
+sudo podman image rm localhost/zlib-ng/actions-runner
+
+# Build image
+sudo podman build --squash -f Dockerfile.zlib-ng --tag zlib-ng/actions-runner --build-arg .
+
+# Build container
+sudo podman create --name=gaplib-actions-runner --env-file=/etc/actions-runner --init --interactive --volume=actions-runner-temp:/home/actions-runner zlib-ng/actions-runner
+
+# Start actions-runner service
+sudo systemctl start actions-runner
+```
--- a/3rdparty/zlib-ng/arch/s390/crc32-vx.c
+++ b/3rdparty/zlib-ng/arch/s390/crc32-vx.c
@ -0,0 +1,222 @@
+/*
+ * Hardware-accelerated CRC-32 variants for Linux on z Systems
+ *
+ * Use the z/Architecture Vector Extension Facility to accelerate the
+ * computing of bitreflected CRC-32 checksums.
+ *
+ * This CRC-32 implementation algorithm is bitreflected and processes
+ * the least-significant bit first (Little-Endian).
+ *
+ * This code was originally written by Hendrik Brueckner
+ * <brueckner@linux.vnet.ibm.com> for use in the Linux kernel and has been
+ * relicensed under the zlib license.
+ */
+
+#include "zbuild.h"
+#include "arch_functions.h"
+
+#include <vecintrin.h>
+
+typedef unsigned char uv16qi __attribute__((vector_size(16)));
+typedef unsigned int uv4si __attribute__((vector_size(16)));
+typedef unsigned long long uv2di __attribute__((vector_size(16)));
+
+static uint32_t crc32_le_vgfm_16(uint32_t crc, const uint8_t *buf, size_t len) {
+    /*
+     * The CRC-32 constant block contains reduction constants to fold and
+     * process particular chunks of the input data stream in parallel.
+     *
+     * For the CRC-32 variants, the constants are precomputed according to
+     * these definitions:
+     *
+     *      R1 = [(x^(4*128+32) mod P'(x) << 32)]' << 1
+     *      R2 = [(x^(4*128-32) mod P'(x) << 32)]' << 1
+     *      R3 = [(x^(128+32) mod P'(x) << 32)]'   << 1
+     *      R4 = [(x^(128-32) mod P'(x) << 32)]'   << 1
+     *      R5 = [(x^64 mod P'(x) << 32)]'         << 1
+     *      R6 = [(x^32 mod P'(x) << 32)]'         << 1
+     *
+     *      The bitreflected Barrett reduction constant, u', is defined as
+     *      the bit reversal of floor(x^64 / P(x)).
+     *
+     *      where P(x) is the polynomial in the normal domain and the P'(x) is the
+     *      polynomial in the reversed (bitreflected) domain.
+     *
+     * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
+     *
+     *      P(x)  = 0x04C11DB7
+     *      P'(x) = 0xEDB88320
+     */
+    const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};  /* byte-reversal mask (LE<->BE) */
+    const uv2di r2r1 = {0x1C6E41596, 0x154442BD4};                                     /* R2, R1 */
+    const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0};                                     /* R4, R3 */
+    const uv2di r5 = {0, 0x163CD6124};                                                 /* R5 */
+    const uv2di ru_poly = {0, 0x1F7011641};                                            /* u' */
+    const uv2di crc_poly = {0, 0x1DB710641};                                           /* P'(x) << 1 */
+
+    /*
+     * Load the initial CRC value.
+     *
+     * The CRC value is loaded into the rightmost word of the
+     * vector register and is later XORed with the LSB portion
+     * of the loaded input data.
+     */
+    uv2di v0 = {0, 0};
+    v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3);
+
+    /* Load the first 64-byte data chunk and XOR with CRC; len must be >= 64 on entry */
+    uv2di v1 = vec_perm(((uv2di *)buf)[0], ((uv2di *)buf)[0], perm_le2be);
+    uv2di v2 = vec_perm(((uv2di *)buf)[1], ((uv2di *)buf)[1], perm_le2be);
+    uv2di v3 = vec_perm(((uv2di *)buf)[2], ((uv2di *)buf)[2], perm_le2be);
+    uv2di v4 = vec_perm(((uv2di *)buf)[3], ((uv2di *)buf)[3], perm_le2be);
+
+    v1 ^= v0;
+    buf += 64;
+    len -= 64;
+
+    while (len >= 64) {
+        /* Load the next 64-byte data chunk */
+        uv16qi part1 = vec_perm(((uv16qi *)buf)[0], ((uv16qi *)buf)[0], perm_le2be);
+        uv16qi part2 = vec_perm(((uv16qi *)buf)[1], ((uv16qi *)buf)[1], perm_le2be);
+        uv16qi part3 = vec_perm(((uv16qi *)buf)[2], ((uv16qi *)buf)[2], perm_le2be);
+        uv16qi part4 = vec_perm(((uv16qi *)buf)[3], ((uv16qi *)buf)[3], perm_le2be);
+
+        /*
+         * Perform a GF(2) multiplication of the doublewords in V1 with
+         * the R1 and R2 reduction constants in r2r1.  The intermediate result
+         * is then folded (accumulated) with the next data chunk in PART1 and
+         * stored in V1. Repeat this step for the register contents
+         * in V2, V3, and V4 respectively.
+         */
+        v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1);
+        v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2);
+        v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3);
+        v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4);
+
+        buf += 64;
+        len -= 64;
+    }
+
+    /*
+     * Fold V1 to V4 into a single 128-bit value in V1.  Multiply V1 with R3
+     * and R4 and accumulating the next 128-bit chunk until a single 128-bit
+     * value remains.
+     */
+    v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
+    v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3);
+    v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4);
+
+    while (len >= 16) {
+        /* Load next data chunk */
+        v2 = vec_perm(*(uv2di *)buf, *(uv2di *)buf, perm_le2be);
+
+        /* Fold next data chunk */
+        v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
+
+        buf += 16;
+        len -= 16;
+    }
+
+    /*
+     * Set up a vector register for byte shifts.  The shift value must
+     * be loaded in bits 1-4 in byte element 7 of a vector register.
+     * Shift by 8 bytes: 0x40
+     * Shift by 4 bytes: 0x20
+     */
+    uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    v9 = vec_insert((unsigned char)0x40, v9, 7);
+
+    /*
+     * Prepare V0 for the next GF(2) multiplication: shift r4r3 by 8 bytes
+     * to move R4 into the rightmost doubleword and set the leftmost
+     * doubleword to 0x1.
+     */
+    v0 = vec_srb(r4r3, (uv2di)v9);
+    v0[0] = 1;
+
+    /*
+     * Compute GF(2) product of V1 and V0.  The rightmost doubleword
+     * of V1 is multiplied with R4.  The leftmost doubleword of V1 is
+     * multiplied by 0x1 and is then XORed with rightmost product.
+     * Implicitly, the intermediate leftmost product becomes padded
+     */
+    v1 = (uv2di)vec_gfmsum_128(v0, v1);
+
+    /*
+     * Now do the final 32-bit fold by multiplying the rightmost word
+     * in V1 with R5 and XOR the result with the remaining bits in V1.
+     *
+     * To achieve this by a single VGFMAG, right shift V1 by a word
+     * and store the result in V2 which is then accumulated.  Use the
+     * vector unpack instruction to load the rightmost half of the
+     * doubleword into the rightmost doubleword element of V1; the other
+     * half is loaded in the leftmost doubleword.
+     * The r5 vector contains the R5 constant in the
+     * rightmost doubleword and the leftmost doubleword is zero to ignore
+     * the leftmost product of V1.
+     */
+    v9 = vec_insert((unsigned char)0x20, v9, 7);
+    v2 = vec_srb(v1, (uv2di)v9);
+    v1 = vec_unpackl((uv4si)v1);  /* Split rightmost doubleword */
+    v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2);
+
+    /*
+     * Apply a Barrett reduction to compute the final 32-bit CRC value.
+     *
+     * The input values to the Barrett reduction are the degree-63 polynomial
+     * in V1 (R(x)), degree-32 generator polynomial, and the reduction
+     * constant u.  The Barrett reduction result is the CRC value of R(x) mod
+     * P(x).
+     *
+     * The Barrett reduction algorithm is defined as:
+     *
+     *    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
+     *    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
+     *    3. C(x)  = R(x) XOR T2(x) mod x^32
+     *
+     *  Note: The leftmost doubleword of the ru_poly vector
+     *  is zero and, thus, the intermediate GF(2) product
+     *  is zero and does not contribute to the final result.
+     */
+
+    /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
+    v2 = vec_unpackl((uv4si)v1);
+    v2 = (uv2di)vec_gfmsum_128(ru_poly, v2);
+
+    /*
+     * Compute the GF(2) product of the CRC polynomial with T1(x) in
+     * V2 and XOR the intermediate result, T2(x), with the value in V1.
+     * The final result is stored in word element 2 of V2.
+     */
+    v2 = vec_unpackl((uv4si)v2);
+    v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1);
+
+    return ((uv4si)v2)[2];
+}
+
+#define VX_MIN_LEN 64  /* crc32_le_vgfm_16 always consumes one 64-byte chunk up front */
+#define VX_ALIGNMENT 16L  /* buffer alignment established before calling the vector kernel */
+#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
+
+uint32_t Z_INTERNAL crc32_s390_vx(uint32_t crc, const unsigned char *buf, size_t len) {
+    size_t prealign, aligned, remaining;
+
+    if (len < VX_MIN_LEN + VX_ALIGN_MASK)  /* cannot guarantee 64 bytes after aligning: use crc32_braid */
+        return PREFIX(crc32_braid)(crc, buf, len);
+
+    if ((uintptr_t)buf & VX_ALIGN_MASK) {  /* checksum the unaligned prefix with crc32_braid */
+        prealign = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK);
+        len -= prealign;
+        crc = PREFIX(crc32_braid)(crc, buf, prealign);
+        buf += prealign;
+    }
+    aligned = len & ~VX_ALIGN_MASK;  /* multiple-of-16 byte count passed to the vector kernel */
+    remaining = len & VX_ALIGN_MASK;  /* trailing bytes (fewer than 16) */
+
+    crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, aligned) ^ 0xffffffff;  /* CRC is inverted on the way in and out */
+
+    if (remaining)  /* finish the tail with crc32_braid */
+        crc = PREFIX(crc32_braid)(crc, buf + aligned, remaining);
+
+    return crc;
+}
--- a/3rdparty/zlib-ng/arch/s390/dfltcc_common.h
+++ b/3rdparty/zlib-ng/arch/s390/dfltcc_common.h
@ -0,0 +1,119 @@
+#ifndef DFLTCC_COMMON_H
+#define DFLTCC_COMMON_H
+
+#include "zutil.h"
+
+/*
+   Parameter Block for Query Available Functions.
+
+   The members add up to 32 bytes; dfltcc_detail.h asserts that size at compile
+   time. Both bitmaps are read with is_bit_set(), i.e. bit 0 is the most
+   significant bit of the first byte.
+ */
+struct dfltcc_qaf_param {
+    char fns[16];                      /* Bitmap indexed by function code (DFLTCC_GDHT, DFLTCC_CMPR, ...) */
+    char reserved1[8];
+    char fmts[2];                      /* Bitmap indexed by format number (DFLTCC_FMT0) */
+    char reserved2[6];
+} ALIGNED_(8);
+
+/*
+   Parameter Block for Generate Dynamic-Huffman Table, Compress and Expand.
+
+   This structure is handed to the DFLTCC instruction as-is, so member order
+   and bit-field widths must not be changed. dfltcc_detail.h asserts at compile
+   time that csb starts at byte 384 and that the whole block is 1536 bytes.
+   The numeric suffix of each reserved* member appears to be its bit offset
+   within the block.
+ */
+struct dfltcc_param_v0 {
+    uint16_t pbvn;                     /* Parameter-Block-Version Number */
+    uint8_t mvn;                       /* Model-Version Number */
+    uint8_t ribm;                      /* Reserved for IBM use */
+    uint32_t reserved32 : 31;
+    uint32_t cf : 1;                   /* Continuation Flag */
+    uint8_t reserved64[8];
+    uint32_t nt : 1;                   /* New Task */
+    uint32_t reserved129 : 1;
+    uint32_t cvt : 1;                  /* Check Value Type */
+    uint32_t reserved131 : 1;
+    uint32_t htt : 1;                  /* Huffman-Table Type */
+    uint32_t bcf : 1;                  /* Block-Continuation Flag */
+    uint32_t bcc : 1;                  /* Block Closing Control */
+    uint32_t bhf : 1;                  /* Block Header Final */
+    uint32_t reserved136 : 1;
+    uint32_t reserved137 : 1;
+    uint32_t dhtgc : 1;                /* DHT Generation Control */
+    uint32_t reserved139 : 5;
+    uint32_t reserved144 : 5;
+    uint32_t sbb : 3;                  /* Sub-Byte Boundary */
+    uint8_t oesc;                      /* Operation-Ending-Supplemental Code */
+    uint32_t reserved160 : 12;
+    uint32_t ifs : 4;                  /* Incomplete-Function Status */
+    uint16_t ifl;                      /* Incomplete-Function Length */
+    uint8_t reserved192[8];
+    uint8_t reserved256[8];
+    uint8_t reserved320[4];
+    uint16_t hl;                       /* History Length */
+    uint32_t reserved368 : 1;
+    uint16_t ho : 15;                  /* History Offset */
+    uint32_t cv;                       /* Check Value */
+    uint32_t eobs : 15;                /* End-of-block Symbol */
+    uint32_t reserved431: 1;
+    uint8_t eobl : 4;                  /* End-of-block Length */
+    uint32_t reserved436 : 12;
+    uint32_t reserved448 : 4;
+    uint16_t cdhtl : 12;               /* Compressed-Dynamic-Huffman Table
+                                          Length */
+    uint8_t reserved464[6];
+    uint8_t cdht[288];                 /* Compressed-Dynamic-Huffman Table */
+    uint8_t reserved[24];
+    uint8_t ribm2[8];                  /* Reserved for IBM use */
+    uint8_t csb[1152];                 /* Continuation-State Buffer */
+} ALIGNED_(8);
+
+/*
+   Extension of inflate_state and deflate_state.
+
+   Bundles the DFLTCC parameter block, the result of the Query Available
+   Functions call and a scratch buffer for an error message.
+ */
+struct dfltcc_state {
+    struct dfltcc_param_v0 param;      /* Parameter block. */
+    struct dfltcc_qaf_param af;        /* Available functions. */
+    char msg[64];                      /* Buffer for strm->msg */
+};
+
+/* DFLTCC part of deflate_state: the common state plus tuning parameters. */
+typedef struct {
+    struct dfltcc_state common;
+    uint16_t level_mask;               /* Levels on which to use DFLTCC (bit N enables level N) */
+    uint32_t block_size;               /* New block each X bytes */
+    size_t block_threshold;            /* New block after total_in > X */
+    uint32_t dht_threshold;            /* New block only if avail_in >= X */
+} arch_deflate_state;
+
+/* DFLTCC part of inflate_state: only the common state is needed. */
+typedef struct {
+    struct dfltcc_state common;
+} arch_inflate_state;
+
+/*
+   History buffer size: 2^HB_BITS bytes (32 KiB).
+ */
+#define HB_BITS 15
+#define HB_SIZE (1 << HB_BITS)
+
+/*
+   Sizes of deflate block parts, in bits unless the name says COUNT or MAX_H*
+   (those are element counts). They feed DEFLATE_BOUND_COMPLEN below.
+ */
+#define DFLTCC_BLOCK_HEADER_BITS 3
+#define DFLTCC_HLITS_COUNT_BITS 5
+#define DFLTCC_HDISTS_COUNT_BITS 5
+#define DFLTCC_HCLENS_COUNT_BITS 4
+#define DFLTCC_MAX_HCLENS 19
+#define DFLTCC_HCLEN_BITS 3
+#define DFLTCC_MAX_HLITS 286
+#define DFLTCC_MAX_HDISTS 30
+#define DFLTCC_MAX_HLIT_HDIST_BITS 7
+#define DFLTCC_MAX_SYMBOL_BITS 16
+#define DFLTCC_MAX_EOBS_BITS 15
+#define DFLTCC_MAX_PADDING_BITS 7
+
+/*
+   Size bound in bytes for source_len input bytes: block header, table counts,
+   code-length codes, literal/distance code lengths, DFLTCC_MAX_SYMBOL_BITS per
+   input byte, End-of-block Symbol and padding are summed in bits, then the
+   total is shifted right by 3 to give bytes.
+   NOTE(review): the multiplication is done in the type of source_len and is
+   not checked for overflow.
+ */
+#define DEFLATE_BOUND_COMPLEN(source_len) \
+    ((DFLTCC_BLOCK_HEADER_BITS + \
+      DFLTCC_HLITS_COUNT_BITS + \
+      DFLTCC_HDISTS_COUNT_BITS + \
+      DFLTCC_HCLENS_COUNT_BITS + \
+      DFLTCC_MAX_HCLENS * DFLTCC_HCLEN_BITS + \
+      (DFLTCC_MAX_HLITS + DFLTCC_MAX_HDISTS) * DFLTCC_MAX_HLIT_HDIST_BITS + \
+      (source_len) * DFLTCC_MAX_SYMBOL_BITS + \
+      DFLTCC_MAX_EOBS_BITS + \
+      DFLTCC_MAX_PADDING_BITS) >> 3)
+
+#endif
--- a/3rdparty/zlib-ng/arch/s390/dfltcc_deflate.c
+++ b/3rdparty/zlib-ng/arch/s390/dfltcc_deflate.c
@ -0,0 +1,383 @@
+/* dfltcc_deflate.c - IBM Z DEFLATE CONVERSION CALL compression support. */
+
+/*
+   Use the following commands to build zlib-ng with DFLTCC compression support:
+
+        $ ./configure --with-dfltcc-deflate
+   or
+
+        $ cmake -DWITH_DFLTCC_DEFLATE=1 .
+
+   and then
+
+        $ make
+*/
+
+#include "zbuild.h"
+#include "deflate.h"
+#include "trees_emit.h"
+#include "dfltcc_deflate.h"
+#include "dfltcc_detail.h"
+
+/* Reset the DFLTCC part of the deflate state and restore the compile-time
+ * defaults of all tuning parameters.
+ */
+void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp) strm) {
+    arch_deflate_state *arch = &((deflate_state *)strm->state)->arch;
+
+    dfltcc_reset_state(&arch->common);
+
+    /* Tuning parameters start from their defaults */
+    arch->dht_threshold = DFLTCC_DHT_MIN_SAMPLE_SIZE;
+    arch->block_threshold = DFLTCC_FIRST_FHT_BLOCK_SIZE;
+    arch->block_size = DFLTCC_BLOCK_SIZE;
+    arch->level_mask = DFLTCC_LEVEL_MASK;
+}
+
+/* Return 1 if DFLTCC can compress this stream with the given settings, 0 otherwise.
+ *
+ * The settings must be supported (level enabled in level_mask, window of
+ * HB_BITS bits, Z_FIXED or Z_DEFAULT_STRATEGY, not reproducible) and the
+ * hardware must report GDHT, CMPR and format 0 as available.
+ */
+static inline int dfltcc_can_deflate_with_params(PREFIX3(streamp) strm, int level, uInt window_bits, int strategy,
+                                       int reproducible) {
+    arch_deflate_state *arch = &((deflate_state *)strm->state)->arch;
+    int settings_ok;
+    int hardware_ok;
+
+    settings_ok = (arch->level_mask & (1 << level)) != 0 &&
+                  window_bits == HB_BITS &&
+                  (strategy == Z_FIXED || strategy == Z_DEFAULT_STRATEGY) &&
+                  !reproducible;
+    if (!settings_ok)
+        return 0;
+
+    hardware_ok = is_bit_set(arch->common.af.fns, DFLTCC_GDHT) &&
+                  is_bit_set(arch->common.af.fns, DFLTCC_CMPR) &&
+                  is_bit_set(arch->common.af.fmts, DFLTCC_FMT0);
+    return hardware_ok;
+}
+
+/* Return 1 if DFLTCC can compress this stream with its current level, window
+ * size, strategy and reproducible setting, 0 otherwise.
+ */
+int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm) {
+    deflate_state *s = (deflate_state *)strm->state;
+    int usable = dfltcc_can_deflate_with_params(strm, s->level, s->w_bits, s->strategy, s->reproducible);
+
+    return usable;
+}
+
+/* Run the Generate Dynamic-Huffman Table function over the pending input.
+ * strm->avail_in is left untouched: the length is passed through a local copy
+ * and whatever dfltcc() writes back to it is discarded.
+ */
+static inline void dfltcc_gdht(PREFIX3(streamp) strm) {
+    deflate_state *state = (deflate_state *)strm->state;
+    size_t sample_len = strm->avail_in;
+
+    dfltcc(DFLTCC_GDHT, &state->arch.common.param, NULL, NULL, &strm->next_in, &sample_len, NULL);
+}
+
+/* Run the Compress function with a circular history buffer (state->window),
+ * then update the stream's byte counters and remaining lengths by the amounts
+ * consumed and produced. Returns the condition code of the call.
+ */
+static inline dfltcc_cc dfltcc_cmpr(PREFIX3(streamp) strm) {
+    deflate_state *state = (deflate_state *)strm->state;
+    size_t in_left = strm->avail_in;
+    size_t out_left = strm->avail_out;
+    dfltcc_cc cc = dfltcc(DFLTCC_CMPR | HBT_CIRCULAR,
+                          &state->arch.common.param, &strm->next_out, &out_left,
+                          &strm->next_in, &in_left, state->window);
+
+    /* Account for what was consumed and produced before storing the new lengths */
+    strm->total_in += (strm->avail_in - in_left);
+    strm->total_out += (strm->avail_out - out_left);
+    strm->avail_in = in_left;
+    strm->avail_out = out_left;
+    return cc;
+}
+
+/* Close the currently open deflate block in software: emit the End-of-block
+ * Symbol recorded in the parameter block (param->eobs, param->eobl bits long)
+ * through the bit buffer, then flush pending output to the stream.
+ */
+static inline void send_eobs(PREFIX3(streamp) strm, const struct dfltcc_param_v0 *param) {
+    deflate_state *state = (deflate_state *)strm->state;
+
+    /* Reduce the 15-bit eobs field to its eobl significant bits (presumably it
+     * is stored left-aligned) and bit-reverse it before emitting.
+     */
+    send_bits(state, PREFIX(bi_reverse)(param->eobs >> (15 - param->eobl), param->eobl), param->eobl, state->bi_buf, state->bi_valid);
+    PREFIX(flush_pending)(strm);
+    if (state->pending != 0) {
+        /* The remaining data is located in pending_out[0:pending]. If someone
+         * calls put_byte() - this might happen in deflate() - the byte will be
+         * placed into pending_buf[pending], which is incorrect. Move the
+         * remaining data to the beginning of pending_buf so that put_byte() is
+         * usable again.
+         */
+        memmove(state->pending_buf, state->pending_out, state->pending);
+        state->pending_out = state->pending_buf;
+    }
+#ifdef ZLIB_DEBUG
+    state->compressed_len += param->eobl;
+#endif
+}
+
+/* Hardware-accelerated replacement for the deflate_* block functions.
+ *
+ * strm   - stream to compress; its next/avail/total fields are updated.
+ * flush  - flush mode passed to deflate().
+ * result - receives the block state whenever 1 is returned.
+ *
+ * Returns 1 when the call was handled here. Returns 0 when the caller should
+ * carry on with the regular deflate_* functions: DFLTCC cannot be used for
+ * this stream, or Z_FINISH still needs a trailing empty block to be written.
+ *
+ * Each pass hands at most block_size input bytes to DFLTCC-CMPR; the function
+ * loops (label "again") until either the input or the output is exhausted.
+ */
+int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result) {
+    deflate_state *state = (deflate_state *)strm->state;
+    arch_deflate_state *dfltcc_state = &state->arch;
+    struct dfltcc_param_v0 *param = &dfltcc_state->common.param;
+    uInt masked_avail_in;
+    dfltcc_cc cc;
+    int need_empty_block;
+    int soft_bcc;
+    int no_flush;
+
+    if (!PREFIX(dfltcc_can_deflate)(strm)) {
+        /* Clear history. */
+        if (flush == Z_FULL_FLUSH)
+            param->hl = 0;
+        return 0;
+    }
+
+again:
+    masked_avail_in = 0;
+    soft_bcc = 0;
+    no_flush = flush == Z_NO_FLUSH;
+
+    /* No input data. Return, except when Continuation Flag is set, which means
+     * that DFLTCC has buffered some output in the parameter block and needs to
+     * be called again in order to flush it.
+     */
+    if (strm->avail_in == 0 && !param->cf) {
+        /* A block is still open, and the hardware does not support closing
+         * blocks without adding data. Thus, close it manually.
+         */
+        if (!no_flush && param->bcf) {
+            send_eobs(strm, param);
+            param->bcf = 0;
+        }
+        /* Let one of deflate_* functions write a trailing empty block. */
+        if (flush == Z_FINISH)
+            return 0;
+        /* Clear history. */
+        if (flush == Z_FULL_FLUSH)
+            param->hl = 0;
+        /* Trigger block post-processing if necessary. */
+        *result = no_flush ? need_more : block_done;
+        return 1;
+    }
+
+    /* There is an open non-BFINAL block, we are not going to close it just
+     * yet, we have compressed more than DFLTCC_BLOCK_SIZE bytes and we see
+     * more than DFLTCC_DHT_MIN_SAMPLE_SIZE bytes. Open a new block with a new
+     * DHT in order to adapt to a possibly changed input data distribution.
+     */
+    if (param->bcf && no_flush &&
+            strm->total_in > dfltcc_state->block_threshold &&
+            strm->avail_in >= dfltcc_state->dht_threshold) {
+        if (param->cf) {
+            /* We need to flush the DFLTCC buffer before writing the
+             * End-of-block Symbol. Mask the input data and proceed as usual.
+             */
+            masked_avail_in += strm->avail_in;
+            strm->avail_in = 0;
+            no_flush = 0;
+        } else {
+            /* DFLTCC buffer is empty, so we can manually write the
+             * End-of-block Symbol right away.
+             */
+            send_eobs(strm, param);
+            param->bcf = 0;
+            dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size;
+        }
+    }
+
+    /* No space for compressed data. If we proceed, dfltcc_cmpr() will return
+     * DFLTCC_CC_OP1_TOO_SHORT without buffering header bits, but we will still
+     * set BCF=1, which is wrong. Avoid complications and return early.
+     */
+    if (strm->avail_out == 0) {
+        *result = need_more;
+        return 1;
+    }
+
+    /* The caller gave us too much data. Pass only one block worth of
+     * uncompressed data to DFLTCC and mask the rest, so that on the next
+     * iteration we start a new block.
+     */
+    if (no_flush && strm->avail_in > dfltcc_state->block_size) {
+        masked_avail_in += (strm->avail_in - dfltcc_state->block_size);
+        strm->avail_in = dfltcc_state->block_size;
+    }
+
+    /* When we have an open non-BFINAL deflate block and caller indicates that
+     * the stream is ending, we need to close an open deflate block and open a
+     * BFINAL one.
+     */
+    need_empty_block = flush == Z_FINISH && param->bcf && !param->bhf;
+
+    /* Translate stream to parameter block */
+    param->cvt = state->wrap == 2 ? CVT_CRC32 : CVT_ADLER32;
+    if (!no_flush)
+        /* We need to close a block. Always do this in software - when there is
+         * no input data, the hardware will not honor BCC. */
+        soft_bcc = 1;
+    if (flush == Z_FINISH && !param->bcf)
+        /* We are about to open a BFINAL block, set Block Header Final bit
+         * until the stream ends.
+         */
+        param->bhf = 1;
+    /* DFLTCC-CMPR will write to next_out, so make sure that buffers with
+     * higher precedence are empty.
+     */
+    Assert(state->pending == 0, "There must be no pending bytes");
+    Assert(state->bi_valid < 8, "There must be less than 8 pending bits");
+    /* Hand the partially filled output byte over to the hardware */
+    param->sbb = (unsigned int)state->bi_valid;
+    if (param->sbb > 0)
+        *strm->next_out = (unsigned char)state->bi_buf;
+    /* Honor history and check value */
+    param->nt = 0;
+    if (state->wrap == 1)
+        param->cv = strm->adler;
+    else if (state->wrap == 2)
+        param->cv = ZSWAP32(state->crc_fold.value);
+
+    /* When opening a block, choose a Huffman-Table Type */
+    if (!param->bcf) {
+        if (state->strategy == Z_FIXED || (strm->total_in == 0 && dfltcc_state->block_threshold > 0))
+            param->htt = HTT_FIXED;
+        else {
+            param->htt = HTT_DYNAMIC;
+            dfltcc_gdht(strm);
+        }
+    }
+
+    /* Deflate */
+    do {
+        cc = dfltcc_cmpr(strm);
+        if (strm->avail_in < 4096 && masked_avail_in > 0)
+            /* We are about to call DFLTCC with a small input buffer, which is
+             * inefficient. Since there is masked data, there will be at least
+             * one more DFLTCC call, so skip the current one and make the next
+             * one handle more data.
+             */
+            break;
+    } while (cc == DFLTCC_CC_AGAIN);
+
+    /* Translate parameter block to stream */
+    strm->msg = oesc_msg(dfltcc_state->common.msg, param->oesc);
+    state->bi_valid = param->sbb;
+    if (state->bi_valid == 0)
+        state->bi_buf = 0; /* Avoid accessing next_out */
+    else
+        state->bi_buf = *strm->next_out & ((1 << state->bi_valid) - 1);
+    if (state->wrap == 1)
+        strm->adler = param->cv;
+    else if (state->wrap == 2)
+        state->crc_fold.value = ZSWAP32(param->cv);
+
+    /* Unmask the input data */
+    strm->avail_in += masked_avail_in;
+    masked_avail_in = 0;
+
+    /* If we encounter an error, it means there is a bug in DFLTCC call */
+    Assert(cc != DFLTCC_CC_OP2_CORRUPT || param->oesc == 0, "BUG");
+
+    /* Update Block-Continuation Flag. It will be used to check whether to call
+     * GDHT the next time.
+     */
+    if (cc == DFLTCC_CC_OK) {
+        if (soft_bcc) {
+            send_eobs(strm, param);
+            param->bcf = 0;
+            dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size;
+        } else
+            param->bcf = 1;
+        if (flush == Z_FINISH) {
+            if (need_empty_block)
+                /* Make the current deflate() call also close the stream */
+                return 0;
+            else {
+                bi_windup(state);
+                *result = finish_done;
+            }
+        } else {
+            if (flush == Z_FULL_FLUSH)
+                param->hl = 0; /* Clear history */
+            *result = flush == Z_NO_FLUSH ? need_more : block_done;
+        }
+    } else {
+        param->bcf = 1;
+        *result = need_more;
+    }
+    if (strm->avail_in != 0 && strm->avail_out != 0)
+        goto again; /* deflate() must use all input or all output */
+    return 1;
+}
+
+/*
+   Switching between hardware and software compression.
+
+   DFLTCC does not support all zlib settings, e.g. generation of non-compressed
+   blocks or alternative window sizes. When such settings are applied on the
+   fly with deflateParams, we need to convert between hardware and software
+   window formats.
+*/
+/* Return 1 if the DFLTCC state shows prior use: input has already been
+ * consumed, the New Task flag has been cleared, or history is present.
+ * Return 0 otherwise.
+ */
+static int dfltcc_was_deflate_used(PREFIX3(streamp) strm) {
+    deflate_state *state = (deflate_state *)strm->state;
+    struct dfltcc_param_v0 *param = &state->arch.common.param;
+
+    if (strm->total_in > 0)
+        return 1;
+    if (param->nt == 0)
+        return 1;
+    return param->hl > 0;
+}
+
+/* deflateParams() hook. If the new level/strategy switches the stream between
+ * hardware and software compression after DFLTCC has already been used, ask
+ * for a full flush through *flush; window formats are not converted, the old
+ * data is simply dropped. Always returns Z_OK.
+ */
+int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush) {
+    deflate_state *state = (deflate_state *)strm->state;
+    int usable_before = PREFIX(dfltcc_can_deflate)(strm);
+    int usable_after = dfltcc_can_deflate_with_params(strm, level, state->w_bits, strategy, state->reproducible);
+
+    /* Only a mode change on a stream DFLTCC already touched needs any action */
+    if (usable_after != usable_before && dfltcc_was_deflate_used(strm))
+        *flush = Z_FULL_FLUSH;
+
+    return Z_OK;
+}
+
+/* Return 1 if deflation is done for this deflate() call, 0 if it is not. */
+int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush) {
+    deflate_state *state = (deflate_state *)strm->state;
+    struct dfltcc_param_v0 *param = &state->arch.common.param;
+
+    /* deflate(Z_FULL_FLUSH) with a small avail_out may have closed the block
+     * without resetting the compression state: report "not done".
+     */
+    if (flush == Z_FULL_FLUSH && strm->avail_out == 0)
+        return 0;
+
+    /* Without DFLTCC there is nothing buffered in the parameter block */
+    if (!PREFIX(dfltcc_can_deflate)(strm))
+        return 1;
+
+    /* Not done while output is still buffered (Continuation Flag) or EOBS has
+     * not been written yet (Block-Continuation Flag).
+     */
+    return !(param->cf || param->bcf);
+}
+
+/* Return 1 only when the requested reproducible setting differs from the
+ * current one and DFLTCC has not been used on this stream yet; 0 otherwise.
+ */
+int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible) {
+    deflate_state *state = (deflate_state *)strm->state;
+
+    if (reproducible == state->reproducible)
+        return 0;
+    return !dfltcc_was_deflate_used(strm);
+}
+
+/*
+   Preloading history.
+*/
+/* Preload the dictionary into the DFLTCC history kept in state->window.
+ * Always returns Z_OK.
+ */
+int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm,
+                                                const unsigned char *dictionary, uInt dict_length) {
+    deflate_state *state = (deflate_state *)strm->state;
+
+    append_history(&state->arch.common.param, state->window, dictionary, dict_length);
+
+    /* Add FDICT to zlib header */
+    state->strstart = 1;
+    /* Make deflate_stored happy */
+    state->block_start = state->strstart;
+    return Z_OK;
+}
+
+/* Copy the current DFLTCC history to dictionary and store its length in
+ * *dict_length. Either output pointer may be NULL, in which case that output
+ * is skipped. Always returns Z_OK.
+ */
+int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt *dict_length) {
+    deflate_state *state = (deflate_state *)strm->state;
+    struct dfltcc_param_v0 *param = &state->arch.common.param;
+
+    if (dictionary != NULL)
+        get_history(param, state->window, dictionary);
+    if (dict_length != NULL)
+        *dict_length = param->hl;
+
+    return Z_OK;
+}
--- a/3rdparty/zlib-ng/arch/s390/dfltcc_deflate.h
+++ b/3rdparty/zlib-ng/arch/s390/dfltcc_deflate.h
@ -0,0 +1,58 @@
+#ifndef DFLTCC_DEFLATE_H
+#define DFLTCC_DEFLATE_H
+
+#include "deflate.h"
+#include "dfltcc_common.h"
+
+/* Reset the DFLTCC state and restore the default tuning parameters. */
+void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp));
+/* Non-zero if DFLTCC can compress the stream with its current settings. */
+int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm);
+/* Compress with DFLTCC; returns 1 and sets *result if handled, 0 to fall back to software. */
+int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result);
+/* deflateParams() hook; may set *flush to Z_FULL_FLUSH. Returns Z_OK. */
+int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush);
+/* Non-zero if deflation is done for the given flush mode. */
+int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush);
+/* Non-zero if the reproducible setting may be changed to the given value. */
+int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible);
+/* Append the dictionary to the DFLTCC history. Returns Z_OK. */
+int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm,
+                                                const unsigned char *dictionary, uInt dict_length);
+/* Copy the DFLTCC history and/or its length out; NULL outputs are skipped. Returns Z_OK. */
+int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt* dict_length);
+
+/* If DFLTCC can be used, set the dictionary through it and return its result
+ * from the ENCLOSING function; otherwise fall through.
+ */
+#define DEFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
+    do { \
+        if (PREFIX(dfltcc_can_deflate)((strm))) \
+            return PREFIX(dfltcc_deflate_set_dictionary)((strm), (dict), (dict_len)); \
+    } while (0)
+
+/* If DFLTCC can be used, fetch the dictionary through it and return its result
+ * from the ENCLOSING function; otherwise fall through.
+ */
+#define DEFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \
+    do { \
+        if (PREFIX(dfltcc_can_deflate)((strm))) \
+            return PREFIX(dfltcc_deflate_get_dictionary)((strm), (dict), (dict_len)); \
+    } while (0)
+
+/* Called with the stream to reset the DFLTCC state. */
+#define DEFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_deflate_state)
+
+/* Run the deflateParams() hook; returns from the ENCLOSING function only if
+ * the hook reports Z_STREAM_ERROR. hook_flush is a pointer to the flush mode.
+ */
+#define DEFLATE_PARAMS_HOOK(strm, level, strategy, hook_flush) \
+    do { \
+        int err; \
+\
+        err = PREFIX(dfltcc_deflate_params)((strm), (level), (strategy), (hook_flush)); \
+        if (err == Z_STREAM_ERROR) \
+            return err; \
+    } while (0)
+
+/* Called as DEFLATE_DONE(strm, flush); non-zero means deflation is done. */
+#define DEFLATE_DONE PREFIX(dfltcc_deflate_done)
+
+/* Replace complen with the DFLTCC bound when the stream state is invalid
+ * (deflateStateCheck() is non-zero) or DFLTCC can be used.
+ */
+#define DEFLATE_BOUND_ADJUST_COMPLEN(strm, complen, source_len) \
+    do { \
+        if (deflateStateCheck((strm)) || PREFIX(dfltcc_can_deflate)((strm))) \
+            (complen) = DEFLATE_BOUND_COMPLEN(source_len); \
+    } while (0)
+
+/* Non-zero when DFLTCC can be used for the stream. */
+#define DEFLATE_NEED_CONSERVATIVE_BOUND(strm) (PREFIX(dfltcc_can_deflate)((strm)))
+
+/* Called as DEFLATE_HOOK(strm, flush, &result); see dfltcc_deflate(). */
+#define DEFLATE_HOOK PREFIX(dfltcc_deflate)
+
+/* Zero when DFLTCC can be used: dfltcc_deflate() then maintains the check
+ * value through the parameter block.
+ */
+#define DEFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_deflate)((strm)))
+
+/* Called as DEFLATE_CAN_SET_REPRODUCIBLE(strm, reproducible). */
+#define DEFLATE_CAN_SET_REPRODUCIBLE PREFIX(dfltcc_can_set_reproducible)
+
+/* Never size the window below the DFLTCC history buffer (HB_SIZE bytes). */
+#define DEFLATE_ADJUST_WINDOW_SIZE(n) MAX(n, HB_SIZE)
+
+#endif
--- a/3rdparty/zlib-ng/arch/s390/dfltcc_detail.h
+++ b/3rdparty/zlib-ng/arch/s390/dfltcc_detail.h
@ -0,0 +1,275 @@
+#include "zbuild.h"
+#include <stdio.h>
+
+#ifdef HAVE_SYS_SDT_H
+#include <sys/sdt.h>
+#endif
+
+/*
+   Tuning parameters. Each can be overridden at build time by predefining the
+   macro; the values below are the defaults.
+ */
+/* Bitmask of compression levels handled by DFLTCC: bit N enables level N. */
+#ifndef DFLTCC_LEVEL_MASK
+#define DFLTCC_LEVEL_MASK 0x2
+#endif
+/* Initial block_size: a new deflate block is started about every this many input bytes. */
+#ifndef DFLTCC_BLOCK_SIZE
+#define DFLTCC_BLOCK_SIZE 1048576
+#endif
+/* Initial block_threshold: total_in must exceed it before the first block is closed early. */
+#ifndef DFLTCC_FIRST_FHT_BLOCK_SIZE
+#define DFLTCC_FIRST_FHT_BLOCK_SIZE 4096
+#endif
+/* Initial dht_threshold: minimum avail_in required before a new block is opened. */
+#ifndef DFLTCC_DHT_MIN_SAMPLE_SIZE
+#define DFLTCC_DHT_MIN_SAMPLE_SIZE 4096
+#endif
+/* Value stored in the ribm field of the parameter block on reset. */
+#ifndef DFLTCC_RIBM
+#define DFLTCC_RIBM 0
+#endif
+
+/* Compile-time assertion: declares an unused static char array whose size is
+ * -1 (a compile error) when c is false and 1 when it is true. msg must be an
+ * identifier fragment; it is pasted into the array name. The condition is
+ * parenthesized so that an argument containing a conditional operator is
+ * tested as a whole instead of being merged into the expansion's own "?:".
+ */
+#define static_assert(c, msg) __attribute__((unused)) static char static_assert_failed_ ## msg[(c) ? 1 : -1]
+
+/* Size in bytes of the Query Available Functions parameter block; the struct must match it exactly. */
+#define DFLTCC_SIZEOF_QAF 32
+static_assert(sizeof(struct dfltcc_qaf_param) == DFLTCC_SIZEOF_QAF, qaf);
+
+/* Test bit n of a bitmap in which bit 0 is the most significant bit of
+ * bits[0]. Returns the masked byte value: non-zero if the bit is set, 0 if not.
+ */
+static inline int is_bit_set(const char *bits, int n) {
+    int byte_index = n / 8;
+    int bit_from_msb = n % 8;
+    int mask = 1 << (7 - bit_from_msb);
+
+    return bits[byte_index] & mask;
+}
+
+/* Clear bit n of a bitmap in which bit 0 is the most significant bit of bits[0]. */
+static inline void clear_bit(char *bits, int n) {
+    char *byte = &bits[n / 8];
+    int mask = 1 << (7 - (n % 8));
+
+    *byte = (char)(*byte & ~mask);
+}
+
+/* Facility bit number tested in the STFLE result to detect DFLTCC. */
+#define DFLTCC_FACILITY 151
+
+/* Return non-zero if facility bit DFLTCC_FACILITY is set in the facility list
+ * stored by the STFLE instruction, 0 otherwise.
+ */
+static inline int is_dfltcc_enabled(void) {
+    /* Enough doublewords to contain bit DFLTCC_FACILITY */
+    uint64_t facilities[(DFLTCC_FACILITY / 64) + 1];
+    Z_REGISTER uint8_t r0 __asm__("r0");
+
+    memset(facilities, 0, sizeof(facilities));
+    /* r0 carries the number of doublewords provided, minus one */
+    r0 = sizeof(facilities) / sizeof(facilities[0]) - 1;
+    /* STFLE is supported since z9-109 and only in z/Architecture mode. When
+     * compiling with -m31, gcc defaults to ESA mode, however, since the kernel
+     * is 64-bit, it's always z/Architecture mode at runtime.
+     */
+    __asm__ volatile(
+#ifndef __clang__
+                     ".machinemode push\n"
+                     ".machinemode zarch\n"
+#endif
+                     "stfle %[facilities]\n"
+#ifndef __clang__
+                     ".machinemode pop\n"
+#endif
+                     : [facilities] "=Q" (facilities), [r0] "+r" (r0) :: "cc");
+    return is_bit_set((const char *)facilities, DFLTCC_FACILITY);
+}
+
+/* Bit number of format 0 in dfltcc_qaf_param.fmts */
+#define DFLTCC_FMT0 0
+
+/* Values of the cvt (Check Value Type) field */
+#define CVT_CRC32 0
+#define CVT_ADLER32 1
+/* Values of the htt (Huffman-Table Type) field */
+#define HTT_FIXED 0
+#define HTT_DYNAMIC 1
+
+/* Parameter block sizes in bytes: GDHT covers the block up to (not including)
+ * csb, CMPR and XPND cover all of it. The struct layout must match both.
+ */
+#define DFLTCC_SIZEOF_GDHT_V0 384
+#define DFLTCC_SIZEOF_CMPR_XPND_V0 1536
+static_assert(offsetof(struct dfltcc_param_v0, csb) == DFLTCC_SIZEOF_GDHT_V0, gdht_v0);
+static_assert(sizeof(struct dfltcc_param_v0) == DFLTCC_SIZEOF_CMPR_XPND_V0, cmpr_xpnd_v0);
+
+/* Turn an Operation-Ending-Supplemental Code into a message for strm->msg.
+ * Returns NULL for code 0; otherwise formats the message into buf and returns
+ * buf. buf must be large enough for the formatted text (callers in this file
+ * pass the 64-byte dfltcc_state.msg).
+ */
+static inline z_const char *oesc_msg(char *buf, int oesc) {
+    if (oesc != 0x00) {
+        sprintf(buf, "Operation-Ending-Supplemental Code is 0x%.2X", oesc);
+        return buf;
+    }
+    return NULL; /* Successful completion */
+}
+
+/*
+   C wrapper for the DEFLATE CONVERSION CALL instruction.
+ */
+/* Condition codes returned by dfltcc(). Value 2 has two names; which one
+ * applies presumably depends on the function that was executed.
+ */
+typedef enum {
+    DFLTCC_CC_OK = 0,
+    DFLTCC_CC_OP1_TOO_SHORT = 1,
+    DFLTCC_CC_OP2_TOO_SHORT = 2,
+    DFLTCC_CC_OP2_CORRUPT = 2,
+    DFLTCC_CC_AGAIN = 3,
+} dfltcc_cc;
+
+/* Function codes; also used as bit numbers in dfltcc_qaf_param.fns */
+#define DFLTCC_QAF 0
+#define DFLTCC_GDHT 1
+#define DFLTCC_CMPR 2
+#define DFLTCC_XPND 4
+/* Flag ORed into the function code to select a circular history buffer */
+#define HBT_CIRCULAR (1 << 7)
+/* Mask extracting the function code from a value that may carry HBT_CIRCULAR */
+#define DFLTCC_FN_MASK ((1 << 7) - 1)
+
+/* Split the circular history into its two contiguous fragments: *hl_high
+ * receives the length of the part starting at param->ho, *hl_low the length
+ * of the part that wrapped around and starts at offset 0.
+ */
+static inline void get_history_lengths(struct dfltcc_param_v0 *param, size_t *hl_high, size_t *hl_low) {
+    size_t until_end = MIN(param->hl, HB_SIZE - param->ho);
+
+    *hl_high = until_end;
+    *hl_low = param->hl - until_end;
+}
+
+/* Notify instrumentation about an upcoming read/write access to the circular history buffer.
+ *
+ * The live history may wrap around the end of the buffer, so it is reported
+ * as two ranges: hl_high bytes starting at offset param->ho and hl_low bytes
+ * starting at offset 0.
+ */
+static inline void instrument_read_write_hist(struct dfltcc_param_v0 *param, void *hist) {
+    size_t hl_high, hl_low;
+
+    get_history_lengths(param, &hl_high, &hl_low);
+    /* Offset through a byte pointer: arithmetic on void * is a GNU extension */
+    instrument_read_write((unsigned char *)hist + param->ho, hl_high);
+    instrument_read_write(hist, hl_low);
+}
+
+/* Notify MSan about a completed write to the circular history buffer.
+ *
+ * The live history may wrap around the end of the buffer, so it is reported
+ * as two ranges: hl_high bytes starting at offset param->ho and hl_low bytes
+ * starting at offset 0.
+ */
+static inline void msan_unpoison_hist(struct dfltcc_param_v0 *param, void *hist) {
+    size_t hl_high, hl_low;
+
+    get_history_lengths(param, &hl_high, &hl_low);
+    /* Offset through a byte pointer: arithmetic on void * is a GNU extension */
+    __msan_unpoison((unsigned char *)hist + param->ho, hl_high);
+    __msan_unpoison(hist, hl_low);
+}
+
+/* Execute one DFLTCC function.
+ *
+ * fn        - function code (DFLTCC_QAF, DFLTCC_GDHT, DFLTCC_CMPR or
+ *             DFLTCC_XPND), optionally ORed with HBT_CIRCULAR; passed in r0.
+ * param     - parameter block; passed in r1.
+ * op1, len1 - in/out: address and length of the first operand (written by
+ *             CMPR/XPND). Either may be NULL; NULL/0 is then passed and
+ *             nothing is written back.
+ * op2, len2 - in/out: address and length of the second operand (read by
+ *             GDHT/CMPR/XPND). Either may be NULL, as above.
+ * hist      - history buffer; passed as the third register operand.
+ *
+ * Returns the two-bit condition code of the instruction (see dfltcc_cc).
+ */
+static inline dfltcc_cc dfltcc(int fn, void *param,
+                               unsigned char **op1, size_t *len1,
+                               z_const unsigned char **op2, size_t *len2, void *hist) {
+    unsigned char *t2 = op1 ? *op1 : NULL;
+    unsigned char *orig_t2 = t2;
+    size_t t3 = len1 ? *len1 : 0;
+    z_const unsigned char *t4 = op2 ? *op2 : NULL;
+    size_t t5 = len2 ? *len2 : 0;
+    /* The operands are bound to fixed registers r0-r5 */
+    Z_REGISTER int r0 __asm__("r0");
+    Z_REGISTER void *r1 __asm__("r1");
+    Z_REGISTER unsigned char *r2 __asm__("r2");
+    Z_REGISTER size_t r3 __asm__("r3");
+    Z_REGISTER z_const unsigned char *r4 __asm__("r4");
+    Z_REGISTER size_t r5 __asm__("r5");
+    int cc;
+
+    /* Insert pre-instrumentation for DFLTCC. */
+    switch (fn & DFLTCC_FN_MASK) {
+    case DFLTCC_QAF:
+        instrument_write(param, DFLTCC_SIZEOF_QAF);
+        break;
+    case DFLTCC_GDHT:
+        instrument_read_write(param, DFLTCC_SIZEOF_GDHT_V0);
+        instrument_read(t4, t5);
+        break;
+    case DFLTCC_CMPR:
+    case DFLTCC_XPND:
+        instrument_read_write(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
+        instrument_read(t4, t5);
+        instrument_write(t2, t3);
+        instrument_read_write_hist(param, hist);
+        break;
+    }
+
+    r0 = fn; r1 = param; r2 = t2; r3 = t3; r4 = t4; r5 = t5;
+    /* Issue the instruction by opcode (.insn), then read the condition code with IPM */
+    __asm__ volatile(
+#ifdef HAVE_SYS_SDT_H
+                     STAP_PROBE_ASM(zlib, dfltcc_entry, STAP_PROBE_ASM_TEMPLATE(5))
+#endif
+                     ".insn rrf,0xb9390000,%[r2],%[r4],%[hist],0\n"
+#ifdef HAVE_SYS_SDT_H
+                     STAP_PROBE_ASM(zlib, dfltcc_exit, STAP_PROBE_ASM_TEMPLATE(5))
+#endif
+                     "ipm %[cc]\n"
+                     : [r2] "+r" (r2)
+                     , [r3] "+r" (r3)
+                     , [r4] "+r" (r4)
+                     , [r5] "+r" (r5)
+                     , [cc] "=r" (cc)
+                     : [r0] "r" (r0)
+                     , [r1] "r" (r1)
+                     , [hist] "r" (hist)
+#ifdef HAVE_SYS_SDT_H
+                     , STAP_PROBE_ASM_OPERANDS(5, r2, r3, r4, r5, hist)
+#endif
+                     : "cc", "memory");
+    t2 = r2; t3 = r3; t4 = r4; t5 = r5;
+
+    /* Insert post-instrumentation for DFLTCC. */
+    switch (fn & DFLTCC_FN_MASK) {
+    case DFLTCC_QAF:
+        __msan_unpoison(param, DFLTCC_SIZEOF_QAF);
+        break;
+    case DFLTCC_GDHT:
+        __msan_unpoison(param, DFLTCC_SIZEOF_GDHT_V0);
+        break;
+    case DFLTCC_CMPR:
+        __msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
+        /* Include the partially written last byte when sbb is non-zero */
+        __msan_unpoison(orig_t2, t2 - orig_t2 + (((struct dfltcc_param_v0 *)param)->sbb == 0 ? 0 : 1));
+        msan_unpoison_hist(param, hist);
+        break;
+    case DFLTCC_XPND:
+        __msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
+        __msan_unpoison(orig_t2, t2 - orig_t2);
+        msan_unpoison_hist(param, hist);
+        break;
+    }
+
+    /* Report the updated operand addresses and remaining lengths to the caller */
+    if (op1)
+        *op1 = t2;
+    if (len1)
+        *len1 = t3;
+    if (op2)
+        *op2 = t4;
+    if (len2)
+        *len2 = t5;
+    /* Extract the two condition-code bits (bits 28-29) from the IPM result */
+    return (cc >> 28) & 3;
+}
+
+/* Round p up to the next multiple of size, which must be a power of two. The
+ * result has the type of p. The mask is computed in uintptr_t so that an
+ * unsigned size narrower than a pointer does not clear the upper bits of the
+ * result, and the whole expansion is parenthesized so it can be used inside
+ * larger expressions. Note that size is evaluated twice.
+ */
+#define ALIGN_UP(p, size) ((__typeof__(p))(((uintptr_t)(p) + ((size) - 1)) & ~((uintptr_t)(size) - 1)))
+
+/* Reset the common DFLTCC state: refresh the list of available functions
+ * (all zeroes when the facility is absent), then clear the parameter block
+ * and mark it as a new task.
+ */
+static inline void dfltcc_reset_state(struct dfltcc_state *dfltcc_state) {
+    struct dfltcc_param_v0 *param = &dfltcc_state->param;
+    struct dfltcc_qaf_param *af = &dfltcc_state->af;
+
+    /* Initialize available functions */
+    if (!is_dfltcc_enabled()) {
+        memset(af, 0, sizeof(*af));
+    } else {
+        /* The parameter block doubles as the QAF result buffer; save the
+         * result before the block is cleared below.
+         */
+        dfltcc(DFLTCC_QAF, param, NULL, NULL, NULL, NULL, NULL);
+        memmove(af, param, sizeof(*af));
+    }
+
+    /* Initialize parameter block */
+    memset(param, 0, sizeof(*param));
+    param->nt = 1;
+    param->ribm = DFLTCC_RIBM;
+}
+
+/* Copy a state object together with its extension: size rounded up to a
+ * multiple of 8, plus extension_size, bytes are copied from src to dst.
+ */
+static inline void dfltcc_copy_state(void *dst, const void *src, uInt size, uInt extension_size) {
+    uInt total = ALIGN_UP(size, 8) + extension_size;
+
+    memcpy(dst, src, total);
+}
+
+/* Append count bytes from buf to the circular history buffer and update the
+ * History Offset (ho) and History Length (hl) in param. Only the last HB_SIZE
+ * bytes of buf are used; once the buffer is full, the oldest bytes are
+ * dropped by advancing ho.
+ */
+static inline void append_history(struct dfltcc_param_v0 *param, unsigned char *history,
+                                  const unsigned char *buf, uInt count) {
+    size_t write_pos;
+    size_t room_to_end;
+    size_t new_len;
+
+    /* Do not use more than 32K */
+    if (count > HB_SIZE) {
+        buf += count - HB_SIZE;
+        count = HB_SIZE;
+    }
+
+    /* New data goes right behind the existing history */
+    write_pos = (param->ho + param->hl) % HB_SIZE;
+    room_to_end = HB_SIZE - write_pos;
+    if (count <= room_to_end) {
+        /* Circular history buffer does not wrap - copy one chunk */
+        memcpy(history + write_pos, buf, count);
+    } else {
+        /* Circular history buffer wraps - copy two chunks */
+        memcpy(history + write_pos, buf, room_to_end);
+        memcpy(history, buf + room_to_end, count - room_to_end);
+    }
+
+    new_len = param->hl + count;
+    if (new_len > HB_SIZE) {
+        /* History does not fit into buffer - discard extra bytes */
+        param->ho = (param->ho + (new_len - HB_SIZE)) % HB_SIZE;
+        param->hl = HB_SIZE;
+    } else {
+        /* All history fits into buffer - no need to discard anything */
+        param->hl = new_len;
+    }
+}
+
+/* Linearize the circular history into buf, oldest byte first: the fragment
+ * starting at param->ho is copied first, followed by the wrapped fragment
+ * from the start of the buffer. buf must have room for param->hl bytes.
+ */
+static inline void get_history(struct dfltcc_param_v0 *param, const unsigned char *history,
+                               unsigned char *buf) {
+    size_t first_len, wrapped_len;
+
+    get_history_lengths(param, &first_len, &wrapped_len);
+    memcpy(buf, history + param->ho, first_len);
+    memcpy(buf + first_len, history, wrapped_len);
+}
--- a/3rdparty/zlib-ng/arch/s390/dfltcc_inflate.c
+++ b/3rdparty/zlib-ng/arch/s390/dfltcc_inflate.c
@ -0,0 +1,191 @@
+/* dfltcc_inflate.c - IBM Z DEFLATE CONVERSION CALL decompression support. */
+
+/*
+   Use the following commands to build zlib-ng with DFLTCC decompression support:
+
+        $ ./configure --with-dfltcc-inflate
+   or
+
+        $ cmake -DWITH_DFLTCC_INFLATE=1 .
+
+   and then
+
+        $ make
+*/
+
+#include "zbuild.h"
+#include "zutil.h"
+#include "inftrees.h"
+#include "inflate.h"
+#include "dfltcc_inflate.h"
+#include "dfltcc_detail.h"
+
+/* Resets the DFLTCC state stored in the arch.common member of the stream's inflate state. */
+void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm) {
+    dfltcc_reset_state(&((struct inflate_state *)strm->state)->arch.common);
+}
+
+/*
+   Returns 1 when the recorded facility bits advertise both the DFLTCC_XPND
+   function and the DFLTCC_FMT0 format, i.e. hardware inflate is usable for
+   this stream; returns 0 otherwise.
+ */
+int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+
+    /* No expand function - hardware cannot be used */
+    if (!is_bit_set(state->arch.common.af.fns, DFLTCC_XPND))
+        return 0;
+
+    /* Expand is available; the parameter block format must be supported as well */
+    return is_bit_set(state->arch.common.af.fmts, DFLTCC_FMT0) != 0;
+}
+
+/*
+   Issues a single DFLTCC_XPND call (circular history buffer type) for the stream.
+   The stream's avail_in/avail_out counters are widened to size_t for the call
+   and written back afterwards; next_in/next_out are passed by address to dfltcc().
+   Returns the condition code produced by dfltcc().
+ */
+static inline dfltcc_cc dfltcc_xpnd(PREFIX3(streamp) strm) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    size_t in_left = strm->avail_in;
+    size_t out_left = strm->avail_out;
+    dfltcc_cc rc = dfltcc(DFLTCC_XPND | HBT_CIRCULAR,
+                          &state->arch.common.param, &strm->next_out, &out_left,
+                          &strm->next_in, &in_left, state->window);
+
+    strm->avail_in = in_left;
+    strm->avail_out = out_left;
+    return rc;
+}
+
+dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret) {   /*
+     * Performs one hardware inflate step for strm and tells the caller how to proceed.
+     *
+     * strm  - stream whose inflate state carries the DFLTCC parameter block.
+     * flush - flush mode requested by the caller; Z_BLOCK and Z_TREES force the software path.
+     * ret   - written only when switching to software fails (set to Z_STREAM_ERROR).
+     *
+     * Returns DFLTCC_INFLATE_SOFTWARE when software inflate must take over,
+     * DFLTCC_INFLATE_BREAK when the caller should stop, and
+     * DFLTCC_INFLATE_CONTINUE when the caller should keep going with the updated state->mode.
+     */
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_state *dfltcc_state = &state->arch.common;
+    struct dfltcc_param_v0 *param = &dfltcc_state->param;
+    dfltcc_cc cc;
+
+    if (flush == Z_BLOCK || flush == Z_TREES) {
+        /* DFLTCC does not support stopping on block boundaries */
+        if (PREFIX(dfltcc_inflate_disable)(strm)) { /* nonzero: cannot fall back to software */
+            *ret = Z_STREAM_ERROR;
+            return DFLTCC_INFLATE_BREAK;
+        } else
+            return DFLTCC_INFLATE_SOFTWARE;
+    }
+
+    if (state->last) { /* a previous call finished with DFLTCC_CC_OK */
+        if (state->bits != 0) { /* skip the partially consumed input byte */
+            strm->next_in++;
+            strm->avail_in--;
+            state->bits = 0;
+        }
+        state->mode = CHECK; /* hand over to the CHECK state of the caller's state machine */
+        return DFLTCC_INFLATE_CONTINUE;
+    }
+
+    if (strm->avail_in == 0 && !param->cf) /* no input and cf clear: nothing to do (cf presumably "continuation flag" - confirm) */
+        return DFLTCC_INFLATE_BREAK;
+
+    /* if window not in use yet, initialize */
+    if (state->wsize == 0)
+        state->wsize = 1U << state->wbits;
+
+    /* Translate stream to parameter block */
+    param->cvt = ((state->wrap & 4) && state->flags) ? CVT_CRC32 : CVT_ADLER32; /* check value type */
+    param->sbb = state->bits; /* bit position inside the current input byte */
+    if (param->hl)
+        param->nt = 0; /* Honor history for the first block */
+    if (state->wrap & 4)
+        param->cv = state->flags ? ZSWAP32(state->check) : state->check; /* byte-swapped when state->flags is nonzero */
+
+    /* Inflate: repeat the call for as long as dfltcc reports DFLTCC_CC_AGAIN */
+    do {
+        cc = dfltcc_xpnd(strm);
+    } while (cc == DFLTCC_CC_AGAIN);
+
+    /* Translate parameter block to stream */
+    strm->msg = oesc_msg(dfltcc_state->msg, param->oesc);
+    state->last = cc == DFLTCC_CC_OK; /* remembered so the next call takes the state->last branch above */
+    state->bits = param->sbb;
+    if (state->wrap & 4)
+        strm->adler = state->check = state->flags ? ZSWAP32(param->cv) : param->cv; /* inverse of the swap done before the call */
+    if (cc == DFLTCC_CC_OP2_CORRUPT && param->oesc != 0) {
+        /* Report an error if stream is corrupted */
+        state->mode = BAD;
+        return DFLTCC_INFLATE_CONTINUE;
+    }
+    state->mode = TYPEDO;
+    /* Break if operands are exhausted, otherwise continue looping */
+    return (cc == DFLTCC_CC_OP1_TOO_SHORT || cc == DFLTCC_CC_OP2_TOO_SHORT) ?
+        DFLTCC_INFLATE_BREAK : DFLTCC_INFLATE_CONTINUE;
+}
+
+/*
+   Returns 1 when the nt field of the stream's parameter block is zero and 0 otherwise.
+   nt is set to 1 when the parameter block is (re)initialized, so a cleared nt
+   is treated as "DFLTCC has already been used on this stream".
+ */
+int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm) {
+    struct dfltcc_param_v0 *param = &((struct inflate_state *)strm->state)->arch.common.param;
+
+    return param->nt == 0;
+}
+
+/*
+   Rotates a circular buffer.
+   The implementation is based on https://cplusplus.com/reference/algorithm/rotate/
+
+   start, pivot and end delimit the byte range [start, end); end itself is never
+   dereferenced. The loop is the swap-based rotate from the reference above, which
+   moves the byte at pivot to start while keeping the cyclic order of all bytes.
+   The rotation is done in place using a single temporary byte.
+   When pivot == start the loop body never runs and the buffer is left untouched.
+ */
+static void rotate(unsigned char *start, unsigned char *pivot, unsigned char *end) {
+    unsigned char *p = pivot; /* read cursor, walks [pivot, end) */
+    unsigned char tmp;
+
+    while (p != start) {
+        tmp = *start; /* swap *start and *p */
+        *start = *p;
+        *p = tmp;
+
+        start++;
+        p++;
+
+        if (p == end)
+            p = pivot; /* read cursor hit the end: restart at the current pivot */
+        else if (start == pivot)
+            pivot = p; /* write cursor reached the pivot: the remaining range starts at p */
+    }
+}
+
+int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm) {   /*
+     * Switches the stream from hardware (DFLTCC) inflate to software inflate.
+     * Returns 0 when software inflate can be used from now on (including the case
+     * where DFLTCC was not usable in the first place), and 1 when the switch is
+     * impossible because DFLTCC has already been used on this stream.
+     */
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_state *dfltcc_state = &state->arch.common;
+    struct dfltcc_param_v0 *param = &dfltcc_state->param;
+
+    if (!PREFIX(dfltcc_can_inflate)(strm)) /* hardware path not active: nothing to disable */
+        return 0;
+    if (PREFIX(dfltcc_was_inflate_used)(strm))
+        /* DFLTCC has already decompressed some data. Since there is not
+         * enough information to resume decompression in software, the call
+         * must fail.
+         */
+        return 1;
+    /* DFLTCC was not used yet - decompress in software */
+    memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af)); /* clears the facility bits that dfltcc_can_inflate() tests */
+    /* Convert the window from the hardware to the software format */
+    rotate(state->window, state->window + param->ho, state->window + HB_SIZE); /* bring the byte at offset ho to the front */
+    state->whave = state->wnext = MIN(param->hl, state->wsize); /* history length, capped at the window size */
+    return 0;
+}
+
+/*
+   Preloads the DFLTCC history with a dictionary.
+   The dictionary bytes are appended to the circular history kept in
+   state->window and the stream is marked as having a dictionary.
+   Always returns Z_OK.
+*/
+int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm,
+                                                     const unsigned char *dictionary, uInt dict_length) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+
+    /* Derive the window size from wbits the first time the window is needed */
+    if (!state->wsize)
+        state->wsize = 1U << state->wbits;
+
+    append_history(&state->arch.common.param, state->window, dictionary, dict_length);
+    state->havedict = 1;
+
+    return Z_OK;
+}
+
+/*
+   Reports the current DFLTCC history as a dictionary.
+   When both dictionary and the stream window are non-NULL, the history is
+   copied into dictionary in linear order. When dict_length is non-NULL it
+   receives the history length (param->hl). Always returns Z_OK.
+ */
+int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm,
+                                                     unsigned char *dictionary, uInt *dict_length) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_param_v0 *pb = &state->arch.common.param;
+
+    if (dictionary != NULL && state->window != NULL)
+        get_history(pb, state->window, dictionary);
+
+    if (dict_length != NULL)
+        *dict_length = pb->hl;
+
+    return Z_OK;
+}
--- a/3rdparty/zlib-ng/arch/s390/dfltcc_inflate.h
+++ b/3rdparty/zlib-ng/arch/s390/dfltcc_inflate.h
@ -0,0 +1,67 @@
+#ifndef DFLTCC_INFLATE_H
+#define DFLTCC_INFLATE_H
+
+#include "dfltcc_common.h"
+
+void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm); /* resets the DFLTCC state kept in the inflate state */
+int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm); /* nonzero when the XPND function and FMT0 format bits are both set */
+typedef enum {
+    DFLTCC_INFLATE_CONTINUE, /* INFLATE_TYPEDO_HOOK executes `break` */
+    DFLTCC_INFLATE_BREAK, /* INFLATE_TYPEDO_HOOK executes `goto inf_leave` */
+    DFLTCC_INFLATE_SOFTWARE, /* INFLATE_TYPEDO_HOOK takes no action; control continues after the hook */
+} dfltcc_inflate_action;
+dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret); /* one hardware inflate step */
+int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm); /* nonzero once param.nt has been cleared */
+int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm); /* 0: software inflate may proceed, 1: switch impossible */
+int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm,
+                                                     const unsigned char *dictionary, uInt dict_length); /* appends dictionary to the history; returns Z_OK */
+int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm,
+                                                     unsigned char *dictionary, uInt* dict_length); /* copies the history out; returns Z_OK */
+/* Hook macros below are expanded by the generic inflate code; several of them return from the enclosing function. */
+#define INFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_inflate_state)
+/* Returns Z_STREAM_ERROR from the enclosing function when switching to software inflate fails; bits and value are unused. */
+#define INFLATE_PRIME_HOOK(strm, bits, value) \
+    do { if (PREFIX(dfltcc_inflate_disable)((strm))) return Z_STREAM_ERROR; } while (0)
+/* When DFLTCC is usable, runs one hardware step between RESTORE() and LOAD() and acts on the returned action.
+ * Relies on `ret`, the `inf_leave` label and an enclosing breakable statement in the expanding context. */
+#define INFLATE_TYPEDO_HOOK(strm, flush) \
+    if (PREFIX(dfltcc_can_inflate)((strm))) { \
+        dfltcc_inflate_action action; \
+\
+        RESTORE(); \
+        action = PREFIX(dfltcc_inflate)((strm), (flush), &ret); \
+        LOAD(); \
+        if (action == DFLTCC_INFLATE_CONTINUE) \
+            break; \
+        else if (action == DFLTCC_INFLATE_BREAK) \
+            goto inf_leave; \
+    }
+/* Evaluates to nonzero only when DFLTCC is not usable for the stream. */
+#define INFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_inflate)((strm)))
+/* Evaluates to nonzero only when DFLTCC is not usable for the stream. */
+#define INFLATE_NEED_UPDATEWINDOW(strm) (!PREFIX(dfltcc_can_inflate)((strm)))
+/* Returns -(1L << 16) from the enclosing function once DFLTCC has been used on the stream. */
+#define INFLATE_MARK_HOOK(strm) \
+    do { \
+        if (PREFIX(dfltcc_was_inflate_used)((strm))) return -(1L << 16); \
+    } while (0)
+/* Returns Z_STREAM_ERROR from the enclosing function once DFLTCC has been used on the stream. */
+#define INFLATE_SYNC_POINT_HOOK(strm) \
+    do { \
+        if (PREFIX(dfltcc_was_inflate_used)((strm))) return Z_STREAM_ERROR; \
+    } while (0)
+/* When DFLTCC is usable, returns the result of the DFLTCC set-dictionary routine from the enclosing function. */
+#define INFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
+    do { \
+        if (PREFIX(dfltcc_can_inflate)((strm))) \
+            return PREFIX(dfltcc_inflate_set_dictionary)((strm), (dict), (dict_len)); \
+    } while (0)
+/* When DFLTCC is usable, returns the result of the DFLTCC get-dictionary routine from the enclosing function. */
+#define INFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \
+    do { \
+        if (PREFIX(dfltcc_can_inflate)((strm))) \
+            return PREFIX(dfltcc_inflate_get_dictionary)((strm), (dict), (dict_len)); \
+    } while (0)
+/* Yields the larger of n and HB_SIZE; note that MAX may evaluate n more than once. */
+#define INFLATE_ADJUST_WINDOW_SIZE(n) MAX(n, HB_SIZE)
+
+#endif
--- a/3rdparty/zlib-ng/arch/s390/s390_features.c
+++ b/3rdparty/zlib-ng/arch/s390/s390_features.c
@ -0,0 +1,14 @@
+#include "zbuild.h"
+#include "s390_features.h"
+
+#ifdef HAVE_SYS_AUXV_H
+#  include <sys/auxv.h>
+#endif
+
+#ifndef HWCAP_S390_VXRS
+#define HWCAP_S390_VXRS HWCAP_S390_VX
+#endif
+
+/* Fills *features from the AT_HWCAP auxiliary vector entry: has_vx receives the
+ * HWCAP_S390_VXRS bit (nonzero when the bit is set, zero otherwise). */
+void Z_INTERNAL s390_check_features(struct s390_cpu_features *features) {
+    unsigned long hwcap = getauxval(AT_HWCAP);
+
+    features->has_vx = hwcap & HWCAP_S390_VXRS;
+}
--- a/3rdparty/zlib-ng/arch/s390/s390_features.h
+++ b/3rdparty/zlib-ng/arch/s390/s390_features.h
@ -0,0 +1,14 @@
+/* s390_features.h -- check for s390 features.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef S390_FEATURES_H_
+#define S390_FEATURES_H_
+
+struct s390_cpu_features { /* CPU capabilities reported by s390_check_features() */
+    int has_vx; /* nonzero when the HWCAP_S390_VXRS bit is set in AT_HWCAP */
+};
+
+void Z_INTERNAL s390_check_features(struct s390_cpu_features *features);
+
+#endif
--- a/3rdparty/zlib-ng/arch/s390/s390_functions.h
+++ b/3rdparty/zlib-ng/arch/s390/s390_functions.h
@ -0,0 +1,20 @@
+/* s390_functions.h -- s390 implementations for arch-specific functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef S390_FUNCTIONS_H_
+#define S390_FUNCTIONS_H_
+
+#ifdef S390_CRC32_VX
+uint32_t crc32_s390_vx(uint32_t crc, const uint8_t *buf, size_t len);
+#endif
+
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+/* Without runtime CPU detection, bind native_crc32 directly to the VX implementation
+ * when the compiler reports __zarch__, __ARCH__ >= 11 and __VX__.
+ * The replacement list must be the bare function name: an object-like macro takes no
+ * `=`, and including one would expand native_crc32(...) to `= crc32_s390_vx(...)`. */
+#  if defined(S390_CRC32_VX) && defined(__zarch__) && __ARCH__ >= 11 && defined(__VX__)
+#    undef native_crc32
+#    define native_crc32 crc32_s390_vx
+#  endif
+#endif
+
+#endif
--- a/3rdparty/zlib-ng/arch/s390/self-hosted-builder/actions-runner.Dockerfile
+++ b/3rdparty/zlib-ng/arch/s390/self-hosted-builder/actions-runner.Dockerfile
@ -0,0 +1,47 @@
+# Self-Hosted IBM Z GitHub Actions Runner.
+
+FROM    almalinux:9
+
+RUN     dnf update -y -q && \
+        dnf install -y -q --enablerepo=crb wget git which sudo jq \
+            cmake make automake autoconf m4 libtool ninja-build python3-pip \
+            gcc gcc-c++ clang llvm-toolset glibc-all-langpacks langpacks-en \
+            glibc-static libstdc++-static libstdc++-devel libxslt-devel libxml2-devel
+
+RUN     dnf install -y -q dotnet-sdk-6.0 && \
+        echo "Using SDK - `dotnet --version`"
+
+COPY    runner-s390x.patch /tmp/runner.patch
+COPY    runner-global.json /tmp/global.json
+
+RUN     cd /tmp && \
+        git clone -q https://github.com/actions/runner && \
+        cd runner && \
+        git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) -b build && \
+        git apply /tmp/runner.patch && \
+        cp -f /tmp/global.json src/global.json
+
+
+RUN     cd /tmp/runner/src && \
+        ./dev.sh layout && \
+        ./dev.sh package && \
+        rm -rf /root/.dotnet /root/.nuget
+
+RUN     useradd -c "Action Runner" -m actions-runner && \
+        usermod -L actions-runner
+
+RUN     tar -xf /tmp/runner/_package/*.tar.gz -C /home/actions-runner && \
+        chown -R actions-runner:actions-runner /home/actions-runner
+
+#VOLUME  /home/actions-runner
+
+RUN     rm -rf /tmp/runner /var/cache/dnf/* /tmp/runner.patch /tmp/global.json && \
+        dnf clean all
+
+USER    actions-runner
+
+# Scripts.
+COPY    fs/ /
+WORKDIR /home/actions-runner
+ENTRYPOINT ["/usr/bin/entrypoint"]
+CMD     ["/usr/bin/actions-runner"]
--- a/3rdparty/zlib-ng/arch/s390/self-hosted-builder/actions-runner.service
+++ b/3rdparty/zlib-ng/arch/s390/self-hosted-builder/actions-runner.service
@ -0,0 +1,18 @@
+[Unit]
+Description=Podman container: Gaplib Github Actions Runner
+Wants=network-online.target
+After=network-online.target
+StartLimitIntervalSec=1
+RequiresMountsFor=/run/user/1001/containers
+
+[Service]
+Environment=PODMAN_SYSTEMD_UNIT=%n
+Restart=always
+TimeoutStopSec=61
+ExecStart=/usr/bin/podman start gaplib-actions-runner
+ExecStop=/usr/bin/podman stop -t 1 gaplib-actions-runner
+ExecStopPost=/usr/bin/podman stop -t 1 gaplib-actions-runner
+Type=forking
+
+[Install]
+WantedBy=default.target
--- a/3rdparty/zlib-ng/arch/s390/self-hosted-builder/runner-global.json
+++ b/3rdparty/zlib-ng/arch/s390/self-hosted-builder/runner-global.json
@ -0,0 +1,5 @@
+{
+  "sdk": {
+    "version": "6.0.421"
+  }
+}
--- a/3rdparty/zlib-ng/arch/s390/self-hosted-builder/runner-s390x.patch
+++ b/3rdparty/zlib-ng/arch/s390/self-hosted-builder/runner-s390x.patch
@ -0,0 +1,243 @@
+diff --git a/src/Directory.Build.props b/src/Directory.Build.props
+index 9db5fac..f02e235 100644
+--- a/src/Directory.Build.props
+++ b/src/Directory.Build.props
+@@ -44,6 +44,9 @@
+   <PropertyGroup Condition="'$(BUILD_OS)' == 'Linux' AND '$(PackageRuntime)' == 'linux-arm64'">
+     <DefineConstants>$(DefineConstants);ARM64</DefineConstants>
+   </PropertyGroup>
+  <PropertyGroup Condition="'$(BUILD_OS)' == 'Linux' AND '$(PackageRuntime)' == 'linux-s390x'">
+    <DefineConstants>$(DefineConstants);S390X</DefineConstants>
+  </PropertyGroup>
+ 
+   <!-- Set TRACE/DEBUG vars -->
+   <PropertyGroup>
+diff --git a/src/Misc/externals.sh b/src/Misc/externals.sh
+index 383221e..1555f67 100755
+--- a/src/Misc/externals.sh
+++ b/src/Misc/externals.sh
+@@ -189,3 +189,8 @@ if [[ "$PACKAGERUNTIME" == "linux-arm" ]]; then
+     acquireExternalTool "$NODE_URL/v${NODE16_VERSION}/node-v${NODE16_VERSION}-linux-armv7l.tar.gz" node16 fix_nested_dir
+     acquireExternalTool "$NODE_URL/v${NODE20_VERSION}/node-v${NODE20_VERSION}-linux-armv7l.tar.gz" node20 fix_nested_dir
+ fi
+
+if [[ "$PACKAGERUNTIME" == "linux-s390x" ]]; then
+    acquireExternalTool "$NODE_URL/v${NODE16_VERSION}/node-v${NODE16_VERSION}-linux-s390x.tar.gz" node16 fix_nested_dir
+    acquireExternalTool "$NODE_URL/v${NODE20_VERSION}/node-v${NODE20_VERSION}-linux-s390x.tar.gz" node20 fix_nested_dir
+fi
+diff --git a/src/Misc/layoutroot/config.sh b/src/Misc/layoutroot/config.sh
+index 14cc6ba..9b5b8e6 100755
+--- a/src/Misc/layoutroot/config.sh
+++ b/src/Misc/layoutroot/config.sh
+@@ -20,25 +20,29 @@ then
+ 
+     message="Execute sudo ./bin/installdependencies.sh to install any missing Dotnet Core 6.0 dependencies."
+ 
+-    ldd ./bin/libcoreclr.so | grep 'not found'
+-    if [ $? -eq 0 ]; then
+-        echo "Dependencies is missing for Dotnet Core 6.0"
+-        echo $message
+-        exit 1
+-    fi
+    ARCH=`uname -m`
+    if [ "${ARCH}" != "s390x" -a "${ARCH}" != "ppc64le" ]
+    then
+        ldd ./bin/libcoreclr.so | grep 'not found'
+        if [ $? -eq 0 ]; then
+            echo "Dependencies is missing for Dotnet Core 6.0"
+            echo $message
+            exit 1
+        fi
+ 
+-    ldd ./bin/libSystem.Security.Cryptography.Native.OpenSsl.so | grep 'not found'
+-    if [ $? -eq 0 ]; then
+-        echo "Dependencies is missing for Dotnet Core 6.0"
+-        echo $message
+-        exit 1
+-    fi
+        ldd ./bin/libSystem.Security.Cryptography.Native.OpenSsl.so | grep 'not found'
+        if [ $? -eq 0 ]; then
+            echo "Dependencies is missing for Dotnet Core 6.0"
+            echo $message
+            exit 1
+        fi
+ 
+-    ldd ./bin/libSystem.IO.Compression.Native.so | grep 'not found'
+-    if [ $? -eq 0 ]; then
+-        echo "Dependencies is missing for Dotnet Core 6.0"
+-        echo $message
+-        exit 1
+        ldd ./bin/libSystem.IO.Compression.Native.so | grep 'not found'
+        if [ $? -eq 0 ]; then
+            echo "Dependencies is missing for Dotnet Core 6.0"
+            echo $message
+            exit 1
+        fi
+     fi
+ 
+     if ! [ -x "$(command -v ldconfig)" ]; then
+diff --git a/src/Runner.Common/Constants.cs b/src/Runner.Common/Constants.cs
+index 177e3c9..9545981 100644
+--- a/src/Runner.Common/Constants.cs
+++ b/src/Runner.Common/Constants.cs
+@@ -58,7 +58,8 @@ namespace GitHub.Runner.Common
+             X86,
+             X64,
+             Arm,
+-            Arm64
+            Arm64,
+	    S390x
+         }
+ 
+         public static class Runner
+@@ -81,6 +82,8 @@ namespace GitHub.Runner.Common
+             public static readonly Architecture PlatformArchitecture = Architecture.Arm;
+ #elif ARM64
+             public static readonly Architecture PlatformArchitecture = Architecture.Arm64;
+#elif S390X
+            public static readonly Architecture PlatformArchitecture = Architecture.S390x;
+ #else
+             public static readonly Architecture PlatformArchitecture = Architecture.X64;
+ #endif
+diff --git a/src/Runner.Common/Util/VarUtil.cs b/src/Runner.Common/Util/VarUtil.cs
+index 97273a1..2a34430 100644
+--- a/src/Runner.Common/Util/VarUtil.cs
+++ b/src/Runner.Common/Util/VarUtil.cs
+@@ -53,6 +53,8 @@ namespace GitHub.Runner.Common.Util
+                         return "ARM";
+                     case Constants.Architecture.Arm64:
+                         return "ARM64";
+                    case Constants.Architecture.S390x:
+                        return "S390X";
+                     default:
+                         throw new NotSupportedException(); // Should never reach here.
+                 }
+diff --git a/src/Test/L0/ConstantGenerationL0.cs b/src/Test/L0/ConstantGenerationL0.cs
+index 2042485..a9d8b46 100644
+--- a/src/Test/L0/ConstantGenerationL0.cs
+++ b/src/Test/L0/ConstantGenerationL0.cs
+@@ -20,6 +20,7 @@ namespace GitHub.Runner.Common.Tests
+                 "linux-x64",
+                 "linux-arm",
+                 "linux-arm64",
+                "linux-s390x",
+                 "osx-x64",
+                 "osx-arm64"
+             };
+diff --git a/src/Test/L0/Listener/SelfUpdaterL0.cs b/src/Test/L0/Listener/SelfUpdaterL0.cs
+index 26ba65e..6791df3 100644
+--- a/src/Test/L0/Listener/SelfUpdaterL0.cs
+++ b/src/Test/L0/Listener/SelfUpdaterL0.cs
+@@ -1,4 +1,4 @@
+-#if !(OS_WINDOWS && ARM64)
+#if !(OS_WINDOWS && ARM64) && !S390X
+ using System;
+ using System.Collections.Generic;
+ using System.IO;
+@@ -16,6 +16,7 @@ using Xunit;
+ 
+ namespace GitHub.Runner.Common.Tests.Listener
+ {
+#if !S390X // Self-update is not currently supported on S390X
+     public sealed class SelfUpdaterL0
+     {
+         private Mock<IRunnerServer> _runnerServer;
+@@ -291,5 +292,6 @@ namespace GitHub.Runner.Common.Tests.Listener
+             }
+         }
+     }
+#endif
+ }
+ #endif
+diff --git a/src/Test/L0/Listener/SelfUpdaterV2L0.cs b/src/Test/L0/Listener/SelfUpdaterV2L0.cs
+index 5115a6b..dd8d198 100644
+--- a/src/Test/L0/Listener/SelfUpdaterV2L0.cs
+++ b/src/Test/L0/Listener/SelfUpdaterV2L0.cs
+@@ -1,4 +1,4 @@
+-#if !(OS_WINDOWS && ARM64)
+#if !(OS_WINDOWS && ARM64) && !S390X
+ using System;
+ using System.Collections.Generic;
+ using System.IO;
+diff --git a/src/Test/L0/Worker/StepHostL0.cs b/src/Test/L0/Worker/StepHostL0.cs
+index f6b5889..26f8e21 100644
+--- a/src/Test/L0/Worker/StepHostL0.cs
+++ b/src/Test/L0/Worker/StepHostL0.cs
+@@ -31,7 +31,7 @@ namespace GitHub.Runner.Common.Tests.Worker
+             return hc;
+         }
+ 
+-#if OS_LINUX
+#if OS_LINUX && !S390X
+         [Fact]
+         [Trait("Level", "L0")]
+         [Trait("Category", "Worker")]
+diff --git a/src/dev.sh b/src/dev.sh
+index fa637d1..8c66f37 100755
+--- a/src/dev.sh
+++ b/src/dev.sh
+@@ -54,6 +54,7 @@ elif [[ "$CURRENT_PLATFORM" == 'linux' ]]; then
+         case $CPU_NAME in
+             armv7l) RUNTIME_ID="linux-arm";;
+             aarch64) RUNTIME_ID="linux-arm64";;
+            s390x) RUNTIME_ID="linux-s390x";;
+         esac
+     fi
+ elif [[ "$CURRENT_PLATFORM" == 'darwin' ]]; then
+@@ -80,7 +81,7 @@ if [[ "$CURRENT_PLATFORM" == 'windows' ]]; then
+         exit 1
+     fi
+ elif [[ "$CURRENT_PLATFORM" == 'linux' ]]; then
+-    if [[ ("$RUNTIME_ID" != 'linux-x64') && ("$RUNTIME_ID" != 'linux-x86') && ("$RUNTIME_ID" != 'linux-arm64') && ("$RUNTIME_ID" != 'linux-arm') ]]; then
+	if [[ ("$RUNTIME_ID" != 'linux-x64') && ("$RUNTIME_ID" != 'linux-x86') && ("$RUNTIME_ID" != 'linux-arm64') && ("$RUNTIME_ID" != 'linux-arm')  && ("$RUNTIME_ID" != 'linux-s390x') ]]; then
+        echo "Failed: Can't build $RUNTIME_ID package $CURRENT_PLATFORM" >&2
+        exit 1
+     fi
+@@ -199,7 +200,8 @@ function package ()
+     popd > /dev/null
+ }
+ 
+-if [[ (! -d "${DOTNETSDK_INSTALLDIR}") || (! -e "${DOTNETSDK_INSTALLDIR}/.${DOTNETSDK_VERSION}") || (! -e "${DOTNETSDK_INSTALLDIR}/dotnet") ]]; then
+if [[ "${RUNTIME_ID}" != "linux-s390x" && ((! -d "${DOTNETSDK_INSTALLDIR}") || (! -e "${DOTNETSDK_INSTALLDIR}/.${DOTNETSDK_VERSION}") || (! -e "${DOTNETSDK_INSTALLDIR}/dotnet")) ]]; then
+
+ 
+     # Download dotnet SDK to ../_dotnetsdk directory
+     heading "Ensure Dotnet SDK"
+@@ -224,8 +226,10 @@ if [[ (! -d "${DOTNETSDK_INSTALLDIR}") || (! -e "${DOTNETSDK_INSTALLDIR}/.${DOTN
+     echo "${DOTNETSDK_VERSION}" > "${DOTNETSDK_INSTALLDIR}/.${DOTNETSDK_VERSION}"
+ fi
+ 
+-echo "Prepend ${DOTNETSDK_INSTALLDIR} to %PATH%"
+-export PATH=${DOTNETSDK_INSTALLDIR}:$PATH
+if [[ -d "${DOTNETSDK_INSTALLDIR}" ]]; then
+    echo "Prepend ${DOTNETSDK_INSTALLDIR} to %PATH%"
+    export PATH=${DOTNETSDK_INSTALLDIR}:$PATH
+fi
+ 
+ heading "Dotnet SDK Version"
+ dotnet --version
+diff --git a/src/dir.proj b/src/dir.proj
+index 056a312..8370922 100644
+--- a/src/dir.proj
+++ b/src/dir.proj
+@@ -41,8 +41,18 @@
+     </ItemGroup>
+ 
+     <Target Name="Build" DependsOnTargets="GenerateConstant">
+-        <MSBuild Targets="Restore" Projects="@(ProjectFiles)" StopOnFirstFailure="true" />
+-        <MSBuild Targets="Publish" Projects="@(ProjectFiles)" BuildInParallel="false" StopOnFirstFailure="true" Properties="Configuration=$(BUILDCONFIG);PackageRuntime=$(PackageRuntime);Version=$(RunnerVersion);RuntimeIdentifier=$(PackageRuntime);PublishDir=$(MSBuildProjectDirectory)/../_layout/bin" />
+        <PropertyGroup>
+            <!-- Normally we want to publish a self-contained app for $(PackageRuntime) -->
+            <PublishRuntimeIdentifier>RuntimeIdentifier=$(PackageRuntime)</PublishRuntimeIdentifier>
+            <!-- However, on s390x there are no apphost or runtime packages on nuget.org, so self-contained publishing is not supported.
+                 Perform a non-self-contained publish using the current runtime identifier (normally something like rhel.8-s390x) instead.
+                 In addition, when not using an explicit runtime identifier, the SDK will copy runtime assets from dependent packages;
+                 as this would confuse the expected layout, disable that behavior as well.  -->
+            <PublishRuntimeIdentifier Condition="'$(PackageRuntime)' == 'linux-s390x'">SelfContained=false;CopyLocalRuntimeTargetAssets=false</PublishRuntimeIdentifier>
+        </PropertyGroup>
+
+        <MSBuild Targets="Restore" Projects="@(ProjectFiles)" StopOnFirstFailure="true" Properties="$(PublishRuntimeIdentifier)" />
+        <MSBuild Targets="Publish" Projects="@(ProjectFiles)" BuildInParallel="false" StopOnFirstFailure="true" Properties="Configuration=$(BUILDCONFIG);PackageRuntime=$(PackageRuntime);Version=$(RunnerVersion);$(PublishRuntimeIdentifier);PublishDir=$(MSBuildProjectDirectory)/../_layout/bin" />
+         <Exec Command="%22$(DesktopMSBuild)%22 Runner.Service/Windows/RunnerService.csproj /p:Configuration=$(BUILDCONFIG) /p:PackageRuntime=$(PackageRuntime) /p:OutputPath=%22$(MSBuildProjectDirectory)/../_layout/bin%22" ConsoleToMSBuild="true" Condition="'$(PackageRuntime)' == 'win-x64' Or '$(PackageRuntime)' == 'win-x86' Or '$(PackageRuntime)' == 'win-arm64'" />
+     </Target>
+
--- a/3rdparty/zlib-ng/arch/x86/Makefile.in
+++ b/3rdparty/zlib-ng/arch/x86/Makefile.in
@ -35,7 +35,6 @@ all: \
 	chunkset_ssse3.o chunkset_ssse3.lo \
 	compare256_avx2.o compare256_avx2.lo \
 	compare256_sse2.o compare256_sse2.lo \
-	insert_string_sse42.o insert_string_sse42.lo \
 	crc32_pclmulqdq.o crc32_pclmulqdq.lo \
 	crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \
 	slide_hash_avx2.o slide_hash_avx2.lo \
@ -77,12 +76,6 @@ compare256_sse2.o:
 compare256_sse2.lo:
 	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c

-insert_string_sse42.o:
-	$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c
-
-insert_string_sse42.lo:
-	$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c
-
 crc32_pclmulqdq.o:
 	$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c

@ -90,10 +83,10 @@ crc32_pclmulqdq.lo:
 	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c

 crc32_vpclmulqdq.o:
-	$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
+	$(CC) $(CFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c

 crc32_vpclmulqdq.lo:
-	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
+	$(CC) $(SFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c

 slide_hash_avx2.o:
 	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c
--- a/3rdparty/zlib-ng/arch/x86/adler32_avx2.c
+++ b/3rdparty/zlib-ng/arch/x86/adler32_avx2.c
@ -9,24 +9,15 @@

 #ifdef X86_AVX2

-#include "../../zbuild.h"
+#include "zbuild.h"
 #include <immintrin.h>
-#include "../../adler32_fold.h"
-#include "../../adler32_p.h"
+#include "adler32_p.h"
 #include "adler32_avx2_p.h"
 #include "x86_intrins.h"

-#ifdef X86_SSE42
 extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
 extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);

-#define copy_sub32(a, b, c, d) adler32_fold_copy_sse42(a, b, c, d)
-#define sub32(a, b, c) adler32_ssse3(a, b, c)
-#else
-#define copy_sub32(a, b, c, d) adler32_copy_len_16(adler0, c, b, d, adler1)
-#define sub32(a, b, c) adler32_len_16(adler0, b, c, adler1)
-#endif
-
 static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
    if (src == NULL) return 1L;
    if (len == 0) return adler;
@ -44,9 +35,9 @@ rem_peel:
        }
    } else if (len < 32) {
        if (COPY) {
-            return copy_sub32(adler, dst, src, len);
+            return adler32_fold_copy_sse42(adler, dst, src, len);
        } else {
-            return sub32(adler, src, len);
+            return adler32_ssse3(adler, src, len);
        }
    }

--- a/3rdparty/zlib-ng/arch/x86/adler32_avx512.c
+++ b/3rdparty/zlib-ng/arch/x86/adler32_avx512.c
@ -8,10 +8,9 @@

 #ifdef X86_AVX512

-#include "../../zbuild.h"
-#include "../../adler32_p.h"
-#include "../../adler32_fold.h"
-#include "../../cpu_features.h"
+#include "zbuild.h"
+#include "adler32_p.h"
+#include "arch_functions.h"
 #include <immintrin.h>
 #include "x86_intrins.h"
 #include "adler32_avx512_p.h"
@ -33,13 +32,7 @@ rem_peel:
            _mm512_mask_storeu_epi8(dst, storemask, copy_vec);
        }

-#ifdef X86_AVX2
        return adler32_avx2(adler, src, len);
-#elif defined(X86_SSSE3)
-        return adler32_ssse3(adler, src, len);
-#else
-        return adler32_len_16(adler0, src, len, adler1);
-#endif
    }

    __m512i vbuf, vs1_0, vs3;
--- a/3rdparty/zlib-ng/arch/x86/adler32_avx512_vnni.c
+++ b/3rdparty/zlib-ng/arch/x86/adler32_avx512_vnni.c
@ -9,11 +9,10 @@

 #ifdef X86_AVX512VNNI

-#include "../../zbuild.h"
-#include "../../adler32_p.h"
-#include "../../cpu_features.h"
+#include "zbuild.h"
+#include "adler32_p.h"
+#include "arch_functions.h"
 #include <immintrin.h>
-#include "../../adler32_fold.h"
 #include "x86_intrins.h"
 #include "adler32_avx512_p.h"
 #include "adler32_avx2_p.h"
@ -28,20 +27,10 @@ Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size

 rem_peel:
    if (len < 32)
-#if defined(X86_SSSE3)
        return adler32_ssse3(adler, src, len);
-#else
-        return adler32_len_16(adler0, src, len, adler1);
-#endif

    if (len < 64)
-#ifdef X86_AVX2
        return adler32_avx2(adler, src, len);
-#elif defined(X86_SSE3)
-        return adler32_ssse3(adler, src, len);
-#else
-        return adler32_len_16(adler0, src, len, adler1);
-#endif

    const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
@ -135,11 +124,7 @@ rem_peel_copy:
        __m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
        _mm256_mask_storeu_epi8(dst, storemask, copy_vec);

-#if defined(X86_SSSE3)
        return adler32_ssse3(adler, src, len);
-#else
-        return adler32_len_16(adler0, src, len, adler1);
-#endif
    }

    const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
--- a/3rdparty/zlib-ng/arch/x86/adler32_sse42.c
+++ b/3rdparty/zlib-ng/arch/x86/adler32_sse42.c
@ -6,9 +6,8 @@
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

-#include "../../zbuild.h"
-#include "../../adler32_p.h"
-#include "../../adler32_fold.h"
+#include "zbuild.h"
+#include "adler32_p.h"
 #include "adler32_ssse3_p.h"
 #include <immintrin.h>

--- a/3rdparty/zlib-ng/arch/x86/adler32_ssse3.c
+++ b/3rdparty/zlib-ng/arch/x86/adler32_ssse3.c
@ -6,8 +6,8 @@
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

-#include "../../zbuild.h"
-#include "../../adler32_p.h"
+#include "zbuild.h"
+#include "adler32_p.h"
 #include "adler32_ssse3_p.h"

 #ifdef X86_SSSE3
--- a/3rdparty/zlib-ng/arch/x86/chunkset_ssse3.c
+++ b/3rdparty/zlib-ng/arch/x86/chunkset_ssse3.c
@ -4,10 +4,7 @@

 #include "zbuild.h"

-/* This requires SSE2 support. While it's implicit with SSSE3, we can minimize
- * code size by sharing the chunkcopy functions, which will certainly compile
- * to identical machine code */
-#if defined(X86_SSSE3) && defined(X86_SSE2)
+#if defined(X86_SSSE3)
 #include <immintrin.h>
 #include "../generic/chunk_permute_table.h"

@ -19,8 +16,6 @@ typedef __m128i chunk_t;
 #define HAVE_CHUNKMEMSET_4
 #define HAVE_CHUNKMEMSET_8
 #define HAVE_CHUNK_MAG
-#define HAVE_CHUNKCOPY
-#define HAVE_CHUNKUNROLL

 static const lut_rem_pair perm_idx_lut[13] = {
    {0, 1},      /* 3 */
@ -83,14 +78,11 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
    return ret_vec;
 }

-extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
-extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
-
 #define CHUNKSIZE        chunksize_ssse3
 #define CHUNKMEMSET      chunkmemset_ssse3
 #define CHUNKMEMSET_SAFE chunkmemset_safe_ssse3
-#define CHUNKCOPY        chunkcopy_sse2
-#define CHUNKUNROLL      chunkunroll_sse2
+#define CHUNKCOPY        chunkcopy_ssse3
+#define CHUNKUNROLL      chunkunroll_ssse3

 #include "chunkset_tpl.h"

--- a/3rdparty/zlib-ng/arch/x86/compare256_avx2.c
+++ b/3rdparty/zlib-ng/arch/x86/compare256_avx2.c
@ -3,8 +3,9 @@
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

-#include "../../zbuild.h"
-
+#include "zbuild.h"
+#include "zutil_p.h"
+#include "deflate.h"
 #include "fallback_builtins.h"

 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
--- a/3rdparty/zlib-ng/arch/x86/compare256_sse2.c
+++ b/3rdparty/zlib-ng/arch/x86/compare256_sse2.c
@ -3,8 +3,9 @@
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

-#include "../../zbuild.h"
-
+#include "zbuild.h"
+#include "zutil_p.h"
+#include "deflate.h"
 #include "fallback_builtins.h"

 #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
--- a/3rdparty/zlib-ng/arch/x86/crc32_fold_pclmulqdq_tpl.h
+++ b/3rdparty/zlib-ng/arch/x86/crc32_fold_pclmulqdq_tpl.h
@ -26,27 +26,26 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
    __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
    __m128i xmm_crc_part = _mm_setzero_si128();
-#ifdef COPY
    char ALIGNED_(16) partial_buf[16] = { 0 };
-#else
+#ifndef COPY
    __m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
    int32_t first = init_crc != 0;

-    /* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31
-     * bytes of input is needed for the aligning load that occurs.  If there's an initial CRC, to
-     * carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which
-     * by definition can be up to 15 bytes + one full vector load. */
-    assert(len >= 31 || first == 0);
+    /* The CRC functions don't call this for input < 16, as a minimum of 16 bytes of input is needed
+     * for the aligning load that occurs.  If there's an initial CRC, to carry it forward through
+     * the folded CRC there must be 16 - src % 16 + 16 bytes available, which by definition can be
+     * up to 15 bytes + one full vector load. */
+    assert(len >= 16 || first == 0);
 #endif
    crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

    if (len < 16) {
-#ifdef COPY
        if (len == 0)
            return;

        memcpy(partial_buf, src, len);
        xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
+#ifdef COPY
        memcpy(dst, partial_buf, len);
 #endif
        goto partial;
@ -63,9 +62,23 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint

        if (algn_diff < 4 && init_crc != 0) {
            xmm_t0 = xmm_crc_part;
-            xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
-            fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
-            xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
+            if (len >= 32) {
+                xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
+                fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+                xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
+            } else {
+                memcpy(partial_buf, src + 16, len - 16);
+                xmm_crc_part = _mm_load_si128((__m128i*)partial_buf);
+                fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+                xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
+                src += 16;
+                len -= 16;
+#ifdef COPY
+                dst -= algn_diff;
+#endif
+                goto partial;
+            }
+
            src += 16;
            len -= 16;
        }
--- a/3rdparty/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h
+++ b/3rdparty/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h
@ -17,7 +17,7 @@
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

-#include "../../zbuild.h"
+#include "zbuild.h"

 #include <immintrin.h>
 #include <wmmintrin.h>
@ -26,8 +26,9 @@
 #  include <immintrin.h>
 #endif

-#include "../../crc32_fold.h"
-#include "../../crc32_braid_p.h"
+#include "crc32.h"
+#include "crc32_braid_p.h"
+#include "crc32_braid_tbl.h"
 #include "x86_intrins.h"
 #include <assert.h>

@ -350,11 +351,22 @@ Z_INTERNAL uint32_t CRC32_FOLD_FINAL(crc32_fold *crc) {
    return crc->value;
 }

+static inline uint32_t crc32_small(uint32_t crc, const uint8_t *buf, size_t len) {
+    uint32_t c = crc ^ 0xffffffffU;   /* start from the bit-inverted CRC */
+
+    /* One DO1 step per input byte (DO1 is a macro defined outside this hunk). */
+    while (len-- > 0) {
+        DO1;
+    }
+
+    return ~c;                        /* undo the initial inversion */
+}
+
 Z_INTERNAL uint32_t CRC32(uint32_t crc32, const uint8_t *buf, size_t len) {
-    /* For lens < 64, crc32_braid method is faster. The CRC32 instruction for
-     * these short lengths might also prove to be effective */
-    if (len < 64)
-        return PREFIX(crc32_braid)(crc32, buf, len);
+    /* For lens smaller than ~12, crc32_small method is faster.
+     * But there are also minimum requirements for the pclmul functions due to alignment */
+    if (len < 16)
+        return crc32_small(crc32, buf, len);

    crc32_fold ALIGNED_(16) crc_state;
    CRC32_FOLD_RESET(&crc_state);
--- a/3rdparty/zlib-ng/arch/x86/crc32_vpclmulqdq.c
+++ b/3rdparty/zlib-ng/arch/x86/crc32_vpclmulqdq.c
@ -3,7 +3,7 @@
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

-#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
+#ifdef X86_VPCLMULQDQ_CRC

 #define X86_VPCLMULQDQ
 #define CRC32_FOLD_COPY  crc32_fold_vpclmulqdq_copy
--- a/3rdparty/zlib-ng/arch/x86/insert_string_sse42.c
+++ b/3rdparty/zlib-ng/arch/x86/insert_string_sse42.c
@ -1,24 +0,0 @@
-/* insert_string_sse42.c -- insert_string integer hash variant using SSE4.2's CRC instructions
- *
- * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h
- *
- */
-
-#ifdef X86_SSE42
-#include "../../zbuild.h"
-#include <nmmintrin.h>
-#include "../../deflate.h"
-
-#define HASH_CALC(s, h, val)\
-    h = _mm_crc32_u32(h, val)
-
-#define HASH_CALC_VAR       h
-#define HASH_CALC_VAR_INIT  uint32_t h = 0
-
-#define UPDATE_HASH         update_hash_sse42
-#define INSERT_STRING       insert_string_sse42
-#define QUICK_INSERT_STRING quick_insert_string_sse42
-
-#include "../../insert_string_tpl.h"
-#endif
--- a/3rdparty/zlib-ng/arch/x86/slide_hash_avx2.c
+++ b/3rdparty/zlib-ng/arch/x86/slide_hash_avx2.c
@ -9,8 +9,8 @@
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
-#include "../../zbuild.h"
-#include "../../deflate.h"
+#include "zbuild.h"
+#include "deflate.h"

 #include <immintrin.h>

--- a/3rdparty/zlib-ng/arch/x86/slide_hash_sse2.c
+++ b/3rdparty/zlib-ng/arch/x86/slide_hash_sse2.c
@ -8,8 +8,8 @@
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
-#include "../../zbuild.h"
-#include "../../deflate.h"
+#include "zbuild.h"
+#include "deflate.h"

 #include <immintrin.h>
 #include <assert.h>
--- a/3rdparty/zlib-ng/arch/x86/x86_features.c
+++ b/3rdparty/zlib-ng/arch/x86/x86_features.c
@ -7,7 +7,7 @@
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

-#include "../../zbuild.h"
+#include "zbuild.h"
 #include "x86_features.h"

 #ifdef _MSC_VER
@ -15,6 +15,13 @@
 #else
 // Newer versions of GCC and clang come with cpuid.h
 #  include <cpuid.h>
+#  ifdef X86_HAVE_XSAVE_INTRIN
+#    if __GNUC__ == 8
+#      include <xsaveintrin.h>
+#    else
+#      include <immintrin.h>
+#    endif
+#  endif
 #endif

 #include <string.h>
@ -29,6 +36,7 @@ static inline void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx,
    *ecx = registers[2];
    *edx = registers[3];
 #else
+    *eax = *ebx = *ecx = *edx = 0;
    __cpuid(info, *eax, *ebx, *ecx, *edx);
 #endif
 }
@ -43,12 +51,13 @@ static inline void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx,
    *ecx = registers[2];
    *edx = registers[3];
 #else
+    *eax = *ebx = *ecx = *edx = 0;
    __cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx);
 #endif
 }

 static inline uint64_t xgetbv(unsigned int xcr) {
-#ifdef _MSC_VER
+#if defined(_MSC_VER) || defined(X86_HAVE_XSAVE_INTRIN)
    return _xgetbv(xcr);
 #else
    uint32_t eax, edx;
@ -90,7 +99,16 @@ void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {

        // check AVX512 bits if the OS supports saving ZMM registers
        if (features->has_os_save_zmm) {
-            features->has_avx512 = ebx & 0x00010000;
+            features->has_avx512f = ebx & 0x00010000;
+            if (features->has_avx512f) {
+                // According to the Intel Software Developer's Manual, AVX512F must be enabled too in order to enable
+                // AVX512(DQ,BW,VL).
+                features->has_avx512dq = ebx & 0x00020000;
+                features->has_avx512bw = ebx & 0x40000000;
+                features->has_avx512vl = ebx & 0x80000000;
+            }
+            features->has_avx512_common = features->has_avx512f && features->has_avx512dq && features->has_avx512bw \
+              && features->has_avx512vl;
            features->has_avx512vnni = ecx & 0x800;
        }
    }
--- a/3rdparty/zlib-ng/arch/x86/x86_features.h
+++ b/3rdparty/zlib-ng/arch/x86/x86_features.h
@ -1,14 +1,18 @@
 /* x86_features.h -- check for CPU features
-* Copyright (C) 2013 Intel Corporation Jim Kukunas
-* For conditions of distribution and use, see copyright notice in zlib.h
-*/
+ * Copyright (C) 2013 Intel Corporation Jim Kukunas
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */

 #ifndef X86_FEATURES_H_
 #define X86_FEATURES_H_

 struct x86_cpu_features {
    int has_avx2;
-    int has_avx512;
+    int has_avx512f;
+    int has_avx512dq;
+    int has_avx512bw;
+    int has_avx512vl;
+    int has_avx512_common; // Enabled when AVX512(F,DQ,BW,VL) are all enabled.
    int has_avx512vnni;
    int has_sse2;
    int has_ssse3;
@ -21,4 +25,4 @@ struct x86_cpu_features {

 void Z_INTERNAL x86_check_features(struct x86_cpu_features *features);

-#endif /* CPU_H_ */
+#endif /* X86_FEATURES_H_ */
--- a/3rdparty/zlib-ng/arch/x86/x86_functions.h
+++ b/3rdparty/zlib-ng/arch/x86/x86_functions.h
@ -0,0 +1,172 @@
+/* x86_functions.h -- x86 implementations for arch-specific functions.
+ * Copyright (C) 2013 Intel Corporation Jim Kukunas
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef X86_FUNCTIONS_H_
+#define X86_FUNCTIONS_H_
+
+#ifdef X86_SSE2
+uint32_t chunksize_sse2(void);
+uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+/* Declared outside HAVE_BUILTIN_CTZ: native_slide_hash selects it without checking that macro. */
+void slide_hash_sse2(deflate_state *s);
+#  ifdef HAVE_BUILTIN_CTZ
+    uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
+    uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match);
+    uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match);
+#  endif
+    void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start);
+#endif
+
+#ifdef X86_SSSE3
+uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
+uint8_t* chunkmemset_safe_ssse3(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
+#endif
+
+#ifdef X86_SSE42
+uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+#ifdef X86_AVX2
+uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint32_t chunksize_avx2(void);
+uint8_t* chunkmemset_safe_avx2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+/* Declared outside HAVE_BUILTIN_CTZ: native_slide_hash selects it without checking that macro. */
+void slide_hash_avx2(deflate_state *s);
+#  ifdef HAVE_BUILTIN_CTZ
+    uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
+    uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match);
+    uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match);
+#  endif
+    void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start);
+#endif
+#ifdef X86_AVX512
+uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef X86_AVX512VNNI
+uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+#ifdef X86_PCLMULQDQ_CRC
+uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc);
+void     crc32_fold_pclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
+void     crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
+uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc);
+uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
+#endif
+#ifdef X86_VPCLMULQDQ_CRC
+uint32_t crc32_fold_vpclmulqdq_reset(crc32_fold *crc);
+void     crc32_fold_vpclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
+void     crc32_fold_vpclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
+uint32_t crc32_fold_vpclmulqdq_final(crc32_fold *crc);
+uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
+#endif
+
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// X86 - SSE2
+#  if (defined(X86_SSE2) && defined(__SSE2__)) || defined(__x86_64__) || defined(_M_X64) || defined(X86_NOCHECK_SSE2)
+#    undef native_chunkmemset_safe
+#    define native_chunkmemset_safe chunkmemset_safe_sse2
+#    undef native_chunksize
+#    define native_chunksize chunksize_sse2
+#    undef native_inflate_fast
+#    define native_inflate_fast inflate_fast_sse2
+#    undef native_slide_hash
+#    define native_slide_hash slide_hash_sse2
+#    ifdef HAVE_BUILTIN_CTZ
+#      undef native_compare256
+#      define native_compare256 compare256_sse2
+#      undef native_longest_match
+#      define native_longest_match longest_match_sse2
+#      undef native_longest_match_slow
+#      define native_longest_match_slow longest_match_slow_sse2
+#    endif
+#endif
+// X86 - SSSE3
+#  if defined(X86_SSSE3) && defined(__SSSE3__)
+#    undef native_adler32
+#    define native_adler32 adler32_ssse3
+#    undef native_chunkmemset_safe
+#    define native_chunkmemset_safe chunkmemset_safe_ssse3
+#    undef native_inflate_fast
+#    define native_inflate_fast inflate_fast_ssse3
+#  endif
+// X86 - SSE4.2
+#  if defined(X86_SSE42) && defined(__SSE4_2__)
+#    undef native_adler32_fold_copy
+#    define native_adler32_fold_copy adler32_fold_copy_sse42
+#  endif
+
+// X86 - PCLMUL
+#if defined(X86_PCLMULQDQ_CRC) && defined(__PCLMUL__)
+#  undef native_crc32
+#  define native_crc32 crc32_pclmulqdq
+#  undef native_crc32_fold
+#  define native_crc32_fold crc32_fold_pclmulqdq
+#  undef native_crc32_fold_copy
+#  define native_crc32_fold_copy crc32_fold_pclmulqdq_copy
+#  undef native_crc32_fold_final
+#  define native_crc32_fold_final crc32_fold_pclmulqdq_final
+#  undef native_crc32_fold_reset
+#  define native_crc32_fold_reset crc32_fold_pclmulqdq_reset
+#endif
+// X86 - AVX2
+#  if defined(X86_AVX2) && defined(__AVX2__)
+#    undef native_adler32
+#    define native_adler32 adler32_avx2
+#    undef native_adler32_fold_copy
+#    define native_adler32_fold_copy adler32_fold_copy_avx2
+#    undef native_chunkmemset_safe
+#    define native_chunkmemset_safe chunkmemset_safe_avx2
+#    undef native_chunksize
+#    define native_chunksize chunksize_avx2
+#    undef native_inflate_fast
+#    define native_inflate_fast inflate_fast_avx2
+#    undef native_slide_hash
+#    define native_slide_hash slide_hash_avx2
+#    ifdef HAVE_BUILTIN_CTZ
+#      undef native_compare256
+#      define native_compare256 compare256_avx2
+#      undef native_longest_match
+#      define native_longest_match longest_match_avx2
+#      undef native_longest_match_slow
+#      define native_longest_match_slow longest_match_slow_avx2
+#    endif
+#  endif
+
+// X86 - AVX512 (F,DQ,BW,VL)
+#  if defined(X86_AVX512) && defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)
+#    undef native_adler32
+#    define native_adler32 adler32_avx512
+#    undef native_adler32_fold_copy
+#    define native_adler32_fold_copy adler32_fold_copy_avx512
+// X86 - AVX512 (VNNI)
+#    if defined(X86_AVX512VNNI) && defined(__AVX512VNNI__)
+#      undef native_adler32
+#      define native_adler32 adler32_avx512_vnni
+#      undef native_adler32_fold_copy
+#      define native_adler32_fold_copy adler32_fold_copy_avx512_vnni
+#    endif
+// X86 - VPCLMULQDQ (only when the build enabled it; the prototypes are guarded by X86_VPCLMULQDQ_CRC)
+#    if defined(X86_VPCLMULQDQ_CRC) && defined(__PCLMUL__) && defined(__AVX512F__) && defined(__VPCLMULQDQ__)
+#      undef native_crc32
+#      define native_crc32 crc32_vpclmulqdq
+#      undef native_crc32_fold
+#      define native_crc32_fold crc32_fold_vpclmulqdq
+#      undef native_crc32_fold_copy
+#      define native_crc32_fold_copy crc32_fold_vpclmulqdq_copy
+#      undef native_crc32_fold_final
+#      define native_crc32_fold_final crc32_fold_vpclmulqdq_final
+#      undef native_crc32_fold_reset
+#      define native_crc32_fold_reset crc32_fold_vpclmulqdq_reset
+#    endif
+#  endif
+#endif
+
+#endif /* X86_FUNCTIONS_H_ */
--- a/3rdparty/zlib-ng/arch/x86/x86_intrins.h
+++ b/3rdparty/zlib-ng/arch/x86/x86_intrins.h
@ -7,7 +7,7 @@
 #ifdef __AVX2__
 #include <immintrin.h>

-#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10) \
+#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 10) \
    || (defined(__apple_build_version__) && __apple_build_version__ < 9020039)
 static inline __m256i _mm256_zextsi128_si256(__m128i a) {
    __m128i r;
@ -29,7 +29,7 @@ static inline __m512i _mm512_zextsi128_si512(__m128i a) {
 /* GCC <9 is missing some AVX512 intrinsics.
 */
 #ifdef __AVX512F__
-#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 9)
+#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 9)
 #include <immintrin.h>

 #define PACK(c0, c1, c2, c3) (((int)(unsigned char)(c0) << 24) | ((int)(unsigned char)(c1) << 16) | \
--- a/3rdparty/zlib-ng/arch_functions.h
+++ b/3rdparty/zlib-ng/arch_functions.h
@ -0,0 +1,29 @@
+/* arch_functions.h -- Arch-specific function prototypes.
+ * Copyright (C) 2017 Hans Kristian Rosbach
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef CPU_FUNCTIONS_H_
+#define CPU_FUNCTIONS_H_
+
+#include "zbuild.h"
+#include "zutil.h"
+#include "crc32.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#include "arch/generic/generic_functions.h"
+
+#if defined(X86_FEATURES)
+#  include "arch/x86/x86_functions.h"
+#elif defined(ARM_FEATURES)
+#  include "arch/arm/arm_functions.h"
+#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
+#  include "arch/power/power_functions.h"
+#elif defined(S390_FEATURES)
+#  include "arch/s390/s390_functions.h"
+#elif defined(RISCV_FEATURES)
+#  include "arch/riscv/riscv_functions.h"
+#endif
+
+#endif
--- a/3rdparty/zlib-ng/chunkset_tpl.h
+++ b/3rdparty/zlib-ng/chunkset_tpl.h
@ -5,7 +5,7 @@
 #include "zbuild.h"
 #include <stdlib.h>

-#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
+#if CHUNK_SIZE == 32 && defined(X86_SSSE3)
 extern uint8_t* chunkmemset_ssse3(uint8_t *out, unsigned dist, unsigned len);
 #endif

@ -25,7 +25,7 @@ Z_INTERNAL uint32_t CHUNKSIZE(void) {
   without iteration, which will hopefully make the branch prediction more
   reliable. */
 #ifndef HAVE_CHUNKCOPY
-Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
+static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
    Assert(len > 0, "chunkcopy should never have a length 0");
    chunk_t chunk;
    int32_t align = ((len - 1) % sizeof(chunk_t)) + 1;
@ -54,7 +54,7 @@ Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
   least 258 bytes of output space available (258 being the maximum length
   output from a single token; see inflate_fast()'s assumptions below). */
 #ifndef HAVE_CHUNKUNROLL
-Z_INTERNAL uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) {
+static inline uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) {
    unsigned char const *from = out - *dist;
    chunk_t chunk;
    while (*dist < *len && *dist < sizeof(chunk_t)) {
@ -98,7 +98,7 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
       Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
    Assert(dist > 0, "chunkmemset cannot have a distance 0");
    /* Only AVX2 */
-#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
+#if CHUNK_SIZE == 32 && defined(X86_SSSE3)
    if (len <= 16) {
        return chunkmemset_ssse3(out, dist, len);
    }
--- a/3rdparty/zlib-ng/cmake/detect-arch.c
+++ b/3rdparty/zlib-ng/cmake/detect-arch.c
@ -0,0 +1,115 @@
+// detect-arch.c -- Detect compiler architecture and raise preprocessor error
+//                 containing a simple arch identifier.
+// Copyright (C) 2019 Hans Kristian Rosbach
+// Licensed under the Zlib license, see LICENSE.md for details
+
+// x86_64
+#if defined(__x86_64__) || defined(_M_X64)
+    #error archfound x86_64
+
+// x86
+#elif defined(__i386) || defined(_M_IX86)
+    #error archfound i686
+
+// ARM
+#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+    #error archfound aarch64
+#elif defined(__arm__) || defined(__arm) || defined(_M_ARM) || defined(__TARGET_ARCH_ARM)
+    #if defined(__ARM64_ARCH_8__) || defined(__ARMv8__) || defined(__ARMv8_A__)
+        #error archfound armv8
+    #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__)
+        #error archfound armv7
+    #elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6M__)
+        #error archfound armv6
+    #elif defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__)
+        #error archfound armv5
+    #elif defined(__ARM_ARCH_4T__) || defined(__TARGET_ARCH_5E__)
+        #error archfound armv4
+    #elif defined(__ARM_ARCH_3__) || defined(__TARGET_ARCH_3M__)
+        #error archfound armv3
+    #elif defined(__ARM_ARCH_2__)
+        #error archfound armv2
+    #endif
+
+// PowerPC: 64-bit targets are split by byte order, everything else is reported as plain powerpc
+#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)
+    #if defined(__64BIT__) || defined(__powerpc64__) || defined(__ppc64__)
+        #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+            #error archfound powerpc64le
+        #else
+            #error archfound powerpc64
+        #endif
+    #else
+        #error archfound powerpc
+    #endif
+
+// --------------- Less common architectures alphabetically below ---------------
+
+// ALPHA
+#elif defined(__alpha__) || defined(__alpha)
+    #error archfound alpha
+
+// Blackfin
+#elif defined(__BFIN__)
+    #error archfound blackfin
+
+// Itanium
+#elif defined(__ia64) || defined(_M_IA64)
+    #error archfound ia64
+
+// MIPS
+#elif defined(__mips__) || defined(__mips)
+    #error archfound mips
+
+// Motorola 68000-series
+#elif defined(__m68k__)
+    #error archfound m68k
+
+// SuperH
+#elif defined(__sh__)
+    #error archfound sh
+
+// SPARC
+#elif defined(__sparc__) || defined(__sparc)
+    #if defined(__sparcv9) || defined(__sparc_v9__)
+        #error archfound sparc9
+    #elif defined(__sparcv8) || defined(__sparc_v8__)
+        #error archfound sparc8
+    #endif
+
+// SystemZ
+#elif defined(__370__)
+    #error archfound s370
+#elif defined(__s390__)
+    #error archfound s390
+#elif defined(__s390x) || defined(__zarch__)
+    #error archfound s390x
+
+// PARISC
+#elif defined(__hppa__)
+    #error archfound parisc
+
+// RS-6000
+#elif defined(__THW_RS6000)
+    #error archfound rs6000
+
+// RISC-V
+#elif defined(__riscv)
+    #if __riscv_xlen == 64
+        #error archfound riscv64
+    #elif __riscv_xlen == 32
+        #error archfound riscv32
+    #endif
+
+// LOONGARCH
+#elif defined(__loongarch_lp64)
+    #error archfound loongarch64
+
+// Emscripten (WebAssembly)
+#elif defined(__EMSCRIPTEN__)
+    #error archfound wasm32
+
+// return 'unrecognized' if we do not know what architecture this is
+#else
+    #error archfound unrecognized
+#endif
--- a/3rdparty/zlib-ng/cmake/detect-arch.cmake
+++ b/3rdparty/zlib-ng/cmake/detect-arch.cmake
@ -0,0 +1,104 @@
+# detect-arch.cmake -- Detect compiler architecture and set ARCH and BASEARCH
+# Copyright (C) 2019 Hans Kristian Rosbach
+# Licensed under the Zlib license, see LICENSE.md for details
+set(ARCHDETECT_FOUND TRUE)
+
+# Determine ARCH from, in order of preference: the requested macOS architectures, the MSVC
+# architecture id, the Emscripten toolchain, the cross-compile target, or the compiler's predefined
+# macros (probed through detect-arch.c). ARCH is set to "unknown" when the probe yields nothing usable.
+if(CMAKE_OSX_ARCHITECTURES)
+    # If multiple architectures are requested (universal build), pick only the first
+    list(GET CMAKE_OSX_ARCHITECTURES 0 ARCH)
+elseif(MSVC)
+    if("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "X86")
+        set(ARCH "i686")
+    elseif("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "x64")
+        set(ARCH "x86_64")
+    elseif("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "ARM" OR "${MSVC_C_ARCHITECTURE_ID}" STREQUAL "ARMV7")
+        set(ARCH "arm")
+    elseif ("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "ARM64" OR "${MSVC_C_ARCHITECTURE_ID}" STREQUAL "ARM64EC")
+        set(ARCH "aarch64")
+    endif()
+elseif(EMSCRIPTEN)
+    set(ARCH "wasm32")
+elseif(CMAKE_CROSSCOMPILING)
+    set(ARCH ${CMAKE_C_COMPILER_TARGET})
+else()
+    # Let preprocessor parse detect-arch.c and raise an error containing the arch identifier
+    enable_language(C)
+    try_run(
+        run_result_unused
+        compile_result_unused
+        ${CMAKE_CURRENT_BINARY_DIR}
+        ${CMAKE_CURRENT_LIST_DIR}/detect-arch.c
+        COMPILE_OUTPUT_VARIABLE RAWOUTPUT
+        CMAKE_FLAGS CMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES}
+    )
+
+    # Find the "archfound" tag and extract the arch word into ARCH. A missing tag, or the
+    # "unrecognized" word emitted by detect-arch.c, is reported as "unknown" so that the
+    # CMAKE_SYSTEM_PROCESSOR fallback below can take over.
+    string(REGEX MATCH "archfound ([a-zA-Z0-9_]+)" ARCH_MATCH "${RAWOUTPUT}")
+    if(ARCH_MATCH AND NOT "${CMAKE_MATCH_1}" STREQUAL "unrecognized")
+        set(ARCH "${CMAKE_MATCH_1}")
+    else()
+        set(ARCH unknown)
+    endif()
+endif()
+
+# Make sure we have ARCH set
+if(NOT ARCH OR ARCH STREQUAL "unknown")
+    set(ARCH ${CMAKE_SYSTEM_PROCESSOR})
+    message(STATUS "Arch not recognized, falling back to cmake arch: '${ARCH}'")
+else()
+    message(STATUS "Arch detected: '${ARCH}'")
+endif()
+
+# Base arch detection
+if("${ARCH}" MATCHES "(x86_64|AMD64|i[3-6]86)")
+    set(BASEARCH "x86")
+    set(BASEARCH_X86_FOUND TRUE)
+elseif("${ARCH}" MATCHES "(arm(v[0-9])?|aarch64|cortex)")
+    set(BASEARCH "arm")
+    set(BASEARCH_ARM_FOUND TRUE)
+elseif("${ARCH}" MATCHES "ppc(64(le)?)?|powerpc(64(le)?)?")
+    set(BASEARCH "ppc")
+    set(BASEARCH_PPC_FOUND TRUE)
+elseif("${ARCH}" MATCHES "alpha")
+    set(BASEARCH "alpha")
+    set(BASEARCH_ALPHA_FOUND TRUE)
+elseif("${ARCH}" MATCHES "blackfin")
+    set(BASEARCH "blackfin")
+    set(BASEARCH_BLACKFIN_FOUND TRUE)
+elseif("${ARCH}" MATCHES "ia64")
+    set(BASEARCH "ia64")
+    set(BASEARCH_IA64_FOUND TRUE)
+elseif("${ARCH}" MATCHES "mips")
+    set(BASEARCH "mips")
+    set(BASEARCH_MIPS_FOUND TRUE)
+elseif("${ARCH}" MATCHES "m68k")
+    set(BASEARCH "m68k")
+    set(BASEARCH_M68K_FOUND TRUE)
+elseif("${ARCH}" MATCHES "sh")
+    set(BASEARCH "sh")
+    set(BASEARCH_SH_FOUND TRUE)
+elseif("${ARCH}" MATCHES "sparc[89]?")
+    set(BASEARCH "sparc")
+    set(BASEARCH_SPARC_FOUND TRUE)
+elseif("${ARCH}" MATCHES "s3[679]0x?")
+    set(BASEARCH "s360")
+    set(BASEARCH_S360_FOUND TRUE)
+elseif("${ARCH}" MATCHES "parisc")
+    set(BASEARCH "parisc")
+    set(BASEARCH_PARISC_FOUND TRUE)
+elseif("${ARCH}" MATCHES "rs6000")
+    set(BASEARCH "rs6000")
+    set(BASEARCH_RS6000_FOUND TRUE)
+elseif("${ARCH}" MATCHES "riscv(32|64)")
+    set(BASEARCH "riscv")
+    set(BASEARCH_RISCV_FOUND TRUE)
+elseif("${ARCH}" MATCHES "loongarch64")
+    set(BASEARCH "loongarch")
+    set(BASEARCH_LOONGARCH_FOUND TRUE)
+elseif("${ARCH}" MATCHES "wasm32")
+    set(BASEARCH "wasm32")
+    set(BASEARCH_WASM32_FOUND TRUE)
+else()
+    set(BASEARCH "x86")
+    set(BASEARCH_X86_FOUND TRUE)
+    message(STATUS "Basearch '${ARCH}' not recognized, defaulting to 'x86'.")
+endif()
+message(STATUS "Basearch of '${ARCH}' has been detected as: '${BASEARCH}'")
--- a/3rdparty/zlib-ng/cmake/detect-coverage.cmake
+++ b/3rdparty/zlib-ng/cmake/detect-coverage.cmake
@ -0,0 +1,46 @@
+# detect-coverage.cmake -- Detect supported compiler coverage flags
+# Licensed under the Zlib license, see LICENSE.md for details
+
+macro(add_code_coverage)
+    # Check for -coverage flag support for Clang/GCC
+    if(CMAKE_VERSION VERSION_LESS 3.14)
+        set(CMAKE_REQUIRED_LIBRARIES -lgcov)
+    else()
+        set(CMAKE_REQUIRED_LINK_OPTIONS -coverage)
+    endif()
+    check_c_compiler_flag(-coverage HAVE_COVERAGE)
+    set(CMAKE_REQUIRED_LIBRARIES)
+    set(CMAKE_REQUIRED_LINK_OPTIONS)
+
+    if(HAVE_COVERAGE)
+        add_compile_options(-coverage)
+        add_link_options(-coverage)
+        message(STATUS "Code coverage enabled using: -coverage")
+    else()
+        # Some versions of GCC don't support -coverage shorthand
+        if(CMAKE_VERSION VERSION_LESS 3.14)
+            set(CMAKE_REQUIRED_LIBRARIES -lgcov)
+        else()
+            set(CMAKE_REQUIRED_LINK_OPTIONS -lgcov -fprofile-arcs)
+        endif()
+        check_c_compiler_flag("-ftest-coverage -fprofile-arcs -fprofile-values" HAVE_TEST_COVERAGE)
+        set(CMAKE_REQUIRED_LIBRARIES)
+        set(CMAKE_REQUIRED_LINK_OPTIONS)
+
+        if(HAVE_TEST_COVERAGE)
+            add_compile_options(-ftest-coverage -fprofile-arcs -fprofile-values)
+            add_link_options(-lgcov -fprofile-arcs)
+            message(STATUS "Code coverage enabled using: -ftest-coverage")
+        else()
+            message(WARNING "Compiler does not support code coverage")
+            set(WITH_CODE_COVERAGE OFF)
+        endif()
+    endif()
+
+    # Set optimization level to zero for code coverage builds
+    if (WITH_CODE_COVERAGE)
+        # Use CMake compiler flag variables due to add_compile_options failure on Windows GCC
+        set(CMAKE_C_FLAGS "-O0 ${CMAKE_C_FLAGS}")
+        set(CMAKE_CXX_FLAGS "-O0 ${CMAKE_CXX_FLAGS}")
+    endif()
+endmacro()
--- a/3rdparty/zlib-ng/cmake/detect-install-dirs.cmake
+++ b/3rdparty/zlib-ng/cmake/detect-install-dirs.cmake
@ -0,0 +1,43 @@
+# detect-install-dirs.cmake -- Detect install directory parameters
+# Copyright (C) 2021 Hans Kristian Rosbach
+# Licensed under the Zlib license, see LICENSE.md for details
+
+# Determine installation directory for executables
+if (DEFINED BIN_INSTALL_DIR)
+    set(BIN_INSTALL_DIR "${BIN_INSTALL_DIR}" CACHE PATH "Installation directory for executables (Deprecated)" FORCE)
+    set(CMAKE_INSTALL_BINDIR "${BIN_INSTALL_DIR}")
+elseif (DEFINED INSTALL_BIN_DIR)
+    set(CMAKE_INSTALL_BINDIR "${INSTALL_BIN_DIR}")
+endif()
+
+# Determine installation directory for libraries
+if (DEFINED LIB_INSTALL_DIR)
+    set(LIB_INSTALL_DIR "${LIB_INSTALL_DIR}" CACHE PATH "Installation directory for libraries (Deprecated)" FORCE)
+    set(CMAKE_INSTALL_LIBDIR "${LIB_INSTALL_DIR}")
+elseif (DEFINED INSTALL_LIB_DIR)
+    set(CMAKE_INSTALL_LIBDIR "${INSTALL_LIB_DIR}")
+endif()
+
+# Determine installation directory for include files
+if (DEFINED INC_INSTALL_DIR)
+    set(INC_INSTALL_DIR "${INC_INSTALL_DIR}" CACHE PATH "Installation directory for headers (Deprecated)" FORCE)
+    set(CMAKE_INSTALL_INCLUDEDIR "${INC_INSTALL_DIR}")
+elseif (DEFINED INSTALL_INC_DIR)
+    set(CMAKE_INSTALL_INCLUDEDIR "${INSTALL_INC_DIR}")
+endif()
+
+# Define GNU standard installation directories
+include(GNUInstallDirs)
+
+# Determine installation directory for pkgconfig files
+if (DEFINED PKGCONFIG_INSTALL_DIR)
+    set(PKGCONFIG_INSTALL_DIR "${PKGCONFIG_INSTALL_DIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE)
+elseif (DEFINED INSTALL_PKGCONFIG_DIR)
+    set(PKGCONFIG_INSTALL_DIR "${INSTALL_PKGCONFIG_DIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE)
+elseif (DEFINED CMAKE_INSTALL_PKGCONFIGDIR)
+    set(PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_PKGCONFIGDIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE)
+elseif (DEFINED CMAKE_INSTALL_FULL_PKGCONFIGDIR)
+    set(PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_FULL_PKGCONFIGDIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE)
+else()
+    set(PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/pkgconfig" CACHE PATH "Installation directory for pkgconfig (.pc) files")
+endif()
--- a/3rdparty/zlib-ng/cmake/detect-intrinsics.cmake
+++ b/3rdparty/zlib-ng/cmake/detect-intrinsics.cmake
@ -2,40 +2,39 @@
 # Licensed under the Zlib license, see LICENSE.md for details

 macro(check_acle_compiler_flag)
-    if(MSVC)
-        # Both ARM and ARM64-targeting msvc support intrinsics, but
-        # ARM msvc is missing some intrinsics introduced with ARMv8, e.g. crc32
-        if(MSVC_C_ARCHITECTURE_ID STREQUAL "ARM64")
-            set(HAVE_ACLE_FLAG TRUE)
-        endif()
-    else()
+    if(NOT NATIVEFLAG)
        if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
-            if(NOT NATIVEFLAG)
+            check_c_compiler_flag("-march=armv8-a+crc" HAVE_MARCH_ARMV8_CRC)
+            if(HAVE_MARCH_ARMV8_CRC)
                set(ACLEFLAG "-march=armv8-a+crc" CACHE INTERNAL "Compiler option to enable ACLE support")
+            else()
+                check_c_compiler_flag("-march=armv8-a+crc+simd" HAVE_MARCH_ARMV8_CRC_SIMD)
+                if(HAVE_MARCH_ARMV8_CRC_SIMD)
+                    set(ACLEFLAG "-march=armv8-a+crc+simd" CACHE INTERNAL "Compiler option to enable ACLE support")
+                endif()
            endif()
        endif()
-        # Check whether compiler supports ACLE flag
-        set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
-        check_c_source_compiles(
-            "int main() { return 0; }"
-            HAVE_ACLE_FLAG FAIL_REGEX "not supported")
-        if(NOT NATIVEFLAG AND NOT HAVE_ACLE_FLAG)
-            set(ACLEFLAG "-march=armv8-a+crc+simd" CACHE INTERNAL "Compiler option to enable ACLE support" FORCE)
-            # Check whether compiler supports ACLE flag
-            set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG}")
-            check_c_source_compiles(
-                "int main() { return 0; }"
-                HAVE_ACLE_FLAG2 FAIL_REGEX "not supported")
-            set(HAVE_ACLE_FLAG ${HAVE_ACLE_FLAG2} CACHE INTERNAL "Have compiler option to enable ACLE intrinsics" FORCE)
-            unset(HAVE_ACLE_FLAG2 CACHE) # Don't cache this internal variable
-        endif()
-        set(CMAKE_REQUIRED_FLAGS)
    endif()
+    # Check whether compiler supports ARMv8 CRC intrinsics
+    set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
+    check_c_source_compiles(
+        "#if defined(_MSC_VER)
+        #include <intrin.h>
+        #else
+        #include <arm_acle.h>
+        #endif
+        unsigned int f(unsigned int a, unsigned int b) {
+            return __crc32w(a, b);
+        }
+        int main(void) { return 0; }"
+        HAVE_ACLE_FLAG
+    )
+    set(CMAKE_REQUIRED_FLAGS)
 endmacro()

 macro(check_armv6_compiler_flag)
-    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
-        if(NOT NATIVEFLAG)
+    if(NOT NATIVEFLAG)
+        if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
            check_c_compiler_flag("-march=armv6" HAVE_MARCH_ARMV6)
            if(HAVE_MARCH_ARMV6)
                set(ARMV6FLAG "-march=armv6" CACHE INTERNAL "Compiler option to enable ARMv6 support")
@ -67,21 +66,21 @@ macro(check_armv6_compiler_flag)
            return __uqsub16(a, b);
        #endif
        }
-        int main(void) { return 0; }"
+        int main(void) { return f(1,2); }"
        HAVE_ARMV6_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
 endmacro()

 macro(check_avx512_intrinsics)
-    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
-        if(CMAKE_HOST_UNIX OR APPLE)
-            set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
-        else()
-            set(AVX512FLAG "/arch:AVX512")
-        endif()
-    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
-        if(NOT NATIVEFLAG)
+    if(NOT NATIVEFLAG)
+        if(CMAKE_C_COMPILER_ID MATCHES "Intel")
+            if(CMAKE_HOST_UNIX OR APPLE)
+                set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
+            else()
+                set(AVX512FLAG "/arch:AVX512")
+            endif()
+        elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
            # For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
            # instruction scheduling unless you specify a reasonable -mtune= target
            set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
@ -94,9 +93,9 @@ macro(check_avx512_intrinsics)
                endif()
                unset(HAVE_CASCADE_LAKE)
            endif()
+        elseif(MSVC)
+            set(AVX512FLAG "/arch:AVX512")
        endif()
-    elseif(MSVC)
-        set(AVX512FLAG "/arch:AVX512")
    endif()
    # Check whether compiler supports AVX512 intrinsics
    set(CMAKE_REQUIRED_FLAGS "${AVX512FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
@ -109,26 +108,17 @@ macro(check_avx512_intrinsics)
        int main(void) { return 0; }"
        HAVE_AVX512_INTRIN
    )
-
-    # Evidently both GCC and clang were late to implementing these
-    check_c_source_compiles(
-        "#include <immintrin.h>
-        __mmask16 f(__mmask16 x) { return _knot_mask16(x); }
-        int main(void) { return 0; }"
-        HAVE_MASK_INTRIN
-    )
-    set(CMAKE_REQUIRED_FLAGS)
 endmacro()

 macro(check_avx512vnni_intrinsics)
-    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
-        if(CMAKE_HOST_UNIX OR APPLE)
-            set(AVX512VNNIFLAG "-mavx512f -mavx512bw -mavx512dq -mavx512vl -mavx512vnni")
-        else()
-            set(AVX512VNNIFLAG "/arch:AVX512")
-        endif()
-    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
-        if(NOT NATIVEFLAG)
+    if(NOT NATIVEFLAG)
+        if(CMAKE_C_COMPILER_ID MATCHES "Intel")
+            if(CMAKE_HOST_UNIX OR APPLE OR CMAKE_C_COMPILER_ID MATCHES "IntelLLVM")
+                set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni")
+            else()
+                set(AVX512VNNIFLAG "/arch:AVX512")
+            endif()
+        elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
            set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni")
            if(NOT MSVC)
                check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
@ -139,11 +129,10 @@ macro(check_avx512vnni_intrinsics)
                endif()
                unset(HAVE_CASCADE_LAKE)
            endif()
+        elseif(MSVC)
+            set(AVX512VNNIFLAG "/arch:AVX512")
        endif()
-    elseif(MSVC)
-        set(AVX512VNNIFLAG "/arch:AVX512")
    endif()
-
    # Check whether compiler supports AVX512vnni intrinsics
    set(CMAKE_REQUIRED_FLAGS "${AVX512VNNIFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
@ -159,18 +148,18 @@ macro(check_avx512vnni_intrinsics)
 endmacro()

 macro(check_avx2_intrinsics)
-    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
-        if(CMAKE_HOST_UNIX OR APPLE)
+    if(NOT NATIVEFLAG)
+        if(CMAKE_C_COMPILER_ID MATCHES "Intel")
+            if(CMAKE_HOST_UNIX OR APPLE)
+                set(AVX2FLAG "-mavx2")
+            else()
+                set(AVX2FLAG "/arch:AVX2")
+            endif()
+        elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
            set(AVX2FLAG "-mavx2")
-        else()
+        elseif(MSVC)
            set(AVX2FLAG "/arch:AVX2")
        endif()
-    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
-        if(NOT NATIVEFLAG)
-            set(AVX2FLAG "-mavx2")
-        endif()
-    elseif(MSVC)
-        set(AVX2FLAG "/arch:AVX2")
    endif()
    # Check whether compiler supports AVX2 intrinics
    set(CMAKE_REQUIRED_FLAGS "${AVX2FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
@ -187,8 +176,8 @@ macro(check_avx2_intrinsics)
 endmacro()

 macro(check_neon_compiler_flag)
-    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
-        if(NOT NATIVEFLAG)
+    if(NOT NATIVEFLAG)
+        if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
            if("${ARCH}" MATCHES "aarch64")
                set(NEONFLAG "-march=armv8-a+simd")
            else()
@ -206,12 +195,52 @@ macro(check_neon_compiler_flag)
        #endif
        int main() { return 0; }"
        NEON_AVAILABLE FAIL_REGEX "not supported")
+    # Check whether compiler native flag is enough for NEON support
+    # Some GCC versions don't enable FPU (vector unit) when using -march=native
+    if(NEON_AVAILABLE AND NATIVEFLAG AND (NOT "${ARCH}" MATCHES "aarch64"))
+        check_c_source_compiles(
+            "#include <arm_neon.h>
+            uint8x16_t f(uint8x16_t x, uint8x16_t y) {
+                return vaddq_u8(x, y);
+            }
+            int main(int argc, char* argv[]) {
+                uint8x16_t a = vdupq_n_u8(argc);
+                uint8x16_t b = vdupq_n_u8(argc);
+                uint8x16_t result = f(a, b);
+                return result[0];
+            }"
+            ARM_NEON_SUPPORT_NATIVE
+        )
+        if(NOT ARM_NEON_SUPPORT_NATIVE)
+            set(CMAKE_REQUIRED_FLAGS "${NATIVEFLAG} -mfpu=neon ${ZNOLTOFLAG}")
+            check_c_source_compiles(
+                "#include <arm_neon.h>
+                uint8x16_t f(uint8x16_t x, uint8x16_t y) {
+                    return vaddq_u8(x, y);
+                }
+                int main(int argc, char* argv[]) {
+                    uint8x16_t a = vdupq_n_u8(argc);
+                    uint8x16_t b = vdupq_n_u8(argc);
+                    uint8x16_t result = f(a, b);
+                    return result[0];
+                }"
+                ARM_NEON_SUPPORT_NATIVE_MFPU
+            )
+            if(ARM_NEON_SUPPORT_NATIVE_MFPU)
+                set(NEONFLAG "-mfpu=neon")
+            else()
+                # Remove local NEON_AVAILABLE variable and overwrite the cache
+                unset(NEON_AVAILABLE)
+                set(NEON_AVAILABLE "" CACHE INTERNAL "NEON support available" FORCE)
+            endif()
+        endif()
+    endif()
    set(CMAKE_REQUIRED_FLAGS)
 endmacro()

 macro(check_neon_ld4_intrinsics)
-    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
-        if(NOT NATIVEFLAG)
+    if(NOT NATIVEFLAG)
+        if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
            if("${ARCH}" MATCHES "aarch64")
                set(NEONFLAG "-march=armv8-a+simd")
            else()
@ -234,8 +263,8 @@ macro(check_neon_ld4_intrinsics)
 endmacro()

 macro(check_pclmulqdq_intrinsics)
-    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
-        if(NOT NATIVEFLAG)
+    if(NOT NATIVEFLAG)
+        if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "IntelLLVM")
            set(PCLMULFLAG "-mpclmul")
        endif()
    endif()
@ -257,8 +286,8 @@ macro(check_pclmulqdq_intrinsics)
 endmacro()

 macro(check_vpclmulqdq_intrinsics)
-    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
-        if(NOT NATIVEFLAG)
+    if(NOT NATIVEFLAG)
+        if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "IntelLLVM")
            set(VPCLMULFLAG "-mvpclmulqdq -mavx512f")
        endif()
    endif()
@ -341,8 +370,8 @@ macro(check_ppc_intrinsics)
 endmacro()

 macro(check_power8_intrinsics)
-    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
-        if(NOT NATIVEFLAG)
+    if(NOT NATIVEFLAG)
+        if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
            set(POWER8FLAG "-mcpu=power8")
        endif()
    endif()
@ -364,12 +393,27 @@ macro(check_power8_intrinsics)
        }"
        HAVE_POWER8_INTRIN
    )
+    if(NOT HAVE_POWER8_INTRIN AND HAVE_LINUX_AUXVEC_H)
+        check_c_source_compiles(
+            "#include <sys/auxv.h>
+            #include <linux/auxvec.h>
+            int main() {
+                return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07);
+            }"
+            HAVE_POWER8_INTRIN2
+        )
+        if(HAVE_POWER8_INTRIN2)
+            set(POWER8_NEED_AUXVEC_H 1)
+            set(HAVE_POWER8_INTRIN ${HAVE_POWER8_INTRIN2} CACHE INTERNAL "Have POWER8 intrinsics" FORCE)
+            unset(HAVE_POWER8_INTRIN2 CACHE)
+        endif()
+    endif()
    set(CMAKE_REQUIRED_FLAGS)
 endmacro()

 macro(check_rvv_intrinsics)
-    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
-        if(NOT NATIVEFLAG)
+    if(NOT NATIVEFLAG)
+        if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
            set(RISCVFLAG "-march=rv64gcv")
        endif()
    endif()
@ -399,8 +443,8 @@ macro(check_s390_intrinsics)
 endmacro()

 macro(check_power9_intrinsics)
-    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
-        if(NOT NATIVEFLAG)
+    if(NOT NATIVEFLAG)
+        if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
            set(POWER9FLAG "-mcpu=power9")
        endif()
    endif()
@ -422,22 +466,37 @@ macro(check_power9_intrinsics)
        }"
        HAVE_POWER9_INTRIN
    )
+    if(NOT HAVE_POWER9_INTRIN AND HAVE_LINUX_AUXVEC_H)
+        check_c_source_compiles(
+            "#include <sys/auxv.h>
+            #include <linux/auxvec.h>
+            int main() {
+                return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_3_00);
+            }"
+            HAVE_POWER9_INTRIN2
+        )
+        if(HAVE_POWER9_INTRIN2)
+            set(POWER9_NEED_AUXVEC_H 1)
+            set(HAVE_POWER9_INTRIN ${HAVE_POWER9_INTRIN2} CACHE INTERNAL "Have POWER9 intrinsics" FORCE)
+            unset(HAVE_POWER9_INTRIN2 CACHE)
+        endif()
+    endif()
    set(CMAKE_REQUIRED_FLAGS)
 endmacro()

 macro(check_sse2_intrinsics)
-    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
-        if(CMAKE_HOST_UNIX OR APPLE)
-            set(SSE2FLAG "-msse2")
-        else()
-            set(SSE2FLAG "/arch:SSE2")
-        endif()
-    elseif(MSVC)
-        if(NOT "${ARCH}" MATCHES "x86_64")
-            set(SSE2FLAG "/arch:SSE2")
-        endif()
-    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
-        if(NOT NATIVEFLAG)
+    if(NOT NATIVEFLAG)
+        if(CMAKE_C_COMPILER_ID MATCHES "Intel")
+            if(CMAKE_HOST_UNIX OR APPLE)
+                set(SSE2FLAG "-msse2")
+            else()
+                set(SSE2FLAG "/arch:SSE2")
+            endif()
+        elseif(MSVC)
+            if(NOT "${ARCH}" MATCHES "x86_64")
+                set(SSE2FLAG "/arch:SSE2")
+            endif()
+        elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
            set(SSE2FLAG "-msse2")
        endif()
    endif()
@ -453,14 +512,14 @@ macro(check_sse2_intrinsics)
 endmacro()

 macro(check_ssse3_intrinsics)
-    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
-        if(CMAKE_HOST_UNIX OR APPLE)
-            set(SSSE3FLAG "-mssse3")
-        else()
-            set(SSSE3FLAG "/arch:SSSE3")
-        endif()
-    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
-        if(NOT NATIVEFLAG)
+    if(NOT NATIVEFLAG)
+        if(CMAKE_C_COMPILER_ID MATCHES "Intel")
+            if(CMAKE_HOST_UNIX OR APPLE)
+                set(SSSE3FLAG "-mssse3")
+            else()
+                set(SSSE3FLAG "/arch:SSSE3")
+            endif()
+        elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
            set(SSSE3FLAG "-mssse3")
        endif()
    endif()
@ -478,14 +537,14 @@ macro(check_ssse3_intrinsics)
 endmacro()

 macro(check_sse42_intrinsics)
-    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
-        if(CMAKE_HOST_UNIX OR APPLE)
-            set(SSE42FLAG "-msse4.2")
-        else()
-            set(SSE42FLAG "/arch:SSE4.2")
-        endif()
-    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
-        if(NOT NATIVEFLAG)
+    if(NOT NATIVEFLAG)
+        if(CMAKE_C_COMPILER_ID MATCHES "Intel")
+            if(CMAKE_HOST_UNIX OR APPLE)
+                set(SSE42FLAG "-msse4.2")
+            else()
+                set(SSE42FLAG "/arch:SSE4.2")
+            endif()
+        elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
            set(SSE42FLAG "-msse4.2")
        endif()
    endif()
@ -526,15 +585,17 @@ macro(check_vgfma_intrinsics)
 endmacro()

 macro(check_xsave_intrinsics)
-    if(NOT NATIVEFLAG AND NOT MSVC)
+    if(NOT NATIVEFLAG AND NOT MSVC AND NOT CMAKE_C_COMPILER_ID MATCHES "Intel")
        set(XSAVEFLAG "-mxsave")
    endif()
    set(CMAKE_REQUIRED_FLAGS "${XSAVEFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#ifdef _MSC_VER
        #  include <intrin.h>
+        #elif __GNUC__ == 8 && __GNUC_MINOR__ > 1
+        #  include <xsaveintrin.h>
        #else
-        #  include <x86gprintrin.h>
+        #  include <immintrin.h>
        #endif
        unsigned int f(unsigned int a) { return (int) _xgetbv(a); }
        int main(void) { return 0; }"
--- a/3rdparty/zlib-ng/cmake/detect-sanitizer.cmake
+++ b/3rdparty/zlib-ng/cmake/detect-sanitizer.cmake
@ -0,0 +1,166 @@
+# detect-sanitizer.cmake -- Detect supported compiler sanitizer flags
+# Licensed under the Zlib license, see LICENSE.md for details
+
+macro(add_common_sanitizer_flags)  # Flags shared by all add_*_sanitizer macros in this file
+    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        add_compile_options(-g3)  # debug information, GNU/Clang only
+    endif()
+    check_c_compiler_flag(-fno-omit-frame-pointer HAVE_NO_OMIT_FRAME_POINTER)  # probe before use
+    if(HAVE_NO_OMIT_FRAME_POINTER)
+        add_compile_options(-fno-omit-frame-pointer)
+        add_link_options(-fno-omit-frame-pointer)  # same flag is passed at link time
+    endif()
+    check_c_compiler_flag(-fno-optimize-sibling-calls HAVE_NO_OPTIMIZE_SIBLING_CALLS)  # probe before use
+    if(HAVE_NO_OPTIMIZE_SIBLING_CALLS)
+        add_compile_options(-fno-optimize-sibling-calls)
+        add_link_options(-fno-optimize-sibling-calls)  # same flag is passed at link time
+    endif()
+endmacro()
+
+macro(check_sanitizer_support known_checks supported_checks)  # in: list of -fsanitize= names; out: name of the result variable
+    set(available_checks "")  # accumulates the accepted checks, comma-separated
+
+    # Build list of supported sanitizer flags by incrementally trying compilation with
+    # known sanitizer checks
+
+    foreach(check ${known_checks})
+        if(available_checks STREQUAL "")
+            set(compile_checks "${check}")  # first candidate is tried on its own
+        else()
+            set(compile_checks "${available_checks},${check}")  # candidate is tried together with everything accepted so far
+        endif()
+
+        set(CMAKE_REQUIRED_FLAGS -fsanitize=${compile_checks})
+
+        check_c_source_compiles("int main() { return 0; }" HAVE_SANITIZER_${check}
+            FAIL_REGEX "not supported|unrecognized command|unknown option")
+
+        set(CMAKE_REQUIRED_FLAGS)  # reset so the probe flags do not affect later checks
+
+        if(HAVE_SANITIZER_${check})
+            set(available_checks ${compile_checks})  # keep the candidate; later probes build on this list
+        endif()
+    endforeach()
+
+    set(${supported_checks} ${available_checks})  # empty when no candidate was accepted
+endmacro()
+
+macro(add_address_sanitizer)  # Enable address sanitizer checks, then the leak sanitizer unless an emulator is configured
+    set(known_checks
+        address
+        pointer-compare
+        pointer-subtract
+        )
+
+    check_sanitizer_support("${known_checks}" supported_checks)
+    if(NOT "${supported_checks}" STREQUAL "")  # quoted: stays a well-formed comparison when the list is empty
+        message(STATUS "Address sanitizer is enabled: ${supported_checks}")
+        add_compile_options(-fsanitize=${supported_checks})
+        add_link_options(-fsanitize=${supported_checks})
+        add_common_sanitizer_flags()
+    else()
+        message(STATUS "Address sanitizer is not supported")
+    endif()
+
+    if(CMAKE_CROSSCOMPILING_EMULATOR)
+        # Only check for leak sanitizer if not cross-compiling due to qemu crash
+        message(WARNING "Leak sanitizer is not supported when cross compiling")
+    else()
+        # Leak sanitizer requires address sanitizer
+        check_sanitizer_support("leak" supported_checks)  # overwrites supported_checks with the leak result
+        if(NOT "${supported_checks}" STREQUAL "")  # quoted: value is compared as a string, never as a variable name
+            message(STATUS "Leak sanitizer is enabled: ${supported_checks}")
+            add_compile_options(-fsanitize=${supported_checks})
+            add_link_options(-fsanitize=${supported_checks})
+            add_common_sanitizer_flags()
+        else()
+            message(STATUS "Leak sanitizer is not supported")
+        endif()
+    endif()
+endmacro()
+
+macro(add_memory_sanitizer)  # Enable the memory sanitizer and, when the compiler accepts it, origin tracking
+    check_sanitizer_support("memory" supported_checks)
+    if(NOT "${supported_checks}" STREQUAL "")  # quoted: stays a well-formed comparison when the result is empty
+        message(STATUS "Memory sanitizer is enabled: ${supported_checks}")
+        add_compile_options(-fsanitize=${supported_checks})
+        add_link_options(-fsanitize=${supported_checks})
+        add_common_sanitizer_flags()
+
+        check_c_compiler_flag(-fsanitize-memory-track-origins HAVE_MEMORY_TRACK_ORIGINS)  # optional extra, probed separately
+        if(HAVE_MEMORY_TRACK_ORIGINS)
+            add_compile_options(-fsanitize-memory-track-origins)
+            add_link_options(-fsanitize-memory-track-origins)
+        endif()
+    else()
+        message(STATUS "Memory sanitizer is not supported")
+    endif()
+endmacro()
+
+macro(add_thread_sanitizer)  # Enable the thread sanitizer when the compiler accepts -fsanitize=thread
+    check_sanitizer_support("thread" supported_checks)
+    if(NOT "${supported_checks}" STREQUAL "")  # quoted: stays a well-formed comparison when the result is empty
+        message(STATUS "Thread sanitizer is enabled: ${supported_checks}")
+        add_compile_options(-fsanitize=${supported_checks})
+        add_link_options(-fsanitize=${supported_checks})
+        add_common_sanitizer_flags()
+    else()
+        message(STATUS "Thread sanitizer is not supported")
+    endif()
+endmacro()
+
+macro(add_undefined_sanitizer)  # Enable every undefined-behavior check from the list below that the compiler accepts
+    set(known_checks
+        array-bounds
+        bool
+        bounds
+        builtin
+        enum
+        float-cast-overflow
+        float-divide-by-zero
+        function
+        integer-divide-by-zero
+        local-bounds
+        null
+        nonnull-attribute
+        pointer-overflow
+        return
+        returns-nonnull-attribute
+        shift
+        shift-base
+        shift-exponent
+        signed-integer-overflow
+        undefined
+        unsigned-integer-overflow
+        unsigned-shift-base
+        vla-bound
+        vptr
+        )
+
+    # Only check for alignment sanitizer flag if unaligned access is not supported
+    if(NOT WITH_UNALIGNED)
+        list(APPEND known_checks alignment)
+    endif()
+    # Object size sanitizer has no effect at -O0 and produces compiler warning if enabled
+    if(NOT CMAKE_C_FLAGS MATCHES "-O0")
+        list(APPEND known_checks object-size)
+    endif()
+
+    check_sanitizer_support("${known_checks}" supported_checks)
+
+    if(NOT "${supported_checks}" STREQUAL "")  # quoted: stays a well-formed comparison when no check was accepted
+        message(STATUS "Undefined behavior sanitizer is enabled: ${supported_checks}")
+        add_compile_options(-fsanitize=${supported_checks})
+        add_link_options(-fsanitize=${supported_checks})
+
+        # Group sanitizer flag -fsanitize=undefined will automatically add alignment, even if
+        # it is not in our sanitize flag list, so we need to explicitly disable alignment sanitizing.
+        if(WITH_UNALIGNED)
+            add_compile_options(-fno-sanitize=alignment)
+        endif()
+
+        add_common_sanitizer_flags()
+    else()
+        message(STATUS "Undefined behavior sanitizer is not supported")
+    endif()
+endmacro()
--- a/3rdparty/zlib-ng/cpu_features.h
+++ b/3rdparty/zlib-ng/cpu_features.h
@ -6,12 +6,10 @@
 #ifndef CPU_FEATURES_H_
 #define CPU_FEATURES_H_

-#include "adler32_fold.h"
-#include "crc32_fold.h"
+#ifndef DISABLE_RUNTIME_CPU_DETECTION

 #if defined(X86_FEATURES)
 #  include "arch/x86/x86_features.h"
-#  include "fallback_builtins.h"
 #elif defined(ARM_FEATURES)
 #  include "arch/arm/arm_features.h"
 #elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
@ -38,266 +36,8 @@ struct cpu_features {
 #endif
 };

-extern void cpu_check_features(struct cpu_features *features);
+void cpu_check_features(struct cpu_features *features);

-/* adler32 */
-typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len);
-
-extern uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);
-#ifdef ARM_NEON
-extern uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
-#endif
-#ifdef PPC_VMX
-extern uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
-#endif
-#ifdef RISCV_RVV
-extern uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len);
-#endif
-#ifdef X86_SSSE3
-extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
-#endif
-#ifdef X86_AVX2
-extern uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
-#endif
-#ifdef X86_AVX512
-extern uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
-#endif
-#ifdef X86_AVX512VNNI
-extern uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
-#endif
-#ifdef POWER8_VSX
-extern uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
-#endif
-
-/* adler32 folding */
-#ifdef RISCV_RVV
-extern uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-#endif
-#ifdef X86_SSE42
-extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-#endif
-#ifdef X86_AVX2
-extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-#endif
-#ifdef X86_AVX512
-extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-#endif
-#ifdef X86_AVX512VNNI
-extern uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-#endif
-
-/* CRC32 folding */
-#ifdef X86_PCLMULQDQ_CRC
-extern uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc);
-extern void     crc32_fold_pclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
-extern void     crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
-extern uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc);
-extern uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
-#endif
-#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
-extern uint32_t crc32_fold_vpclmulqdq_reset(crc32_fold *crc);
-extern void     crc32_fold_vpclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
-extern void     crc32_fold_vpclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
-extern uint32_t crc32_fold_vpclmulqdq_final(crc32_fold *crc);
-extern uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
-#endif
-
-/* memory chunking */
-extern uint32_t chunksize_c(void);
-extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
-#ifdef X86_SSE2
-extern uint32_t chunksize_sse2(void);
-extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
-#endif
-#ifdef X86_SSSE3
-extern uint8_t* chunkmemset_safe_ssse3(uint8_t *out, unsigned dist, unsigned len, unsigned left);
-#endif
-#ifdef X86_AVX2
-extern uint32_t chunksize_avx2(void);
-extern uint8_t* chunkmemset_safe_avx2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
-#endif
-#ifdef ARM_NEON
-extern uint32_t chunksize_neon(void);
-extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
-#endif
-#ifdef POWER8_VSX
-extern uint32_t chunksize_power8(void);
-extern uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left);
-#endif
-#ifdef RISCV_RVV
-extern uint32_t chunksize_rvv(void);
-extern uint8_t* chunkmemset_safe_rvv(uint8_t *out, unsigned dist, unsigned len, unsigned left);
-#endif
-
-#ifdef ZLIB_COMPAT
-typedef struct z_stream_s z_stream;
-#else
-typedef struct zng_stream_s zng_stream;
-#endif
-
-/* inflate fast loop */
-extern void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
-#ifdef X86_SSE2
-extern void inflate_fast_sse2(PREFIX3(stream) *strm, uint32_t start);
-#endif
-#ifdef X86_SSSE3
-extern void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
-#endif
-#ifdef X86_AVX2
-extern void inflate_fast_avx2(PREFIX3(stream) *strm, uint32_t start);
-#endif
-#ifdef ARM_NEON
-extern void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start);
-#endif
-#ifdef POWER8_VSX
-extern void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start);
-#endif
-#ifdef RISCV_RVV
-extern void inflate_fast_rvv(PREFIX3(stream) *strm, uint32_t start);
-#endif
-
-/* CRC32 */
-typedef uint32_t (*crc32_func)(uint32_t crc32, const uint8_t *buf, size_t len);
-
-extern uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
-#ifdef ARM_ACLE
-extern uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len);
-#elif defined(POWER8_VSX)
-extern uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len);
-#elif defined(S390_CRC32_VX)
-extern uint32_t crc32_s390_vx(uint32_t crc, const uint8_t *buf, size_t len);
-#endif
-
-/* compare256 */
-typedef uint32_t (*compare256_func)(const uint8_t *src0, const uint8_t *src1);
-
-extern uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
-#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
-extern uint32_t compare256_unaligned_16(const uint8_t *src0, const uint8_t *src1);
-#ifdef HAVE_BUILTIN_CTZ
-extern uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1);
-#endif
-#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
-extern uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1);
-#endif
-#endif
-#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
-extern uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
-#endif
-#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
-extern uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
-#endif
-#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
-extern uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
-#endif
-#ifdef POWER9
-extern uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1);
-#endif
-#ifdef RISCV_RVV
-extern uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1);
-#endif
-
-#ifdef DEFLATE_H_
-/* insert_string */
-extern void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count);
-#ifdef X86_SSE42
-extern void insert_string_sse42(deflate_state *const s, const uint32_t str, uint32_t count);
-#elif defined(ARM_ACLE)
-extern void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count);
-#endif
-
-/* longest_match */
-extern uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
-#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
-extern uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
-#ifdef HAVE_BUILTIN_CTZ
-extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
-#endif
-#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
-extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
-#endif
-#endif
-#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
-extern uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match);
-#endif
-#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
-extern uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match);
-#endif
-#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
-extern uint32_t longest_match_neon(deflate_state *const s, Pos cur_match);
-#endif
-#ifdef POWER9
-extern uint32_t longest_match_power9(deflate_state *const s, Pos cur_match);
-#endif
-#ifdef RISCV_RVV
-extern uint32_t longest_match_rvv(deflate_state *const s, Pos cur_match);
-#endif
-
-/* longest_match_slow */
-extern uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
-#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
-extern uint32_t longest_match_slow_unaligned_16(deflate_state *const s, Pos cur_match);
-extern uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_match);
-#ifdef UNALIGNED64_OK
-extern uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match);
-#endif
-#endif
-#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
-extern uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match);
-#endif
-#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
-extern uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match);
-#endif
-#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
-extern uint32_t longest_match_slow_neon(deflate_state *const s, Pos cur_match);
-#endif
-#ifdef POWER9
-extern uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match);
-#endif
-#ifdef RISCV_RVV
-extern uint32_t longest_match_slow_rvv(deflate_state *const s, Pos cur_match);
-#endif
-
-/* quick_insert_string */
-extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str);
-#ifdef X86_SSE42
-extern Pos quick_insert_string_sse42(deflate_state *const s, const uint32_t str);
-#elif defined(ARM_ACLE)
-extern Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str);
-#endif
-
-/* slide_hash */
-typedef void (*slide_hash_func)(deflate_state *s);
-
-#ifdef X86_SSE2
-extern void slide_hash_sse2(deflate_state *s);
-#endif
-#if defined(ARM_SIMD)
-extern void slide_hash_armv6(deflate_state *s);
-#endif
-#if defined(ARM_NEON)
-extern void slide_hash_neon(deflate_state *s);
-#endif
-#if defined(PPC_VMX)
-extern void slide_hash_vmx(deflate_state *s);
-#endif
-#if defined(POWER8_VSX)
-extern void slide_hash_power8(deflate_state *s);
-#endif
-#if defined(RISCV_RVV)
-extern void slide_hash_rvv(deflate_state *s);
-#endif
-#ifdef X86_AVX2
-extern void slide_hash_avx2(deflate_state *s);
-#endif
-
-/* update_hash */
-extern uint32_t update_hash_c(deflate_state *const s, uint32_t h, uint32_t val);
-#ifdef X86_SSE42
-extern uint32_t update_hash_sse42(deflate_state *const s, uint32_t h, uint32_t val);
-#elif defined(ARM_ACLE)
-extern uint32_t update_hash_acle(deflate_state *const s, uint32_t h, uint32_t val);
-#endif
 #endif

 #endif
--- a/3rdparty/zlib-ng/crc32.c
+++ b/3rdparty/zlib-ng/crc32.c
@ -0,0 +1,42 @@
+/* crc32.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2022 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * This interleaved implementation of a CRC makes use of pipelined multiple
+ * arithmetic-logic units, commonly found in modern CPU cores. It is due to
+ * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
+ */
+
+#include "zbuild.h"
+#include "functable.h"
+#include "crc32_braid_tbl.h"
+
+/* =========================================================================
+ * Return the address of crc_table, reinterpreted as a flat array of
+ * const uint32_t. crc_table is not declared in this file (presumably it comes
+ * from crc32_braid_tbl.h, included above - confirm).
+ */
+
+const uint32_t * Z_EXPORT PREFIX(get_crc_table)(void) {
+    return (const uint32_t *)crc_table;
+}
+
+#ifdef ZLIB_COMPAT /* this variant carries the CRC as unsigned long */
+unsigned long Z_EXPORT PREFIX(crc32_z)(unsigned long crc, const unsigned char *buf, size_t len) {
+    if (buf == NULL) return 0; /* a NULL buffer yields 0 regardless of crc and len */
+
+    return (unsigned long)FUNCTABLE_CALL(crc32)((uint32_t)crc, buf, len); /* crc is narrowed to 32 bits before the crc32 entry behind FUNCTABLE_CALL is invoked */
+}
+#else /* this variant carries the CRC as uint32_t, so no casts are needed */
+uint32_t Z_EXPORT PREFIX(crc32_z)(uint32_t crc, const unsigned char *buf, size_t len) {
+    if (buf == NULL) return 0; /* a NULL buffer yields 0 regardless of crc and len */
+
+    return FUNCTABLE_CALL(crc32)(crc, buf, len); /* same call as the ZLIB_COMPAT variant, without conversions */
+}
+#endif
+
+#ifdef ZLIB_COMPAT /* this variant takes unsigned long crc and unsigned int len */
+unsigned long Z_EXPORT PREFIX(crc32)(unsigned long crc, const unsigned char *buf, unsigned int len) {
+    return (unsigned long)PREFIX(crc32_z)((uint32_t)crc, buf, len); /* forwards to crc32_z; crc is truncated to 32 bits first, so the NULL-buffer behaviour is inherited */
+}
+#else /* this variant takes uint32_t crc and uint32_t len */
+uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t len) {
+    return PREFIX(crc32_z)(crc, buf, len); /* forwards to crc32_z; len is converted to its size_t parameter */
+}
+#endif
--- a/3rdparty/zlib-ng/crc32.h
+++ b/3rdparty/zlib-ng/crc32.h
@ -0,0 +1,16 @@
+/* crc32.h -- crc32 folding interface
+ * Copyright (C) 2021 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifndef CRC32_H_
+#define CRC32_H_
+
+#define CRC32_FOLD_BUFFER_SIZE (16 * 4)
+/* sizeof(__m128i) * (4 folds) */
+
+typedef struct crc32_fold_s {
+    uint8_t fold[CRC32_FOLD_BUFFER_SIZE]; /* raw byte storage, CRC32_FOLD_BUFFER_SIZE (16 * 4) bytes */
+    uint32_t value; /* NOTE(review): presumably the running CRC kept alongside the fold buffer - confirm against the crc32_fold_* implementations */
+} crc32_fold;
+
+#endif
--- a/3rdparty/zlib-ng/crc32_braid_comb.c
+++ b/3rdparty/zlib-ng/crc32_braid_comb.c
@ -7,7 +7,6 @@
 * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
 */

-#include "zbuild.h"
 #include "zutil.h"
 #include "crc32_braid_p.h"
 #include "crc32_braid_tbl.h"
--- a/3rdparty/zlib-ng/crc32_braid_p.h
+++ b/3rdparty/zlib-ng/crc32_braid_p.h
@ -1,7 +1,6 @@
 #ifndef CRC32_BRAID_P_H_
 #define CRC32_BRAID_P_H_

-#include "zbuild.h"
 #include "zendian.h"

 /* Define N */
@ -25,7 +24,7 @@
 #  endif
 #else
 #  ifndef W
-#    if defined(__x86_64__) || defined(__aarch64__) || defined(__powerpc64__)
+#    if defined(__x86_64__) || defined(_M_AMD64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__powerpc64__)
 #      define W 8
 #    else
 #      define W 4
@ -42,9 +41,24 @@
 #  endif
 #endif

+#if BYTE_ORDER == LITTLE_ENDIAN
+#  define ZSWAPWORD(word) (word)
+#  define BRAID_TABLE crc_braid_table
+#elif BYTE_ORDER == BIG_ENDIAN
+#  if W == 8
+#    define ZSWAPWORD(word) ZSWAP64(word)
+#  elif W == 4
+#    define ZSWAPWORD(word) ZSWAP32(word)
+#  endif
+#  define BRAID_TABLE crc_braid_big_table
+#else
+#  error "No endian defined"
+#endif
+
+#define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8)
+#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
+
 /* CRC polynomial. */
 #define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */

-extern uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
-
 #endif /* CRC32_BRAID_P_H_ */
--- a/3rdparty/zlib-ng/crc32_fold.h
+++ b/3rdparty/zlib-ng/crc32_fold.h
@ -1,21 +0,0 @@
-/* crc32_fold.h -- crc32 folding interface
- * Copyright (C) 2021 Nathan Moinvaziri
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-#ifndef CRC32_FOLD_H_
-#define CRC32_FOLD_H_
-
-#define CRC32_FOLD_BUFFER_SIZE (16 * 4)
-/* sizeof(__m128i) * (4 folds) */
-
-typedef struct crc32_fold_s {
-    uint8_t fold[CRC32_FOLD_BUFFER_SIZE];
-    uint32_t value;
-} crc32_fold;
-
-Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc);
-Z_INTERNAL void     crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
-Z_INTERNAL void     crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
-Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc);
-
-#endif
--- a/3rdparty/zlib-ng/deflate.c
+++ b/3rdparty/zlib-ng/deflate.c
@ -1,5 +1,5 @@
 /* deflate.c -- compress data using the deflation algorithm
- * Copyright (C) 1995-2023 Jean-loup Gailly and Mark Adler
+ * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

@ -58,7 +58,7 @@
 # undef deflateInit2
 #endif

-const char PREFIX(deflate_copyright)[] = " deflate 1.3.0 Copyright 1995-2023 Jean-loup Gailly and Mark Adler ";
+const char PREFIX(deflate_copyright)[] = " deflate 1.3.1 Copyright 1995-2024 Jean-loup Gailly and Mark Adler ";
 /*
  If you use the zlib library in a product, an acknowledgment is welcome
  in the documentation of your product. If for some reason you cannot
@ -71,14 +71,16 @@ const char PREFIX(deflate_copyright)[] = " deflate 1.3.0 Copyright 1995-2023 Jea
 */
 #ifdef S390_DFLTCC_DEFLATE
 #  include "arch/s390/dfltcc_deflate.h"
+/* DFLTCC instructions require window to be page-aligned */
+#  define PAD_WINDOW            PAD_4096
+#  define WINDOW_PAD_SIZE       4096
+#  define HINT_ALIGNED_WINDOW   HINT_ALIGNED_4096
 #else
-/* Memory management for the deflate state. Useful for allocating arch-specific extension blocks. */
-#  define ZALLOC_DEFLATE_STATE(strm) ((deflate_state *)ZALLOC(strm, 1, sizeof(deflate_state)))
-#  define ZFREE_STATE(strm, addr) ZFREE(strm, addr)
-#  define ZCOPY_DEFLATE_STATE(dst, src) memcpy(dst, src, sizeof(deflate_state))
-/* Memory management for the window. Useful for allocation the aligned window. */
-#  define ZALLOC_WINDOW(strm, items, size) ZALLOC(strm, items, size)
-#  define TRY_FREE_WINDOW(strm, addr) TRY_FREE(strm, addr)
+#  define PAD_WINDOW            PAD_64
+#  define WINDOW_PAD_SIZE       64
+#  define HINT_ALIGNED_WINDOW   HINT_ALIGNED_64
+/* Adjust the window size for the arch-specific deflate code. */
+#  define DEFLATE_ADJUST_WINDOW_SIZE(n) (n)
 /* Invoked at the beginning of deflateSetDictionary(). Useful for checking arch-specific window data. */
 #  define DEFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) do {} while (0)
 /* Invoked at the beginning of deflateGetDictionary(). Useful for adjusting arch-specific window data. */
@ -120,10 +122,6 @@ static void lm_set_level         (deflate_state *s, int level);
 static void lm_init              (deflate_state *s);
 Z_INTERNAL unsigned read_buf  (PREFIX3(stream) *strm, unsigned char *buf, unsigned size);

-extern uint32_t update_hash_roll        (deflate_state *const s, uint32_t h, uint32_t val);
-extern void     insert_string_roll      (deflate_state *const s, uint32_t str, uint32_t count);
-extern Pos      quick_insert_string_roll(deflate_state *const s, uint32_t str);
-
 /* ===========================================================================
 * Local data
 */
@ -185,17 +183,111 @@ static const config configuration_table[10] = {
    memset((unsigned char *)s->head, 0, HASH_SIZE * sizeof(*s->head)); \
  } while (0)

-/* ========================================================================= */
-/* This function is hidden in ZLIB_COMPAT builds. */
+
+#ifdef DEF_ALLOC_DEBUG
+#  include <stdio.h>
+#  define LOGSZ(name,size)           fprintf(stderr, "%s is %d bytes\n", name, size)
+#  define LOGSZP(name,size,loc,pad)  fprintf(stderr, "%s is %d bytes, offset %d, padded %d\n", name, size, loc, pad)
+#  define LOGSZPL(name,size,loc,pad) fprintf(stderr, "%s is %d bytes, offset %ld, padded %d\n", name, size, loc, pad)
+#else
+#  define LOGSZ(name,size)
+#  define LOGSZP(name,size,loc,pad)
+#  define LOGSZPL(name,size,loc,pad)
+#endif
+
+/* ===========================================================================
+ * Allocate a big buffer and divide it up into the various buffers deflate needs.
+ * Handles alignment of allocated buffer and alignment of individual buffers.
+ *
+ * One strm->zalloc() call provides, in this order: window, prev, head,
+ * pending_buf, the deflate_state and the deflate_allocs record itself. The
+ * window offset is padded with PAD_WINDOW (WINDOW_PAD_SIZE is 64, or 4096 when
+ * S390_DFLTCC_DEFLATE is defined), prev/head/pending_buf/state with PAD_64 and
+ * the record with PAD_16. The pointer returned by zalloc and strm->zfree are
+ * saved in the record so the whole allocation can later be handed back with a
+ * single zfree() call.
+ *
+ * strm        - stream supplying zalloc, zfree and opaque
+ * windowBits  - log2 of the window size; the window gets (1 << windowBits) * 2
+ *               bytes (before DEFLATE_ADJUST_WINDOW_SIZE) and prev gets
+ *               (1 << windowBits) Pos entries
+ * lit_bufsize - pending_buf gets lit_bufsize * LIT_BUFS bytes
+ *
+ * Returns the record describing the sub-buffers, or NULL when zalloc fails.
+ * Of the sub-buffers only prev is cleared here. Sizes are accumulated in plain
+ * int with no overflow checks, so callers must pass validated arguments.
+ */
+Z_INTERNAL deflate_allocs* alloc_deflate(PREFIX3(stream) *strm, int windowBits, int lit_bufsize) {
+    int curr_size = 0;
+
+    /* Define sizes */
+    int window_size = DEFLATE_ADJUST_WINDOW_SIZE((1 << windowBits) * 2);
+    int prev_size = (1 << windowBits) * sizeof(Pos);
+    int head_size = HASH_SIZE * sizeof(Pos);
+    int pending_size = lit_bufsize * LIT_BUFS;
+    int state_size = sizeof(deflate_state);
+    int alloc_size = sizeof(deflate_allocs);
+
+    /* Calculate relative buffer positions and paddings.
+     * Each step pads the running offset, records it as the sub-buffer position,
+     * then advances the offset by that sub-buffer's size. The LOGSZP calls
+     * expand to nothing unless DEF_ALLOC_DEBUG is defined. */
+    LOGSZP("window", window_size, PAD_WINDOW(curr_size), PADSZ(curr_size,WINDOW_PAD_SIZE));
+    int window_pos = PAD_WINDOW(curr_size);
+    curr_size = window_pos + window_size;
+
+    LOGSZP("prev", prev_size, PAD_64(curr_size), PADSZ(curr_size,64));
+    int prev_pos = PAD_64(curr_size);
+    curr_size = prev_pos + prev_size;
+
+    LOGSZP("head", head_size, PAD_64(curr_size), PADSZ(curr_size,64));
+    int head_pos = PAD_64(curr_size);
+    curr_size = head_pos + head_size;
+
+    LOGSZP("pending", pending_size, PAD_64(curr_size), PADSZ(curr_size,64));
+    int pending_pos = PAD_64(curr_size);
+    curr_size = pending_pos + pending_size;
+
+    LOGSZP("state", state_size, PAD_64(curr_size), PADSZ(curr_size,64));
+    int state_pos = PAD_64(curr_size);
+    curr_size = state_pos + state_size;
+
+    LOGSZP("alloc", alloc_size, PAD_16(curr_size), PADSZ(curr_size,16));
+    int alloc_pos = PAD_16(curr_size);
+    curr_size = alloc_pos + alloc_size;
+
+    /* Add 64-1 or 4096-1 to allow window alignment, and round size of buffer up to multiple of 64 */
+    int total_size = PAD_64(curr_size + (WINDOW_PAD_SIZE - 1));
+
+    /* Allocate the raw buffer. Its start is aligned just below with PAD_WINDOW.
+     * No code in this function zero-fills the buffer as a whole; only prev is
+     * cleared further down, so the other sub-buffers hold whatever zalloc returned. */
+    char *original_buf = strm->zalloc(strm->opaque, 1, total_size);
+    if (original_buf == NULL)
+        return NULL;
+
+    char *buff = (char *)HINT_ALIGNED_WINDOW((char *)PAD_WINDOW(original_buf)); /* aligned base that all *_pos offsets are relative to */
+    LOGSZPL("Buffer alloc", total_size, PADSZ((uintptr_t)original_buf,WINDOW_PAD_SIZE), PADSZ(curr_size,WINDOW_PAD_SIZE));
+
+    /* Initialize alloc_bufs. The record lives inside the allocation it describes;
+     * buf_start keeps the unaligned pointer because that is what zfree must receive. */
+    deflate_allocs *alloc_bufs  = (struct deflate_allocs_s *)(buff + alloc_pos);
+    alloc_bufs->buf_start = (char *)original_buf;
+    alloc_bufs->zfree = strm->zfree;
+
+    /* Assign buffers */
+    alloc_bufs->window = (unsigned char *)HINT_ALIGNED_WINDOW(buff + window_pos);
+    alloc_bufs->prev = (Pos *)HINT_ALIGNED_64(buff + prev_pos);
+    alloc_bufs->head = (Pos *)HINT_ALIGNED_64(buff + head_pos);
+    alloc_bufs->pending_buf = (unsigned char *)HINT_ALIGNED_64(buff + pending_pos);
+    alloc_bufs->state = (deflate_state *)HINT_ALIGNED_16(buff + state_pos); /* state_pos is PAD_64-aligned; the hint used here only promises 16 */
+
+    memset((char *)alloc_bufs->prev, 0, prev_size);
+
+    return alloc_bufs;
+}
+
+/* ===========================================================================
+ * Free all allocated deflate buffers
+ *
+ * Hands the allocation recorded in state->alloc_bufs back by calling the zfree
+ * pointer saved in that record on its buf_start (with strm->opaque), then sets
+ * strm->state to NULL. When state->alloc_bufs is NULL nothing is freed and
+ * strm->state is left untouched. strm->state is dereferenced without a check,
+ * so it must be non-NULL on entry.
+ */
+static inline void free_deflate(PREFIX3(stream) *strm) {
+    deflate_state *state = (deflate_state *)strm->state;
+
+    if (state->alloc_bufs != NULL) {
+        deflate_allocs *alloc_bufs = state->alloc_bufs;
+        alloc_bufs->zfree(strm->opaque, alloc_bufs->buf_start); /* alloc_bufs and state point into this buffer; neither may be touched afterwards */
+        strm->state = NULL;
+    }
+}
+
+/* ===========================================================================
+ * Initialize deflate state and buffers.
+ * This function is hidden in ZLIB_COMPAT builds.
+ */
 int32_t ZNG_CONDEXPORT PREFIX(deflateInit2)(PREFIX3(stream) *strm, int32_t level, int32_t method, int32_t windowBits,
                                            int32_t memLevel, int32_t strategy) {
    /* Todo: ignore strm->next_in if we use it as window */
-    uint32_t window_padding = 0;
    deflate_state *s;
    int wrap = 1;

-    /* Force initialization functable, because deflate captures function pointers from functable. */
-    functable.force_init();
+    /* Initialize functable */
+    FUNCTABLE_INIT;

    if (strm == NULL)
        return Z_STREAM_ERROR;
@ -230,9 +322,19 @@ int32_t ZNG_CONDEXPORT PREFIX(deflateInit2)(PREFIX3(stream) *strm, int32_t level
    if (windowBits == 8)
        windowBits = 9;  /* until 256-byte window bug fixed */

-    s = ZALLOC_DEFLATE_STATE(strm);
-    if (s == NULL)
+    /* Allocate buffers */
+    int lit_bufsize = 1 << (memLevel + 6);
+    deflate_allocs *alloc_bufs = alloc_deflate(strm, windowBits, lit_bufsize);
+    if (alloc_bufs == NULL)
        return Z_MEM_ERROR;
+
+    s = alloc_bufs->state;
+    s->alloc_bufs = alloc_bufs;
+    s->window = alloc_bufs->window;
+    s->prev = alloc_bufs->prev;
+    s->head = alloc_bufs->head;
+    s->pending_buf = alloc_bufs->pending_buf;
+
    strm->state = (struct internal_state *)s;
    s->strm = strm;
    s->status = INIT_STATE;     /* to pass state test in deflateReset() */
@ -243,18 +345,9 @@ int32_t ZNG_CONDEXPORT PREFIX(deflateInit2)(PREFIX3(stream) *strm, int32_t level
    s->w_size = 1 << s->w_bits;
    s->w_mask = s->w_size - 1;

-#ifdef X86_PCLMULQDQ_CRC
-    window_padding = 8;
-#endif
-
-    s->window = (unsigned char *) ZALLOC_WINDOW(strm, s->w_size + window_padding, 2*sizeof(unsigned char));
-    s->prev   = (Pos *)  ZALLOC(strm, s->w_size, sizeof(Pos));
-    memset(s->prev, 0, s->w_size * sizeof(Pos));
-    s->head   = (Pos *)  ZALLOC(strm, HASH_SIZE, sizeof(Pos));
-
    s->high_water = 0;      /* nothing written to s->window yet */

-    s->lit_bufsize = 1 << (memLevel + 6); /* 16K elements by default */
+    s->lit_bufsize = lit_bufsize; /* 16K elements by default */

    /* We overlay pending_buf and sym_buf. This works since the average size
     * for length/distance pairs over any compressed block is assured to be 31
@ -295,7 +388,6 @@ int32_t ZNG_CONDEXPORT PREFIX(deflateInit2)(PREFIX3(stream) *strm, int32_t level
     * symbols from which it is being constructed.
     */

-    s->pending_buf = (unsigned char *) ZALLOC(strm, s->lit_bufsize, 4);
    s->pending_buf_size = s->lit_bufsize * 4;

    if (s->window == NULL || s->prev == NULL || s->head == NULL || s->pending_buf == NULL) {
@ -304,8 +396,15 @@ int32_t ZNG_CONDEXPORT PREFIX(deflateInit2)(PREFIX3(stream) *strm, int32_t level
        PREFIX(deflateEnd)(strm);
        return Z_MEM_ERROR;
    }
+
+#ifdef LIT_MEM
+    s->d_buf = (uint16_t *)(s->pending_buf + (s->lit_bufsize << 1));
+    s->l_buf = s->pending_buf + (s->lit_bufsize << 2);
+    s->sym_end = s->lit_bufsize - 1;
+#else
    s->sym_buf = s->pending_buf + s->lit_bufsize;
    s->sym_end = (s->lit_bufsize - 1) * 3;
+#endif
    /* We avoid equality with lit_bufsize*3 because of wraparound at 64K
     * on 16 bit machines and because stored blocks are restricted to
     * 64K-1 bytes.
@ -348,7 +447,7 @@ static int deflateStateCheck(PREFIX3(stream) *strm) {
    if (strm == NULL || strm->zalloc == (alloc_func)0 || strm->zfree == (free_func)0)
        return 1;
    s = strm->state;
-    if (s == NULL || s->strm != strm || (s->status < INIT_STATE || s->status > MAX_STATE))
+    if (s == NULL || s->alloc_bufs == NULL || s->strm != strm || (s->status < INIT_STATE || s->status > MAX_STATE))
        return 1;
    return 0;
 }
@ -370,7 +469,7 @@ int32_t Z_EXPORT PREFIX(deflateSetDictionary)(PREFIX3(stream) *strm, const uint8

    /* when using zlib wrappers, compute Adler-32 for provided dictionary */
    if (wrap == 1)
-        strm->adler = functable.adler32(strm->adler, dictionary, dictLength);
+        strm->adler = FUNCTABLE_CALL(adler32)(strm->adler, dictionary, dictLength);
    DEFLATE_SET_DICTIONARY_HOOK(strm, dictionary, dictLength);  /* hook for IBM Z DFLTCC */
    s->wrap = 0;                    /* avoid computing Adler-32 in read_buf */

@ -457,7 +556,7 @@ int32_t Z_EXPORT PREFIX(deflateResetKeep)(PREFIX3(stream) *strm) {

 #ifdef GZIP
    if (s->wrap == 2) {
-        strm->adler = functable.crc32_fold_reset(&s->crc_fold);
+        strm->adler = FUNCTABLE_CALL(crc32_fold_reset)(&s->crc_fold);
    } else
 #endif
        strm->adler = ADLER32_INITIAL_VALUE;
@ -506,9 +605,17 @@ int32_t Z_EXPORT PREFIX(deflatePrime)(PREFIX3(stream) *strm, int32_t bits, int32
    if (deflateStateCheck(strm))
        return Z_STREAM_ERROR;
    s = strm->state;
+
+#ifdef LIT_MEM
+    if (bits < 0 || bits > BIT_BUF_SIZE ||
+        (unsigned char *)s->d_buf < s->pending_out + ((BIT_BUF_SIZE + 7) >> 3))
+        return Z_BUF_ERROR;
+#else
    if (bits < 0 || bits > BIT_BUF_SIZE || bits > (int32_t)(sizeof(value) << 3) ||
        s->sym_buf < s->pending_out + ((BIT_BUF_SIZE + 7) >> 3))
        return Z_BUF_ERROR;
+#endif
+
    do {
        put = BIT_BUF_SIZE - s->bi_valid;
        put = MIN(put, bits);
@ -555,7 +662,7 @@ int32_t Z_EXPORT PREFIX(deflateParams)(PREFIX3(stream) *strm, int32_t level, int
    if (s->level != level) {
        if (s->level == 0 && s->matches != 0) {
            if (s->matches == 1) {
-                functable.slide_hash(s);
+                FUNCTABLE_CALL(slide_hash)(s);
            } else {
                CLEAR_HASH(s);
            }
@ -794,7 +901,7 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) {
 #ifdef GZIP
    if (s->status == GZIP_STATE) {
        /* gzip header */
-        functable.crc32_fold_reset(&s->crc_fold);
+        FUNCTABLE_CALL(crc32_fold_reset)(&s->crc_fold);
        put_byte(s, 31);
        put_byte(s, 139);
        put_byte(s, 8);
@ -911,7 +1018,7 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) {
                }
            }
            put_short(s, (uint16_t)strm->adler);
-            functable.crc32_fold_reset(&s->crc_fold);
+            FUNCTABLE_CALL(crc32_fold_reset)(&s->crc_fold);
        }
        s->status = BUSY_STATE;

@ -982,7 +1089,7 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) {
    /* Write the trailer */
 #ifdef GZIP
    if (s->wrap == 2) {
-        strm->adler = functable.crc32_fold_final(&s->crc_fold);
+        strm->adler = FUNCTABLE_CALL(crc32_fold_final)(&s->crc_fold);

        put_uint32(s, strm->adler);
        put_uint32(s, (uint32_t)strm->total_in);
@ -1007,21 +1114,13 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) {

 /* ========================================================================= */
 int32_t Z_EXPORT PREFIX(deflateEnd)(PREFIX3(stream) *strm) {
-    int32_t status;
-
    if (deflateStateCheck(strm))
        return Z_STREAM_ERROR;

-    status = strm->state->status;
-
-    /* Deallocate in reverse order of allocations: */
-    TRY_FREE(strm, strm->state->pending_buf);
-    TRY_FREE(strm, strm->state->head);
-    TRY_FREE(strm, strm->state->prev);
-    TRY_FREE_WINDOW(strm, strm->state->window);
+    int32_t status = strm->state->status;

-    ZFREE_STATE(strm, strm->state);
-    strm->state = NULL;
+    /* Free allocated buffers */
+    free_deflate(strm);

    return status == BUSY_STATE ? Z_DATA_ERROR : Z_OK;
 }
@ -1032,7 +1131,6 @@ int32_t Z_EXPORT PREFIX(deflateEnd)(PREFIX3(stream) *strm) {
 int32_t Z_EXPORT PREFIX(deflateCopy)(PREFIX3(stream) *dest, PREFIX3(stream) *source) {
    deflate_state *ds;
    deflate_state *ss;
-    uint32_t window_padding = 0;

    if (deflateStateCheck(source) || dest == NULL)
        return Z_STREAM_ERROR;
@ -1041,34 +1139,39 @@ int32_t Z_EXPORT PREFIX(deflateCopy)(PREFIX3(stream) *dest, PREFIX3(stream) *sou

    memcpy((void *)dest, (void *)source, sizeof(PREFIX3(stream)));

-    ds = ZALLOC_DEFLATE_STATE(dest);
-    if (ds == NULL)
+    deflate_allocs *alloc_bufs = alloc_deflate(dest, ss->w_bits, ss->lit_bufsize);
+    if (alloc_bufs == NULL)
        return Z_MEM_ERROR;
+
+    ds = alloc_bufs->state;
+
    dest->state = (struct internal_state *) ds;
-    ZCOPY_DEFLATE_STATE(ds, ss);
+    memcpy(ds, ss, sizeof(deflate_state));
    ds->strm = dest;

-#ifdef X86_PCLMULQDQ_CRC
-    window_padding = 8;
-#endif
-
-    ds->window = (unsigned char *) ZALLOC_WINDOW(dest, ds->w_size + window_padding, 2*sizeof(unsigned char));
-    ds->prev   = (Pos *)  ZALLOC(dest, ds->w_size, sizeof(Pos));
-    ds->head   = (Pos *)  ZALLOC(dest, HASH_SIZE, sizeof(Pos));
-    ds->pending_buf = (unsigned char *) ZALLOC(dest, ds->lit_bufsize, 4);
+    ds->alloc_bufs = alloc_bufs;
+    ds->window = alloc_bufs->window;
+    ds->prev = alloc_bufs->prev;
+    ds->head = alloc_bufs->head;
+    ds->pending_buf = alloc_bufs->pending_buf;

    if (ds->window == NULL || ds->prev == NULL || ds->head == NULL || ds->pending_buf == NULL) {
        PREFIX(deflateEnd)(dest);
        return Z_MEM_ERROR;
    }

-    memcpy(ds->window, ss->window, ds->w_size * 2 * sizeof(unsigned char));
+    memcpy(ds->window, ss->window, DEFLATE_ADJUST_WINDOW_SIZE(ds->w_size * 2 * sizeof(unsigned char)));
    memcpy((void *)ds->prev, (void *)ss->prev, ds->w_size * sizeof(Pos));
    memcpy((void *)ds->head, (void *)ss->head, HASH_SIZE * sizeof(Pos));
-    memcpy(ds->pending_buf, ss->pending_buf, ds->pending_buf_size);
+    memcpy(ds->pending_buf, ss->pending_buf, ds->lit_bufsize * LIT_BUFS);

    ds->pending_out = ds->pending_buf + (ss->pending_out - ss->pending_buf);
+#ifdef LIT_MEM
+    ds->d_buf = (uint16_t *)(ds->pending_buf + (ds->lit_bufsize << 1));
+    ds->l_buf = ds->pending_buf + (ds->lit_bufsize << 2);
+#else
    ds->sym_buf = ds->pending_buf + ds->lit_bufsize;
+#endif

    ds->l_desc.dyn_tree = ds->dyn_ltree;
    ds->d_desc.dyn_tree = ds->dyn_dtree;
@ -1095,10 +1198,10 @@ Z_INTERNAL unsigned PREFIX(read_buf)(PREFIX3(stream) *strm, unsigned char *buf,
        memcpy(buf, strm->next_in, len);
 #ifdef GZIP
    } else if (strm->state->wrap == 2) {
-        functable.crc32_fold_copy(&strm->state->crc_fold, buf, strm->next_in, len);
+        FUNCTABLE_CALL(crc32_fold_copy)(&strm->state->crc_fold, buf, strm->next_in, len);
 #endif
    } else if (strm->state->wrap == 1) {
-        strm->adler = functable.adler32_fold_copy(strm->adler, buf, strm->next_in, len);
+        strm->adler = FUNCTABLE_CALL(adler32_fold_copy)(strm->adler, buf, strm->next_in, len);
    } else {
        memcpy(buf, strm->next_in, len);
    }
@ -1125,9 +1228,9 @@ static void lm_set_level(deflate_state *s, int level) {
        s->insert_string = &insert_string_roll;
        s->quick_insert_string = &quick_insert_string_roll;
    } else {
-        s->update_hash = functable.update_hash;
-        s->insert_string = functable.insert_string;
-        s->quick_insert_string = functable.quick_insert_string;
+        s->update_hash = update_hash;
+        s->insert_string = insert_string;
+        s->quick_insert_string = quick_insert_string;
    }

    s->level = level;
@ -1191,7 +1294,7 @@ void Z_INTERNAL PREFIX(fill_window)(deflate_state *s) {
            s->block_start -= (int)wsize;
            if (s->insert > s->strstart)
                s->insert = s->strstart;
-            functable.slide_hash(s);
+            FUNCTABLE_CALL(slide_hash)(s);
            more += wsize;
        }
        if (s->strm->avail_in == 0)
@ -1217,7 +1320,7 @@ void Z_INTERNAL PREFIX(fill_window)(deflate_state *s) {
        if (s->lookahead + s->insert >= STD_MIN_MATCH) {
            unsigned int str = s->strstart - s->insert;
            if (UNLIKELY(s->max_chain_length > 1024)) {
-                s->ins_h = s->update_hash(s, s->window[str], s->window[str+1]);
+                s->ins_h = s->update_hash(s->window[str], s->window[str+1]);
            } else if (str >= 1) {
                s->quick_insert_string(s, str + 2 - STD_MIN_MATCH);
            }
--- a/3rdparty/zlib-ng/deflate.h
+++ b/3rdparty/zlib-ng/deflate.h
@ -12,8 +12,12 @@

 #include "zutil.h"
 #include "zendian.h"
-#include "adler32_fold.h"
-#include "crc32_fold.h"
+#include "crc32.h"
+
+#ifdef S390_DFLTCC_DEFLATE
+#  include "arch/s390/dfltcc_common.h"
+#  define HAVE_ARCH_DEFLATE_STATE
+#endif

 /* define NO_GZIP when compiling if you want to disable gzip header and
   trailer creation by deflate().  NO_GZIP would be used to avoid linking in
@ -23,6 +27,12 @@
 #  define GZIP
 #endif

+/* define LIT_MEM to slightly increase the speed of deflate (order 1% to 2%) at
+   the cost of a larger memory footprint */
+#ifndef NO_LIT_MEM
+#  define LIT_MEM
+#endif
+
 /* ===========================================================================
 * Internal compression state.
 */
@ -108,11 +118,30 @@ typedef uint16_t Pos;
 /* Type definitions for hash callbacks */
 typedef struct internal_state deflate_state;

-typedef uint32_t (* update_hash_cb)        (deflate_state *const s, uint32_t h, uint32_t val);
+typedef uint32_t (* update_hash_cb)        (uint32_t h, uint32_t val);
 typedef void     (* insert_string_cb)      (deflate_state *const s, uint32_t str, uint32_t count);
 typedef Pos      (* quick_insert_string_cb)(deflate_state *const s, uint32_t str);

-struct internal_state {
+uint32_t update_hash             (uint32_t h, uint32_t val);
+void     insert_string           (deflate_state *const s, uint32_t str, uint32_t count);
+Pos      quick_insert_string     (deflate_state *const s, uint32_t str);
+
+uint32_t update_hash_roll        (uint32_t h, uint32_t val);
+void     insert_string_roll      (deflate_state *const s, uint32_t str, uint32_t count);
+Pos      quick_insert_string_roll(deflate_state *const s, uint32_t str);
+
+/* Struct for memory allocation handling.
+ * Filled in by alloc_deflate(): describes one zalloc'ed buffer and the
+ * sub-buffers carved out of it. The record itself is stored inside that buffer. */
+typedef struct deflate_allocs_s {
+    char            *buf_start;   /* pointer exactly as returned by zalloc; passed to zfree */
+    free_func        zfree;       /* copy of strm->zfree taken at allocation time */
+    deflate_state   *state;       /* deflate state placed inside the buffer */
+    unsigned char   *window;      /* window sub-buffer */
+    unsigned char   *pending_buf; /* pending output sub-buffer */
+    Pos             *prev;        /* prev table sub-buffer */
+    Pos             *head;        /* head table sub-buffer */
+} deflate_allocs;
+
+struct ALIGNED_(64) internal_state {
    PREFIX3(stream)      *strm;            /* pointer back to this zlib stream */
    unsigned char        *pending_buf;     /* output still pending */
    unsigned char        *pending_out;     /* next pending byte to output to the stream */
@ -260,8 +289,16 @@ struct internal_state {
     *   - I can't count above 4
     */

+#ifdef LIT_MEM
+#   define LIT_BUFS 5
+    uint16_t *d_buf;              /* buffer for distances */
+    unsigned char *l_buf;         /* buffer for literals/lengths */
+#else
+#   define LIT_BUFS 4
    unsigned char *sym_buf;       /* buffer for distances and literals/lengths */
-    unsigned int sym_next;        /* running index in sym_buf */
+#endif
+
+    unsigned int sym_next;        /* running index in symbol buffer */
    unsigned int sym_end;         /* symbol table full when sym_next reaches this */

    unsigned long opt_len;        /* bit length of current block with optimal trees */
@ -273,8 +310,11 @@ struct internal_state {
    unsigned long compressed_len; /* total bit length of compressed file mod 2^32 */
    unsigned long bits_sent;      /* bit length of compressed data sent mod 2^32 */

-    /* Reserved for future use and alignment purposes */
-    char *reserved_p;
+    deflate_allocs *alloc_bufs;
+
+#ifdef HAVE_ARCH_DEFLATE_STATE
+    arch_deflate_state arch;      /* architecture-specific extensions */
+#endif

    uint64_t bi_buf;
    /* Output buffer. bits are inserted starting at the bottom (least significant bits). */
@ -284,7 +324,7 @@ struct internal_state {

    /* Reserved for future use and alignment purposes */
    int32_t reserved[11];
-} ALIGNED_(8);
+};

 typedef enum {
    need_more,      /* block not completed, need more input or more output */
--- a/3rdparty/zlib-ng/deflate_fast.c
+++ b/3rdparty/zlib-ng/deflate_fast.c
@ -1,6 +1,6 @@
 /* deflate_fast.c -- compress data using the fast strategy of deflation algorithm
 *
- * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
+ * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

@ -41,7 +41,7 @@ Z_INTERNAL block_state deflate_fast(deflate_state *s, int flush) {
         * dictionary, and set hash_head to the head of the hash chain:
         */
        if (s->lookahead >= WANT_MIN_MATCH) {
-            hash_head = functable.quick_insert_string(s, s->strstart);
+            hash_head = quick_insert_string(s, s->strstart);
            dist = (int64_t)s->strstart - hash_head;

            /* Find the longest match, discarding those <= prev_length.
@ -52,7 +52,7 @@ Z_INTERNAL block_state deflate_fast(deflate_state *s, int flush) {
                 * of window index 0 (in particular we have to avoid a match
                 * of the string with itself at the start of the input file).
                 */
-                match_len = functable.longest_match(s, hash_head);
+                match_len = FUNCTABLE_CALL(longest_match)(s, hash_head);
                /* longest_match() sets match_start */
            }
        }
@ -71,11 +71,11 @@ Z_INTERNAL block_state deflate_fast(deflate_state *s, int flush) {
                match_len--; /* string at strstart already in table */
                s->strstart++;

-                functable.insert_string(s, s->strstart, match_len);
+                insert_string(s, s->strstart, match_len);
                s->strstart += match_len;
            } else {
                s->strstart += match_len;
-                functable.quick_insert_string(s, s->strstart + 2 - STD_MIN_MATCH);
+                quick_insert_string(s, s->strstart + 2 - STD_MIN_MATCH);

                /* If lookahead < STD_MIN_MATCH, ins_h is garbage, but it does not
                 * matter since it will be recomputed at next deflate call.
--- a/3rdparty/zlib-ng/deflate_huff.c
+++ b/3rdparty/zlib-ng/deflate_huff.c
@ -1,6 +1,6 @@
 /* deflate_huff.c -- compress data using huffman encoding only strategy
 *
- * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
+ * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

--- a/3rdparty/zlib-ng/deflate_medium.c
+++ b/3rdparty/zlib-ng/deflate_medium.c
@ -45,16 +45,18 @@ static void insert_match(deflate_state *s, struct match match) {
    if (UNLIKELY(s->lookahead <= (unsigned int)(match.match_length + WANT_MIN_MATCH)))
        return;

+    /* string at strstart already in table */
+    match.strstart++;
+    match.match_length--;
+
    /* matches that are not long enough we need to emit as literals */
-    if (LIKELY(match.match_length < WANT_MIN_MATCH)) {
-        match.strstart++;
-        match.match_length--;
+    if (LIKELY(match.match_length < WANT_MIN_MATCH - 1)) {
        if (UNLIKELY(match.match_length > 0)) {
            if (match.strstart >= match.orgstart) {
                if (match.strstart + match.match_length - 1 >= match.orgstart) {
-                    functable.insert_string(s, match.strstart, match.match_length);
+                    insert_string(s, match.strstart, match.match_length);
                } else {
-                    functable.insert_string(s, match.strstart, match.orgstart - match.strstart + 1);
+                    insert_string(s, match.strstart, match.orgstart - match.strstart + 1);
                }
                match.strstart += match.match_length;
                match.match_length = 0;
@ -63,35 +65,18 @@ static void insert_match(deflate_state *s, struct match match) {
        return;
    }

-    /* Insert new strings in the hash table only if the match length
-     * is not too large. This saves time but degrades compression.
-     */
-    if (match.match_length <= 16 * s->max_insert_length && s->lookahead >= WANT_MIN_MATCH) {
-        match.match_length--; /* string at strstart already in table */
-        match.strstart++;
-
-        if (LIKELY(match.strstart >= match.orgstart)) {
-            if (LIKELY(match.strstart + match.match_length - 1 >= match.orgstart)) {
-                functable.insert_string(s, match.strstart, match.match_length);
-            } else {
-                functable.insert_string(s, match.strstart, match.orgstart - match.strstart + 1);
-            }
-        } else if (match.orgstart < match.strstart + match.match_length) {
-            functable.insert_string(s, match.orgstart, match.strstart + match.match_length - match.orgstart);
+    /* Insert into hash table. */
+    if (LIKELY(match.strstart >= match.orgstart)) {
+        if (LIKELY(match.strstart + match.match_length - 1 >= match.orgstart)) {
+            insert_string(s, match.strstart, match.match_length);
+        } else {
+            insert_string(s, match.strstart, match.orgstart - match.strstart + 1);
        }
-        match.strstart += match.match_length;
-        match.match_length = 0;
-    } else {
-        match.strstart += match.match_length;
-        match.match_length = 0;
-
-        if (match.strstart >= (STD_MIN_MATCH - 2))
-            functable.quick_insert_string(s, match.strstart + 2 - STD_MIN_MATCH);
-
-        /* If lookahead < WANT_MIN_MATCH, ins_h is garbage, but it does not
-         * matter since it will be recomputed at next deflate call.
-         */
+    } else if (match.orgstart < match.strstart + match.match_length) {
+        insert_string(s, match.orgstart, match.strstart + match.match_length - match.orgstart);
    }
+    match.strstart += match.match_length;
+    match.match_length = 0;
 }

 static void fizzle_matches(deflate_state *s, struct match *current, struct match *next) {
@ -199,7 +184,7 @@ Z_INTERNAL block_state deflate_medium(deflate_state *s, int flush) {
        } else {
            hash_head = 0;
            if (s->lookahead >= WANT_MIN_MATCH) {
-                hash_head = functable.quick_insert_string(s, s->strstart);
+                hash_head = quick_insert_string(s, s->strstart);
            }

            current_match.strstart = (uint16_t)s->strstart;
@ -215,7 +200,7 @@ Z_INTERNAL block_state deflate_medium(deflate_state *s, int flush) {
                 * of window index 0 (in particular we have to avoid a match
                 * of the string with itself at the start of the input file).
                 */
-                current_match.match_length = (uint16_t)functable.longest_match(s, hash_head);
+                current_match.match_length = (uint16_t)FUNCTABLE_CALL(longest_match)(s, hash_head);
                current_match.match_start = (uint16_t)s->match_start;
                if (UNLIKELY(current_match.match_length < WANT_MIN_MATCH))
                    current_match.match_length = 1;
@ -235,7 +220,7 @@ Z_INTERNAL block_state deflate_medium(deflate_state *s, int flush) {
        /* now, look ahead one */
        if (LIKELY(!early_exit && s->lookahead > MIN_LOOKAHEAD && (uint32_t)(current_match.strstart + current_match.match_length) < (s->window_size - MIN_LOOKAHEAD))) {
            s->strstart = current_match.strstart + current_match.match_length;
-            hash_head = functable.quick_insert_string(s, s->strstart);
+            hash_head = quick_insert_string(s, s->strstart);

            next_match.strstart = (uint16_t)s->strstart;
            next_match.orgstart = next_match.strstart;
@ -250,7 +235,7 @@ Z_INTERNAL block_state deflate_medium(deflate_state *s, int flush) {
                 * of window index 0 (in particular we have to avoid a match
                 * of the string with itself at the start of the input file).
                 */
-                next_match.match_length = (uint16_t)functable.longest_match(s, hash_head);
+                next_match.match_length = (uint16_t)FUNCTABLE_CALL(longest_match)(s, hash_head);
                next_match.match_start = (uint16_t)s->match_start;
                if (UNLIKELY(next_match.match_start >= next_match.strstart)) {
                    /* this can happen due to some restarts */
--- a/3rdparty/zlib-ng/deflate_p.h
+++ b/3rdparty/zlib-ng/deflate_p.h
@ -1,7 +1,7 @@
 /* deflate_p.h -- Private inline functions and macros shared with more than
 *                one deflate method
 *
- * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
+ * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 */
@ -60,27 +60,37 @@ extern const unsigned char Z_INTERNAL zng_dist_code[];

 static inline int zng_tr_tally_lit(deflate_state *s, unsigned char c) {
    /* c is the unmatched char */
+#ifdef LIT_MEM
+    s->d_buf[s->sym_next] = 0;
+    s->l_buf[s->sym_next++] = c;
+#else
    s->sym_buf[s->sym_next++] = 0;
    s->sym_buf[s->sym_next++] = 0;
    s->sym_buf[s->sym_next++] = c;
+#endif
    s->dyn_ltree[c].Freq++;
    Tracevv((stderr, "%c", c));
    Assert(c <= (STD_MAX_MATCH-STD_MIN_MATCH), "zng_tr_tally: bad literal");
    return (s->sym_next == s->sym_end);
 }

-static inline int zng_tr_tally_dist(deflate_state *s, uint32_t dist, uint32_t len) {
+static inline int zng_tr_tally_dist(deflate_state* s, uint32_t dist, uint32_t len) {
    /* dist: distance of matched string */
    /* len: match length-STD_MIN_MATCH */
+#ifdef LIT_MEM
+    s->d_buf[s->sym_next] = dist;
+    s->l_buf[s->sym_next++] = len;
+#else
    s->sym_buf[s->sym_next++] = (uint8_t)(dist);
    s->sym_buf[s->sym_next++] = (uint8_t)(dist >> 8);
    s->sym_buf[s->sym_next++] = (uint8_t)len;
+#endif
    s->matches++;
    dist--;
    Assert(dist < MAX_DIST(s) && (uint16_t)d_code(dist) < (uint16_t)D_CODES,
        "zng_tr_tally: bad match");

-    s->dyn_ltree[zng_length_code[len]+LITERALS+1].Freq++;
+    s->dyn_ltree[zng_length_code[len] + LITERALS + 1].Freq++;
    s->dyn_dtree[d_code(dist)].Freq++;
    return (s->sym_next == s->sym_end);
 }
--- a/3rdparty/zlib-ng/deflate_quick.c
+++ b/3rdparty/zlib-ng/deflate_quick.c
@ -86,7 +86,7 @@ Z_INTERNAL block_state deflate_quick(deflate_state *s, int flush) {
        }

        if (LIKELY(s->lookahead >= WANT_MIN_MATCH)) {
-            hash_head = functable.quick_insert_string(s, s->strstart);
+            hash_head = quick_insert_string(s, s->strstart);
            dist = (int64_t)s->strstart - hash_head;

            if (dist <= MAX_DIST(s) && dist > 0) {
@ -94,7 +94,7 @@ Z_INTERNAL block_state deflate_quick(deflate_state *s, int flush) {
                const uint8_t *match_start = s->window + hash_head;

                if (zng_memcmp_2(str_start, match_start) == 0) {
-                    match_len = functable.compare256(str_start+2, match_start+2) + 2;
+                    match_len = FUNCTABLE_CALL(compare256)(str_start+2, match_start+2) + 2;

                    if (match_len >= WANT_MIN_MATCH) {
                        if (UNLIKELY(match_len > s->lookahead))
--- a/3rdparty/zlib-ng/deflate_rle.c
+++ b/3rdparty/zlib-ng/deflate_rle.c
@ -1,6 +1,6 @@
 /* deflate_rle.c -- compress data using RLE strategy of deflation algorithm
 *
- * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
+ * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

--- a/3rdparty/zlib-ng/deflate_slow.c
+++ b/3rdparty/zlib-ng/deflate_slow.c
@ -1,6 +1,6 @@
 /* deflate_slow.c -- compress data using the slow strategy of deflation algorithm
 *
- * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
+ * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

@ -19,12 +19,12 @@ Z_INTERNAL block_state deflate_slow(deflate_state *s, int flush) {
    int bflush;              /* set if current block must be flushed */
    int64_t dist;
    uint32_t match_len;
-    match_func *longest_match;
+    match_func longest_match;

    if (s->max_chain_length <= 1024)
-        longest_match = &functable.longest_match;
+        longest_match = FUNCTABLE_FPTR(longest_match);
    else
-        longest_match = &functable.longest_match_slow;
+        longest_match = FUNCTABLE_FPTR(longest_match_slow);

    /* Process the input block. */
    for (;;) {
@ -61,7 +61,7 @@ Z_INTERNAL block_state deflate_slow(deflate_state *s, int flush) {
             * of window index 0 (in particular we have to avoid a match
             * of the string with itself at the start of the input file).
             */
-            match_len = (*longest_match)(s, hash_head);
+            match_len = longest_match(s, hash_head);
            /* longest_match() sets match_start */

            if (match_len <= 5 && (s->strategy == Z_FILTERED)) {
@ -129,7 +129,7 @@ Z_INTERNAL block_state deflate_slow(deflate_state *s, int flush) {
    }
    Assert(flush != Z_NO_FLUSH, "no flush?");
    if (UNLIKELY(s->match_available)) {
-        (void) zng_tr_tally_lit(s, s->window[s->strstart-1]);
+        Z_UNUSED(zng_tr_tally_lit(s, s->window[s->strstart-1]));
        s->match_available = 0;
    }
    s->insert = s->strstart < (STD_MIN_MATCH - 1) ? s->strstart : (STD_MIN_MATCH - 1);
--- a/3rdparty/zlib-ng/deflate_stored.c
+++ b/3rdparty/zlib-ng/deflate_stored.c
@ -1,6 +1,6 @@
 /* deflate_stored.c -- store data without compression using deflation algorithm
 *
- * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
+ * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

@ -22,7 +22,7 @@
 *
 * deflate_stored() is written to minimize the number of times an input byte is
 * copied. It is most efficient with large input and output buffers, which
- * maximizes the opportunites to have a single copy from next_in to next_out.
+ * maximizes the opportunities to have a single copy from next_in to next_out.
 */
 Z_INTERNAL block_state deflate_stored(deflate_state *s, int flush) {
    /* Smallest worthy block size when not flushing or finishing. By default
--- a/3rdparty/zlib-ng/fallback_builtins.h
+++ b/3rdparty/zlib-ng/fallback_builtins.h
@ -5,9 +5,6 @@
 #if defined(_M_IX86) || defined(_M_AMD64) || defined(_M_IA64) ||  defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC)

 #include <intrin.h>
-#ifdef X86_FEATURES
-#  include "arch/x86/x86_features.h"
-#endif

 /* This is not a general purpose replacement for __builtin_ctz. The function expects that value is != 0.
 * Because of that assumption trailing_zero is not initialized and the return value is not checked.
--- a/3rdparty/zlib-ng/functable.c
+++ b/3rdparty/zlib-ng/functable.c
@ -2,14 +2,12 @@
 * Copyright (C) 2017 Hans Kristian Rosbach
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
+#ifndef DISABLE_RUNTIME_CPU_DETECTION

 #include "zbuild.h"
-#include "zendian.h"
-#include "crc32_braid_p.h"
-#include "deflate.h"
-#include "deflate_p.h"
 #include "functable.h"
 #include "cpu_features.h"
+#include "arch_functions.h"

 #if defined(_MSC_VER)
 #  include <intrin.h>
@ -61,31 +59,10 @@ static void init_functable(void) {
    ft.crc32_fold_final = &crc32_fold_final_c;
    ft.crc32_fold_reset = &crc32_fold_reset_c;
    ft.inflate_fast = &inflate_fast_c;
-    ft.insert_string = &insert_string_c;
-    ft.quick_insert_string = &quick_insert_string_c;
    ft.slide_hash = &slide_hash_c;
-    ft.update_hash = &update_hash_c;
-
-#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
-#  if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
-    ft.longest_match = &longest_match_unaligned_64;
-    ft.longest_match_slow = &longest_match_slow_unaligned_64;
-    ft.compare256 = &compare256_unaligned_64;
-#  elif defined(HAVE_BUILTIN_CTZ)
-    ft.longest_match = &longest_match_unaligned_32;
-    ft.longest_match_slow = &longest_match_slow_unaligned_32;
-    ft.compare256 = &compare256_unaligned_32;
-#  else
-    ft.longest_match = &longest_match_unaligned_16;
-    ft.longest_match_slow = &longest_match_slow_unaligned_16;
-    ft.compare256 = &compare256_unaligned_16;
-#  endif
-#else
-    ft.longest_match = &longest_match_c;
-    ft.longest_match_slow = &longest_match_slow_c;
-    ft.compare256 = &compare256_c;
-#endif
-
+    ft.longest_match = &longest_match_generic;
+    ft.longest_match_slow = &longest_match_slow_generic;
+    ft.compare256 = &compare256_generic;

    // Select arch-optimized functions

@ -110,19 +87,14 @@ static void init_functable(void) {
 #ifdef X86_SSSE3
    if (cf.x86.has_ssse3) {
        ft.adler32 = &adler32_ssse3;
-#  ifdef X86_SSE2
        ft.chunkmemset_safe = &chunkmemset_safe_ssse3;
        ft.inflate_fast = &inflate_fast_ssse3;
-#  endif
    }
 #endif
    // X86 - SSE4.2
 #ifdef X86_SSE42
    if (cf.x86.has_sse42) {
        ft.adler32_fold_copy = &adler32_fold_copy_sse42;
-        ft.insert_string = &insert_string_sse42;
-        ft.quick_insert_string = &quick_insert_string_sse42;
-        ft.update_hash = &update_hash_sse42;
    }
 #endif
    // X86 - PCLMUL
@ -151,8 +123,9 @@ static void init_functable(void) {
 #  endif
    }
 #endif
+    // X86 - AVX512 (F,DQ,BW,VL)
 #ifdef X86_AVX512
-    if (cf.x86.has_avx512) {
+    if (cf.x86.has_avx512_common) {
        ft.adler32 = &adler32_avx512;
        ft.adler32_fold_copy = &adler32_fold_copy_avx512;
    }
@ -164,8 +137,8 @@ static void init_functable(void) {
    }
 #endif
    // X86 - VPCLMULQDQ
-#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
-    if (cf.x86.has_pclmulqdq && cf.x86.has_avx512 && cf.x86.has_vpclmulqdq) {
+#ifdef X86_VPCLMULQDQ_CRC
+    if (cf.x86.has_pclmulqdq && cf.x86.has_avx512_common && cf.x86.has_vpclmulqdq) {
        ft.crc32 = &crc32_vpclmulqdq;
        ft.crc32_fold = &crc32_fold_vpclmulqdq;
        ft.crc32_fold_copy = &crc32_fold_vpclmulqdq_copy;
@ -206,9 +179,6 @@ static void init_functable(void) {
 #ifdef ARM_ACLE
    if (cf.arm.has_crc32) {
        ft.crc32 = &crc32_acle;
-        ft.insert_string = &insert_string_acle;
-        ft.quick_insert_string = &quick_insert_string_acle;
-        ft.update_hash = &update_hash_acle;
    }
 #endif

@ -279,12 +249,9 @@ static void init_functable(void) {
    FUNCTABLE_ASSIGN(ft, crc32_fold_final);
    FUNCTABLE_ASSIGN(ft, crc32_fold_reset);
    FUNCTABLE_ASSIGN(ft, inflate_fast);
-    FUNCTABLE_ASSIGN(ft, insert_string);
    FUNCTABLE_ASSIGN(ft, longest_match);
    FUNCTABLE_ASSIGN(ft, longest_match_slow);
-    FUNCTABLE_ASSIGN(ft, quick_insert_string);
    FUNCTABLE_ASSIGN(ft, slide_hash);
-    FUNCTABLE_ASSIGN(ft, update_hash);

    // Memory barrier for weak memory order CPUs
    FUNCTABLE_BARRIER();
@ -350,11 +317,6 @@ static void inflate_fast_stub(PREFIX3(stream) *strm, uint32_t start) {
    functable.inflate_fast(strm, start);
 }

-static void insert_string_stub(deflate_state* const s, uint32_t str, uint32_t count) {
-    init_functable();
-    functable.insert_string(s, str, count);
-}
-
 static uint32_t longest_match_stub(deflate_state* const s, Pos cur_match) {
    init_functable();
    return functable.longest_match(s, cur_match);
@ -365,21 +327,11 @@ static uint32_t longest_match_slow_stub(deflate_state* const s, Pos cur_match) {
    return functable.longest_match_slow(s, cur_match);
 }

-static Pos quick_insert_string_stub(deflate_state* const s, const uint32_t str) {
-    init_functable();
-    return functable.quick_insert_string(s, str);
-}
-
 static void slide_hash_stub(deflate_state* s) {
    init_functable();
    functable.slide_hash(s);
 }

-static uint32_t update_hash_stub(deflate_state* const s, uint32_t h, uint32_t val) {
-    init_functable();
-    return functable.update_hash(s, h, val);
-}
-
 /* functable init */
 Z_INTERNAL struct functable_s functable = {
    force_init_stub,
@ -394,10 +346,9 @@ Z_INTERNAL struct functable_s functable = {
    crc32_fold_final_stub,
    crc32_fold_reset_stub,
    inflate_fast_stub,
-    insert_string_stub,
    longest_match_stub,
    longest_match_slow_stub,
-    quick_insert_string_stub,
    slide_hash_stub,
-    update_hash_stub
 };
+
+#endif
--- a/3rdparty/zlib-ng/functable.h
+++ b/3rdparty/zlib-ng/functable.h
@ -7,14 +7,21 @@
 #define FUNCTABLE_H_

 #include "deflate.h"
-#include "crc32_fold.h"
-#include "adler32_fold.h"
+#include "crc32.h"
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+
+#  include "arch_functions.h"
+
+/* When compiling with native instructions, it is not necessary to use the functable.
+ * Instead, the native_ macros name the best available variant of each arch-specific
+ * function for the current platform.
+ */
+#  define FUNCTABLE_INIT ((void)0)
+#  define FUNCTABLE_CALL(name) native_ ## name
+#  define FUNCTABLE_FPTR(name) &native_ ## name

-#ifdef ZLIB_COMPAT
-typedef struct z_stream_s z_stream;
 #else
-typedef struct zng_stream_s zng_stream;
-#endif

 struct functable_s {
    void     (* force_init)         (void);
@ -29,14 +36,20 @@ struct functable_s {
    uint32_t (* crc32_fold_final)   (struct crc32_fold_s *crc);
    uint32_t (* crc32_fold_reset)   (struct crc32_fold_s *crc);
    void     (* inflate_fast)       (PREFIX3(stream) *strm, uint32_t start);
-    void     (* insert_string)      (deflate_state *const s, uint32_t str, uint32_t count);
    uint32_t (* longest_match)      (deflate_state *const s, Pos cur_match);
    uint32_t (* longest_match_slow) (deflate_state *const s, Pos cur_match);
-    Pos      (* quick_insert_string)(deflate_state *const s, uint32_t str);
    void     (* slide_hash)         (deflate_state *s);
-    uint32_t (* update_hash)        (deflate_state *const s, uint32_t h, uint32_t val);
 };

 Z_INTERNAL extern struct functable_s functable;

+
+/* Explicitly indicate functions are conditionally dispatched.
+ */
+#  define FUNCTABLE_INIT functable.force_init()
+#  define FUNCTABLE_CALL(name) functable.name
+#  define FUNCTABLE_FPTR(name) functable.name
+
+#endif
+
 #endif
--- a/Show More
+++ b/Show More