Add zlib-ng as an alternative zlib implementation

Zlib-ng is a zlib replacement with optimizations for "next generation" systems. Its optimizations can benefit the decode and encode speed of image libraries that rely on zlib, such as libpng. In our tests, using the zlib-ng and libpng combination on an x86_64 machine with AVX2, the time spent in `imdecode` and `imencode` drops by approximately 20%. This patch enables zlib-ng's optimizations when `CV_DISABLE_OPTIMIZATION` is OFF. Since zlib-ng can dispatch intrinsics at runtime, the porting work is much easier.
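A rough way to reproduce the comparison is to time `imencode`/`imdecode` on a PNG with two builds, one configured with `-DWITH_ZLIB_NG=ON` and one without. The sketch below is illustrative only: the file name and iteration count are placeholders, and the measured gain will vary with image content, compression level, and CPU.

```cpp
// Hedged benchmark sketch: times PNG encode/decode so a zlib build can be
// compared against a zlib-ng build. "sample.png" and `iters` are placeholders.
#include <chrono>
#include <cstdio>
#include <vector>
#include <opencv2/imgcodecs.hpp>

int main() {
    cv::Mat img = cv::imread("sample.png", cv::IMREAD_COLOR);
    if (img.empty()) { std::puts("failed to read sample.png"); return 1; }

    const int iters = 50;
    std::vector<uchar> buf;

    auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < iters; ++i) {
        buf.clear();
        cv::imencode(".png", img, buf);                         // deflate path (zlib or zlib-ng)
    }
    auto t1 = std::chrono::steady_clock::now();
    for (int i = 0; i < iters; ++i) {
        cv::Mat decoded = cv::imdecode(buf, cv::IMREAD_COLOR);  // inflate path
    }
    auto t2 = std::chrono::steady_clock::now();

    using ms = std::chrono::duration<double, std::milli>;
    std::printf("imencode: %.2f ms/iter, imdecode: %.2f ms/iter\n",
                ms(t1 - t0).count() / iters, ms(t2 - t1).count() / iters);
    return 0;
}
```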

Related discussion: https://github.com/opencv/opencv/issues/22573
pull/24782/head
Letu Ren 11 months ago
parent e80b7940ef
commit 0de26fd78e
Files changed (lines added):
  1. 3rdparty/readme.txt (+8)
  2. 3rdparty/zlib-ng/CMakeLists.txt (+796)
  3. 3rdparty/zlib-ng/LICENSE.md (+19)
  4. 3rdparty/zlib-ng/README.md (+229)
  5. 3rdparty/zlib-ng/adler32.c (+115)
  6. 3rdparty/zlib-ng/adler32_fold.c (+16)
  7. 3rdparty/zlib-ng/adler32_fold.h (+11)
  8. 3rdparty/zlib-ng/adler32_p.h (+70)
  9. 3rdparty/zlib-ng/arch/.gitignore (+2)
  10. 3rdparty/zlib-ng/arch/arm/Makefile.in (+85)
  11. 3rdparty/zlib-ng/arch/arm/acle_intrins.h (+35)
  12. 3rdparty/zlib-ng/arch/arm/adler32_neon.c (+215)
  13. 3rdparty/zlib-ng/arch/arm/arm_features.c (+100)
  14. 3rdparty/zlib-ng/arch/arm/arm_features.h (+16)
  15. 3rdparty/zlib-ng/arch/arm/chunkset_neon.c (+99)
  16. 3rdparty/zlib-ng/arch/arm/compare256_neon.c (+59)
  17. 3rdparty/zlib-ng/arch/arm/crc32_acle.c (+78)
  18. 3rdparty/zlib-ng/arch/arm/insert_string_acle.c (+24)
  19. 3rdparty/zlib-ng/arch/arm/neon_intrins.h (+58)
  20. 3rdparty/zlib-ng/arch/arm/slide_hash_armv6.c (+47)
  21. 3rdparty/zlib-ng/arch/arm/slide_hash_neon.c (+46)
  22. 3rdparty/zlib-ng/arch/generic/Makefile.in (+24)
  23. 3rdparty/zlib-ng/arch/generic/chunk_permute_table.h (+53)
  24. 3rdparty/zlib-ng/arch/power/Makefile.in (+93)
  25. 3rdparty/zlib-ng/arch/power/adler32_power8.c (+153)
  26. 3rdparty/zlib-ng/arch/power/adler32_vmx.c (+186)
  27. 3rdparty/zlib-ng/arch/power/chunkset_power8.c (+55)
  28. 3rdparty/zlib-ng/arch/power/compare256_power9.c (+64)
  29. 3rdparty/zlib-ng/arch/power/crc32_constants.h (+1123)
  30. 3rdparty/zlib-ng/arch/power/crc32_power8.c (+589)
  31. 3rdparty/zlib-ng/arch/power/fallback_builtins.h (+31)
  32. 3rdparty/zlib-ng/arch/power/power_features.c (+46)
  33. 3rdparty/zlib-ng/arch/power/power_features.h (+18)
  34. 3rdparty/zlib-ng/arch/power/slide_hash_power8.c (+12)
  35. 3rdparty/zlib-ng/arch/power/slide_hash_vmx.c (+10)
  36. 3rdparty/zlib-ng/arch/power/slide_ppc_tpl.h (+31)
  37. 3rdparty/zlib-ng/arch/riscv/README.md (+45)
  38. 3rdparty/zlib-ng/arch/riscv/adler32_rvv.c (+132)
  39. 3rdparty/zlib-ng/arch/riscv/chunkset_rvv.c (+121)
  40. 3rdparty/zlib-ng/arch/riscv/compare256_rvv.c (+47)
  41. 3rdparty/zlib-ng/arch/riscv/riscv_features.c (+45)
  42. 3rdparty/zlib-ng/arch/riscv/riscv_features.h (+18)
  43. 3rdparty/zlib-ng/arch/riscv/slide_hash_rvv.c (+34)
  44. 3rdparty/zlib-ng/arch/x86/Makefile.in (+147)
  45. 3rdparty/zlib-ng/arch/x86/adler32_avx2.c (+154)
  46. 3rdparty/zlib-ng/arch/x86/adler32_avx2_p.h (+32)
  47. 3rdparty/zlib-ng/arch/x86/adler32_avx512.c (+115)
  48. 3rdparty/zlib-ng/arch/x86/adler32_avx512_p.h (+46)
  49. 3rdparty/zlib-ng/arch/x86/adler32_avx512_vnni.c (+225)
  50. 3rdparty/zlib-ng/arch/x86/adler32_sse42.c (+121)
  51. 3rdparty/zlib-ng/arch/x86/adler32_ssse3.c (+156)
  52. 3rdparty/zlib-ng/arch/x86/adler32_ssse3_p.h (+29)
  53. 3rdparty/zlib-ng/arch/x86/chunkset_avx2.c (+133)
  54. 3rdparty/zlib-ng/arch/x86/chunkset_sse2.c (+56)
  55. 3rdparty/zlib-ng/arch/x86/chunkset_ssse3.c (+101)
  56. 3rdparty/zlib-ng/arch/x86/compare256_avx2.c (+63)
  57. 3rdparty/zlib-ng/arch/x86/compare256_sse2.c (+96)
  58. 3rdparty/zlib-ng/arch/x86/crc32_fold_pclmulqdq_tpl.h (+186)
  59. 3rdparty/zlib-ng/arch/x86/crc32_fold_vpclmulqdq_tpl.h (+107)
  60. 3rdparty/zlib-ng/arch/x86/crc32_pclmulqdq.c (+30)
  61. 3rdparty/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h (+363)
  62. 3rdparty/zlib-ng/arch/x86/crc32_vpclmulqdq.c (+17)
  63. 3rdparty/zlib-ng/arch/x86/insert_string_sse42.c (+24)
  64. 3rdparty/zlib-ng/arch/x86/slide_hash_avx2.c (+39)
  65. 3rdparty/zlib-ng/arch/x86/slide_hash_sse2.c (+62)
  66. 3rdparty/zlib-ng/arch/x86/x86_features.c (+97)
  67. 3rdparty/zlib-ng/arch/x86/x86_features.h (+24)
  68. 3rdparty/zlib-ng/arch/x86/x86_intrins.h (+87)
  69. 3rdparty/zlib-ng/chunkset.c (+42)
  70. 3rdparty/zlib-ng/chunkset_tpl.h (+200)
  71. 3rdparty/zlib-ng/cmake/detect-intrinsics.cmake (+543)
  72. 3rdparty/zlib-ng/cmake/fallback-macros.cmake (+19)
  73. 3rdparty/zlib-ng/compare256.c (+180)
  74. 3rdparty/zlib-ng/compare256_rle.h (+134)
  75. 3rdparty/zlib-ng/compress.c (+98)
  76. 3rdparty/zlib-ng/cpu_features.c (+23)
  77. 3rdparty/zlib-ng/cpu_features.h (+303)
  78. 3rdparty/zlib-ng/crc32_braid.c (+267)
  79. 3rdparty/zlib-ng/crc32_braid_comb.c (+57)
  80. 3rdparty/zlib-ng/crc32_braid_comb_p.h (+42)
  81. 3rdparty/zlib-ng/crc32_braid_p.h (+50)
  82. 3rdparty/zlib-ng/crc32_braid_tbl.h (+9446)
  83. 3rdparty/zlib-ng/crc32_fold.c (+33)
  84. 3rdparty/zlib-ng/crc32_fold.h (+21)
  85. 3rdparty/zlib-ng/deflate.c (+1410)
  86. 3rdparty/zlib-ng/deflate.h (+408)
  87. 3rdparty/zlib-ng/deflate_fast.c (+102)
  88. 3rdparty/zlib-ng/deflate_huff.c (+45)
  89. 3rdparty/zlib-ng/deflate_medium.c (+293)
  90. 3rdparty/zlib-ng/deflate_p.h (+116)
  91. 3rdparty/zlib-ng/deflate_quick.c (+129)
  92. 3rdparty/zlib-ng/deflate_rle.c (+85)
  93. 3rdparty/zlib-ng/deflate_slow.c (+143)
  94. 3rdparty/zlib-ng/deflate_stored.c (+186)
  95. 3rdparty/zlib-ng/fallback_builtins.h (+50)
  96. 3rdparty/zlib-ng/functable.c (+403)
  97. 3rdparty/zlib-ng/functable.h (+42)
  98. 3rdparty/zlib-ng/gzguts.h (+144)
  99. 3rdparty/zlib-ng/gzlib.c (+525)
  100. 3rdparty/zlib-ng/gzread.c.in (+606)
Some files were not shown because too many files have changed in this diff.

3rdparty/readme.txt
@@ -49,6 +49,14 @@ zlib General purpose LZ77 compression library
Copyright (C) 1995-2022 Jean-loup Gailly and Mark Adler.
See zlib home page http://www.zlib.net
for details and links to the source code
zlib-ng zlib data compression library for the next generation systems
(C) 1995-2013 Jean-loup Gailly and Mark Adler
See zlib-ng official GitHub repository
https://github.com/zlib-ng/zlib-ng.git
for details and links to source code
WITH_ZLIB_NG CMake option must be ON to use zlib-ng as the zlib implementation.
------------------------------------------------------------------------------------
jasper JasPer is a collection of software
(i.e., a library and application programs) for the coding
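The entry above notes that the `WITH_ZLIB_NG` CMake option must be ON for zlib-ng to be used. A quick way to check which zlib implementation a given OpenCV build actually linked (a small sketch; the exact wording of the line depends on the OpenCV version) is to look at the ZLIB entry in `cv::getBuildInformation()`:

```cpp
// Prints OpenCV's build summary; the ZLIB line reports the bundled zlib
// (or zlib-ng) version that was compiled in.
#include <iostream>
#include <opencv2/core/utility.hpp>

int main() {
    std::cout << cv::getBuildInformation() << std::endl;
    return 0;
}
```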

3rdparty/zlib-ng/CMakeLists.txt
@@ -0,0 +1,796 @@
project(${ZLIB_LIBRARY} LANGUAGES C)
if("c_std_11" IN_LIST CMAKE_C_COMPILE_FEATURES)
set(CMAKE_C_STANDARD 11) # The C standard whose features are requested to build this target
else()
set(CMAKE_C_STANDARD 99)
endif()
set(CMAKE_C_STANDARD_REQUIRED ON) # Boolean describing whether the value of C_STANDARD is a requirement
set(CMAKE_C_EXTENSIONS OFF) # Boolean specifying whether compiler specific extensions are requested
include(CheckTypeSize)
include(CheckSymbolExists)
include(CheckFunctionExists)
include(CheckIncludeFile)
include(CheckCSourceCompiles)
include(CheckCSourceRuns)
include(CheckCCompilerFlag)
include(CMakeDependentOption)
if(X86_64 OR X86)
set(BASEARCH_X86_FOUND TRUE)
endif()
if(AARCH64 OR ARM)
set(BASEARCH_ARM_FOUND TRUE)
endif()
if(PPC64LE OR PPC64)
set(BASEARCH_PPC_FOUND TRUE)
endif()
if(RISCV)
set(BASEARCH_RISCV_FOUND TRUE)
endif()
include(cmake/detect-intrinsics.cmake)
include(cmake/fallback-macros.cmake)
set(ZLIB_SYMBOL_PREFIX "")
if(BASEARCH_X86_FOUND)
set(WITH_AVX2 ON)
set(WITH_AVX512 ON)
set(WITH_AVX512VNNI ON)
set(WITH_SSE2 ON)
set(WITH_SSSE3 ON)
set(WITH_SSE42 ON)
set(WITH_PCLMULQDQ ON)
set(WITH_VPCLMULQDQ ON)
endif()
if(BASEARCH_ARM_FOUND)
set(WITH_ACLE ON)
set(WITH_NEON ON)
if(ARM)
set(WITH_ARMV6 ON)
else()
set(WITH_ARMV6 OFF)
endif()
endif()
if(BASEARCH_PPC_FOUND)
set(WITH_ALTIVEC ON)
set(WITH_POWER8 ON)
set(WITH_POWER9 ON)
endif()
if(BASEARCH_RISCV_FOUND)
set(WITH_RVV ON)
endif()
add_definitions(-DZLIB_COMPAT)
add_definitions(-DWITH_GZFILEOP)
if(CMAKE_C_COMPILER_ID MATCHES "^Intel")
set(WARNFLAGS_DISABLE)
elseif(MSVC)
# Minimum supported MSVC version is 1800 = Visual Studio 12.0/2013
# See also https://cmake.org/cmake/help/latest/variable/MSVC_VERSION.html
if(MSVC_VERSION VERSION_LESS 1800)
message(SEND_ERROR "Unsupported Visual Studio compiler version (requires 2013 or later).")
endif()
# TODO. ICC can be used through MSVC. I'm not sure if we'd ever see that combination
# (who'd use cmake from an IDE...) but checking for ICC before checking for MSVC should
# avoid mistakes.
# /Oi ?
set(WARNFLAGS_DISABLE)
if(BASEARCH_ARM_FOUND)
add_definitions(-D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE)
if(NOT "${ARCH}" MATCHES "aarch64")
set(NEONFLAG "/arch:VFPv4")
endif()
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
set(WARNFLAGS_DISABLE)
# Check whether -fno-lto is available
set(CMAKE_REQUIRED_FLAGS "-fno-lto")
check_c_source_compiles(
"int main() { return 0; }"
FNO_LTO_AVAILABLE FAIL_REGEX "not supported")
set(CMAKE_REQUIRED_FLAGS)
if(FNO_LTO_AVAILABLE)
set(ZNOLTOFLAG "-fno-lto")
endif()
if(BASEARCH_ARM_FOUND)
if(ARM AND NOT CMAKE_C_FLAGS MATCHES "-mfloat-abi")
# Auto-detect support for ARM floating point ABI
check_include_file(features.h HAVE_FEATURES_H)
if(HAVE_FEATURES_H)
set(CMAKE_REQUIRED_FLAGS -mfloat-abi=softfp)
check_c_source_compiles(
"#include <features.h>
int main() { return 0; }"
HAVE_FLOATABI_SOFTFP)
if(HAVE_FLOATABI_SOFTFP)
set(FLOATABI -mfloat-abi=softfp)
else()
set(CMAKE_REQUIRED_FLAGS -mfloat-abi=hard)
check_c_source_compiles(
"#include <features.h>
int main() { return 0; }"
HAVE_FLOATABI_HARD)
if(HAVE_FLOATABI_HARD)
set(FLOATABI -mfloat-abi=hard)
endif()
endif()
set(CMAKE_REQUIRED_FLAGS)
endif()
if(FLOATABI)
message(STATUS "${ZLIB_LIBRARY} ARM floating point arch: ${FLOATABI}")
add_compile_options(${FLOATABI})
else()
message(STATUS "${ZLIB_LIBRARY} ARM floating point arch not auto-detected")
endif()
endif()
endif()
if(FNO_LTO_AVAILABLE)
set(NOLTOFLAG ${ZNOLTOFLAG})
endif()
if(MINGW)
# Add `-Wno-pedantic-ms-format` only if the toolchain supports it
check_c_compiler_flag(-Wno-pedantic-ms-format HAVE_NO_PEDANTIC_MS_FORMAT)
if(HAVE_NO_PEDANTIC_MS_FORMAT)
list(APPEND WARNFLAGS_DISABLE -Wno-pedantic-ms-format)
endif()
endif()
endif()
# Force disable LTO
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF)
# Apply warning compiler flags
add_compile_options(${WARNFLAGS_DISABLE})
# Replace optimization level 3 added by default with level 2
if(NOT MSVC AND NOT CMAKE_C_FLAGS MATCHES "([\\/\\-]O)3")
string(REGEX REPLACE "([\\/\\-]O)3" "\\12"
CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
endif()
#
# Check for standard/system includes
#
check_include_file(arm_acle.h HAVE_ARM_ACLE_H)
if(HAVE_ARM_ACLE_H)
add_definitions(-DHAVE_ARM_ACLE_H)
endif()
check_include_file(sys/auxv.h HAVE_SYS_AUXV_H)
if(HAVE_SYS_AUXV_H)
add_definitions(-DHAVE_SYS_AUXV_H)
endif()
check_include_file(sys/sdt.h HAVE_SYS_SDT_H)
if(HAVE_SYS_SDT_H)
add_definitions(-DHAVE_SYS_SDT_H)
endif()
check_include_file(unistd.h HAVE_UNISTD_H)
#
# Check to see if we have large file support
#
set(CMAKE_REQUIRED_DEFINITIONS -D_LARGEFILE64_SOURCE=1 -D__USE_LARGEFILE64)
check_type_size(off64_t OFF64_T)
if(HAVE_OFF64_T)
add_definitions(-D_LARGEFILE64_SOURCE=1 -D__USE_LARGEFILE64)
else()
check_type_size(_off64_t _OFF64_T)
if(HAVE__OFF64_T)
add_definitions(-D_LARGEFILE64_SOURCE=1 -D__USE_LARGEFILE64)
else()
check_type_size(__off64_t __OFF64_T)
endif()
endif()
set(CMAKE_REQUIRED_DEFINITIONS) # clear variable
#
# Check for fseeko and other optional functions
#
check_function_exists(fseeko HAVE_FSEEKO)
if(NOT HAVE_FSEEKO)
add_definitions(-DNO_FSEEKO)
endif()
check_function_exists(strerror HAVE_STRERROR)
if(NOT HAVE_STRERROR)
add_definitions(-DNO_STRERROR)
endif()
set(CMAKE_REQUIRED_DEFINITIONS -D_POSIX_C_SOURCE=200112L)
check_symbol_exists(posix_memalign stdlib.h HAVE_POSIX_MEMALIGN)
if(HAVE_POSIX_MEMALIGN)
add_definitions(-DHAVE_POSIX_MEMALIGN)
endif()
set(CMAKE_REQUIRED_DEFINITIONS)
set(CMAKE_REQUIRED_DEFINITIONS -D_ISOC11_SOURCE=1)
check_symbol_exists(aligned_alloc stdlib.h HAVE_ALIGNED_ALLOC)
if(HAVE_ALIGNED_ALLOC)
add_definitions(-DHAVE_ALIGNED_ALLOC)
endif()
set(CMAKE_REQUIRED_DEFINITIONS)
#
# Check if we can hide zlib internal symbols that are linked between separate source files using hidden
#
check_c_source_compiles(
"#define Z_INTERNAL __attribute__((visibility (\"hidden\")))
int Z_INTERNAL foo;
int main() {
return 0;
}"
HAVE_ATTRIBUTE_VISIBILITY_HIDDEN FAIL_REGEX "visibility")
if(HAVE_ATTRIBUTE_VISIBILITY_HIDDEN)
add_definitions(-DHAVE_VISIBILITY_HIDDEN)
endif()
#
# Check if we can hide zlib internal symbols that are linked between separate source files using internal
#
check_c_source_compiles(
"#define Z_INTERNAL __attribute__((visibility (\"internal\")))
int Z_INTERNAL foo;
int main() {
return 0;
}"
HAVE_ATTRIBUTE_VISIBILITY_INTERNAL FAIL_REGEX "visibility")
if(HAVE_ATTRIBUTE_VISIBILITY_INTERNAL)
add_definitions(-DHAVE_VISIBILITY_INTERNAL)
endif()
#
# Check for __attribute__((aligned(x))) support in the compiler
#
check_c_source_compiles(
"int main(void) {
__attribute__((aligned(8))) int test = 0;
(void)test;
return 0;
}"
HAVE_ATTRIBUTE_ALIGNED FAIL_REGEX "aligned")
if(HAVE_ATTRIBUTE_ALIGNED)
add_definitions(-DHAVE_ATTRIBUTE_ALIGNED)
endif()
#
# check for __builtin_ctz() support in the compiler
#
check_c_source_compiles(
"int main(void) {
unsigned int zero = 0;
long test = __builtin_ctz(zero);
(void)test;
return 0;
}"
HAVE_BUILTIN_CTZ
)
if(HAVE_BUILTIN_CTZ)
add_definitions(-DHAVE_BUILTIN_CTZ)
endif()
#
# check for __builtin_ctzll() support in the compiler
#
check_c_source_compiles(
"int main(void) {
unsigned int zero = 0;
long test = __builtin_ctzll(zero);
(void)test;
return 0;
}"
HAVE_BUILTIN_CTZLL
)
if(HAVE_BUILTIN_CTZLL)
add_definitions(-DHAVE_BUILTIN_CTZLL)
endif()
#
# check for ptrdiff_t support
#
check_c_source_compiles(
"#include <stddef.h>
int main() {
ptrdiff_t *a;
(void)a;
return 0;
}"
HAVE_PTRDIFF_T
)
if(NOT HAVE_PTRDIFF_T)
set(NEED_PTRDIFF_T 1)
check_type_size("void *" SIZEOF_DATA_PTR)
message(STATUS "sizeof(void *) is ${SIZEOF_DATA_PTR} bytes")
if(${SIZEOF_DATA_PTR} MATCHES "4")
set(PTRDIFF_TYPE "uint32_t")
elseif(${SIZEOF_DATA_PTR} MATCHES "8")
set(PTRDIFF_TYPE "uint64_t")
else()
message(FATAL_ERROR "sizeof(void *) is neither 32 nor 64 bit")
endif()
endif()
if(MSVC)
add_definitions(-D_CRT_SECURE_NO_DEPRECATE)
add_definitions(-D_CRT_NONSTDC_NO_DEPRECATE)
endif()
set(ZLIB_ARCH_SRCS)
set(ZLIB_ARCH_HDRS)
set(ARCHDIR "arch/generic")
if(BASEARCH_X86_FOUND)
set(ARCHDIR "arch/x86")
endif()
if(BASEARCH_ARM_FOUND)
set(ARCHDIR "arch/arm")
endif()
if(BASEARCH_PPC_FOUND)
set(ARCHDIR "arch/power")
endif()
if(BASEARCH_RISCV_FOUND)
set(ARCHDIR "arch/riscv")
endif()
if(NOT CV_DISABLE_OPTIMIZATION)
if(BASEARCH_ARM_FOUND)
add_definitions(-DARM_FEATURES)
if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
if("${ARCH}" MATCHES "aarch64")
check_c_source_compiles(
"#include <sys/auxv.h>
int main() {
return (getauxval(AT_HWCAP) & HWCAP_CRC32);
}"
ARM_AUXV_HAS_CRC32
)
if(ARM_AUXV_HAS_CRC32)
add_definitions(-DARM_AUXV_HAS_CRC32)
else()
message(STATUS "HWCAP_CRC32 not present in sys/auxv.h; cannot detect support at runtime.")
endif()
else()
check_c_source_compiles(
"#include <sys/auxv.h>
int main() {
return (getauxval(AT_HWCAP2) & HWCAP2_CRC32);
}"
ARM_AUXV_HAS_CRC32
)
if(ARM_AUXV_HAS_CRC32)
add_definitions(-DARM_AUXV_HAS_CRC32)
else()
check_c_source_compiles(
"#include <sys/auxv.h>
#include <asm/hwcap.h>
int main() {
return (getauxval(AT_HWCAP2) & HWCAP2_CRC32);
}"
ARM_HWCAP_HAS_CRC32
)
if(ARM_HWCAP_HAS_CRC32)
add_definitions(-DARM_AUXV_HAS_CRC32 -DARM_ASM_HWCAP)
else()
message(STATUS "HWCAP2_CRC32 not present in sys/auxv.h; cannot detect support at runtime.")
endif()
endif()
check_c_source_compiles(
"#include <sys/auxv.h>
int main() {
return (getauxval(AT_HWCAP) & HWCAP_ARM_NEON);
}"
ARM_AUXV_HAS_NEON
)
if(ARM_AUXV_HAS_NEON)
add_definitions(-DARM_AUXV_HAS_NEON)
else()
check_c_source_compiles(
"#include <sys/auxv.h>
int main() {
return (getauxval(AT_HWCAP) & HWCAP_NEON);
}"
ARM_AUXV_HAS_NEON
)
if (ARM_AUXV_HAS_NEON)
add_definitions(-DARM_AUXV_HAS_NEON)
else()
message(STATUS "Neither HWCAP_ARM_NEON or HWCAP_NEON present in sys/auxv.h; cannot detect support at runtime.")
endif()
endif()
endif()
endif()
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/arm_features.h)
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/arm_features.c)
if(WITH_ACLE)
check_acle_compiler_flag()
if(HAVE_ACLE_FLAG)
add_definitions(-DARM_ACLE)
set(ACLE_SRCS ${ARCHDIR}/crc32_acle.c ${ARCHDIR}/insert_string_acle.c)
set_property(SOURCE ${ACLE_SRCS} PROPERTY COMPILE_FLAGS "${ACLEFLAG} ${NOLTOFLAG}")
list(APPEND ZLIB_ARCH_SRCS ${ACLE_SRCS})
else()
set(WITH_ACLE OFF)
endif()
else()
set(WITH_ACLE OFF)
endif()
if(WITH_NEON)
check_neon_compiler_flag()
if(NEON_AVAILABLE)
add_definitions(-DARM_NEON)
set(NEON_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/chunkset_neon.c
${ARCHDIR}/compare256_neon.c ${ARCHDIR}/slide_hash_neon.c)
list(APPEND ZLIB_ARCH_SRCS ${NEON_SRCS})
set_property(SOURCE ${NEON_SRCS} PROPERTY COMPILE_FLAGS "${NEONFLAG} ${NOLTOFLAG}")
if(MSVC)
add_definitions(-D__ARM_NEON__)
endif()
check_neon_ld4_intrinsics()
if(NEON_HAS_LD4)
add_definitions(-DARM_NEON_HASLD4)
endif()
else()
set(WITH_NEON OFF)
endif()
endif()
if(WITH_ARMV6)
check_armv6_compiler_flag()
if(HAVE_ARMV6_INLINE_ASM OR HAVE_ARMV6_INTRIN)
add_definitions(-DARM_SIMD)
set(ARMV6_SRCS ${ARCHDIR}/slide_hash_armv6.c)
set_property(SOURCE ${ARMV6_SRCS} PROPERTY COMPILE_FLAGS "${ARMV6FLAG} ${NOLTOFLAG}")
list(APPEND ZLIB_ARCH_SRCS ${ARMV6_SRCS})
if(HAVE_ARMV6_INTRIN)
add_definitions(-DARM_SIMD_INTRIN)
endif()
else()
set(WITH_ARMV6 OFF)
endif()
else()
set(WITH_ARMV6 OFF)
endif()
endif()
if(BASEARCH_PPC_FOUND)
# Common arch detection code
if(WITH_ALTIVEC)
check_ppc_intrinsics()
endif()
if(WITH_POWER8)
check_power8_intrinsics()
endif()
if(WITH_POWER9)
check_power9_intrinsics()
endif()
if(HAVE_VMX OR HAVE_POWER8_INTRIN OR HAVE_POWER9_INTRIN)
add_definitions(-DPOWER_FEATURES)
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/power_features.h)
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/power_features.c)
endif()
# VMX specific options and files
if(WITH_ALTIVEC)
if(HAVE_VMX)
add_definitions(-DPPC_FEATURES)
if(HAVE_ALTIVEC)
add_definitions(-DPPC_VMX)
set(PPC_SRCS ${ARCHDIR}/adler32_vmx.c ${ARCHDIR}/slide_hash_vmx.c)
list(APPEND ZLIB_ARCH_SRCS ${PPC_SRCS})
set_property(SOURCE ${PPC_SRCS} PROPERTY COMPILE_FLAGS "${PPCFLAGS}")
else()
set(WITH_ALTIVEC OFF)
endif()
endif()
endif()
# Power8 specific options and files
if(WITH_POWER8)
if(HAVE_POWER8_INTRIN)
add_definitions(-DPOWER8_VSX)
set(POWER8_SRCS ${ARCHDIR}/adler32_power8.c ${ARCHDIR}/chunkset_power8.c ${ARCHDIR}/slide_hash_power8.c)
if("${ARCH}" MATCHES "powerpc64(le)?")
add_definitions(-DPOWER8_VSX_CRC32)
list(APPEND POWER8_SRCS ${ARCHDIR}/crc32_power8.c)
endif()
list(APPEND ZLIB_ARCH_SRCS ${POWER8_SRCS})
set_property(SOURCE ${POWER8_SRCS} PROPERTY COMPILE_FLAGS "${POWER8FLAG} ${NOLTOFLAG}")
else()
set(WITH_POWER8 OFF)
endif()
endif()
# Power9 specific options and files
if(WITH_POWER9)
if(HAVE_POWER9_INTRIN)
add_definitions(-DPOWER9)
set(POWER9_SRCS ${ARCHDIR}/compare256_power9.c)
list(APPEND ZLIB_ARCH_SRCS ${POWER9_SRCS})
set_property(SOURCE ${POWER9_SRCS} PROPERTY COMPILE_FLAGS "${POWER9FLAG} ${NOLTOFLAG}")
else()
set(WITH_POWER9 OFF)
endif()
endif()
endif()
if(BASEARCH_RISCV_FOUND)
if(WITH_RVV)
check_rvv_intrinsics()
if(HAVE_RVV_INTRIN)
add_definitions(-DRISCV_FEATURES)
add_definitions(-DRISCV_RVV)
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/riscv_features.h)
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/riscv_features.c)
# FIXME: we will not set compile flags for riscv_features.c when
# the kernels update hwcap or hwprobe for riscv
set(RVV_SRCS ${ARCHDIR}/riscv_features.c ${ARCHDIR}/adler32_rvv.c ${ARCHDIR}/chunkset_rvv.c ${ARCHDIR}/compare256_rvv.c ${ARCHDIR}/slide_hash_rvv.c)
list(APPEND ZLIB_ARCH_SRCS ${RVV_SRCS})
set_property(SOURCE ${RVV_SRCS} PROPERTY COMPILE_FLAGS "${RISCVFLAG} ${NOLTOFLAG}")
else()
set(WITH_RVV OFF)
endif()
endif()
endif()
if(BASEARCH_X86_FOUND)
add_definitions(-DX86_FEATURES)
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/x86_features.h)
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/x86_features.c)
if(MSVC)
list(APPEND ZLIB_ARCH_HDRS fallback_builtins.h)
endif()
if(WITH_AVX2)
check_avx2_intrinsics()
if(HAVE_AVX2_INTRIN)
add_definitions(-DX86_AVX2)
set(AVX2_SRCS ${ARCHDIR}/slide_hash_avx2.c)
list(APPEND AVX2_SRCS ${ARCHDIR}/chunkset_avx2.c)
list(APPEND AVX2_SRCS ${ARCHDIR}/compare256_avx2.c)
list(APPEND AVX2_SRCS ${ARCHDIR}/adler32_avx2.c)
list(APPEND ZLIB_ARCH_SRCS ${AVX2_SRCS})
set_property(SOURCE ${AVX2_SRCS} PROPERTY COMPILE_FLAGS "${AVX2FLAG} ${NOLTOFLAG}")
else()
set(WITH_AVX2 OFF)
endif()
endif()
if(WITH_AVX512)
check_avx512_intrinsics()
if(HAVE_AVX512_INTRIN)
add_definitions(-DX86_AVX512)
list(APPEND AVX512_SRCS ${ARCHDIR}/adler32_avx512.c)
list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS})
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/adler32_avx512_p.h)
if(HAVE_MASK_INTRIN)
add_definitions(-DX86_MASK_INTRIN)
endif()
set_property(SOURCE ${AVX512_SRCS} PROPERTY COMPILE_FLAGS "${AVX512FLAG} ${NOLTOFLAG}")
else()
set(WITH_AVX512 OFF)
endif()
endif()
if(WITH_AVX512VNNI)
check_avx512vnni_intrinsics()
if(HAVE_AVX512VNNI_INTRIN)
add_definitions(-DX86_AVX512VNNI)
list(APPEND AVX512VNNI_SRCS ${ARCHDIR}/adler32_avx512_vnni.c)
list(APPEND ZLIB_ARCH_SRCS ${AVX512VNNI_SRCS})
set_property(SOURCE ${AVX512VNNI_SRCS} PROPERTY COMPILE_FLAGS "${AVX512VNNIFLAG} ${NOLTOFLAG}")
else()
set(WITH_AVX512VNNI OFF)
endif()
endif()
if(WITH_SSE42)
check_sse42_intrinsics()
if(HAVE_SSE42_INTRIN)
add_definitions(-DX86_SSE42)
set(SSE42_SRCS ${ARCHDIR}/adler32_sse42.c ${ARCHDIR}/insert_string_sse42.c)
list(APPEND ZLIB_ARCH_SRCS ${SSE42_SRCS})
set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${NOLTOFLAG}")
else()
set(WITH_SSE42 OFF)
endif()
endif()
if(WITH_SSE2)
check_sse2_intrinsics()
if(HAVE_SSE2_INTRIN)
add_definitions(-DX86_SSE2)
set(SSE2_SRCS ${ARCHDIR}/chunkset_sse2.c ${ARCHDIR}/compare256_sse2.c ${ARCHDIR}/slide_hash_sse2.c)
list(APPEND ZLIB_ARCH_SRCS ${SSE2_SRCS})
if(NOT ${ARCH} MATCHES "x86_64")
set_property(SOURCE ${SSE2_SRCS} PROPERTY COMPILE_FLAGS "${SSE2FLAG} ${NOLTOFLAG}")
add_definitions(-DX86_NOCHECK_SSE2)
endif()
else()
set(WITH_SSE2 OFF)
endif()
endif()
if(WITH_SSSE3)
check_ssse3_intrinsics()
if(HAVE_SSSE3_INTRIN)
add_definitions(-DX86_SSSE3)
set(SSSE3_SRCS ${ARCHDIR}/adler32_ssse3.c ${ARCHDIR}/chunkset_ssse3.c)
list(APPEND ZLIB_ARCH_SRCS ${SSSE3_SRCS})
set_property(SOURCE ${SSSE3_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${NOLTOFLAG}")
else()
set(WITH_SSSE3 OFF)
endif()
endif()
if(WITH_PCLMULQDQ AND WITH_SSSE3 AND WITH_SSE42)
check_pclmulqdq_intrinsics()
if(HAVE_PCLMULQDQ_INTRIN AND HAVE_SSSE3_INTRIN)
add_definitions(-DX86_PCLMULQDQ_CRC)
set(PCLMULQDQ_SRCS ${ARCHDIR}/crc32_pclmulqdq.c)
list(APPEND ZLIB_ARCH_SRCS ${PCLMULQDQ_SRCS})
set_property(SOURCE ${PCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG} ${NOLTOFLAG}")
if(WITH_VPCLMULQDQ AND WITH_AVX512)
check_vpclmulqdq_intrinsics()
if(HAVE_VPCLMULQDQ_INTRIN AND HAVE_AVX512_INTRIN)
add_definitions(-DX86_VPCLMULQDQ_CRC)
set(VPCLMULQDQ_SRCS ${ARCHDIR}/crc32_vpclmulqdq.c)
list(APPEND ZLIB_ARCH_SRCS ${VPCLMULQDQ_SRCS})
set_property(SOURCE ${VPCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG} ${VPCLMULFLAG} ${AVX512FLAG} ${NOLTOFLAG}")
else()
set(WITH_VPCLMULQDQ OFF)
endif()
else()
set(WITH_VPCLMULQDQ OFF)
endif()
else()
set(WITH_PCLMULQDQ OFF)
set(WITH_VPCLMULQDQ OFF)
endif()
else()
set(WITH_PCLMULQDQ OFF)
set(WITH_VPCLMULQDQ OFF)
endif()
check_xsave_intrinsics()
if(HAVE_XSAVE_INTRIN)
set_property(SOURCE ${ARCHDIR}/x86_features.c PROPERTY COMPILE_FLAGS "${XSAVEFLAG}")
endif()
endif()
endif()
#============================================================================
# zconf.h
#============================================================================
macro(generate_cmakein input output)
file(REMOVE ${output})
file(STRINGS ${input} _lines)
foreach(_line IN LISTS _lines)
string(REGEX REPLACE "#ifdef HAVE_UNISTD_H.*" "@ZCONF_UNISTD_LINE@" _line "${_line}")
string(REGEX REPLACE "#ifdef NEED_PTRDIFF_T.*" "@ZCONF_PTRDIFF_LINE@" _line "${_line}")
if(NEED_PTRDIFF_T)
string(REGEX REPLACE "typedef PTRDIFF_TYPE" "typedef @PTRDIFF_TYPE@" _line "${_line}")
endif()
file(APPEND ${output} "${_line}\n")
endforeach()
endmacro(generate_cmakein)
generate_cmakein( ${CMAKE_CURRENT_SOURCE_DIR}/zconf.h.in ${CMAKE_CURRENT_BINARY_DIR}/zconf.h.cmakein )
#============================================================================
# zlib
#============================================================================
set(ZLIB_PUBLIC_HDRS
${CMAKE_CURRENT_BINARY_DIR}/zconf.h
${CMAKE_CURRENT_BINARY_DIR}/zlib_name_mangling.h
${CMAKE_CURRENT_BINARY_DIR}/zlib.h
)
set(ZLIB_PRIVATE_HDRS
adler32_p.h
chunkset_tpl.h
compare256_rle.h
cpu_features.h
crc32_braid_p.h
crc32_braid_comb_p.h
crc32_braid_tbl.h
crc32_fold.h
deflate.h
deflate_p.h
functable.h
inffast_tpl.h
inffixed_tbl.h
inflate.h
inflate_p.h
inftrees.h
insert_string_tpl.h
match_tpl.h
trees.h
trees_emit.h
trees_tbl.h
zbuild.h
zendian.h
zutil.h
)
set(ZLIB_SRCS
adler32.c
adler32_fold.c
chunkset.c
compare256.c
compress.c
cpu_features.c
crc32_braid.c
crc32_braid_comb.c
crc32_fold.c
deflate.c
deflate_fast.c
deflate_huff.c
deflate_medium.c
deflate_quick.c
deflate_rle.c
deflate_slow.c
deflate_stored.c
functable.c
infback.c
inflate.c
inftrees.c
insert_string.c
insert_string_roll.c
slide_hash.c
trees.c
uncompr.c
zutil.c
)
set(ZLIB_GZFILE_PRIVATE_HDRS
gzguts.h
)
set(ZLIB_GZFILE_SRCS
gzlib.c
${CMAKE_CURRENT_BINARY_DIR}/gzread.c
gzwrite.c
)
set(ZLIB_ALL_SRCS ${ZLIB_SRCS} ${ZLIB_ARCH_HDRS} ${ZLIB_ARCH_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS})
list(APPEND ZLIB_ALL_SRCS ${ZLIB_GZFILE_PRIVATE_HDRS} ${ZLIB_GZFILE_SRCS})
add_library(zlib STATIC ${ZLIB_ALL_SRCS})
target_include_directories(zlib PUBLIC
"$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR};${CMAKE_CURRENT_SOURCE_DIR}>"
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>")
if(HAVE_UNISTD_H)
SET(ZCONF_UNISTD_LINE "#if 1 /* was set to #if 1 by configure/cmake/etc */")
else()
SET(ZCONF_UNISTD_LINE "#if 0 /* was set to #if 0 by configure/cmake/etc */")
endif()
if(NEED_PTRDIFF_T)
SET(ZCONF_PTRDIFF_LINE "#if 1 /* was set to #if 1 by configure/cmake/etc */")
else()
SET(ZCONF_PTRDIFF_LINE "#ifdef NEED_PTRDIFF_T /* may be set to #if 1 by configure/cmake/etc */")
endif()
configure_file(${CMAKE_CURRENT_BINARY_DIR}/zconf.h.cmakein
${CMAKE_CURRENT_BINARY_DIR}/zconf.h @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib.h.in
${CMAKE_CURRENT_BINARY_DIR}/zlib.h @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/gzread.c.in
${CMAKE_CURRENT_BINARY_DIR}/gzread.c @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib_name_mangling.h.empty
${CMAKE_CURRENT_BINARY_DIR}/zlib_name_mangling${SUFFIX}.h COPYONLY)
ocv_warnings_disable(CMAKE_C_FLAGS -Wmissing-prototypes
-Wundef
-Wmissing-declarations
)
set_target_properties(${ZLIB_LIBRARY} PROPERTIES
OUTPUT_NAME ${ZLIB_LIBRARY}
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
COMPILE_PDB_NAME ${ZLIB_LIBRARY}
COMPILE_PDB_NAME_DEBUG "${ZLIB_LIBRARY}${OPENCV_DEBUG_POSTFIX}"
ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH}
)
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(${ZLIB_LIBRARY} PROPERTIES FOLDER "3rdparty")
endif()
if(NOT BUILD_SHARED_LIBS)
ocv_install_target(${ZLIB_LIBRARY} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
endif()
ocv_install_3rdparty_licenses(${ZLIB_LIBRARY} LICENSE.md)

3rdparty/zlib-ng/LICENSE.md
@@ -0,0 +1,19 @@
(C) 1995-2013 Jean-loup Gailly and Mark Adler
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.

3rdparty/zlib-ng/README.md
@@ -0,0 +1,229 @@
| CI | Stable | Develop |
|:---|:-------|:--------|
| GitHub Actions | [![Stable CMake](https://github.com/zlib-ng/zlib-ng/actions/workflows/cmake.yml/badge.svg?branch=stable)](https://github.com/zlib-ng/zlib-ng/actions/workflows/cmake.yml?query=branch%3Astable) <br> [![Stable Configure](https://github.com/zlib-ng/zlib-ng/actions/workflows/configure.yml/badge.svg?branch=stable)](https://github.com/zlib-ng/zlib-ng/actions/workflows/configure.yml?query=branch%3Astable) <br> [![Stable NMake](https://github.com/zlib-ng/zlib-ng/actions/workflows/nmake.yml/badge.svg?branch=stable)](https://github.com/zlib-ng/zlib-ng/actions/workflows/nmake.yml?query=branch%3Astable) | [![Develop CMake](https://github.com/zlib-ng/zlib-ng/actions/workflows/cmake.yml/badge.svg?branch=develop)](https://github.com/zlib-ng/zlib-ng/actions/workflows/cmake.yml?query=branch%3Adevelop) <br> [![Develop Configure](https://github.com/zlib-ng/zlib-ng/actions/workflows/configure.yml/badge.svg?branch=develop)](https://github.com/zlib-ng/zlib-ng/actions/workflows/configure.yml?query=branch%3Adevelop) <br> [![Develop NMake](https://github.com/zlib-ng/zlib-ng/actions/workflows/nmake.yml/badge.svg?branch=develop)](https://github.com/zlib-ng/zlib-ng/actions/workflows/nmake.yml?query=branch%3Adevelop) |
| CodeFactor | [![CodeFactor](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/badge/stable)](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/overview/stable) | [![CodeFactor](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/badge/develop)](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/overview/develop) |
| OSS-Fuzz | [![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/zlib-ng.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:zlib-ng) | [![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/zlib-ng.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:zlib-ng) |
| Codecov | [![codecov](https://codecov.io/github/zlib-ng/zlib-ng/branch/stable/graph/badge.svg?token=uKsgK9LIuC)](https://codecov.io/github/zlib-ng/zlib-ng/tree/stable) | [![codecov](https://codecov.io/github/zlib-ng/zlib-ng/branch/develop/graph/badge.svg?token=uKsgK9LIuC)](https://codecov.io/github/zlib-ng/zlib-ng/tree/develop) |
## zlib-ng
*zlib data compression library for the next generation systems*
Maintained by Hans Kristian Rosbach
aka Dead2 (zlib-ng àt circlestorm dót org)
Features
--------
* Zlib compatible API with support for dual-linking
* Modernized native API based on zlib API for ease of porting
* Modern C11 syntax and a clean code layout
* Deflate medium and quick algorithms based on Intel’s zlib fork
* Support for CPU intrinsics when available
* Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX
* CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z
* Hash table implementation using CRC32-C intrinsics on x86 and ARM
* Slide hash implementations using SSE2, AVX2, ARMv6, Neon, VMX & VSX
* Compare256 implementations using SSE2, AVX2, Neon, POWER9 & RVV
* Inflate chunk copying using SSE2, SSSE3, AVX, Neon & VSX
* Support for hardware-accelerated deflate using IBM Z DFLTCC
* Unaligned memory read/writes and large bit buffer improvements
* Includes improvements from Cloudflare and Intel forks
* Configure, CMake, and NMake build system support
* Comprehensive set of CMake unit tests
* Code sanitizers, fuzzing, and coverage
* GitHub Actions continuous integration on Windows, macOS, and Linux
* Emulated CI for ARM, AARCH64, PPC, PPC64, RISCV, SPARC64, S390x using qemu
History
-------
The motivation for this fork was seeing several 3rd party contributions with new optimizations not getting
implemented into the official zlib repository.
Mark Adler has been maintaining zlib for a very long time, and he has done a great job and hopefully he will continue
for a long time yet. The idea of zlib-ng is not to replace zlib, but to co-exist as a drop-in replacement with a
lower threshold for code change.
zlib has a long history and is incredibly portable, even supporting many systems that predate the Internet.<br>
That is great, but it can complicate further development and maintainability. The zlib code contains many workarounds
for really old compilers or to accommodate systems with limitations such as operating in a 16-bit environment.
Many of these workarounds are only maintenance burdens, some of them are pretty huge code-wise. With many workarounds
cluttered throughout the code, it makes it harder for new programmers with an idea/interest for zlib to contribute.
I decided to make a fork, merge all the Intel optimizations, some of the Cloudflare optimizations, plus a couple other
smaller patches. Then started cleaning out workarounds, various dead code, all contrib and example code.<br>
The result is a better performing and easier to maintain zlib-ng.
A lot of improvements have gone into zlib-ng since its start, and numerous people and companies have contributed both
small and big improvements, or valuable testing.
Build
-----
<sup>Please read LICENSE.md, it is very simple and very liberal.</sup>
There are two ways to build zlib-ng:
### Cmake
To build zlib-ng using the cross-platform makefile generator cmake.
```
cmake .
cmake --build . --config Release
ctest --verbose -C Release
```
Alternatively, you can use the cmake configuration GUI tool ccmake:
```
ccmake .
```
### Configure
To build zlib-ng using the bash configure script:
```
./configure
make
make test
```
Build Options
-------------
| CMake | configure | Description | Default |
|:-------------------------|:-------------------------|:--------------------------------------------------------------------------------------|---------|
| ZLIB_COMPAT | --zlib-compat | Compile with zlib compatible API | OFF |
| ZLIB_ENABLE_TESTS | | Build test binaries | ON |
| WITH_GZFILEOP | --without-gzfileops | Compile with support for gzFile related functions | ON |
| WITH_OPTIM | --without-optimizations | Build with optimisations | ON |
| WITH_NEW_STRATEGIES | --without-new-strategies | Use new strategies | ON |
| WITH_NATIVE_INSTRUCTIONS | | Compiles with full instruction set supported on this host (gcc/clang -march=native) | OFF |
| WITH_SANITIZER | | Build with sanitizer (memory, address, undefined) | OFF |
| WITH_GTEST | | Build gtest_zlib | ON |
| WITH_FUZZERS | | Build test/fuzz | OFF |
| WITH_BENCHMARKS | | Build test/benchmarks | OFF |
| WITH_MAINTAINER_WARNINGS | | Build with project maintainer warnings | OFF |
| WITH_CODE_COVERAGE | | Enable code coverage reporting | OFF |
Install
-------
WARNING: We do not recommend manually installing unless you really know what you are doing, because this can
potentially override the system default zlib library, and any incompatibility or wrong configuration of zlib-ng
can make the whole system unusable, requiring recovery or reinstall.
If you still want a manual install, we recommend using the /opt/ path prefix.
For Linux distros, an alternative way to use zlib-ng (if compiled in zlib-compat mode) instead of zlib, is through
the use of the _LD_PRELOAD_ environment variable. If the program is dynamically linked with zlib, then the program
will temporarily attempt to use zlib-ng instead, without risking system-wide instability.
```
LD_PRELOAD=/opt/zlib-ng/libz.so.1.2.13.zlib-ng /usr/bin/program
```
### Cmake
To install zlib-ng system-wide using cmake:
```sh or powershell
cmake --build . --target install
```
### Configure
To install zlib-ng system-wide using the configure script:
```sh
make install
```
### CPack
After building with cmake, an installation package can be created using cpack. By default a tgz package is created,
but you can append `-G <format>` to each command to generate alternative packages types (TGZ, ZIP, RPM, DEB). To easily
create a rpm or deb package, you would use `-G RPM` or `-G DEB` respectively.
```sh or powershell
cd build
cpack --config CPackConfig.cmake
cpack --config CPackSourceConfig.cmake
```
### Vcpkg
Alternatively, you can build and install zlib-ng using the [vcpkg](https://github.com/Microsoft/vcpkg/) dependency manager:
```sh or powershell
git clone https://github.com/Microsoft/vcpkg.git
cd vcpkg
./bootstrap-vcpkg.sh # "./bootstrap-vcpkg.bat" for powershell
./vcpkg integrate install
./vcpkg install zlib-ng
```
The zlib-ng port in vcpkg is kept up to date by Microsoft team members and community contributors.
If the version is out of date, please [create an issue or pull request](https://github.com/Microsoft/vcpkg) on the vcpkg repository.
Contributing
------------
Zlib-ng is aiming to be open to contributions, and we would be delighted to receive pull requests on github.
Help with testing and reviewing pull requests etc is also very much appreciated.
Please check the Wiki for more info: [Contributing](https://github.com/zlib-ng/zlib-ng/wiki/Contributing)
Acknowledgments
----------------
Thanks go out to all the people and companies who have taken the time to contribute
code reviews, testing and/or patches. Zlib-ng would not have been nearly as good without you.
The deflate format used by zlib was defined by Phil Katz.<br>
The deflate and zlib specifications were written by L. Peter Deutsch.
zlib was originally created by Jean-loup Gailly (compression) and Mark Adler (decompression).
Advanced Build Options
----------------------
| CMake | configure | Description | Default |
|:--------------------------------|:----------------------|:--------------------------------------------------------------------|------------------------|
| FORCE_SSE2 | --force-sse2 | Skip runtime check for SSE2 instructions (Always on for x86_64) | OFF (x86) |
| WITH_AVX2 | | Build with AVX2 intrinsics | ON |
| WITH_AVX512 | | Build with AVX512 intrinsics | ON |
| WITH_AVX512VNNI | | Build with AVX512VNNI intrinsics | ON |
| WITH_SSE2 | | Build with SSE2 intrinsics | ON |
| WITH_SSSE3 | | Build with SSSE3 intrinsics | ON |
| WITH_SSE42 | | Build with SSE42 intrinsics | ON |
| WITH_PCLMULQDQ | | Build with PCLMULQDQ intrinsics | ON |
| WITH_VPCLMULQDQ | --without-vpclmulqdq | Build with VPCLMULQDQ intrinsics | ON |
| WITH_ACLE | --without-acle | Build with ACLE intrinsics | ON |
| WITH_NEON | --without-neon | Build with NEON intrinsics | ON |
| WITH_ARMV6 | --without-armv6 | Build with ARMv6 intrinsics | ON |
| WITH_ALTIVEC | --without-altivec | Build with AltiVec (VMX) intrinsics | ON |
| WITH_POWER8 | --without-power8 | Build with POWER8 optimisations | ON |
| WITH_RVV | | Build with RVV intrinsics | ON |
| WITH_CRC32_VX | --without-crc32-vx | Build with vectorized CRC32 on IBM Z | ON |
| WITH_DFLTCC_DEFLATE | --with-dfltcc-deflate | Build with DFLTCC intrinsics for compression on IBM Z | OFF |
| WITH_DFLTCC_INFLATE | --with-dfltcc-inflate | Build with DFLTCC intrinsics for decompression on IBM Z | OFF |
| WITH_UNALIGNED | --without-unaligned | Allow optimizations that use unaligned reads if safe on current arch| ON |
| WITH_INFLATE_STRICT | | Build with strict inflate distance checking | OFF |
| WITH_INFLATE_ALLOW_INVALID_DIST | | Build with zero fill for inflate invalid distances | OFF |
| INSTALL_UTILS | | Copy minigzip and minideflate during install | OFF |
| ZLIBNG_ENABLE_TESTS | | Test zlib-ng specific API | ON |
Related Projects
----------------
* Fork of the popular minizip https://github.com/zlib-ng/minizip-ng
* Python tool to benchmark minigzip/minideflate https://github.com/zlib-ng/deflatebench
* Python tool to benchmark pigz https://github.com/zlib-ng/pigzbench
* 3rd party patches for zlib-ng compatibility https://github.com/zlib-ng/patches

3rdparty/zlib-ng/adler32.c
@@ -0,0 +1,115 @@
/* adler32.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011, 2016 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "functable.h"
#include "adler32_p.h"
/* ========================================================================= */
Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
uint32_t sum2;
unsigned n;
/* split Adler-32 into component sums */
sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
return adler32_len_1(adler, buf, sum2);
/* initial Adler-32 value (deferred check for len == 1 speed) */
if (UNLIKELY(buf == NULL))
return 1L;
/* in case short lengths are provided, keep it somewhat fast */
if (UNLIKELY(len < 16))
return adler32_len_16(adler, buf, len, sum2);
/* do length NMAX blocks -- requires just one modulo operation */
while (len >= NMAX) {
len -= NMAX;
#ifdef UNROLL_MORE
n = NMAX / 16; /* NMAX is divisible by 16 */
#else
n = NMAX / 8; /* NMAX is divisible by 8 */
#endif
do {
#ifdef UNROLL_MORE
DO16(adler, sum2, buf); /* 16 sums unrolled */
buf += 16;
#else
DO8(adler, sum2, buf, 0); /* 8 sums unrolled */
buf += 8;
#endif
} while (--n);
adler %= BASE;
sum2 %= BASE;
}
/* do remaining bytes (less than NMAX, still just one modulo) */
return adler32_len_64(adler, buf, len, sum2);
}
#ifdef ZLIB_COMPAT
unsigned long Z_EXPORT PREFIX(adler32_z)(unsigned long adler, const unsigned char *buf, size_t len) {
return (unsigned long)functable.adler32((uint32_t)adler, buf, len);
}
#else
uint32_t Z_EXPORT PREFIX(adler32_z)(uint32_t adler, const unsigned char *buf, size_t len) {
return functable.adler32(adler, buf, len);
}
#endif
/* ========================================================================= */
#ifdef ZLIB_COMPAT
unsigned long Z_EXPORT PREFIX(adler32)(unsigned long adler, const unsigned char *buf, unsigned int len) {
return (unsigned long)functable.adler32((uint32_t)adler, buf, len);
}
#else
uint32_t Z_EXPORT PREFIX(adler32)(uint32_t adler, const unsigned char *buf, uint32_t len) {
return functable.adler32(adler, buf, len);
}
#endif
/* ========================================================================= */
static uint32_t adler32_combine_(uint32_t adler1, uint32_t adler2, z_off64_t len2) {
uint32_t sum1;
uint32_t sum2;
unsigned rem;
/* for negative len, return invalid adler32 as a clue for debugging */
if (len2 < 0)
return 0xffffffff;
/* the derivation of this formula is left as an exercise for the reader */
len2 %= BASE; /* assumes len2 >= 0 */
rem = (unsigned)len2;
sum1 = adler1 & 0xffff;
sum2 = rem * sum1;
sum2 %= BASE;
sum1 += (adler2 & 0xffff) + BASE - 1;
sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem;
if (sum1 >= BASE) sum1 -= BASE;
if (sum1 >= BASE) sum1 -= BASE;
if (sum2 >= ((unsigned long)BASE << 1)) sum2 -= ((unsigned long)BASE << 1);
if (sum2 >= BASE) sum2 -= BASE;
return sum1 | (sum2 << 16);
}
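/* Sketch of the derivation (working mod BASE): write adler1 = (B1 << 16) | A1 for the
 * first stream and adler2 = (B2 << 16) | A2 for the following len2 bytes. Then
 *   A = A1 + A2 - 1               (A2 already includes the initial seed of 1)
 *   B = B1 + len2*A1 + B2 - len2  (each of the len2 bytes adds A1 to the running B,
 *                                  while B2 already counted len2 copies of the seed 1)
 * which is what sum1 and sum2 compute above, with BASE added where needed so the
 * intermediate values stay non-negative before the conditional reductions. */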
/* ========================================================================= */
#ifdef ZLIB_COMPAT
unsigned long Z_EXPORT PREFIX(adler32_combine)(unsigned long adler1, unsigned long adler2, z_off_t len2) {
return (unsigned long)adler32_combine_((uint32_t)adler1, (uint32_t)adler2, len2);
}
unsigned long Z_EXPORT PREFIX4(adler32_combine)(unsigned long adler1, unsigned long adler2, z_off64_t len2) {
return (unsigned long)adler32_combine_((uint32_t)adler1, (uint32_t)adler2, len2);
}
#else
uint32_t Z_EXPORT PREFIX4(adler32_combine)(uint32_t adler1, uint32_t adler2, z_off64_t len2) {
return adler32_combine_(adler1, adler2, len2);
}
#endif

3rdparty/zlib-ng/adler32_fold.c
@@ -0,0 +1,16 @@
/* adler32_fold.c -- adler32 folding interface
* Copyright (C) 2022 Adam Stylinski
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "functable.h"
#include "adler32_fold.h"
#include <limits.h>
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
adler = functable.adler32(adler, src, len);
memcpy(dst, src, len);
return adler;
}
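Note that `adler32_fold_copy_c` above goes through `functable.adler32` rather than calling a specific kernel; in zlib-ng the function table (`functable.c` / `functable.h` in this patch) is populated at runtime from CPU feature detection, which is the "dispatch intrinsics at runtime" behaviour mentioned in the commit message. The sketch below shows the general pattern with simplified, hypothetical names; it is not the actual zlib-ng implementation.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Two interchangeable implementations; in zlib-ng the "fast" one would be a
// SIMD kernel compiled with the matching -m flags.
static uint32_t adler32_portable(uint32_t adler, const uint8_t* buf, size_t len) {
    uint32_t a = adler & 0xffff, b = (adler >> 16) & 0xffff;
    for (size_t i = 0; i < len; ++i) { a = (a + buf[i]) % 65521; b = (b + a) % 65521; }
    return (b << 16) | a;
}
static uint32_t adler32_fast(uint32_t adler, const uint8_t* buf, size_t len) {
    return adler32_portable(adler, buf, len);  // stand-in for an AVX2/NEON kernel
}

static bool cpu_has_simd() { return true; }    // stand-in for CPUID / getauxval checks

struct FuncTable { uint32_t (*adler32)(uint32_t, const uint8_t*, size_t); };

static uint32_t adler32_dispatch_once(uint32_t, const uint8_t*, size_t);
static FuncTable functable = { adler32_dispatch_once };

// First call: detect CPU features once, rewrite the table, then forward the call.
static uint32_t adler32_dispatch_once(uint32_t adler, const uint8_t* buf, size_t len) {
    functable.adler32 = cpu_has_simd() ? adler32_fast : adler32_portable;
    return functable.adler32(adler, buf, len);
}

int main() {
    const char text[] = "hello zlib-ng";
    uint32_t sum = functable.adler32(1u, reinterpret_cast<const uint8_t*>(text), sizeof(text) - 1);
    std::printf("adler32 = 0x%08x\n", (unsigned)sum);
    return 0;
}
```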

3rdparty/zlib-ng/adler32_fold.h
@@ -0,0 +1,11 @@
/* adler32_fold.h -- adler32 folding interface
* Copyright (C) 2022 Adam Stylinski
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef ADLER32_FOLD_H_
#define ADLER32_FOLD_H_
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif

3rdparty/zlib-ng/adler32_p.h
@@ -0,0 +1,70 @@
/* adler32_p.h -- Private inline functions and macros shared with
* different computation of the Adler-32 checksum
* of a data stream.
* Copyright (C) 1995-2011, 2016 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef ADLER32_P_H
#define ADLER32_P_H
#define BASE 65521U /* largest prime smaller than 65536 */
#define NMAX 5552
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
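/* For reference: n = 5552 gives 255*5552*5553/2 + 5553*65520 = 4,294,690,200 <= 2^32-1
   = 4,294,967,295, while n = 5553 already gives 4,296,171,735 and overflows. */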
#define DO1(sum1, sum2, buf, i) {(sum1) += buf[(i)]; (sum2) += (sum1);}
#define DO2(sum1, sum2, buf, i) {DO1(sum1, sum2, buf, i); DO1(sum1, sum2, buf, i+1);}
#define DO4(sum1, sum2, buf, i) {DO2(sum1, sum2, buf, i); DO2(sum1, sum2, buf, i+2);}
#define DO8(sum1, sum2, buf, i) {DO4(sum1, sum2, buf, i); DO4(sum1, sum2, buf, i+4);}
#define DO16(sum1, sum2, buf) {DO8(sum1, sum2, buf, 0); DO8(sum1, sum2, buf, 8);}
static inline uint32_t adler32_len_1(uint32_t adler, const uint8_t *buf, uint32_t sum2) {
adler += buf[0];
adler %= BASE;
sum2 += adler;
sum2 %= BASE;
return adler | (sum2 << 16);
}
static inline uint32_t adler32_len_16(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) {
while (len) {
--len;
adler += *buf++;
sum2 += adler;
}
adler %= BASE;
sum2 %= BASE; /* only added so many BASE's */
/* return recombined sums */
return adler | (sum2 << 16);
}
static inline uint32_t adler32_copy_len_16(uint32_t adler, const uint8_t *buf, uint8_t *dst, size_t len, uint32_t sum2) {
while (len--) {
*dst = *buf++;
adler += *dst++;
sum2 += adler;
}
adler %= BASE;
sum2 %= BASE; /* only added so many BASE's */
/* return recombined sums */
return adler | (sum2 << 16);
}
static inline uint32_t adler32_len_64(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) {
#ifdef UNROLL_MORE
while (len >= 16) {
len -= 16;
DO16(adler, sum2, buf);
buf += 16;
#else
while (len >= 8) {
len -= 8;
DO8(adler, sum2, buf, 0);
buf += 8;
#endif
}
/* Process tail (len < 16). */
return adler32_len_16(adler, buf, len, sum2);
}
#endif /* ADLER32_P_H */

3rdparty/zlib-ng/arch/.gitignore
@@ -0,0 +1,2 @@
# ignore Makefiles; they're all automatically generated
Makefile

3rdparty/zlib-ng/arch/arm/Makefile.in
@@ -0,0 +1,85 @@
# Makefile for zlib
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
# For conditions of distribution and use, see copyright notice in zlib.h
CC=
CFLAGS=
SFLAGS=
INCLUDES=
SUFFIX=
ACLEFLAG=
NEONFLAG=
ARMV6FLAG=
NOLTOFLAG=
SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)
all: \
adler32_neon.o adler32_neon.lo \
arm_features.o arm_features.lo \
chunkset_neon.o chunkset_neon.lo \
compare256_neon.o compare256_neon.lo \
crc32_acle.o crc32_acle.lo \
slide_hash_neon.o slide_hash_neon.lo \
slide_hash_armv6.o slide_hash_armv6.lo \
insert_string_acle.o insert_string_acle.lo
adler32_neon.o:
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
adler32_neon.lo:
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
arm_features.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
arm_features.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
chunkset_neon.o:
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
chunkset_neon.lo:
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
compare256_neon.o:
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
compare256_neon.lo:
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
crc32_acle.o:
$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
crc32_acle.lo:
$(CC) $(SFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
slide_hash_neon.o:
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
slide_hash_neon.lo:
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
slide_hash_armv6.o:
$(CC) $(CFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
slide_hash_armv6.lo:
$(CC) $(SFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
insert_string_acle.o:
$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
insert_string_acle.lo:
$(CC) $(SFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
mostlyclean: clean
clean:
rm -f *.o *.lo *~
rm -rf objs
rm -f *.gcda *.gcno *.gcov
distclean: clean
rm -f Makefile

3rdparty/zlib-ng/arch/arm/acle_intrins.h
@@ -0,0 +1,35 @@
#ifndef ARM_ACLE_INTRINS_H
#define ARM_ACLE_INTRINS_H
#include <stdint.h>
#ifdef _MSC_VER
# include <intrin.h>
#elif defined(HAVE_ARM_ACLE_H)
# include <arm_acle.h>
#endif
#ifdef ARM_ACLE
#if defined(__aarch64__)
# define Z_TARGET_CRC Z_TARGET("+crc")
#else
# define Z_TARGET_CRC
#endif
#endif
#ifdef ARM_SIMD
#ifdef _MSC_VER
typedef uint32_t uint16x2_t;
#define __uqsub16 _arm_uqsub16
#elif !defined(ARM_SIMD_INTRIN)
typedef uint32_t uint16x2_t;
static inline uint16x2_t __uqsub16(uint16x2_t __a, uint16x2_t __b) {
uint16x2_t __c;
__asm__ __volatile__("uqsub16 %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b));
return __c;
}
#endif
#endif
#endif // include guard ARM_ACLE_INTRINS_H

3rdparty/zlib-ng/arch/arm/adler32_neon.c
@@ -0,0 +1,215 @@
/* Copyright (C) 1995-2011, 2016 Mark Adler
* Copyright (C) 2017 ARM Holdings Inc.
* Authors:
* Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
* Adam Stylinski <kungfujesus06@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef ARM_NEON
#include "neon_intrins.h"
#include "../../zbuild.h"
#include "../../adler32_p.h"
static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
static const uint16_t ALIGNED_(16) taps[64] = {
64, 63, 62, 61, 60, 59, 58, 57,
56, 55, 54, 53, 52, 51, 50, 49,
48, 47, 46, 45, 44, 43, 42, 41,
40, 39, 38, 37, 36, 35, 34, 33,
32, 31, 30, 29, 28, 27, 26, 25,
24, 23, 22, 21, 20, 19, 18, 17,
16, 15, 14, 13, 12, 11, 10, 9,
8, 7, 6, 5, 4, 3, 2, 1 };
uint32x4_t adacc = vdupq_n_u32(0);
uint32x4_t s2acc = vdupq_n_u32(0);
uint32x4_t s2acc_0 = vdupq_n_u32(0);
uint32x4_t s2acc_1 = vdupq_n_u32(0);
uint32x4_t s2acc_2 = vdupq_n_u32(0);
adacc = vsetq_lane_u32(s[0], adacc, 0);
s2acc = vsetq_lane_u32(s[1], s2acc, 0);
uint32x4_t s3acc = vdupq_n_u32(0);
uint32x4_t adacc_prev = adacc;
uint16x8_t s2_0, s2_1, s2_2, s2_3;
s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0);
uint16x8_t s2_4, s2_5, s2_6, s2_7;
s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0);
size_t num_iter = len >> 2;
int rem = len & 3;
for (size_t i = 0; i < num_iter; ++i) {
uint8x16x4_t d0_d3 = vld1q_u8_x4(buf);
/* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
* bit instruction, we'll have to make due summing to 16 bits first */
uint16x8x2_t hsum, hsum_fold;
hsum.val[0] = vpaddlq_u8(d0_d3.val[0]);
hsum.val[1] = vpaddlq_u8(d0_d3.val[1]);
hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d0_d3.val[2]);
hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d0_d3.val[3]);
adacc = vpadalq_u16(adacc, hsum_fold.val[0]);
s3acc = vaddq_u32(s3acc, adacc_prev);
adacc = vpadalq_u16(adacc, hsum_fold.val[1]);
/* If we do straight widening additions to the 16 bit values, we don't incur
* the usual penalties of a pairwise add. We can defer the multiplications
* until the very end. These will not overflow because we are incurring at
* most 408 loop iterations (NMAX / 64), and a given lane is only going to be
* summed into once. This means for the maximum input size, the largest value
* we will see is 255 * 102 = 26010, safely under uint16 max */
s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0]));
s2_1 = vaddw_high_u8(s2_1, d0_d3.val[0]);
s2_2 = vaddw_u8(s2_2, vget_low_u8(d0_d3.val[1]));
s2_3 = vaddw_high_u8(s2_3, d0_d3.val[1]);
s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2]));
s2_5 = vaddw_high_u8(s2_5, d0_d3.val[2]);
s2_6 = vaddw_u8(s2_6, vget_low_u8(d0_d3.val[3]));
s2_7 = vaddw_high_u8(s2_7, d0_d3.val[3]);
adacc_prev = adacc;
buf += 64;
}
s3acc = vshlq_n_u32(s3acc, 6);
if (rem) {
uint32x4_t s3acc_0 = vdupq_n_u32(0);
while (rem--) {
uint8x16_t d0 = vld1q_u8(buf);
uint16x8_t adler;
adler = vpaddlq_u8(d0);
s2_6 = vaddw_u8(s2_6, vget_low_u8(d0));
s2_7 = vaddw_high_u8(s2_7, d0);
adacc = vpadalq_u16(adacc, adler);
s3acc_0 = vaddq_u32(s3acc_0, adacc_prev);
adacc_prev = adacc;
buf += 16;
}
s3acc_0 = vshlq_n_u32(s3acc_0, 4);
s3acc = vaddq_u32(s3acc_0, s3acc);
}
uint16x8x4_t t0_t3 = vld1q_u16_x4(taps);
uint16x8x4_t t4_t7 = vld1q_u16_x4(taps + 32);
s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1);
s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1));
s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2);
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2));
s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3);
s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3));
s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4);
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4));
s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5);
s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5));
s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6);
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6));
s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7);
s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7));
s2acc = vaddq_u32(s2acc_0, s2acc);
s2acc_2 = vaddq_u32(s2acc_1, s2acc_2);
s2acc = vaddq_u32(s2acc, s2acc_2);
uint32x2_t adacc2, s2acc2, as;
s2acc = vaddq_u32(s2acc, s3acc);
adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
as = vpadd_u32(adacc2, s2acc2);
s[0] = vget_lane_u32(as, 0);
s[1] = vget_lane_u32(as, 1);
}
static void NEON_handle_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
unsigned int i;
for (i = 0; i < len; ++i) {
pair[0] += buf[i];
pair[1] += pair[0];
}
}
Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len) {
/* split Adler-32 into component sums */
uint32_t sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;
/* in case user likes doing a byte at a time, keep it fast */
if (len == 1)
return adler32_len_1(adler, buf, sum2);
/* initial Adler-32 value (deferred check for len == 1 speed) */
if (buf == NULL)
return 1L;
/* in case short lengths are provided, keep it somewhat fast */
if (len < 16)
return adler32_len_16(adler, buf, len, sum2);
uint32_t pair[2];
int n = NMAX;
unsigned int done = 0;
/* Split Adler-32 into component sums, it can be supplied by
* the caller sites (e.g. in a PNG file).
*/
pair[0] = adler;
pair[1] = sum2;
/* If memory is not SIMD aligned, do scalar sums to an aligned
* offset, provided that doing so doesn't completely eliminate
* SIMD operation. Aligned loads are still faster on ARM, even
* though there's no explicit aligned load instruction */
unsigned int align_offset = ((uintptr_t)buf & 15);
unsigned int align_adj = (align_offset) ? 16 - align_offset : 0;
if (align_offset && len >= (16 + align_adj)) {
NEON_handle_tail(pair, buf, align_adj);
n -= align_adj;
done += align_adj;
} else {
/* If here, we failed the len criteria test, it wouldn't be
* worthwhile to do scalar aligning sums */
align_adj = 0;
}
while (done < len) {
int remaining = (int)(len - done);
n = MIN(remaining, (done == align_adj) ? n : NMAX);
if (n < 16)
break;
NEON_accum32(pair, buf + done, n >> 4);
pair[0] %= BASE;
pair[1] %= BASE;
int actual_nsums = (n >> 4) << 4;
done += actual_nsums;
}
/* Handle the tail elements. */
if (done < len) {
NEON_handle_tail(pair, (buf + done), len - done);
pair[0] %= BASE;
pair[1] %= BASE;
}
/* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
return (pair[1] << 16) | pair[0];
}
#endif

3rdparty/zlib-ng/arch/arm/arm_features.c
@@ -0,0 +1,100 @@
#include "../../zbuild.h"
#include "arm_features.h"
#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
# include <sys/auxv.h>
# ifdef ARM_ASM_HWCAP
# include <asm/hwcap.h>
# endif
#elif defined(__FreeBSD__) && defined(__aarch64__)
# include <machine/armreg.h>
# ifndef ID_AA64ISAR0_CRC32_VAL
# define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32
# endif
#elif defined(__APPLE__)
# if !defined(_DARWIN_C_SOURCE)
# define _DARWIN_C_SOURCE /* enable types aliases (eg u_int) */
# endif
# include <sys/sysctl.h>
#elif defined(_WIN32)
# include <windows.h>
#endif
static int arm_has_crc32() {
#if defined(__linux__) && defined(ARM_AUXV_HAS_CRC32)
# ifdef HWCAP_CRC32
return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0 ? 1 : 0;
# else
return (getauxval(AT_HWCAP2) & HWCAP2_CRC32) != 0 ? 1 : 0;
# endif
#elif defined(__FreeBSD__) && defined(__aarch64__)
return getenv("QEMU_EMULATING") == NULL
&& ID_AA64ISAR0_CRC32_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_CRC32_BASE;
#elif defined(__APPLE__)
int hascrc32;
size_t size = sizeof(hascrc32);
return sysctlbyname("hw.optional.armv8_crc32", &hascrc32, &size, NULL, 0) == 0
&& hascrc32 == 1;
#elif defined(_WIN32)
return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);
#elif defined(ARM_NOCHECK_ACLE)
return 1;
#else
return 0;
#endif
}
/* AArch64 has neon. */
#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
static inline int arm_has_neon() {
#if defined(__linux__) && defined(ARM_AUXV_HAS_NEON)
# ifdef HWCAP_ARM_NEON
return (getauxval(AT_HWCAP) & HWCAP_ARM_NEON) != 0 ? 1 : 0;
# else
return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0 ? 1 : 0;
# endif
#elif defined(__APPLE__)
int hasneon;
size_t size = sizeof(hasneon);
return sysctlbyname("hw.optional.neon", &hasneon, &size, NULL, 0) == 0
&& hasneon == 1;
#elif defined(_M_ARM) && defined(WINAPI_FAMILY_PARTITION)
# if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
return 1; /* Always supported */
# endif
#endif
#if defined(ARM_NOCHECK_NEON)
return 1;
#else
return 0;
#endif
}
#endif
/* AArch64 does not have ARMv6 SIMD. */
#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
static inline int arm_has_simd() {
#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
const char *platform = (const char *)getauxval(AT_PLATFORM);
return strncmp(platform, "v6l", 3) == 0
|| strncmp(platform, "v7l", 3) == 0
|| strncmp(platform, "v8l", 3) == 0;
#elif defined(ARM_NOCHECK_SIMD)
return 1;
#else
return 0;
#endif
}
#endif
void Z_INTERNAL arm_check_features(struct arm_cpu_features *features) {
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
features->has_simd = 0; /* never available */
features->has_neon = 1; /* always available */
#else
features->has_simd = arm_has_simd();
features->has_neon = arm_has_neon();
#endif
features->has_crc32 = arm_has_crc32();
}

@ -0,0 +1,16 @@
/* arm_features.h -- check for ARM features.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef ARM_H_
#define ARM_H_
struct arm_cpu_features {
int has_simd;
int has_neon;
int has_crc32;
};
void Z_INTERNAL arm_check_features(struct arm_cpu_features *features);
#endif /* ARM_H_ */
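As a rough illustration of how these flags might be consumed, a dispatcher could query them once and pick a kernel. This is a hypothetical sketch, not zlib-ng's real function table; the generic fallback name `adler32_c` is an assumption.

```c
/* Hypothetical dispatch sketch (illustrative only). */
#include "../../zbuild.h"
#include "arm_features.h"

typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len);

extern uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len); /* from adler32_neon.c */
extern uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);    /* assumed generic fallback */

static adler32_func select_adler32(void) {
    struct arm_cpu_features features;
    arm_check_features(&features);
    /* Prefer the NEON kernel when the CPU reports NEON support. */
    return features.has_neon ? adler32_neon : adler32_c;
}
```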

@ -0,0 +1,99 @@
/* chunkset_neon.c -- NEON inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef ARM_NEON
#include "neon_intrins.h"
#include "../../zbuild.h"
#include "../generic/chunk_permute_table.h"
typedef uint8x16_t chunk_t;
#define CHUNK_SIZE 16
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNK_MAG
static const lut_rem_pair perm_idx_lut[13] = {
{0, 1}, /* 3 */
{0, 0}, /* don't care */
{1 * 32, 1}, /* 5 */
{2 * 32, 4}, /* 6 */
{3 * 32, 2}, /* 7 */
{0 * 32, 0}, /* don't care */
{4 * 32, 7}, /* 9 */
{5 * 32, 6}, /* 10 */
{6 * 32, 5}, /* 11 */
{7 * 32, 4}, /* 12 */
{8 * 32, 3}, /* 13 */
{9 * 32, 2}, /* 14 */
{10 * 32, 1},/* 15 */
};
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
uint16_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = vreinterpretq_u8_u16(vdupq_n_u16(tmp));
}
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
uint32_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = vreinterpretq_u8_u32(vdupq_n_u32(tmp));
}
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
uint64_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = vreinterpretq_u8_u64(vdupq_n_u64(tmp));
}
#define CHUNKSIZE chunksize_neon
#define CHUNKCOPY chunkcopy_neon
#define CHUNKUNROLL chunkunroll_neon
#define CHUNKMEMSET chunkmemset_neon
#define CHUNKMEMSET_SAFE chunkmemset_safe_neon
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
*chunk = vld1q_u8(s);
}
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
vst1q_u8(out, *chunk);
}
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
*chunk_rem = lut_rem.remval;
/* See note in chunkset_ssse3.c for why this is ok */
__msan_unpoison(buf + dist, 16 - dist);
/* This variant of the table lookup is only available on AArch64 */
#if defined(_M_ARM64) || defined(_M_ARM64EC) || defined(__aarch64__)
uint8x16_t ret_vec = vld1q_u8(buf);
uint8x16_t perm_vec = vld1q_u8(permute_table + lut_rem.idx);
return vqtbl1q_u8(ret_vec, perm_vec);
#else
uint8x8_t ret0, ret1, a, b, perm_vec0, perm_vec1;
perm_vec0 = vld1_u8(permute_table + lut_rem.idx);
perm_vec1 = vld1_u8(permute_table + lut_rem.idx + 8);
a = vld1_u8(buf);
b = vld1_u8(buf + 8);
ret0 = vtbl1_u8(a, perm_vec0);
uint8x8x2_t ab = {{a, b}};
ret1 = vtbl2_u8(ab, perm_vec1);
return vcombine_u8(ret0, ret1);
#endif
}
#include "chunkset_tpl.h"
#define INFLATE_FAST inflate_fast_neon
#include "inffast_tpl.h"
#endif
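To make the permute-table machinery easier to follow, here is a scalar sketch of the result GET_CHUNK_MAG is expected to produce for 3 <= dist < 16, assuming the same semantics as the SSSE3 counterpart it mirrors (illustration only, not part of zlib-ng):

```c
#include <stdint.h>

/* Fill a 16-byte chunk by tiling the first `dist` bytes of buf, and report how
 * many pattern bytes are left over after the last complete repetition
 * (this matches the remval column of perm_idx_lut: 16 % dist). */
static void chunk_mag_scalar(const uint8_t *buf, uint8_t chunk[16], uint32_t dist, uint32_t *chunk_rem) {
    for (uint32_t i = 0; i < 16; ++i)
        chunk[i] = buf[i % dist];
    *chunk_rem = 16 % dist;
}
```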

@ -0,0 +1,59 @@
/* compare256_neon.c - NEON version of compare256
* Copyright (C) 2022 Nathan Moinvaziri
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "fallback_builtins.h"
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
#include "neon_intrins.h"
static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
do {
uint8x16_t a, b, cmp;
uint64_t lane;
a = vld1q_u8(src0);
b = vld1q_u8(src1);
cmp = veorq_u8(a, b);
lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 0);
if (lane) {
uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
return len + match_byte;
}
len += 8;
lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 1);
if (lane) {
uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
return len + match_byte;
}
len += 8;
src0 += 16, src1 += 16;
} while (len < 256);
return 256;
}
Z_INTERNAL uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1) {
return compare256_neon_static(src0, src1);
}
#define LONGEST_MATCH longest_match_neon
#define COMPARE256 compare256_neon_static
#include "match_tpl.h"
#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_neon
#define COMPARE256 compare256_neon_static
#include "match_tpl.h"
#endif
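The NEON routine above should return exactly what a naive byte loop returns; a scalar reference, useful only for testing, is sketched below (not part of zlib-ng):

```c
#include <stdint.h>

/* Count how many of the first 256 bytes of src0 and src1 are equal. */
static uint32_t compare256_scalar_ref(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;
    while (len < 256 && src0[len] == src1[len])
        ++len;
    return len;
}
```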

@ -0,0 +1,78 @@
/* crc32_acle.c -- compute the CRC-32 of a data stream
* Copyright (C) 1995-2006, 2010, 2011, 2012 Mark Adler
* Copyright (C) 2016 Yang Zhang
* For conditions of distribution and use, see copyright notice in zlib.h
*
*/
#ifdef ARM_ACLE
#include "acle_intrins.h"
#include "../../zbuild.h"
Z_INTERNAL Z_TARGET_CRC uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len) {
Z_REGISTER uint32_t c;
Z_REGISTER const uint16_t *buf2;
Z_REGISTER const uint32_t *buf4;
Z_REGISTER const uint64_t *buf8;
c = ~crc;
if (UNLIKELY(len == 1)) {
c = __crc32b(c, *buf);
c = ~c;
return c;
}
if ((ptrdiff_t)buf & (sizeof(uint64_t) - 1)) {
if (len && ((ptrdiff_t)buf & 1)) {
c = __crc32b(c, *buf++);
len--;
}
if ((len >= sizeof(uint16_t)) && ((ptrdiff_t)buf & sizeof(uint16_t))) {
buf2 = (const uint16_t *) buf;
c = __crc32h(c, *buf2++);
len -= sizeof(uint16_t);
buf4 = (const uint32_t *) buf2;
} else {
buf4 = (const uint32_t *) buf;
}
if ((len >= sizeof(uint32_t)) && ((ptrdiff_t)buf & sizeof(uint32_t))) {
c = __crc32w(c, *buf4++);
len -= sizeof(uint32_t);
}
buf8 = (const uint64_t *) buf4;
} else {
buf8 = (const uint64_t *) buf;
}
while (len >= sizeof(uint64_t)) {
c = __crc32d(c, *buf8++);
len -= sizeof(uint64_t);
}
if (len >= sizeof(uint32_t)) {
buf4 = (const uint32_t *) buf8;
c = __crc32w(c, *buf4++);
len -= sizeof(uint32_t);
buf2 = (const uint16_t *) buf4;
} else {
buf2 = (const uint16_t *) buf8;
}
if (len >= sizeof(uint16_t)) {
c = __crc32h(c, *buf2++);
len -= sizeof(uint16_t);
}
buf = (const unsigned char *) buf2;
if (len) {
c = __crc32b(c, *buf);
}
c = ~c;
return c;
}
#endif
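The ACLE instructions used above (__crc32b/h/w/d) compute the standard zlib CRC-32 over the reflected polynomial 0xEDB88320. A slow bitwise reference, handy for validating crc32_acle() on arbitrary buffers, is sketched below (not part of zlib-ng):

```c
#include <stdint.h>
#include <stddef.h>

/* Bit-at-a-time CRC-32 over the reflected polynomial 0xEDB88320. */
static uint32_t crc32_bitwise_ref(uint32_t crc, const uint8_t *buf, size_t len) {
    crc = ~crc;
    while (len--) {
        crc ^= *buf++;
        for (int k = 0; k < 8; ++k)
            crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1u)));
    }
    return ~crc;
}
```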

@ -0,0 +1,24 @@
/* insert_string_acle.c -- insert_string integer hash variant using ACLE's CRC instructions
*
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*
*/
#ifdef ARM_ACLE
#include "acle_intrins.h"
#include "../../zbuild.h"
#include "../../deflate.h"
#define HASH_CALC(s, h, val) \
h = __crc32w(0, val)
#define HASH_CALC_VAR h
#define HASH_CALC_VAR_INIT uint32_t h = 0
#define UPDATE_HASH Z_TARGET_CRC update_hash_acle
#define INSERT_STRING Z_TARGET_CRC insert_string_acle
#define QUICK_INSERT_STRING Z_TARGET_CRC quick_insert_string_acle
#include "../../insert_string_tpl.h"
#endif

@ -0,0 +1,58 @@
#ifndef ARM_NEON_INTRINS_H
#define ARM_NEON_INTRINS_H
#if defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC))
/* arm64_neon.h is MSVC specific */
# include <arm64_neon.h>
#else
# include <arm_neon.h>
#endif
#if defined(ARM_NEON) && !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
/* Compatibility shim for the _high family of functions */
#define vmull_high_u8(a, b) vmull_u8(vget_high_u8(a), vget_high_u8(b))
#define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c))
#define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c))
#define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b))
#endif
#ifdef ARM_NEON
#define vqsubq_u16_x4_x1(out, a, b) do { \
out.val[0] = vqsubq_u16(a.val[0], b); \
out.val[1] = vqsubq_u16(a.val[1], b); \
out.val[2] = vqsubq_u16(a.val[2], b); \
out.val[3] = vqsubq_u16(a.val[3], b); \
} while (0)
# ifndef ARM_NEON_HASLD4
static inline uint16x8x4_t vld1q_u16_x4(uint16_t const *a) {
uint16x8x4_t ret = (uint16x8x4_t) {{
vld1q_u16(a),
vld1q_u16(a+8),
vld1q_u16(a+16),
vld1q_u16(a+24)}};
return ret;
}
static inline uint8x16x4_t vld1q_u8_x4(uint8_t const *a) {
uint8x16x4_t ret = (uint8x16x4_t) {{
vld1q_u8(a),
vld1q_u8(a+16),
vld1q_u8(a+32),
vld1q_u8(a+48)}};
return ret;
}
static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) {
vst1q_u16(p, a.val[0]);
vst1q_u16(p + 8, a.val[1]);
vst1q_u16(p + 16, a.val[2]);
vst1q_u16(p + 24, a.val[3]);
}
# endif // HASLD4 check
#endif
#endif // include guard ARM_NEON_INTRINS_H

@ -0,0 +1,47 @@
/* slide_hash_armv6.c -- Optimized hash table shifting for ARMv6 with support for SIMD instructions
* Copyright (C) 2023 Cameron Cawley
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#if defined(ARM_SIMD)
#include "acle_intrins.h"
#include "../../zbuild.h"
#include "../../deflate.h"
/* SIMD version of hash_chain rebase */
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
Z_REGISTER uint16x2_t v;
uint16x2_t p0, p1, p2, p3;
Z_REGISTER size_t n;
size_t size = entries*sizeof(table[0]);
Assert((size % (sizeof(uint16x2_t) * 4) == 0), "hash table size err");
Assert(sizeof(Pos) == 2, "Wrong Pos size");
v = wsize | (wsize << 16);
n = size / (sizeof(uint16x2_t) * 4);
do {
p0 = *((const uint16x2_t *)(table));
p1 = *((const uint16x2_t *)(table+2));
p2 = *((const uint16x2_t *)(table+4));
p3 = *((const uint16x2_t *)(table+6));
p0 = __uqsub16(p0, v);
p1 = __uqsub16(p1, v);
p2 = __uqsub16(p2, v);
p3 = __uqsub16(p3, v);
*((uint16x2_t *)(table)) = p0;
*((uint16x2_t *)(table+2)) = p1;
*((uint16x2_t *)(table+4)) = p2;
*((uint16x2_t *)(table+6)) = p3;
table += 8;
} while (--n);
}
Z_INTERNAL void slide_hash_armv6(deflate_state *s) {
unsigned int wsize = s->w_size;
slide_hash_chain(s->head, HASH_SIZE, wsize);
slide_hash_chain(s->prev, wsize, wsize);
}
#endif

@ -0,0 +1,46 @@
/* slide_hash_neon.c -- Optimized hash table shifting for ARM with support for NEON instructions
* Copyright (C) 2017-2020 Mika T. Lindqvist
*
* Authors:
* Mika T. Lindqvist <postmaster@raasu.org>
* Jun He <jun.he@arm.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef ARM_NEON
#include "neon_intrins.h"
#include "../../zbuild.h"
#include "../../deflate.h"
/* SIMD version of hash_chain rebase */
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
Z_REGISTER uint16x8_t v;
uint16x8x4_t p0, p1;
Z_REGISTER size_t n;
size_t size = entries*sizeof(table[0]);
Assert((size % (sizeof(uint16x8_t) * 8) == 0), "hash table size err");
Assert(sizeof(Pos) == 2, "Wrong Pos size");
v = vdupq_n_u16(wsize);
n = size / (sizeof(uint16x8_t) * 8);
do {
p0 = vld1q_u16_x4(table);
p1 = vld1q_u16_x4(table+32);
vqsubq_u16_x4_x1(p0, p0, v);
vqsubq_u16_x4_x1(p1, p1, v);
vst1q_u16_x4(table, p0);
vst1q_u16_x4(table+32, p1);
table += 64;
} while (--n);
}
Z_INTERNAL void slide_hash_neon(deflate_state *s) {
unsigned int wsize = s->w_size;
slide_hash_chain(s->head, HASH_SIZE, wsize);
slide_hash_chain(s->prev, wsize, wsize);
}
#endif
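Both the ARMv6 and NEON variants perform the same per-entry operation as the scalar slide: subtract the window size with saturation at zero. A plain-C sketch of that operation, assuming Pos is a 16-bit index as the Assert above checks:

```c
#include <stdint.h>

/* Scalar equivalent of the saturating rebase done by slide_hash_neon/slide_hash_armv6. */
static void slide_hash_chain_scalar(uint16_t *table, uint32_t entries, uint16_t wsize) {
    for (uint32_t i = 0; i < entries; ++i)
        table[i] = (table[i] >= wsize) ? (uint16_t)(table[i] - wsize) : 0;
}
```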

@ -0,0 +1,24 @@
# Makefile for zlib
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
# For conditions of distribution and use, see copyright notice in zlib.h
CC=
CFLAGS=
SFLAGS=
INCLUDES=
SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)
all:
mostlyclean: clean
clean:
rm -f *.o *.lo *~
rm -rf objs
rm -f *.gcda *.gcno *.gcov
distclean: clean
rm -f Makefile

@ -0,0 +1,53 @@
/* chunk_permute_table.h - shared AVX/SSSE3 permutation table for use with chunkmemset family of functions.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef CHUNK_PERMUTE_TABLE_H_
#define CHUNK_PERMUTE_TABLE_H_
#include "zbuild.h"
/* Need entries for all numbers not an even modulus for 1, 2, 4, 8, 16 & 32 */
static const ALIGNED_(32) uint8_t permute_table[26*32] = {
0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, /* dist 3 */
0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, /* dist 5 */
0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, /* dist 6 */
0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, /* dist 7 */
0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, /* dist 9 */
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, /* dist 10 */
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 11 */
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 12 */
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, /* dist 13 */
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, /* dist 14 */
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, /* dist 15 */
/* Beyond dists of 15 means we have to permute from a vector > len(m128i). Because AVX couldn't permute
* beyond 128 bit lanes until AVX512 for sub 4-byte sequences, we have to do some math here for an eventual
* blend with a comparison. That means we need to wrap the indices with yet another derived table. For simplicity,
* we'll use absolute indexing here to derive a blend vector. This is actually a lot simpler with ARM's TBL, but,
* this is what we're dealt.
*/
16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* dist 17 */
16, 17, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, /* dist 18 */
16, 17, 18, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, /* dist 19 */
16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, /* dist 20 */
16, 17, 18, 19, 20, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, /* dist 21 */
16, 17, 18, 19, 20, 21, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 22 */
16, 17, 18, 19, 20, 21, 22, 0, 1, 2, 3, 4, 5, 6, 7, 8, /* dist 23 */
16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 24 */
16, 17, 18, 19, 20, 21, 22, 23, 24, 0, 1, 2, 3, 4, 5, 6, /* dist 25 */
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 1, 2, 3, 4, 5, /* dist 26 */
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 1, 2, 3, 4, /* dist 27 */
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3, /* dist 28 */
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 0, 1, 2, /* dist 29 */
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 1, /* dist 30 */
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, /* dist 31 */
};
typedef struct lut_rem_pair_s {
uint16_t idx;
uint16_t remval;
} lut_rem_pair;
#endif
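For distances below 16, every row of permute_table is simply i % dist for i = 0..31; the rows for dist >= 17 instead use absolute indices (16, 17, ...) for the blend trick described in the comment above. A tiny sketch that reproduces the small-distance rows:

```c
#include <stdint.h>

/* Rebuild one small-distance (3 <= dist <= 15) row of permute_table. */
static void build_permute_row(uint8_t row[32], unsigned dist) {
    for (unsigned i = 0; i < 32; ++i)
        row[i] = (uint8_t)(i % dist);
}
```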

@ -0,0 +1,93 @@
# Makefile for POWER-specific files
# Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
# Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
# For conditions of distribution and use, see copyright notice in zlib.h
CC=
CFLAGS=
SFLAGS=
INCLUDES=
SUFFIX=
P8FLAGS=-mcpu=power8
P9FLAGS=-mcpu=power9
PPCFLAGS=-maltivec
NOLTOFLAG=
SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)
all: power_features.o \
power_features.lo \
adler32_power8.o \
adler32_power8.lo \
adler32_vmx.o \
adler32_vmx.lo \
chunkset_power8.o \
chunkset_power8.lo \
compare256_power9.o \
compare256_power9.lo \
crc32_power8.o \
crc32_power8.lo \
slide_hash_power8.o \
slide_hash_power8.lo \
slide_hash_vmx.o \
slide_hash_vmx.lo
power_features.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c
power_features.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c
adler32_power8.o:
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
adler32_power8.lo:
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
adler32_vmx.o:
$(CC) $(CFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
adler32_vmx.lo:
$(CC) $(SFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
chunkset_power8.o:
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
chunkset_power8.lo:
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
compare256_power9.o:
$(CC) $(CFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
compare256_power9.lo:
$(CC) $(SFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
crc32_power8.o:
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
crc32_power8.lo:
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
slide_hash_power8.o:
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
slide_hash_power8.lo:
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
slide_hash_vmx.o:
$(CC) $(CFLAGS) ${PPCFLAGS} $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
slide_hash_vmx.lo:
$(CC) $(SFLAGS) ${PPCFLAGS} $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
mostlyclean: clean
clean:
rm -f *.o *.lo *~
rm -rf objs
rm -f *.gcda *.gcno *.gcov
distclean: clean
rm -f Makefile

@ -0,0 +1,153 @@
/* Adler32 for POWER8 using VSX instructions.
* Copyright (C) 2020 IBM Corporation
* Author: Rogerio Alves <rcardoso@linux.ibm.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*
* Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector)
* instructions.
*
* If Adler-32 is computed one byte at a time, let s1_N be the value of s1
* after iteration N (s1_0 is the initial value, which is 1 unless a
* different starting adler is supplied). Then s1_1 = s1_0 + c[1] after the
* first byte, s1_2 = s1_1 + c[2], and so on. Hence, for iteration N,
* s1_N = s1_(N-1) + c[N] is the value of s1 after iteration N.
*
* Unrolling the same recurrence for s2 gives
* s2_N = s2_0 + N*s1_0 + N*c[1] + (N-1)*c[2] + ... + 1*c[N]
*
* In a more general way:
*
* s1_N = s1_0 + sum(i=1 to N) c[i]
* s2_N = s2_0 + N*s1_0 + sum(i=1 to N) (N-i+1)*c[i]
*
* Where s1_N, s2_N are the values of s1, s2 after N iterations. So if we
* can process N bytes at a time, we can do all of this at once.
*
* Since VSX supports 16-byte vector operations, we can process 16 bytes
* at a time; with N = 16 we have:
*
* s1 = s1_16 = s1_15 + c[16] = s1_0 + sum(i=1 to 16) c[i]
* s2 = s2_16 = s2_0 + 16*s1_0 + sum(i=1 to 16) (16-i+1)*c[i]
*
* After the first iteration we calculate the adler32 checksum for 16 bytes.
*
* For more background about adler32 please check the RFC:
* https://www.ietf.org/rfc/rfc1950.txt
*/
#ifdef POWER8_VSX
#include <altivec.h>
#include "zbuild.h"
#include "adler32_p.h"
/* Vector across sum unsigned int (saturate). */
static inline vector unsigned int vec_sumsu(vector unsigned int __a, vector unsigned int __b) {
__b = vec_sld(__a, __a, 8);
__b = vec_add(__b, __a);
__a = vec_sld(__b, __b, 4);
__a = vec_add(__a, __b);
return __a;
}
Z_INTERNAL uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) {
uint32_t s1 = adler & 0xffff;
uint32_t s2 = (adler >> 16) & 0xffff;
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
return adler32_len_1(s1, buf, s2);
/* If buffer is empty or len=0 we need to return adler initial value. */
if (UNLIKELY(buf == NULL))
return 1;
/* This is faster than VSX code for len < 64. */
if (len < 64)
return adler32_len_64(s1, buf, len, s2);
/* Use POWER VSX instructions for len >= 64. */
const vector unsigned int v_zeros = { 0 };
const vector unsigned char v_mul = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
6, 5, 4, 3, 2, 1};
const vector unsigned char vsh = vec_splat_u8(4);
const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0};
vector unsigned int vs1 = { 0 };
vector unsigned int vs2 = { 0 };
vector unsigned int vs1_save = { 0 };
vector unsigned int vsum1, vsum2;
vector unsigned char vbuf;
int n;
vs1[0] = s1;
vs2[0] = s2;
/* Do length bigger than NMAX in blocks of NMAX size. */
while (len >= NMAX) {
len -= NMAX;
n = NMAX / 16;
do {
vbuf = vec_xl(0, (unsigned char *) buf);
vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
/* sum(i=1 to 16) buf[i]*(16-i+1). */
vsum2 = vec_msum(vbuf, v_mul, v_zeros);
/* Save vs1. */
vs1_save = vec_add(vs1_save, vs1);
/* Accumulate the sums. */
vs1 = vec_add(vsum1, vs1);
vs2 = vec_add(vsum2, vs2);
buf += 16;
} while (--n);
/* Once each block of NMAX size. */
vs1 = vec_sumsu(vs1, vsum1);
vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
vs2 = vec_add(vs1_save, vs2);
vs2 = vec_sumsu(vs2, vsum2);
/* vs1[0] = (s1_i + sum(i=1 to 16)buf[i]) mod 65521. */
vs1[0] = vs1[0] % BASE;
/* vs2[0] = s2_i + 16*s1_save +
sum(i=1 to 16)(16-i+1)*buf[i] mod 65521. */
vs2[0] = vs2[0] % BASE;
vs1 = vec_and(vs1, vmask);
vs2 = vec_and(vs2, vmask);
vs1_save = v_zeros;
}
/* len is less than NMAX, so only one modulo is needed. */
if (len >= 16) {
while (len >= 16) {
len -= 16;
vbuf = vec_xl(0, (unsigned char *) buf);
vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
/* sum(i=1 to 16) buf[i]*(16-i+1). */
vsum2 = vec_msum(vbuf, v_mul, v_zeros);
/* Save vs1. */
vs1_save = vec_add(vs1_save, vs1);
/* Accumulate the sums. */
vs1 = vec_add(vsum1, vs1);
vs2 = vec_add(vsum2, vs2);
buf += 16;
}
/* Since the size will be always less than NMAX we do this once. */
vs1 = vec_sumsu(vs1, vsum1);
vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
vs2 = vec_add(vs1_save, vs2);
vs2 = vec_sumsu(vs2, vsum2);
}
/* Copy result back to s1, s2 (mod 65521). */
s1 = vs1[0] % BASE;
s2 = vs2[0] % BASE;
/* Process tail (len < 16). */
return adler32_len_16(s1, buf, len, s2);
}
#endif /* POWER8_VSX */

@ -0,0 +1,186 @@
/* adler32_vmx.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011 Mark Adler
* Copyright (C) 2017-2023 Mika T. Lindqvist <postmaster@raasu.org>
* Copyright (C) 2021 Adam Stylinski <kungfujesus06@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef PPC_VMX
#include <altivec.h>
#include "zbuild.h"
#include "zendian.h"
#include "adler32_p.h"
#define vmx_zero() (vec_splat_u32(0))
static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
unsigned int i;
for (i = 0; i < len; ++i) {
pair[0] += buf[i];
pair[1] += pair[0];
}
}
static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
/* Different taps for the separable components of sums */
const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
const vector unsigned char t2 = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17};
const vector unsigned char t3 = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
/* As silly and inefficient as it seems, creating 1 permutation vector to permute
* a 2 element vector from a single load + a subsequent shift is just barely faster
* than doing 2 indexed insertions into zero initialized vectors from unaligned memory. */
const vector unsigned char s0_perm = {0, 1, 2, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
const vector unsigned char shift_vec = vec_sl(vec_splat_u8(8), vec_splat_u8(2));
vector unsigned int adacc, s2acc;
vector unsigned int pair_vec = vec_ld(0, s);
adacc = vec_perm(pair_vec, pair_vec, s0_perm);
#if BYTE_ORDER == LITTLE_ENDIAN
s2acc = vec_sro(pair_vec, shift_vec);
#else
s2acc = vec_slo(pair_vec, shift_vec);
#endif
vector unsigned int zero = vmx_zero();
vector unsigned int s3acc = zero;
vector unsigned int s3acc_0 = zero;
vector unsigned int adacc_prev = adacc;
vector unsigned int adacc_prev_0 = zero;
vector unsigned int s2acc_0 = zero;
vector unsigned int s2acc_1 = zero;
vector unsigned int s2acc_2 = zero;
/* Maintain a running sum of a second half; this might help us break yet another
* data dependency bubble in the sum */
vector unsigned int adacc_0 = zero;
int num_iter = len / 4;
int rem = len & 3;
for (int i = 0; i < num_iter; ++i) {
vector unsigned char d0 = vec_ld(0, buf);
vector unsigned char d1 = vec_ld(16, buf);
vector unsigned char d2 = vec_ld(32, buf);
vector unsigned char d3 = vec_ld(48, buf);
/* The core operation of the loop, basically
* what is being unrolled below */
adacc = vec_sum4s(d0, adacc);
s3acc = vec_add(s3acc, adacc_prev);
s3acc_0 = vec_add(s3acc_0, adacc_prev_0);
s2acc = vec_msum(t0, d0, s2acc);
/* interleave dependent sums in here */
adacc_0 = vec_sum4s(d1, adacc_0);
s2acc_0 = vec_msum(t1, d1, s2acc_0);
adacc = vec_sum4s(d2, adacc);
s2acc_1 = vec_msum(t2, d2, s2acc_1);
s2acc_2 = vec_msum(t3, d3, s2acc_2);
adacc_0 = vec_sum4s(d3, adacc_0);
adacc_prev = adacc;
adacc_prev_0 = adacc_0;
buf += 64;
}
adacc = vec_add(adacc, adacc_0);
s3acc = vec_add(s3acc, s3acc_0);
s3acc = vec_sl(s3acc, vec_splat_u32(6));
if (rem) {
adacc_prev = vec_add(adacc_prev_0, adacc_prev);
adacc_prev = vec_sl(adacc_prev, vec_splat_u32(4));
while (rem--) {
vector unsigned char d0 = vec_ld(0, buf);
adacc = vec_sum4s(d0, adacc);
s3acc = vec_add(s3acc, adacc_prev);
s2acc = vec_msum(t3, d0, s2acc);
adacc_prev = vec_sl(adacc, vec_splat_u32(4));
buf += 16;
}
}
/* Sum up independent second sums */
s2acc = vec_add(s2acc, s2acc_0);
s2acc_2 = vec_add(s2acc_1, s2acc_2);
s2acc = vec_add(s2acc, s2acc_2);
s2acc = vec_add(s2acc, s3acc);
adacc = vec_add(adacc, vec_sld(adacc, adacc, 8));
s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 8));
adacc = vec_add(adacc, vec_sld(adacc, adacc, 4));
s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 4));
vec_ste(adacc, 0, s);
vec_ste(s2acc, 0, s+1);
}
Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) {
uint32_t sum2;
uint32_t pair[16] ALIGNED_(16);
memset(&pair[2], 0, 14);
int n = NMAX;
unsigned int done = 0, i;
/* Split Adler-32 into its component sums; the starting value can be
* supplied by the caller (e.g. when resuming a checksum from a PNG file).
*/
sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;
pair[0] = adler;
pair[1] = sum2;
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
return adler32_len_1(adler, buf, sum2);
/* initial Adler-32 value (deferred check for len == 1 speed) */
if (UNLIKELY(buf == NULL))
return 1L;
/* in case short lengths are provided, keep it somewhat fast */
if (UNLIKELY(len < 16))
return adler32_len_16(adler, buf, len, sum2);
// Align buffer
unsigned int al = 0;
if ((uintptr_t)buf & 0xf) {
al = 16-((uintptr_t)buf & 0xf);
if (al > len) {
al=len;
}
vmx_handle_head_or_tail(pair, buf, al);
done += al;
/* Rather than rebasing, we can reduce the max sums for the
* first round only */
n -= al;
}
for (i = al; i < len; i += n) {
int remaining = (int)(len-i);
n = MIN(remaining, (i == al) ? n : NMAX);
if (n < 16)
break;
vmx_accum32(pair, buf + i, n / 16);
pair[0] %= BASE;
pair[1] %= BASE;
done += (n / 16) * 16;
}
/* Handle the tail elements. */
if (done < len) {
vmx_handle_head_or_tail(pair, (buf + done), len - done);
pair[0] %= BASE;
pair[1] %= BASE;
}
/* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
return (pair[1] << 16) | pair[0];
}
#endif

@ -0,0 +1,55 @@
/* chunkset_power8.c -- VSX inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef POWER8_VSX
#include <altivec.h>
#include "../../zbuild.h"
typedef vector unsigned char chunk_t;
#define CHUNK_SIZE 16
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
uint16_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = (vector unsigned char)vec_splats(tmp);
}
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
uint32_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = (vector unsigned char)vec_splats(tmp);
}
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
uint64_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = (vector unsigned char)vec_splats((unsigned long long)tmp);
}
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
*chunk = vec_xl(0, s);
}
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
vec_xst(*chunk, 0, out);
}
#define CHUNKSIZE chunksize_power8
#define CHUNKCOPY chunkcopy_power8
#define CHUNKUNROLL chunkunroll_power8
#define CHUNKMEMSET chunkmemset_power8
#define CHUNKMEMSET_SAFE chunkmemset_safe_power8
#include "chunkset_tpl.h"
#define INFLATE_FAST inflate_fast_power8
#include "inffast_tpl.h"
#endif

@ -0,0 +1,64 @@
/* compare256_power9.c - Power9 version of compare256
* Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef POWER9
#include <altivec.h>
#include "../../zbuild.h"
#include "../../zendian.h"
/* Older versions of GCC misimplemented semantics for these bit counting builtins.
* https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac */
#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 12)
#if BYTE_ORDER == LITTLE_ENDIAN
# define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vctzlsbb(vc)
#else
# define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vclzlsbb(vc)
#endif
#else
# define zng_vec_vctzlsbb(vc, len) len = vec_cntlz_lsbb(vc)
#endif
static inline uint32_t compare256_power9_static(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0, cmplen;
do {
vector unsigned char vsrc0, vsrc1, vc;
vsrc0 = *((vector unsigned char *)src0);
vsrc1 = *((vector unsigned char *)src1);
/* Compare 16 bytes at a time. Each byte of vc will be either
* all ones or all zeroes, depending on the result of the comparison. */
vc = (vector unsigned char)vec_cmpne(vsrc0, vsrc1);
/* Since the index of matching bytes will contain only zeroes
* on vc (since we used cmpne), counting the number of consecutive
* bytes where LSB == 0 is the same as counting the length of the match. */
zng_vec_vctzlsbb(vc, cmplen);
if (cmplen != 16)
return len + cmplen;
src0 += 16, src1 += 16, len += 16;
} while (len < 256);
return 256;
}
Z_INTERNAL uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1) {
return compare256_power9_static(src0, src1);
}
#define LONGEST_MATCH longest_match_power9
#define COMPARE256 compare256_power9_static
#include "match_tpl.h"
#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_power9
#define COMPARE256 compare256_power9_static
#include "match_tpl.h"
#endif

File diff suppressed because it is too large

@ -0,0 +1,589 @@
/* crc32 for POWER8 using VSX instructions
* Copyright (C) 2021 IBM Corporation
*
* Author: Rogerio Alves <rogealve@br.ibm.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*
* Calculate the checksum of data that is 16 byte aligned and a multiple of
* 16 bytes.
*
* The first step is to reduce it to 1024 bits. We do this in 8 parallel
* chunks in order to mask the latency of the vpmsum instructions. If we
* have more than 32 kB of data to checksum we repeat this step multiple
* times, passing in the previous 1024 bits.
*
* The next step is to reduce the 1024 bits to 64 bits. This step adds
* 32 bits of 0s to the end - this matches what a CRC does. We just
* calculate constants that land the data in this 32 bits.
*
* We then use fixed point Barrett reduction to compute a mod n over GF(2)
* for n = CRC using POWER8 instructions. We use x = 32.
*
* http://en.wikipedia.org/wiki/Barrett_reduction
*
* This code uses gcc vector builtins instead of using assembly directly.
*/
#include <altivec.h>
#include "zendian.h"
#include "zbuild.h"
#include "crc32_constants.h"
#include "crc32_braid_tbl.h"
#if defined (__clang__)
#include "fallback_builtins.h"
#endif
#define MAX_SIZE 32768
#define VMX_ALIGN 16
#define VMX_ALIGN_MASK (VMX_ALIGN-1)
static unsigned int crc32_align(unsigned int crc, const unsigned char *p, unsigned long len) {
while (len--)
crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8);
return crc;
}
static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len);
Z_INTERNAL uint32_t crc32_power8(uint32_t crc, const unsigned char *p, size_t _len) {
unsigned int prealign;
unsigned int tail;
unsigned long len = (unsigned long) _len;
if (p == (const unsigned char *) 0x0)
return 0;
crc ^= 0xffffffff;
if (len < VMX_ALIGN + VMX_ALIGN_MASK) {
crc = crc32_align(crc, p, len);
goto out;
}
if ((unsigned long)p & VMX_ALIGN_MASK) {
prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
crc = crc32_align(crc, p, prealign);
len -= prealign;
p += prealign;
}
crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
tail = len & VMX_ALIGN_MASK;
if (tail) {
p += len & ~VMX_ALIGN_MASK;
crc = crc32_align(crc, p, tail);
}
out:
crc ^= 0xffffffff;
return crc;
}
/* When we have a load-store pair in a single-dispatch group whose addresses overlap
* such that forwarding is not allowed (load-hit-store), the group must be flushed.
* A group ending NOP prevents the flush.
*/
#define GROUP_ENDING_NOP __asm__("ori 2,2,0" ::: "memory")
#if BYTE_ORDER == BIG_ENDIAN
#define BYTESWAP_DATA
#endif
#ifdef BYTESWAP_DATA
#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb, (__vector unsigned char) vc)
#if BYTE_ORDER == LITTLE_ENDIAN
/* Byte reverse permute constant LE. */
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x08090A0B0C0D0E0FUL, 0x0001020304050607UL };
#else
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x0F0E0D0C0B0A0908UL, 0X0706050403020100UL };
#endif
#else
#define VEC_PERM(vr, va, vb, vc)
#endif
static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) {
const __vector unsigned long long vzero = {0,0};
const __vector unsigned long long vones = {0xffffffffffffffffUL, 0xffffffffffffffffUL};
const __vector unsigned long long vmask_32bit =
(__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 4);
const __vector unsigned long long vmask_64bit =
(__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 8);
__vector unsigned long long vcrc;
__vector unsigned long long vconst1, vconst2;
/* vdata0-vdata7 will contain our data (p). */
__vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, vdata5, vdata6, vdata7;
/* v0-v7 will contain our checksums */
__vector unsigned long long v0 = {0,0};
__vector unsigned long long v1 = {0,0};
__vector unsigned long long v2 = {0,0};
__vector unsigned long long v3 = {0,0};
__vector unsigned long long v4 = {0,0};
__vector unsigned long long v5 = {0,0};
__vector unsigned long long v6 = {0,0};
__vector unsigned long long v7 = {0,0};
/* Vector auxiliary variables. */
__vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7;
unsigned int offset; /* Constant table offset. */
unsigned long i; /* Counter. */
unsigned long chunks;
unsigned long block_size;
int next_block = 0;
/* Align by 128 bits. The last 128 bit block will be processed at end. */
unsigned long length = len & 0xFFFFFFFFFFFFFF80UL;
vcrc = (__vector unsigned long long)__builtin_pack_vector_int128(0UL, crc);
/* Short version. */
if (len < 256) {
/* Calculate where in the constant table we need to start. */
offset = 256 - len;
vconst1 = vec_ld(offset, vcrc_short_const);
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
/* xor initial value */
vdata0 = vec_xor(vdata0, vcrc);
vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
(__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
v0 = vec_xor(v0, vdata0);
for (i = 16; i < len; i += 16) {
vconst1 = vec_ld(offset + i, vcrc_short_const);
vdata0 = vec_ld(i, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
(__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
v0 = vec_xor(v0, vdata0);
}
} else {
/* Load initial values. */
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
vdata1 = vec_ld(16, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
vdata2 = vec_ld(32, (__vector unsigned long long*) p);
vdata3 = vec_ld(48, (__vector unsigned long long*) p);
VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
vdata4 = vec_ld(64, (__vector unsigned long long*) p);
vdata5 = vec_ld(80, (__vector unsigned long long*) p);
VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
vdata6 = vec_ld(96, (__vector unsigned long long*) p);
vdata7 = vec_ld(112, (__vector unsigned long long*) p);
VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
/* xor in initial value */
vdata0 = vec_xor(vdata0, vcrc);
p = (char *)p + 128;
do {
/* Checksum in blocks of MAX_SIZE. */
block_size = length;
if (block_size > MAX_SIZE) {
block_size = MAX_SIZE;
}
length = length - block_size;
/*
* Work out the offset into the constants table to start at. Each
* constant is 16 bytes, and it is used against 128 bytes of input
* data - 128 / 16 = 8
*/
offset = (MAX_SIZE/8) - (block_size/8);
/* We reduce our final 128 bytes in a separate step */
chunks = (block_size/128)-1;
vconst1 = vec_ld(offset, vcrc_const);
va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
(__vector unsigned long long)vconst1);
va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
(__vector unsigned long long)vconst1);
va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
(__vector unsigned long long)vconst1);
va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
(__vector unsigned long long)vconst1);
va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
(__vector unsigned long long)vconst1);
va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
(__vector unsigned long long)vconst1);
va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
(__vector unsigned long long)vconst1);
va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
(__vector unsigned long long)vconst1);
if (chunks > 1) {
offset += 16;
vconst2 = vec_ld(offset, vcrc_const);
GROUP_ENDING_NOP;
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
vdata1 = vec_ld(16, (__vector unsigned long long*) p);
VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
vdata2 = vec_ld(32, (__vector unsigned long long*) p);
VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
vdata3 = vec_ld(48, (__vector unsigned long long*) p);
VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
vdata4 = vec_ld(64, (__vector unsigned long long*) p);
VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
vdata5 = vec_ld(80, (__vector unsigned long long*) p);
VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
vdata6 = vec_ld(96, (__vector unsigned long long*) p);
VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
vdata7 = vec_ld(112, (__vector unsigned long long*) p);
VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
p = (char *)p + 128;
/*
* main loop. Each iteration calculates the CRC for a 128-byte
* block.
*/
for (i = 0; i < chunks-2; i++) {
vconst1 = vec_ld(offset, vcrc_const);
offset += 16;
GROUP_ENDING_NOP;
v0 = vec_xor(v0, va0);
va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
(__vector unsigned long long)vconst2);
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
GROUP_ENDING_NOP;
v1 = vec_xor(v1, va1);
va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
(__vector unsigned long long)vconst2);
vdata1 = vec_ld(16, (__vector unsigned long long*) p);
VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
GROUP_ENDING_NOP;
v2 = vec_xor(v2, va2);
va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)
vdata2, (__vector unsigned long long)vconst2);
vdata2 = vec_ld(32, (__vector unsigned long long*) p);
VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
GROUP_ENDING_NOP;
v3 = vec_xor(v3, va3);
va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
(__vector unsigned long long)vconst2);
vdata3 = vec_ld(48, (__vector unsigned long long*) p);
VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
vconst2 = vec_ld(offset, vcrc_const);
GROUP_ENDING_NOP;
v4 = vec_xor(v4, va4);
va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
(__vector unsigned long long)vconst1);
vdata4 = vec_ld(64, (__vector unsigned long long*) p);
VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
GROUP_ENDING_NOP;
v5 = vec_xor(v5, va5);
va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
(__vector unsigned long long)vconst1);
vdata5 = vec_ld(80, (__vector unsigned long long*) p);
VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
GROUP_ENDING_NOP;
v6 = vec_xor(v6, va6);
va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
(__vector unsigned long long)vconst1);
vdata6 = vec_ld(96, (__vector unsigned long long*) p);
VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
GROUP_ENDING_NOP;
v7 = vec_xor(v7, va7);
va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
(__vector unsigned long long)vconst1);
vdata7 = vec_ld(112, (__vector unsigned long long*) p);
VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
p = (char *)p + 128;
}
/* First cool down */
vconst1 = vec_ld(offset, vcrc_const);
offset += 16;
v0 = vec_xor(v0, va0);
va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v1 = vec_xor(v1, va1);
va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v2 = vec_xor(v2, va2);
va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v3 = vec_xor(v3, va3);
va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v4 = vec_xor(v4, va4);
va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v5 = vec_xor(v5, va5);
va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v6 = vec_xor(v6, va6);
va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
(__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v7 = vec_xor(v7, va7);
va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
(__vector unsigned long long)vconst1);
}/* else */
/* Second cool down. */
v0 = vec_xor(v0, va0);
v1 = vec_xor(v1, va1);
v2 = vec_xor(v2, va2);
v3 = vec_xor(v3, va3);
v4 = vec_xor(v4, va4);
v5 = vec_xor(v5, va5);
v6 = vec_xor(v6, va6);
v7 = vec_xor(v7, va7);
/*
* vpmsumd produces a 96 bit result in the least significant bits
* of the register. Since we are bit reflected we have to shift it
* left 32 bits so it occupies the least significant bits in the
* bit reflected domain.
*/
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
(__vector unsigned char)vzero, 4);
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1,
(__vector unsigned char)vzero, 4);
v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2,
(__vector unsigned char)vzero, 4);
v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3,
(__vector unsigned char)vzero, 4);
v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4,
(__vector unsigned char)vzero, 4);
v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5,
(__vector unsigned char)vzero, 4);
v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6,
(__vector unsigned char)vzero, 4);
v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7,
(__vector unsigned char)vzero, 4);
/* xor with the last 1024 bits. */
va0 = vec_ld(0, (__vector unsigned long long*) p);
VEC_PERM(va0, va0, va0, vperm_const);
va1 = vec_ld(16, (__vector unsigned long long*) p);
VEC_PERM(va1, va1, va1, vperm_const);
va2 = vec_ld(32, (__vector unsigned long long*) p);
VEC_PERM(va2, va2, va2, vperm_const);
va3 = vec_ld(48, (__vector unsigned long long*) p);
VEC_PERM(va3, va3, va3, vperm_const);
va4 = vec_ld(64, (__vector unsigned long long*) p);
VEC_PERM(va4, va4, va4, vperm_const);
va5 = vec_ld(80, (__vector unsigned long long*) p);
VEC_PERM(va5, va5, va5, vperm_const);
va6 = vec_ld(96, (__vector unsigned long long*) p);
VEC_PERM(va6, va6, va6, vperm_const);
va7 = vec_ld(112, (__vector unsigned long long*) p);
VEC_PERM(va7, va7, va7, vperm_const);
p = (char *)p + 128;
vdata0 = vec_xor(v0, va0);
vdata1 = vec_xor(v1, va1);
vdata2 = vec_xor(v2, va2);
vdata3 = vec_xor(v3, va3);
vdata4 = vec_xor(v4, va4);
vdata5 = vec_xor(v5, va5);
vdata6 = vec_xor(v6, va6);
vdata7 = vec_xor(v7, va7);
/* Check if we have more blocks to process */
next_block = 0;
if (length != 0) {
next_block = 1;
/* zero v0-v7 */
v0 = vec_xor(v0, v0);
v1 = vec_xor(v1, v1);
v2 = vec_xor(v2, v2);
v3 = vec_xor(v3, v3);
v4 = vec_xor(v4, v4);
v5 = vec_xor(v5, v5);
v6 = vec_xor(v6, v6);
v7 = vec_xor(v7, v7);
}
length = length + 128;
} while (next_block);
/* Calculate how many bytes we have left. */
length = (len & 127);
/* Calculate where in (short) constant table we need to start. */
offset = 128 - length;
v0 = vec_ld(offset, vcrc_short_const);
v1 = vec_ld(offset + 16, vcrc_short_const);
v2 = vec_ld(offset + 32, vcrc_short_const);
v3 = vec_ld(offset + 48, vcrc_short_const);
v4 = vec_ld(offset + 64, vcrc_short_const);
v5 = vec_ld(offset + 80, vcrc_short_const);
v6 = vec_ld(offset + 96, vcrc_short_const);
v7 = vec_ld(offset + 112, vcrc_short_const);
offset += 128;
v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata0, (__vector unsigned int)v0);
v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata1, (__vector unsigned int)v1);
v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata2, (__vector unsigned int)v2);
v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata3, (__vector unsigned int)v3);
v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata4, (__vector unsigned int)v4);
v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata5, (__vector unsigned int)v5);
v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata6, (__vector unsigned int)v6);
v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata7, (__vector unsigned int)v7);
/* Now reduce the tail (0-112 bytes). */
for (i = 0; i < length; i+=16) {
vdata0 = vec_ld(i,(__vector unsigned long long*)p);
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
va0 = vec_ld(offset + i,vcrc_short_const);
va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
(__vector unsigned int)vdata0, (__vector unsigned int)va0);
v0 = vec_xor(v0, va0);
}
/* xor all parallel chunks together. */
v0 = vec_xor(v0, v1);
v2 = vec_xor(v2, v3);
v4 = vec_xor(v4, v5);
v6 = vec_xor(v6, v7);
v0 = vec_xor(v0, v2);
v4 = vec_xor(v4, v6);
v0 = vec_xor(v0, v4);
}
/* Barrett Reduction */
vconst1 = vec_ld(0, v_Barrett_const);
vconst2 = vec_ld(16, v_Barrett_const);
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
(__vector unsigned char)v0, 8);
v0 = vec_xor(v1,v0);
/* shift left one bit */
__vector unsigned char vsht_splat = vec_splat_u8 (1);
v0 = (__vector unsigned long long)vec_sll((__vector unsigned char)v0, vsht_splat);
v0 = vec_and(v0, vmask_64bit);
/*
* The reflected version of Barrett reduction. Instead of bit
* reflecting our data (which is expensive to do), we bit reflect our
* constants and our algorithm, which means the intermediate data in
* our vector registers goes from 0-63 instead of 63-0. We can reflect
* the algorithm because we don't carry in mod 2 arithmetic.
*/
/* bottom 32 bits of a */
v1 = vec_and(v0, vmask_32bit);
/* ma */
v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
(__vector unsigned long long)vconst1);
/* bottom 32bits of ma */
v1 = vec_and(v1, vmask_32bit);
/* qn */
v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
(__vector unsigned long long)vconst2);
/* a - qn, subtraction is xor in GF(2) */
v0 = vec_xor (v0, v1);
/*
* Since we are bit reflected, the result (ie the low 32 bits) is in
* the high 32 bits. We just need to shift it left 4 bytes
* V0 [ 0 1 X 3 ]
* V0 [ 0 X 2 3 ]
*/
/* shift result into top 64 bits of */
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
(__vector unsigned char)vzero, 4);
#if BYTE_ORDER == BIG_ENDIAN
return v0[0];
#else
return v0[1];
#endif
}

@ -0,0 +1,31 @@
/* Helper functions to work around issues with clang builtins
* Copyright (C) 2021 IBM Corporation
*
* Authors:
* Daniel Black <daniel@linux.vnet.ibm.com>
* Rogerio Alves <rogealve@br.ibm.com>
* Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef POWER_BUILTINS_H
#define POWER_BUILTINS_H
/*
* These stubs fix clang incompatibilities with GCC builtins.
*/
#ifndef __builtin_crypto_vpmsumw
#define __builtin_crypto_vpmsumw __builtin_crypto_vpmsumb
#endif
#ifndef __builtin_crypto_vpmsumd
#define __builtin_crypto_vpmsumd __builtin_crypto_vpmsumb
#endif
static inline __vector unsigned long long __attribute__((overloadable))
vec_ld(int __a, const __vector unsigned long long* __b) {
return (__vector unsigned long long)__builtin_altivec_lvx(__a, __b);
}
#endif

@ -0,0 +1,46 @@
/* power_features.c - POWER feature check
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
* Copyright (C) 2021-2022 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef HAVE_SYS_AUXV_H
# include <sys/auxv.h>
#endif
#ifdef __FreeBSD__
# include <machine/cpu.h>
#endif
#include "../../zbuild.h"
#include "power_features.h"
void Z_INTERNAL power_check_features(struct power_cpu_features *features) {
#ifdef PPC_FEATURES
unsigned long hwcap;
#ifdef __FreeBSD__
elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
#else
hwcap = getauxval(AT_HWCAP);
#endif
if (hwcap & PPC_FEATURE_HAS_ALTIVEC)
features->has_altivec = 1;
#endif
#ifdef POWER_FEATURES
unsigned long hwcap2;
#ifdef __FreeBSD__
elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
#else
hwcap2 = getauxval(AT_HWCAP2);
#endif
#ifdef POWER8_VSX
if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
features->has_arch_2_07 = 1;
#endif
#ifdef POWER9
if (hwcap2 & PPC_FEATURE2_ARCH_3_00)
features->has_arch_3_00 = 1;
#endif
#endif
}

@ -0,0 +1,18 @@
/* power_features.h -- check for POWER CPU features
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
* Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef POWER_H_
#define POWER_H_
struct power_cpu_features {
int has_altivec;
int has_arch_2_07;
int has_arch_3_00;
};
void Z_INTERNAL power_check_features(struct power_cpu_features *features);
#endif /* POWER_H_ */

@ -0,0 +1,12 @@
/* Optimized slide_hash for POWER processors
* Copyright (C) 2019-2020 IBM Corporation
* Author: Matheus Castanho <msc@linux.ibm.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef POWER8_VSX
#define SLIDE_PPC slide_hash_power8
#include "slide_ppc_tpl.h"
#endif /* POWER8_VSX */

@ -0,0 +1,10 @@
/* Optimized slide_hash for PowerPC processors with VMX instructions
* Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef PPC_VMX
#define SLIDE_PPC slide_hash_vmx
#include "slide_ppc_tpl.h"
#endif /* PPC_VMX */

@ -0,0 +1,31 @@
/* Optimized slide_hash for PowerPC processors
* Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include <altivec.h>
#include "zbuild.h"
#include "deflate.h"
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
const vector unsigned short vmx_wsize = vec_splats(wsize);
Pos *p = table;
do {
vector unsigned short value, result;
value = vec_ld(0, p);
result = vec_subs(value, vmx_wsize);
vec_st(result, 0, p);
p += 8;
entries -= 8;
} while (entries > 0);
}
void Z_INTERNAL SLIDE_PPC(deflate_state *s) {
uint16_t wsize = s->w_size;
slide_hash_chain(s->head, HASH_SIZE, wsize);
slide_hash_chain(s->prev, wsize, wsize);
}

@ -0,0 +1,45 @@
# Building RISC-V Target with Cmake #
> **Warning**
> Runtime RVV detection (using `hwcap`) requires Linux kernel 6.5 or newer.
>
> When running on older kernels, we fall back to compile-time detection, which can cause crashes if RVV is enabled at compile time but not supported by the target CPU.
> Therefore, if older kernels must be supported, RVV should be disabled unless the target CPU is known to support it.
## Prerequisite: Build RISC-V Clang Toolchain and QEMU ##
If you don't have a prebuilt Clang and riscv64 QEMU, you can use the [script](https://github.com/sifive/prepare-riscv-toolchain-qemu/blob/main/prepare_riscv_toolchain_qemu.sh) to fetch and build them. Copy the script to the zlib-ng root directory and run it to download the sources and build both. Adjust the script to your environment as needed (e.g., toolchain version).
```bash
./prepare_riscv_toolchain_qemu.sh
```
After running the script, Clang and QEMU are built in `build-toolchain-qemu/riscv-clang/` and `build-toolchain-qemu/riscv-qemu/`.
`build-toolchain-qemu/riscv-clang/` is your `TOOLCHAIN_PATH`.
`build-toolchain-qemu/riscv-qemu/bin/qemu-riscv64` is your `QEMU_PATH`.
You can also download the prebuilt toolchain & qemu from [the release page](https://github.com/sifive/prepare-riscv-toolchain-qemu/releases), and enjoy using them.
## Cross-Compile for RISC-V Target ##
```bash
cmake -G Ninja -B ./build-riscv \
-D CMAKE_TOOLCHAIN_FILE=./cmake/toolchain-riscv.cmake \
-D CMAKE_INSTALL_PREFIX=./build-riscv/install \
-D TOOLCHAIN_PATH={TOOLCHAIN_PATH} \
-D QEMU_PATH={QEMU_PATH} \
.
cmake --build ./build-riscv
```
Disable the option if there is no RVV support:
```
-D WITH_RVV=OFF
```
## Run Unittests on User Mode QEMU ##
```bash
cd ./build-riscv && ctest --verbose
```

@ -0,0 +1,132 @@
/* adler32_rvv.c - RVV version of adler32
* Copyright (C) 2023 SiFive, Inc. All rights reserved.
* Contributed by Alex Chiang <alex.chiang@sifive.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef RISCV_RVV
#include <riscv_vector.h>
#include <stdint.h>
#include "../../zbuild.h"
#include "../../adler32_p.h"
static inline uint32_t adler32_rvv_impl(uint32_t adler, uint8_t* restrict dst, const uint8_t *src, size_t len, int COPY) {
/* split Adler-32 into component sums */
uint32_t sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;
/* in case user likes doing a byte at a time, keep it fast */
if (len == 1) {
if (COPY) memcpy(dst, src, 1);
return adler32_len_1(adler, src, sum2);
}
/* initial Adler-32 value (deferred check for len == 1 speed) */
if (src == NULL)
return 1L;
/* in case short lengths are provided, keep it somewhat fast */
if (len < 16) {
if (COPY) memcpy(dst, src, len);
return adler32_len_16(adler, src, len, sum2);
}
size_t left = len;
size_t vl = __riscv_vsetvlmax_e8m1();
vl = vl > 256 ? 256 : vl;
vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl);
vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl);
vuint16m2_t v_buf16_accu;
/*
* We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator.
* However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit
* accumulators to boost performance.
*
 * The block_size is the largest multiple of vl that is <= 256: each 16-bit lane then accumulates
 * at most 256 bytes per block, which cannot overflow (255 * 256 <= UINT16_MAX).
*
* We accumulate 8-bit data into a 16-bit accumulator and then
* move the data into the 32-bit accumulator at the last iteration.
*/
size_t block_size = (256 / vl) * vl;
size_t nmax_limit = (NMAX / block_size);
size_t cnt = 0;
while (left >= block_size) {
v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
size_t subprob = block_size;
while (subprob > 0) {
vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl);
if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl);
v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
src += vl;
if (COPY) dst += vl;
subprob -= vl;
}
v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
left -= block_size;
/* do modulo once each block of NMAX size */
if (++cnt >= nmax_limit) {
v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
cnt = 0;
}
}
/* fewer than block_size (<= 256) bytes remain now, so the 16-bit accumulator is safe */
v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
size_t res = left;
while (left >= vl) {
vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl);
if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl);
v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
src += vl;
if (COPY) dst += vl;
left -= vl;
}
v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);
vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl);
vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl);
v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl);
vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);
v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);
uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum);
sum2 += (sum2_sum + adler * (len - left));
vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);
v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);
uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum);
adler += adler_sum;
while (left--) {
if (COPY) *dst++ = *src;
adler += *src++;
sum2 += adler;
}
sum2 %= BASE;
adler %= BASE;
return adler | (sum2 << 16);
}
Z_INTERNAL uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
return adler32_rvv_impl(adler, dst, src, len, 1);
}
Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len) {
return adler32_rvv_impl(adler, NULL, buf, len, 0);
}
#endif // RISCV_RVV
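The RVV kernel above defers the expensive modulo until an overflow-safe number of bytes has been accumulated. A hedged scalar sketch of the same idea, using the standard Adler-32 constants (BASE = 65521, NMAX = 5552); this is a reference model, not the zlib-ng implementation:
```c
#include <stdint.h>
#include <stddef.h>

#define BASE 65521u   /* largest prime smaller than 65536 */
#define NMAX 5552u    /* max bytes summable before s2 can overflow 32 bits */

/* Plain Adler-32 with the modulo deferred to once per NMAX-byte block,
 * which is the property the vector code above exploits. */
static uint32_t adler32_scalar(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t s1 = adler & 0xffff;
    uint32_t s2 = (adler >> 16) & 0xffff;
    while (len > 0) {
        size_t block = len < NMAX ? len : NMAX;
        len -= block;
        while (block--) {
            s1 += *buf++;
            s2 += s1;
        }
        s1 %= BASE;
        s2 %= BASE;
    }
    return (s2 << 16) | s1;
}
```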

@ -0,0 +1,121 @@
/* chunkset_rvv.c - RVV version of chunkset
* Copyright (C) 2023 SiFive, Inc. All rights reserved.
* Contributed by Alex Chiang <alex.chiang@sifive.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include <riscv_vector.h>
#include "zbuild.h"
/*
 * RISC-V glibc can select an RVV-optimized memcpy at runtime via IFUNC,
 * so we prefer a large chunk size and copy as much memory per call as possible.
*/
#define CHUNK_SIZE 32
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define CHUNK_MEMSET_RVV_IMPL(elen) \
do { \
size_t vl, len = CHUNK_SIZE / sizeof(uint##elen##_t); \
uint##elen##_t val = *(uint##elen##_t*)from; \
uint##elen##_t* chunk_p = (uint##elen##_t*)chunk; \
do { \
vl = __riscv_vsetvl_e##elen##m4(len); \
vuint##elen##m4_t v_val = __riscv_vmv_v_x_u##elen##m4(val, vl); \
__riscv_vse##elen##_v_u##elen##m4(chunk_p, v_val, vl); \
len -= vl; chunk_p += vl; \
} while (len > 0); \
} while (0)
/* RISC-V has no native 32-byte data type, so model the chunk as four 64-bit words. */
typedef struct chunk_s {
uint64_t data[4];
} chunk_t;
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
CHUNK_MEMSET_RVV_IMPL(16);
}
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
CHUNK_MEMSET_RVV_IMPL(32);
}
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
CHUNK_MEMSET_RVV_IMPL(64);
}
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
memcpy(chunk->data, (uint8_t *)s, CHUNK_SIZE);
}
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
memcpy(out, chunk->data, CHUNK_SIZE);
}
#define CHUNKSIZE chunksize_rvv
#define CHUNKCOPY chunkcopy_rvv
#define CHUNKUNROLL chunkunroll_rvv
#define CHUNKMEMSET chunkmemset_rvv
#define CHUNKMEMSET_SAFE chunkmemset_safe_rvv
#define HAVE_CHUNKCOPY
/*
 * This assumes the length is non-zero and that `from` lags `out` by at least
 * sizeof(chunk_t) bytes; see the comments in chunkset_tpl.h.
 *
 * The generic `CHUNKCOPY` loads/stores a single chunk at a time. However,
 * RISC-V glibc can select an RVV-optimized memcpy at runtime via IFUNC,
 * so we prefer to copy as much memory as possible per call to take
 * advantage of RVV.
 *
 * To stay aligned with the other platforms, we keep the overall `CHUNKCOPY`
 * structure, but still copy larger blocks whenever the conditions allow it.
 *
 * case 1: out - from >= len (no overlap)
 *     We can copy `len` bytes with a single memcpy,
 *     because the memory layout stays the same.
 *
 * case 2: overlap
 *     We copy N whole chunks with a single memcpy, again aiming
 *     to copy as much memory at once as possible.
 *
 *     After that single memcpy of N chunks, a series of loadchunk and
 *     storechunk calls finishes the copy and keeps the result correct.
*/
static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
Assert(len > 0, "chunkcopy should never have a length 0");
int32_t align = ((len - 1) % sizeof(chunk_t)) + 1;
memcpy(out, from, sizeof(chunk_t));
out += align;
from += align;
len -= align;
ptrdiff_t dist = out - from;
if (dist >= len) {
memcpy(out, from, len);
out += len;
from += len;
return out;
}
if (dist >= sizeof(chunk_t)) {
dist = (dist / sizeof(chunk_t)) * sizeof(chunk_t);
memcpy(out, from, dist);
out += dist;
from += dist;
len -= dist;
}
while (len > 0) {
memcpy(out, from, sizeof(chunk_t));
out += sizeof(chunk_t);
from += sizeof(chunk_t);
len -= sizeof(chunk_t);
}
return out;
}
#include "chunkset_tpl.h"
#define INFLATE_FAST inflate_fast_rvv
#include "inffast_tpl.h"

@ -0,0 +1,47 @@
/* compare256_rvv.c - RVV version of compare256
* Copyright (C) 2023 SiFive, Inc. All rights reserved.
* Contributed by Alex Chiang <alex.chiang@sifive.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef RISCV_RVV
#include "../../zbuild.h"
#include "fallback_builtins.h"
#include <riscv_vector.h>
static inline uint32_t compare256_rvv_static(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
size_t vl;
long found_diff;
do {
vl = __riscv_vsetvl_e8m4(256 - len);
vuint8m4_t v_src0 = __riscv_vle8_v_u8m4(src0, vl);
vuint8m4_t v_src1 = __riscv_vle8_v_u8m4(src1, vl);
vbool2_t v_mask = __riscv_vmsne_vv_u8m4_b2(v_src0, v_src1, vl);
found_diff = __riscv_vfirst_m_b2(v_mask, vl);
if (found_diff >= 0)
return len + (uint32_t)found_diff;
src0 += vl, src1 += vl, len += vl;
} while (len < 256);
return 256;
}
Z_INTERNAL uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1) {
return compare256_rvv_static(src0, src1);
}
#define LONGEST_MATCH longest_match_rvv
#define COMPARE256 compare256_rvv_static
#include "match_tpl.h"
#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_rvv
#define COMPARE256 compare256_rvv_static
#include "match_tpl.h"
#endif // RISCV_RVV
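For reference, a scalar model of what every compare256 variant in this patch computes, i.e. the length of the common prefix of two 256-byte windows; the function name is illustrative only:
```c
#include <stdint.h>

/* Scalar reference for the vectorized compare256 variants: return how many
 * of the first 256 bytes of src0 and src1 match before the first difference. */
static uint32_t compare256_scalar(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;
    while (len < 256 && src0[len] == src1[len])
        len++;
    return len;
}
```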

@ -0,0 +1,45 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/auxv.h>
#include <sys/utsname.h>
#include "../../zbuild.h"
#include "riscv_features.h"
#define ISA_V_HWCAP (1 << ('v' - 'a'))
int Z_INTERNAL is_kernel_version_greater_or_equal_to_6_5() {
struct utsname buffer;
uname(&buffer);
int major, minor;
if (sscanf(buffer.release, "%d.%d", &major, &minor) != 2) {
// Could not parse the kernel release reported by uname()
return 0;
}
if (major > 6 || (major == 6 && minor >= 5))
return 1;
return 0;
}
void Z_INTERNAL riscv_check_features_compile_time(struct riscv_cpu_features *features) {
#if defined(__riscv_v) && defined(__linux__)
features->has_rvv = 1;
#else
features->has_rvv = 0;
#endif
}
void Z_INTERNAL riscv_check_features_runtime(struct riscv_cpu_features *features) {
unsigned long hw_cap = getauxval(AT_HWCAP);
features->has_rvv = hw_cap & ISA_V_HWCAP;
}
void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features) {
if (is_kernel_version_greater_or_equal_to_6_5())
riscv_check_features_runtime(features);
else
riscv_check_features_compile_time(features);
}
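A hedged standalone example of exercising the same hwcap check on RISC-V Linux; it mirrors riscv_check_features_runtime() above and assumes a kernel new enough (6.5+) to report the 'V' extension bit:
```c
#include <stdio.h>
#include <sys/auxv.h>

/* Standalone check mirroring riscv_check_features_runtime(): the 'V' extension
 * is reported as bit ('v' - 'a') of AT_HWCAP on RISC-V Linux (kernel 6.5+). */
int main(void) {
    unsigned long hwcap = getauxval(AT_HWCAP);
    int has_rvv = (hwcap & (1UL << ('v' - 'a'))) != 0;
    printf("RVV %s\n", has_rvv ? "available" : "not available");
    return 0;
}
```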

@ -0,0 +1,18 @@
/* riscv_features.h -- check for riscv features.
*
* Copyright (C) 2023 SiFive, Inc. All rights reserved.
* Contributed by Alex Chiang <alex.chiang@sifive.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef RISCV_H_
#define RISCV_H_
struct riscv_cpu_features {
int has_rvv;
};
void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features);
#endif /* RISCV_H_ */

@ -0,0 +1,34 @@
/* slide_hash_rvv.c - RVV version of slide_hash
* Copyright (C) 2023 SiFive, Inc. All rights reserved.
* Contributed by Alex Chiang <alex.chiang@sifive.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef RISCV_RVV
#include <riscv_vector.h>
#include "../../zbuild.h"
#include "../../deflate.h"
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
size_t vl;
while (entries > 0) {
vl = __riscv_vsetvl_e16m4(entries);
vuint16m4_t v_tab = __riscv_vle16_v_u16m4(table, vl);
vuint16m4_t v_diff = __riscv_vsub_vx_u16m4(v_tab, wsize, vl);
vbool4_t mask = __riscv_vmsltu_vx_u16m4_b4(v_tab, wsize, vl);
v_tab = __riscv_vmerge_vxm_u16m4(v_diff, 0, mask, vl);
__riscv_vse16_v_u16m4(table, v_tab, vl);
table += vl, entries -= vl;
}
}
Z_INTERNAL void slide_hash_rvv(deflate_state *s) {
uint16_t wsize = (uint16_t)s->w_size;
slide_hash_chain(s->head, HASH_SIZE, wsize);
slide_hash_chain(s->prev, wsize, wsize);
}
#endif // RISCV_RVV

@ -0,0 +1,147 @@
# Makefile for zlib
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
# For conditions of distribution and use, see copyright notice in zlib.h
CC=
CFLAGS=
SFLAGS=
INCLUDES=
SUFFIX=
AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw
AVX512VNNIFLAG=-mavx512vnni
AVX2FLAG=-mavx2
SSE2FLAG=-msse2
SSSE3FLAG=-mssse3
SSE42FLAG=-msse4.2
PCLMULFLAG=-mpclmul
VPCLMULFLAG=-mvpclmulqdq
XSAVEFLAG=-mxsave
NOLTOFLAG=
SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)
all: \
x86_features.o x86_features.lo \
adler32_avx2.o adler32_avx2.lo \
adler32_avx512.o adler32_avx512.lo \
adler32_avx512_vnni.o adler32_avx512_vnni.lo \
adler32_sse42.o adler32_sse42.lo \
adler32_ssse3.o adler32_ssse3.lo \
chunkset_avx2.o chunkset_avx2.lo \
chunkset_sse2.o chunkset_sse2.lo \
chunkset_ssse3.o chunkset_ssse3.lo \
compare256_avx2.o compare256_avx2.lo \
compare256_sse2.o compare256_sse2.lo \
insert_string_sse42.o insert_string_sse42.lo \
crc32_pclmulqdq.o crc32_pclmulqdq.lo \
crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \
slide_hash_avx2.o slide_hash_avx2.lo \
slide_hash_sse2.o slide_hash_sse2.lo
x86_features.o:
$(CC) $(CFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c
x86_features.lo:
$(CC) $(SFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c
chunkset_avx2.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c
chunkset_avx2.lo:
$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c
chunkset_sse2.o:
$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c
chunkset_sse2.lo:
$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c
chunkset_ssse3.o:
$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c
chunkset_ssse3.lo:
$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c
compare256_avx2.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
compare256_avx2.lo:
$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
compare256_sse2.o:
$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
compare256_sse2.lo:
$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
insert_string_sse42.o:
$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c
insert_string_sse42.lo:
$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c
crc32_pclmulqdq.o:
$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
crc32_pclmulqdq.lo:
$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
crc32_vpclmulqdq.o:
$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
crc32_vpclmulqdq.lo:
$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
slide_hash_avx2.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c
slide_hash_avx2.lo:
$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c
slide_hash_sse2.o:
$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c
slide_hash_sse2.lo:
$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c
adler32_avx2.o: $(SRCDIR)/adler32_avx2.c
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
adler32_avx2.lo: $(SRCDIR)/adler32_avx2.c
$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
adler32_avx512.o: $(SRCDIR)/adler32_avx512.c
$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
adler32_avx512.lo: $(SRCDIR)/adler32_avx512.c
$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
adler32_avx512_vnni.o: $(SRCDIR)/adler32_avx512_vnni.c
$(CC) $(CFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
adler32_avx512_vnni.lo: $(SRCDIR)/adler32_avx512_vnni.c
$(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c
$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
adler32_sse42.o: $(SRCDIR)/adler32_sse42.c
$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
adler32_sse42.lo: $(SRCDIR)/adler32_sse42.c
$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
mostlyclean: clean
clean:
rm -f *.o *.lo *~
rm -rf objs
rm -f *.gcda *.gcno *.gcov
distclean: clean
rm -f Makefile

@ -0,0 +1,154 @@
/* adler32_avx2.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011 Mark Adler
* Copyright (C) 2022 Adam Stylinski
* Authors:
* Brian Bockelman <bockelman@gmail.com>
* Adam Stylinski <kungfujesus06@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef X86_AVX2
#include "../../zbuild.h"
#include <immintrin.h>
#include "../../adler32_fold.h"
#include "../../adler32_p.h"
#include "adler32_avx2_p.h"
#include "x86_intrins.h"
#ifdef X86_SSE42
extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);
#define copy_sub32(a, b, c, d) adler32_fold_copy_sse42(a, b, c, d)
#define sub32(a, b, c) adler32_ssse3(a, b, c)
#else
#define copy_sub32(a, b, c, d) adler32_copy_len_16(adler0, c, b, d, adler1)
#define sub32(a, b, c) adler32_len_16(adler0, b, c, adler1)
#endif
static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
if (src == NULL) return 1L;
if (len == 0) return adler;
uint32_t adler0, adler1;
adler1 = (adler >> 16) & 0xffff;
adler0 = adler & 0xffff;
rem_peel:
if (len < 16) {
if (COPY) {
return adler32_copy_len_16(adler0, src, dst, len, adler1);
} else {
return adler32_len_16(adler0, src, len, adler1);
}
} else if (len < 32) {
if (COPY) {
return copy_sub32(adler, dst, src, len);
} else {
return sub32(adler, src, len);
}
}
__m256i vs1, vs2;
const __m256i dot2v = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
const __m256i dot3v = _mm256_set1_epi16(1);
const __m256i zero = _mm256_setzero_si256();
while (len >= 32) {
vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
__m256i vs1_0 = vs1;
__m256i vs3 = _mm256_setzero_si256();
size_t k = MIN(len, NMAX);
k -= k % 32;
len -= k;
while (k >= 32) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
*/
__m256i vbuf = _mm256_loadu_si256((__m256i*)src);
src += 32;
k -= 32;
__m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); // Sum of abs diff, resulting in 2 x int32's
if (COPY) {
_mm256_storeu_si256((__m256i*)dst, vbuf);
dst += 32;
}
vs1 = _mm256_add_epi32(vs1, vs1_sad);
vs3 = _mm256_add_epi32(vs3, vs1_0);
__m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // sum 32 uint8s to 16 shorts
__m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
vs2 = _mm256_add_epi32(vsum2, vs2);
vs1_0 = vs1;
}
/* Defer the multiplication with 32 to outside of the loop */
vs3 = _mm256_slli_epi32(vs3, 5);
vs2 = _mm256_add_epi32(vs2, vs3);
/* The compiler is generating the following sequence for this integer modulus
* when done the scalar way, in GPRs:
adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
(s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
mov $0x80078071,%edi // move magic constant into 32 bit register %edi
...
vmovd %xmm1,%esi // move vector lane 0 to 32 bit register %esi
mov %rsi,%rax // zero-extend this value to 64 bit precision in %rax
imul %rdi,%rsi // do a signed multiplication with magic constant and vector element
shr $0x2f,%rsi // shift right by 47
imul $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
sub %esi,%eax // subtract lower 32 bits of original vector value from modified one above
...
// repeats for each element with vpextract instructions
This is tricky with AVX2 for a number of reasons:
1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
2.) There are ways to extend vectors to 64 bit precision, but no simple way to truncate
back down to 32 bit precision later (there is in AVX512)
3.) Full width integer multiplications aren't cheap
We can, however, do a relatively cheap sequence for horizontal sums.
Then, we simply do the integer modulus on the resulting 64 bit GPR, on a scalar value. It was
previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but
that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be
performed on the maximum possible inputs before overflow
*/
/* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy
* conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
* This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
* what the compiler is doing to avoid integer divisions. */
adler0 = partial_hsum256(vs1) % BASE;
adler1 = hsum256(vs2) % BASE;
}
adler = adler0 | (adler1 << 16);
if (len) {
goto rem_peel;
}
return adler;
}
Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) {
return adler32_fold_copy_impl(adler, NULL, src, len, 0);
}
Z_INTERNAL uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
return adler32_fold_copy_impl(adler, dst, src, len, 1);
}
#endif

@ -0,0 +1,32 @@
/* adler32_avx2_p.h -- adler32 avx2 utility functions
* Copyright (C) 2022 Adam Stylinski
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef ADLER32_AVX2_P_H_
#define ADLER32_AVX2_P_H_
#if defined(X86_AVX2) || defined(X86_AVX512VNNI)
/* 32 bit horizontal sum, adapted from Agner Fog's vector library. */
static inline uint32_t hsum256(__m256i x) {
__m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(x, 1),
_mm256_castsi256_si128(x));
__m128i sum2 = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1));
__m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
return (uint32_t)_mm_cvtsi128_si32(sum3);
}
static inline uint32_t partial_hsum256(__m256i x) {
/* We need a permutation vector to extract every other integer. The
* rest are going to be zeros */
const __m256i perm_vec = _mm256_setr_epi32(0, 2, 4, 6, 1, 1, 1, 1);
__m256i non_zero = _mm256_permutevar8x32_epi32(x, perm_vec);
__m128i non_zero_sse = _mm256_castsi256_si128(non_zero);
__m128i sum2 = _mm_add_epi32(non_zero_sse,_mm_unpackhi_epi64(non_zero_sse, non_zero_sse));
__m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
return (uint32_t)_mm_cvtsi128_si32(sum3);
}
#endif
#endif

@ -0,0 +1,115 @@
/* adler32_avx512.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011 Mark Adler
* Authors:
* Adam Stylinski <kungfujesus06@gmail.com>
* Brian Bockelman <bockelman@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef X86_AVX512
#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../adler32_fold.h"
#include "../../cpu_features.h"
#include <immintrin.h>
#include "x86_intrins.h"
#include "adler32_avx512_p.h"
static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
if (src == NULL) return 1L;
if (len == 0) return adler;
uint32_t adler0, adler1;
adler1 = (adler >> 16) & 0xffff;
adler0 = adler & 0xffff;
rem_peel:
if (len < 64) {
/* This handles the remaining copies, just call normal adler checksum after this */
if (COPY) {
__mmask64 storemask = (0xFFFFFFFFFFFFFFFFUL >> (64 - len));
__m512i copy_vec = _mm512_maskz_loadu_epi8(storemask, src);
_mm512_mask_storeu_epi8(dst, storemask, copy_vec);
}
#ifdef X86_AVX2
return adler32_avx2(adler, src, len);
#elif defined(X86_SSSE3)
return adler32_ssse3(adler, src, len);
#else
return adler32_len_16(adler0, src, len, adler1);
#endif
}
__m512i vbuf, vs1_0, vs3;
const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
56, 57, 58, 59, 60, 61, 62, 63, 64);
const __m512i dot3v = _mm512_set1_epi16(1);
const __m512i zero = _mm512_setzero_si512();
size_t k;
while (len >= 64) {
__m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
__m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
vs1_0 = vs1;
vs3 = _mm512_setzero_si512();
k = MIN(len, NMAX);
k -= k % 64;
len -= k;
while (k >= 64) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
*/
vbuf = _mm512_loadu_si512(src);
if (COPY) {
_mm512_storeu_si512(dst, vbuf);
dst += 64;
}
src += 64;
k -= 64;
__m512i vs1_sad = _mm512_sad_epu8(vbuf, zero);
__m512i v_short_sum2 = _mm512_maddubs_epi16(vbuf, dot2v);
vs1 = _mm512_add_epi32(vs1_sad, vs1);
vs3 = _mm512_add_epi32(vs3, vs1_0);
__m512i vsum2 = _mm512_madd_epi16(v_short_sum2, dot3v);
vs2 = _mm512_add_epi32(vsum2, vs2);
vs1_0 = vs1;
}
vs3 = _mm512_slli_epi32(vs3, 6);
vs2 = _mm512_add_epi32(vs2, vs3);
adler0 = partial_hsum(vs1) % BASE;
adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
}
adler = adler0 | (adler1 << 16);
/* Process tail (len < 64). */
if (len) {
goto rem_peel;
}
return adler;
}
Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
return adler32_fold_copy_impl(adler, dst, src, len, 1);
}
Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) {
return adler32_fold_copy_impl(adler, NULL, src, len, 0);
}
#endif

@ -0,0 +1,46 @@
#ifndef AVX512_FUNCS_H
#define AVX512_FUNCS_H
#include <immintrin.h>
#include <stdint.h>
/* Written because the _mm512_reduce_add_epi32 intrinsic sets off UBSan */
static inline uint32_t _mm512_reduce_add_epu32(__m512i x) {
__m256i a = _mm512_extracti64x4_epi64(x, 1);
__m256i b = _mm512_extracti64x4_epi64(x, 0);
__m256i a_plus_b = _mm256_add_epi32(a, b);
__m128i c = _mm256_extracti128_si256(a_plus_b, 1);
__m128i d = _mm256_extracti128_si256(a_plus_b, 0);
__m128i c_plus_d = _mm_add_epi32(c, d);
__m128i sum1 = _mm_unpackhi_epi64(c_plus_d, c_plus_d);
__m128i sum2 = _mm_add_epi32(sum1, c_plus_d);
__m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
__m128i sum4 = _mm_add_epi32(sum2, sum3);
return _mm_cvtsi128_si32(sum4);
}
static inline uint32_t partial_hsum(__m512i x) {
/* We need a permutation vector to extract every other integer. The
* rest are going to be zeros. Marking this const so the compiler stands
* a better chance of keeping this resident in a register through entire
* loop execution. We certainly have enough zmm registers (32) */
const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
1, 1, 1, 1, 1, 1, 1, 1);
__m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x);
/* From here, it's a simple 256 bit wide reduction sum */
__m256i non_zero_avx = _mm512_castsi512_si256(non_zero);
/* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
* pretty slow, much slower than the longer instruction sequence below */
__m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
_mm256_castsi256_si128(non_zero_avx));
__m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1));
__m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1));
return (uint32_t)_mm_cvtsi128_si32(sum3);
}
#endif

@ -0,0 +1,225 @@
/* adler32_avx512_vnni.c -- compute the Adler-32 checksum of a data stream
* Based on Brian Bockelman's AVX2 version
* Copyright (C) 1995-2011 Mark Adler
* Authors:
* Adam Stylinski <kungfujesus06@gmail.com>
* Brian Bockelman <bockelman@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef X86_AVX512VNNI
#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../cpu_features.h"
#include <immintrin.h>
#include "../../adler32_fold.h"
#include "x86_intrins.h"
#include "adler32_avx512_p.h"
#include "adler32_avx2_p.h"
Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) {
if (src == NULL) return 1L;
if (len == 0) return adler;
uint32_t adler0, adler1;
adler1 = (adler >> 16) & 0xffff;
adler0 = adler & 0xffff;
rem_peel:
if (len < 32)
#if defined(X86_SSSE3)
return adler32_ssse3(adler, src, len);
#else
return adler32_len_16(adler0, src, len, adler1);
#endif
if (len < 64)
#ifdef X86_AVX2
return adler32_avx2(adler, src, len);
#elif defined(X86_SSE3)
return adler32_ssse3(adler, src, len);
#else
return adler32_len_16(adler0, src, len, adler1);
#endif
const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
56, 57, 58, 59, 60, 61, 62, 63, 64);
const __m512i zero = _mm512_setzero_si512();
__m512i vs1, vs2;
while (len >= 64) {
vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
size_t k = MIN(len, NMAX);
k -= k % 64;
len -= k;
__m512i vs1_0 = vs1;
__m512i vs3 = _mm512_setzero_si512();
/* We might get a tad bit more ILP here if we sum to a second register in the loop */
__m512i vs2_1 = _mm512_setzero_si512();
__m512i vbuf0, vbuf1;
/* Remainder peeling */
if (k % 128) {
vbuf1 = _mm512_loadu_si512((__m512i*)src);
src += 64;
k -= 64;
__m512i vs1_sad = _mm512_sad_epu8(vbuf1, zero);
vs1 = _mm512_add_epi32(vs1, vs1_sad);
vs3 = _mm512_add_epi32(vs3, vs1_0);
vs2 = _mm512_dpbusd_epi32(vs2, vbuf1, dot2v);
vs1_0 = vs1;
}
/* This loop is manually unrolled by 2 for a decent amount of ILP */
while (k >= 128) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
*/
vbuf0 = _mm512_loadu_si512((__m512i*)src);
vbuf1 = _mm512_loadu_si512((__m512i*)(src + 64));
src += 128;
k -= 128;
__m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero);
vs1 = _mm512_add_epi32(vs1, vs1_sad);
vs3 = _mm512_add_epi32(vs3, vs1_0);
/* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp
* instructions to eliminate them */
vs2 = _mm512_dpbusd_epi32(vs2, vbuf0, dot2v);
vs3 = _mm512_add_epi32(vs3, vs1);
vs1_sad = _mm512_sad_epu8(vbuf1, zero);
vs1 = _mm512_add_epi32(vs1, vs1_sad);
vs2_1 = _mm512_dpbusd_epi32(vs2_1, vbuf1, dot2v);
vs1_0 = vs1;
}
vs3 = _mm512_slli_epi32(vs3, 6);
vs2 = _mm512_add_epi32(vs2, vs3);
vs2 = _mm512_add_epi32(vs2, vs2_1);
adler0 = partial_hsum(vs1) % BASE;
adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
}
adler = adler0 | (adler1 << 16);
/* Process tail (len < 64). */
if (len) {
goto rem_peel;
}
return adler;
}
Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
if (src == NULL) return 1L;
if (len == 0) return adler;
uint32_t adler0, adler1;
adler1 = (adler >> 16) & 0xffff;
adler0 = adler & 0xffff;
rem_peel_copy:
if (len < 32) {
/* This handles the remaining copies, just call normal adler checksum after this */
__mmask32 storemask = (0xFFFFFFFFUL >> (32 - len));
__m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
_mm256_mask_storeu_epi8(dst, storemask, copy_vec);
#if defined(X86_SSSE3)
return adler32_ssse3(adler, src, len);
#else
return adler32_len_16(adler0, src, len, adler1);
#endif
}
const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
const __m256i zero = _mm256_setzero_si256();
__m256i vs1, vs2;
while (len >= 32) {
vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
size_t k = MIN(len, NMAX);
k -= k % 32;
len -= k;
__m256i vs1_0 = vs1;
__m256i vs3 = _mm256_setzero_si256();
/* We might get a tad bit more ILP here if we sum to a second register in the loop */
__m256i vs2_1 = _mm256_setzero_si256();
__m256i vbuf0, vbuf1;
/* Remainder peeling */
if (k % 64) {
vbuf1 = _mm256_loadu_si256((__m256i*)src);
_mm256_storeu_si256((__m256i*)dst, vbuf1);
dst += 32;
src += 32;
k -= 32;
__m256i vs1_sad = _mm256_sad_epu8(vbuf1, zero);
vs1 = _mm256_add_epi32(vs1, vs1_sad);
vs3 = _mm256_add_epi32(vs3, vs1_0);
vs2 = _mm256_dpbusd_epi32(vs2, vbuf1, dot2v);
vs1_0 = vs1;
}
/* This loop is manually unrolled by 2 for a decent amount of ILP */
while (k >= 64) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
*/
vbuf0 = _mm256_loadu_si256((__m256i*)src);
vbuf1 = _mm256_loadu_si256((__m256i*)(src + 32));
_mm256_storeu_si256((__m256i*)dst, vbuf0);
_mm256_storeu_si256((__m256i*)(dst + 32), vbuf1);
dst += 64;
src += 64;
k -= 64;
__m256i vs1_sad = _mm256_sad_epu8(vbuf0, zero);
vs1 = _mm256_add_epi32(vs1, vs1_sad);
vs3 = _mm256_add_epi32(vs3, vs1_0);
/* multiply-add, resulting in 8 ints. Fuse with sum stage from prior versions, as we now have the dp
* instructions to eliminate them */
vs2 = _mm256_dpbusd_epi32(vs2, vbuf0, dot2v);
vs3 = _mm256_add_epi32(vs3, vs1);
vs1_sad = _mm256_sad_epu8(vbuf1, zero);
vs1 = _mm256_add_epi32(vs1, vs1_sad);
vs2_1 = _mm256_dpbusd_epi32(vs2_1, vbuf1, dot2v);
vs1_0 = vs1;
}
vs3 = _mm256_slli_epi32(vs3, 5);
vs2 = _mm256_add_epi32(vs2, vs3);
vs2 = _mm256_add_epi32(vs2, vs2_1);
adler0 = partial_hsum256(vs1) % BASE;
adler1 = hsum256(vs2) % BASE;
}
adler = adler0 | (adler1 << 16);
/* Process tail (len < 32). */
if (len) {
goto rem_peel_copy;
}
return adler;
}
#endif
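The VNNI kernels above replace the maddubs/madd pair with a single dpbusd step. A scalar sketch of what one 32-bit lane of that instruction computes, per its documented unsigned-times-signed byte semantics (names here are illustrative):
```c
#include <stdint.h>

/* Per-lane model of the VNNI dot product used above: acc += sum of four
 * unsigned-byte * signed-byte products. In the adler32 kernels the signed
 * operand is the positional weight vector (dot2v). */
static int32_t dpbusd_lane(int32_t acc, const uint8_t a[4], const int8_t b[4]) {
    for (int i = 0; i < 4; i++)
        acc += (int32_t)a[i] * (int32_t)b[i];
    return acc;
}
```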

@ -0,0 +1,121 @@
/* adler32_sse42.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011 Mark Adler
* Authors:
* Adam Stylinski <kungfujesus06@gmail.com>
* Brian Bockelman <bockelman@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../adler32_fold.h"
#include "adler32_ssse3_p.h"
#include <immintrin.h>
#ifdef X86_SSE42
Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
uint32_t adler0, adler1;
adler1 = (adler >> 16) & 0xffff;
adler0 = adler & 0xffff;
rem_peel:
if (len < 16) {
return adler32_copy_len_16(adler0, src, dst, len, adler1);
}
__m128i vbuf, vbuf_0;
__m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
v_sad_sum2, vsum2, vsum2_0;
__m128i zero = _mm_setzero_si128();
const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
const __m128i dot3v = _mm_set1_epi16(1);
size_t k;
while (len >= 16) {
k = MIN(len, NMAX);
k -= k % 16;
len -= k;
vs1 = _mm_cvtsi32_si128(adler0);
vs2 = _mm_cvtsi32_si128(adler1);
vs3 = _mm_setzero_si128();
vs2_0 = _mm_setzero_si128();
vs1_0 = vs1;
while (k >= 32) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
*/
vbuf = _mm_loadu_si128((__m128i*)src);
vbuf_0 = _mm_loadu_si128((__m128i*)(src + 16));
src += 32;
k -= 32;
v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
_mm_storeu_si128((__m128i*)dst, vbuf);
_mm_storeu_si128((__m128i*)(dst + 16), vbuf_0);
dst += 32;
v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
vs1 = _mm_add_epi32(v_sad_sum1, vs1);
vs3 = _mm_add_epi32(vs1_0, vs3);
vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
vs1 = _mm_add_epi32(v_sad_sum2, vs1);
vs2 = _mm_add_epi32(vsum2, vs2);
vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
vs1_0 = vs1;
}
vs2 = _mm_add_epi32(vs2_0, vs2);
vs3 = _mm_slli_epi32(vs3, 5);
vs2 = _mm_add_epi32(vs3, vs2);
vs3 = _mm_setzero_si128();
while (k >= 16) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
*/
vbuf = _mm_loadu_si128((__m128i*)src);
src += 16;
k -= 16;
v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
vs1 = _mm_add_epi32(v_sad_sum1, vs1);
vs3 = _mm_add_epi32(vs1_0, vs3);
vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
vs2 = _mm_add_epi32(vsum2, vs2);
vs1_0 = vs1;
_mm_storeu_si128((__m128i*)dst, vbuf);
dst += 16;
}
vs3 = _mm_slli_epi32(vs3, 4);
vs2 = _mm_add_epi32(vs2, vs3);
adler0 = partial_hsum(vs1) % BASE;
adler1 = hsum(vs2) % BASE;
}
/* If this is true, there are fewer than 16 bytes remaining */
if (len) {
goto rem_peel;
}
return adler0 | (adler1 << 16);
}
#endif

@ -0,0 +1,156 @@
/* adler32_ssse3.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011 Mark Adler
* Authors:
* Adam Stylinski <kungfujesus06@gmail.com>
* Brian Bockelman <bockelman@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "adler32_ssse3_p.h"
#ifdef X86_SSSE3
#include <immintrin.h>
Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len) {
uint32_t sum2;
/* split Adler-32 into component sums */
sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
return adler32_len_1(adler, buf, sum2);
/* initial Adler-32 value (deferred check for len == 1 speed) */
if (UNLIKELY(buf == NULL))
return 1L;
/* in case short lengths are provided, keep it somewhat fast */
if (UNLIKELY(len < 16))
return adler32_len_16(adler, buf, len, sum2);
const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
const __m128i dot3v = _mm_set1_epi16(1);
const __m128i zero = _mm_setzero_si128();
__m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
vbuf_0, v_sad_sum2, vsum2, vsum2_0;
/* If our buffer is unaligned (likely), determine whether there's enough of a
 * buffer to consume to make the scalar aligning additions worthwhile, or
 * whether it's better to just eat the cost of an unaligned load. This is a
 * pretty simple test: just check whether len < 16 + align_offset, i.e. whether
 * fewer than 16 bytes would remain after the scalar aligning additions */
size_t max_iters = NMAX;
size_t rem = (uintptr_t)buf & 15;
size_t align_offset = 16 - rem;
size_t k = 0;
if (rem) {
if (len < 16 + align_offset) {
/* Eat the cost of this one unaligned load so that we don't
 * completely skip over the vectorization. One unaligned
 * 16-byte vector pass is better than doing those 16 bytes
 * plus up to 15 remaining bytes as scalar sums */
vbuf = _mm_loadu_si128((__m128i*)buf);
len -= 16;
buf += 16;
vs1 = _mm_cvtsi32_si128(adler);
vs2 = _mm_cvtsi32_si128(sum2);
vs3 = _mm_setzero_si128();
vs1_0 = vs1;
goto unaligned_jmp;
}
for (size_t i = 0; i < align_offset; ++i) {
adler += *(buf++);
sum2 += adler;
}
/* lop off the max number of sums based on the scalar sums done
* above */
len -= align_offset;
max_iters -= align_offset;
}
while (len >= 16) {
vs1 = _mm_cvtsi32_si128(adler);
vs2 = _mm_cvtsi32_si128(sum2);
vs3 = _mm_setzero_si128();
vs2_0 = _mm_setzero_si128();
vs1_0 = vs1;
k = (len < max_iters ? len : max_iters);
k -= k % 16;
len -= k;
while (k >= 32) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
*/
vbuf = _mm_load_si128((__m128i*)buf);
vbuf_0 = _mm_load_si128((__m128i*)(buf + 16));
buf += 32;
k -= 32;
v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
vs1 = _mm_add_epi32(v_sad_sum1, vs1);
vs3 = _mm_add_epi32(vs1_0, vs3);
vs1 = _mm_add_epi32(v_sad_sum2, vs1);
v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
vs2 = _mm_add_epi32(vsum2, vs2);
vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
vs1_0 = vs1;
}
vs2 = _mm_add_epi32(vs2_0, vs2);
vs3 = _mm_slli_epi32(vs3, 5);
vs2 = _mm_add_epi32(vs3, vs2);
vs3 = _mm_setzero_si128();
while (k >= 16) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
*/
vbuf = _mm_load_si128((__m128i*)buf);
buf += 16;
k -= 16;
unaligned_jmp:
v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
vs1 = _mm_add_epi32(v_sad_sum1, vs1);
vs3 = _mm_add_epi32(vs1_0, vs3);
v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
vs2 = _mm_add_epi32(vsum2, vs2);
vs1_0 = vs1;
}
vs3 = _mm_slli_epi32(vs3, 4);
vs2 = _mm_add_epi32(vs2, vs3);
/* We don't actually need to do a full horizontal sum, since psadbw is actually doing
* a partial reduction sum implicitly and only summing to integers in vector positions
* 0 and 2. This saves us some contention on the shuffle port(s) */
adler = partial_hsum(vs1) % BASE;
sum2 = hsum(vs2) % BASE;
max_iters = NMAX;
}
/* Process tail (len < 16). */
return adler32_len_16(adler, buf, len, sum2);
}
#endif

@ -0,0 +1,29 @@
/* adler32_ssse3_p.h -- adler32 ssse3 utility functions
* Copyright (C) 2022 Adam Stylinski
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef ADLER32_SSSE3_P_H_
#define ADLER32_SSSE3_P_H_
#ifdef X86_SSSE3
#include <immintrin.h>
#include <stdint.h>
static inline uint32_t partial_hsum(__m128i x) {
__m128i second_int = _mm_srli_si128(x, 8);
__m128i sum = _mm_add_epi32(x, second_int);
return _mm_cvtsi128_si32(sum);
}
static inline uint32_t hsum(__m128i x) {
__m128i sum1 = _mm_unpackhi_epi64(x, x);
__m128i sum2 = _mm_add_epi32(x, sum1);
__m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
__m128i sum4 = _mm_add_epi32(sum2, sum3);
return _mm_cvtsi128_si32(sum4);
}
#endif
#endif

@ -0,0 +1,133 @@
/* chunkset_avx2.c -- AVX2 inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#ifdef X86_AVX2
#include <immintrin.h>
#include "../generic/chunk_permute_table.h"
typedef __m256i chunk_t;
#define CHUNK_SIZE 32
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNK_MAG
/* Populate the don't-care entries so that this is a direct lookup (with some indirection into the permute table).
 * Because dist can never be 0-2, we start at an offset and subtract 3 from the input */
static const lut_rem_pair perm_idx_lut[29] = {
{ 0, 2}, /* 3 */
{ 0, 0}, /* don't care */
{ 1 * 32, 2}, /* 5 */
{ 2 * 32, 2}, /* 6 */
{ 3 * 32, 4}, /* 7 */
{ 0 * 32, 0}, /* don't care */
{ 4 * 32, 5}, /* 9 */
{ 5 * 32, 22}, /* 10 */
{ 6 * 32, 21}, /* 11 */
{ 7 * 32, 20}, /* 12 */
{ 8 * 32, 6}, /* 13 */
{ 9 * 32, 4}, /* 14 */
{10 * 32, 2}, /* 15 */
{ 0 * 32, 0}, /* don't care */
{11 * 32, 15}, /* 17 */
{11 * 32 + 16, 14}, /* 18 */
{11 * 32 + 16 * 2, 13}, /* 19 */
{11 * 32 + 16 * 3, 12}, /* 20 */
{11 * 32 + 16 * 4, 11}, /* 21 */
{11 * 32 + 16 * 5, 10}, /* 22 */
{11 * 32 + 16 * 6, 9}, /* 23 */
{11 * 32 + 16 * 7, 8}, /* 24 */
{11 * 32 + 16 * 8, 7}, /* 25 */
{11 * 32 + 16 * 9, 6}, /* 26 */
{11 * 32 + 16 * 10, 5}, /* 27 */
{11 * 32 + 16 * 11, 4}, /* 28 */
{11 * 32 + 16 * 12, 3}, /* 29 */
{11 * 32 + 16 * 13, 2}, /* 30 */
{11 * 32 + 16 * 14, 1} /* 31 */
};
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
int16_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm256_set1_epi16(tmp);
}
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
int32_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm256_set1_epi32(tmp);
}
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
int64_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm256_set1_epi64x(tmp);
}
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
*chunk = _mm256_loadu_si256((__m256i *)s);
}
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
_mm256_storeu_si256((__m256i *)out, *chunk);
}
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
__m256i ret_vec;
/* While technically we only need to read 4 or 8 bytes into this vector register for a lot of cases, GCC is
* compiling this to a shared load for all branches, preferring the simpler code. Given that the buf value isn't in
 * GPRs to begin with, the 256 bit load is _probably_ just as inexpensive */
*chunk_rem = lut_rem.remval;
/* See note in chunkset_ssse3.c for why this is ok */
__msan_unpoison(buf + dist, 32 - dist);
if (dist < 16) {
/* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after
* broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate
* shuffles and combining the halves later */
const __m256i permute_xform =
_mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
__m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
__m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
} else if (dist == 16) {
__m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
return _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
} else {
__m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
__m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16));
/* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */
__m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
__m128i xlane_permutes = _mm_cmpgt_epi8(_mm_set1_epi8(16), perm_vec1);
__m128i xlane_res = _mm_shuffle_epi8(ret_vec0, perm_vec1);
/* Since we can't wrap twice, we can simply keep the latter half exactly as it is instead of having to _also_
 * shuffle those values */
__m128i latter_half = _mm_blendv_epi8(ret_vec1, xlane_res, xlane_permutes);
ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
}
return ret_vec;
}
#define CHUNKSIZE chunksize_avx2
#define CHUNKCOPY chunkcopy_avx2
#define CHUNKUNROLL chunkunroll_avx2
#define CHUNKMEMSET chunkmemset_avx2
#define CHUNKMEMSET_SAFE chunkmemset_safe_avx2
#include "chunkset_tpl.h"
#define INFLATE_FAST inflate_fast_avx2
#include "inffast_tpl.h"
#endif
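A scalar picture of the effect GET_CHUNK_MAG above aims for, based on reading the permute-table logic: the 32-byte chunk ends up holding the dist-byte pattern at buf repeated end to end (the chunk_rem bookkeeping is handled separately by the lookup table). Illustrative only, not zlib-ng code:
```c
#include <stdint.h>

/* Scalar model of the replicated chunk the AVX2 permute sequence builds:
 * byte i of the chunk is buf[i % dist] for dist in the 3..31 range handled above. */
static void chunk_mag_scalar(uint8_t chunk[32], const uint8_t *buf, uint32_t dist) {
    for (uint32_t i = 0; i < 32; i++)
        chunk[i] = buf[i % dist];
}
```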

@ -0,0 +1,56 @@
/* chunkset_sse2.c -- SSE2 inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#ifdef X86_SSE2
#include <immintrin.h>
typedef __m128i chunk_t;
#define CHUNK_SIZE 16
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
int16_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm_set1_epi16(tmp);
}
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
int32_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm_set1_epi32(tmp);
}
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
int64_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm_set1_epi64x(tmp);
}
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
*chunk = _mm_loadu_si128((__m128i *)s);
}
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
_mm_storeu_si128((__m128i *)out, *chunk);
}
#define CHUNKSIZE chunksize_sse2
#define CHUNKCOPY chunkcopy_sse2
#define CHUNKUNROLL chunkunroll_sse2
#define CHUNKMEMSET chunkmemset_sse2
#define CHUNKMEMSET_SAFE chunkmemset_safe_sse2
#include "chunkset_tpl.h"
#define INFLATE_FAST inflate_fast_sse2
#include "inffast_tpl.h"
#endif

@ -0,0 +1,101 @@
/* chunkset_ssse3.c -- SSSE3 inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
/* This requires SSE2 support. While it's implicit with SSSE3, we can minimize
* code size by sharing the chunkcopy functions, which will certainly compile
* to identical machine code */
#if defined(X86_SSSE3) && defined(X86_SSE2)
#include <immintrin.h>
#include "../generic/chunk_permute_table.h"
typedef __m128i chunk_t;
#define CHUNK_SIZE 16
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNK_MAG
#define HAVE_CHUNKCOPY
#define HAVE_CHUNKUNROLL
static const lut_rem_pair perm_idx_lut[13] = {
{0, 1}, /* 3 */
{0, 0}, /* don't care */
{1 * 32, 1}, /* 5 */
{2 * 32, 4}, /* 6 */
{3 * 32, 2}, /* 7 */
{0 * 32, 0}, /* don't care */
{4 * 32, 7}, /* 9 */
{5 * 32, 6}, /* 10 */
{6 * 32, 5}, /* 11 */
{7 * 32, 4}, /* 12 */
{8 * 32, 3}, /* 13 */
{9 * 32, 2}, /* 14 */
{10 * 32, 1},/* 15 */
};
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
int16_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm_set1_epi16(tmp);
}
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
int32_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm_set1_epi32(tmp);
}
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
int64_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm_set1_epi64x(tmp);
}
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
*chunk = _mm_loadu_si128((__m128i *)s);
}
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
_mm_storeu_si128((__m128i *)out, *chunk);
}
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
__m128i perm_vec, ret_vec;
/* Important to note:
 * This is _not_ meant to subvert the memory sanitizer; it unpoisons bytes that we
 * willingly and purposefully load uninitialized and then swizzle over in a vector
 * register anyway. If our assumption about which bytes are actually used is wrong,
 * the memory sanitizer will still usefully flag it */
__msan_unpoison(buf + dist, 16 - dist);
ret_vec = _mm_loadu_si128((__m128i*)buf);
*chunk_rem = lut_rem.remval;
perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
return ret_vec;
}
extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
#define CHUNKSIZE chunksize_ssse3
#define CHUNKMEMSET chunkmemset_ssse3
#define CHUNKMEMSET_SAFE chunkmemset_safe_ssse3
#define CHUNKCOPY chunkcopy_sse2
#define CHUNKUNROLL chunkunroll_sse2
#include "chunkset_tpl.h"
#define INFLATE_FAST inflate_fast_ssse3
#include "inffast_tpl.h"
#endif

@ -0,0 +1,63 @@
/* compare256_avx2.c -- AVX2 version of compare256
* Copyright Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "fallback_builtins.h"
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
#include <immintrin.h>
#ifdef _MSC_VER
# include <nmmintrin.h>
#endif
static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
do {
__m256i ymm_src0, ymm_src1, ymm_cmp;
ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
if (mask != 0xFFFFFFFF) {
uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */
return len + match_byte;
}
src0 += 32, src1 += 32, len += 32;
ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1);
mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
if (mask != 0xFFFFFFFF) {
uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
return len + match_byte;
}
src0 += 32, src1 += 32, len += 32;
} while (len < 256);
return 256;
}
Z_INTERNAL uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1) {
return compare256_avx2_static(src0, src1);
}
#define LONGEST_MATCH longest_match_avx2
#define COMPARE256 compare256_avx2_static
#include "match_tpl.h"
#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_avx2
#define COMPARE256 compare256_avx2_static
#include "match_tpl.h"
#endif
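The loop above turns the byte-equality mask into the index of the first mismatch by inverting it and counting trailing zeros. A standalone illustration of that bit trick on a plain 32-bit mask (hypothetical values):
```c
#include <stdint.h>
#include <stdio.h>

/* Bit i of eq_mask is set when byte i matched, so the lowest clear bit is the
 * first mismatch. Callers must rule out eq_mask == 0xFFFFFFFF first, since
 * __builtin_ctz(0) is undefined -- exactly what the branch above does. */
static uint32_t first_mismatch(uint32_t eq_mask) {
    return (uint32_t)__builtin_ctz(~eq_mask);
}

int main(void) {
    printf("%u\n", first_mismatch(0x0000001Fu)); /* bytes 0..4 matched -> prints 5 */
    return 0;
}
```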

@ -0,0 +1,96 @@
/* compare256_sse2.c -- SSE2 version of compare256
* Copyright Adam Stylinski <kungfujesus06@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "fallback_builtins.h"
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
#include <emmintrin.h>
static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
int align_offset = ((uintptr_t)src0) & 15;
const uint8_t *end0 = src0 + 256;
const uint8_t *end1 = src1 + 256;
__m128i xmm_src0, xmm_src1, xmm_cmp;
/* Do the first load unaligned, then all subsequent iterations have at least
 * one aligned load. Sadly, aligning both loads is probably unrealistic */
xmm_src0 = _mm_loadu_si128((__m128i*)src0);
xmm_src1 = _mm_loadu_si128((__m128i*)src1);
xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
unsigned mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
/* Compiler _may_ turn this branch into a ptest + movemask,
* since a lot of those uops are shared and fused */
if (mask != 0xFFFF) {
uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
return len + match_byte;
}
int align_adv = 16 - align_offset;
len += align_adv;
src0 += align_adv;
src1 += align_adv;
/* Do a flooring division (should just be a shift right) */
int num_iter = (256 - len) / 16;
for (int i = 0; i < num_iter; ++i) {
xmm_src0 = _mm_load_si128((__m128i*)src0);
xmm_src1 = _mm_loadu_si128((__m128i*)src1);
xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
/* Compiler _may_ turn this branch into a ptest + movemask,
* since a lot of those uops are shared and fused */
if (mask != 0xFFFF) {
uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
return len + match_byte;
}
len += 16, src0 += 16, src1 += 16;
}
if (align_offset) {
src0 = end0 - 16;
src1 = end1 - 16;
len = 256 - 16;
xmm_src0 = _mm_loadu_si128((__m128i*)src0);
xmm_src1 = _mm_loadu_si128((__m128i*)src1);
xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
if (mask != 0xFFFF) {
uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
return len + match_byte;
}
}
return 256;
}
Z_INTERNAL uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1) {
return compare256_sse2_static(src0, src1);
}
#define LONGEST_MATCH longest_match_sse2
#define COMPARE256 compare256_sse2_static
#include "match_tpl.h"
#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_sse2
#define COMPARE256 compare256_sse2_static
#include "match_tpl.h"
#endif

@ -0,0 +1,186 @@
/*
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
* instruction.
*
* A white paper describing this algorithm can be found at:
* doc/crc-pclmulqdq.pdf
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
* Copyright (C) 2016 Marian Beermann (support for initial value)
* Authors:
* Wajdi Feghali <wajdi.k.feghali@intel.com>
* Jim Guilford <james.guilford@intel.com>
* Vinodh Gopal <vinodh.gopal@intel.com>
* Erdinc Ozturk <erdinc.ozturk@intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef COPY
Z_INTERNAL void CRC32_FOLD_COPY(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
#else
Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
#endif
unsigned long algn_diff;
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
__m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
__m128i xmm_crc_part = _mm_setzero_si128();
#ifdef COPY
char ALIGNED_(16) partial_buf[16] = { 0 };
#else
__m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
int32_t first = init_crc != 0;
/* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31
* bytes of input is needed for the aligning load that occurs. If there's an initial CRC, to
* carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which
* by definition can be up to 15 bytes + one full vector load. */
assert(len >= 31 || first == 0);
#endif
crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
if (len < 16) {
#ifdef COPY
if (len == 0)
return;
memcpy(partial_buf, src, len);
xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
memcpy(dst, partial_buf, len);
#endif
goto partial;
}
algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
if (algn_diff) {
xmm_crc_part = _mm_loadu_si128((__m128i *)src);
#ifdef COPY
_mm_storeu_si128((__m128i *)dst, xmm_crc_part);
dst += algn_diff;
#else
XOR_INITIAL128(xmm_crc_part);
if (algn_diff < 4 && init_crc != 0) {
xmm_t0 = xmm_crc_part;
xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
src += 16;
len -= 16;
}
#endif
partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
src += algn_diff;
len -= algn_diff;
}
#ifdef X86_VPCLMULQDQ
if (len >= 256) {
#ifdef COPY
size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
dst += n;
#else
size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len,
xmm_initial, first);
first = 0;
#endif
len -= n;
src += n;
}
#endif
while (len >= 64) {
len -= 64;
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
xmm_t3 = _mm_load_si128((__m128i *)src + 3);
src += 64;
fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
#ifdef COPY
_mm_storeu_si128((__m128i *)dst, xmm_t0);
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
_mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
_mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
dst += 64;
#else
XOR_INITIAL128(xmm_t0);
#endif
xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
}
/*
* len = num bytes left - 64
*/
if (len >= 48) {
len -= 48;
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
src += 48;
#ifdef COPY
_mm_storeu_si128((__m128i *)dst, xmm_t0);
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
_mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
dst += 48;
#else
XOR_INITIAL128(xmm_t0);
#endif
fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
} else if (len >= 32) {
len -= 32;
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
src += 32;
#ifdef COPY
_mm_storeu_si128((__m128i *)dst, xmm_t0);
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
dst += 32;
#else
XOR_INITIAL128(xmm_t0);
#endif
fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
} else if (len >= 16) {
len -= 16;
xmm_t0 = _mm_load_si128((__m128i *)src);
src += 16;
#ifdef COPY
_mm_storeu_si128((__m128i *)dst, xmm_t0);
dst += 16;
#else
XOR_INITIAL128(xmm_t0);
#endif
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
}
partial:
if (len) {
memcpy(&xmm_crc_part, src, len);
#ifdef COPY
_mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);
memcpy(dst, partial_buf, len);
#endif
partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
}
crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
}
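
Before folding, the routine above advances src to a 16-byte boundary; algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF is the number of bytes needed to reach that boundary (zero when already aligned). A small sketch of just that computation (not part of zlib-ng):

/* Illustrative sketch: bytes needed to reach the next 16-byte boundary,
 * as computed at the top of the CRC32 fold routine. */
#include <stdint.h>
#include <stdio.h>

static unsigned align16_gap(uintptr_t addr) {
    return (unsigned)((16 - (addr & 0xF)) & 0xF);
}

int main(void) {
    printf("%u\n", align16_gap(0x1000)); /* 0  -- already aligned          */
    printf("%u\n", align16_gap(0x1001)); /* 15 -- one byte past a boundary */
    printf("%u\n", align16_gap(0x100C)); /* 4  -- next boundary is 0x1010  */
    return 0;
}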

@ -0,0 +1,107 @@
/* crc32_fold_vpclmulqdq_tpl.h -- VPCLMULQDQ-based CRC32 folding template.
* Copyright Wangyang Guo (wangyang.guo@intel.com)
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef COPY
static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) {
#else
static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len,
__m128i init_crc, int32_t first) {
__m512i zmm_initial = _mm512_zextsi128_si512(init_crc);
#endif
__m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
__m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
__m512i z0, z1, z2, z3;
size_t len_tmp = len;
const __m512i zmm_fold4 = _mm512_set4_epi32(
0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
const __m512i zmm_fold16 = _mm512_set4_epi32(
0x00000001, 0x1542778a, 0x00000001, 0x322d1430);
// zmm register init
zmm_crc0 = _mm512_setzero_si512();
zmm_t0 = _mm512_loadu_si512((__m512i *)src);
#ifndef COPY
XOR_INITIAL512(zmm_t0);
#endif
zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);
/* already have intermediate CRC in xmm registers
* fold4 with 4 xmm_crc to get zmm_crc0
*/
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
#ifdef COPY
_mm512_storeu_si512((__m512i *)dst, zmm_t0);
_mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1);
_mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2);
_mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3);
dst += 256;
#endif
len -= 256;
src += 256;
// fold-16 loops
while (len >= 256) {
zmm_t0 = _mm512_loadu_si512((__m512i *)src);
zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
zmm_crc1 = _mm512_ternarylogic_epi32(zmm_crc1, z1, zmm_t1, 0x96);
zmm_crc2 = _mm512_ternarylogic_epi32(zmm_crc2, z2, zmm_t2, 0x96);
zmm_crc3 = _mm512_ternarylogic_epi32(zmm_crc3, z3, zmm_t3, 0x96);
#ifdef COPY
_mm512_storeu_si512((__m512i *)dst, zmm_t0);
_mm512_storeu_si512((__m512i *)dst + 1, zmm_t1);
_mm512_storeu_si512((__m512i *)dst + 2, zmm_t2);
_mm512_storeu_si512((__m512i *)dst + 3, zmm_t3);
dst += 256;
#endif
len -= 256;
src += 256;
}
// zmm_crc[0,1,2,3] -> zmm_crc0
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc1, 0x96);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc2, 0x96);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc3, 0x96);
// zmm_crc0 -> xmm_crc[0, 1, 2, 3]
*xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
*xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
*xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
*xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);
return (len_tmp - len); // return n bytes processed
}
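
The 512-bit path above leans on _mm512_ternarylogic_epi32(x, y, z, 0x96) as a single-instruction three-way XOR: 0x96 is the 8-entry truth table of x ^ y ^ z. A scalar sketch that evaluates the same truth table per bit (not part of zlib-ng; the helper name is hypothetical):

/* Illustrative sketch: per-bit evaluation of an AVX-512 ternary-logic imm8,
 * confirming that imm8 = 0x96 reproduces a three-way XOR. */
#include <stdint.h>
#include <stdio.h>

static uint64_t ternlog64(uint64_t a, uint64_t b, uint64_t c, uint8_t imm) {
    uint64_t r = 0;
    for (int bit = 0; bit < 64; bit++) {
        unsigned idx = (unsigned)((((a >> bit) & 1) << 2) |
                                  (((b >> bit) & 1) << 1) |
                                   ((c >> bit) & 1));
        r |= (uint64_t)((imm >> idx) & 1) << bit;
    }
    return r;
}

int main(void) {
    uint64_t a = 0x0123456789abcdefULL, b = 0xfedcba9876543210ULL, c = 0x55aa55aa55aa55aaULL;
    printf("%d\n", ternlog64(a, b, c, 0x96) == (a ^ b ^ c)); /* prints 1 */
    return 0;
}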

@ -0,0 +1,30 @@
/*
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
* instruction.
*
* A white paper describing this algorithm can be found at:
* doc/crc-pclmulqdq.pdf
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
* Copyright (C) 2016 Marian Beermann (support for initial value)
* Authors:
* Wajdi Feghali <wajdi.k.feghali@intel.com>
* Jim Guilford <james.guilford@intel.com>
* Vinodh Gopal <vinodh.gopal@intel.com>
* Erdinc Ozturk <erdinc.ozturk@intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef X86_PCLMULQDQ_CRC
#define CRC32_FOLD_COPY crc32_fold_pclmulqdq_copy
#define CRC32_FOLD crc32_fold_pclmulqdq
#define CRC32_FOLD_RESET crc32_fold_pclmulqdq_reset
#define CRC32_FOLD_FINAL crc32_fold_pclmulqdq_final
#define CRC32 crc32_pclmulqdq
#include "crc32_pclmulqdq_tpl.h"
#endif
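
The wrapper above is an instance of zlib-ng's name-injection pattern: the function bodies live in a shared template header, and each backend #defines CRC32_FOLD, CRC32, etc. before including it, so the same code is compiled once per instruction-set extension under distinct names. A compressed single-file sketch of the same idea (not part of zlib-ng; the names and the function-generating macro are hypothetical, and the real code injects names into an #included header instead):

/* Illustrative sketch: stamp out one function body under several names,
 * analogous to crc32_fold_pclmulqdq vs. crc32_fold_vpclmulqdq. */
#include <stdio.h>

#define MAKE_SUM(NAME, STEP)                    \
    static int NAME(int n) {                    \
        int s = 0;                              \
        for (int i = 0; i < n; i += (STEP))     \
            s += i;                             \
        return s;                               \
    }

MAKE_SUM(sum_generic, 1)    /* "baseline" backend  */
MAKE_SUM(sum_unrolled, 2)   /* "optimized" backend */

int main(void) {
    printf("%d %d\n", sum_generic(10), sum_unrolled(10)); /* prints "45 20" */
    return 0;
}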

@ -0,0 +1,363 @@
/*
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
* instruction.
*
* A white paper describing this algorithm can be found at:
* doc/crc-pclmulqdq.pdf
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
* Copyright (C) 2016 Marian Beermann (support for initial value)
* Authors:
* Wajdi Feghali <wajdi.k.feghali@intel.com>
* Jim Guilford <james.guilford@intel.com>
* Vinodh Gopal <vinodh.gopal@intel.com>
* Erdinc Ozturk <erdinc.ozturk@intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include <immintrin.h>
#include <wmmintrin.h>
#include <smmintrin.h> // _mm_extract_epi32
#ifdef X86_VPCLMULQDQ
# include <immintrin.h>
#endif
#include "../../crc32_fold.h"
#include "../../crc32_braid_p.h"
#include "x86_intrins.h"
#include <assert.h>
#ifdef X86_VPCLMULQDQ
static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len, __m128i init_crc,
int32_t first);
static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len);
#endif
static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
__m128i x_tmp3;
__m128 ps_crc0, ps_crc3, ps_res;
x_tmp3 = *xmm_crc3;
*xmm_crc3 = *xmm_crc0;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_res = _mm_xor_ps(ps_crc0, ps_crc3);
*xmm_crc0 = *xmm_crc1;
*xmm_crc1 = *xmm_crc2;
*xmm_crc2 = x_tmp3;
*xmm_crc3 = _mm_castps_si128(ps_res);
}
static void fold_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
__m128i x_tmp3, x_tmp2;
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;
x_tmp3 = *xmm_crc3;
x_tmp2 = *xmm_crc2;
*xmm_crc3 = *xmm_crc1;
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_res31 = _mm_xor_ps(ps_crc3, ps_crc1);
*xmm_crc2 = *xmm_crc0;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_res20 = _mm_xor_ps(ps_crc0, ps_crc2);
*xmm_crc0 = x_tmp2;
*xmm_crc1 = x_tmp3;
*xmm_crc2 = _mm_castps_si128(ps_res20);
*xmm_crc3 = _mm_castps_si128(ps_res31);
}
static void fold_3(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
__m128i x_tmp3;
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;
x_tmp3 = *xmm_crc3;
*xmm_crc3 = *xmm_crc2;
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);
*xmm_crc2 = *xmm_crc1;
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_res21 = _mm_xor_ps(ps_crc1, ps_crc2);
*xmm_crc1 = *xmm_crc0;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_res10 = _mm_xor_ps(ps_crc0, ps_crc1);
*xmm_crc0 = x_tmp3;
*xmm_crc1 = _mm_castps_si128(ps_res10);
*xmm_crc2 = _mm_castps_si128(ps_res21);
*xmm_crc3 = _mm_castps_si128(ps_res32);
}
static void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
__m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
__m128 ps_t0, ps_t1, ps_t2, ps_t3;
__m128 ps_res0, ps_res1, ps_res2, ps_res3;
x_tmp0 = *xmm_crc0;
x_tmp1 = *xmm_crc1;
x_tmp2 = *xmm_crc2;
x_tmp3 = *xmm_crc3;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_t0 = _mm_castsi128_ps(x_tmp0);
ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_t1 = _mm_castsi128_ps(x_tmp1);
ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_t2 = _mm_castsi128_ps(x_tmp2);
ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_t3 = _mm_castsi128_ps(x_tmp3);
ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
*xmm_crc0 = _mm_castps_si128(ps_res0);
*xmm_crc1 = _mm_castps_si128(ps_res1);
*xmm_crc2 = _mm_castps_si128(ps_res2);
*xmm_crc3 = _mm_castps_si128(ps_res3);
}
static const unsigned ALIGNED_(32) pshufb_shf_table[60] = {
0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 2)/shr2 */
0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f, /* shl 13 (16 - 3)/shr3 */
0x87868584, 0x8b8a8988, 0x8f8e8d8c, 0x03020100, /* shl 12 (16 - 4)/shr4 */
0x88878685, 0x8c8b8a89, 0x008f8e8d, 0x04030201, /* shl 11 (16 - 5)/shr5 */
0x89888786, 0x8d8c8b8a, 0x01008f8e, 0x05040302, /* shl 10 (16 - 6)/shr6 */
0x8a898887, 0x8e8d8c8b, 0x0201008f, 0x06050403, /* shl 9 (16 - 7)/shr7 */
0x8b8a8988, 0x8f8e8d8c, 0x03020100, 0x07060504, /* shl 8 (16 - 8)/shr8 */
0x8c8b8a89, 0x008f8e8d, 0x04030201, 0x08070605, /* shl 7 (16 - 9)/shr9 */
0x8d8c8b8a, 0x01008f8e, 0x05040302, 0x09080706, /* shl 6 (16 -10)/shr10*/
0x8e8d8c8b, 0x0201008f, 0x06050403, 0x0a090807, /* shl 5 (16 -11)/shr11*/
0x8f8e8d8c, 0x03020100, 0x07060504, 0x0b0a0908, /* shl 4 (16 -12)/shr12*/
0x008f8e8d, 0x04030201, 0x08070605, 0x0c0b0a09, /* shl 3 (16 -13)/shr13*/
0x01008f8e, 0x05040302, 0x09080706, 0x0d0c0b0a, /* shl 2 (16 -14)/shr14*/
0x0201008f, 0x06050403, 0x0a090807, 0x0e0d0c0b /* shl 1 (16 -15)/shr15*/
};
static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2,
__m128i *xmm_crc3, __m128i *xmm_crc_part) {
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
const __m128i xmm_mask3 = _mm_set1_epi32((int32_t)0x80808080);
__m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
__m128i xmm_a0_0, xmm_a0_1;
__m128 ps_crc3, psa0_0, psa0_1, ps_res;
xmm_shl = _mm_load_si128((__m128i *)(pshufb_shf_table + (4 * (len - 1))));
xmm_shr = xmm_shl;
xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);
xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);
*xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
*xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);
*xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
*xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);
*xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
*xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);
*xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
*xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
*xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);
xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
psa0_0 = _mm_castsi128_ps(xmm_a0_0);
psa0_1 = _mm_castsi128_ps(xmm_a0_1);
ps_res = _mm_xor_ps(ps_crc3, psa0_0);
ps_res = _mm_xor_ps(ps_res, psa0_1);
*xmm_crc3 = _mm_castps_si128(ps_res);
}
static inline void crc32_fold_load(__m128i *fold, __m128i *fold0, __m128i *fold1, __m128i *fold2, __m128i *fold3) {
*fold0 = _mm_load_si128(fold + 0);
*fold1 = _mm_load_si128(fold + 1);
*fold2 = _mm_load_si128(fold + 2);
*fold3 = _mm_load_si128(fold + 3);
}
static inline void crc32_fold_save(__m128i *fold, const __m128i *fold0, const __m128i *fold1,
const __m128i *fold2, const __m128i *fold3) {
_mm_storeu_si128(fold + 0, *fold0);
_mm_storeu_si128(fold + 1, *fold1);
_mm_storeu_si128(fold + 2, *fold2);
_mm_storeu_si128(fold + 3, *fold3);
}
Z_INTERNAL uint32_t CRC32_FOLD_RESET(crc32_fold *crc) {
__m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
__m128i xmm_zero = _mm_setzero_si128();
crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_zero, &xmm_zero, &xmm_zero);
return 0;
}
#define ONCE(op) if (first) { first = 0; op; }
#define XOR_INITIAL128(where) ONCE(where = _mm_xor_si128(where, xmm_initial))
#ifdef X86_VPCLMULQDQ
# define XOR_INITIAL512(where) ONCE(where = _mm512_xor_si512(where, zmm_initial))
#endif
#ifdef X86_VPCLMULQDQ
# include "crc32_fold_vpclmulqdq_tpl.h"
#endif
#include "crc32_fold_pclmulqdq_tpl.h"
#define COPY
#ifdef X86_VPCLMULQDQ
# include "crc32_fold_vpclmulqdq_tpl.h"
#endif
#include "crc32_fold_pclmulqdq_tpl.h"
static const unsigned ALIGNED_(16) crc_k[] = {
0xccaa009e, 0x00000000, /* rk1 */
0x751997d0, 0x00000001, /* rk2 */
0xccaa009e, 0x00000000, /* rk5 */
0x63cd6124, 0x00000001, /* rk6 */
0xf7011640, 0x00000001, /* rk7 */
0xdb710640, 0x00000001 /* rk8 */
};
static const unsigned ALIGNED_(16) crc_mask[4] = {
0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
};
static const unsigned ALIGNED_(16) crc_mask2[4] = {
0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
};
Z_INTERNAL uint32_t CRC32_FOLD_FINAL(crc32_fold *crc) {
const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
__m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
__m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
/*
* k1
*/
crc_fold = _mm_load_si128((__m128i *)crc_k);
x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);
x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);
x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
/*
* k5
*/
crc_fold = _mm_load_si128((__m128i *)(crc_k + 4));
xmm_crc0 = xmm_crc3;
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
xmm_crc0 = xmm_crc3;
xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);
/*
* k7
*/
xmm_crc1 = xmm_crc3;
xmm_crc2 = xmm_crc3;
crc_fold = _mm_load_si128((__m128i *)(crc_k + 8));
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);
xmm_crc2 = xmm_crc3;
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
crc->value = ~((uint32_t)_mm_extract_epi32(xmm_crc3, 2));
return crc->value;
}
Z_INTERNAL uint32_t CRC32(uint32_t crc32, const uint8_t *buf, size_t len) {
/* For lens < 64, crc32_braid method is faster. The CRC32 instruction for
* these short lengths might also prove to be effective */
if (len < 64)
return PREFIX(crc32_braid)(crc32, buf, len);
crc32_fold ALIGNED_(16) crc_state;
CRC32_FOLD_RESET(&crc_state);
CRC32_FOLD(&crc_state, buf, len, crc32);
return CRC32_FOLD_FINAL(&crc_state);
}

@ -0,0 +1,17 @@
/* crc32_vpclmulqdq.c -- VPCLMULQDQ-based CRC32 folding implementation.
* Copyright Wangyang Guo (wangyang.guo@intel.com)
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
#define X86_VPCLMULQDQ
#define CRC32_FOLD_COPY crc32_fold_vpclmulqdq_copy
#define CRC32_FOLD crc32_fold_vpclmulqdq
#define CRC32_FOLD_RESET crc32_fold_vpclmulqdq_reset
#define CRC32_FOLD_FINAL crc32_fold_vpclmulqdq_final
#define CRC32 crc32_vpclmulqdq
#include "crc32_pclmulqdq_tpl.h"
#endif

@ -0,0 +1,24 @@
/* insert_string_sse42.c -- insert_string integer hash variant using SSE4.2's CRC instructions
*
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*
*/
#ifdef X86_SSE42
#include "../../zbuild.h"
#include <nmmintrin.h>
#include "../../deflate.h"
#define HASH_CALC(s, h, val)\
h = _mm_crc32_u32(h, val)
#define HASH_CALC_VAR h
#define HASH_CALC_VAR_INIT uint32_t h = 0
#define UPDATE_HASH update_hash_sse42
#define INSERT_STRING insert_string_sse42
#define QUICK_INSERT_STRING quick_insert_string_sse42
#include "../../insert_string_tpl.h"
#endif
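
HASH_CALC above maps a 4-byte window onto a hash bucket with the SSE4.2 CRC32 instruction, which is both fast and well distributed. A standalone sketch of that hash (not part of zlib-ng; build with -msse4.2, the helper name is hypothetical):

/* Illustrative sketch: hash a 4-byte window with _mm_crc32_u32, as
 * insert_string_sse42 does via HASH_CALC. */
#include <nmmintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t hash4(const uint8_t *p) {
    uint32_t val;
    memcpy(&val, p, sizeof(val));     /* unaligned-safe 4-byte load */
    return _mm_crc32_u32(0, val);     /* h = CRC32-C(0, window)     */
}

int main(void) {
    const uint8_t window[] = "abcdabcd";
    /* Equal 4-byte strings hash equally, which is what the match finder needs. */
    printf("%d\n", hash4(window) == hash4(window + 4)); /* prints 1 */
    return 0;
}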

@ -0,0 +1,39 @@
/*
* AVX2 optimized hash slide, based on Intel's slide_sse implementation
*
* Copyright (C) 2017 Intel Corporation
* Authors:
* Arjan van de Ven <arjan@linux.intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
* Mika T. Lindqvist <postmaster@raasu.org>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../deflate.h"
#include <immintrin.h>
static inline void slide_hash_chain(Pos *table, uint32_t entries, const __m256i wsize) {
table += entries;
table -= 16;
do {
__m256i value, result;
value = _mm256_loadu_si256((__m256i *)table);
result = _mm256_subs_epu16(value, wsize);
_mm256_storeu_si256((__m256i *)table, result);
table -= 16;
entries -= 16;
} while (entries > 0);
}
Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
uint16_t wsize = (uint16_t)s->w_size;
const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);
slide_hash_chain(s->head, HASH_SIZE, ymm_wsize);
slide_hash_chain(s->prev, wsize, ymm_wsize);
}
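
The slide relies on saturating unsigned subtraction: positions still inside the window move back by w_size, while anything older clamps to zero instead of wrapping around. A scalar sketch of the _mm256_subs_epu16 semantics (not part of zlib-ng):

/* Illustrative sketch: scalar equivalent of the per-lane saturating subtract
 * used by slide_hash_avx2 / slide_hash_sse2. */
#include <stdint.h>
#include <stdio.h>

static uint16_t subs_epu16(uint16_t pos, uint16_t wsize) {
    return (uint16_t)(pos > wsize ? pos - wsize : 0);
}

int main(void) {
    uint16_t wsize = 32768;
    printf("%u\n", subs_epu16(40000, wsize)); /* 7232 -- still in the window   */
    printf("%u\n", subs_epu16(12345, wsize)); /* 0    -- slid out, now "empty" */
    return 0;
}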

@ -0,0 +1,62 @@
/*
* SSE optimized hash slide
*
* Copyright (C) 2017 Intel Corporation
* Authors:
* Arjan van de Ven <arjan@linux.intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../deflate.h"
#include <immintrin.h>
#include <assert.h>
static inline void slide_hash_chain(Pos *table0, Pos *table1, uint32_t entries0,
uint32_t entries1, const __m128i wsize) {
uint32_t entries;
Pos *table;
__m128i value0, value1, result0, result1;
int on_chain = 0;
next_chain:
table = (on_chain) ? table1 : table0;
entries = (on_chain) ? entries1 : entries0;
table += entries;
table -= 16;
/* ZALLOC allocates this pointer unless the user chose a custom allocator.
* Our allocator returns pointers aligned to 64-byte boundaries */
do {
value0 = _mm_load_si128((__m128i *)table);
value1 = _mm_load_si128((__m128i *)(table + 8));
result0 = _mm_subs_epu16(value0, wsize);
result1 = _mm_subs_epu16(value1, wsize);
_mm_store_si128((__m128i *)table, result0);
_mm_store_si128((__m128i *)(table + 8), result1);
table -= 16;
entries -= 16;
} while (entries > 0);
++on_chain;
if (on_chain > 1) {
return;
} else {
goto next_chain;
}
}
Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
uint16_t wsize = (uint16_t)s->w_size;
const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);
assert(((uintptr_t)s->head & 15) == 0);
assert(((uintptr_t)s->prev & 15) == 0);
slide_hash_chain(s->head, s->prev, HASH_SIZE, wsize, xmm_wsize);
}

@ -0,0 +1,97 @@
/* x86_features.c - x86 feature check
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
* Author:
* Jim Kukunas
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "x86_features.h"
#ifdef _MSC_VER
# include <intrin.h>
#else
// Newer versions of GCC and clang come with cpuid.h
# include <cpuid.h>
#endif
#include <string.h>
static inline void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#ifdef _MSC_VER
unsigned int registers[4];
__cpuid((int *)registers, info);
*eax = registers[0];
*ebx = registers[1];
*ecx = registers[2];
*edx = registers[3];
#else
__cpuid(info, *eax, *ebx, *ecx, *edx);
#endif
}
static inline void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#ifdef _MSC_VER
unsigned int registers[4];
__cpuidex((int *)registers, info, subinfo);
*eax = registers[0];
*ebx = registers[1];
*ecx = registers[2];
*edx = registers[3];
#else
__cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx);
#endif
}
static inline uint64_t xgetbv(unsigned int xcr) {
#ifdef _MSC_VER
return _xgetbv(xcr);
#else
uint32_t eax, edx;
__asm__ ( ".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(xcr));
return (uint64_t)(edx) << 32 | eax;
#endif
}
void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
unsigned eax, ebx, ecx, edx;
unsigned maxbasic;
cpuid(0, &maxbasic, &ebx, &ecx, &edx);
cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);
features->has_sse2 = edx & 0x4000000;
features->has_ssse3 = ecx & 0x200;
features->has_sse42 = ecx & 0x100000;
features->has_pclmulqdq = ecx & 0x2;
if (ecx & 0x08000000) {
uint64_t xfeature = xgetbv(0);
features->has_os_save_ymm = ((xfeature & 0x06) == 0x06);
features->has_os_save_zmm = ((xfeature & 0xe6) == 0xe6);
}
if (maxbasic >= 7) {
cpuidex(7, 0, &eax, &ebx, &ecx, &edx);
// check BMI1 bit
// Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
features->has_vpclmulqdq = ecx & 0x400;
// check AVX2 bit if the OS supports saving YMM registers
if (features->has_os_save_ymm) {
features->has_avx2 = ebx & 0x20;
}
// check AVX512 bits if the OS supports saving ZMM registers
if (features->has_os_save_zmm) {
features->has_avx512 = ebx & 0x00010000;
features->has_avx512vnni = ecx & 0x800;
}
}
}
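
The bit positions tested above come straight from CPUID leaf 1 (and leaf 7 for the AVX extensions). A standalone sketch using GCC/Clang's cpuid.h helper that tests the same leaf-1 bits as x86_check_features (not part of zlib-ng):

/* Illustrative sketch: query CPUID leaf 1 and test the same SSE2/SSE4.2/
 * PCLMULQDQ bits that x86_check_features uses. */
#include <cpuid.h>
#include <stdio.h>

int main(void) {
    unsigned eax, ebx, ecx, edx;
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return 1;
    printf("sse2:      %d\n", !!(edx & 0x4000000)); /* EDX bit 26 */
    printf("sse4.2:    %d\n", !!(ecx & 0x100000));  /* ECX bit 20 */
    printf("pclmulqdq: %d\n", !!(ecx & 0x2));       /* ECX bit 1  */
    return 0;
}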

@ -0,0 +1,24 @@
/* x86_features.h -- check for CPU features
* Copyright (C) 2013 Intel Corporation Jim Kukunas
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef X86_FEATURES_H_
#define X86_FEATURES_H_
struct x86_cpu_features {
int has_avx2;
int has_avx512;
int has_avx512vnni;
int has_sse2;
int has_ssse3;
int has_sse42;
int has_pclmulqdq;
int has_vpclmulqdq;
int has_os_save_ymm;
int has_os_save_zmm;
};
void Z_INTERNAL x86_check_features(struct x86_cpu_features *features);
#endif /* X86_FEATURES_H_ */

@ -0,0 +1,87 @@
#ifndef X86_INTRINS_H
#define X86_INTRINS_H
/* Unfortunately GCC didn't support these things until version 10.
* Similarly, AppleClang didn't support them in Xcode 9.2 but did in 9.3.
*/
#ifdef __AVX2__
#include <immintrin.h>
#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10) \
|| (defined(__apple_build_version__) && __apple_build_version__ < 9020039)
static inline __m256i _mm256_zextsi128_si256(__m128i a) {
__m128i r;
__asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
return _mm256_castsi128_si256(r);
}
#ifdef __AVX512F__
static inline __m512i _mm512_zextsi128_si512(__m128i a) {
__m128i r;
__asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
return _mm512_castsi128_si512(r);
}
#endif // __AVX512F__
#endif // gcc/AppleClang version test
#endif // __AVX2__
/* GCC <9 is missing some AVX512 intrinsics.
*/
#ifdef __AVX512F__
#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 9)
#include <immintrin.h>
#define PACK(c0, c1, c2, c3) (((int)(unsigned char)(c0) << 24) | ((int)(unsigned char)(c1) << 16) | \
((int)(unsigned char)(c2) << 8) | ((int)(unsigned char)(c3)))
static inline __m512i _mm512_set_epi8(char __q63, char __q62, char __q61, char __q60,
char __q59, char __q58, char __q57, char __q56,
char __q55, char __q54, char __q53, char __q52,
char __q51, char __q50, char __q49, char __q48,
char __q47, char __q46, char __q45, char __q44,
char __q43, char __q42, char __q41, char __q40,
char __q39, char __q38, char __q37, char __q36,
char __q35, char __q34, char __q33, char __q32,
char __q31, char __q30, char __q29, char __q28,
char __q27, char __q26, char __q25, char __q24,
char __q23, char __q22, char __q21, char __q20,
char __q19, char __q18, char __q17, char __q16,
char __q15, char __q14, char __q13, char __q12,
char __q11, char __q10, char __q09, char __q08,
char __q07, char __q06, char __q05, char __q04,
char __q03, char __q02, char __q01, char __q00) {
return _mm512_set_epi32(PACK(__q63, __q62, __q61, __q60), PACK(__q59, __q58, __q57, __q56),
PACK(__q55, __q54, __q53, __q52), PACK(__q51, __q50, __q49, __q48),
PACK(__q47, __q46, __q45, __q44), PACK(__q43, __q42, __q41, __q40),
PACK(__q39, __q38, __q37, __q36), PACK(__q35, __q34, __q33, __q32),
PACK(__q31, __q30, __q29, __q28), PACK(__q27, __q26, __q25, __q24),
PACK(__q23, __q22, __q21, __q20), PACK(__q19, __q18, __q17, __q16),
PACK(__q15, __q14, __q13, __q12), PACK(__q11, __q10, __q09, __q08),
PACK(__q07, __q06, __q05, __q04), PACK(__q03, __q02, __q01, __q00));
}
#undef PACK
#endif // gcc version test
#endif // __AVX512F__
/* Missing zero-extension AVX and AVX512 intrinsics.
* Fixed in Microsoft Visual Studio 2017 version 15.7
* https://developercommunity.visualstudio.com/t/missing-zero-extension-avx-and-avx512-intrinsics/175737
*/
#if defined(_MSC_VER) && _MSC_VER < 1914
#ifdef __AVX2__
static inline __m256i _mm256_zextsi128_si256(__m128i a) {
return _mm256_inserti128_si256(_mm256_setzero_si256(), a, 0);
}
#endif // __AVX2__
#ifdef __AVX512F__
static inline __m512i _mm512_zextsi128_si512(__m128i a) {
return _mm512_inserti32x4(_mm512_setzero_si512(), a, 0);
}
#endif // __AVX512F__
#endif // defined(_MSC_VER) && _MSC_VER < 1914
#endif // include guard X86_INTRINS_H
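
The zero-extending casts shimmed above matter because _mm256_castsi128_si256 and _mm512_castsi128_si512 leave the upper bits undefined, whereas the _zext variants guarantee they are zero, which the CRC folding code depends on when it XORs the initial CRC into a wider register. A tiny sketch (not part of zlib-ng; build with -mavx2 on a compiler that provides the intrinsic or the fallback above):

/* Illustrative sketch: the upper lane of a zero-extended __m128i is
 * guaranteed to be zero, unlike with a plain cast. */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m128i lo = _mm_set1_epi32(7);
    __m256i z  = _mm256_zextsi128_si256(lo);   /* upper 128 bits are zeroed */
    printf("upper element = %d\n", _mm256_extract_epi32(z, 7)); /* prints 0 */
    return 0;
}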

@ -0,0 +1,42 @@
/* chunkset.c -- inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
typedef uint64_t chunk_t;
#define CHUNK_SIZE 8
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
uint8_t *dest = (uint8_t *)chunk;
memcpy(dest, from, sizeof(uint32_t));
memcpy(dest+4, from, sizeof(uint32_t));
}
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
memcpy(chunk, from, sizeof(uint64_t));
}
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
memcpy(chunk, (uint8_t *)s, sizeof(uint64_t));
}
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
memcpy(out, chunk, sizeof(uint64_t));
}
#define CHUNKSIZE chunksize_c
#define CHUNKCOPY chunkcopy_c
#define CHUNKUNROLL chunkunroll_c
#define CHUNKMEMSET chunkmemset_c
#define CHUNKMEMSET_SAFE chunkmemset_safe_c
#include "chunkset_tpl.h"
#define INFLATE_FAST inflate_fast_c
#include "inffast_tpl.h"

@ -0,0 +1,200 @@
/* chunkset_tpl.h -- inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include <stdlib.h>
#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
extern uint8_t* chunkmemset_ssse3(uint8_t *out, unsigned dist, unsigned len);
#endif
/* Returns the chunk size */
Z_INTERNAL uint32_t CHUNKSIZE(void) {
return sizeof(chunk_t);
}
/* Behave like memcpy, but assume that it's OK to overwrite at least
chunk_t bytes of output even if the length is shorter than this,
that the length is non-zero, and that `from` lags `out` by at least
sizeof chunk_t bytes (or that they don't overlap at all or simply that
the distance is less than the length of the copy).
Aside from better memory bus utilisation, this means that short copies
(chunk_t bytes or fewer) will fall straight through the loop
without iteration, which will hopefully make the branch prediction more
reliable. */
#ifndef HAVE_CHUNKCOPY
Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
Assert(len > 0, "chunkcopy should never have a length 0");
chunk_t chunk;
int32_t align = ((len - 1) % sizeof(chunk_t)) + 1;
loadchunk(from, &chunk);
storechunk(out, &chunk);
out += align;
from += align;
len -= align;
while (len > 0) {
loadchunk(from, &chunk);
storechunk(out, &chunk);
out += sizeof(chunk_t);
from += sizeof(chunk_t);
len -= sizeof(chunk_t);
}
return out;
}
#endif
/* Perform short copies until distance can be rewritten as being at least
sizeof chunk_t.
This assumes that it's OK to overwrite at least the first
2*sizeof(chunk_t) bytes of output even if the copy is shorter than this.
This assumption holds because inflate_fast() starts every iteration with at
least 258 bytes of output space available (258 being the maximum length
output from a single token; see inflate_fast()'s assumptions below). */
#ifndef HAVE_CHUNKUNROLL
Z_INTERNAL uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) {
unsigned char const *from = out - *dist;
chunk_t chunk;
while (*dist < *len && *dist < sizeof(chunk_t)) {
loadchunk(from, &chunk);
storechunk(out, &chunk);
out += *dist;
*len -= *dist;
*dist += *dist;
}
return out;
}
#endif
#ifndef HAVE_CHUNK_MAG
/* Loads a "magazine" chunk filled with as many copies of the dist-length pattern as fit */
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
/* This code takes string of length dist from "from" and repeats
* it for as many times as can fit in a chunk_t (vector register) */
uint32_t cpy_dist;
uint32_t bytes_remaining = sizeof(chunk_t);
chunk_t chunk_load;
uint8_t *cur_chunk = (uint8_t *)&chunk_load;
while (bytes_remaining) {
cpy_dist = MIN(dist, bytes_remaining);
memcpy(cur_chunk, buf, cpy_dist);
bytes_remaining -= cpy_dist;
cur_chunk += cpy_dist;
/* This allows us to bypass an expensive integer division since we're effectively
* counting in this loop, anyway */
*chunk_rem = cpy_dist;
}
return chunk_load;
}
#endif
/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST.
Return OUT + LEN. */
Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
/* Debug performance related issues when len < sizeof(uint64_t):
Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
Assert(dist > 0, "chunkmemset cannot have a distance 0");
/* Only AVX2 */
#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
if (len <= 16) {
return chunkmemset_ssse3(out, dist, len);
}
#endif
uint8_t *from = out - dist;
if (dist == 1) {
memset(out, *from, len);
return out + len;
} else if (dist > sizeof(chunk_t)) {
return CHUNKCOPY(out, out - dist, len);
}
chunk_t chunk_load;
uint32_t chunk_mod = 0;
/* TODO: possibly build up a permutation table for this if not an even modulus */
#ifdef HAVE_CHUNKMEMSET_2
if (dist == 2) {
chunkmemset_2(from, &chunk_load);
} else
#endif
#ifdef HAVE_CHUNKMEMSET_4
if (dist == 4) {
chunkmemset_4(from, &chunk_load);
} else
#endif
#ifdef HAVE_CHUNKMEMSET_8
if (dist == 8) {
chunkmemset_8(from, &chunk_load);
} else if (dist == sizeof(chunk_t)) {
loadchunk(from, &chunk_load);
} else
#endif
{
chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist);
}
/* If we're lucky and dist divides our vector length evenly,
* we can do two stores per loop iteration, which for most ISAs, especially x86, is beneficial */
if (chunk_mod == 0) {
while (len >= (2 * sizeof(chunk_t))) {
storechunk(out, &chunk_load);
storechunk(out + sizeof(chunk_t), &chunk_load);
out += 2 * sizeof(chunk_t);
len -= 2 * sizeof(chunk_t);
}
}
/* If we don't have a "dist" length that divides evenly into a vector
* register, we can write the whole vector register but we need only
* advance by the amount of the whole string that fits in our chunk_t.
* If we do divide evenly into the vector length, adv_amount = chunk_t size*/
uint32_t adv_amount = sizeof(chunk_t) - chunk_mod;
while (len >= sizeof(chunk_t)) {
storechunk(out, &chunk_load);
len -= adv_amount;
out += adv_amount;
}
if (len) {
memcpy(out, &chunk_load, len);
out += len;
}
return out;
}
Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
#if !defined(UNALIGNED64_OK)
# if !defined(UNALIGNED_OK)
static const uint32_t align_mask = 7;
# else
static const uint32_t align_mask = 3;
# endif
#endif
len = MIN(len, left);
uint8_t *from = out - dist;
#if !defined(UNALIGNED64_OK)
while (((uintptr_t)out & align_mask) && (len > 0)) {
*out++ = *from++;
--len;
--left;
}
#endif
if (left < (unsigned)(3 * sizeof(chunk_t))) {
while (len > 0) {
*out++ = *from++;
--len;
}
return out;
}
if (len)
return CHUNKMEMSET(out, dist, len);
return out;
}
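
For distances that are not 1, 2, 4, 8 or the chunk size, GET_CHUNK_MAG above packs as many copies of the pattern as fit into one chunk and reports the length of the final partial copy in chunk_rem; CHUNKMEMSET then advances by sizeof(chunk_t) - chunk_rem per store so the pattern stays phase-aligned. A worked sketch with an 8-byte "chunk" and dist = 3 (not part of zlib-ng):

/* Illustrative sketch: build the pattern "magazine" the way GET_CHUNK_MAG
 * does and show the resulting chunk_rem / advance amount. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CHUNK 8
#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
    const uint8_t pattern[3] = { 'x', 'y', 'z' };   /* dist == 3 */
    uint8_t chunk[CHUNK];
    uint32_t dist = 3, chunk_rem = 0, remaining = CHUNK;
    uint8_t *cur = chunk;
    while (remaining) {                 /* same loop shape as GET_CHUNK_MAG */
        uint32_t cpy = MIN(dist, remaining);
        memcpy(cur, pattern, cpy);
        remaining -= cpy;
        cur += cpy;
        chunk_rem = cpy;                /* length of the last (partial) copy */
    }
    /* chunk = "xyzxyzxy", chunk_rem = 2, so each store advances 8 - 2 = 6 bytes */
    printf("%.8s chunk_rem=%u adv=%u\n", (const char *)chunk, chunk_rem, CHUNK - chunk_rem);
    return 0;
}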

@ -0,0 +1,543 @@
# detect-intrinsics.cmake -- Detect compiler intrinsics support
# Licensed under the Zlib license, see LICENSE.md for details
macro(check_acle_compiler_flag)
if(MSVC)
# Both ARM and ARM64-targeting msvc support intrinsics, but
# ARM msvc is missing some intrinsics introduced with ARMv8, e.g. crc32
if(MSVC_C_ARCHITECTURE_ID STREQUAL "ARM64")
set(HAVE_ACLE_FLAG TRUE)
endif()
else()
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
set(ACLEFLAG "-march=armv8-a+crc" CACHE INTERNAL "Compiler option to enable ACLE support")
endif()
endif()
# Check whether compiler supports ACLE flag
set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
"int main() { return 0; }"
HAVE_ACLE_FLAG FAIL_REGEX "not supported")
if(NOT NATIVEFLAG AND NOT HAVE_ACLE_FLAG)
set(ACLEFLAG "-march=armv8-a+crc+simd" CACHE INTERNAL "Compiler option to enable ACLE support" FORCE)
# Check whether compiler supports ACLE flag
set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG}")
check_c_source_compiles(
"int main() { return 0; }"
HAVE_ACLE_FLAG2 FAIL_REGEX "not supported")
set(HAVE_ACLE_FLAG ${HAVE_ACLE_FLAG2} CACHE INTERNAL "Have compiler option to enable ACLE intrinsics" FORCE)
unset(HAVE_ACLE_FLAG2 CACHE) # Don't cache this internal variable
endif()
set(CMAKE_REQUIRED_FLAGS)
endif()
endmacro()
macro(check_armv6_compiler_flag)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
check_c_compiler_flag("-march=armv6" HAVE_MARCH_ARMV6)
if(HAVE_MARCH_ARMV6)
set(ARMV6FLAG "-march=armv6" CACHE INTERNAL "Compiler option to enable ARMv6 support")
endif()
endif()
endif()
# Check whether compiler supports ARMv6 inline asm
set(CMAKE_REQUIRED_FLAGS "${ARMV6FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
"unsigned int f(unsigned int a, unsigned int b) {
unsigned int c;
__asm__ __volatile__ ( \"uqsub16 %0, %1, %2\" : \"=r\" (c) : \"r\" (a), \"r\" (b) );
return (int)c;
}
int main(void) { return f(1,2); }"
HAVE_ARMV6_INLINE_ASM
)
# Check whether compiler supports ARMv6 intrinsics
check_c_source_compiles(
"#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <arm_acle.h>
#endif
unsigned int f(unsigned int a, unsigned int b) {
#if defined(_MSC_VER)
return _arm_uqsub16(a, b);
#else
return __uqsub16(a, b);
#endif
}
int main(void) { return 0; }"
HAVE_ARMV6_INTRIN
)
set(CMAKE_REQUIRED_FLAGS)
endmacro()
macro(check_avx512_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
else()
set(AVX512FLAG "/arch:AVX512")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
# For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
# instruction scheduling unless you specify a reasonable -mtune= target
set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
if(NOT MSVC)
check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
if(HAVE_CASCADE_LAKE)
set(AVX512FLAG "${AVX512FLAG} -mtune=cascadelake")
else()
set(AVX512FLAG "${AVX512FLAG} -mtune=skylake-avx512")
endif()
unset(HAVE_CASCADE_LAKE)
endif()
endif()
elseif(MSVC)
set(AVX512FLAG "/arch:AVX512")
endif()
# Check whether compiler supports AVX512 intrinsics
set(CMAKE_REQUIRED_FLAGS "${AVX512FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
"#include <immintrin.h>
__m512i f(__m512i y) {
__m512i x = _mm512_set1_epi8(2);
return _mm512_sub_epi8(x, y);
}
int main(void) { return 0; }"
HAVE_AVX512_INTRIN
)
# Evidently both GCC and clang were late to implement these
check_c_source_compiles(
"#include <immintrin.h>
__mmask16 f(__mmask16 x) { return _knot_mask16(x); }
int main(void) { return 0; }"
HAVE_MASK_INTRIN
)
set(CMAKE_REQUIRED_FLAGS)
endmacro()
macro(check_avx512vnni_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
set(AVX512VNNIFLAG "-mavx512f -mavx512bw -mavx512dq -mavx512vl -mavx512vnni")
else()
set(AVX512VNNIFLAG "/arch:AVX512")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni")
if(NOT MSVC)
check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
if(HAVE_CASCADE_LAKE)
set(AVX512VNNIFLAG "${AVX512VNNIFLAG} -mtune=cascadelake")
else()
set(AVX512VNNIFLAG "${AVX512VNNIFLAG} -mtune=skylake-avx512")
endif()
unset(HAVE_CASCADE_LAKE)
endif()
endif()
elseif(MSVC)
set(AVX512VNNIFLAG "/arch:AVX512")
endif()
# Check whether compiler supports AVX512vnni intrinsics
set(CMAKE_REQUIRED_FLAGS "${AVX512VNNIFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
"#include <immintrin.h>
__m512i f(__m512i x, __m512i y) {
__m512i z = _mm512_setzero_epi32();
return _mm512_dpbusd_epi32(z, x, y);
}
int main(void) { return 0; }"
HAVE_AVX512VNNI_INTRIN
)
set(CMAKE_REQUIRED_FLAGS)
endmacro()
macro(check_avx2_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
set(AVX2FLAG "-mavx2")
else()
set(AVX2FLAG "/arch:AVX2")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
set(AVX2FLAG "-mavx2")
endif()
elseif(MSVC)
set(AVX2FLAG "/arch:AVX2")
endif()
# Check whether compiler supports AVX2 intrinsics
set(CMAKE_REQUIRED_FLAGS "${AVX2FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
"#include <immintrin.h>
__m256i f(__m256i x) {
const __m256i y = _mm256_set1_epi16(1);
return _mm256_subs_epu16(x, y);
}
int main(void) { return 0; }"
HAVE_AVX2_INTRIN
)
set(CMAKE_REQUIRED_FLAGS)
endmacro()
macro(check_neon_compiler_flag)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
if("${ARCH}" MATCHES "aarch64")
set(NEONFLAG "-march=armv8-a+simd")
else()
set(NEONFLAG "-mfpu=neon")
endif()
endif()
endif()
# Check whether compiler supports NEON flag
set(CMAKE_REQUIRED_FLAGS "${NEONFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
"#if defined(_M_ARM64) || defined(_M_ARM64EC)
# include <arm64_neon.h>
#else
# include <arm_neon.h>
#endif
int main() { return 0; }"
NEON_AVAILABLE FAIL_REGEX "not supported")
set(CMAKE_REQUIRED_FLAGS)
endmacro()
macro(check_neon_ld4_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
if("${ARCH}" MATCHES "aarch64")
set(NEONFLAG "-march=armv8-a+simd")
else()
set(NEONFLAG "-mfpu=neon")
endif()
endif()
endif()
# Check whether compiler supports loading 4 neon vecs into a register range
set(CMAKE_REQUIRED_FLAGS "${NEONFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
"#if defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC))
# include <arm64_neon.h>
#else
# include <arm_neon.h>
#endif
int32x4x4_t f(int var[16]) { return vld1q_s32_x4(var); }
int main(void) { return 0; }"
NEON_HAS_LD4)
set(CMAKE_REQUIRED_FLAGS)
endmacro()
macro(check_pclmulqdq_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
set(PCLMULFLAG "-mpclmul")
endif()
endif()
# Check whether compiler supports PCLMULQDQ intrinsics
if(NOT (APPLE AND "${ARCH}" MATCHES "i386"))
# The pclmul code currently crashes on Mac in 32bit mode. Avoid for now.
set(CMAKE_REQUIRED_FLAGS "${PCLMULFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
"#include <immintrin.h>
#include <wmmintrin.h>
__m128i f(__m128i a, __m128i b) { return _mm_clmulepi64_si128(a, b, 0x10); }
int main(void) { return 0; }"
HAVE_PCLMULQDQ_INTRIN
)
set(CMAKE_REQUIRED_FLAGS)
else()
set(HAVE_PCLMULQDQ_INTRIN OFF)
endif()
endmacro()
macro(check_vpclmulqdq_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
set(VPCLMULFLAG "-mvpclmulqdq -mavx512f")
endif()
endif()
# Check whether compiler supports VPCLMULQDQ intrinsics
if(NOT (APPLE AND "${ARCH}" MATCHES "i386"))
set(CMAKE_REQUIRED_FLAGS "${VPCLMULFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
"#include <immintrin.h>
#include <wmmintrin.h>
__m512i f(__m512i a) {
__m512i b = _mm512_setzero_si512();
return _mm512_clmulepi64_epi128(a, b, 0x10);
}
int main(void) { return 0; }"
HAVE_VPCLMULQDQ_INTRIN
)
set(CMAKE_REQUIRED_FLAGS)
else()
set(HAVE_VPCLMULQDQ_INTRIN OFF)
endif()
endmacro()
macro(check_ppc_intrinsics)
# Check if compiler supports AltiVec
set(CMAKE_REQUIRED_FLAGS "-maltivec ${ZNOLTOFLAG}")
check_c_source_compiles(
"#include <altivec.h>
int main(void)
{
vector int a = vec_splats(0);
vector int b = vec_splats(0);
a = vec_add(a, b);
return 0;
}"
HAVE_ALTIVEC
)
set(CMAKE_REQUIRED_FLAGS)
if(HAVE_ALTIVEC)
set(PPCFLAGS "-maltivec")
endif()
set(CMAKE_REQUIRED_FLAGS "-maltivec -mno-vsx ${ZNOLTOFLAG}")
check_c_source_compiles(
"#include <altivec.h>
int main(void)
{
vector int a = vec_splats(0);
vector int b = vec_splats(0);
a = vec_add(a, b);
return 0;
}"
HAVE_NOVSX
)
set(CMAKE_REQUIRED_FLAGS)
if(HAVE_NOVSX)
set(PPCFLAGS "${PPCFLAGS} -mno-vsx")
endif()
# Check if we have what we need for AltiVec optimizations
set(CMAKE_REQUIRED_FLAGS "${PPCFLAGS} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
"#include <sys/auxv.h>
#ifdef __FreeBSD__
#include <machine/cpu.h>
#endif
int main() {
#ifdef __FreeBSD__
unsigned long hwcap;
elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
return (hwcap & PPC_FEATURE_HAS_ALTIVEC);
#else
return (getauxval(AT_HWCAP) & PPC_FEATURE_HAS_ALTIVEC);
#endif
}"
HAVE_VMX
)
set(CMAKE_REQUIRED_FLAGS)
endmacro()
macro(check_power8_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
set(POWER8FLAG "-mcpu=power8")
endif()
endif()
# Check if we have what we need for POWER8 optimizations
set(CMAKE_REQUIRED_FLAGS "${POWER8FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
"#include <sys/auxv.h>
#ifdef __FreeBSD__
#include <machine/cpu.h>
#endif
int main() {
#ifdef __FreeBSD__
unsigned long hwcap;
elf_aux_info(AT_HWCAP2, &hwcap, sizeof(hwcap));
return (hwcap & PPC_FEATURE2_ARCH_2_07);
#else
return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07);
#endif
}"
HAVE_POWER8_INTRIN
)
set(CMAKE_REQUIRED_FLAGS)
endmacro()
macro(check_rvv_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
set(RISCVFLAG "-march=rv64gcv")
endif()
endif()
# Check whether compiler supports RVV
set(CMAKE_REQUIRED_FLAGS "${RISCVFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
"#include <riscv_vector.h>
int main() {
return 0;
}"
HAVE_RVV_INTRIN
)
set(CMAKE_REQUIRED_FLAGS)
endmacro()
macro(check_s390_intrinsics)
check_c_source_compiles(
"#include <sys/auxv.h>
#ifndef HWCAP_S390_VXRS
#define HWCAP_S390_VXRS HWCAP_S390_VX
#endif
int main() {
return (getauxval(AT_HWCAP) & HWCAP_S390_VXRS);
}"
HAVE_S390_INTRIN
)
endmacro()
macro(check_power9_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
set(POWER9FLAG "-mcpu=power9")
endif()
endif()
# Check if we have what we need for POWER9 optimizations
set(CMAKE_REQUIRED_FLAGS "${POWER9FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
"#include <sys/auxv.h>
#ifdef __FreeBSD__
#include <machine/cpu.h>
#endif
int main() {
#ifdef __FreeBSD__
unsigned long hwcap;
elf_aux_info(AT_HWCAP2, &hwcap, sizeof(hwcap));
return (hwcap & PPC_FEATURE2_ARCH_3_00);
#else
return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_3_00);
#endif
}"
HAVE_POWER9_INTRIN
)
set(CMAKE_REQUIRED_FLAGS)
endmacro()
macro(check_sse2_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
set(SSE2FLAG "-msse2")
else()
set(SSE2FLAG "/arch:SSE2")
endif()
elseif(MSVC)
if(NOT "${ARCH}" MATCHES "x86_64")
set(SSE2FLAG "/arch:SSE2")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
set(SSE2FLAG "-msse2")
endif()
endif()
# Check whether compiler supports SSE2 intrinsics
set(CMAKE_REQUIRED_FLAGS "${SSE2FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
"#include <immintrin.h>
__m128i f(__m128i x, __m128i y) { return _mm_sad_epu8(x, y); }
int main(void) { return 0; }"
HAVE_SSE2_INTRIN
)
set(CMAKE_REQUIRED_FLAGS)
endmacro()
macro(check_ssse3_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
set(SSSE3FLAG "-mssse3")
else()
set(SSSE3FLAG "/arch:SSSE3")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
set(SSSE3FLAG "-mssse3")
endif()
endif()
# Check whether compiler supports SSSE3 intrinsics
set(CMAKE_REQUIRED_FLAGS "${SSSE3FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
"#include <immintrin.h>
__m128i f(__m128i u) {
__m128i v = _mm_set1_epi32(1);
return _mm_hadd_epi32(u, v);
}
int main(void) { return 0; }"
HAVE_SSSE3_INTRIN
)
endmacro()
macro(check_sse42_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
set(SSE42FLAG "-msse4.2")
else()
set(SSE42FLAG "/arch:SSE4.2")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
set(SSE42FLAG "-msse4.2")
endif()
endif()
# Check whether compiler supports SSE4.2 intrinsics
set(CMAKE_REQUIRED_FLAGS "${SSE42FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
"#include <nmmintrin.h>
unsigned int f(unsigned int a, unsigned int b) { return _mm_crc32_u32(a, b); }
int main(void) { return 0; }"
HAVE_SSE42_INTRIN
)
set(CMAKE_REQUIRED_FLAGS)
endmacro()
macro(check_vgfma_intrinsics)
if(NOT NATIVEFLAG)
set(VGFMAFLAG "-march=z13")
if(CMAKE_C_COMPILER_ID MATCHES "GNU")
set(VGFMAFLAG "${VGFMAFLAG} -mzarch")
endif()
if(CMAKE_C_COMPILER_ID MATCHES "Clang")
set(VGFMAFLAG "${VGFMAFLAG} -fzvector")
endif()
endif()
# Check whether compiler supports "VECTOR GALOIS FIELD MULTIPLY SUM AND ACCUMULATE" intrinsic
set(CMAKE_REQUIRED_FLAGS "${VGFMAFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
"#include <vecintrin.h>
int main(void) {
unsigned long long a __attribute__((vector_size(16))) = { 0 };
unsigned long long b __attribute__((vector_size(16))) = { 0 };
unsigned char c __attribute__((vector_size(16))) = { 0 };
c = vec_gfmsum_accum_128(a, b, c);
return c[0];
}"
HAVE_VGFMA_INTRIN FAIL_REGEX "not supported")
set(CMAKE_REQUIRED_FLAGS)
endmacro()
macro(check_xsave_intrinsics)
if(NOT NATIVEFLAG AND NOT MSVC)
set(XSAVEFLAG "-mxsave")
endif()
set(CMAKE_REQUIRED_FLAGS "${XSAVEFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
"#ifdef _MSC_VER
# include <intrin.h>
#else
# include <x86gprintrin.h>
#endif
unsigned int f(unsigned int a) { return (int) _xgetbv(a); }
int main(void) { return 0; }"
HAVE_XSAVE_INTRIN FAIL_REGEX "not supported")
set(CMAKE_REQUIRED_FLAGS)
endmacro()

@ -0,0 +1,19 @@
# fallback-macros.cmake -- CMake fallback macros
# Copyright (C) 2022 Nathan Moinvaziri
# Licensed under the Zlib license, see LICENSE.md for details
# CMake less than version 3.5.2
if(NOT COMMAND add_compile_options)
macro(add_compile_options options)
string(APPEND CMAKE_C_FLAGS ${options})
string(APPEND CMAKE_CXX_FLAGS ${options})
endmacro()
endif()
# CMake less than version 3.14
if(NOT COMMAND add_link_options)
macro(add_link_options options)
string(APPEND CMAKE_EXE_LINKER_FLAGS ${options})
string(APPEND CMAKE_SHARED_LINKER_FLAGS ${options})
endmacro()
endif()

@ -0,0 +1,180 @@
/* compare256.c -- 256 byte memory comparison with match length return
* Copyright (C) 2020 Nathan Moinvaziri
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "zutil_p.h"
#include "fallback_builtins.h"
/* ALIGNED, byte comparison */
static inline uint32_t compare256_c_static(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
do {
if (*src0 != *src1)
return len;
src0 += 1, src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src0 += 1, src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src0 += 1, src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src0 += 1, src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src0 += 1, src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src0 += 1, src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src0 += 1, src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src0 += 1, src1 += 1, len += 1;
} while (len < 256);
return 256;
}
Z_INTERNAL uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1) {
return compare256_c_static(src0, src1);
}
#define LONGEST_MATCH longest_match_c
#define COMPARE256 compare256_c_static
#include "match_tpl.h"
#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_c
#define COMPARE256 compare256_c_static
#include "match_tpl.h"
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
/* 16-bit unaligned integer comparison */
static inline uint32_t compare256_unaligned_16_static(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
do {
if (zng_memcmp_2(src0, src1) != 0)
return len + (*src0 == *src1);
src0 += 2, src1 += 2, len += 2;
if (zng_memcmp_2(src0, src1) != 0)
return len + (*src0 == *src1);
src0 += 2, src1 += 2, len += 2;
if (zng_memcmp_2(src0, src1) != 0)
return len + (*src0 == *src1);
src0 += 2, src1 += 2, len += 2;
if (zng_memcmp_2(src0, src1) != 0)
return len + (*src0 == *src1);
src0 += 2, src1 += 2, len += 2;
} while (len < 256);
return 256;
}
Z_INTERNAL uint32_t compare256_unaligned_16(const uint8_t *src0, const uint8_t *src1) {
return compare256_unaligned_16_static(src0, src1);
}
#define LONGEST_MATCH longest_match_unaligned_16
#define COMPARE256 compare256_unaligned_16_static
#include "match_tpl.h"
#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_unaligned_16
#define COMPARE256 compare256_unaligned_16_static
#include "match_tpl.h"
#ifdef HAVE_BUILTIN_CTZ
/* 32-bit unaligned integer comparison */
static inline uint32_t compare256_unaligned_32_static(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
do {
uint32_t sv, mv, diff;
memcpy(&sv, src0, sizeof(sv));
memcpy(&mv, src1, sizeof(mv));
diff = sv ^ mv;
if (diff) {
uint32_t match_byte = __builtin_ctz(diff) / 8;
return len + match_byte;
}
src0 += 4, src1 += 4, len += 4;
} while (len < 256);
return 256;
}
Z_INTERNAL uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1) {
return compare256_unaligned_32_static(src0, src1);
}
#define LONGEST_MATCH longest_match_unaligned_32
#define COMPARE256 compare256_unaligned_32_static
#include "match_tpl.h"
#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_unaligned_32
#define COMPARE256 compare256_unaligned_32_static
#include "match_tpl.h"
#endif
#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
/* UNALIGNED64_OK, 64-bit integer comparison */
static inline uint32_t compare256_unaligned_64_static(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
do {
uint64_t sv, mv, diff;
memcpy(&sv, src0, sizeof(sv));
memcpy(&mv, src1, sizeof(mv));
diff = sv ^ mv;
if (diff) {
uint64_t match_byte = __builtin_ctzll(diff) / 8;
return len + (uint32_t)match_byte;
}
src0 += 8, src1 += 8, len += 8;
} while (len < 256);
return 256;
}
Z_INTERNAL uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1) {
return compare256_unaligned_64_static(src0, src1);
}
#define LONGEST_MATCH longest_match_unaligned_64
#define COMPARE256 compare256_unaligned_64_static
#include "match_tpl.h"
#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_unaligned_64
#define COMPARE256 compare256_unaligned_64_static
#include "match_tpl.h"
#endif
#endif
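
The unaligned variants above turn the byte loop into a word XOR plus count-trailing-zeros: any set bit in sv ^ mv marks a mismatch, and ctz/8 gives the index of the first differing byte on a little-endian target (which the surrounding #if already requires). A minimal sketch (not part of zlib-ng):

/* Illustrative sketch: XOR + __builtin_ctz, as in compare256_unaligned_32. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    const uint8_t a[4] = { 0x11, 0x22, 0x33, 0x44 };
    const uint8_t b[4] = { 0x11, 0x22, 0x73, 0x44 };  /* differs at byte 2 */
    uint32_t sv, mv;
    memcpy(&sv, a, 4);
    memcpy(&mv, b, 4);
    uint32_t diff = sv ^ mv;                          /* set bits mark mismatches */
    printf("first differing byte: %d\n", __builtin_ctz(diff) / 8); /* prints 2 */
    return 0;
}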

@ -0,0 +1,134 @@
/* compare256_rle.h -- 256 byte run-length encoding comparison
* Copyright (C) 2022 Nathan Moinvaziri
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "fallback_builtins.h"
typedef uint32_t (*compare256_rle_func)(const uint8_t* src0, const uint8_t* src1);
/* ALIGNED, byte comparison */
static inline uint32_t compare256_rle_c(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
do {
if (*src0 != *src1)
return len;
src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src1 += 1, len += 1;
} while (len < 256);
return 256;
}
#ifdef UNALIGNED_OK
/* 16-bit unaligned integer comparison */
static inline uint32_t compare256_rle_unaligned_16(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
uint16_t src0_cmp, src1_cmp;
memcpy(&src0_cmp, src0, sizeof(src0_cmp));
do {
memcpy(&src1_cmp, src1, sizeof(src1_cmp));
if (src0_cmp != src1_cmp)
return len + (*src0 == *src1);
src1 += 2, len += 2;
memcpy(&src1_cmp, src1, sizeof(src1_cmp));
if (src0_cmp != src1_cmp)
return len + (*src0 == *src1);
src1 += 2, len += 2;
memcpy(&src1_cmp, src1, sizeof(src1_cmp));
if (src0_cmp != src1_cmp)
return len + (*src0 == *src1);
src1 += 2, len += 2;
memcpy(&src1_cmp, src1, sizeof(src1_cmp));
if (src0_cmp != src1_cmp)
return len + (*src0 == *src1);
src1 += 2, len += 2;
} while (len < 256);
return 256;
}
#ifdef HAVE_BUILTIN_CTZ
/* 32-bit unaligned integer comparison */
static inline uint32_t compare256_rle_unaligned_32(const uint8_t *src0, const uint8_t *src1) {
uint32_t sv, len = 0;
uint16_t src0_cmp;
memcpy(&src0_cmp, src0, sizeof(src0_cmp));
sv = ((uint32_t)src0_cmp << 16) | src0_cmp;
do {
uint32_t mv, diff;
memcpy(&mv, src1, sizeof(mv));
diff = sv ^ mv;
if (diff) {
uint32_t match_byte = __builtin_ctz(diff) / 8;
return len + match_byte;
}
src1 += 4, len += 4;
} while (len < 256);
return 256;
}
#endif
#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
/* 64-bit unaligned integer comparison */
static inline uint32_t compare256_rle_unaligned_64(const uint8_t *src0, const uint8_t *src1) {
uint32_t src0_cmp32, len = 0;
uint16_t src0_cmp;
uint64_t sv;
memcpy(&src0_cmp, src0, sizeof(src0_cmp));
src0_cmp32 = ((uint32_t)src0_cmp << 16) | src0_cmp;
sv = ((uint64_t)src0_cmp32 << 32) | src0_cmp32;
do {
uint64_t mv, diff;
memcpy(&mv, src1, sizeof(mv));
diff = sv ^ mv;
if (diff) {
uint64_t match_byte = __builtin_ctzll(diff) / 8;
return len + (uint32_t)match_byte;
}
src1 += 8, len += 8;
} while (len < 256);
return 256;
}
#endif
#endif

@ -0,0 +1,98 @@
/* compress.c -- compress a memory buffer
* Copyright (C) 1995-2005, 2014, 2016 Jean-loup Gailly, Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "zutil.h"
/* ===========================================================================
* Architecture-specific hooks.
*/
#ifdef S390_DFLTCC_DEFLATE
# include "arch/s390/dfltcc_common.h"
#else
/* Returns the upper bound on compressed data length based on uncompressed data length, assuming default settings.
* Zero means that arch-specific deflation code behaves identically to the regular zlib-ng algorithms. */
# define DEFLATE_BOUND_COMPLEN(source_len) 0
#endif
/* ===========================================================================
Compresses the source buffer into the destination buffer. The level
parameter has the same meaning as in deflateInit. sourceLen is the byte
length of the source buffer. Upon entry, destLen is the total size of the
destination buffer, which must be at least 0.1% larger than sourceLen plus
12 bytes. Upon exit, destLen is the actual size of the compressed buffer.
compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
memory, Z_BUF_ERROR if there was not enough room in the output buffer,
Z_STREAM_ERROR if the level parameter is invalid.
*/
int Z_EXPORT PREFIX(compress2)(unsigned char *dest, z_uintmax_t *destLen, const unsigned char *source,
z_uintmax_t sourceLen, int level) {
PREFIX3(stream) stream;
int err;
const unsigned int max = (unsigned int)-1;
z_size_t left;
left = *destLen;
*destLen = 0;
stream.zalloc = NULL;
stream.zfree = NULL;
stream.opaque = NULL;
err = PREFIX(deflateInit)(&stream, level);
if (err != Z_OK)
return err;
stream.next_out = dest;
stream.avail_out = 0;
stream.next_in = (z_const unsigned char *)source;
stream.avail_in = 0;
do {
if (stream.avail_out == 0) {
stream.avail_out = left > (unsigned long)max ? max : (unsigned int)left;
left -= stream.avail_out;
}
if (stream.avail_in == 0) {
stream.avail_in = sourceLen > (unsigned long)max ? max : (unsigned int)sourceLen;
sourceLen -= stream.avail_in;
}
err = PREFIX(deflate)(&stream, sourceLen ? Z_NO_FLUSH : Z_FINISH);
} while (err == Z_OK);
*destLen = stream.total_out;
PREFIX(deflateEnd)(&stream);
return err == Z_STREAM_END ? Z_OK : err;
}
/* ===========================================================================
*/
int Z_EXPORT PREFIX(compress)(unsigned char *dest, z_uintmax_t *destLen, const unsigned char *source, z_uintmax_t sourceLen) {
return PREFIX(compress2)(dest, destLen, source, sourceLen, Z_DEFAULT_COMPRESSION);
}
/* ===========================================================================
If the default memLevel or windowBits for deflateInit() is changed, then
this function needs to be updated.
*/
z_uintmax_t Z_EXPORT PREFIX(compressBound)(z_uintmax_t sourceLen) {
z_uintmax_t complen = DEFLATE_BOUND_COMPLEN(sourceLen);
if (complen > 0)
/* Architecture-specific code provided an upper bound. */
return complen + ZLIB_WRAPLEN;
#ifndef NO_QUICK_STRATEGY
return sourceLen /* The source size itself */
+ (sourceLen == 0 ? 1 : 0) /* Always at least one byte for any input */
+ (sourceLen < 9 ? 1 : 0) /* One extra byte for lengths less than 9 */
+ DEFLATE_QUICK_OVERHEAD(sourceLen) /* Source encoding overhead, padded to next full byte */
+ DEFLATE_BLOCK_OVERHEAD /* Deflate block overhead bytes */
+ ZLIB_WRAPLEN; /* zlib wrapper */
#else
return sourceLen + (sourceLen >> 4) + 7 + ZLIB_WRAPLEN;
#endif
}
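/* Illustrative usage sketch (prefixed names shown; under ZLIB_COMPAT these are
 * plain compressBound()/compress2(), and the exact integer types follow the
 * configured zconf header). A caller typically sizes the destination with
 * compressBound() and lets compress2() shrink *destLen to the actual size:
 *
 *   z_uintmax_t out_len = zng_compressBound(src_len);
 *   unsigned char *out = (unsigned char *)malloc((size_t)out_len);
 *   int err = zng_compress2(out, &out_len, src, src_len, Z_DEFAULT_COMPRESSION);
 *   // On Z_OK, out_len has been reduced to the actual compressed size.
 */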

@ -0,0 +1,23 @@
/* cpu_features.c -- CPU architecture feature check
* Copyright (C) 2017 Hans Kristian Rosbach
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "cpu_features.h"
#include <string.h>
Z_INTERNAL void cpu_check_features(struct cpu_features *features) {
memset(features, 0, sizeof(struct cpu_features));
#if defined(X86_FEATURES)
x86_check_features(&features->x86);
#elif defined(ARM_FEATURES)
arm_check_features(&features->arm);
#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
power_check_features(&features->power);
#elif defined(S390_FEATURES)
s390_check_features(&features->s390);
#elif defined(RISCV_FEATURES)
riscv_check_features(&features->riscv);
#endif
}
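/* Illustrative sketch of how this is consumed (see functable.c): the feature
 * bits filled in above are queried once when the dispatch table is first
 * initialized, along the lines of
 *
 *   struct cpu_features cf;
 *   cpu_check_features(&cf);
 *   #ifdef X86_AVX2
 *   if (cf.x86.has_avx2)
 *       ft.adler32 = &adler32_avx2;   // prefer the widest available kernel
 *   #endif
 *
 * with field names as defined by the per-architecture *_features.h headers. */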

@ -0,0 +1,303 @@
/* cpu_features.h -- CPU architecture feature check
* Copyright (C) 2017 Hans Kristian Rosbach
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef CPU_FEATURES_H_
#define CPU_FEATURES_H_
#include "adler32_fold.h"
#include "crc32_fold.h"
#if defined(X86_FEATURES)
# include "arch/x86/x86_features.h"
# include "fallback_builtins.h"
#elif defined(ARM_FEATURES)
# include "arch/arm/arm_features.h"
#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
# include "arch/power/power_features.h"
#elif defined(S390_FEATURES)
# include "arch/s390/s390_features.h"
#elif defined(RISCV_FEATURES)
# include "arch/riscv/riscv_features.h"
#endif
struct cpu_features {
#if defined(X86_FEATURES)
struct x86_cpu_features x86;
#elif defined(ARM_FEATURES)
struct arm_cpu_features arm;
#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
struct power_cpu_features power;
#elif defined(S390_FEATURES)
struct s390_cpu_features s390;
#elif defined(RISCV_FEATURES)
struct riscv_cpu_features riscv;
#else
char empty;
#endif
};
extern void cpu_check_features(struct cpu_features *features);
/* adler32 */
typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len);
extern uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);
#ifdef ARM_NEON
extern uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef PPC_VMX
extern uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef RISCV_RVV
extern uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef X86_SSSE3
extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef X86_AVX2
extern uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef X86_AVX512
extern uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef X86_AVX512VNNI
extern uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef POWER8_VSX
extern uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
#endif
/* adler32 folding */
#ifdef RISCV_RVV
extern uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_SSE42
extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX2
extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX512
extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX512VNNI
extern uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
/* CRC32 folding */
#ifdef X86_PCLMULQDQ_CRC
extern uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc);
extern void crc32_fold_pclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
extern void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
extern uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc);
extern uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
#endif
#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
extern uint32_t crc32_fold_vpclmulqdq_reset(crc32_fold *crc);
extern void crc32_fold_vpclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
extern void crc32_fold_vpclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
extern uint32_t crc32_fold_vpclmulqdq_final(crc32_fold *crc);
extern uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
#endif
/* memory chunking */
extern uint32_t chunksize_c(void);
extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#ifdef X86_SSE2
extern uint32_t chunksize_sse2(void);
extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef X86_SSSE3
extern uint8_t* chunkmemset_safe_ssse3(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef X86_AVX2
extern uint32_t chunksize_avx2(void);
extern uint8_t* chunkmemset_safe_avx2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef ARM_NEON
extern uint32_t chunksize_neon(void);
extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef POWER8_VSX
extern uint32_t chunksize_power8(void);
extern uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef RISCV_RVV
extern uint32_t chunksize_rvv(void);
extern uint8_t* chunkmemset_safe_rvv(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef ZLIB_COMPAT
typedef struct z_stream_s z_stream;
#else
typedef struct zng_stream_s zng_stream;
#endif
/* inflate fast loop */
extern void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
#ifdef X86_SSE2
extern void inflate_fast_sse2(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef X86_SSSE3
extern void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef X86_AVX2
extern void inflate_fast_avx2(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef ARM_NEON
extern void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef POWER8_VSX
extern void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef RISCV_RVV
extern void inflate_fast_rvv(PREFIX3(stream) *strm, uint32_t start);
#endif
/* CRC32 */
typedef uint32_t (*crc32_func)(uint32_t crc32, const uint8_t *buf, size_t len);
extern uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
#ifdef ARM_ACLE
extern uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len);
#elif defined(POWER8_VSX)
extern uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len);
#elif defined(S390_CRC32_VX)
extern uint32_t crc32_s390_vx(uint32_t crc, const uint8_t *buf, size_t len);
#endif
/* compare256 */
typedef uint32_t (*compare256_func)(const uint8_t *src0, const uint8_t *src1);
extern uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
extern uint32_t compare256_unaligned_16(const uint8_t *src0, const uint8_t *src1);
#ifdef HAVE_BUILTIN_CTZ
extern uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1);
#endif
#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
extern uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1);
#endif
#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
#endif
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
extern uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
#endif
#ifdef POWER9
extern uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1);
#endif
#ifdef RISCV_RVV
extern uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1);
#endif
#ifdef DEFLATE_H_
/* insert_string */
extern void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count);
#ifdef X86_SSE42
extern void insert_string_sse42(deflate_state *const s, const uint32_t str, uint32_t count);
#elif defined(ARM_ACLE)
extern void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count);
#endif
/* longest_match */
extern uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
extern uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
#ifdef HAVE_BUILTIN_CTZ
extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
#endif
#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
#endif
#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match);
#endif
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
extern uint32_t longest_match_neon(deflate_state *const s, Pos cur_match);
#endif
#ifdef POWER9
extern uint32_t longest_match_power9(deflate_state *const s, Pos cur_match);
#endif
#ifdef RISCV_RVV
extern uint32_t longest_match_rvv(deflate_state *const s, Pos cur_match);
#endif
/* longest_match_slow */
extern uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
extern uint32_t longest_match_slow_unaligned_16(deflate_state *const s, Pos cur_match);
extern uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_match);
#ifdef UNALIGNED64_OK
extern uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match);
#endif
#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match);
#endif
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
extern uint32_t longest_match_slow_neon(deflate_state *const s, Pos cur_match);
#endif
#ifdef POWER9
extern uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match);
#endif
#ifdef RISCV_RVV
extern uint32_t longest_match_slow_rvv(deflate_state *const s, Pos cur_match);
#endif
/* quick_insert_string */
extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str);
#ifdef X86_SSE42
extern Pos quick_insert_string_sse42(deflate_state *const s, const uint32_t str);
#elif defined(ARM_ACLE)
extern Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str);
#endif
/* slide_hash */
typedef void (*slide_hash_func)(deflate_state *s);
#ifdef X86_SSE2
extern void slide_hash_sse2(deflate_state *s);
#endif
#if defined(ARM_SIMD)
extern void slide_hash_armv6(deflate_state *s);
#endif
#if defined(ARM_NEON)
extern void slide_hash_neon(deflate_state *s);
#endif
#if defined(PPC_VMX)
extern void slide_hash_vmx(deflate_state *s);
#endif
#if defined(POWER8_VSX)
extern void slide_hash_power8(deflate_state *s);
#endif
#if defined(RISCV_RVV)
extern void slide_hash_rvv(deflate_state *s);
#endif
#ifdef X86_AVX2
extern void slide_hash_avx2(deflate_state *s);
#endif
/* update_hash */
extern uint32_t update_hash_c(deflate_state *const s, uint32_t h, uint32_t val);
#ifdef X86_SSE42
extern uint32_t update_hash_sse42(deflate_state *const s, uint32_t h, uint32_t val);
#elif defined(ARM_ACLE)
extern uint32_t update_hash_acle(deflate_state *const s, uint32_t h, uint32_t val);
#endif
#endif
#endif

@ -0,0 +1,267 @@
/* crc32_braid.c -- compute the CRC-32 of a data stream
* Copyright (C) 1995-2022 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*
* This interleaved implementation of a CRC makes use of pipelined multiple
* arithmetic-logic units, commonly found in modern CPU cores. It is due to
* Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
*/
#include "zbuild.h"
#include "zutil.h"
#include "functable.h"
#include "crc32_braid_p.h"
#include "crc32_braid_tbl.h"
/* ========================================================================= */
const uint32_t * Z_EXPORT PREFIX(get_crc_table)(void) {
return (const uint32_t *)crc_table;
}
#ifdef ZLIB_COMPAT
unsigned long Z_EXPORT PREFIX(crc32_z)(unsigned long crc, const unsigned char *buf, size_t len) {
if (buf == NULL) return 0;
return (unsigned long)functable.crc32((uint32_t)crc, buf, len);
}
#else
uint32_t Z_EXPORT PREFIX(crc32_z)(uint32_t crc, const unsigned char *buf, size_t len) {
if (buf == NULL) return 0;
return functable.crc32(crc, buf, len);
}
#endif
#ifdef ZLIB_COMPAT
unsigned long Z_EXPORT PREFIX(crc32)(unsigned long crc, const unsigned char *buf, unsigned int len) {
return (unsigned long)PREFIX(crc32_z)((uint32_t)crc, buf, len);
}
#else
uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t len) {
return PREFIX(crc32_z)(crc, buf, len);
}
#endif
/* ========================================================================= */
/*
A CRC of a message is computed on N braids of words in the message, where
each word consists of W bytes (4 or 8). If N is 3, for example, then three
running sparse CRCs are calculated respectively on each braid, at these
indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
This is done starting at a word boundary, and continues until as many blocks
of N * W bytes as are available have been processed. The results are combined
into a single CRC at the end. For this code, N must be in the range 1..6 and
W must be 4 or 8. The upper limit on N can be increased if desired by adding
more #if blocks, extending the patterns apparent in the code. In addition,
crc32 tables would need to be regenerated, if the maximum N value is increased.
N and W are chosen empirically by benchmarking the execution time on a given
processor. The choices for N and W below were based on testing on Intel Kaby
Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
Octeon II processors. The Intel, AMD, and ARM processors were all fastest
with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
They were all tested with either gcc or clang, all using the -O3 optimization
level. Your mileage may vary.
*/
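/* Worked example: with N == 5 and W == 8 (the x86-64/aarch64 default), each
 * block of N*W == 40 bytes is dealt out round-robin, so braid 0 processes
 * words 0, 5, 10, ..., braid 1 processes words 1, 6, 11, ..., and so on. The
 * five sparse CRCs advance independently each iteration, keeping the ALUs
 * busy, and are merged into a single CRC by crc_word() after the last block. */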
/* ========================================================================= */
#if BYTE_ORDER == LITTLE_ENDIAN
# define ZSWAPWORD(word) (word)
# define BRAID_TABLE crc_braid_table
#elif BYTE_ORDER == BIG_ENDIAN
# if W == 8
# define ZSWAPWORD(word) ZSWAP64(word)
# elif W == 4
# define ZSWAPWORD(word) ZSWAP32(word)
# endif
# define BRAID_TABLE crc_braid_big_table
#else
# error "No endian defined"
#endif
#define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8)
#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
/* ========================================================================= */
#ifdef W
/*
Return the CRC of the W bytes in the word_t data, taking the
least-significant byte of the word as the first byte of data, without any pre
or post conditioning. This is used to combine the CRCs of each braid.
*/
#if BYTE_ORDER == LITTLE_ENDIAN
static uint32_t crc_word(z_word_t data) {
int k;
for (k = 0; k < W; k++)
data = (data >> 8) ^ crc_table[data & 0xff];
return (uint32_t)data;
}
#elif BYTE_ORDER == BIG_ENDIAN
static z_word_t crc_word(z_word_t data) {
int k;
for (k = 0; k < W; k++)
data = (data << 8) ^
crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
return data;
}
#endif /* BYTE_ORDER */
#endif /* W */
/* ========================================================================= */
Z_INTERNAL uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len) {
Z_REGISTER uint32_t c;
/* Pre-condition the CRC */
c = (~crc) & 0xffffffff;
#ifdef W
/* If provided enough bytes, do a braided CRC calculation. */
if (len >= N * W + W - 1) {
size_t blks;
z_word_t const *words;
int k;
/* Compute the CRC up to a z_word_t boundary. */
while (len && ((uintptr_t)buf & (W - 1)) != 0) {
len--;
DO1;
}
/* Compute the CRC on as many N z_word_t blocks as are available. */
blks = len / (N * W);
len -= blks * N * W;
words = (z_word_t const *)buf;
z_word_t crc0, word0, comb;
#if N > 1
z_word_t crc1, word1;
#if N > 2
z_word_t crc2, word2;
#if N > 3
z_word_t crc3, word3;
#if N > 4
z_word_t crc4, word4;
#if N > 5
z_word_t crc5, word5;
#endif
#endif
#endif
#endif
#endif
/* Initialize the CRC for each braid. */
crc0 = ZSWAPWORD(c);
#if N > 1
crc1 = 0;
#if N > 2
crc2 = 0;
#if N > 3
crc3 = 0;
#if N > 4
crc4 = 0;
#if N > 5
crc5 = 0;
#endif
#endif
#endif
#endif
#endif
/* Process the first blks-1 blocks, computing the CRCs on each braid independently. */
while (--blks) {
/* Load the word for each braid into registers. */
word0 = crc0 ^ words[0];
#if N > 1
word1 = crc1 ^ words[1];
#if N > 2
word2 = crc2 ^ words[2];
#if N > 3
word3 = crc3 ^ words[3];
#if N > 4
word4 = crc4 ^ words[4];
#if N > 5
word5 = crc5 ^ words[5];
#endif
#endif
#endif
#endif
#endif
words += N;
/* Compute and update the CRC for each word. The loop should get unrolled. */
crc0 = BRAID_TABLE[0][word0 & 0xff];
#if N > 1
crc1 = BRAID_TABLE[0][word1 & 0xff];
#if N > 2
crc2 = BRAID_TABLE[0][word2 & 0xff];
#if N > 3
crc3 = BRAID_TABLE[0][word3 & 0xff];
#if N > 4
crc4 = BRAID_TABLE[0][word4 & 0xff];
#if N > 5
crc5 = BRAID_TABLE[0][word5 & 0xff];
#endif
#endif
#endif
#endif
#endif
for (k = 1; k < W; k++) {
crc0 ^= BRAID_TABLE[k][(word0 >> (k << 3)) & 0xff];
#if N > 1
crc1 ^= BRAID_TABLE[k][(word1 >> (k << 3)) & 0xff];
#if N > 2
crc2 ^= BRAID_TABLE[k][(word2 >> (k << 3)) & 0xff];
#if N > 3
crc3 ^= BRAID_TABLE[k][(word3 >> (k << 3)) & 0xff];
#if N > 4
crc4 ^= BRAID_TABLE[k][(word4 >> (k << 3)) & 0xff];
#if N > 5
crc5 ^= BRAID_TABLE[k][(word5 >> (k << 3)) & 0xff];
#endif
#endif
#endif
#endif
#endif
}
}
/* Process the last block, combining the CRCs of the N braids at the same time. */
comb = crc_word(crc0 ^ words[0]);
#if N > 1
comb = crc_word(crc1 ^ words[1] ^ comb);
#if N > 2
comb = crc_word(crc2 ^ words[2] ^ comb);
#if N > 3
comb = crc_word(crc3 ^ words[3] ^ comb);
#if N > 4
comb = crc_word(crc4 ^ words[4] ^ comb);
#if N > 5
comb = crc_word(crc5 ^ words[5] ^ comb);
#endif
#endif
#endif
#endif
#endif
words += N;
c = ZSWAPWORD(comb);
/* Update the pointer to the remaining bytes to process. */
buf = (const unsigned char *)words;
}
#endif /* W */
/* Complete the computation of the CRC on any remaining bytes. */
while (len >= 8) {
len -= 8;
DO8;
}
while (len) {
len--;
DO1;
}
/* Return the CRC, post-conditioned. */
return c ^ 0xffffffff;
}

@ -0,0 +1,57 @@
/* crc32_braid_comb.c -- compute the CRC-32 of a data stream
* Copyright (C) 1995-2022 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*
* This interleaved implementation of a CRC makes use of pipelined multiple
* arithmetic-logic units, commonly found in modern CPU cores. It is due to
* Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
*/
#include "zbuild.h"
#include "zutil.h"
#include "crc32_braid_p.h"
#include "crc32_braid_tbl.h"
#include "crc32_braid_comb_p.h"
/* ========================================================================= */
static uint32_t crc32_combine_(uint32_t crc1, uint32_t crc2, z_off64_t len2) {
return multmodp(x2nmodp(len2, 3), crc1) ^ crc2;
}
static uint32_t crc32_combine_gen_(z_off64_t len2) {
return x2nmodp(len2, 3);
}
static uint32_t crc32_combine_op_(uint32_t crc1, uint32_t crc2, const uint32_t op) {
return multmodp(op, crc1) ^ crc2;
}
/* ========================================================================= */
#ifdef ZLIB_COMPAT
unsigned long Z_EXPORT PREFIX(crc32_combine)(unsigned long crc1, unsigned long crc2, z_off_t len2) {
return (unsigned long)crc32_combine_((uint32_t)crc1, (uint32_t)crc2, len2);
}
unsigned long Z_EXPORT PREFIX4(crc32_combine)(unsigned long crc1, unsigned long crc2, z_off64_t len2) {
return (unsigned long)crc32_combine_((uint32_t)crc1, (uint32_t)crc2, len2);
}
unsigned long Z_EXPORT PREFIX(crc32_combine_gen)(z_off_t len2) {
return crc32_combine_gen_(len2);
}
unsigned long Z_EXPORT PREFIX4(crc32_combine_gen)(z_off64_t len2) {
return crc32_combine_gen_(len2);
}
unsigned long Z_EXPORT PREFIX(crc32_combine_op)(unsigned long crc1, unsigned long crc2, const unsigned long op) {
return (unsigned long)crc32_combine_op_((uint32_t)crc1, (uint32_t)crc2, (uint32_t)op);
}
#else
uint32_t Z_EXPORT PREFIX4(crc32_combine)(uint32_t crc1, uint32_t crc2, z_off64_t len2) {
return crc32_combine_(crc1, crc2, len2);
}
uint32_t Z_EXPORT PREFIX(crc32_combine_gen)(z_off64_t len2) {
return crc32_combine_gen_(len2);
}
uint32_t Z_EXPORT PREFIX(crc32_combine_op)(uint32_t crc1, uint32_t crc2, const uint32_t op) {
return crc32_combine_op_(crc1, crc2, op);
}
#endif
/* ========================================================================= */

@ -0,0 +1,42 @@
#ifndef CRC32_BRAID_COMB_P_H_
#define CRC32_BRAID_COMB_P_H_
/*
Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
reflected. For speed, this requires that a not be zero.
*/
static uint32_t multmodp(uint32_t a, uint32_t b) {
uint32_t m, p;
m = (uint32_t)1 << 31;
p = 0;
for (;;) {
if (a & m) {
p ^= b;
if ((a & (m - 1)) == 0)
break;
}
m >>= 1;
b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
}
return p;
}
/*
Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
initialized.
*/
static uint32_t x2nmodp(z_off64_t n, unsigned k) {
uint32_t p;
p = (uint32_t)1 << 31; /* x^0 == 1 */
while (n) {
if (n & 1)
p = multmodp(x2n_table[k & 31], p);
n >>= 1;
k++;
}
return p;
}
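/* Worked example: crc32_combine_() in crc32_braid_comb.c computes
 * multmodp(x2nmodp(len2, 3), crc1) ^ crc2, where x2nmodp(len2, 3) is
 * x^(8*len2) mod p(x), i.e. the effect of feeding len2 zero bytes after crc1.
 * As a result,
 *
 *   uint32_t crc_a  = zng_crc32(0, buf_a, len_a);
 *   uint32_t crc_b  = zng_crc32(0, buf_b, len_b);
 *   uint32_t crc_ab = zng_crc32_combine(crc_a, crc_b, len_b);
 *
 * matches zng_crc32(0, buf_ab, len_a + len_b) for the concatenated buffer
 * (plain crc32()/crc32_combine() under ZLIB_COMPAT). */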
#endif /* CRC32_BRAID_COMB_P_H_ */

@ -0,0 +1,50 @@
#ifndef CRC32_BRAID_P_H_
#define CRC32_BRAID_P_H_
#include "zbuild.h"
#include "zendian.h"
/* Define N */
#ifdef Z_TESTN
# define N Z_TESTN
#else
# define N 5
#endif
#if N < 1 || N > 6
# error N must be in 1..6
#endif
/*
Define W and the associated z_word_t type. If W is not defined, then a
braided calculation is not used, and the associated tables and code are not
compiled.
*/
#ifdef Z_TESTW
# if Z_TESTW-1 != -1
# define W Z_TESTW
# endif
#else
# ifndef W
# if defined(__x86_64__) || defined(__aarch64__) || defined(__powerpc64__)
# define W 8
# else
# define W 4
# endif
# endif
#endif
#ifdef W
# if W == 8
typedef uint64_t z_word_t;
# else
# undef W
# define W 4
typedef uint32_t z_word_t;
# endif
#endif
/* CRC polynomial. */
#define POLY 0xedb88320 /* p(x) reflected, with x^32 implied */
extern uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
#endif /* CRC32_BRAID_P_H_ */

File diff suppressed because it is too large


@ -0,0 +1,33 @@
/* crc32_fold.c -- crc32 folding interface
* Copyright (C) 2021 Nathan Moinvaziri
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "functable.h"
#include "crc32_fold.h"
#include <limits.h>
Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) {
crc->value = CRC32_INITIAL_VALUE;
return crc->value;
}
Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
crc->value = functable.crc32(crc->value, src, len);
memcpy(dst, src, len);
}
Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
/* Note: while this is functionally the same as the plain CRC function, a
 * separate functable entry is still needed so that callers can dispatch
 * generically, with the same argument list, to the versions that do perform a
 * folding CRC but where no copy is wanted. init_crc is unused in this context. */
Z_UNUSED(init_crc);
crc->value = functable.crc32(crc->value, src, len);
}
Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc) {
return crc->value;
}

@ -0,0 +1,21 @@
/* crc32_fold.h -- crc32 folding interface
* Copyright (C) 2021 Nathan Moinvaziri
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef CRC32_FOLD_H_
#define CRC32_FOLD_H_
#define CRC32_FOLD_BUFFER_SIZE (16 * 4)
/* sizeof(__m128i) * (4 folds) */
typedef struct crc32_fold_s {
uint8_t fold[CRC32_FOLD_BUFFER_SIZE];
uint32_t value;
} crc32_fold;
Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc);
Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc);
#endif

File diff suppressed because it is too large

@ -0,0 +1,408 @@
#ifndef DEFLATE_H_
#define DEFLATE_H_
/* deflate.h -- internal compression state
* Copyright (C) 1995-2016 Jean-loup Gailly
* For conditions of distribution and use, see copyright notice in zlib.h
*/
/* WARNING: this file should *not* be used by applications. It is
part of the implementation of the compression library and is
subject to change. Applications should only use zlib.h.
*/
#include "zutil.h"
#include "zendian.h"
#include "adler32_fold.h"
#include "crc32_fold.h"
/* define NO_GZIP when compiling if you want to disable gzip header and
trailer creation by deflate(). NO_GZIP would be used to avoid linking in
the crc code when it is not needed. For shared libraries, gzip encoding
should be left enabled. */
#ifndef NO_GZIP
# define GZIP
#endif
/* ===========================================================================
* Internal compression state.
*/
#define LENGTH_CODES 29
/* number of length codes, not counting the special END_BLOCK code */
#define LITERALS 256
/* number of literal bytes 0..255 */
#define L_CODES (LITERALS+1+LENGTH_CODES)
/* number of Literal or Length codes, including the END_BLOCK code */
#define D_CODES 30
/* number of distance codes */
#define BL_CODES 19
/* number of codes used to transfer the bit lengths */
#define HEAP_SIZE (2*L_CODES+1)
/* maximum heap size */
#define BIT_BUF_SIZE 64
/* size of bit buffer in bi_buf */
#define END_BLOCK 256
/* end of block literal code */
#define INIT_STATE 1 /* zlib header -> BUSY_STATE */
#ifdef GZIP
# define GZIP_STATE 4 /* gzip header -> BUSY_STATE | EXTRA_STATE */
# define EXTRA_STATE 5 /* gzip extra block -> NAME_STATE */
# define NAME_STATE 6 /* gzip file name -> COMMENT_STATE */
# define COMMENT_STATE 7 /* gzip comment -> HCRC_STATE */
# define HCRC_STATE 8 /* gzip header CRC -> BUSY_STATE */
#endif
#define BUSY_STATE 2 /* deflate -> FINISH_STATE */
#define FINISH_STATE 3 /* stream complete */
#ifdef GZIP
# define MAX_STATE HCRC_STATE
#else
# define MAX_STATE FINISH_STATE
#endif
/* Stream status */
#define HASH_BITS 16u /* log2(HASH_SIZE) */
#ifndef HASH_SIZE
# define HASH_SIZE 65536u /* number of elements in hash table */
#endif
#define HASH_MASK (HASH_SIZE - 1u) /* HASH_SIZE-1 */
/* Data structure describing a single value and its code string. */
typedef struct ct_data_s {
union {
uint16_t freq; /* frequency count */
uint16_t code; /* bit string */
} fc;
union {
uint16_t dad; /* father node in Huffman tree */
uint16_t len; /* length of bit string */
} dl;
} ct_data;
#define Freq fc.freq
#define Code fc.code
#define Dad dl.dad
#define Len dl.len
typedef struct static_tree_desc_s static_tree_desc;
typedef struct tree_desc_s {
ct_data *dyn_tree; /* the dynamic tree */
int max_code; /* largest code with non zero frequency */
const static_tree_desc *stat_desc; /* the corresponding static tree */
} tree_desc;
typedef uint16_t Pos;
/* A Pos is an index in the character window. We use short instead of int to
* save space in the various tables.
*/
/* Type definitions for hash callbacks */
typedef struct internal_state deflate_state;
typedef uint32_t (* update_hash_cb) (deflate_state *const s, uint32_t h, uint32_t val);
typedef void (* insert_string_cb) (deflate_state *const s, uint32_t str, uint32_t count);
typedef Pos (* quick_insert_string_cb)(deflate_state *const s, uint32_t str);
struct internal_state {
PREFIX3(stream) *strm; /* pointer back to this zlib stream */
unsigned char *pending_buf; /* output still pending */
unsigned char *pending_out; /* next pending byte to output to the stream */
uint32_t pending_buf_size; /* size of pending_buf */
uint32_t pending; /* nb of bytes in the pending buffer */
int wrap; /* bit 0 true for zlib, bit 1 true for gzip */
uint32_t gzindex; /* where in extra, name, or comment */
PREFIX(gz_headerp) gzhead; /* gzip header information to write */
int status; /* as the name implies */
int last_flush; /* value of flush param for previous deflate call */
int reproducible; /* Whether reproducible compression results are required. */
int block_open;
/* Whether or not a block is currently open for the QUICK deflation scheme.
* This is set to 1 if there is an active block, or 0 if the block was just closed.
*/
/* used by deflate.c: */
unsigned int w_size; /* LZ77 window size (32K by default) */
unsigned int w_bits; /* log2(w_size) (8..16) */
unsigned int w_mask; /* w_size - 1 */
unsigned int lookahead; /* number of valid bytes ahead in window */
unsigned int high_water;
/* High water mark offset in window for initialized bytes -- bytes above
* this are set to zero in order to avoid memory check warnings when
* longest match routines access bytes past the input. This is then
* updated to the new high water mark.
*/
unsigned int window_size;
/* Actual size of window: 2*wSize, except when the user input buffer
* is directly used as sliding window.
*/
unsigned char *window;
/* Sliding window. Input bytes are read into the second half of the window,
* and move to the first half later to keep a dictionary of at least wSize
* bytes. With this organization, matches are limited to a distance of
* wSize-STD_MAX_MATCH bytes, but this ensures that IO is always
* performed with a length multiple of the block size. Also, it limits
* the window size to 64K, which is quite useful on MSDOS.
* To do: use the user input buffer as sliding window.
*/
Pos *prev;
/* Link to older string with same hash index. To limit the size of this
* array to 64K, this link is maintained only for the last 32K strings.
* An index in this array is thus a window index modulo 32K.
*/
Pos *head; /* Heads of the hash chains or 0. */
uint32_t ins_h; /* hash index of string to be inserted */
int block_start;
/* Window position at the beginning of the current output block. Gets
* negative when the window is moved backwards.
*/
unsigned int match_length; /* length of best match */
Pos prev_match; /* previous match */
int match_available; /* set if previous match exists */
unsigned int strstart; /* start of string to insert */
unsigned int match_start; /* start of matching string */
unsigned int prev_length;
/* Length of the best match at previous step. Matches not greater than this
* are discarded. This is used in the lazy match evaluation.
*/
unsigned int max_chain_length;
/* To speed up deflation, hash chains are never searched beyond this length.
* A higher limit improves compression ratio but degrades the speed.
*/
unsigned int max_lazy_match;
/* Attempt to find a better match only when the current match is strictly smaller
* than this value. This mechanism is used only for compression levels >= 4.
*/
# define max_insert_length max_lazy_match
/* Insert new strings in the hash table only if the match length is not
* greater than this length. This saves time but degrades compression.
* max_insert_length is used only for compression levels <= 3.
*/
update_hash_cb update_hash;
insert_string_cb insert_string;
quick_insert_string_cb quick_insert_string;
/* Hash function callbacks that can be configured depending on the deflate
* algorithm being used */
int level; /* compression level (1..9) */
int strategy; /* favor or force Huffman coding*/
unsigned int good_match;
/* Use a faster search when the previous match is longer than this */
int nice_match; /* Stop searching when current match exceeds this */
struct crc32_fold_s ALIGNED_(16) crc_fold;
/* used by trees.c: */
/* Didn't use ct_data typedef below to suppress compiler warning */
struct ct_data_s dyn_ltree[HEAP_SIZE]; /* literal and length tree */
struct ct_data_s dyn_dtree[2*D_CODES+1]; /* distance tree */
struct ct_data_s bl_tree[2*BL_CODES+1]; /* Huffman tree for bit lengths */
struct tree_desc_s l_desc; /* desc. for literal tree */
struct tree_desc_s d_desc; /* desc. for distance tree */
struct tree_desc_s bl_desc; /* desc. for bit length tree */
uint16_t bl_count[MAX_BITS+1];
/* number of codes at each bit length for an optimal tree */
int heap[2*L_CODES+1]; /* heap used to build the Huffman trees */
int heap_len; /* number of elements in the heap */
int heap_max; /* element of largest frequency */
/* The sons of heap[n] are heap[2*n] and heap[2*n+1]. heap[0] is not used.
* The same heap array is used to build all trees.
*/
unsigned char depth[2*L_CODES+1];
/* Depth of each subtree used as tie breaker for trees of equal frequency
*/
unsigned int lit_bufsize;
/* Size of match buffer for literals/lengths. There are 4 reasons for
* limiting lit_bufsize to 64K:
* - frequencies can be kept in 16 bit counters
* - if compression is not successful for the first block, all input
* data is still in the window so we can still emit a stored block even
* when input comes from standard input. (This can also be done for
* all blocks if lit_bufsize is not greater than 32K.)
* - if compression is not successful for a file smaller than 64K, we can
* even emit a stored file instead of a stored block (saving 5 bytes).
* This is applicable only for zip (not gzip or zlib).
* - creating new Huffman trees less frequently may not provide fast
* adaptation to changes in the input data statistics. (Take for
* example a binary file with poorly compressible code followed by
* a highly compressible string table.) Smaller buffer sizes give
* fast adaptation but have of course the overhead of transmitting
* trees more frequently.
* - I can't count above 4
*/
unsigned char *sym_buf; /* buffer for distances and literals/lengths */
unsigned int sym_next; /* running index in sym_buf */
unsigned int sym_end; /* symbol table full when sym_next reaches this */
unsigned long opt_len; /* bit length of current block with optimal trees */
unsigned long static_len; /* bit length of current block with static trees */
unsigned int matches; /* number of string matches in current block */
unsigned int insert; /* bytes at end of window left to insert */
/* compressed_len and bits_sent are only used if ZLIB_DEBUG is defined */
unsigned long compressed_len; /* total bit length of compressed file mod 2^32 */
unsigned long bits_sent; /* bit length of compressed data sent mod 2^32 */
/* Reserved for future use and alignment purposes */
char *reserved_p;
uint64_t bi_buf;
/* Output buffer. bits are inserted starting at the bottom (least significant bits). */
int32_t bi_valid;
/* Number of valid bits in bi_buf. All bits above the last valid bit are always zero. */
/* Reserved for future use and alignment purposes */
int32_t reserved[11];
} ALIGNED_(8);
typedef enum {
need_more, /* block not completed, need more input or more output */
block_done, /* block flush performed */
finish_started, /* finish started, need only more output at next deflate */
finish_done /* finish done, accept no more input or output */
} block_state;
/* Output a byte on the stream.
* IN assertion: there is enough room in pending_buf.
*/
#define put_byte(s, c) { \
s->pending_buf[s->pending++] = (unsigned char)(c); \
}
/* ===========================================================================
* Output a short LSB first on the stream.
* IN assertion: there is enough room in pending_buf.
*/
static inline void put_short(deflate_state *s, uint16_t w) {
#if BYTE_ORDER == BIG_ENDIAN
w = ZSWAP16(w);
#endif
memcpy(&s->pending_buf[s->pending], &w, sizeof(w));
s->pending += 2;
}
/* ===========================================================================
* Output a short MSB first on the stream.
* IN assertion: there is enough room in pending_buf.
*/
static inline void put_short_msb(deflate_state *s, uint16_t w) {
#if BYTE_ORDER == LITTLE_ENDIAN
w = ZSWAP16(w);
#endif
memcpy(&s->pending_buf[s->pending], &w, sizeof(w));
s->pending += 2;
}
/* ===========================================================================
* Output a 32-bit unsigned int LSB first on the stream.
* IN assertion: there is enough room in pending_buf.
*/
static inline void put_uint32(deflate_state *s, uint32_t dw) {
#if BYTE_ORDER == BIG_ENDIAN
dw = ZSWAP32(dw);
#endif
memcpy(&s->pending_buf[s->pending], &dw, sizeof(dw));
s->pending += 4;
}
/* ===========================================================================
* Output a 32-bit unsigned int MSB first on the stream.
* IN assertion: there is enough room in pending_buf.
*/
static inline void put_uint32_msb(deflate_state *s, uint32_t dw) {
#if BYTE_ORDER == LITTLE_ENDIAN
dw = ZSWAP32(dw);
#endif
memcpy(&s->pending_buf[s->pending], &dw, sizeof(dw));
s->pending += 4;
}
/* ===========================================================================
* Output a 64-bit unsigned int LSB first on the stream.
* IN assertion: there is enough room in pending_buf.
*/
static inline void put_uint64(deflate_state *s, uint64_t lld) {
#if BYTE_ORDER == BIG_ENDIAN
lld = ZSWAP64(lld);
#endif
memcpy(&s->pending_buf[s->pending], &lld, sizeof(lld));
s->pending += 8;
}
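/* Illustrative sketch of how these helpers are used by deflate.c (exact
 * statements may differ slightly): the zlib header is emitted MSB first with
 * put_short_msb(s, header), while the gzip trailer is written LSB first as
 *
 *   put_uint32(s, strm->adler);                // CRC-32 of the uncompressed data
 *   put_uint32(s, (uint32_t)strm->total_in);   // ISIZE, input length mod 2^32
 */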
#define MIN_LOOKAHEAD (STD_MAX_MATCH + STD_MIN_MATCH + 1)
/* Minimum amount of lookahead, except at the end of the input file.
* See deflate.c for comments about the STD_MIN_MATCH+1.
*/
#define MAX_DIST(s) ((s)->w_size - MIN_LOOKAHEAD)
/* In order to simplify the code, particularly on 16 bit machines, match
* distances are limited to MAX_DIST instead of WSIZE.
*/
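/* For the defaults (w_size == 32768, STD_MAX_MATCH == 258, STD_MIN_MATCH == 3)
 * this gives MIN_LOOKAHEAD == 262 and MAX_DIST == 32506. */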
#define WIN_INIT STD_MAX_MATCH
/* Number of bytes after end of data in window to initialize in order to avoid
memory checker errors from longest match routines */
void Z_INTERNAL PREFIX(fill_window)(deflate_state *s);
void Z_INTERNAL slide_hash_c(deflate_state *s);
/* in trees.c */
void Z_INTERNAL zng_tr_init(deflate_state *s);
void Z_INTERNAL zng_tr_flush_block(deflate_state *s, char *buf, uint32_t stored_len, int last);
void Z_INTERNAL zng_tr_flush_bits(deflate_state *s);
void Z_INTERNAL zng_tr_align(deflate_state *s);
void Z_INTERNAL zng_tr_stored_block(deflate_state *s, char *buf, uint32_t stored_len, int last);
uint16_t Z_INTERNAL PREFIX(bi_reverse)(unsigned code, int len);
void Z_INTERNAL PREFIX(flush_pending)(PREFIX3(streamp) strm);
#define d_code(dist) ((dist) < 256 ? zng_dist_code[dist] : zng_dist_code[256+((dist)>>7)])
/* Mapping from a distance to a distance code. dist is the distance - 1 and
* must not have side effects. zng_dist_code[256] and zng_dist_code[257] are never
* used.
*/
/* Bit buffer and compress bits calculation debugging */
#ifdef ZLIB_DEBUG
# define cmpr_bits_add(s, len) s->compressed_len += (len)
# define cmpr_bits_align(s) s->compressed_len = (s->compressed_len + 7) & ~7L
# define sent_bits_add(s, bits) s->bits_sent += (bits)
# define sent_bits_align(s) s->bits_sent = (s->bits_sent + 7) & ~7L
#else
# define cmpr_bits_add(s, len) Z_UNUSED(len)
# define cmpr_bits_align(s)
# define sent_bits_add(s, bits) Z_UNUSED(bits)
# define sent_bits_align(s)
#endif
#endif /* DEFLATE_H_ */

@ -0,0 +1,102 @@
/* deflate_fast.c -- compress data using the fast strategy of deflation algorithm
*
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"
/* ===========================================================================
* Compress as much as possible from the input stream, return the current
* block state.
* This function does not perform lazy evaluation of matches and inserts
* new strings in the dictionary only for unmatched strings or for short
* matches. It is used only for the fast compression options.
*/
Z_INTERNAL block_state deflate_fast(deflate_state *s, int flush) {
Pos hash_head; /* head of the hash chain */
int bflush = 0; /* set if current block must be flushed */
int64_t dist;
uint32_t match_len = 0;
for (;;) {
/* Make sure that we always have enough lookahead, except
* at the end of the input file. We need STD_MAX_MATCH bytes
* for the next match, plus WANT_MIN_MATCH bytes to insert the
* string following the next match.
*/
if (s->lookahead < MIN_LOOKAHEAD) {
PREFIX(fill_window)(s);
if (UNLIKELY(s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH)) {
return need_more;
}
if (UNLIKELY(s->lookahead == 0))
break; /* flush the current block */
}
/* Insert the string window[strstart .. strstart+2] in the
* dictionary, and set hash_head to the head of the hash chain:
*/
if (s->lookahead >= WANT_MIN_MATCH) {
hash_head = functable.quick_insert_string(s, s->strstart);
dist = (int64_t)s->strstart - hash_head;
/* Find the longest match, discarding those <= prev_length.
* At this point we have always match length < WANT_MIN_MATCH
*/
if (dist <= MAX_DIST(s) && dist > 0 && hash_head != 0) {
/* To simplify the code, we prevent matches with the string
* of window index 0 (in particular we have to avoid a match
* of the string with itself at the start of the input file).
*/
match_len = functable.longest_match(s, hash_head);
/* longest_match() sets match_start */
}
}
if (match_len >= WANT_MIN_MATCH) {
check_match(s, s->strstart, s->match_start, match_len);
bflush = zng_tr_tally_dist(s, s->strstart - s->match_start, match_len - STD_MIN_MATCH);
s->lookahead -= match_len;
/* Insert new strings in the hash table only if the match length
* is not too large. This saves time but degrades compression.
*/
if (match_len <= s->max_insert_length && s->lookahead >= WANT_MIN_MATCH) {
match_len--; /* string at strstart already in table */
s->strstart++;
functable.insert_string(s, s->strstart, match_len);
s->strstart += match_len;
} else {
s->strstart += match_len;
functable.quick_insert_string(s, s->strstart + 2 - STD_MIN_MATCH);
/* If lookahead < STD_MIN_MATCH, ins_h is garbage, but it does not
* matter since it will be recomputed at next deflate call.
*/
}
match_len = 0;
} else {
/* No match, output a literal byte */
bflush = zng_tr_tally_lit(s, s->window[s->strstart]);
s->lookahead--;
s->strstart++;
}
if (UNLIKELY(bflush))
FLUSH_BLOCK(s, 0);
}
s->insert = s->strstart < (STD_MIN_MATCH - 1) ? s->strstart : (STD_MIN_MATCH - 1);
if (UNLIKELY(flush == Z_FINISH)) {
FLUSH_BLOCK(s, 1);
return finish_done;
}
if (UNLIKELY(s->sym_next))
FLUSH_BLOCK(s, 0);
return block_done;
}

@ -0,0 +1,45 @@
/* deflate_huff.c -- compress data using huffman encoding only strategy
*
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"
/* ===========================================================================
* For Z_HUFFMAN_ONLY, do not look for matches. Do not maintain a hash table.
* (It will be regenerated if this run of deflate switches away from Huffman.)
*/
Z_INTERNAL block_state deflate_huff(deflate_state *s, int flush) {
int bflush = 0; /* set if current block must be flushed */
for (;;) {
/* Make sure that we have a literal to write. */
if (s->lookahead == 0) {
PREFIX(fill_window)(s);
if (s->lookahead == 0) {
if (flush == Z_NO_FLUSH)
return need_more;
break; /* flush the current block */
}
}
/* Output a literal byte */
bflush = zng_tr_tally_lit(s, s->window[s->strstart]);
s->lookahead--;
s->strstart++;
if (bflush)
FLUSH_BLOCK(s, 0);
}
s->insert = 0;
if (flush == Z_FINISH) {
FLUSH_BLOCK(s, 1);
return finish_done;
}
if (s->sym_next)
FLUSH_BLOCK(s, 0);
return block_done;
}

@ -0,0 +1,293 @@
/* deflate_medium.c -- The deflate_medium deflate strategy
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
* Authors:
* Arjan van de Ven <arjan@linux.intel.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef NO_MEDIUM_STRATEGY
#include "zbuild.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"
struct match {
uint16_t match_start;
uint16_t match_length;
uint16_t strstart;
uint16_t orgstart;
};
static int emit_match(deflate_state *s, struct match match) {
int bflush = 0;
/* matches that are not long enough we need to emit as literals */
if (match.match_length < WANT_MIN_MATCH) {
while (match.match_length) {
bflush += zng_tr_tally_lit(s, s->window[match.strstart]);
s->lookahead--;
match.strstart++;
match.match_length--;
}
return bflush;
}
check_match(s, match.strstart, match.match_start, match.match_length);
bflush += zng_tr_tally_dist(s, match.strstart - match.match_start, match.match_length - STD_MIN_MATCH);
s->lookahead -= match.match_length;
return bflush;
}
static void insert_match(deflate_state *s, struct match match) {
if (UNLIKELY(s->lookahead <= (unsigned int)(match.match_length + WANT_MIN_MATCH)))
return;
/* matches that are not long enough we need to emit as literals */
if (LIKELY(match.match_length < WANT_MIN_MATCH)) {
match.strstart++;
match.match_length--;
if (UNLIKELY(match.match_length > 0)) {
if (match.strstart >= match.orgstart) {
if (match.strstart + match.match_length - 1 >= match.orgstart) {
functable.insert_string(s, match.strstart, match.match_length);
} else {
functable.insert_string(s, match.strstart, match.orgstart - match.strstart + 1);
}
match.strstart += match.match_length;
match.match_length = 0;
}
}
return;
}
/* Insert new strings in the hash table only if the match length
* is not too large. This saves time but degrades compression.
*/
if (match.match_length <= 16 * s->max_insert_length && s->lookahead >= WANT_MIN_MATCH) {
match.match_length--; /* string at strstart already in table */
match.strstart++;
if (LIKELY(match.strstart >= match.orgstart)) {
if (LIKELY(match.strstart + match.match_length - 1 >= match.orgstart)) {
functable.insert_string(s, match.strstart, match.match_length);
} else {
functable.insert_string(s, match.strstart, match.orgstart - match.strstart + 1);
}
} else if (match.orgstart < match.strstart + match.match_length) {
functable.insert_string(s, match.orgstart, match.strstart + match.match_length - match.orgstart);
}
match.strstart += match.match_length;
match.match_length = 0;
} else {
match.strstart += match.match_length;
match.match_length = 0;
if (match.strstart >= (STD_MIN_MATCH - 2))
functable.quick_insert_string(s, match.strstart + 2 - STD_MIN_MATCH);
/* If lookahead < WANT_MIN_MATCH, ins_h is garbage, but it does not
* matter since it will be recomputed at next deflate call.
*/
}
}
static void fizzle_matches(deflate_state *s, struct match *current, struct match *next) {
Pos limit;
unsigned char *match, *orig;
int changed = 0;
struct match c, n;
/* step zero: sanity checks */
if (current->match_length <= 1)
return;
if (UNLIKELY(current->match_length > 1 + next->match_start))
return;
if (UNLIKELY(current->match_length > 1 + next->strstart))
return;
match = s->window - current->match_length + 1 + next->match_start;
orig = s->window - current->match_length + 1 + next->strstart;
/* quick exit check.. if this fails then don't bother with anything else */
if (LIKELY(*match != *orig))
return;
c = *current;
n = *next;
/* step one: try to move the "next" match to the left as much as possible */
limit = next->strstart > MAX_DIST(s) ? next->strstart - (Pos)MAX_DIST(s) : 0;
match = s->window + n.match_start - 1;
orig = s->window + n.strstart - 1;
while (*match == *orig) {
if (UNLIKELY(c.match_length < 1))
break;
if (UNLIKELY(n.strstart <= limit))
break;
if (UNLIKELY(n.match_length >= 256))
break;
if (UNLIKELY(n.match_start <= 1))
break;
n.strstart--;
n.match_start--;
n.match_length++;
c.match_length--;
match--;
orig--;
changed++;
}
if (!changed)
return;
if (c.match_length <= 1 && n.match_length != 2) {
n.orgstart++;
*current = c;
*next = n;
} else {
return;
}
}
Z_INTERNAL block_state deflate_medium(deflate_state *s, int flush) {
/* Align the first struct to start on a new cacheline, this allows us to fit both structs in one cacheline */
ALIGNED_(16) struct match current_match;
struct match next_match;
/* For levels below 5, don't check the next position for a better match */
int early_exit = s->level < 5;
memset(&current_match, 0, sizeof(struct match));
memset(&next_match, 0, sizeof(struct match));
for (;;) {
Pos hash_head = 0; /* head of the hash chain */
int bflush = 0; /* set if current block must be flushed */
int64_t dist;
/* Make sure that we always have enough lookahead, except
* at the end of the input file. We need STD_MAX_MATCH bytes
* for the next match, plus WANT_MIN_MATCH bytes to insert the
* string following the next current_match.
*/
if (s->lookahead < MIN_LOOKAHEAD) {
PREFIX(fill_window)(s);
if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) {
return need_more;
}
if (UNLIKELY(s->lookahead == 0))
break; /* flush the current block */
next_match.match_length = 0;
}
/* Insert the string window[strstart .. strstart+2] in the
* dictionary, and set hash_head to the head of the hash chain:
*/
/* If we already have a future match from a previous round, just use that */
if (!early_exit && next_match.match_length > 0) {
current_match = next_match;
next_match.match_length = 0;
} else {
hash_head = 0;
if (s->lookahead >= WANT_MIN_MATCH) {
hash_head = functable.quick_insert_string(s, s->strstart);
}
current_match.strstart = (uint16_t)s->strstart;
current_match.orgstart = current_match.strstart;
/* Find the longest match, discarding those <= prev_length.
* At this point we have always match_length < WANT_MIN_MATCH
*/
dist = (int64_t)s->strstart - hash_head;
if (dist <= MAX_DIST(s) && dist > 0 && hash_head != 0) {
/* To simplify the code, we prevent matches with the string
* of window index 0 (in particular we have to avoid a match
* of the string with itself at the start of the input file).
*/
current_match.match_length = (uint16_t)functable.longest_match(s, hash_head);
current_match.match_start = (uint16_t)s->match_start;
if (UNLIKELY(current_match.match_length < WANT_MIN_MATCH))
current_match.match_length = 1;
if (UNLIKELY(current_match.match_start >= current_match.strstart)) {
/* this can happen due to some restarts */
current_match.match_length = 1;
}
} else {
/* Set up the match to be a 1 byte literal */
current_match.match_start = 0;
current_match.match_length = 1;
}
}
insert_match(s, current_match);
/* now, look ahead one */
if (LIKELY(!early_exit && s->lookahead > MIN_LOOKAHEAD && (uint32_t)(current_match.strstart + current_match.match_length) < (s->window_size - MIN_LOOKAHEAD))) {
s->strstart = current_match.strstart + current_match.match_length;
hash_head = functable.quick_insert_string(s, s->strstart);
next_match.strstart = (uint16_t)s->strstart;
next_match.orgstart = next_match.strstart;
/* Find the longest match, discarding those <= prev_length.
* At this point we have always match_length < WANT_MIN_MATCH
*/
dist = (int64_t)s->strstart - hash_head;
if (dist <= MAX_DIST(s) && dist > 0 && hash_head != 0) {
/* To simplify the code, we prevent matches with the string
* of window index 0 (in particular we have to avoid a match
* of the string with itself at the start of the input file).
*/
next_match.match_length = (uint16_t)functable.longest_match(s, hash_head);
next_match.match_start = (uint16_t)s->match_start;
if (UNLIKELY(next_match.match_start >= next_match.strstart)) {
/* this can happen due to some restarts */
next_match.match_length = 1;
}
if (next_match.match_length < WANT_MIN_MATCH)
next_match.match_length = 1;
else
fizzle_matches(s, &current_match, &next_match);
} else {
/* Set up the match to be a 1 byte literal */
next_match.match_start = 0;
next_match.match_length = 1;
}
s->strstart = current_match.strstart;
} else {
next_match.match_length = 0;
}
/* now emit the current match */
bflush = emit_match(s, current_match);
/* move the "cursor" forward */
s->strstart += current_match.match_length;
if (UNLIKELY(bflush))
FLUSH_BLOCK(s, 0);
}
s->insert = s->strstart < (STD_MIN_MATCH - 1) ? s->strstart : (STD_MIN_MATCH - 1);
if (flush == Z_FINISH) {
FLUSH_BLOCK(s, 1);
return finish_done;
}
if (UNLIKELY(s->sym_next))
FLUSH_BLOCK(s, 0);
return block_done;
}
#endif

@@ -0,0 +1,116 @@
/* deflate_p.h -- Private inline functions and macros shared with more than
* one deflate method
*
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*
*/
#ifndef DEFLATE_P_H
#define DEFLATE_P_H
/* Forward declare common non-inlined functions declared in deflate.c */
#ifdef ZLIB_DEBUG
/* ===========================================================================
* Check that the match at match_start is indeed a match.
*/
static inline void check_match(deflate_state *s, Pos start, Pos match, int length) {
/* check that the match length is valid */
if (length < STD_MIN_MATCH || length > STD_MAX_MATCH) {
fprintf(stderr, " start %u, match %u, length %d\n", start, match, length);
z_error("invalid match length");
}
/* check that the match isn't at the same position as the start string */
if (match == start) {
fprintf(stderr, " start %u, match %u, length %d\n", start, match, length);
z_error("invalid match position");
}
/* check that the match is indeed a match */
if (memcmp(s->window + match, s->window + start, length) != 0) {
int32_t i = 0;
fprintf(stderr, " start %u, match %u, length %d\n", start, match, length);
do {
fprintf(stderr, " %03d: match [%02x] start [%02x]\n", i++,
s->window[match++], s->window[start++]);
} while (--length != 0);
z_error("invalid match");
}
if (z_verbose > 1) {
fprintf(stderr, "\\[%u,%d]", start-match, length);
do {
putc(s->window[start++], stderr);
} while (--length != 0);
}
}
#else
#define check_match(s, start, match, length)
#endif
Z_INTERNAL void PREFIX(flush_pending)(PREFIX3(stream) *strm);
Z_INTERNAL unsigned PREFIX(read_buf)(PREFIX3(stream) *strm, unsigned char *buf, unsigned size);
/* ===========================================================================
* Save the match info and tally the frequency counts. Return true if
* the current block must be flushed.
*/
extern const unsigned char Z_INTERNAL zng_length_code[];
extern const unsigned char Z_INTERNAL zng_dist_code[];
static inline int zng_tr_tally_lit(deflate_state *s, unsigned char c) {
/* c is the unmatched char */
s->sym_buf[s->sym_next++] = 0;
s->sym_buf[s->sym_next++] = 0;
s->sym_buf[s->sym_next++] = c;
s->dyn_ltree[c].Freq++;
Tracevv((stderr, "%c", c));
Assert(c <= (STD_MAX_MATCH-STD_MIN_MATCH), "zng_tr_tally: bad literal");
return (s->sym_next == s->sym_end);
}
static inline int zng_tr_tally_dist(deflate_state *s, uint32_t dist, uint32_t len) {
/* dist: distance of matched string */
/* len: match length-STD_MIN_MATCH */
s->sym_buf[s->sym_next++] = (uint8_t)(dist);
s->sym_buf[s->sym_next++] = (uint8_t)(dist >> 8);
s->sym_buf[s->sym_next++] = (uint8_t)len;
s->matches++;
dist--;
Assert(dist < MAX_DIST(s) && (uint16_t)d_code(dist) < (uint16_t)D_CODES,
"zng_tr_tally: bad match");
s->dyn_ltree[zng_length_code[len]+LITERALS+1].Freq++;
s->dyn_dtree[d_code(dist)].Freq++;
return (s->sym_next == s->sym_end);
}
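/* Minimal sketch (illustrative, not part of zlib-ng): decoding one 3-byte sym_buf
 * record written by the tally helpers above. A literal is stored as {0, 0, char},
 * a match as {dist & 0xFF, dist >> 8, len - STD_MIN_MATCH}, so a zero distance
 * marks a literal record.
 */
static inline int example_read_symbol(const uint8_t *rec, uint32_t *dist, uint32_t *lc) {
    /* illustrative helper, not used by zlib-ng */
    *dist = (uint32_t)rec[0] | ((uint32_t)rec[1] << 8);  /* 0 for a literal */
    *lc = rec[2];                 /* literal byte, or match length - STD_MIN_MATCH */
    return *dist != 0;            /* non-zero distance means a match record */
}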
/* ===========================================================================
* Flush the current block, with given end-of-file flag.
* IN assertion: strstart is set to the end of the current match.
*/
#define FLUSH_BLOCK_ONLY(s, last) { \
zng_tr_flush_block(s, (s->block_start >= 0 ? \
(char *)&s->window[(unsigned)s->block_start] : \
NULL), \
(uint32_t)((int)s->strstart - s->block_start), \
(last)); \
s->block_start = (int)s->strstart; \
PREFIX(flush_pending)(s->strm); \
}
/* Same but force premature exit if necessary. */
#define FLUSH_BLOCK(s, last) { \
FLUSH_BLOCK_ONLY(s, last); \
if (s->strm->avail_out == 0) return (last) ? finish_started : need_more; \
}
/* Maximum stored block length in deflate format (not including header). */
#define MAX_STORED 65535
/* Compression function. Returns the block state after the call. */
typedef block_state (*compress_func) (deflate_state *s, int flush);
/* Match function. Returns the longest match. */
typedef uint32_t (*match_func) (deflate_state *const s, Pos cur_match);
#endif

@@ -0,0 +1,129 @@
/*
* The deflate_quick deflate strategy, designed to be used when cycles are
* at a premium.
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
* Authors:
* Wajdi Feghali <wajdi.k.feghali@intel.com>
* Jim Guilford <james.guilford@intel.com>
* Vinodh Gopal <vinodh.gopal@intel.com>
* Erdinc Ozturk <erdinc.ozturk@intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* Portions are Copyright (C) 2016 12Sided Technology, LLC.
* Author:
* Phil Vachon <pvachon@12sidedtech.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "zutil_p.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"
#include "trees_emit.h"
extern const ct_data static_ltree[L_CODES+2];
extern const ct_data static_dtree[D_CODES];
#define QUICK_START_BLOCK(s, last) { \
zng_tr_emit_tree(s, STATIC_TREES, last); \
s->block_open = 1 + (int)last; \
s->block_start = (int)s->strstart; \
}
#define QUICK_END_BLOCK(s, last) { \
if (s->block_open) { \
zng_tr_emit_end_block(s, static_ltree, last); \
s->block_open = 0; \
s->block_start = (int)s->strstart; \
PREFIX(flush_pending)(s->strm); \
if (s->strm->avail_out == 0) \
return (last) ? finish_started : need_more; \
} \
}
Z_INTERNAL block_state deflate_quick(deflate_state *s, int flush) {
Pos hash_head;
int64_t dist;
unsigned match_len, last;
last = (flush == Z_FINISH) ? 1 : 0;
if (UNLIKELY(last && s->block_open != 2)) {
/* Emit end of previous block */
QUICK_END_BLOCK(s, 0);
/* Emit start of last block */
QUICK_START_BLOCK(s, last);
} else if (UNLIKELY(s->block_open == 0 && s->lookahead > 0)) {
/* Start new block only when we have lookahead data, so that if no
input data is given an empty block will not be written */
QUICK_START_BLOCK(s, last);
}
for (;;) {
if (UNLIKELY(s->pending + ((BIT_BUF_SIZE + 7) >> 3) >= s->pending_buf_size)) {
PREFIX(flush_pending)(s->strm);
if (s->strm->avail_out == 0) {
return (last && s->strm->avail_in == 0 && s->bi_valid == 0 && s->block_open == 0) ? finish_started : need_more;
}
}
if (UNLIKELY(s->lookahead < MIN_LOOKAHEAD)) {
PREFIX(fill_window)(s);
if (UNLIKELY(s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH)) {
return need_more;
}
if (UNLIKELY(s->lookahead == 0))
break;
if (UNLIKELY(s->block_open == 0)) {
/* Start new block when we have lookahead data, so that if no
input data is given an empty block will not be written */
QUICK_START_BLOCK(s, last);
}
}
if (LIKELY(s->lookahead >= WANT_MIN_MATCH)) {
hash_head = functable.quick_insert_string(s, s->strstart);
dist = (int64_t)s->strstart - hash_head;
if (dist <= MAX_DIST(s) && dist > 0) {
const uint8_t *str_start = s->window + s->strstart;
const uint8_t *match_start = s->window + hash_head;
if (zng_memcmp_2(str_start, match_start) == 0) {
match_len = functable.compare256(str_start+2, match_start+2) + 2;
if (match_len >= WANT_MIN_MATCH) {
if (UNLIKELY(match_len > s->lookahead))
match_len = s->lookahead;
if (UNLIKELY(match_len > STD_MAX_MATCH))
match_len = STD_MAX_MATCH;
check_match(s, s->strstart, hash_head, match_len);
zng_tr_emit_dist(s, static_ltree, static_dtree, match_len - STD_MIN_MATCH, (uint32_t)dist);
s->lookahead -= match_len;
s->strstart += match_len;
continue;
}
}
}
}
zng_tr_emit_lit(s, static_ltree, s->window[s->strstart]);
s->strstart++;
s->lookahead--;
}
s->insert = s->strstart < (STD_MIN_MATCH - 1) ? s->strstart : (STD_MIN_MATCH - 1);
if (UNLIKELY(last)) {
QUICK_END_BLOCK(s, 1);
return finish_done;
}
QUICK_END_BLOCK(s, 0);
return block_done;
}
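/* Minimal usage sketch (illustrative, not part of zlib-ng; assumes the
 * zlib-compat API): level 1 is the level that typically exercises the quick
 * strategy above, trading some compression ratio for speed.
 */
#include "zlib.h"
static int example_compress_level1(unsigned char *dst, unsigned long *dst_len,
                                   const unsigned char *src, unsigned long src_len) {
    /* illustrative helper; error handling left to the caller */
    return compress2(dst, dst_len, src, src_len, 1);
}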

@@ -0,0 +1,85 @@
/* deflate_rle.c -- compress data using RLE strategy of deflation algorithm
*
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "compare256_rle.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"
#ifdef UNALIGNED_OK
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
# define compare256_rle compare256_rle_unaligned_64
# elif defined(HAVE_BUILTIN_CTZ)
# define compare256_rle compare256_rle_unaligned_32
# else
# define compare256_rle compare256_rle_unaligned_16
# endif
#else
# define compare256_rle compare256_rle_c
#endif
/* ===========================================================================
* For Z_RLE, simply look for runs of bytes, generate matches only of distance
* one. Do not maintain a hash table. (It will be regenerated if this run of
* deflate switches away from Z_RLE.)
*/
Z_INTERNAL block_state deflate_rle(deflate_state *s, int flush) {
int bflush = 0; /* set if current block must be flushed */
unsigned char *scan; /* scan goes up to strend for length of run */
uint32_t match_len = 0;
for (;;) {
/* Make sure that we always have enough lookahead, except
* at the end of the input file. We need STD_MAX_MATCH bytes
* for the longest run, plus one for the unrolled loop.
*/
if (s->lookahead <= STD_MAX_MATCH) {
PREFIX(fill_window)(s);
if (s->lookahead <= STD_MAX_MATCH && flush == Z_NO_FLUSH)
return need_more;
if (s->lookahead == 0)
break; /* flush the current block */
}
/* See how many times the previous byte repeats */
if (s->lookahead >= STD_MIN_MATCH && s->strstart > 0) {
scan = s->window + s->strstart - 1;
if (scan[0] == scan[1] && scan[1] == scan[2]) {
match_len = compare256_rle(scan, scan+3)+2;
match_len = MIN(match_len, s->lookahead);
match_len = MIN(match_len, STD_MAX_MATCH);
}
Assert(scan+match_len <= s->window + s->window_size - 1, "wild scan");
}
/* Emit match if have run of STD_MIN_MATCH or longer, else emit literal */
if (match_len >= STD_MIN_MATCH) {
check_match(s, s->strstart, s->strstart - 1, match_len);
bflush = zng_tr_tally_dist(s, 1, match_len - STD_MIN_MATCH);
s->lookahead -= match_len;
s->strstart += match_len;
match_len = 0;
} else {
/* No match, output a literal byte */
bflush = zng_tr_tally_lit(s, s->window[s->strstart]);
s->lookahead--;
s->strstart++;
}
if (bflush)
FLUSH_BLOCK(s, 0);
}
s->insert = 0;
if (flush == Z_FINISH) {
FLUSH_BLOCK(s, 1);
return finish_done;
}
if (s->sym_next)
FLUSH_BLOCK(s, 0);
return block_done;
}
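/* Minimal usage sketch (illustrative, not part of zlib-ng; assumes the
 * zlib-compat API): Z_RLE is requested through the strategy argument of
 * deflateInit2(), which routes compression through deflate_rle() above.
 * Matches are limited to distance one, which is cheap and suits data with
 * long byte runs such as simple image rows.
 */
#include <string.h>
#include "zlib.h"
static int example_init_rle(z_stream *strm) {
    /* illustrative helper */
    memset(strm, 0, sizeof(*strm));
    return deflateInit2(strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
                        MAX_WBITS, 8, Z_RLE);  /* memLevel 8 is the usual default */
}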

@@ -0,0 +1,143 @@
/* deflate_slow.c -- compress data using the slow strategy of deflation algorithm
*
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"
/* ===========================================================================
* Same as deflate_medium, but achieves better compression. We use a lazy
* evaluation for matches: a match is finally adopted only if there is
* no better match at the next window position.
*/
Z_INTERNAL block_state deflate_slow(deflate_state *s, int flush) {
Pos hash_head; /* head of hash chain */
int bflush; /* set if current block must be flushed */
int64_t dist;
uint32_t match_len;
match_func *longest_match;
if (s->max_chain_length <= 1024)
longest_match = &functable.longest_match;
else
longest_match = &functable.longest_match_slow;
/* Process the input block. */
for (;;) {
/* Make sure that we always have enough lookahead, except
* at the end of the input file. We need STD_MAX_MATCH bytes
* for the next match, plus WANT_MIN_MATCH bytes to insert the
* string following the next match.
*/
if (s->lookahead < MIN_LOOKAHEAD) {
PREFIX(fill_window)(s);
if (UNLIKELY(s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH)) {
return need_more;
}
if (UNLIKELY(s->lookahead == 0))
break; /* flush the current block */
}
/* Insert the string window[strstart .. strstart+2] in the
* dictionary, and set hash_head to the head of the hash chain:
*/
hash_head = 0;
if (LIKELY(s->lookahead >= WANT_MIN_MATCH)) {
hash_head = s->quick_insert_string(s, s->strstart);
}
/* Find the longest match, discarding those <= prev_length.
*/
s->prev_match = (Pos)s->match_start;
match_len = STD_MIN_MATCH - 1;
dist = (int64_t)s->strstart - hash_head;
if (dist <= MAX_DIST(s) && dist > 0 && s->prev_length < s->max_lazy_match && hash_head != 0) {
/* To simplify the code, we prevent matches with the string
* of window index 0 (in particular we have to avoid a match
* of the string with itself at the start of the input file).
*/
match_len = (*longest_match)(s, hash_head);
/* longest_match() sets match_start */
if (match_len <= 5 && (s->strategy == Z_FILTERED)) {
/* If prev_match is also WANT_MIN_MATCH, match_start is garbage
* but we will ignore the current match anyway.
*/
match_len = STD_MIN_MATCH - 1;
}
}
/* If there was a match at the previous step and the current
* match is not better, output the previous match:
*/
if (s->prev_length >= STD_MIN_MATCH && match_len <= s->prev_length) {
unsigned int max_insert = s->strstart + s->lookahead - STD_MIN_MATCH;
/* Do not insert strings in hash table beyond this. */
check_match(s, s->strstart-1, s->prev_match, s->prev_length);
bflush = zng_tr_tally_dist(s, s->strstart -1 - s->prev_match, s->prev_length - STD_MIN_MATCH);
/* Insert in hash table all strings up to the end of the match.
* strstart-1 and strstart are already inserted. If there is not
* enough lookahead, the last two strings are not inserted in
* the hash table.
*/
s->prev_length -= 1;
s->lookahead -= s->prev_length;
unsigned int mov_fwd = s->prev_length - 1;
if (max_insert > s->strstart) {
unsigned int insert_cnt = mov_fwd;
if (UNLIKELY(insert_cnt > max_insert - s->strstart))
insert_cnt = max_insert - s->strstart;
s->insert_string(s, s->strstart + 1, insert_cnt);
}
s->prev_length = 0;
s->match_available = 0;
s->strstart += mov_fwd + 1;
if (UNLIKELY(bflush))
FLUSH_BLOCK(s, 0);
} else if (s->match_available) {
/* If there was no match at the previous position, output a
* single literal. If there was a match but the current match
* is longer, truncate the previous match to a single literal.
*/
bflush = zng_tr_tally_lit(s, s->window[s->strstart-1]);
if (UNLIKELY(bflush))
FLUSH_BLOCK_ONLY(s, 0);
s->prev_length = match_len;
s->strstart++;
s->lookahead--;
if (UNLIKELY(s->strm->avail_out == 0))
return need_more;
} else {
/* There is no previous match to compare with, wait for
* the next step to decide.
*/
s->prev_length = match_len;
s->match_available = 1;
s->strstart++;
s->lookahead--;
}
}
Assert(flush != Z_NO_FLUSH, "no flush?");
if (UNLIKELY(s->match_available)) {
(void) zng_tr_tally_lit(s, s->window[s->strstart-1]);
s->match_available = 0;
}
s->insert = s->strstart < (STD_MIN_MATCH - 1) ? s->strstart : (STD_MIN_MATCH - 1);
if (UNLIKELY(flush == Z_FINISH)) {
FLUSH_BLOCK(s, 1);
return finish_done;
}
if (UNLIKELY(s->sym_next))
FLUSH_BLOCK(s, 0);
return block_done;
}
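/* Minimal usage sketch (illustrative, not part of zlib-ng; assumes the
 * zlib-compat API): the lazy-matching thresholds consulted above
 * (max_lazy_match, max_chain_length, ...) can be adjusted with deflateTune().
 * The numbers below are placeholders, not recommendations; note that a
 * max_chain larger than 1024 makes deflate_slow pick longest_match_slow.
 */
#include "zlib.h"
static int example_tune_lazy(z_stream *strm) {
    /* illustrative helper: good_length, max_lazy, nice_length, max_chain */
    return deflateTune(strm, 8, 16, 128, 1024);
}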

@@ -0,0 +1,186 @@
/* deflate_stored.c -- store data without compression using deflation algorithm
*
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"
/* ===========================================================================
* Copy without compression as much as possible from the input stream, return
* the current block state.
*
* In case deflateParams() is used to later switch to a non-zero compression
* level, s->matches (otherwise unused when storing) keeps track of the number
* of hash table slides to perform. If s->matches is 1, then one hash table
* slide will be done when switching. If s->matches is 2, the maximum value
* allowed here, then the hash table will be cleared, since two or more slides
* is the same as a clear.
*
* deflate_stored() is written to minimize the number of times an input byte is
* copied. It is most efficient with large input and output buffers, which
* maximizes the opportunities to have a single copy from next_in to next_out.
*/
Z_INTERNAL block_state deflate_stored(deflate_state *s, int flush) {
/* Smallest worthy block size when not flushing or finishing. By default
* this is 32K. This can be as small as 507 bytes for memLevel == 1. For
* large input and output buffers, the stored block size will be larger.
*/
unsigned min_block = MIN(s->pending_buf_size - 5, s->w_size);
/* Copy as many min_block or larger stored blocks directly to next_out as
* possible. If flushing, copy the remaining available input to next_out as
* stored blocks, if there is enough space.
*/
unsigned len, left, have, last = 0;
unsigned used = s->strm->avail_in;
do {
/* Set len to the maximum size block that we can copy directly with the
* available input data and output space. Set left to how much of that
* would be copied from what's left in the window.
*/
len = MAX_STORED; /* maximum deflate stored block length */
have = (s->bi_valid + 42) >> 3; /* number of header bytes */
if (s->strm->avail_out < have) /* need room for header */
break;
/* maximum stored block length that will fit in avail_out: */
have = s->strm->avail_out - have;
left = (int)s->strstart - s->block_start; /* bytes left in window */
if (len > (unsigned long)left + s->strm->avail_in)
len = left + s->strm->avail_in; /* limit len to the input */
len = MIN(len, have); /* limit len to the output */
/* If the stored block would be less than min_block in length, or if
* unable to copy all of the available input when flushing, then try
* copying to the window and the pending buffer instead. Also don't
* write an empty block when flushing -- deflate() does that.
*/
if (len < min_block && ((len == 0 && flush != Z_FINISH) || flush == Z_NO_FLUSH || len != left + s->strm->avail_in))
break;
/* Make a dummy stored block in pending to get the header bytes,
* including any pending bits. This also updates the debugging counts.
*/
last = flush == Z_FINISH && len == left + s->strm->avail_in ? 1 : 0;
zng_tr_stored_block(s, (char *)0, 0L, last);
/* Replace the lengths in the dummy stored block with len. */
s->pending -= 4;
put_short(s, (uint16_t)len);
put_short(s, (uint16_t)~len);
/* Write the stored block header bytes. */
PREFIX(flush_pending)(s->strm);
/* Update debugging counts for the data about to be copied. */
cmpr_bits_add(s, len << 3);
sent_bits_add(s, len << 3);
/* Copy uncompressed bytes from the window to next_out. */
if (left) {
left = MIN(left, len);
memcpy(s->strm->next_out, s->window + s->block_start, left);
s->strm->next_out += left;
s->strm->avail_out -= left;
s->strm->total_out += left;
s->block_start += (int)left;
len -= left;
}
/* Copy uncompressed bytes directly from next_in to next_out, updating
* the check value.
*/
if (len) {
PREFIX(read_buf)(s->strm, s->strm->next_out, len);
s->strm->next_out += len;
s->strm->avail_out -= len;
s->strm->total_out += len;
}
} while (last == 0);
/* Update the sliding window with the last s->w_size bytes of the copied
* data, or append all of the copied data to the existing window if less
* than s->w_size bytes were copied. Also update the number of bytes to
* insert in the hash tables, in the event that deflateParams() switches to
* a non-zero compression level.
*/
used -= s->strm->avail_in; /* number of input bytes directly copied */
if (used) {
/* If any input was used, then no unused input remains in the window,
* therefore s->block_start == s->strstart.
*/
if (used >= s->w_size) { /* supplant the previous history */
s->matches = 2; /* clear hash */
memcpy(s->window, s->strm->next_in - s->w_size, s->w_size);
s->strstart = s->w_size;
s->insert = s->strstart;
} else {
if (s->window_size - s->strstart <= used) {
/* Slide the window down. */
s->strstart -= s->w_size;
memcpy(s->window, s->window + s->w_size, s->strstart);
if (s->matches < 2)
s->matches++; /* add a pending slide_hash() */
s->insert = MIN(s->insert, s->strstart);
}
memcpy(s->window + s->strstart, s->strm->next_in - used, used);
s->strstart += used;
s->insert += MIN(used, s->w_size - s->insert);
}
s->block_start = (int)s->strstart;
}
s->high_water = MAX(s->high_water, s->strstart);
/* If the last block was written to next_out, then done. */
if (last)
return finish_done;
/* If flushing and all input has been consumed, then done. */
if (flush != Z_NO_FLUSH && flush != Z_FINISH && s->strm->avail_in == 0 && (int)s->strstart == s->block_start)
return block_done;
/* Fill the window with any remaining input. */
have = s->window_size - s->strstart;
if (s->strm->avail_in > have && s->block_start >= (int)s->w_size) {
/* Slide the window down. */
s->block_start -= (int)s->w_size;
s->strstart -= s->w_size;
memcpy(s->window, s->window + s->w_size, s->strstart);
if (s->matches < 2)
s->matches++; /* add a pending slide_hash() */
have += s->w_size; /* more space now */
s->insert = MIN(s->insert, s->strstart);
}
have = MIN(have, s->strm->avail_in);
if (have) {
PREFIX(read_buf)(s->strm, s->window + s->strstart, have);
s->strstart += have;
s->insert += MIN(have, s->w_size - s->insert);
}
s->high_water = MAX(s->high_water, s->strstart);
/* There was not enough avail_out to write a complete worthy or flushed
* stored block to next_out. Write a stored block to pending instead, if we
* have enough input for a worthy block, or if flushing and there is enough
* room for the remaining input as a stored block in the pending buffer.
*/
have = (s->bi_valid + 42) >> 3; /* number of header bytes */
/* maximum stored block length that will fit in pending: */
have = MIN(s->pending_buf_size - have, MAX_STORED);
min_block = MIN(have, s->w_size);
left = (int)s->strstart - s->block_start;
if (left >= min_block || ((left || flush == Z_FINISH) && flush != Z_NO_FLUSH && s->strm->avail_in == 0 && left <= have)) {
len = MIN(left, have);
last = flush == Z_FINISH && s->strm->avail_in == 0 && len == left ? 1 : 0;
zng_tr_stored_block(s, (char *)s->window + s->block_start, len, last);
s->block_start += (int)len;
PREFIX(flush_pending)(s->strm);
}
/* We've done all we can with the available input and output. */
return last ? finish_started : need_more;
}
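/* Minimal usage sketch (illustrative, not part of zlib-ng; assumes the
 * zlib-compat API): level 0 (Z_NO_COMPRESSION) routes data through
 * deflate_stored() above, emitting raw stored blocks of at most
 * MAX_STORED (65535) bytes each.
 */
#include <string.h>
#include "zlib.h"
static int example_init_stored(z_stream *strm) {
    /* illustrative helper */
    memset(strm, 0, sizeof(*strm));
    return deflateInit(strm, Z_NO_COMPRESSION);
}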

@@ -0,0 +1,50 @@
#ifndef FALLBACK_BUILTINS_H
#define FALLBACK_BUILTINS_H
#if defined(_MSC_VER) && !defined(__clang__)
#if defined(_M_IX86) || defined(_M_AMD64) || defined(_M_IA64) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC)
#include <intrin.h>
#ifdef X86_FEATURES
# include "arch/x86/x86_features.h"
#endif
/* This is not a general purpose replacement for __builtin_ctz. The function expects that value is != 0.
* Because of that assumption trailing_zero is not initialized and the return value is not checked.
* Tzcnt and bsf give identical results except when the input value is 0, which is why a zero input cannot be allowed.
* If the tzcnt instruction is not supported, the CPU will execute bsf instead.
* Performance of tzcnt and bsf is identical on Intel CPUs; tzcnt is faster than bsf on AMD CPUs.
*/
static __forceinline int __builtin_ctz(unsigned int value) {
Assert(value != 0, "Invalid input value: 0");
# if defined(X86_FEATURES) && !(_MSC_VER < 1700)
return (int)_tzcnt_u32(value);
# else
unsigned long trailing_zero;
_BitScanForward(&trailing_zero, value);
return (int)trailing_zero;
# endif
}
#define HAVE_BUILTIN_CTZ
#ifdef _M_AMD64
/* This is not a general purpose replacement for __builtin_ctzll. The function expects that value is != 0.
* Because of that assumption trailing_zero is not initialized and the return value is not checked.
*/
static __forceinline int __builtin_ctzll(unsigned long long value) {
Assert(value != 0, "Invalid input value: 0");
# if defined(X86_FEATURES) && !(_MSC_VER < 1700)
return (int)_tzcnt_u64(value);
# else
unsigned long trailing_zero;
_BitScanForward64(&trailing_zero, value);
return (int)trailing_zero;
# endif
}
#define HAVE_BUILTIN_CTZLL
#endif // Microsoft AMD64
#endif // Microsoft AMD64/IA64/x86/ARM/ARM64 test
#endif // _MSC_VER & !clang
#endif // include guard FALLBACK_BUILTINS_H
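/* Minimal sketch (illustrative, not part of the header above): __builtin_ctz
 * returns the number of trailing zero bits and its argument must be non-zero,
 * e.g. __builtin_ctz(0x08) == 3. On little-endian targets the compare256 and
 * longest_match code uses it to turn a word-sized XOR mask into the index of
 * the first mismatching byte.
 */
static inline int example_first_diff_byte(unsigned int xor_mask) {
    /* illustrative helper; xor_mask must be non-zero */
    return __builtin_ctz(xor_mask) >> 3;
}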

@@ -0,0 +1,403 @@
/* functable.c -- Choose relevant optimized functions at runtime
* Copyright (C) 2017 Hans Kristian Rosbach
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "zendian.h"
#include "crc32_braid_p.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"
#include "cpu_features.h"
#if defined(_MSC_VER)
# include <intrin.h>
#endif
/* Platform has pointer size atomic store */
#if defined(__GNUC__) || defined(__clang__)
# define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
__atomic_store(&(functable.FUNC_NAME), &(VAR.FUNC_NAME), __ATOMIC_SEQ_CST)
# define FUNCTABLE_BARRIER() __atomic_thread_fence(__ATOMIC_SEQ_CST)
#elif defined(_MSC_VER)
# define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
_InterlockedExchangePointer((void * volatile *)&(functable.FUNC_NAME), (void *)(VAR.FUNC_NAME))
# if defined(_M_ARM) || defined(_M_ARM64)
# define FUNCTABLE_BARRIER() do { \
_ReadWriteBarrier(); \
__dmb(0xB); /* _ARM_BARRIER_ISH */ \
_ReadWriteBarrier(); \
} while (0)
# else
# define FUNCTABLE_BARRIER() _ReadWriteBarrier()
# endif
#else
# warning Unable to detect atomic intrinsic support.
# define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
*((void * volatile *)&(functable.FUNC_NAME)) = (void *)(VAR.FUNC_NAME)
# define FUNCTABLE_BARRIER() do { /* Empty */ } while (0)
#endif
static void force_init_empty(void) {
// empty
}
static void init_functable(void) {
struct functable_s ft;
struct cpu_features cf;
cpu_check_features(&cf);
// Generic code
ft.force_init = &force_init_empty;
ft.adler32 = &adler32_c;
ft.adler32_fold_copy = &adler32_fold_copy_c;
ft.chunkmemset_safe = &chunkmemset_safe_c;
ft.chunksize = &chunksize_c;
ft.crc32 = &PREFIX(crc32_braid);
ft.crc32_fold = &crc32_fold_c;
ft.crc32_fold_copy = &crc32_fold_copy_c;
ft.crc32_fold_final = &crc32_fold_final_c;
ft.crc32_fold_reset = &crc32_fold_reset_c;
ft.inflate_fast = &inflate_fast_c;
ft.insert_string = &insert_string_c;
ft.quick_insert_string = &quick_insert_string_c;
ft.slide_hash = &slide_hash_c;
ft.update_hash = &update_hash_c;
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
ft.longest_match = &longest_match_unaligned_64;
ft.longest_match_slow = &longest_match_slow_unaligned_64;
ft.compare256 = &compare256_unaligned_64;
# elif defined(HAVE_BUILTIN_CTZ)
ft.longest_match = &longest_match_unaligned_32;
ft.longest_match_slow = &longest_match_slow_unaligned_32;
ft.compare256 = &compare256_unaligned_32;
# else
ft.longest_match = &longest_match_unaligned_16;
ft.longest_match_slow = &longest_match_slow_unaligned_16;
ft.compare256 = &compare256_unaligned_16;
# endif
#else
ft.longest_match = &longest_match_c;
ft.longest_match_slow = &longest_match_slow_c;
ft.compare256 = &compare256_c;
#endif
// Select arch-optimized functions
// X86 - SSE2
#ifdef X86_SSE2
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
if (cf.x86.has_sse2)
# endif
{
ft.chunkmemset_safe = &chunkmemset_safe_sse2;
ft.chunksize = &chunksize_sse2;
ft.inflate_fast = &inflate_fast_sse2;
ft.slide_hash = &slide_hash_sse2;
# ifdef HAVE_BUILTIN_CTZ
ft.compare256 = &compare256_sse2;
ft.longest_match = &longest_match_sse2;
ft.longest_match_slow = &longest_match_slow_sse2;
# endif
}
#endif
// X86 - SSSE3
#ifdef X86_SSSE3
if (cf.x86.has_ssse3) {
ft.adler32 = &adler32_ssse3;
# ifdef X86_SSE2
ft.chunkmemset_safe = &chunkmemset_safe_ssse3;
ft.inflate_fast = &inflate_fast_ssse3;
# endif
}
#endif
// X86 - SSE4.2
#ifdef X86_SSE42
if (cf.x86.has_sse42) {
ft.adler32_fold_copy = &adler32_fold_copy_sse42;
ft.insert_string = &insert_string_sse42;
ft.quick_insert_string = &quick_insert_string_sse42;
ft.update_hash = &update_hash_sse42;
}
#endif
// X86 - PCLMUL
#ifdef X86_PCLMULQDQ_CRC
if (cf.x86.has_pclmulqdq) {
ft.crc32 = &crc32_pclmulqdq;
ft.crc32_fold = &crc32_fold_pclmulqdq;
ft.crc32_fold_copy = &crc32_fold_pclmulqdq_copy;
ft.crc32_fold_final = &crc32_fold_pclmulqdq_final;
ft.crc32_fold_reset = &crc32_fold_pclmulqdq_reset;
}
#endif
// X86 - AVX
#ifdef X86_AVX2
if (cf.x86.has_avx2) {
ft.adler32 = &adler32_avx2;
ft.adler32_fold_copy = &adler32_fold_copy_avx2;
ft.chunkmemset_safe = &chunkmemset_safe_avx2;
ft.chunksize = &chunksize_avx2;
ft.inflate_fast = &inflate_fast_avx2;
ft.slide_hash = &slide_hash_avx2;
# ifdef HAVE_BUILTIN_CTZ
ft.compare256 = &compare256_avx2;
ft.longest_match = &longest_match_avx2;
ft.longest_match_slow = &longest_match_slow_avx2;
# endif
}
#endif
#ifdef X86_AVX512
if (cf.x86.has_avx512) {
ft.adler32 = &adler32_avx512;
ft.adler32_fold_copy = &adler32_fold_copy_avx512;
}
#endif
#ifdef X86_AVX512VNNI
if (cf.x86.has_avx512vnni) {
ft.adler32 = &adler32_avx512_vnni;
ft.adler32_fold_copy = &adler32_fold_copy_avx512_vnni;
}
#endif
// X86 - VPCLMULQDQ
#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
if (cf.x86.has_pclmulqdq && cf.x86.has_avx512 && cf.x86.has_vpclmulqdq) {
ft.crc32 = &crc32_vpclmulqdq;
ft.crc32_fold = &crc32_fold_vpclmulqdq;
ft.crc32_fold_copy = &crc32_fold_vpclmulqdq_copy;
ft.crc32_fold_final = &crc32_fold_vpclmulqdq_final;
ft.crc32_fold_reset = &crc32_fold_vpclmulqdq_reset;
}
#endif
// ARM - SIMD
#ifdef ARM_SIMD
# ifndef ARM_NOCHECK_SIMD
if (cf.arm.has_simd)
# endif
{
ft.slide_hash = &slide_hash_armv6;
}
#endif
// ARM - NEON
#ifdef ARM_NEON
# ifndef ARM_NOCHECK_NEON
if (cf.arm.has_neon)
# endif
{
ft.adler32 = &adler32_neon;
ft.chunkmemset_safe = &chunkmemset_safe_neon;
ft.chunksize = &chunksize_neon;
ft.inflate_fast = &inflate_fast_neon;
ft.slide_hash = &slide_hash_neon;
# ifdef HAVE_BUILTIN_CTZLL
ft.compare256 = &compare256_neon;
ft.longest_match = &longest_match_neon;
ft.longest_match_slow = &longest_match_slow_neon;
# endif
}
#endif
// ARM - ACLE
#ifdef ARM_ACLE
if (cf.arm.has_crc32) {
ft.crc32 = &crc32_acle;
ft.insert_string = &insert_string_acle;
ft.quick_insert_string = &quick_insert_string_acle;
ft.update_hash = &update_hash_acle;
}
#endif
// Power - VMX
#ifdef PPC_VMX
if (cf.power.has_altivec) {
ft.adler32 = &adler32_vmx;
ft.slide_hash = &slide_hash_vmx;
}
#endif
// Power8 - VSX
#ifdef POWER8_VSX
if (cf.power.has_arch_2_07) {
ft.adler32 = &adler32_power8;
ft.chunkmemset_safe = &chunkmemset_safe_power8;
ft.chunksize = &chunksize_power8;
ft.inflate_fast = &inflate_fast_power8;
ft.slide_hash = &slide_hash_power8;
}
#endif
#ifdef POWER8_VSX_CRC32
if (cf.power.has_arch_2_07)
ft.crc32 = &crc32_power8;
#endif
// Power9
#ifdef POWER9
if (cf.power.has_arch_3_00) {
ft.compare256 = &compare256_power9;
ft.longest_match = &longest_match_power9;
ft.longest_match_slow = &longest_match_slow_power9;
}
#endif
// RISCV - RVV
#ifdef RISCV_RVV
if (cf.riscv.has_rvv) {
ft.adler32 = &adler32_rvv;
ft.adler32_fold_copy = &adler32_fold_copy_rvv;
ft.chunkmemset_safe = &chunkmemset_safe_rvv;
ft.chunksize = &chunksize_rvv;
ft.compare256 = &compare256_rvv;
ft.inflate_fast = &inflate_fast_rvv;
ft.longest_match = &longest_match_rvv;
ft.longest_match_slow = &longest_match_slow_rvv;
ft.slide_hash = &slide_hash_rvv;
}
#endif
// S390
#ifdef S390_CRC32_VX
if (cf.s390.has_vx)
ft.crc32 = crc32_s390_vx;
#endif
// Assign function pointers individually for atomic operation
FUNCTABLE_ASSIGN(ft, force_init);
FUNCTABLE_ASSIGN(ft, adler32);
FUNCTABLE_ASSIGN(ft, adler32_fold_copy);
FUNCTABLE_ASSIGN(ft, chunkmemset_safe);
FUNCTABLE_ASSIGN(ft, chunksize);
FUNCTABLE_ASSIGN(ft, compare256);
FUNCTABLE_ASSIGN(ft, crc32);
FUNCTABLE_ASSIGN(ft, crc32_fold);
FUNCTABLE_ASSIGN(ft, crc32_fold_copy);
FUNCTABLE_ASSIGN(ft, crc32_fold_final);
FUNCTABLE_ASSIGN(ft, crc32_fold_reset);
FUNCTABLE_ASSIGN(ft, inflate_fast);
FUNCTABLE_ASSIGN(ft, insert_string);
FUNCTABLE_ASSIGN(ft, longest_match);
FUNCTABLE_ASSIGN(ft, longest_match_slow);
FUNCTABLE_ASSIGN(ft, quick_insert_string);
FUNCTABLE_ASSIGN(ft, slide_hash);
FUNCTABLE_ASSIGN(ft, update_hash);
// Memory barrier for weak memory order CPUs
FUNCTABLE_BARRIER();
}
/* stub functions */
static void force_init_stub(void) {
init_functable();
}
static uint32_t adler32_stub(uint32_t adler, const uint8_t* buf, size_t len) {
init_functable();
return functable.adler32(adler, buf, len);
}
static uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t* dst, const uint8_t* src, size_t len) {
init_functable();
return functable.adler32_fold_copy(adler, dst, src, len);
}
static uint8_t* chunkmemset_safe_stub(uint8_t* out, unsigned dist, unsigned len, unsigned left) {
init_functable();
return functable.chunkmemset_safe(out, dist, len, left);
}
static uint32_t chunksize_stub(void) {
init_functable();
return functable.chunksize();
}
static uint32_t compare256_stub(const uint8_t* src0, const uint8_t* src1) {
init_functable();
return functable.compare256(src0, src1);
}
static uint32_t crc32_stub(uint32_t crc, const uint8_t* buf, size_t len) {
init_functable();
return functable.crc32(crc, buf, len);
}
static void crc32_fold_stub(crc32_fold* crc, const uint8_t* src, size_t len, uint32_t init_crc) {
init_functable();
functable.crc32_fold(crc, src, len, init_crc);
}
static void crc32_fold_copy_stub(crc32_fold* crc, uint8_t* dst, const uint8_t* src, size_t len) {
init_functable();
functable.crc32_fold_copy(crc, dst, src, len);
}
static uint32_t crc32_fold_final_stub(crc32_fold* crc) {
init_functable();
return functable.crc32_fold_final(crc);
}
static uint32_t crc32_fold_reset_stub(crc32_fold* crc) {
init_functable();
return functable.crc32_fold_reset(crc);
}
static void inflate_fast_stub(PREFIX3(stream) *strm, uint32_t start) {
init_functable();
functable.inflate_fast(strm, start);
}
static void insert_string_stub(deflate_state* const s, uint32_t str, uint32_t count) {
init_functable();
functable.insert_string(s, str, count);
}
static uint32_t longest_match_stub(deflate_state* const s, Pos cur_match) {
init_functable();
return functable.longest_match(s, cur_match);
}
static uint32_t longest_match_slow_stub(deflate_state* const s, Pos cur_match) {
init_functable();
return functable.longest_match_slow(s, cur_match);
}
static Pos quick_insert_string_stub(deflate_state* const s, const uint32_t str) {
init_functable();
return functable.quick_insert_string(s, str);
}
static void slide_hash_stub(deflate_state* s) {
init_functable();
functable.slide_hash(s);
}
static uint32_t update_hash_stub(deflate_state* const s, uint32_t h, uint32_t val) {
init_functable();
return functable.update_hash(s, h, val);
}
/* functable init */
Z_INTERNAL struct functable_s functable = {
force_init_stub,
adler32_stub,
adler32_fold_copy_stub,
chunkmemset_safe_stub,
chunksize_stub,
compare256_stub,
crc32_stub,
crc32_fold_stub,
crc32_fold_copy_stub,
crc32_fold_final_stub,
crc32_fold_reset_stub,
inflate_fast_stub,
insert_string_stub,
longest_match_stub,
longest_match_slow_stub,
quick_insert_string_stub,
slide_hash_stub,
update_hash_stub
};
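/* Minimal usage sketch (illustrative, not part of functable.c): callers always
 * go through the table; the very first call lands in one of the *_stub
 * functions above, which runs init_functable() and then forwards to the
 * implementation selected for the current CPU. Every later call dispatches
 * directly to the resolved pointer.
 */
static uint32_t example_checksum(const uint8_t *buf, size_t len) {
    /* illustrative helper */
    uint32_t crc = functable.crc32(0, buf, len);      /* may trigger init_functable() */
    uint32_t adler = functable.adler32(1, buf, len);  /* table is already resolved here */
    return crc ^ adler;                               /* arbitrary combination for the example */
}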

@@ -0,0 +1,42 @@
/* functable.h -- Struct containing function pointers to optimized functions
* Copyright (C) 2017 Hans Kristian Rosbach
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef FUNCTABLE_H_
#define FUNCTABLE_H_
#include "deflate.h"
#include "crc32_fold.h"
#include "adler32_fold.h"
#ifdef ZLIB_COMPAT
typedef struct z_stream_s z_stream;
#else
typedef struct zng_stream_s zng_stream;
#endif
struct functable_s {
void (* force_init) (void);
uint32_t (* adler32) (uint32_t adler, const uint8_t *buf, size_t len);
uint32_t (* adler32_fold_copy) (uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint8_t* (* chunkmemset_safe) (uint8_t *out, unsigned dist, unsigned len, unsigned left);
uint32_t (* chunksize) (void);
uint32_t (* compare256) (const uint8_t *src0, const uint8_t *src1);
uint32_t (* crc32) (uint32_t crc, const uint8_t *buf, size_t len);
void (* crc32_fold) (struct crc32_fold_s *crc, const uint8_t *src, size_t len, uint32_t init_crc);
void (* crc32_fold_copy) (struct crc32_fold_s *crc, uint8_t *dst, const uint8_t *src, size_t len);
uint32_t (* crc32_fold_final) (struct crc32_fold_s *crc);
uint32_t (* crc32_fold_reset) (struct crc32_fold_s *crc);
void (* inflate_fast) (PREFIX3(stream) *strm, uint32_t start);
void (* insert_string) (deflate_state *const s, uint32_t str, uint32_t count);
uint32_t (* longest_match) (deflate_state *const s, Pos cur_match);
uint32_t (* longest_match_slow) (deflate_state *const s, Pos cur_match);
Pos (* quick_insert_string)(deflate_state *const s, uint32_t str);
void (* slide_hash) (deflate_state *s);
uint32_t (* update_hash) (deflate_state *const s, uint32_t h, uint32_t val);
};
Z_INTERNAL extern struct functable_s functable;
#endif
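/* Minimal usage sketch (illustrative, not part of functable.h): force_init()
 * resolves every pointer in the table up front, e.g. once before spawning
 * threads that will use it.
 */
static inline void example_warm_up_functable(void) {
    functable.force_init();  /* becomes a no-op after the first call */
}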

@@ -0,0 +1,144 @@
#ifndef GZGUTS_H_
#define GZGUTS_H_
/* gzguts.h -- zlib internal header definitions for gz* operations
* Copyright (C) 2004-2019 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef _LARGEFILE64_SOURCE
# ifndef _LARGEFILE_SOURCE
# define _LARGEFILE_SOURCE 1
# endif
# undef _FILE_OFFSET_BITS
# undef _TIME_BITS
#endif
#if defined(HAVE_VISIBILITY_INTERNAL)
# define Z_INTERNAL __attribute__((visibility ("internal")))
#elif defined(HAVE_VISIBILITY_HIDDEN)
# define Z_INTERNAL __attribute__((visibility ("hidden")))
#else
# define Z_INTERNAL
#endif
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <limits.h>
#include <fcntl.h>
#if defined(ZLIB_COMPAT)
# include "zlib.h"
#else
# include "zlib-ng.h"
#endif
#ifdef _WIN32
# include <stddef.h>
#endif
#if defined(_WIN32)
# include <io.h>
# define WIDECHAR
#endif
#ifdef WINAPI_FAMILY
# define open _open
# define read _read
# define write _write
# define close _close
#endif
/* In Win32, vsnprintf is available as the "non-ANSI" _vsnprintf. */
#if !defined(STDC99) && !defined(__CYGWIN__) && !defined(__MINGW__) && defined(_WIN32)
# if !defined(vsnprintf)
# if !defined(_MSC_VER) || ( defined(_MSC_VER) && _MSC_VER < 1500 )
# define vsnprintf _vsnprintf
# endif
# endif
#endif
/* unlike snprintf (which is required in C99), _snprintf does not guarantee
null termination of the result -- however this is only used in gzlib.c
where the result is assured to fit in the space provided */
#if defined(_MSC_VER) && _MSC_VER < 1900
# define snprintf _snprintf
#endif
/* get errno and strerror definition */
#ifndef NO_STRERROR
# include <errno.h>
# define zstrerror() strerror(errno)
#else
# define zstrerror() "stdio error (consult errno)"
#endif
/* default memLevel */
#if MAX_MEM_LEVEL >= 8
# define DEF_MEM_LEVEL 8
#else
# define DEF_MEM_LEVEL MAX_MEM_LEVEL
#endif
/* default i/o buffer size -- double this for output when reading (this and
twice this must be able to fit in an unsigned type) */
#ifndef GZBUFSIZE
# define GZBUFSIZE 131072
#endif
/* gzip modes, also provide a little integrity check on the passed structure */
#define GZ_NONE 0
#define GZ_READ 7247
#define GZ_WRITE 31153
#define GZ_APPEND 1 /* mode set to GZ_WRITE after the file is opened */
/* values for gz_state how */
#define LOOK 0 /* look for a gzip header */
#define COPY 1 /* copy input directly */
#define GZIP 2 /* decompress a gzip stream */
/* internal gzip file state data structure */
typedef struct {
/* exposed contents for gzgetc() macro */
struct gzFile_s x; /* "x" for exposed */
/* x.have: number of bytes available at x.next */
/* x.next: next output data to deliver or write */
/* x.pos: current position in uncompressed data */
/* used for both reading and writing */
int mode; /* see gzip modes above */
int fd; /* file descriptor */
char *path; /* path or fd for error messages */
unsigned size; /* buffer size, zero if not allocated yet */
unsigned want; /* requested buffer size, default is GZBUFSIZE */
unsigned char *in; /* input buffer (double-sized when writing) */
unsigned char *out; /* output buffer (double-sized when reading) */
int direct; /* 0 if processing gzip, 1 if transparent */
/* just for reading */
int how; /* 0: get header, 1: copy, 2: decompress */
z_off64_t start; /* where the gzip data started, for rewinding */
int eof; /* true if end of input file reached */
int past; /* true if read requested past end */
/* just for writing */
int level; /* compression level */
int strategy; /* compression strategy */
int reset; /* true if a reset is pending after a Z_FINISH */
/* seek request */
z_off64_t skip; /* amount to skip (already rewound if backwards) */
int seek; /* true if seek request pending */
/* error information */
int err; /* error code */
char *msg; /* error message */
/* zlib inflate or deflate stream */
PREFIX3(stream) strm; /* stream structure in-place (not a pointer) */
} gz_state;
typedef gz_state *gz_statep;
/* shared functions */
void Z_INTERNAL gz_error(gz_state *, int, const char *);
/* GT_OFF(x), where x is an unsigned value, is true if x > maximum z_off64_t
value -- needed when comparing unsigned to z_off64_t, which is signed
(possible z_off64_t types off_t, off64_t, and long are all signed) */
#define GT_OFF(x) (sizeof(int) == sizeof(z_off64_t) && (x) > INT_MAX)
#endif /* GZGUTS_H_ */

@@ -0,0 +1,525 @@
/* gzlib.c -- zlib functions common to reading and writing gzip files
* Copyright (C) 2004-2019 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "zutil_p.h"
#include "gzguts.h"
#if defined(_WIN32)
# define LSEEK _lseeki64
#else
#if defined(_LARGEFILE64_SOURCE) && _LFS64_LARGEFILE-0
# define LSEEK lseek64
#else
# define LSEEK lseek
#endif
#endif
/* Local functions */
static void gz_reset(gz_state *);
static gzFile gz_open(const void *, int, const char *);
/* Reset gzip file state */
static void gz_reset(gz_state *state) {
state->x.have = 0; /* no output data available */
if (state->mode == GZ_READ) { /* for reading ... */
state->eof = 0; /* not at end of file */
state->past = 0; /* have not read past end yet */
state->how = LOOK; /* look for gzip header */
}
else /* for writing ... */
state->reset = 0; /* no deflateReset pending */
state->seek = 0; /* no seek request pending */
gz_error(state, Z_OK, NULL); /* clear error */
state->x.pos = 0; /* no uncompressed data yet */
state->strm.avail_in = 0; /* no input data yet */
}
/* Open a gzip file either by name or file descriptor. */
static gzFile gz_open(const void *path, int fd, const char *mode) {
gz_state *state;
size_t len;
int oflag;
#ifdef O_CLOEXEC
int cloexec = 0;
#endif
#ifdef O_EXCL
int exclusive = 0;
#endif
/* check input */
if (path == NULL)
return NULL;
/* allocate gzFile structure to return */
state = (gz_state *)zng_alloc(sizeof(gz_state));
if (state == NULL)
return NULL;
state->size = 0; /* no buffers allocated yet */
state->want = GZBUFSIZE; /* requested buffer size */
state->msg = NULL; /* no error message yet */
/* interpret mode */
state->mode = GZ_NONE;
state->level = Z_DEFAULT_COMPRESSION;
state->strategy = Z_DEFAULT_STRATEGY;
state->direct = 0;
while (*mode) {
if (*mode >= '0' && *mode <= '9') {
state->level = *mode - '0';
} else {
switch (*mode) {
case 'r':
state->mode = GZ_READ;
break;
#ifndef NO_GZCOMPRESS
case 'w':
state->mode = GZ_WRITE;
break;
case 'a':
state->mode = GZ_APPEND;
break;
#endif
case '+': /* can't read and write at the same time */
zng_free(state);
return NULL;
case 'b': /* ignore -- will request binary anyway */
break;
#ifdef O_CLOEXEC
case 'e':
cloexec = 1;
break;
#endif
#ifdef O_EXCL
case 'x':
exclusive = 1;
break;
#endif
case 'f':
state->strategy = Z_FILTERED;
break;
case 'h':
state->strategy = Z_HUFFMAN_ONLY;
break;
case 'R':
state->strategy = Z_RLE;
break;
case 'F':
state->strategy = Z_FIXED;
break;
case 'T':
state->direct = 1;
break;
default: /* could consider as an error, but just ignore */
{}
}
}
mode++;
}
/* must provide an "r", "w", or "a" */
if (state->mode == GZ_NONE) {
zng_free(state);
return NULL;
}
/* can't force transparent read */
if (state->mode == GZ_READ) {
if (state->direct) {
zng_free(state);
return NULL;
}
state->direct = 1; /* for empty file */
}
/* save the path name for error messages */
#ifdef WIDECHAR
if (fd == -2) {
len = wcstombs(NULL, (const wchar_t *)path, 0);
if (len == (size_t)-1)
len = 0;
} else
#endif
len = strlen((const char *)path);
state->path = (char *)malloc(len + 1);
if (state->path == NULL) {
zng_free(state);
return NULL;
}
#ifdef WIDECHAR
if (fd == -2)
if (len) {
wcstombs(state->path, (const wchar_t *)path, len + 1);
} else {
*(state->path) = 0;
}
else
#endif
(void)snprintf(state->path, len + 1, "%s", (const char *)path);
/* compute the flags for open() */
oflag =
#ifdef O_LARGEFILE
O_LARGEFILE |
#endif
#ifdef O_BINARY
O_BINARY |
#endif
#ifdef O_CLOEXEC
(cloexec ? O_CLOEXEC : 0) |
#endif
(state->mode == GZ_READ ?
O_RDONLY :
(O_WRONLY | O_CREAT |
#ifdef O_EXCL
(exclusive ? O_EXCL : 0) |
#endif
(state->mode == GZ_WRITE ?
O_TRUNC :
O_APPEND)));
/* open the file with the appropriate flags (or just use fd) */
state->fd = fd > -1 ? fd : (
#if defined(_WIN32)
fd == -2 ? _wopen((const wchar_t *)path, oflag, 0666) :
#elif __CYGWIN__
fd == -2 ? open(state->path, oflag, 0666) :
#endif
open((const char *)path, oflag, 0666));
if (state->fd == -1) {
free(state->path);
zng_free(state);
return NULL;
}
if (state->mode == GZ_APPEND) {
LSEEK(state->fd, 0, SEEK_END); /* so gzoffset() is correct */
state->mode = GZ_WRITE; /* simplify later checks */
}
/* save the current position for rewinding (only if reading) */
if (state->mode == GZ_READ) {
state->start = LSEEK(state->fd, 0, SEEK_CUR);
if (state->start == -1) state->start = 0;
}
/* initialize stream */
gz_reset(state);
/* return stream */
return (gzFile)state;
}
/* -- see zlib.h -- */
gzFile Z_EXPORT PREFIX(gzopen)(const char *path, const char *mode) {
return gz_open(path, -1, mode);
}
#ifdef ZLIB_COMPAT
gzFile Z_EXPORT PREFIX4(gzopen)(const char *path, const char *mode) {
return gz_open(path, -1, mode);
}
#endif
/* -- see zlib.h -- */
gzFile Z_EXPORT PREFIX(gzdopen)(int fd, const char *mode) {
char *path; /* identifier for error messages */
gzFile gz;
if (fd == -1 || (path = (char *)malloc(7 + 3 * sizeof(int))) == NULL)
return NULL;
(void)snprintf(path, 7 + 3 * sizeof(int), "<fd:%d>", fd); /* for debugging */
gz = gz_open(path, fd, mode);
free(path);
return gz;
}
/* -- see zlib.h -- */
#ifdef WIDECHAR
gzFile Z_EXPORT PREFIX(gzopen_w)(const wchar_t *path, const char *mode) {
return gz_open(path, -2, mode);
}
#endif
int Z_EXPORT PREFIX(gzclose)(gzFile file) {
#ifndef NO_GZCOMPRESS
gz_state *state;
if (file == NULL)
return Z_STREAM_ERROR;
state = (gz_state *)file;
return state->mode == GZ_READ ? PREFIX(gzclose_r)(file) : PREFIX(gzclose_w)(file);
#else
return PREFIX(gzclose_r)(file);
#endif
}
/* -- see zlib.h -- */
int Z_EXPORT PREFIX(gzbuffer)(gzFile file, unsigned size) {
gz_state *state;
/* get internal structure and check integrity */
if (file == NULL)
return -1;
state = (gz_state *)file;
if (state->mode != GZ_READ && state->mode != GZ_WRITE)
return -1;
/* make sure we haven't already allocated memory */
if (state->size != 0)
return -1;
/* check and set requested size */
if ((size << 1) < size)
return -1; /* need to be able to double it */
if (size < 8)
size = 8; /* needed to behave well with flushing */
state->want = size;
return 0;
}
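/* Minimal usage sketch (illustrative, not part of zlib-ng; assumes the
 * zlib-compat API): gzbuffer() must be called right after gzopen() and before
 * any I/O. The (size << 1) < size test above rejects sizes whose doubled
 * read-side buffer would overflow an unsigned int.
 */
static gzFile example_open_with_big_buffer(const char *path) {
    /* illustrative helper */
    gzFile f = gzopen(path, "rb");
    if (f != NULL)
        (void)gzbuffer(f, 128 * 1024);  /* fewer, larger read() calls */
    return f;
}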
/* -- see zlib.h -- */
int Z_EXPORT PREFIX(gzrewind)(gzFile file) {
gz_state *state;
/* get internal structure */
if (file == NULL)
return -1;
state = (gz_state *)file;
/* check that we're reading and that there's no error */
if (state->mode != GZ_READ || (state->err != Z_OK && state->err != Z_BUF_ERROR))
return -1;
/* back up and start over */
if (LSEEK(state->fd, state->start, SEEK_SET) == -1)
return -1;
gz_reset(state);
return 0;
}
/* -- see zlib.h -- */
z_off64_t Z_EXPORT PREFIX4(gzseek)(gzFile file, z_off64_t offset, int whence) {
unsigned n;
z_off64_t ret;
gz_state *state;
/* get internal structure and check integrity */
if (file == NULL)
return -1;
state = (gz_state *)file;
if (state->mode != GZ_READ && state->mode != GZ_WRITE)
return -1;
/* check that there's no error */
if (state->err != Z_OK && state->err != Z_BUF_ERROR)
return -1;
/* can only seek from start or relative to current position */
if (whence != SEEK_SET && whence != SEEK_CUR)
return -1;
/* normalize offset to a SEEK_CUR specification */
if (whence == SEEK_SET)
offset -= state->x.pos;
else if (state->seek)
offset += state->skip;
state->seek = 0;
/* if within raw area while reading, just go there */
if (state->mode == GZ_READ && state->how == COPY && state->x.pos + offset >= 0) {
ret = LSEEK(state->fd, offset - (z_off64_t)state->x.have, SEEK_CUR);
if (ret == -1)
return -1;
state->x.have = 0;
state->eof = 0;
state->past = 0;
state->seek = 0;
gz_error(state, Z_OK, NULL);
state->strm.avail_in = 0;
state->x.pos += offset;
return state->x.pos;
}
/* calculate skip amount, rewinding if needed for back seek when reading */
if (offset < 0) {
if (state->mode != GZ_READ) /* writing -- can't go backwards */
return -1;
offset += state->x.pos;
if (offset < 0) /* before start of file! */
return -1;
if (PREFIX(gzrewind)(file) == -1) /* rewind, then skip to offset */
return -1;
}
/* if reading, skip what's in output buffer (one less gzgetc() check) */
if (state->mode == GZ_READ) {
n = GT_OFF(state->x.have) || (z_off64_t)state->x.have > offset ? (unsigned)offset : state->x.have;
state->x.have -= n;
state->x.next += n;
state->x.pos += n;
offset -= n;
}
/* request skip (if not zero) */
if (offset) {
state->seek = 1;
state->skip = offset;
}
return state->x.pos + offset;
}
/* -- see zlib.h -- */
#ifdef ZLIB_COMPAT
z_off_t Z_EXPORT PREFIX(gzseek)(gzFile file, z_off_t offset, int whence) {
z_off64_t ret;
ret = PREFIX4(gzseek)(file, (z_off64_t)offset, whence);
return ret == (z_off_t)ret ? (z_off_t)ret : -1;
}
#endif
/* -- see zlib.h -- */
z_off64_t Z_EXPORT PREFIX4(gztell)(gzFile file) {
gz_state *state;
/* get internal structure and check integrity */
if (file == NULL)
return -1;
state = (gz_state *)file;
if (state->mode != GZ_READ && state->mode != GZ_WRITE)
return -1;
/* return position */
return state->x.pos + (state->seek ? state->skip : 0);
}
/* -- see zlib.h -- */
#ifdef ZLIB_COMPAT
z_off_t Z_EXPORT PREFIX(gztell)(gzFile file) {
z_off64_t ret;
ret = PREFIX4(gztell)(file);
return ret == (z_off_t)ret ? (z_off_t)ret : -1;
}
#endif
/* -- see zlib.h -- */
z_off64_t Z_EXPORT PREFIX4(gzoffset)(gzFile file) {
z_off64_t offset;
gz_state *state;
/* get internal structure and check integrity */
if (file == NULL)
return -1;
state = (gz_state *)file;
if (state->mode != GZ_READ && state->mode != GZ_WRITE)
return -1;
/* compute and return effective offset in file */
offset = LSEEK(state->fd, 0, SEEK_CUR);
if (offset == -1)
return -1;
if (state->mode == GZ_READ) /* reading */
offset -= state->strm.avail_in; /* don't count buffered input */
return offset;
}
/* -- see zlib.h -- */
#ifdef ZLIB_COMPAT
z_off_t Z_EXPORT PREFIX(gzoffset)(gzFile file) {
z_off64_t ret;
ret = PREFIX4(gzoffset)(file);
return ret == (z_off_t)ret ? (z_off_t)ret : -1;
}
#endif
/* -- see zlib.h -- */
int Z_EXPORT PREFIX(gzeof)(gzFile file) {
gz_state *state;
/* get internal structure and check integrity */
if (file == NULL)
return 0;
state = (gz_state *)file;
if (state->mode != GZ_READ && state->mode != GZ_WRITE)
return 0;
/* return end-of-file state */
return state->mode == GZ_READ ? state->past : 0;
}
/* -- see zlib.h -- */
const char * Z_EXPORT PREFIX(gzerror)(gzFile file, int *errnum) {
gz_state *state;
/* get internal structure and check integrity */
if (file == NULL)
return NULL;
state = (gz_state *)file;
if (state->mode != GZ_READ && state->mode != GZ_WRITE)
return NULL;
/* return error information */
if (errnum != NULL)
*errnum = state->err;
return state->err == Z_MEM_ERROR ? "out of memory" : (state->msg == NULL ? "" : state->msg);
}
/* -- see zlib.h -- */
void Z_EXPORT PREFIX(gzclearerr)(gzFile file) {
gz_state *state;
/* get internal structure and check integrity */
if (file == NULL)
return;
state = (gz_state *)file;
if (state->mode != GZ_READ && state->mode != GZ_WRITE)
return;
/* clear error and end-of-file */
if (state->mode == GZ_READ) {
state->eof = 0;
state->past = 0;
}
gz_error(state, Z_OK, NULL);
}
/* Create an error message in allocated memory and set state->err and
state->msg accordingly. Free any previous error message already there. Do
not try to free or allocate space if the error is Z_MEM_ERROR (out of
memory). Simply save the error message as a static string. If there is an
allocation failure constructing the error message, then convert the error to
out of memory. */
void Z_INTERNAL gz_error(gz_state *state, int err, const char *msg) {
/* free previously allocated message and clear */
if (state->msg != NULL) {
if (state->err != Z_MEM_ERROR)
free(state->msg);
state->msg = NULL;
}
/* if fatal, set state->x.have to 0 so that the gzgetc() macro fails */
if (err != Z_OK && err != Z_BUF_ERROR)
state->x.have = 0;
/* set error code, and if no message, then done */
state->err = err;
if (msg == NULL)
return;
/* for an out of memory error, return literal string when requested */
if (err == Z_MEM_ERROR)
return;
/* construct error message with path */
if ((state->msg = (char *)malloc(strlen(state->path) + strlen(msg) + 3)) == NULL) {
state->err = Z_MEM_ERROR;
return;
}
(void)snprintf(state->msg, strlen(state->path) + strlen(msg) + 3, "%s%s%s", state->path, ": ", msg);
}

@@ -0,0 +1,606 @@
/* gzread.c -- zlib functions for reading gzip files
* Copyright (C) 2004-2017 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "zutil_p.h"
#include "gzguts.h"
/* Local functions */
static int gz_load(gz_state *, unsigned char *, unsigned, unsigned *);
static int gz_avail(gz_state *);
static int gz_look(gz_state *);
static int gz_decomp(gz_state *);
static int gz_fetch(gz_state *);
static int gz_skip(gz_state *, z_off64_t);
static size_t gz_read(gz_state *, void *, size_t);
/* Use read() to load a buffer -- return -1 on error, otherwise 0. Read from
state->fd, and update state->eof, state->err, and state->msg as appropriate.
This function needs to loop on read(), since read() is not guaranteed to
read the number of bytes requested, depending on the type of descriptor. */
static int gz_load(gz_state *state, unsigned char *buf, unsigned len, unsigned *have) {
ssize_t ret;
*have = 0;
do {
ret = read(state->fd, buf + *have, len - *have);
if (ret <= 0)
break;
*have += (unsigned)ret;
} while (*have < len);
if (ret < 0) {
gz_error(state, Z_ERRNO, zstrerror());
return -1;
}
if (ret == 0)
state->eof = 1;
return 0;
}
/* Load up input buffer and set eof flag if last data loaded -- return -1 on
error, 0 otherwise. Note that the eof flag is set when the end of the input
file is reached, even though there may be unused data in the buffer. Once
that data has been used, no more attempts will be made to read the file.
If strm->avail_in != 0, then the current data is moved to the beginning of
the input buffer, and then the remainder of the buffer is loaded with the
available data from the input file. */
static int gz_avail(gz_state *state) {
unsigned got;
PREFIX3(stream) *strm = &(state->strm);
if (state->err != Z_OK && state->err != Z_BUF_ERROR)
return -1;
if (state->eof == 0) {
if (strm->avail_in) { /* copy what's there to the start */
unsigned char *p = state->in;
unsigned const char *q = strm->next_in;
unsigned n = strm->avail_in;
do {
*p++ = *q++;
} while (--n);
}
if (gz_load(state, state->in + strm->avail_in, state->size - strm->avail_in, &got) == -1)
return -1;
strm->avail_in += got;
strm->next_in = state->in;
}
return 0;
}
/* Look for gzip header, set up for inflate or copy. state->x.have must be 0.
If this is the first time in, allocate required memory. state->how will be
left unchanged if there is no more input data available, will be set to COPY
if there is no gzip header and direct copying will be performed, or it will
be set to GZIP for decompression. If direct copying, then leftover input
data from the input buffer will be copied to the output buffer. In that
case, all further file reads will be directly to either the output buffer or
a user buffer. If decompressing, the inflate state will be initialized.
gz_look() will return 0 on success or -1 on failure. */
static int gz_look(gz_state *state) {
PREFIX3(stream) *strm = &(state->strm);
/* allocate read buffers and inflate memory */
if (state->size == 0) {
/* allocate buffers */
state->in = (unsigned char *)zng_alloc(state->want);
state->out = (unsigned char *)zng_alloc(state->want << 1);
if (state->in == NULL || state->out == NULL) {
zng_free(state->out);
zng_free(state->in);
gz_error(state, Z_MEM_ERROR, "out of memory");
return -1;
}
state->size = state->want;
/* allocate inflate memory */
state->strm.zalloc = NULL;
state->strm.zfree = NULL;
state->strm.opaque = NULL;
state->strm.avail_in = 0;
state->strm.next_in = NULL;
if (PREFIX(inflateInit2)(&(state->strm), MAX_WBITS + 16) != Z_OK) { /* gunzip */
zng_free(state->out);
zng_free(state->in);
state->size = 0;
gz_error(state, Z_MEM_ERROR, "out of memory");
return -1;
}
}
/* get at least the magic bytes in the input buffer */
if (strm->avail_in < 2) {
if (gz_avail(state) == -1)
return -1;
if (strm->avail_in == 0)
return 0;
}
/* look for gzip magic bytes -- if there, do gzip decoding (note: there is
a logical dilemma here when considering the case of a partially written
gzip file, to wit, if a single 31 byte is written, then we cannot tell
whether this is a single-byte file, or just a partially written gzip
file -- for here we assume that if a gzip file is being written, then
the header will be written in a single operation, so that reading a
single byte is sufficient indication that it is not a gzip file) */
if (strm->avail_in > 1 &&
strm->next_in[0] == 31 && strm->next_in[1] == 139) {
PREFIX(inflateReset)(strm);
state->how = GZIP;
state->direct = 0;
return 0;
}
/* no gzip header -- if we were decoding gzip before, then this is trailing
garbage. Ignore the trailing garbage and finish. */
if (state->direct == 0) {
strm->avail_in = 0;
state->eof = 1;
state->x.have = 0;
return 0;
}
/* doing raw i/o, copy any leftover input to output -- this assumes that
the output buffer is larger than the input buffer, which also assures
space for gzungetc() */
state->x.next = state->out;
memcpy(state->x.next, strm->next_in, strm->avail_in);
state->x.have = strm->avail_in;
strm->avail_in = 0;
state->how = COPY;
state->direct = 1;
return 0;
}
/* Decompress from input to the provided next_out and avail_out in the state.
On return, state->x.have and state->x.next point to the just decompressed
data. If the gzip stream completes, state->how is reset to LOOK to look for
the next gzip stream or raw data, once state->x.have is depleted. Returns 0
on success, -1 on failure. */
static int gz_decomp(gz_state *state) {
int ret = Z_OK;
unsigned had;
PREFIX3(stream) *strm = &(state->strm);
/* fill output buffer up to end of deflate stream */
had = strm->avail_out;
do {
/* get more input for inflate() */
if (strm->avail_in == 0 && gz_avail(state) == -1)
return -1;
if (strm->avail_in == 0) {
gz_error(state, Z_BUF_ERROR, "unexpected end of file");
break;
}
/* decompress and handle errors */
ret = PREFIX(inflate)(strm, Z_NO_FLUSH);
if (ret == Z_STREAM_ERROR || ret == Z_NEED_DICT) {
gz_error(state, Z_STREAM_ERROR, "internal error: inflate stream corrupt");
return -1;
}
if (ret == Z_MEM_ERROR) {
gz_error(state, Z_MEM_ERROR, "out of memory");
return -1;
}
if (ret == Z_DATA_ERROR) { /* deflate stream invalid */
gz_error(state, Z_DATA_ERROR, strm->msg == NULL ? "compressed data error" : strm->msg);
return -1;
}
} while (strm->avail_out && ret != Z_STREAM_END);
/* update available output */
state->x.have = had - strm->avail_out;
state->x.next = strm->next_out - state->x.have;
/* if the gzip stream completed successfully, look for another */
if (ret == Z_STREAM_END)
state->how = LOOK;
/* good decompression */
return 0;
}
/* Fetch data and put it in the output buffer. Assumes state->x.have is 0.
Data is either copied from the input file or decompressed from the input
file depending on state->how. If state->how is LOOK, then a gzip header is
looked for to determine whether to copy or decompress. Returns -1 on error,
otherwise 0. gz_fetch() will leave state->how as COPY or GZIP unless the
end of the input file has been reached and all data has been processed. */
static int gz_fetch(gz_state *state) {
PREFIX3(stream) *strm = &(state->strm);
do {
switch (state->how) {
case LOOK: /* -> LOOK, COPY (only if never GZIP), or GZIP */
if (gz_look(state) == -1)
return -1;
if (state->how == LOOK)
return 0;
break;
case COPY: /* -> COPY */
if (gz_load(state, state->out, state->size << 1, &(state->x.have))
== -1)
return -1;
state->x.next = state->out;
return 0;
case GZIP: /* -> GZIP or LOOK (if end of gzip stream) */
strm->avail_out = state->size << 1;
strm->next_out = state->out;
if (gz_decomp(state) == -1)
return -1;
}
} while (state->x.have == 0 && (!state->eof || strm->avail_in));
return 0;
}
/* Skip len uncompressed bytes of output. Return -1 on error, 0 on success. */
static int gz_skip(gz_state *state, z_off64_t len) {
unsigned n;
/* skip over len bytes or reach end-of-file, whichever comes first */
while (len)
/* skip over whatever is in output buffer */
if (state->x.have) {
n = GT_OFF(state->x.have) || (z_off64_t)state->x.have > len ?
(unsigned)len : state->x.have;
state->x.have -= n;
state->x.next += n;
state->x.pos += n;
len -= n;
} else if (state->eof && state->strm.avail_in == 0) {
/* output buffer empty -- return if we're at the end of the input */
break;
} else {
/* need more data to skip -- load up output buffer */
/* get more output, looking for header if required */
if (gz_fetch(state) == -1)
return -1;
}
return 0;
}
/* Read len bytes into buf from file, or less than len up to the end of the
input. Return the number of bytes read. If zero is returned, either the
end of file was reached, or there was an error. state->err must be
consulted in that case to determine which. */
static size_t gz_read(gz_state *state, void *buf, size_t len) {
size_t got;
unsigned n;
/* if len is zero, avoid unnecessary operations */
if (len == 0)
return 0;
/* process a skip request */
if (state->seek) {
state->seek = 0;
if (gz_skip(state, state->skip) == -1)
return 0;
}
/* get len bytes to buf, or less than len if at the end */
got = 0;
do {
/* set n to the maximum amount of len that fits in an unsigned int */
n = (unsigned)-1;
if (n > len)
n = (unsigned)len;
/* first just try copying data from the output buffer */
if (state->x.have) {
if (state->x.have < n)
n = state->x.have;
memcpy(buf, state->x.next, n);
state->x.next += n;
state->x.have -= n;
}
/* output buffer empty -- return if we're at the end of the input */
else if (state->eof && state->strm.avail_in == 0) {
state->past = 1; /* tried to read past end */
break;
}
/* need output data -- for small len or new stream load up our output
buffer */
else if (state->how == LOOK || n < (state->size << 1)) {
/* get more output, looking for header if required */
if (gz_fetch(state) == -1)
return 0;
continue; /* no progress yet -- go back to copy above */
/* the copy above assures that we will leave with space in the
output buffer, allowing at least one gzungetc() to succeed */
}
/* large len -- read directly into user buffer */
else if (state->how == COPY) { /* read directly */
if (gz_load(state, (unsigned char *)buf, n, &n) == -1)
return 0;
}
/* large len -- decompress directly into user buffer */
else { /* state->how == GZIP */
state->strm.avail_out = n;
state->strm.next_out = (unsigned char *)buf;
if (gz_decomp(state) == -1)
return 0;
n = state->x.have;
state->x.have = 0;
}
/* update progress */
len -= n;
buf = (char *)buf + n;
got += n;
state->x.pos += n;
} while (len);
/* return number of bytes read into user buffer */
return got;
}
/* -- see zlib.h -- */
int Z_EXPORT PREFIX(gzread)(gzFile file, void *buf, unsigned len) {
gz_state *state;
/* get internal structure */
if (file == NULL)
return -1;
state = (gz_state *)file;
/* check that we're reading and that there's no (serious) error */
if (state->mode != GZ_READ ||
(state->err != Z_OK && state->err != Z_BUF_ERROR))
return -1;
/* since an int is returned, make sure len fits in one, otherwise return
with an error (this avoids a flaw in the interface) */
if ((int)len < 0) {
gz_error(state, Z_STREAM_ERROR, "request does not fit in an int");
return -1;
}
/* read len or fewer bytes to buf */
len = (unsigned)gz_read(state, buf, len);
/* check for an error */
if (len == 0 && state->err != Z_OK && state->err != Z_BUF_ERROR)
return -1;
/* return the number of bytes read (this is assured to fit in an int) */
return (int)len;
}
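/* A minimal usage sketch, assuming the zlib-compat entry points gzopen(),
   gzerror() and gzclose_r() from zlib.h; the helper name and chunk size are
   illustrative only.  It reads a gzip file with gzread() and returns the
   number of uncompressed bytes, or -1 on error. */
#include <stdio.h>
#include <zlib.h>
static long count_uncompressed_bytes(const char *path) {
    unsigned char chunk[16384];
    long total = 0;
    int n;
    gzFile f = gzopen(path, "rb");
    if (f == NULL)
        return -1;
    while ((n = gzread(f, chunk, (unsigned)sizeof(chunk))) > 0)
        total += n;
    if (n < 0) {                      /* 0 means end of file, < 0 means error */
        int errnum;
        fprintf(stderr, "gzread: %s\n", gzerror(f, &errnum));
        total = -1;
    }
    gzclose_r(f);                     /* read-side close, defined below */
    return total;
}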
/* -- see zlib.h -- */
size_t Z_EXPORT PREFIX(gzfread)(void *buf, size_t size, size_t nitems, gzFile file) {
size_t len;
gz_state *state;
/* Exit early if size is zero, also prevents potential division by zero */
if (size == 0)
return 0;
/* get internal structure */
if (file == NULL)
return 0;
state = (gz_state *)file;
/* check that we're reading and that there's no (serious) error */
if (state->mode != GZ_READ ||
(state->err != Z_OK && state->err != Z_BUF_ERROR))
return 0;
/* compute bytes to read -- error on overflow */
if (size && SIZE_MAX / size < nitems) {
gz_error(state, Z_STREAM_ERROR, "request does not fit in a size_t");
return 0;
}
len = nitems * size;
/* read len or fewer bytes to buf, return the number of full items read */
return len ? gz_read(state, buf, len) / size : 0;
}
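/* Usage sketch for gzfread(): like fread(), it reads whole items, so a
   stream of fixed-size records can be consumed directly.  The record type
   and helper name here are hypothetical; only full items are counted, and a
   short return means end of file or error (see gzeof()/gzerror()). */
#include <zlib.h>
struct sample { int id; float value; };   /* hypothetical record layout */
static size_t read_samples(gzFile f, struct sample *out, size_t count) {
    return gzfread(out, sizeof(struct sample), count, f);
}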
/* -- see zlib.h -- */
#undef @ZLIB_SYMBOL_PREFIX@gzgetc
#undef @ZLIB_SYMBOL_PREFIX@zng_gzgetc
int Z_EXPORT PREFIX(gzgetc)(gzFile file) {
unsigned char buf[1];
gz_state *state;
/* get internal structure */
if (file == NULL)
return -1;
state = (gz_state *)file;
/* check that we're reading and that there's no (serious) error */
if (state->mode != GZ_READ || (state->err != Z_OK && state->err != Z_BUF_ERROR))
return -1;
/* try output buffer (no need to check for skip request) */
if (state->x.have) {
state->x.have--;
state->x.pos++;
return *(state->x.next)++;
}
/* nothing there -- try gz_read() */
return gz_read(state, buf, 1) < 1 ? -1 : buf[0];
}
#ifdef ZLIB_COMPAT
int Z_EXPORT PREFIX(gzgetc_)(gzFile file) {
return PREFIX(gzgetc)(file);
}
#endif
/* -- see zlib.h -- */
int Z_EXPORT PREFIX(gzungetc)(int c, gzFile file) {
gz_state *state;
/* get internal structure */
if (file == NULL)
return -1;
state = (gz_state *)file;
/* in case this was just opened, set up the input buffer */
if (state->mode == GZ_READ && state->how == LOOK && state->x.have == 0)
(void)gz_look(state);
/* check that we're reading and that there's no (serious) error */
if (state->mode != GZ_READ || (state->err != Z_OK && state->err != Z_BUF_ERROR))
return -1;
/* process a skip request */
if (state->seek) {
state->seek = 0;
if (gz_skip(state, state->skip) == -1)
return -1;
}
/* can't push EOF */
if (c < 0)
return -1;
/* if output buffer empty, put byte at end (allows more pushing) */
if (state->x.have == 0) {
state->x.have = 1;
state->x.next = state->out + (state->size << 1) - 1;
state->x.next[0] = (unsigned char)c;
state->x.pos--;
state->past = 0;
return c;
}
/* if no room, give up (must have already done a gzungetc()) */
if (state->x.have == (state->size << 1)) {
gz_error(state, Z_DATA_ERROR, "out of room to push characters");
return -1;
}
/* slide output data if needed and insert byte before existing data */
if (state->x.next == state->out) {
unsigned char *src = state->out + state->x.have;
unsigned char *dest = state->out + (state->size << 1);
while (src > state->out)
*--dest = *--src;
state->x.next = dest;
}
state->x.have++;
state->x.next--;
state->x.next[0] = (unsigned char)c;
state->x.pos--;
state->past = 0;
return c;
}
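/* Usage sketch: pairing gzgetc() with gzungetc() gives one byte of
   lookahead on the uncompressed stream; the pushed-back byte is served from
   the output buffer managed above.  The helper name is illustrative. */
#include <zlib.h>
static int peek_byte(gzFile f) {
    int c = gzgetc(f);
    if (c != -1)
        c = gzungetc(c, f);   /* returns c on success, -1 if it cannot push */
    return c;                 /* -1 on end of file or error */
}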
/* -- see zlib.h -- */
char * Z_EXPORT PREFIX(gzgets)(gzFile file, char *buf, int len) {
unsigned left, n;
char *str;
unsigned char *eol;
gz_state *state;
/* check parameters and get internal structure */
if (file == NULL || buf == NULL || len < 1)
return NULL;
state = (gz_state *)file;
/* check that we're reading and that there's no (serious) error */
if (state->mode != GZ_READ || (state->err != Z_OK && state->err != Z_BUF_ERROR))
return NULL;
/* process a skip request */
if (state->seek) {
state->seek = 0;
if (gz_skip(state, state->skip) == -1)
return NULL;
}
/* copy output bytes up to new line or len - 1, whichever comes first --
append a terminating zero to the string (we don't check for a zero in
the contents, let the user worry about that) */
str = buf;
left = (unsigned)len - 1;
if (left) {
do {
/* assure that something is in the output buffer */
if (state->x.have == 0 && gz_fetch(state) == -1)
return NULL; /* error */
if (state->x.have == 0) { /* end of file */
state->past = 1; /* read past end */
break; /* return what we have */
}
/* look for end-of-line in current output buffer */
n = state->x.have > left ? left : state->x.have;
eol = (unsigned char *)memchr(state->x.next, '\n', n);
if (eol != NULL)
n = (unsigned)(eol - state->x.next) + 1;
/* copy through end-of-line, or remainder if not found */
memcpy(buf, state->x.next, n);
state->x.have -= n;
state->x.next += n;
state->x.pos += n;
left -= n;
buf += n;
} while (left && eol == NULL);
}
/* return terminated string, or if nothing, end of file */
if (buf == str)
return NULL;
buf[0] = 0;
return str;
}
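/* Usage sketch: gzgets() behaves like fgets(), so a gzip-compressed text
   file can be walked line by line.  Lines longer than the buffer simply
   come back in pieces; the helper name is illustrative. */
#include <stdio.h>
#include <zlib.h>
static void print_lines(gzFile f) {
    char line[1024];
    while (gzgets(f, line, (int)sizeof(line)) != NULL)
        fputs(line, stdout);   /* each piece is already NUL-terminated */
}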
/* -- see zlib.h -- */
int Z_EXPORT PREFIX(gzdirect)(gzFile file) {
gz_state *state;
/* get internal structure */
if (file == NULL)
return 0;
state = (gz_state *)file;
/* if the state is not known, but we can find out, then do so (this is
mainly for right after a gzopen() or gzdopen()) */
if (state->mode == GZ_READ && state->how == LOOK && state->x.have == 0)
(void)gz_look(state);
/* return 1 if transparent, 0 if processing a gzip stream */
return state->direct;
}
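/* Usage sketch: gzdirect() can be queried right after opening to learn
   whether the input carries a gzip header (returns 0) or is being copied
   through as raw data (returns 1); the probe may trigger gz_look() above.
   The helper name is illustrative; gzopen()/gzclose_r() are the zlib-compat
   open and read-side close calls. */
#include <stdio.h>
#include <zlib.h>
static void report_kind(const char *path) {
    gzFile f = gzopen(path, "rb");
    if (f == NULL)
        return;
    printf("%s: %s\n", path, gzdirect(f) ? "raw copy" : "gzip stream");
    gzclose_r(f);
}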
/* -- see zlib.h -- */
int Z_EXPORT PREFIX(gzclose_r)(gzFile file) {
int ret, err;
gz_state *state;
/* get internal structure */
if (file == NULL)
return Z_STREAM_ERROR;
state = (gz_state *)file;
/* check that we're reading */
if (state->mode != GZ_READ)
return Z_STREAM_ERROR;
/* free memory and close file */
if (state->size) {
PREFIX(inflateEnd)(&(state->strm));
zng_free(state->out);
zng_free(state->in);
}
err = state->err == Z_BUF_ERROR ? Z_BUF_ERROR : Z_OK;
gz_error(state, Z_OK, NULL);
free(state->path);
ret = close(state->fd);
zng_free(state);
return ret ? Z_ERRNO : err;
}
