mirror of https://github.com/opencv/opencv.git
Zlib-ng is a zlib replacement with optimizations for "next generation" systems. Its optimizations may benefit the decode and encode speed of image libraries such as libpng. In our tests, using the zlib-ng + libpng combination on an x86_64 machine with AVX2, the time spent in `imdecode` and `imencode` drops by roughly 20%. This patch enables zlib-ng's optimizations when `CV_DISABLE_OPTIMIZATION` is OFF. Since zlib-ng can dispatch intrinsics on the fly, the porting work is much easier. Related discussion: https://github.com/opencv/opencv/issues/22573
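The ~20% figure is workload-dependent. A minimal round-trip timing sketch at the zlib API level, assuming a zlib-compat build of zlib-ng is linked in place of zlib (the buffer size and fill pattern are illustrative, not OpenCV's benchmark):

```c
/* Rough zlib round-trip timing; link against zlib-ng built with ZLIB_COMPAT. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <zlib.h>

int main(void) {
    const uLong n = 16UL * 1024 * 1024;
    unsigned char *src  = malloc(n);
    uLongf cap = compressBound(n);
    unsigned char *comp = malloc(cap);
    unsigned char *back = malloc(n);
    if (!src || !comp || !back) return 1;
    for (uLong i = 0; i < n; i++)              /* mildly compressible filler */
        src[i] = (unsigned char)(i * 31 % 251);

    clock_t t0 = clock();
    uLongf clen = cap;
    if (compress2(comp, &clen, src, n, Z_DEFAULT_COMPRESSION) != Z_OK) return 1;
    uLongf dlen = n;
    if (uncompress(back, &dlen, comp, clen) != Z_OK) return 1;
    clock_t t1 = clock();

    printf("round trip: %.3f s (%lu -> %lu bytes)\n",
           (double)(t1 - t0) / CLOCKS_PER_SEC, (unsigned long)n, (unsigned long)clen);
    return memcmp(src, back, n) != 0;
}
```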
parent
e80b7940ef
commit
0de26fd78e
129 changed files with 31910 additions and 13 deletions
@@ -0,0 +1,796 @@
||||
project(${ZLIB_LIBRARY} LANGUAGES C) |
||||
|
||||
if("c_std_11" IN_LIST CMAKE_C_COMPILE_FEATURES) |
||||
set(CMAKE_C_STANDARD 11) # The C standard whose features are requested to build this target |
||||
else() |
||||
set(CMAKE_C_STANDARD 99) |
||||
endif() |
||||
set(CMAKE_C_STANDARD_REQUIRED ON) # Boolean describing whether the value of C_STANDARD is a requirement |
||||
set(CMAKE_C_EXTENSIONS OFF) # Boolean specifying whether compiler specific extensions are requested |
||||
|
||||
include(CheckTypeSize) |
||||
include(CheckSymbolExists) |
||||
include(CheckFunctionExists) |
||||
include(CheckIncludeFile) |
||||
include(CheckCSourceCompiles) |
||||
include(CheckCSourceRuns) |
||||
include(CheckCCompilerFlag) |
||||
include(CMakeDependentOption) |
||||
|
||||
if(X86_64 OR X86) |
||||
set(BASEARCH_X86_FOUND TRUE) |
||||
endif() |
||||
if(AARCH64 OR ARM) |
||||
set(BASEARCH_ARM_FOUND TRUE) |
||||
endif() |
||||
if(PPC64LE OR PPC64) |
||||
set(BASEARCH_PPC_FOUND TRUE) |
||||
endif() |
||||
if(RISCV) |
||||
set(BASEARCH_RISCV_FOUND TRUE) |
||||
endif() |
||||
|
||||
include(cmake/detect-intrinsics.cmake) |
||||
include(cmake/fallback-macros.cmake) |
||||
|
||||
set(ZLIB_SYMBOL_PREFIX "") |
||||
|
||||
if(BASEARCH_X86_FOUND) |
||||
set(WITH_AVX2 ON) |
||||
set(WITH_AVX512 ON) |
||||
set(WITH_AVX512VNNI ON) |
||||
set(WITH_SSE2 ON) |
||||
set(WITH_SSSE3 ON) |
||||
set(WITH_SSE42 ON) |
||||
set(WITH_PCLMULQDQ ON) |
||||
set(WITH_VPCLMULQDQ ON) |
||||
endif() |
||||
if(BASEARCH_ARM_FOUND) |
||||
set(WITH_ACLE ON) |
||||
set(WITH_NEON ON) |
||||
if(ARM) |
||||
set(WITH_ARMV6 ON) |
||||
else() |
||||
set(WITH_ARMV6 OFF) |
||||
endif() |
||||
endif() |
||||
if(BASEARCH_PPC_FOUND) |
||||
set(WITH_ALTIVEC ON) |
||||
set(WITH_POWER8 ON) |
||||
set(WITH_POWER9 ON) |
||||
endif() |
||||
if(BASEARCH_RISCV_FOUND) |
||||
set(WITH_RVV ON) |
||||
endif() |
||||
|
||||
|
||||
add_definitions(-DZLIB_COMPAT) |
||||
|
||||
add_definitions(-DWITH_GZFILEOP) |
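# ZLIB_COMPAT builds zlib-ng with the classic zlib API so it can serve as a
# drop-in zlib; WITH_GZFILEOP keeps the gzFile-related functions available.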
||||
|
||||
if(CMAKE_C_COMPILER_ID MATCHES "^Intel") |
||||
set(WARNFLAGS_DISABLE) |
||||
elseif(MSVC) |
||||
# Minimum supported MSVC version is 1800 = Visual Studio 12.0/2013 |
||||
# See also https://cmake.org/cmake/help/latest/variable/MSVC_VERSION.html |
||||
if(MSVC_VERSION VERSION_LESS 1800) |
||||
message(SEND_ERROR "Unsupported Visual Studio compiler version (requires 2013 or later).") |
||||
endif() |
||||
# TODO. ICC can be used through MSVC. I'm not sure if we'd ever see that combination |
||||
# (who'd use cmake from an IDE...) but checking for ICC before checking for MSVC should |
||||
# avoid mistakes. |
||||
# /Oi ? |
||||
set(WARNFLAGS_DISABLE) |
||||
if(BASEARCH_ARM_FOUND) |
||||
add_definitions(-D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE) |
||||
if(NOT "${ARCH}" MATCHES "aarch64") |
||||
set(NEONFLAG "/arch:VFPv4") |
||||
endif() |
||||
endif() |
||||
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") |
||||
set(WARNFLAGS_DISABLE) |
||||
# Check whether -fno-lto is available |
||||
set(CMAKE_REQUIRED_FLAGS "-fno-lto") |
||||
check_c_source_compiles( |
||||
"int main() { return 0; }" |
||||
FNO_LTO_AVAILABLE FAIL_REGEX "not supported") |
||||
set(CMAKE_REQUIRED_FLAGS) |
||||
if(FNO_LTO_AVAILABLE) |
||||
set(ZNOLTOFLAG "-fno-lto") |
||||
endif() |
||||
if(BASEARCH_ARM_FOUND) |
||||
if(ARM AND NOT CMAKE_C_FLAGS MATCHES "-mfloat-abi") |
||||
# Auto-detect support for ARM floating point ABI |
||||
check_include_file(features.h HAVE_FEATURES_H) |
||||
if(HAVE_FEATURES_H) |
||||
set(CMAKE_REQUIRED_FLAGS -mfloat-abi=softfp) |
||||
check_c_source_compiles( |
||||
"#include <features.h> |
||||
int main() { return 0; }" |
||||
HAVE_FLOATABI_SOFTFP) |
||||
if(HAVE_FLOATABI_SOFTFP) |
||||
set(FLOATABI -mfloat-abi=softfp) |
||||
else() |
||||
set(CMAKE_REQUIRED_FLAGS -mfloat-abi=hard) |
||||
check_c_source_compiles( |
||||
"#include <features.h> |
||||
int main() { return 0; }" |
||||
HAVE_FLOATABI_HARD) |
||||
if(HAVE_FLOATABI_HARD) |
||||
set(FLOATABI -mfloat-abi=hard) |
||||
endif() |
||||
endif() |
||||
set(CMAKE_REQUIRED_FLAGS) |
||||
endif() |
||||
if(FLOATABI) |
||||
message(STATUS "${ZLIB_LIBRARY} ARM floating point arch: ${FLOATABI}") |
||||
add_compile_options(${FLOATABI}) |
||||
else() |
||||
message(STATUS "${ZLIB_LIBRARY} ARM floating point arch not auto-detected") |
||||
endif() |
||||
endif() |
||||
endif() |
||||
if(FNO_LTO_AVAILABLE) |
||||
set(NOLTOFLAG ${ZNOLTOFLAG}) |
||||
endif() |
||||
if(MINGW) |
||||
# Add `-Wno-pedantic-ms-format` only if the toolchain supports it |
||||
check_c_compiler_flag(-Wno-pedantic-ms-format HAVE_NO_PEDANTIC_MS_FORMAT) |
||||
if(HAVE_NO_PEDANTIC_MS_FORMAT) |
||||
list(APPEND WARNFLAGS_DISABLE -Wno-pedantic-ms-format) |
||||
endif() |
||||
endif() |
||||
endif() |
||||
|
||||
# Force disable LTO |
||||
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF) |
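# (The arch-specific sources below are compiled with per-file ISA flags; under
# LTO such code could be inlined into generic paths and fault on older CPUs.)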
||||
|
||||
# Apply warning compiler flags |
||||
add_compile_options(${WARNFLAGS_DISABLE}) |
||||
|
||||
# Replace optimization level 3 added by default with level 2 |
||||
if(NOT MSVC AND NOT CMAKE_C_FLAGS MATCHES "([\\/\\-]O)3") |
||||
string(REGEX REPLACE "([\\/\\-]O)3" "\\12" |
||||
CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}") |
||||
endif() |
||||
|
||||
# |
||||
# Check for standard/system includes |
||||
# |
||||
check_include_file(arm_acle.h HAVE_ARM_ACLE_H) |
||||
if(HAVE_ARM_ACLE_H) |
||||
add_definitions(-DHAVE_ARM_ACLE_H) |
||||
endif() |
||||
check_include_file(sys/auxv.h HAVE_SYS_AUXV_H) |
||||
if(HAVE_SYS_AUXV_H) |
||||
add_definitions(-DHAVE_SYS_AUXV_H) |
||||
endif() |
||||
check_include_file(sys/sdt.h HAVE_SYS_SDT_H) |
||||
if(HAVE_SYS_SDT_H) |
||||
add_definitions(-DHAVE_SYS_SDT_H) |
||||
endif() |
||||
check_include_file(unistd.h HAVE_UNISTD_H) |
||||
|
||||
# |
||||
# Check to see if we have large file support |
||||
# |
||||
set(CMAKE_REQUIRED_DEFINITIONS -D_LARGEFILE64_SOURCE=1 -D__USE_LARGEFILE64) |
||||
check_type_size(off64_t OFF64_T) |
||||
if(HAVE_OFF64_T) |
||||
add_definitions(-D_LARGEFILE64_SOURCE=1 -D__USE_LARGEFILE64) |
||||
else() |
||||
check_type_size(_off64_t _OFF64_T) |
||||
if(HAVE__OFF64_T) |
||||
add_definitions(-D_LARGEFILE64_SOURCE=1 -D__USE_LARGEFILE64) |
||||
else() |
||||
check_type_size(__off64_t __OFF64_T) |
||||
endif() |
||||
endif() |
||||
set(CMAKE_REQUIRED_DEFINITIONS) # clear variable |
||||
|
||||
# |
||||
# Check for fseeko and other optional functions |
||||
# |
||||
check_function_exists(fseeko HAVE_FSEEKO) |
||||
if(NOT HAVE_FSEEKO) |
||||
add_definitions(-DNO_FSEEKO) |
||||
endif() |
||||
|
||||
check_function_exists(strerror HAVE_STRERROR) |
||||
if(NOT HAVE_STRERROR) |
||||
add_definitions(-DNO_STRERROR) |
||||
endif() |
||||
|
||||
set(CMAKE_REQUIRED_DEFINITIONS -D_POSIX_C_SOURCE=200112L) |
||||
check_symbol_exists(posix_memalign stdlib.h HAVE_POSIX_MEMALIGN) |
||||
if(HAVE_POSIX_MEMALIGN) |
||||
add_definitions(-DHAVE_POSIX_MEMALIGN) |
||||
endif() |
||||
set(CMAKE_REQUIRED_DEFINITIONS) |
||||
|
||||
set(CMAKE_REQUIRED_DEFINITIONS -D_ISOC11_SOURCE=1) |
||||
check_symbol_exists(aligned_alloc stdlib.h HAVE_ALIGNED_ALLOC) |
||||
if(HAVE_ALIGNED_ALLOC) |
||||
add_definitions(-DHAVE_ALIGNED_ALLOC) |
||||
endif() |
||||
set(CMAKE_REQUIRED_DEFINITIONS) |
||||
|
||||
# |
||||
# Check if we can hide zlib internal symbols that are linked between separate source files using the "hidden" visibility attribute
||||
# |
||||
check_c_source_compiles( |
||||
"#define Z_INTERNAL __attribute__((visibility (\"hidden\"))) |
||||
int Z_INTERNAL foo; |
||||
int main() { |
||||
return 0; |
||||
}" |
||||
HAVE_ATTRIBUTE_VISIBILITY_HIDDEN FAIL_REGEX "visibility") |
||||
if(HAVE_ATTRIBUTE_VISIBILITY_HIDDEN) |
||||
add_definitions(-DHAVE_VISIBILITY_HIDDEN) |
||||
endif() |
||||
|
||||
# |
||||
# Check if we can hide zlib internal symbols that are linked between separate source files using the "internal" visibility attribute
||||
# |
||||
check_c_source_compiles( |
||||
"#define Z_INTERNAL __attribute__((visibility (\"internal\"))) |
||||
int Z_INTERNAL foo; |
||||
int main() { |
||||
return 0; |
||||
}" |
||||
HAVE_ATTRIBUTE_VISIBILITY_INTERNAL FAIL_REGEX "visibility") |
||||
if(HAVE_ATTRIBUTE_VISIBILITY_INTERNAL) |
||||
add_definitions(-DHAVE_VISIBILITY_INTERNAL) |
||||
endif() |
||||
|
||||
# |
||||
# Check for __attribute__((aligned(x))) support in the compiler |
||||
# |
||||
check_c_source_compiles( |
||||
"int main(void) { |
||||
__attribute__((aligned(8))) int test = 0; |
||||
(void)test; |
||||
return 0; |
||||
}" |
||||
HAVE_ATTRIBUTE_ALIGNED FAIL_REGEX "aligned") |
||||
if(HAVE_ATTRIBUTE_ALIGNED) |
||||
add_definitions(-DHAVE_ATTRIBUTE_ALIGNED) |
||||
endif() |
||||
|
||||
# |
||||
# check for __builtin_ctz() support in the compiler |
||||
# |
||||
check_c_source_compiles( |
||||
"int main(void) { |
||||
unsigned int zero = 0; |
||||
long test = __builtin_ctz(zero); |
||||
(void)test; |
||||
return 0; |
||||
}" |
||||
HAVE_BUILTIN_CTZ |
||||
) |
||||
if(HAVE_BUILTIN_CTZ) |
||||
add_definitions(-DHAVE_BUILTIN_CTZ) |
||||
endif() |
||||
|
||||
# |
||||
# check for __builtin_ctzll() support in the compiler |
||||
# |
||||
check_c_source_compiles( |
||||
"int main(void) { |
||||
unsigned int zero = 0; |
||||
long test = __builtin_ctzll(zero); |
||||
(void)test; |
||||
return 0; |
||||
}" |
||||
HAVE_BUILTIN_CTZLL |
||||
) |
||||
if(HAVE_BUILTIN_CTZLL) |
||||
add_definitions(-DHAVE_BUILTIN_CTZLL) |
||||
endif() |
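Where these builtins are missing (notably MSVC, which instead gets `fallback_builtins.h`, added for x86 later in this file), a count-trailing-zeros shim can be built from compiler intrinsics. A minimal sketch assuming MSVC's `_BitScanForward`/`_BitScanForward64`; zlib-ng's actual fallback header may differ:

```c
/* Sketch of __builtin_ctz/__builtin_ctzll fallbacks for MSVC; the input
 * must be nonzero, matching the GCC/Clang builtins' precondition. */
#if defined(_MSC_VER) && !defined(HAVE_BUILTIN_CTZ)
#include <intrin.h>

static __forceinline unsigned ctz_fallback(unsigned int x) {
    unsigned long idx;
    _BitScanForward(&idx, x);      /* index of lowest set bit */
    return (unsigned)idx;
}

#ifdef _M_X64
static __forceinline unsigned ctzll_fallback(unsigned long long x) {
    unsigned long idx;
    _BitScanForward64(&idx, x);    /* 64-bit variant, x64 only */
    return (unsigned)idx;
}
#endif
#endif
```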
||||
|
||||
# |
||||
# check for ptrdiff_t support |
||||
# |
||||
check_c_source_compiles( |
||||
"#include <stddef.h> |
||||
int main() { |
||||
ptrdiff_t *a; |
||||
(void)a; |
||||
return 0; |
||||
}" |
||||
HAVE_PTRDIFF_T |
||||
) |
||||
if(NOT HAVE_PTRDIFF_T) |
||||
set(NEED_PTRDIFF_T 1) |
||||
|
||||
check_type_size("void *" SIZEOF_DATA_PTR) |
||||
message(STATUS "sizeof(void *) is ${SIZEOF_DATA_PTR} bytes") |
||||
|
||||
if(${SIZEOF_DATA_PTR} MATCHES "4") |
||||
set(PTRDIFF_TYPE "uint32_t") |
||||
elseif(${SIZEOF_DATA_PTR} MATCHES "8") |
||||
set(PTRDIFF_TYPE "uint64_t") |
||||
else() |
||||
message(FATAL_ERROR "sizeof(void *) is neither 32 nor 64 bit") |
||||
endif() |
||||
endif() |
||||
|
||||
if(MSVC) |
||||
add_definitions(-D_CRT_SECURE_NO_DEPRECATE) |
||||
add_definitions(-D_CRT_NONSTDC_NO_DEPRECATE) |
||||
endif() |
||||
|
||||
set(ZLIB_ARCH_SRCS) |
||||
set(ZLIB_ARCH_HDRS) |
||||
set(ARCHDIR "arch/generic") |
||||
if(BASEARCH_X86_FOUND) |
||||
set(ARCHDIR "arch/x86") |
||||
endif() |
||||
if(BASEARCH_ARM_FOUND) |
||||
set(ARCHDIR "arch/arm") |
||||
endif() |
||||
if(BASEARCH_PPC_FOUND) |
||||
set(ARCHDIR "arch/power") |
||||
endif() |
||||
if(BASEARCH_RISCV_FOUND) |
||||
set(ARCHDIR "arch/riscv") |
||||
endif() |
||||
|
||||
if(NOT CV_DISABLE_OPTIMIZATION) |
||||
if(BASEARCH_ARM_FOUND) |
||||
add_definitions(-DARM_FEATURES) |
||||
if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") |
||||
if("${ARCH}" MATCHES "aarch64") |
||||
check_c_source_compiles( |
||||
"#include <sys/auxv.h> |
||||
int main() { |
||||
return (getauxval(AT_HWCAP) & HWCAP_CRC32); |
||||
}" |
||||
ARM_AUXV_HAS_CRC32 |
||||
) |
||||
if(ARM_AUXV_HAS_CRC32) |
||||
add_definitions(-DARM_AUXV_HAS_CRC32) |
||||
else() |
||||
message(STATUS "HWCAP_CRC32 not present in sys/auxv.h; cannot detect support at runtime.") |
||||
endif() |
||||
else() |
||||
check_c_source_compiles( |
||||
"#include <sys/auxv.h> |
||||
int main() { |
||||
return (getauxval(AT_HWCAP2) & HWCAP2_CRC32); |
||||
}" |
||||
ARM_AUXV_HAS_CRC32 |
||||
) |
||||
if(ARM_AUXV_HAS_CRC32) |
||||
add_definitions(-DARM_AUXV_HAS_CRC32) |
||||
else() |
||||
check_c_source_compiles( |
||||
"#include <sys/auxv.h> |
||||
#include <asm/hwcap.h> |
||||
int main() { |
||||
return (getauxval(AT_HWCAP2) & HWCAP2_CRC32); |
||||
}" |
||||
ARM_HWCAP_HAS_CRC32 |
||||
) |
||||
if(ARM_HWCAP_HAS_CRC32) |
||||
add_definitions(-DARM_AUXV_HAS_CRC32 -DARM_ASM_HWCAP) |
||||
else() |
||||
message(STATUS "HWCAP2_CRC32 not present in sys/auxv.h; cannot detect support at runtime.") |
||||
endif() |
||||
endif() |
||||
check_c_source_compiles( |
||||
"#include <sys/auxv.h> |
||||
int main() { |
||||
return (getauxval(AT_HWCAP) & HWCAP_ARM_NEON); |
||||
}" |
||||
ARM_AUXV_HAS_NEON |
||||
) |
||||
if(ARM_AUXV_HAS_NEON) |
||||
add_definitions(-DARM_AUXV_HAS_NEON) |
||||
else() |
||||
check_c_source_compiles( |
||||
"#include <sys/auxv.h> |
||||
int main() { |
||||
return (getauxval(AT_HWCAP) & HWCAP_NEON); |
||||
}" |
||||
ARM_AUXV_HAS_NEON |
||||
) |
||||
if (ARM_AUXV_HAS_NEON) |
||||
add_definitions(-DARM_AUXV_HAS_NEON) |
||||
else() |
||||
message(STATUS "Neither HWCAP_ARM_NEON or HWCAP_NEON present in sys/auxv.h; cannot detect support at runtime.") |
||||
endif() |
||||
endif() |
||||
endif() |
||||
endif() |
||||
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/arm_features.h) |
||||
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/arm_features.c) |
||||
if(WITH_ACLE) |
||||
check_acle_compiler_flag() |
||||
if(HAVE_ACLE_FLAG) |
||||
add_definitions(-DARM_ACLE) |
||||
set(ACLE_SRCS ${ARCHDIR}/crc32_acle.c ${ARCHDIR}/insert_string_acle.c) |
||||
set_property(SOURCE ${ACLE_SRCS} PROPERTY COMPILE_FLAGS "${ACLEFLAG} ${NOLTOFLAG}") |
||||
list(APPEND ZLIB_ARCH_SRCS ${ACLE_SRCS}) |
||||
else() |
||||
set(WITH_ACLE OFF) |
||||
endif() |
||||
else() |
||||
set(WITH_ACLE OFF) |
||||
endif() |
||||
if(WITH_NEON) |
||||
check_neon_compiler_flag() |
||||
if(NEON_AVAILABLE) |
||||
add_definitions(-DARM_NEON) |
||||
set(NEON_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/chunkset_neon.c |
||||
${ARCHDIR}/compare256_neon.c ${ARCHDIR}/slide_hash_neon.c) |
||||
list(APPEND ZLIB_ARCH_SRCS ${NEON_SRCS}) |
||||
set_property(SOURCE ${NEON_SRCS} PROPERTY COMPILE_FLAGS "${NEONFLAG} ${NOLTOFLAG}") |
||||
if(MSVC) |
||||
add_definitions(-D__ARM_NEON__) |
||||
endif() |
||||
check_neon_ld4_intrinsics() |
||||
if(NEON_HAS_LD4) |
||||
add_definitions(-DARM_NEON_HASLD4) |
||||
endif() |
||||
else() |
||||
set(WITH_NEON OFF) |
||||
endif() |
||||
endif() |
||||
if(WITH_ARMV6) |
||||
check_armv6_compiler_flag() |
||||
if(HAVE_ARMV6_INLINE_ASM OR HAVE_ARMV6_INTRIN) |
||||
add_definitions(-DARM_SIMD) |
||||
set(ARMV6_SRCS ${ARCHDIR}/slide_hash_armv6.c) |
||||
set_property(SOURCE ${ARMV6_SRCS} PROPERTY COMPILE_FLAGS "${ARMV6FLAG} ${NOLTOFLAG}") |
||||
list(APPEND ZLIB_ARCH_SRCS ${ARMV6_SRCS}) |
||||
if(HAVE_ARMV6_INTRIN) |
||||
add_definitions(-DARM_SIMD_INTRIN) |
||||
endif() |
||||
else() |
||||
set(WITH_ARMV6 OFF) |
||||
endif() |
||||
else() |
||||
set(WITH_ARMV6 OFF) |
||||
endif() |
||||
endif() |
||||
if(BASEARCH_PPC_FOUND) |
||||
# Common arch detection code |
||||
if(WITH_ALTIVEC) |
||||
check_ppc_intrinsics() |
||||
endif() |
||||
if(WITH_POWER8) |
||||
check_power8_intrinsics() |
||||
endif() |
||||
if(WITH_POWER9) |
||||
check_power9_intrinsics() |
||||
endif() |
||||
if(HAVE_VMX OR HAVE_POWER8_INTRIN OR HAVE_POWER9_INTRIN) |
||||
add_definitions(-DPOWER_FEATURES) |
||||
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/power_features.h) |
||||
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/power_features.c) |
||||
endif() |
||||
# VMX specific options and files |
||||
if(WITH_ALTIVEC) |
||||
if(HAVE_VMX) |
||||
add_definitions(-DPPC_FEATURES) |
||||
if(HAVE_ALTIVEC) |
||||
add_definitions(-DPPC_VMX) |
||||
set(PPC_SRCS ${ARCHDIR}/adler32_vmx.c ${ARCHDIR}/slide_hash_vmx.c) |
||||
list(APPEND ZLIB_ARCH_SRCS ${PPC_SRCS}) |
||||
set_property(SOURCE ${PPC_SRCS} PROPERTY COMPILE_FLAGS "${PPCFLAGS}") |
||||
else() |
||||
set(WITH_ALTIVEC OFF) |
||||
endif() |
||||
endif() |
||||
endif() |
||||
# Power8 specific options and files |
||||
if(WITH_POWER8) |
||||
if(HAVE_POWER8_INTRIN) |
||||
add_definitions(-DPOWER8_VSX) |
||||
set(POWER8_SRCS ${ARCHDIR}/adler32_power8.c ${ARCHDIR}/chunkset_power8.c ${ARCHDIR}/slide_hash_power8.c) |
||||
if("${ARCH}" MATCHES "powerpc64(le)?") |
||||
add_definitions(-DPOWER8_VSX_CRC32) |
||||
list(APPEND POWER8_SRCS ${ARCHDIR}/crc32_power8.c) |
||||
endif() |
||||
list(APPEND ZLIB_ARCH_SRCS ${POWER8_SRCS}) |
||||
set_property(SOURCE ${POWER8_SRCS} PROPERTY COMPILE_FLAGS "${POWER8FLAG} ${NOLTOFLAG}") |
||||
else() |
||||
set(WITH_POWER8 OFF) |
||||
endif() |
||||
endif() |
||||
# Power9 specific options and files |
||||
if(WITH_POWER9) |
||||
if(HAVE_POWER9_INTRIN) |
||||
add_definitions(-DPOWER9) |
||||
set(POWER9_SRCS ${ARCHDIR}/compare256_power9.c) |
||||
list(APPEND ZLIB_ARCH_SRCS ${POWER9_SRCS}) |
||||
set_property(SOURCE ${POWER9_SRCS} PROPERTY COMPILE_FLAGS "${POWER9FLAG} ${NOLTOFLAG}") |
||||
else() |
||||
set(WITH_POWER9 OFF) |
||||
endif() |
||||
endif() |
||||
endif() |
||||
if(BASEARCH_RISCV_FOUND) |
||||
if(WITH_RVV) |
||||
check_rvv_intrinsics() |
||||
if(HAVE_RVV_INTRIN) |
||||
add_definitions(-DRISCV_FEATURES) |
||||
add_definitions(-DRISCV_RVV) |
||||
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/riscv_features.h) |
||||
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/riscv_features.c) |
||||
# FIXME: we will not set compile flags for riscv_features.c when |
||||
# the kernels update hwcap or hwprobe for riscv |
||||
set(RVV_SRCS ${ARCHDIR}/riscv_features.c ${ARCHDIR}/adler32_rvv.c ${ARCHDIR}/chunkset_rvv.c ${ARCHDIR}/compare256_rvv.c ${ARCHDIR}/slide_hash_rvv.c) |
||||
list(APPEND ZLIB_ARCH_SRCS ${RVV_SRCS}) |
||||
set_property(SOURCE ${RVV_SRCS} PROPERTY COMPILE_FLAGS "${RISCVFLAG} ${NOLTOFLAG}") |
||||
else() |
||||
set(WITH_RVV OFF) |
||||
endif() |
||||
endif() |
||||
endif() |
||||
if(BASEARCH_X86_FOUND) |
||||
add_definitions(-DX86_FEATURES) |
||||
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/x86_features.h) |
||||
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/x86_features.c) |
||||
if(MSVC) |
||||
list(APPEND ZLIB_ARCH_HDRS fallback_builtins.h) |
||||
endif() |
||||
if(WITH_AVX2) |
||||
check_avx2_intrinsics() |
||||
if(HAVE_AVX2_INTRIN) |
||||
add_definitions(-DX86_AVX2) |
||||
set(AVX2_SRCS ${ARCHDIR}/slide_hash_avx2.c) |
||||
list(APPEND AVX2_SRCS ${ARCHDIR}/chunkset_avx2.c) |
||||
list(APPEND AVX2_SRCS ${ARCHDIR}/compare256_avx2.c) |
||||
list(APPEND AVX2_SRCS ${ARCHDIR}/adler32_avx2.c) |
||||
list(APPEND ZLIB_ARCH_SRCS ${AVX2_SRCS}) |
||||
set_property(SOURCE ${AVX2_SRCS} PROPERTY COMPILE_FLAGS "${AVX2FLAG} ${NOLTOFLAG}") |
||||
else() |
||||
set(WITH_AVX2 OFF) |
||||
endif() |
||||
endif() |
||||
if(WITH_AVX512) |
||||
check_avx512_intrinsics() |
||||
if(HAVE_AVX512_INTRIN) |
||||
add_definitions(-DX86_AVX512) |
||||
list(APPEND AVX512_SRCS ${ARCHDIR}/adler32_avx512.c) |
||||
list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS}) |
||||
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/adler32_avx512_p.h) |
||||
if(HAVE_MASK_INTRIN) |
||||
add_definitions(-DX86_MASK_INTRIN) |
||||
endif() |
||||
set_property(SOURCE ${AVX512_SRCS} PROPERTY COMPILE_FLAGS "${AVX512FLAG} ${NOLTOFLAG}") |
||||
else() |
||||
set(WITH_AVX512 OFF) |
||||
endif() |
||||
endif() |
||||
if(WITH_AVX512VNNI) |
||||
check_avx512vnni_intrinsics() |
||||
if(HAVE_AVX512VNNI_INTRIN) |
||||
add_definitions(-DX86_AVX512VNNI) |
||||
list(APPEND AVX512VNNI_SRCS ${ARCHDIR}/adler32_avx512_vnni.c) |
||||
list(APPEND ZLIB_ARCH_SRCS ${AVX512VNNI_SRCS}) |
||||
set_property(SOURCE ${AVX512VNNI_SRCS} PROPERTY COMPILE_FLAGS "${AVX512VNNIFLAG} ${NOLTOFLAG}") |
||||
else() |
||||
set(WITH_AVX512VNNI OFF) |
||||
endif() |
||||
endif() |
||||
if(WITH_SSE42) |
||||
check_sse42_intrinsics() |
||||
if(HAVE_SSE42_INTRIN) |
||||
add_definitions(-DX86_SSE42) |
||||
set(SSE42_SRCS ${ARCHDIR}/adler32_sse42.c ${ARCHDIR}/insert_string_sse42.c) |
||||
list(APPEND ZLIB_ARCH_SRCS ${SSE42_SRCS}) |
||||
set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${NOLTOFLAG}") |
||||
else() |
||||
set(WITH_SSE42 OFF) |
||||
endif() |
||||
endif() |
||||
if(WITH_SSE2) |
||||
check_sse2_intrinsics() |
||||
if(HAVE_SSE2_INTRIN) |
||||
add_definitions(-DX86_SSE2) |
||||
set(SSE2_SRCS ${ARCHDIR}/chunkset_sse2.c ${ARCHDIR}/compare256_sse2.c ${ARCHDIR}/slide_hash_sse2.c) |
||||
list(APPEND ZLIB_ARCH_SRCS ${SSE2_SRCS}) |
||||
if(NOT ${ARCH} MATCHES "x86_64") |
||||
set_property(SOURCE ${SSE2_SRCS} PROPERTY COMPILE_FLAGS "${SSE2FLAG} ${NOLTOFLAG}") |
||||
add_definitions(-DX86_NOCHECK_SSE2) |
||||
endif() |
||||
else() |
||||
set(WITH_SSE2 OFF) |
||||
endif() |
||||
endif() |
||||
if(WITH_SSSE3) |
||||
check_ssse3_intrinsics() |
||||
if(HAVE_SSSE3_INTRIN) |
||||
add_definitions(-DX86_SSSE3) |
||||
set(SSSE3_SRCS ${ARCHDIR}/adler32_ssse3.c ${ARCHDIR}/chunkset_ssse3.c) |
||||
list(APPEND ZLIB_ARCH_SRCS ${SSSE3_SRCS}) |
||||
set_property(SOURCE ${SSSE3_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${NOLTOFLAG}") |
||||
else() |
||||
set(WITH_SSSE3 OFF) |
||||
endif() |
||||
endif() |
||||
if(WITH_PCLMULQDQ AND WITH_SSSE3 AND WITH_SSE42) |
||||
check_pclmulqdq_intrinsics() |
||||
if(HAVE_PCLMULQDQ_INTRIN AND HAVE_SSSE3_INTRIN) |
||||
add_definitions(-DX86_PCLMULQDQ_CRC) |
||||
set(PCLMULQDQ_SRCS ${ARCHDIR}/crc32_pclmulqdq.c) |
||||
list(APPEND ZLIB_ARCH_SRCS ${PCLMULQDQ_SRCS}) |
||||
set_property(SOURCE ${PCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG} ${NOLTOFLAG}") |
||||
|
||||
if(WITH_VPCLMULQDQ AND WITH_AVX512) |
||||
check_vpclmulqdq_intrinsics() |
||||
if(HAVE_VPCLMULQDQ_INTRIN AND HAVE_AVX512_INTRIN) |
||||
add_definitions(-DX86_VPCLMULQDQ_CRC) |
||||
set(VPCLMULQDQ_SRCS ${ARCHDIR}/crc32_vpclmulqdq.c) |
||||
list(APPEND ZLIB_ARCH_SRCS ${VPCLMULQDQ_SRCS}) |
||||
set_property(SOURCE ${VPCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG} ${VPCLMULFLAG} ${AVX512FLAG} ${NOLTOFLAG}") |
||||
else() |
||||
set(WITH_VPCLMULQDQ OFF) |
||||
endif() |
||||
else() |
||||
set(WITH_VPCLMULQDQ OFF) |
||||
endif() |
||||
else() |
||||
set(WITH_PCLMULQDQ OFF) |
||||
set(WITH_VPCLMULQDQ OFF) |
||||
endif() |
||||
else() |
||||
set(WITH_PCLMULQDQ OFF) |
||||
set(WITH_VPCLMULQDQ OFF) |
||||
endif() |
||||
check_xsave_intrinsics() |
||||
if(HAVE_XSAVE_INTRIN) |
||||
set_property(SOURCE ${ARCHDIR}/x86_features.c PROPERTY COMPILE_FLAGS "${XSAVEFLAG}") |
||||
endif() |
||||
endif() |
||||
endif() |
||||
|
||||
#============================================================================ |
||||
# zconf.h |
||||
#============================================================================ |
||||
|
||||
macro(generate_cmakein input output) |
||||
file(REMOVE ${output}) |
||||
file(STRINGS ${input} _lines) |
||||
foreach(_line IN LISTS _lines) |
||||
string(REGEX REPLACE "#ifdef HAVE_UNISTD_H.*" "@ZCONF_UNISTD_LINE@" _line "${_line}") |
||||
string(REGEX REPLACE "#ifdef NEED_PTRDIFF_T.*" "@ZCONF_PTRDIFF_LINE@" _line "${_line}") |
||||
if(NEED_PTRDIFF_T) |
||||
string(REGEX REPLACE "typedef PTRDIFF_TYPE" "typedef @PTRDIFF_TYPE@" _line "${_line}") |
||||
endif() |
||||
file(APPEND ${output} "${_line}\n") |
||||
endforeach() |
||||
endmacro(generate_cmakein) |
||||
|
||||
generate_cmakein( ${CMAKE_CURRENT_SOURCE_DIR}/zconf.h.in ${CMAKE_CURRENT_BINARY_DIR}/zconf.h.cmakein ) |
||||
|
||||
#============================================================================ |
||||
# zlib |
||||
#============================================================================ |
||||
|
||||
set(ZLIB_PUBLIC_HDRS |
||||
${CMAKE_CURRENT_BINARY_DIR}/zconf.h |
||||
${CMAKE_CURRENT_BINARY_DIR}/zlib_name_mangling.h |
||||
${CMAKE_CURRENT_BINARY_DIR}/zlib.h |
||||
) |
||||
set(ZLIB_PRIVATE_HDRS |
||||
adler32_p.h |
||||
chunkset_tpl.h |
||||
compare256_rle.h |
||||
cpu_features.h |
||||
crc32_braid_p.h |
||||
crc32_braid_comb_p.h |
||||
crc32_braid_tbl.h |
||||
crc32_fold.h |
||||
deflate.h |
||||
deflate_p.h |
||||
functable.h |
||||
inffast_tpl.h |
||||
inffixed_tbl.h |
||||
inflate.h |
||||
inflate_p.h |
||||
inftrees.h |
||||
insert_string_tpl.h |
||||
match_tpl.h |
||||
trees.h |
||||
trees_emit.h |
||||
trees_tbl.h |
||||
zbuild.h |
||||
zendian.h |
||||
zutil.h |
||||
) |
||||
set(ZLIB_SRCS |
||||
adler32.c |
||||
adler32_fold.c |
||||
chunkset.c |
||||
compare256.c |
||||
compress.c |
||||
cpu_features.c |
||||
crc32_braid.c |
||||
crc32_braid_comb.c |
||||
crc32_fold.c |
||||
deflate.c |
||||
deflate_fast.c |
||||
deflate_huff.c |
||||
deflate_medium.c |
||||
deflate_quick.c |
||||
deflate_rle.c |
||||
deflate_slow.c |
||||
deflate_stored.c |
||||
functable.c |
||||
infback.c |
||||
inflate.c |
||||
inftrees.c |
||||
insert_string.c |
||||
insert_string_roll.c |
||||
slide_hash.c |
||||
trees.c |
||||
uncompr.c |
||||
zutil.c |
||||
) |
||||
|
||||
set(ZLIB_GZFILE_PRIVATE_HDRS |
||||
gzguts.h |
||||
) |
||||
set(ZLIB_GZFILE_SRCS |
||||
gzlib.c |
||||
${CMAKE_CURRENT_BINARY_DIR}/gzread.c |
||||
gzwrite.c |
||||
) |
||||
|
||||
set(ZLIB_ALL_SRCS ${ZLIB_SRCS} ${ZLIB_ARCH_HDRS} ${ZLIB_ARCH_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) |
||||
list(APPEND ZLIB_ALL_SRCS ${ZLIB_GZFILE_PRIVATE_HDRS} ${ZLIB_GZFILE_SRCS}) |
||||
|
||||
add_library(zlib STATIC ${ZLIB_ALL_SRCS}) |
||||
|
||||
target_include_directories(zlib PUBLIC |
||||
"$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR};${CMAKE_CURRENT_SOURCE_DIR}>" |
||||
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>") |
||||
|
||||
if(HAVE_UNISTD_H) |
||||
SET(ZCONF_UNISTD_LINE "#if 1 /* was set to #if 1 by configure/cmake/etc */") |
||||
else() |
||||
SET(ZCONF_UNISTD_LINE "#if 0 /* was set to #if 0 by configure/cmake/etc */") |
||||
endif() |
||||
if(NEED_PTRDIFF_T) |
||||
SET(ZCONF_PTRDIFF_LINE "#if 1 /* was set to #if 1 by configure/cmake/etc */") |
||||
else() |
||||
SET(ZCONF_PTRDIFF_LINE "#ifdef NEED_PTRDIFF_T /* may be set to #if 1 by configure/cmake/etc */") |
||||
endif() |
||||
|
||||
configure_file(${CMAKE_CURRENT_BINARY_DIR}/zconf.h.cmakein |
||||
${CMAKE_CURRENT_BINARY_DIR}/zconf.h @ONLY) |
||||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib.h.in |
||||
${CMAKE_CURRENT_BINARY_DIR}/zlib.h @ONLY) |
||||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/gzread.c.in |
||||
${CMAKE_CURRENT_BINARY_DIR}/gzread.c @ONLY) |
||||
|
||||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib_name_mangling.h.empty |
||||
${CMAKE_CURRENT_BINARY_DIR}/zlib_name_mangling${SUFFIX}.h COPYONLY) |
||||
|
||||
ocv_warnings_disable(CMAKE_C_FLAGS -Wmissing-prototypes |
||||
-Wundef |
||||
-Wmissing-declarations |
||||
) |
||||
|
||||
set_target_properties(${ZLIB_LIBRARY} PROPERTIES |
||||
OUTPUT_NAME ${ZLIB_LIBRARY} |
||||
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}" |
||||
COMPILE_PDB_NAME ${ZLIB_LIBRARY} |
||||
COMPILE_PDB_NAME_DEBUG "${ZLIB_LIBRARY}${OPENCV_DEBUG_POSTFIX}" |
||||
ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH} |
||||
) |
||||
|
||||
if(ENABLE_SOLUTION_FOLDERS) |
||||
set_target_properties(${ZLIB_LIBRARY} PROPERTIES FOLDER "3rdparty") |
||||
endif() |
||||
|
||||
if(NOT BUILD_SHARED_LIBS) |
||||
ocv_install_target(${ZLIB_LIBRARY} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev) |
||||
endif() |
||||
|
||||
ocv_install_3rdparty_licenses(${ZLIB_LIBRARY} LICENSE.md) |
@@ -0,0 +1,19 @@
||||
(C) 1995-2013 Jean-loup Gailly and Mark Adler |
||||
|
||||
This software is provided 'as-is', without any express or implied |
||||
warranty. In no event will the authors be held liable for any damages |
||||
arising from the use of this software. |
||||
|
||||
Permission is granted to anyone to use this software for any purpose, |
||||
including commercial applications, and to alter it and redistribute it |
||||
freely, subject to the following restrictions: |
||||
|
||||
1. The origin of this software must not be misrepresented; you must not |
||||
claim that you wrote the original software. If you use this software |
||||
in a product, an acknowledgment in the product documentation would be |
||||
appreciated but is not required. |
||||
|
||||
2. Altered source versions must be plainly marked as such, and must not be |
||||
misrepresented as being the original software. |
||||
|
||||
3. This notice may not be removed or altered from any source distribution. |
@@ -0,0 +1,229 @@
||||
| CI | Stable | Develop | |
||||
|:---|:-------|:--------| |
||||
| GitHub Actions | [![Stable CMake](https://github.com/zlib-ng/zlib-ng/actions/workflows/cmake.yml/badge.svg?branch=stable)](https://github.com/zlib-ng/zlib-ng/actions/workflows/cmake.yml?query=branch%3Astable) <br> [![Stable Configure](https://github.com/zlib-ng/zlib-ng/actions/workflows/configure.yml/badge.svg?branch=stable)](https://github.com/zlib-ng/zlib-ng/actions/workflows/configure.yml?query=branch%3Astable) <br> [![Stable NMake](https://github.com/zlib-ng/zlib-ng/actions/workflows/nmake.yml/badge.svg?branch=stable)](https://github.com/zlib-ng/zlib-ng/actions/workflows/nmake.yml?query=branch%3Astable) | [![Develop CMake](https://github.com/zlib-ng/zlib-ng/actions/workflows/cmake.yml/badge.svg?branch=develop)](https://github.com/zlib-ng/zlib-ng/actions/workflows/cmake.yml?query=branch%3Adevelop) <br> [![Develop Configure](https://github.com/zlib-ng/zlib-ng/actions/workflows/configure.yml/badge.svg?branch=develop)](https://github.com/zlib-ng/zlib-ng/actions/workflows/configure.yml?query=branch%3Adevelop) <br> [![Develop NMake](https://github.com/zlib-ng/zlib-ng/actions/workflows/nmake.yml/badge.svg?branch=develop)](https://github.com/zlib-ng/zlib-ng/actions/workflows/nmake.yml?query=branch%3Adevelop) | |
||||
| CodeFactor | [![CodeFactor](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/badge/stable)](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/overview/stable) | [![CodeFactor](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/badge/develop)](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/overview/develop) | |
||||
| OSS-Fuzz | [![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/zlib-ng.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:zlib-ng) | [![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/zlib-ng.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:zlib-ng) | |
||||
| Codecov | [![codecov](https://codecov.io/github/zlib-ng/zlib-ng/branch/stable/graph/badge.svg?token=uKsgK9LIuC)](https://codecov.io/github/zlib-ng/zlib-ng/tree/stable) | [![codecov](https://codecov.io/github/zlib-ng/zlib-ng/branch/develop/graph/badge.svg?token=uKsgK9LIuC)](https://codecov.io/github/zlib-ng/zlib-ng/tree/develop) | |
||||
|
||||
## zlib-ng |
||||
*zlib data compression library for the next generation systems* |
||||
|
||||
Maintained by Hans Kristian Rosbach |
||||
aka Dead2 (zlib-ng àt circlestorm dót org) |
||||
|
||||
Features |
||||
-------- |
||||
|
||||
* Zlib compatible API with support for dual-linking |
||||
* Modernized native API based on zlib API for ease of porting |
||||
* Modern C11 syntax and a clean code layout |
||||
* Deflate medium and quick algorithms based on Intel’s zlib fork |
||||
* Support for CPU intrinsics when available |
||||
* Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX |
||||
* CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z |
||||
* Hash table implementation using CRC32-C intrinsics on x86 and ARM |
||||
* Slide hash implementations using SSE2, AVX2, ARMv6, Neon, VMX & VSX |
||||
* Compare256 implementations using SSE2, AVX2, Neon, POWER9 & RVV |
||||
* Inflate chunk copying using SSE2, SSSE3, AVX, Neon & VSX |
||||
* Support for hardware-accelerated deflate using IBM Z DFLTCC |
||||
* Unaligned memory read/writes and large bit buffer improvements |
||||
* Includes improvements from Cloudflare and Intel forks |
||||
* Configure, CMake, and NMake build system support |
||||
* Comprehensive set of CMake unit tests |
||||
* Code sanitizers, fuzzing, and coverage |
||||
* GitHub Actions continuous integration on Windows, macOS, and Linux |
||||
* Emulated CI for ARM, AARCH64, PPC, PPC64, RISCV, SPARC64, S390x using qemu |
||||
|
||||
|
||||
History |
||||
------- |
||||
|
||||
The motivation for this fork was seeing several 3rd party contributions with new optimizations not getting |
||||
implemented into the official zlib repository. |
||||
|
||||
Mark Adler has been maintaining zlib for a very long time, and he has done a great job and hopefully he will continue |
||||
for a long time yet. The idea of zlib-ng is not to replace zlib, but to co-exist as a drop-in replacement with a |
||||
lower threshold for code change. |
||||
|
||||
zlib has a long history and is incredibly portable, even supporting many systems that predate the Internet.<br> |
||||
That is great, but it can complicate further development and maintainability. The zlib code contains many workarounds |
||||
for really old compilers or to accommodate systems with limitations such as operating in a 16-bit environment. |
||||
|
||||
Many of these workarounds are only maintenance burdens, and some of them are pretty big code-wise. With so many workarounds
cluttered throughout the code, it becomes harder for new programmers with an idea or interest in zlib to contribute.
||||
|
||||
I decided to make a fork, merge all the Intel optimizations and some of the Cloudflare optimizations, plus a couple of other
smaller patches. Then I started cleaning out workarounds, various dead code, and all the contrib and example code.<br>
The result is a better-performing and easier-to-maintain zlib-ng.
||||
|
||||
A lot of improvements have gone into zlib-ng since its start, and numerous people and companies have contributed both |
||||
small and big improvements, or valuable testing. |
||||
|
||||
|
||||
Build |
||||
----- |
||||
<sup>Please read LICENSE.md, it is very simple and very liberal.</sup> |
||||
|
||||
There are two ways to build zlib-ng: |
||||
|
||||
### CMake

To build zlib-ng using the cross-platform build-system generator CMake:
||||
|
||||
``` |
||||
cmake . |
||||
cmake --build . --config Release |
||||
ctest --verbose -C Release |
||||
``` |
||||
|
||||
Alternatively, you can use the cmake configuration GUI tool ccmake: |
||||
|
||||
``` |
||||
ccmake . |
||||
``` |
||||
|
||||
### Configure |
||||
|
||||
To build zlib-ng using the bash configure script: |
||||
|
||||
``` |
||||
./configure |
||||
make |
||||
make test |
||||
``` |
||||
|
||||
Build Options |
||||
------------- |
||||
|
||||
| CMake | configure | Description | Default | |
||||
|:-------------------------|:-------------------------|:--------------------------------------------------------------------------------------|---------| |
||||
| ZLIB_COMPAT | --zlib-compat | Compile with zlib compatible API | OFF | |
||||
| ZLIB_ENABLE_TESTS | | Build test binaries | ON | |
||||
| WITH_GZFILEOP | --without-gzfileops | Compile with support for gzFile related functions | ON | |
||||
| WITH_OPTIM | --without-optimizations | Build with optimisations | ON | |
||||
| WITH_NEW_STRATEGIES | --without-new-strategies | Use new strategies | ON | |
||||
| WITH_NATIVE_INSTRUCTIONS | | Compiles with full instruction set supported on this host (gcc/clang -march=native) | OFF | |
||||
| WITH_SANITIZER | | Build with sanitizer (memory, address, undefined) | OFF | |
||||
| WITH_GTEST | | Build gtest_zlib | ON | |
||||
| WITH_FUZZERS | | Build test/fuzz | OFF | |
||||
| WITH_BENCHMARKS | | Build test/benchmarks | OFF | |
||||
| WITH_MAINTAINER_WARNINGS | | Build with project maintainer warnings | OFF | |
||||
| WITH_CODE_COVERAGE | | Enable code coverage reporting | OFF | |
||||
|
||||
|
||||
Install |
||||
------- |
||||
|
||||
WARNING: We do not recommend manually installing unless you really know what you are doing, because this can |
||||
potentially override the system default zlib library, and any incompatibility or wrong configuration of zlib-ng |
||||
can make the whole system unusable, requiring recovery or reinstall. |
||||
If you still want a manual install, we recommend using the /opt/ path prefix. |
||||
|
||||
For Linux distros, an alternative way to use zlib-ng (if compiled in zlib-compat mode) instead of zlib is through
||||
the use of the _LD_PRELOAD_ environment variable. If the program is dynamically linked with zlib, then the program |
||||
will temporarily attempt to use zlib-ng instead, without risking system-wide instability. |
||||
|
||||
``` |
||||
LD_PRELOAD=/opt/zlib-ng/libz.so.1.2.13.zlib-ng /usr/bin/program |
||||
``` |
||||
|
||||
### CMake
||||
|
||||
To install zlib-ng system-wide using cmake: |
||||
|
||||
```sh
||||
cmake --build . --target install |
||||
``` |
||||
|
||||
### Configure |
||||
|
||||
To install zlib-ng system-wide using the configure script: |
||||
|
||||
```sh |
||||
make install |
||||
``` |
||||
|
||||
### CPack |
||||
|
||||
After building with cmake, an installation package can be created using cpack. By default a tgz package is created, |
||||
but you can append `-G <format>` to each command to generate alternative package types (TGZ, ZIP, RPM, DEB). To easily
create an RPM or DEB package, you would use `-G RPM` or `-G DEB`, respectively.
||||
|
||||
```sh
||||
cd build |
||||
cpack --config CPackConfig.cmake |
||||
cpack --config CPackSourceConfig.cmake |
||||
``` |
||||
|
||||
### Vcpkg |
||||
|
||||
Alternatively, you can build and install zlib-ng using the [vcpkg](https://github.com/Microsoft/vcpkg/) dependency manager: |
||||
|
||||
```sh
||||
git clone https://github.com/Microsoft/vcpkg.git |
||||
cd vcpkg |
||||
./bootstrap-vcpkg.sh # "./bootstrap-vcpkg.bat" for powershell |
||||
./vcpkg integrate install |
||||
./vcpkg install zlib-ng |
||||
``` |
||||
|
||||
The zlib-ng port in vcpkg is kept up to date by Microsoft team members and community contributors. |
||||
If the version is out of date, please [create an issue or pull request](https://github.com/Microsoft/vcpkg) on the vcpkg repository. |
||||
|
||||
Contributing |
||||
------------ |
||||
|
||||
Zlib-ng aims to be open to contributions, and we would be delighted to receive pull requests on GitHub.
Help with testing, reviewing pull requests, and similar tasks is also very much appreciated.
||||
|
||||
Please check the Wiki for more info: [Contributing](https://github.com/zlib-ng/zlib-ng/wiki/Contributing) |
||||
|
||||
Acknowledgments |
||||
---------------- |
||||
|
||||
Thanks go out to all the people and companies who have taken the time to contribute |
||||
code reviews, testing and/or patches. Zlib-ng would not have been nearly as good without you. |
||||
|
||||
The deflate format used by zlib was defined by Phil Katz.<br> |
||||
The deflate and zlib specifications were written by L. Peter Deutsch. |
||||
|
||||
zlib was originally created by Jean-loup Gailly (compression) and Mark Adler (decompression). |
||||
|
||||
|
||||
Advanced Build Options |
||||
---------------------- |
||||
|
||||
| CMake | configure | Description | Default | |
||||
|:--------------------------------|:----------------------|:--------------------------------------------------------------------|------------------------| |
||||
| FORCE_SSE2 | --force-sse2 | Skip runtime check for SSE2 instructions (Always on for x86_64) | OFF (x86) | |
||||
| WITH_AVX2 | | Build with AVX2 intrinsics | ON | |
||||
| WITH_AVX512 | | Build with AVX512 intrinsics | ON | |
||||
| WITH_AVX512VNNI | | Build with AVX512VNNI intrinsics | ON | |
||||
| WITH_SSE2 | | Build with SSE2 intrinsics | ON | |
||||
| WITH_SSSE3 | | Build with SSSE3 intrinsics | ON | |
||||
| WITH_SSE42 | | Build with SSE42 intrinsics | ON | |
||||
| WITH_PCLMULQDQ | | Build with PCLMULQDQ intrinsics | ON | |
||||
| WITH_VPCLMULQDQ | --without-vpclmulqdq | Build with VPCLMULQDQ intrinsics | ON | |
||||
| WITH_ACLE | --without-acle | Build with ACLE intrinsics | ON | |
||||
| WITH_NEON | --without-neon | Build with NEON intrinsics | ON | |
||||
| WITH_ARMV6 | --without-armv6 | Build with ARMv6 intrinsics | ON | |
||||
| WITH_ALTIVEC | --without-altivec | Build with AltiVec (VMX) intrinsics | ON | |
||||
| WITH_POWER8 | --without-power8 | Build with POWER8 optimisations | ON | |
||||
| WITH_RVV | | Build with RVV intrinsics | ON | |
||||
| WITH_CRC32_VX | --without-crc32-vx | Build with vectorized CRC32 on IBM Z | ON | |
||||
| WITH_DFLTCC_DEFLATE | --with-dfltcc-deflate | Build with DFLTCC intrinsics for compression on IBM Z | OFF | |
||||
| WITH_DFLTCC_INFLATE | --with-dfltcc-inflate | Build with DFLTCC intrinsics for decompression on IBM Z | OFF | |
||||
| WITH_UNALIGNED | --without-unaligned | Allow optimizations that use unaligned reads if safe on current arch | ON |
||||
| WITH_INFLATE_STRICT | | Build with strict inflate distance checking | OFF | |
||||
| WITH_INFLATE_ALLOW_INVALID_DIST | | Build with zero fill for inflate invalid distances | OFF | |
||||
| INSTALL_UTILS | | Copy minigzip and minideflate during install | OFF | |
||||
| ZLIBNG_ENABLE_TESTS | | Test zlib-ng specific API | ON | |
||||
|
||||
|
||||
Related Projects |
||||
---------------- |
||||
|
||||
* Fork of the popular minizip https://github.com/zlib-ng/minizip-ng |
||||
* Python tool to benchmark minigzip/minideflate https://github.com/zlib-ng/deflatebench |
||||
* Python tool to benchmark pigz https://github.com/zlib-ng/pigzbench |
||||
* 3rd party patches for zlib-ng compatibility https://github.com/zlib-ng/patches |
@@ -0,0 +1,115 @@
||||
/* adler32.c -- compute the Adler-32 checksum of a data stream
|
||||
* Copyright (C) 1995-2011, 2016 Mark Adler |
||||
* For conditions of distribution and use, see copyright notice in zlib.h |
||||
*/ |
||||
|
||||
#include "zbuild.h" |
||||
#include "functable.h" |
||||
#include "adler32_p.h" |
||||
|
||||
/* ========================================================================= */ |
||||
Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) { |
||||
uint32_t sum2; |
||||
unsigned n; |
||||
|
||||
/* split Adler-32 into component sums */ |
||||
sum2 = (adler >> 16) & 0xffff; |
||||
adler &= 0xffff; |
||||
|
||||
/* in case user likes doing a byte at a time, keep it fast */ |
||||
if (UNLIKELY(len == 1)) |
||||
return adler32_len_1(adler, buf, sum2); |
||||
|
||||
/* initial Adler-32 value (deferred check for len == 1 speed) */ |
||||
if (UNLIKELY(buf == NULL)) |
||||
return 1L; |
||||
|
||||
/* in case short lengths are provided, keep it somewhat fast */ |
||||
if (UNLIKELY(len < 16)) |
||||
return adler32_len_16(adler, buf, len, sum2); |
||||
|
||||
/* do length NMAX blocks -- requires just one modulo operation */ |
||||
while (len >= NMAX) { |
||||
len -= NMAX; |
||||
#ifdef UNROLL_MORE |
||||
n = NMAX / 16; /* NMAX is divisible by 16 */ |
||||
#else |
||||
n = NMAX / 8; /* NMAX is divisible by 8 */ |
||||
#endif |
||||
do { |
||||
#ifdef UNROLL_MORE |
||||
DO16(adler, sum2, buf); /* 16 sums unrolled */ |
||||
buf += 16; |
||||
#else |
||||
DO8(adler, sum2, buf, 0); /* 8 sums unrolled */ |
||||
buf += 8; |
||||
#endif |
||||
} while (--n); |
||||
adler %= BASE; |
||||
sum2 %= BASE; |
||||
} |
||||
|
||||
/* do remaining bytes (less than NMAX, still just one modulo) */ |
||||
return adler32_len_64(adler, buf, len, sum2); |
||||
} |
||||
|
||||
#ifdef ZLIB_COMPAT |
||||
unsigned long Z_EXPORT PREFIX(adler32_z)(unsigned long adler, const unsigned char *buf, size_t len) { |
||||
return (unsigned long)functable.adler32((uint32_t)adler, buf, len); |
||||
} |
||||
#else |
||||
uint32_t Z_EXPORT PREFIX(adler32_z)(uint32_t adler, const unsigned char *buf, size_t len) { |
||||
return functable.adler32(adler, buf, len); |
||||
} |
||||
#endif |
||||
|
||||
/* ========================================================================= */ |
||||
#ifdef ZLIB_COMPAT |
||||
unsigned long Z_EXPORT PREFIX(adler32)(unsigned long adler, const unsigned char *buf, unsigned int len) { |
||||
return (unsigned long)functable.adler32((uint32_t)adler, buf, len); |
||||
} |
||||
#else |
||||
uint32_t Z_EXPORT PREFIX(adler32)(uint32_t adler, const unsigned char *buf, uint32_t len) { |
||||
return functable.adler32(adler, buf, len); |
||||
} |
||||
#endif |
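Both compat and native wrappers route through `functable.adler32`, zlib-ng's runtime dispatch table. A minimal sketch of the pattern, with hypothetical names (`adler32_simd`, `cpu_has_simd`); zlib-ng's real table lives in `functable.c` and covers many more entry points:

```c
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Hypothetical stand-ins for a generic and a SIMD implementation. */
static uint32_t adler32_generic(uint32_t a, const uint8_t *b, size_t n) {
    (void)b; (void)n; puts("generic"); return a;
}
static uint32_t adler32_simd(uint32_t a, const uint8_t *b, size_t n) {
    (void)b; (void)n; puts("simd"); return a;
}
static int cpu_has_simd(void) { return 1; }   /* pretend CPUID/auxv probe */

struct functable_s { uint32_t (*adler32)(uint32_t, const uint8_t *, size_t); };

static uint32_t adler32_stub(uint32_t a, const uint8_t *b, size_t n);
static struct functable_s functable = { adler32_stub };

/* First call resolves the slot for the running CPU, then forwards;
 * every later call goes straight to the chosen implementation. */
static uint32_t adler32_stub(uint32_t a, const uint8_t *b, size_t n) {
    functable.adler32 = cpu_has_simd() ? adler32_simd : adler32_generic;
    return functable.adler32(a, b, n);
}

int main(void) {
    uint8_t buf[4] = {0};
    functable.adler32(1, buf, sizeof buf);  /* resolves, prints "simd" */
    functable.adler32(1, buf, sizeof buf);  /* direct call, prints "simd" */
    return 0;
}
```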
||||
|
||||
/* ========================================================================= */ |
||||
static uint32_t adler32_combine_(uint32_t adler1, uint32_t adler2, z_off64_t len2) { |
||||
uint32_t sum1; |
||||
uint32_t sum2; |
||||
unsigned rem; |
||||
|
||||
/* for negative len, return invalid adler32 as a clue for debugging */ |
||||
if (len2 < 0) |
||||
return 0xffffffff; |
||||
|
||||
/* the derivation of this formula is left as an exercise for the reader */ |
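/* Sketch of the derivation, writing s1/s2 for the two 16-bit halves and
 * A||B for the concatenation of the inputs behind adler1 and adler2:
 *   s1(A||B) = s1(A) + s1(B) - 1                  (s1 starts at 1)
 *   s2(A||B) = s2(A) + s2(B) + len2 * (s1(A) - 1) (each of B's len2
 *              running sums is shifted by s1(A) - 1)
 * all taken mod BASE; the code below adds multiples of BASE so that no
 * intermediate value can go negative. */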
||||
len2 %= BASE; /* assumes len2 >= 0 */ |
||||
rem = (unsigned)len2; |
||||
sum1 = adler1 & 0xffff; |
||||
sum2 = rem * sum1; |
||||
sum2 %= BASE; |
||||
sum1 += (adler2 & 0xffff) + BASE - 1; |
||||
sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem; |
||||
if (sum1 >= BASE) sum1 -= BASE; |
||||
if (sum1 >= BASE) sum1 -= BASE; |
||||
if (sum2 >= ((unsigned long)BASE << 1)) sum2 -= ((unsigned long)BASE << 1); |
||||
if (sum2 >= BASE) sum2 -= BASE; |
||||
return sum1 | (sum2 << 16); |
||||
} |
||||
|
||||
/* ========================================================================= */ |
||||
#ifdef ZLIB_COMPAT |
||||
unsigned long Z_EXPORT PREFIX(adler32_combine)(unsigned long adler1, unsigned long adler2, z_off_t len2) { |
||||
return (unsigned long)adler32_combine_((uint32_t)adler1, (uint32_t)adler2, len2); |
||||
} |
||||
|
||||
unsigned long Z_EXPORT PREFIX4(adler32_combine)(unsigned long adler1, unsigned long adler2, z_off64_t len2) { |
||||
return (unsigned long)adler32_combine_((uint32_t)adler1, (uint32_t)adler2, len2); |
||||
} |
||||
#else |
||||
uint32_t Z_EXPORT PREFIX4(adler32_combine)(uint32_t adler1, uint32_t adler2, z_off64_t len2) { |
||||
return adler32_combine_(adler1, adler2, len2); |
||||
} |
||||
#endif |
@@ -0,0 +1,16 @@
||||
/* adler32_fold.c -- adler32 folding interface
|
||||
* Copyright (C) 2022 Adam Stylinski |
||||
* For conditions of distribution and use, see copyright notice in zlib.h |
||||
*/ |
||||
|
||||
#include "zbuild.h" |
||||
#include "functable.h" |
||||
#include "adler32_fold.h" |
||||
|
||||
#include <limits.h> |
||||
|
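/* Fused checksum + copy: callers that must both checksum and move data
 * (e.g. inflate filling a user buffer) can do it in a single pass. */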
||||
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { |
||||
adler = functable.adler32(adler, src, len); |
||||
memcpy(dst, src, len); |
||||
return adler; |
||||
} |
@@ -0,0 +1,11 @@
||||
/* adler32_fold.h -- adler32 folding interface
|
||||
* Copyright (C) 2022 Adam Stylinski |
||||
* For conditions of distribution and use, see copyright notice in zlib.h |
||||
*/ |
||||
|
||||
#ifndef ADLER32_FOLD_H_ |
||||
#define ADLER32_FOLD_H_ |
||||
|
||||
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); |
||||
|
||||
#endif |
@@ -0,0 +1,70 @@
||||
/* adler32_p.h -- Private inline functions and macros shared with
|
||||
* different computation of the Adler-32 checksum |
||||
* of a data stream. |
||||
* Copyright (C) 1995-2011, 2016 Mark Adler |
||||
* For conditions of distribution and use, see copyright notice in zlib.h |
||||
*/ |
||||
|
||||
#ifndef ADLER32_P_H |
||||
#define ADLER32_P_H |
||||
|
||||
#define BASE 65521U /* largest prime smaller than 65536 */ |
||||
#define NMAX 5552 |
||||
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */ |
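/* Check for n = 5552: 255*5552*5553/2 + 5553*(BASE-1)
 *   = 3930857640 + 363832560 = 4294690200 <= 4294967295 = 2^32-1,
 * while n = 5553 yields 4296171735 and would overflow. */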
||||
|
||||
#define DO1(sum1, sum2, buf, i) {(sum1) += buf[(i)]; (sum2) += (sum1);} |
||||
#define DO2(sum1, sum2, buf, i) {DO1(sum1, sum2, buf, i); DO1(sum1, sum2, buf, i+1);} |
||||
#define DO4(sum1, sum2, buf, i) {DO2(sum1, sum2, buf, i); DO2(sum1, sum2, buf, i+2);} |
||||
#define DO8(sum1, sum2, buf, i) {DO4(sum1, sum2, buf, i); DO4(sum1, sum2, buf, i+4);} |
||||
#define DO16(sum1, sum2, buf) {DO8(sum1, sum2, buf, 0); DO8(sum1, sum2, buf, 8);} |
||||
|
||||
static inline uint32_t adler32_len_1(uint32_t adler, const uint8_t *buf, uint32_t sum2) { |
||||
adler += buf[0]; |
||||
adler %= BASE; |
||||
sum2 += adler; |
||||
sum2 %= BASE; |
||||
return adler | (sum2 << 16); |
||||
} |
||||
|
||||
static inline uint32_t adler32_len_16(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) { |
||||
while (len) { |
||||
--len; |
||||
adler += *buf++; |
||||
sum2 += adler; |
||||
} |
||||
adler %= BASE; |
||||
sum2 %= BASE; /* only added so many BASE's */ |
||||
/* return recombined sums */ |
||||
return adler | (sum2 << 16); |
||||
} |
||||
|
||||
static inline uint32_t adler32_copy_len_16(uint32_t adler, const uint8_t *buf, uint8_t *dst, size_t len, uint32_t sum2) { |
||||
while (len--) { |
||||
*dst = *buf++; |
||||
adler += *dst++; |
||||
sum2 += adler; |
||||
} |
||||
adler %= BASE; |
||||
sum2 %= BASE; /* only added so many BASE's */ |
||||
/* return recombined sums */ |
||||
return adler | (sum2 << 16); |
||||
} |
||||
|
||||
static inline uint32_t adler32_len_64(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) { |
||||
#ifdef UNROLL_MORE |
||||
while (len >= 16) { |
||||
len -= 16; |
||||
DO16(adler, sum2, buf); |
||||
buf += 16; |
||||
#else |
||||
while (len >= 8) { |
||||
len -= 8; |
||||
DO8(adler, sum2, buf, 0); |
||||
buf += 8; |
||||
#endif |
||||
} |
||||
/* Process tail (len < 16). */ |
||||
return adler32_len_16(adler, buf, len, sum2); |
||||
} |
||||
|
||||
#endif /* ADLER32_P_H */ |
@@ -0,0 +1,2 @@
||||
# ignore Makefiles; they're all automatically generated |
||||
Makefile |
@@ -0,0 +1,85 @@
||||
# Makefile for zlib
|
||||
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
|
||||
# For conditions of distribution and use, see copyright notice in zlib.h
|
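# Note: *.o objects are compiled with CFLAGS for the static library, while
# *.lo objects are compiled with SFLAGS (typically PIC) for the shared library.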
||||
|
||||
CC=
|
||||
CFLAGS=
|
||||
SFLAGS=
|
||||
INCLUDES=
|
||||
SUFFIX=
|
||||
|
||||
ACLEFLAG=
|
||||
NEONFLAG=
|
||||
ARMV6FLAG=
|
||||
NOLTOFLAG=
|
||||
|
||||
SRCDIR=.
|
||||
SRCTOP=../..
|
||||
TOPDIR=$(SRCTOP)
|
||||
|
||||
all: \
|
||||
adler32_neon.o adler32_neon.lo \
|
||||
arm_features.o arm_features.lo \
|
||||
chunkset_neon.o chunkset_neon.lo \
|
||||
compare256_neon.o compare256_neon.lo \
|
||||
crc32_acle.o crc32_acle.lo \
|
||||
slide_hash_neon.o slide_hash_neon.lo \
|
||||
slide_hash_armv6.o slide_hash_armv6.lo \
|
||||
insert_string_acle.o insert_string_acle.lo
|
||||
|
||||
adler32_neon.o: |
||||
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
|
||||
|
||||
adler32_neon.lo: |
||||
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
|
||||
|
||||
arm_features.o: |
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
|
||||
|
||||
arm_features.lo: |
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
|
||||
|
||||
chunkset_neon.o: |
||||
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
|
||||
|
||||
chunkset_neon.lo: |
||||
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
|
||||
|
||||
compare256_neon.o: |
||||
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
|
||||
|
||||
compare256_neon.lo: |
||||
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
|
||||
|
||||
crc32_acle.o: |
||||
$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
|
||||
|
||||
crc32_acle.lo: |
||||
$(CC) $(SFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
|
||||
|
||||
slide_hash_neon.o: |
||||
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
|
||||
|
||||
slide_hash_neon.lo: |
||||
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
|
||||
|
||||
slide_hash_armv6.o: |
||||
$(CC) $(CFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
|
||||
|
||||
slide_hash_armv6.lo: |
||||
$(CC) $(SFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
|
||||
|
||||
insert_string_acle.o: |
||||
$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
|
||||
|
||||
insert_string_acle.lo: |
||||
$(CC) $(SFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
|
||||
|
||||
mostlyclean: clean |
||||
clean: |
||||
rm -f *.o *.lo *~
|
||||
rm -rf objs
|
||||
rm -f *.gcda *.gcno *.gcov
|
||||
|
||||
distclean: clean |
||||
rm -f Makefile
|
@@ -0,0 +1,35 @@
||||
#ifndef ARM_ACLE_INTRINS_H |
||||
#define ARM_ACLE_INTRINS_H |
||||
|
||||
#include <stdint.h> |
||||
#ifdef _MSC_VER |
||||
# include <intrin.h> |
||||
#elif defined(HAVE_ARM_ACLE_H) |
||||
# include <arm_acle.h> |
||||
#endif |
||||
|
||||
#ifdef ARM_ACLE |
||||
#if defined(__aarch64__) |
||||
# define Z_TARGET_CRC Z_TARGET("+crc") |
||||
#else |
||||
# define Z_TARGET_CRC |
||||
#endif |
||||
#endif |
||||
|
||||
#ifdef ARM_SIMD |
||||
#ifdef _MSC_VER |
||||
typedef uint32_t uint16x2_t; |
||||
|
||||
#define __uqsub16 _arm_uqsub16 |
||||
#elif !defined(ARM_SIMD_INTRIN) |
||||
typedef uint32_t uint16x2_t; |
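/* uint16x2_t packs two uint16 lanes into one 32-bit register; uqsub16 does a
 * lane-wise unsigned saturating subtract (clamping at 0), which is what the
 * ARMv6 slide-hash kernel uses to rebase hash-chain entries. */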
||||
|
||||
static inline uint16x2_t __uqsub16(uint16x2_t __a, uint16x2_t __b) { |
||||
uint16x2_t __c; |
||||
__asm__ __volatile__("uqsub16 %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b)); |
||||
return __c; |
||||
} |
||||
#endif |
||||
#endif |
||||
|
||||
#endif // include guard ARM_ACLE_INTRINS_H
|
@@ -0,0 +1,215 @@
||||
/* Copyright (C) 1995-2011, 2016 Mark Adler
|
||||
* Copyright (C) 2017 ARM Holdings Inc. |
||||
* Authors: |
||||
* Adenilson Cavalcanti <adenilson.cavalcanti@arm.com> |
||||
* Adam Stylinski <kungfujesus06@gmail.com> |
||||
* For conditions of distribution and use, see copyright notice in zlib.h |
||||
*/ |
||||
#ifdef ARM_NEON |
||||
#include "neon_intrins.h" |
||||
#include "../../zbuild.h" |
||||
#include "../../adler32_p.h" |
||||
|
||||
static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) { |
||||
static const uint16_t ALIGNED_(16) taps[64] = { |
||||
64, 63, 62, 61, 60, 59, 58, 57, |
||||
56, 55, 54, 53, 52, 51, 50, 49, |
||||
48, 47, 46, 45, 44, 43, 42, 41, |
||||
40, 39, 38, 37, 36, 35, 34, 33, |
||||
32, 31, 30, 29, 28, 27, 26, 25, |
||||
24, 23, 22, 21, 20, 19, 18, 17, |
||||
16, 15, 14, 13, 12, 11, 10, 9, |
||||
8, 7, 6, 5, 4, 3, 2, 1 }; |
||||
|
||||
uint32x4_t adacc = vdupq_n_u32(0); |
||||
uint32x4_t s2acc = vdupq_n_u32(0); |
||||
uint32x4_t s2acc_0 = vdupq_n_u32(0); |
||||
uint32x4_t s2acc_1 = vdupq_n_u32(0); |
||||
uint32x4_t s2acc_2 = vdupq_n_u32(0); |
||||
|
||||
adacc = vsetq_lane_u32(s[0], adacc, 0); |
||||
s2acc = vsetq_lane_u32(s[1], s2acc, 0); |
||||
|
||||
uint32x4_t s3acc = vdupq_n_u32(0); |
||||
uint32x4_t adacc_prev = adacc; |
||||
|
||||
uint16x8_t s2_0, s2_1, s2_2, s2_3; |
||||
s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0); |
||||
|
||||
uint16x8_t s2_4, s2_5, s2_6, s2_7; |
||||
s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0); |
||||
|
||||
size_t num_iter = len >> 2; |
||||
int rem = len & 3; |
||||
|
||||
for (size_t i = 0; i < num_iter; ++i) { |
||||
uint8x16x4_t d0_d3 = vld1q_u8_x4(buf); |
||||
|
||||
/* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
|
||||
* bit instruction, so we'll have to make do with summing to 16 bits first */
||||
uint16x8x2_t hsum, hsum_fold; |
||||
hsum.val[0] = vpaddlq_u8(d0_d3.val[0]); |
||||
hsum.val[1] = vpaddlq_u8(d0_d3.val[1]); |
||||
|
||||
hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d0_d3.val[2]); |
||||
hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d0_d3.val[3]); |
||||
|
||||
adacc = vpadalq_u16(adacc, hsum_fold.val[0]); |
||||
s3acc = vaddq_u32(s3acc, adacc_prev); |
||||
adacc = vpadalq_u16(adacc, hsum_fold.val[1]); |
||||
|
||||
/* If we do straight widening additions to the 16 bit values, we don't incur
|
||||
* the usual penalties of a pairwise add. We can defer the multiplications |
||||
* until the very end. These will not overflow because we are incurring at |
||||
* most 408 loop iterations (NMAX / 64), and a given lane is only going to be |
||||
* summed into once. This means for the maximum input size, the largest value |
||||
* we will see is 255 * 102 = 26010, safely under uint16 max */ |
||||
s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0])); |
||||
s2_1 = vaddw_high_u8(s2_1, d0_d3.val[0]); |
||||
s2_2 = vaddw_u8(s2_2, vget_low_u8(d0_d3.val[1])); |
||||
s2_3 = vaddw_high_u8(s2_3, d0_d3.val[1]); |
||||
s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2])); |
||||
s2_5 = vaddw_high_u8(s2_5, d0_d3.val[2]); |
||||
s2_6 = vaddw_u8(s2_6, vget_low_u8(d0_d3.val[3])); |
||||
s2_7 = vaddw_high_u8(s2_7, d0_d3.val[3]); |
||||
|
||||
adacc_prev = adacc; |
||||
buf += 64; |
||||
} |
||||
|
||||
s3acc = vshlq_n_u32(s3acc, 6); |
||||
|
||||
if (rem) { |
||||
uint32x4_t s3acc_0 = vdupq_n_u32(0); |
||||
while (rem--) { |
||||
uint8x16_t d0 = vld1q_u8(buf); |
||||
uint16x8_t adler; |
||||
adler = vpaddlq_u8(d0); |
||||
s2_6 = vaddw_u8(s2_6, vget_low_u8(d0)); |
||||
s2_7 = vaddw_high_u8(s2_7, d0); |
||||
adacc = vpadalq_u16(adacc, adler); |
||||
s3acc_0 = vaddq_u32(s3acc_0, adacc_prev); |
||||
adacc_prev = adacc; |
||||
buf += 16; |
||||
} |
||||
|
||||
s3acc_0 = vshlq_n_u32(s3acc_0, 4); |
||||
s3acc = vaddq_u32(s3acc_0, s3acc); |
||||
} |
||||
|
||||
uint16x8x4_t t0_t3 = vld1q_u16_x4(taps); |
||||
uint16x8x4_t t4_t7 = vld1q_u16_x4(taps + 32); |
||||
|
||||
s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0); |
||||
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0)); |
||||
s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1); |
||||
s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1)); |
||||
|
||||
s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2); |
||||
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2)); |
||||
s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3); |
||||
s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3)); |
||||
|
||||
s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4); |
||||
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4)); |
||||
s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5); |
||||
s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5)); |
||||
|
||||
s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6); |
||||
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6)); |
||||
s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7); |
||||
s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7)); |
||||
|
||||
s2acc = vaddq_u32(s2acc_0, s2acc); |
||||
s2acc_2 = vaddq_u32(s2acc_1, s2acc_2); |
||||
s2acc = vaddq_u32(s2acc, s2acc_2); |
||||
|
||||
uint32x2_t adacc2, s2acc2, as; |
||||
s2acc = vaddq_u32(s2acc, s3acc); |
||||
adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc)); |
||||
s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc)); |
||||
as = vpadd_u32(adacc2, s2acc2); |
||||
s[0] = vget_lane_u32(as, 0); |
||||
s[1] = vget_lane_u32(as, 1); |
||||
} |
||||
|
||||
static void NEON_handle_tail(uint32_t *pair, const uint8_t *buf, size_t len) { |
||||
unsigned int i; |
||||
for (i = 0; i < len; ++i) { |
||||
pair[0] += buf[i]; |
||||
pair[1] += pair[0]; |
||||
} |
||||
} |
||||
|
||||
Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len) { |
||||
/* split Adler-32 into component sums */ |
||||
uint32_t sum2 = (adler >> 16) & 0xffff; |
||||
adler &= 0xffff; |
||||
|
||||
/* in case user likes doing a byte at a time, keep it fast */ |
||||
if (len == 1) |
||||
return adler32_len_1(adler, buf, sum2); |
||||
|
||||
/* initial Adler-32 value (deferred check for len == 1 speed) */ |
||||
if (buf == NULL) |
||||
return 1L; |
||||
|
||||
/* in case short lengths are provided, keep it somewhat fast */ |
||||
if (len < 16) |
||||
return adler32_len_16(adler, buf, len, sum2); |
||||
|
||||
uint32_t pair[2]; |
||||
int n = NMAX; |
||||
unsigned int done = 0; |
||||
|
||||
/* Split Adler-32 into component sums, it can be supplied by
|
||||
* the caller sites (e.g. in a PNG file). |
||||
*/ |
||||
pair[0] = adler; |
||||
pair[1] = sum2; |
||||
|
||||
/* If memory is not SIMD aligned, do scalar sums to an aligned
|
||||
* offset, provided that doing so doesn't completely eliminate |
||||
* SIMD operation. Aligned loads are still faster on ARM, even |
||||
* though there's no explicit aligned load instruction */ |
||||
unsigned int align_offset = ((uintptr_t)buf & 15); |
||||
unsigned int align_adj = (align_offset) ? 16 - align_offset : 0; |
||||
|
||||
if (align_offset && len >= (16 + align_adj)) { |
||||
NEON_handle_tail(pair, buf, align_adj); |
||||
n -= align_adj; |
||||
done += align_adj; |
||||
|
||||
} else { |
||||
/* If here, we failed the len criteria test, it wouldn't be
|
||||
* worthwhile to do scalar aligning sums */ |
||||
align_adj = 0; |
||||
} |
||||
|
||||
while (done < len) { |
||||
int remaining = (int)(len - done); |
||||
n = MIN(remaining, (done == align_adj) ? n : NMAX); |
||||
|
||||
if (n < 16) |
||||
break; |
||||
|
||||
NEON_accum32(pair, buf + done, n >> 4); |
||||
pair[0] %= BASE; |
||||
pair[1] %= BASE; |
||||
|
||||
int actual_nsums = (n >> 4) << 4; |
||||
done += actual_nsums; |
||||
} |
||||
|
||||
/* Handle the tail elements. */ |
||||
if (done < len) { |
||||
NEON_handle_tail(pair, (buf + done), len - done); |
||||
pair[0] %= BASE; |
||||
pair[1] %= BASE; |
||||
} |
||||
|
||||
/* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */ |
||||
return (pair[1] << 16) | pair[0]; |
||||
} |
||||
|
||||
#endif |
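For reference, the per-block recurrence NEON_accum32 vectorizes, as a scalar sketch (illustrative only; scalar_accum64 is a hypothetical name, and the 64*s1 term is what s3acc carries via the shift by 6):

#include <stdint.h>

/* One 64-byte block: s2 grows by 64 times the old s1 plus the tap-weighted
 * byte sum (taps[i] == 64 - i), then s1 grows by the plain byte sum. */
static void scalar_accum64(uint32_t *s, const uint8_t *buf) {
    uint32_t s1 = s[0], s2 = s[1];
    uint32_t bytes = 0, weighted = 0;
    for (int i = 0; i < 64; ++i) {
        bytes    += buf[i];
        weighted += (uint32_t)(64 - i) * buf[i];
    }
    s2 += 64 * s1 + weighted;
    s1 += bytes;
    s[0] = s1;
    s[1] = s2;
}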
@ -0,0 +1,100 @@
#include "../../zbuild.h"
#include "arm_features.h"

#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
#  include <sys/auxv.h>
#  ifdef ARM_ASM_HWCAP
#    include <asm/hwcap.h>
#  endif
#elif defined(__FreeBSD__) && defined(__aarch64__)
#  include <machine/armreg.h>
#  ifndef ID_AA64ISAR0_CRC32_VAL
#    define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32
#  endif
#elif defined(__APPLE__)
#  if !defined(_DARWIN_C_SOURCE)
#    define _DARWIN_C_SOURCE /* enable types aliases (eg u_int) */
#  endif
#  include <sys/sysctl.h>
#elif defined(_WIN32)
#  include <windows.h>
#endif

static int arm_has_crc32() {
#if defined(__linux__) && defined(ARM_AUXV_HAS_CRC32)
#  ifdef HWCAP_CRC32
    return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0 ? 1 : 0;
#  else
    return (getauxval(AT_HWCAP2) & HWCAP2_CRC32) != 0 ? 1 : 0;
#  endif
#elif defined(__FreeBSD__) && defined(__aarch64__)
    return getenv("QEMU_EMULATING") == NULL
      && ID_AA64ISAR0_CRC32_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_CRC32_BASE;
#elif defined(__APPLE__)
    int hascrc32;
    size_t size = sizeof(hascrc32);
    return sysctlbyname("hw.optional.armv8_crc32", &hascrc32, &size, NULL, 0) == 0
      && hascrc32 == 1;
#elif defined(_WIN32)
    return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);
#elif defined(ARM_NOCHECK_ACLE)
    return 1;
#else
    return 0;
#endif
}

/* AArch64 has neon. */
#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
static inline int arm_has_neon() {
#if defined(__linux__) && defined(ARM_AUXV_HAS_NEON)
#  ifdef HWCAP_ARM_NEON
    return (getauxval(AT_HWCAP) & HWCAP_ARM_NEON) != 0 ? 1 : 0;
#  else
    return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0 ? 1 : 0;
#  endif
#elif defined(__APPLE__)
    int hasneon;
    size_t size = sizeof(hasneon);
    return sysctlbyname("hw.optional.neon", &hasneon, &size, NULL, 0) == 0
      && hasneon == 1;
#elif defined(_M_ARM) && defined(WINAPI_FAMILY_PARTITION)
#  if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
    return 1; /* Always supported */
#  endif
#endif

#if defined(ARM_NOCHECK_NEON)
    return 1;
#else
    return 0;
#endif
}
#endif

/* AArch64 does not have ARMv6 SIMD. */
#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
static inline int arm_has_simd() {
#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
    const char *platform = (const char *)getauxval(AT_PLATFORM);
    return strncmp(platform, "v6l", 3) == 0
        || strncmp(platform, "v7l", 3) == 0
        || strncmp(platform, "v8l", 3) == 0;
#elif defined(ARM_NOCHECK_SIMD)
    return 1;
#else
    return 0;
#endif
}
#endif

void Z_INTERNAL arm_check_features(struct arm_cpu_features *features) {
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
    features->has_simd = 0; /* never available */
    features->has_neon = 1; /* always available */
#else
    features->has_simd = arm_has_simd();
    features->has_neon = arm_has_neon();
#endif
    features->has_crc32 = arm_has_crc32();
}
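A minimal sketch of consuming this detection API (the main() wrapper is illustrative, not part of the patch; zlib-ng's real callers cache the result in a dispatch table at init time):

#include <stdio.h>
#include "zbuild.h"        /* for Z_INTERNAL, as in the sources above */
#include "arm_features.h"

int main(void) {
    struct arm_cpu_features f;
    arm_check_features(&f);
    printf("simd=%d neon=%d crc32=%d\n", f.has_simd, f.has_neon, f.has_crc32);
    return 0;
}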
@ -0,0 +1,16 @@
/* arm_features.h -- check for ARM features.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef ARM_H_
#define ARM_H_

struct arm_cpu_features {
    int has_simd;
    int has_neon;
    int has_crc32;
};

void Z_INTERNAL arm_check_features(struct arm_cpu_features *features);

#endif /* ARM_H_ */
@ -0,0 +1,99 @@
/* chunkset_neon.c -- NEON inline functions to copy small data chunks.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef ARM_NEON
#include "neon_intrins.h"
#include "../../zbuild.h"
#include "../generic/chunk_permute_table.h"

typedef uint8x16_t chunk_t;

#define CHUNK_SIZE 16

#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNK_MAG

static const lut_rem_pair perm_idx_lut[13] = {
    {0, 1},       /* 3 */
    {0, 0},       /* don't care */
    {1 * 32, 1},  /* 5 */
    {2 * 32, 4},  /* 6 */
    {3 * 32, 2},  /* 7 */
    {0 * 32, 0},  /* don't care */
    {4 * 32, 7},  /* 9 */
    {5 * 32, 6},  /* 10 */
    {6 * 32, 5},  /* 11 */
    {7 * 32, 4},  /* 12 */
    {8 * 32, 3},  /* 13 */
    {9 * 32, 2},  /* 14 */
    {10 * 32, 1}, /* 15 */
};

static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    uint16_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = vreinterpretq_u8_u16(vdupq_n_u16(tmp));
}

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    uint32_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = vreinterpretq_u8_u32(vdupq_n_u32(tmp));
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    uint64_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = vreinterpretq_u8_u64(vdupq_n_u64(tmp));
}

#define CHUNKSIZE        chunksize_neon
#define CHUNKCOPY        chunkcopy_neon
#define CHUNKUNROLL      chunkunroll_neon
#define CHUNKMEMSET      chunkmemset_neon
#define CHUNKMEMSET_SAFE chunkmemset_safe_neon

static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    *chunk = vld1q_u8(s);
}

static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    vst1q_u8(out, *chunk);
}

static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
    *chunk_rem = lut_rem.remval;

    /* See note in chunkset_ssse3.c for why this is ok */
    __msan_unpoison(buf + dist, 16 - dist);

    /* This version of table is only available on aarch64 */
#if defined(_M_ARM64) || defined(_M_ARM64EC) || defined(__aarch64__)
    uint8x16_t ret_vec = vld1q_u8(buf);

    uint8x16_t perm_vec = vld1q_u8(permute_table + lut_rem.idx);
    return vqtbl1q_u8(ret_vec, perm_vec);
#else
    uint8x8_t ret0, ret1, a, b, perm_vec0, perm_vec1;
    perm_vec0 = vld1_u8(permute_table + lut_rem.idx);
    perm_vec1 = vld1_u8(permute_table + lut_rem.idx + 8);
    a = vld1_u8(buf);
    b = vld1_u8(buf + 8);
    ret0 = vtbl1_u8(a, perm_vec0);
    uint8x8x2_t ab = {{a, b}};
    ret1 = vtbl2_u8(ab, perm_vec1);
    return vcombine_u8(ret0, ret1);
#endif
}

#include "chunkset_tpl.h"

#define INFLATE_FAST inflate_fast_neon

#include "inffast_tpl.h"

#endif
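A scalar model of the GET_CHUNK_MAG contract above (illustrative; chunk_mag_scalar is a hypothetical helper): the chunk becomes the dist-byte pattern repeated, and chunk_rem is 16 % dist, which is exactly the remval column of perm_idx_lut (e.g. dist 3 -> 1, dist 6 -> 4, dist 9 -> 7):

#include <stdint.h>

static uint32_t chunk_mag_scalar(uint8_t out[16], const uint8_t *buf, uint32_t dist) {
    for (uint32_t i = 0; i < 16; ++i)
        out[i] = buf[i % dist];  /* repeat the dist-byte pattern */
    return 16 % dist;            /* leftover pattern position == chunk_rem */
}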
@ -0,0 +1,59 @@
/* compare256_neon.c - NEON version of compare256
 * Copyright (C) 2022 Nathan Moinvaziri
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"

#include "fallback_builtins.h"

#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
#include "neon_intrins.h"

static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;

    do {
        uint8x16_t a, b, cmp;
        uint64_t lane;

        a = vld1q_u8(src0);
        b = vld1q_u8(src1);

        cmp = veorq_u8(a, b);

        lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 0);
        if (lane) {
            uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
            return len + match_byte;
        }
        len += 8;
        lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 1);
        if (lane) {
            uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
            return len + match_byte;
        }
        len += 8;

        src0 += 16, src1 += 16;
    } while (len < 256);

    return 256;
}

Z_INTERNAL uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1) {
    return compare256_neon_static(src0, src1);
}

#define LONGEST_MATCH       longest_match_neon
#define COMPARE256          compare256_neon_static

#include "match_tpl.h"

#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH       longest_match_slow_neon
#define COMPARE256          compare256_neon_static

#include "match_tpl.h"

#endif
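The contract being vectorized, as a scalar reference (a sketch with a hypothetical name, assuming the usual 256-byte window semantics):

#include <stdint.h>

/* Length of the common prefix of two 256-byte windows; compare256_neon
 * above computes the same result 16 bytes per iteration. */
static uint32_t compare256_scalar(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;
    while (len < 256 && src0[len] == src1[len])
        len++;
    return len;
}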
@ -0,0 +1,78 @@
/* crc32_acle.c -- compute the CRC-32 of a data stream
 * Copyright (C) 1995-2006, 2010, 2011, 2012 Mark Adler
 * Copyright (C) 2016 Yang Zhang
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 */

#ifdef ARM_ACLE
#include "acle_intrins.h"
#include "../../zbuild.h"

Z_INTERNAL Z_TARGET_CRC uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len) {
    Z_REGISTER uint32_t c;
    Z_REGISTER const uint16_t *buf2;
    Z_REGISTER const uint32_t *buf4;
    Z_REGISTER const uint64_t *buf8;

    c = ~crc;

    if (UNLIKELY(len == 1)) {
        c = __crc32b(c, *buf);
        c = ~c;
        return c;
    }

    if ((ptrdiff_t)buf & (sizeof(uint64_t) - 1)) {
        if (len && ((ptrdiff_t)buf & 1)) {
            c = __crc32b(c, *buf++);
            len--;
        }

        if ((len >= sizeof(uint16_t)) && ((ptrdiff_t)buf & sizeof(uint16_t))) {
            buf2 = (const uint16_t *) buf;
            c = __crc32h(c, *buf2++);
            len -= sizeof(uint16_t);
            buf4 = (const uint32_t *) buf2;
        } else {
            buf4 = (const uint32_t *) buf;
        }

        if ((len >= sizeof(uint32_t)) && ((ptrdiff_t)buf & sizeof(uint32_t))) {
            c = __crc32w(c, *buf4++);
            len -= sizeof(uint32_t);
        }

        buf8 = (const uint64_t *) buf4;
    } else {
        buf8 = (const uint64_t *) buf;
    }

    while (len >= sizeof(uint64_t)) {
        c = __crc32d(c, *buf8++);
        len -= sizeof(uint64_t);
    }

    if (len >= sizeof(uint32_t)) {
        buf4 = (const uint32_t *) buf8;
        c = __crc32w(c, *buf4++);
        len -= sizeof(uint32_t);
        buf2 = (const uint16_t *) buf4;
    } else {
        buf2 = (const uint16_t *) buf8;
    }

    if (len >= sizeof(uint16_t)) {
        c = __crc32h(c, *buf2++);
        len -= sizeof(uint16_t);
    }

    buf = (const unsigned char *) buf2;
    if (len) {
        c = __crc32b(c, *buf);
    }

    c = ~c;
    return c;
}
#endif
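Call it like zlib's crc32(): seed with 0 (or chain a previous result) and pass the byte count. A minimal usage sketch (checksum_hello is hypothetical; the declaration simply mirrors the definition above, minus the target attributes):

#include <stddef.h>
#include <stdint.h>

uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len);

static uint32_t checksum_hello(void) {
    static const uint8_t msg[] = "hello";
    return crc32_acle(0, msg, sizeof(msg) - 1);
}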
@ -0,0 +1,24 @@
/* insert_string_acle.c -- insert_string integer hash variant using ACLE's CRC instructions
 *
 * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 */

#ifdef ARM_ACLE
#include "acle_intrins.h"
#include "../../zbuild.h"
#include "../../deflate.h"

#define HASH_CALC(s, h, val) \
    h = __crc32w(0, val)

#define HASH_CALC_VAR       h
#define HASH_CALC_VAR_INIT  uint32_t h = 0

#define UPDATE_HASH         Z_TARGET_CRC update_hash_acle
#define INSERT_STRING       Z_TARGET_CRC insert_string_acle
#define QUICK_INSERT_STRING Z_TARGET_CRC quick_insert_string_acle

#include "../../insert_string_tpl.h"
#endif
@ -0,0 +1,58 @@ |
||||
#ifndef ARM_NEON_INTRINS_H |
||||
#define ARM_NEON_INTRINS_H |
||||
|
||||
#if defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC)) |
||||
/* arm64_neon.h is MSVC specific */ |
||||
# include <arm64_neon.h> |
||||
#else |
||||
# include <arm_neon.h> |
||||
#endif |
||||
|
||||
#if defined(ARM_NEON) && !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC) |
||||
/* Compatibility shim for the _high family of functions */ |
||||
#define vmull_high_u8(a, b) vmull_u8(vget_high_u8(a), vget_high_u8(b)) |
||||
#define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c)) |
||||
#define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c)) |
||||
#define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b)) |
||||
#endif |
||||
|
||||
#ifdef ARM_NEON |
||||
|
||||
#define vqsubq_u16_x4_x1(out, a, b) do { \ |
||||
out.val[0] = vqsubq_u16(a.val[0], b); \
|
||||
out.val[1] = vqsubq_u16(a.val[1], b); \
|
||||
out.val[2] = vqsubq_u16(a.val[2], b); \
|
||||
out.val[3] = vqsubq_u16(a.val[3], b); \
|
||||
} while (0) |
||||
|
||||
|
||||
# ifndef ARM_NEON_HASLD4 |
||||
|
||||
static inline uint16x8x4_t vld1q_u16_x4(uint16_t const *a) { |
||||
uint16x8x4_t ret = (uint16x8x4_t) {{ |
||||
vld1q_u16(a), |
||||
vld1q_u16(a+8), |
||||
vld1q_u16(a+16), |
||||
vld1q_u16(a+24)}}; |
||||
return ret; |
||||
} |
||||
|
||||
static inline uint8x16x4_t vld1q_u8_x4(uint8_t const *a) { |
||||
uint8x16x4_t ret = (uint8x16x4_t) {{ |
||||
vld1q_u8(a), |
||||
vld1q_u8(a+16), |
||||
vld1q_u8(a+32), |
||||
vld1q_u8(a+48)}}; |
||||
return ret; |
||||
} |
||||
|
||||
static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) { |
||||
vst1q_u16(p, a.val[0]); |
||||
vst1q_u16(p + 8, a.val[1]); |
||||
vst1q_u16(p + 16, a.val[2]); |
||||
vst1q_u16(p + 24, a.val[3]); |
||||
} |
||||
# endif // HASLD4 check
|
||||
#endif |
||||
|
||||
#endif // include guard ARM_NEON_INTRINS_H
|
@ -0,0 +1,47 @@
/* slide_hash_armv6.c -- Optimized hash table shifting for ARMv6 with support for SIMD instructions
 * Copyright (C) 2023 Cameron Cawley
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#if defined(ARM_SIMD)
#include "acle_intrins.h"
#include "../../zbuild.h"
#include "../../deflate.h"

/* SIMD version of hash_chain rebase */
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
    Z_REGISTER uint16x2_t v;
    uint16x2_t p0, p1, p2, p3;
    Z_REGISTER size_t n;

    size_t size = entries*sizeof(table[0]);
    Assert((size % (sizeof(uint16x2_t) * 4) == 0), "hash table size err");

    Assert(sizeof(Pos) == 2, "Wrong Pos size");
    v = wsize | (wsize << 16);

    n = size / (sizeof(uint16x2_t) * 4);
    do {
        p0 = *((const uint16x2_t *)(table));
        p1 = *((const uint16x2_t *)(table+2));
        p2 = *((const uint16x2_t *)(table+4));
        p3 = *((const uint16x2_t *)(table+6));
        p0 = __uqsub16(p0, v);
        p1 = __uqsub16(p1, v);
        p2 = __uqsub16(p2, v);
        p3 = __uqsub16(p3, v);
        *((uint16x2_t *)(table)) = p0;
        *((uint16x2_t *)(table+2)) = p1;
        *((uint16x2_t *)(table+4)) = p2;
        *((uint16x2_t *)(table+6)) = p3;
        table += 8;
    } while (--n);
}

Z_INTERNAL void slide_hash_armv6(deflate_state *s) {
    unsigned int wsize = s->w_size;

    slide_hash_chain(s->head, HASH_SIZE, wsize);
    slide_hash_chain(s->prev, wsize, wsize);
}
#endif
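What the saturating UQSUB16 lanes compute, as a scalar reference (hypothetical helper; the same contract applies to the NEON variant in the next file): each 16-bit position is rebased by wsize and clamps at 0, so entries that point before the slid window collapse to NIL.

#include <stddef.h>
#include <stdint.h>

static void slide_hash_scalar(uint16_t *table, size_t entries, uint16_t wsize) {
    for (size_t i = 0; i < entries; ++i)
        table[i] = (uint16_t)((table[i] >= wsize) ? table[i] - wsize : 0);
}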
@ -0,0 +1,46 @@
/* slide_hash_neon.c -- Optimized hash table shifting for ARM with support for NEON instructions
 * Copyright (C) 2017-2020 Mika T. Lindqvist
 *
 * Authors:
 * Mika T. Lindqvist <postmaster@raasu.org>
 * Jun He <jun.he@arm.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef ARM_NEON
#include "neon_intrins.h"
#include "../../zbuild.h"
#include "../../deflate.h"

/* SIMD version of hash_chain rebase */
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
    Z_REGISTER uint16x8_t v;
    uint16x8x4_t p0, p1;
    Z_REGISTER size_t n;

    size_t size = entries*sizeof(table[0]);
    Assert((size % (sizeof(uint16x8_t) * 8) == 0), "hash table size err");

    Assert(sizeof(Pos) == 2, "Wrong Pos size");
    v = vdupq_n_u16(wsize);

    n = size / (sizeof(uint16x8_t) * 8);
    do {
        p0 = vld1q_u16_x4(table);
        p1 = vld1q_u16_x4(table+32);
        vqsubq_u16_x4_x1(p0, p0, v);
        vqsubq_u16_x4_x1(p1, p1, v);
        vst1q_u16_x4(table, p0);
        vst1q_u16_x4(table+32, p1);
        table += 64;
    } while (--n);
}

Z_INTERNAL void slide_hash_neon(deflate_state *s) {
    unsigned int wsize = s->w_size;

    slide_hash_chain(s->head, HASH_SIZE, wsize);
    slide_hash_chain(s->prev, wsize, wsize);
}
#endif
@ -0,0 +1,24 @@ |
||||
# Makefile for zlib
|
||||
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
|
||||
# For conditions of distribution and use, see copyright notice in zlib.h
|
||||
|
||||
CC=
|
||||
CFLAGS=
|
||||
SFLAGS=
|
||||
INCLUDES=
|
||||
|
||||
SRCDIR=.
|
||||
SRCTOP=../..
|
||||
TOPDIR=$(SRCTOP)
|
||||
|
||||
all: |
||||
|
||||
|
||||
mostlyclean: clean |
||||
clean: |
||||
rm -f *.o *.lo *~ \
|
||||
rm -rf objs
|
||||
rm -f *.gcda *.gcno *.gcov
|
||||
|
||||
distclean: clean |
||||
rm -f Makefile
|
@ -0,0 +1,53 @@ |
||||
/* chunk_permute_table.h - shared AVX/SSSE3 permutation table for use with chunkmemset family of functions.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h |
||||
*/ |
||||
|
||||
#ifndef CHUNK_PERMUTE_TABLE_H_ |
||||
#define CHUNK_PERMUTE_TABLE_H_ |
||||
|
||||
#include "zbuild.h" |
||||
|
||||
/* Need entries for all numbers not an even modulus for 1, 2, 4, 8, 16 & 32 */ |
||||
static const ALIGNED_(32) uint8_t permute_table[26*32] = { |
||||
0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, /* dist 3 */ |
||||
0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, /* dist 5 */ |
||||
0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, /* dist 6 */ |
||||
0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, /* dist 7 */ |
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, /* dist 9 */ |
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, /* dist 10 */ |
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 11 */ |
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 12 */ |
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, /* dist 13 */ |
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, /* dist 14 */ |
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, /* dist 15 */ |
||||
|
||||
/* Beyond dists of 15 means we have to permute from a vector > len(m128i). Because AVX couldn't permute
|
||||
* beyond 128 bit lanes until AVX512 for sub 4-byte sequences, we have to do some math here for an eventual |
||||
* blend with a comparison. That means we need to wrap the indices with yet another derived table. For simplicity, |
||||
* we'll use absolute indexing here to derive a blend vector. This is actually a lot simpler with ARM's TBL, but, |
||||
* this is what we're dealt. |
||||
*/ |
||||
|
||||
16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* dist 17 */ |
||||
16, 17, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, /* dist 18 */ |
||||
16, 17, 18, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, /* dist 19 */ |
||||
16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, /* dist 20 */ |
||||
16, 17, 18, 19, 20, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, /* dist 21 */ |
||||
16, 17, 18, 19, 20, 21, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 22 */ |
||||
16, 17, 18, 19, 20, 21, 22, 0, 1, 2, 3, 4, 5, 6, 7, 8, /* dist 23 */ |
||||
16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 24 */ |
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 0, 1, 2, 3, 4, 5, 6, /* dist 25 */ |
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 1, 2, 3, 4, 5, /* dist 26 */ |
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 1, 2, 3, 4, /* dist 27 */ |
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3, /* dist 28 */ |
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 0, 1, 2, /* dist 29 */ |
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 1, /* dist 30 */ |
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, /* dist 31 */ |
||||
}; |
||||
|
||||
typedef struct lut_rem_pair_s { |
||||
uint16_t idx; |
||||
uint16_t remval; |
||||
} lut_rem_pair; |
||||
|
||||
#endif |
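A sketch of the compare-and-blend step the comment above describes for dists beyond 15, assuming SSSE3/SSE4.1 and a hypothetical function name (this is not the exact zlib-ng kernel): table entries >= 16 must be served from the second 16-byte load, and comparing the index row against 15 yields the blend mask.

#include <stdint.h>
#include <smmintrin.h>

static __m128i permute_dist_gt16(__m128i lo, __m128i hi, const uint8_t *row) {
    __m128i idx     = _mm_loadu_si128((const __m128i *)row);
    __m128i from_lo = _mm_shuffle_epi8(lo, idx);   /* entries < 16 index lo */
    __m128i from_hi = _mm_shuffle_epi8(hi, _mm_sub_epi8(idx, _mm_set1_epi8(16)));
    __m128i use_hi  = _mm_cmpgt_epi8(idx, _mm_set1_epi8(15));  /* entries 16..31 */
    return _mm_blendv_epi8(from_lo, from_hi, use_hi);
}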
@ -0,0 +1,93 @@ |
||||
# Makefile for POWER-specific files
|
||||
# Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
|
||||
# Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
|
||||
# For conditions of distribution and use, see copyright notice in zlib.h
|
||||
|
||||
CC=
|
||||
CFLAGS=
|
||||
SFLAGS=
|
||||
INCLUDES=
|
||||
SUFFIX=
|
||||
|
||||
P8FLAGS=-mcpu=power8
|
||||
P9FLAGS=-mcpu=power9
|
||||
PPCFLAGS=-maltivec
|
||||
NOLTOFLAG=
|
||||
|
||||
SRCDIR=.
|
||||
SRCTOP=../..
|
||||
TOPDIR=$(SRCTOP)
|
||||
|
||||
all: power_features.o \
|
||||
power_features.lo \
|
||||
adler32_power8.o \
|
||||
adler32_power8.lo \
|
||||
adler32_vmx.o \
|
||||
adler32_vmx.lo \
|
||||
chunkset_power8.o \
|
||||
chunkset_power8.lo \
|
||||
compare256_power9.o \
|
||||
compare256_power9.lo \
|
||||
crc32_power8.o \
|
||||
crc32_power8.lo \
|
||||
slide_hash_power8.o \
|
||||
slide_hash_power8.lo \
|
||||
slide_hash_vmx.o \
|
||||
slide_hash_vmx.lo
|
||||
|
||||
power_features.o: |
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c
|
||||
|
||||
power_features.lo: |
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c
|
||||
|
||||
adler32_power8.o: |
||||
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
|
||||
|
||||
adler32_power8.lo: |
||||
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
|
||||
|
||||
adler32_vmx.o: |
||||
$(CC) $(CFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
|
||||
|
||||
adler32_vmx.lo: |
||||
$(CC) $(SFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
|
||||
|
||||
chunkset_power8.o: |
||||
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
|
||||
|
||||
chunkset_power8.lo: |
||||
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
|
||||
|
||||
compare256_power9.o: |
||||
$(CC) $(CFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
|
||||
|
||||
compare256_power9.lo: |
||||
$(CC) $(SFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
|
||||
|
||||
crc32_power8.o: |
||||
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
|
||||
|
||||
crc32_power8.lo: |
||||
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
|
||||
|
||||
slide_hash_power8.o: |
||||
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
|
||||
|
||||
slide_hash_power8.lo: |
||||
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
|
||||
|
||||
slide_hash_vmx.o: |
||||
$(CC) $(CFLAGS) ${PPCFLAGS} $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
|
||||
|
||||
slide_hash_vmx.lo: |
||||
$(CC) $(SFLAGS) ${PPCFLAGS} $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
|
||||
|
||||
mostlyclean: clean |
||||
clean: |
||||
rm -f *.o *.lo *~
|
||||
rm -rf objs
|
||||
rm -f *.gcda *.gcno *.gcov
|
||||
|
||||
distclean: clean |
||||
rm -f Makefile
|
@ -0,0 +1,153 @@
/* Adler32 for POWER8 using VSX instructions.
 * Copyright (C) 2020 IBM Corporation
 * Author: Rogerio Alves <rcardoso@linux.ibm.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 * Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector)
 * instructions.
 *
 * If adler32 processes 1 byte at a time, then s1_0 (_n means iteration n) is
 * the initial value of adler - at start, _0 is 1 unless the initial adler
 * value is different than 1. So s1_1 = s1_0 + c[1] after the first
 * calculation. For the next iteration s1_2 = s1_1 + c[2] and so on. Hence,
 * for iteration N, s1_N = s1_(N-1) + c[N] is the value of s1 after
 * iteration N.
 *
 * Therefore, for s2 and iteration N:
 *
 * s2_N = s2_0 + N*s1_0 + N*c[1] + (N-1)*c[2] + ... + c[N]
 *
 * In a more general way:
 *
 * s1_N = s1_0 + sum(i=1 to N)c[i]
 * s2_N = s2_0 + N*s1_0 + sum(i=1 to N)(N-i+1)*c[i]
 *
 * Where s1_N, s2_N are the values for s1, s2 after N iterations. So if we
 * can process N bytes at a time we can do this at once.
 *
 * Since VSX can operate on 16-byte vectors, we can process 16 bytes at a
 * time; using N = 16 we have:
 *
 * s1 = s1_16 = s1_(16-1) + c[16] = s1_0 + sum(i=1 to 16)c[i]
 * s2 = s2_16 = s2_0 + 16*s1_0 + sum(i=1 to 16)(16-i+1)*c[i]
 *
 * After the first iteration we calculate the adler32 checksum for 16 bytes.
 *
 * For more background about adler32 please check the RFC:
 * https://www.ietf.org/rfc/rfc1950.txt
 */

#ifdef POWER8_VSX

#include <altivec.h>
#include "zbuild.h"
#include "adler32_p.h"

/* Vector across sum unsigned int (saturate). */
static inline vector unsigned int vec_sumsu(vector unsigned int __a, vector unsigned int __b) {
    __b = vec_sld(__a, __a, 8);
    __b = vec_add(__b, __a);
    __a = vec_sld(__b, __b, 4);
    __a = vec_add(__a, __b);

    return __a;
}

Z_INTERNAL uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t s1 = adler & 0xffff;
    uint32_t s2 = (adler >> 16) & 0xffff;

    /* in case user likes doing a byte at a time, keep it fast */
    if (UNLIKELY(len == 1))
        return adler32_len_1(s1, buf, s2);

    /* If buffer is empty or len=0 we need to return adler initial value. */
    if (UNLIKELY(buf == NULL))
        return 1;

    /* This is faster than VSX code for len < 64. */
    if (len < 64)
        return adler32_len_64(s1, buf, len, s2);

    /* Use POWER VSX instructions for len >= 64. */
    const vector unsigned int v_zeros = { 0 };
    const vector unsigned char v_mul = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
                                        6, 5, 4, 3, 2, 1};
    const vector unsigned char vsh = vec_splat_u8(4);
    const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0};
    vector unsigned int vs1 = { 0 };
    vector unsigned int vs2 = { 0 };
    vector unsigned int vs1_save = { 0 };
    vector unsigned int vsum1, vsum2;
    vector unsigned char vbuf;
    int n;

    vs1[0] = s1;
    vs2[0] = s2;

    /* Do length bigger than NMAX in blocks of NMAX size. */
    while (len >= NMAX) {
        len -= NMAX;
        n = NMAX / 16;
        do {
            vbuf = vec_xl(0, (unsigned char *) buf);
            vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
            /* sum(i=1 to 16) buf[i]*(16-i+1). */
            vsum2 = vec_msum(vbuf, v_mul, v_zeros);
            /* Save vs1. */
            vs1_save = vec_add(vs1_save, vs1);
            /* Accumulate the sums. */
            vs1 = vec_add(vsum1, vs1);
            vs2 = vec_add(vsum2, vs2);

            buf += 16;
        } while (--n);
        /* Once each block of NMAX size. */
        vs1 = vec_sumsu(vs1, vsum1);
        vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
        vs2 = vec_add(vs1_save, vs2);
        vs2 = vec_sumsu(vs2, vsum2);

        /* vs1[0] = (s1_i + sum(i=1 to 16)buf[i]) mod 65521. */
        vs1[0] = vs1[0] % BASE;
        /* vs2[0] = s2_i + 16*s1_save +
           sum(i=1 to 16)(16-i+1)*buf[i] mod 65521. */
        vs2[0] = vs2[0] % BASE;

        vs1 = vec_and(vs1, vmask);
        vs2 = vec_and(vs2, vmask);
        vs1_save = v_zeros;
    }

    /* len is less than NMAX, so only one modulo is needed. */
    if (len >= 16) {
        while (len >= 16) {
            len -= 16;

            vbuf = vec_xl(0, (unsigned char *) buf);

            vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
            /* sum(i=1 to 16) buf[i]*(16-i+1). */
            vsum2 = vec_msum(vbuf, v_mul, v_zeros);
            /* Save vs1. */
            vs1_save = vec_add(vs1_save, vs1);
            /* Accumulate the sums. */
            vs1 = vec_add(vsum1, vs1);
            vs2 = vec_add(vsum2, vs2);

            buf += 16;
        }
        /* Since the size will be always less than NMAX we do this once. */
        vs1 = vec_sumsu(vs1, vsum1);
        vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
        vs2 = vec_add(vs1_save, vs2);
        vs2 = vec_sumsu(vs2, vsum2);
    }
    /* Copy result back to s1, s2 (mod 65521). */
    s1 = vs1[0] % BASE;
    s2 = vs2[0] % BASE;

    /* Process tail (len < 16). */
    return adler32_len_16(s1, buf, len, s2);
}

#endif /* POWER8_VSX */
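The header comment's recurrences, restated compactly in LaTeX (same derivation, cleaner notation):

    s_{1,N} = s_{1,0} + \sum_{i=1}^{N} c_i
    s_{2,N} = s_{2,0} + N\,s_{1,0} + \sum_{i=1}^{N} (N - i + 1)\,c_i

With N = 16 these are precisely the two per-load accumulations: vec_sum4s builds the plain byte sum and vec_msum with the 16..1 taps builds the weighted sum.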
@ -0,0 +1,186 @@
/* adler32_vmx.c -- compute the Adler-32 checksum of a data stream
 * Copyright (C) 1995-2011 Mark Adler
 * Copyright (C) 2017-2023 Mika T. Lindqvist <postmaster@raasu.org>
 * Copyright (C) 2021 Adam Stylinski <kungfujesus06@gmail.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef PPC_VMX
#include <altivec.h>
#include "zbuild.h"
#include "zendian.h"
#include "adler32_p.h"

#define vmx_zero()  (vec_splat_u32(0))

static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
    unsigned int i;
    for (i = 0; i < len; ++i) {
        pair[0] += buf[i];
        pair[1] += pair[0];
    }
}

static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
    /* Different taps for the separable components of sums */
    const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
    const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
    const vector unsigned char t2 = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17};
    const vector unsigned char t3 = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
    /* As silly and inefficient as it seems, creating 1 permutation vector to permute
     * a 2 element vector from a single load + a subsequent shift is just barely faster
     * than doing 2 indexed insertions into zero initialized vectors from unaligned memory. */
    const vector unsigned char s0_perm = {0, 1, 2, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    const vector unsigned char shift_vec = vec_sl(vec_splat_u8(8), vec_splat_u8(2));
    vector unsigned int adacc, s2acc;
    vector unsigned int pair_vec = vec_ld(0, s);
    adacc = vec_perm(pair_vec, pair_vec, s0_perm);
#if BYTE_ORDER == LITTLE_ENDIAN
    s2acc = vec_sro(pair_vec, shift_vec);
#else
    s2acc = vec_slo(pair_vec, shift_vec);
#endif

    vector unsigned int zero = vmx_zero();
    vector unsigned int s3acc = zero;
    vector unsigned int s3acc_0 = zero;
    vector unsigned int adacc_prev = adacc;
    vector unsigned int adacc_prev_0 = zero;

    vector unsigned int s2acc_0 = zero;
    vector unsigned int s2acc_1 = zero;
    vector unsigned int s2acc_2 = zero;

    /* Maintain a running sum of a second half, this might help us break yet another
     * data dependency bubble in the sum */
    vector unsigned int adacc_0 = zero;

    int num_iter = len / 4;
    int rem = len & 3;

    for (int i = 0; i < num_iter; ++i) {
        vector unsigned char d0 = vec_ld(0, buf);
        vector unsigned char d1 = vec_ld(16, buf);
        vector unsigned char d2 = vec_ld(32, buf);
        vector unsigned char d3 = vec_ld(48, buf);

        /* The core operation of the loop, basically
         * what is being unrolled below */
        adacc = vec_sum4s(d0, adacc);
        s3acc = vec_add(s3acc, adacc_prev);
        s3acc_0 = vec_add(s3acc_0, adacc_prev_0);
        s2acc = vec_msum(t0, d0, s2acc);

        /* interleave dependent sums in here */
        adacc_0 = vec_sum4s(d1, adacc_0);
        s2acc_0 = vec_msum(t1, d1, s2acc_0);
        adacc = vec_sum4s(d2, adacc);
        s2acc_1 = vec_msum(t2, d2, s2acc_1);
        s2acc_2 = vec_msum(t3, d3, s2acc_2);
        adacc_0 = vec_sum4s(d3, adacc_0);

        adacc_prev = adacc;
        adacc_prev_0 = adacc_0;
        buf += 64;
    }

    adacc = vec_add(adacc, adacc_0);
    s3acc = vec_add(s3acc, s3acc_0);
    s3acc = vec_sl(s3acc, vec_splat_u32(6));

    if (rem) {
        adacc_prev = vec_add(adacc_prev_0, adacc_prev);
        adacc_prev = vec_sl(adacc_prev, vec_splat_u32(4));
        while (rem--) {
            vector unsigned char d0 = vec_ld(0, buf);
            adacc = vec_sum4s(d0, adacc);
            s3acc = vec_add(s3acc, adacc_prev);
            s2acc = vec_msum(t3, d0, s2acc);
            adacc_prev = vec_sl(adacc, vec_splat_u32(4));
            buf += 16;
        }
    }

    /* Sum up independent second sums */
    s2acc = vec_add(s2acc, s2acc_0);
    s2acc_2 = vec_add(s2acc_1, s2acc_2);
    s2acc = vec_add(s2acc, s2acc_2);

    s2acc = vec_add(s2acc, s3acc);

    adacc = vec_add(adacc, vec_sld(adacc, adacc, 8));
    s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 8));
    adacc = vec_add(adacc, vec_sld(adacc, adacc, 4));
    s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 4));

    vec_ste(adacc, 0, s);
    vec_ste(s2acc, 0, s+1);
}

Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t sum2;
    uint32_t pair[16] ALIGNED_(16);
    memset(&pair[2], 0, 14);
    int n = NMAX;
    unsigned int done = 0, i;

    /* Split Adler-32 into component sums, it can be supplied by
     * the caller sites (e.g. in a PNG file).
     */
    sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;
    pair[0] = adler;
    pair[1] = sum2;

    /* in case user likes doing a byte at a time, keep it fast */
    if (UNLIKELY(len == 1))
        return adler32_len_1(adler, buf, sum2);

    /* initial Adler-32 value (deferred check for len == 1 speed) */
    if (UNLIKELY(buf == NULL))
        return 1L;

    /* in case short lengths are provided, keep it somewhat fast */
    if (UNLIKELY(len < 16))
        return adler32_len_16(adler, buf, len, sum2);

    // Align buffer
    unsigned int al = 0;
    if ((uintptr_t)buf & 0xf) {
        al = 16-((uintptr_t)buf & 0xf);
        if (al > len) {
            al = len;
        }
        vmx_handle_head_or_tail(pair, buf, al);

        done += al;
        /* Rather than rebasing, we can reduce the max sums for the
         * first round only */
        n -= al;
    }
    for (i = al; i < len; i += n) {
        int remaining = (int)(len-i);
        n = MIN(remaining, (i == al) ? n : NMAX);

        if (n < 16)
            break;

        vmx_accum32(pair, buf + i, n / 16);
        pair[0] %= BASE;
        pair[1] %= BASE;

        done += (n / 16) * 16;
    }

    /* Handle the tail elements. */
    if (done < len) {
        vmx_handle_head_or_tail(pair, (buf + done), len - done);
        pair[0] %= BASE;
        pair[1] %= BASE;
    }

    /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
    return (pair[1] << 16) | pair[0];
}
#endif
@ -0,0 +1,55 @@ |
||||
/* chunkset_power8.c -- VSX inline functions to copy small data chunks.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h |
||||
*/ |
||||
|
||||
#ifdef POWER8_VSX |
||||
#include <altivec.h> |
||||
#include "../../zbuild.h" |
||||
|
||||
typedef vector unsigned char chunk_t; |
||||
|
||||
#define CHUNK_SIZE 16 |
||||
|
||||
#define HAVE_CHUNKMEMSET_2 |
||||
#define HAVE_CHUNKMEMSET_4 |
||||
#define HAVE_CHUNKMEMSET_8 |
||||
|
||||
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { |
||||
uint16_t tmp; |
||||
memcpy(&tmp, from, sizeof(tmp)); |
||||
*chunk = (vector unsigned char)vec_splats(tmp); |
||||
} |
||||
|
||||
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { |
||||
uint32_t tmp; |
||||
memcpy(&tmp, from, sizeof(tmp)); |
||||
*chunk = (vector unsigned char)vec_splats(tmp); |
||||
} |
||||
|
||||
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { |
||||
uint64_t tmp; |
||||
memcpy(&tmp, from, sizeof(tmp)); |
||||
*chunk = (vector unsigned char)vec_splats((unsigned long long)tmp); |
||||
} |
||||
|
||||
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { |
||||
*chunk = vec_xl(0, s); |
||||
} |
||||
|
||||
static inline void storechunk(uint8_t *out, chunk_t *chunk) { |
||||
vec_xst(*chunk, 0, out); |
||||
} |
||||
|
||||
#define CHUNKSIZE chunksize_power8 |
||||
#define CHUNKCOPY chunkcopy_power8 |
||||
#define CHUNKUNROLL chunkunroll_power8 |
||||
#define CHUNKMEMSET chunkmemset_power8 |
||||
#define CHUNKMEMSET_SAFE chunkmemset_safe_power8 |
||||
|
||||
#include "chunkset_tpl.h" |
||||
|
||||
#define INFLATE_FAST inflate_fast_power8 |
||||
|
||||
#include "inffast_tpl.h" |
||||
|
||||
#endif |
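A scalar view of the broadcast helpers, assuming a 16-byte chunk_t (chunkmemset_4_scalar is a hypothetical name):

#include <stdint.h>
#include <string.h>

/* Equivalent to the vec_splats broadcast above: tile the 4-byte pattern
 * across the whole chunk. */
static void chunkmemset_4_scalar(const uint8_t *from, uint8_t chunk[16]) {
    for (int i = 0; i < 16; i += 4)
        memcpy(chunk + i, from, 4);
}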
@ -0,0 +1,64 @@ |
||||
/* compare256_power9.c - Power9 version of compare256
|
||||
* Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM |
||||
* For conditions of distribution and use, see copyright notice in zlib.h |
||||
*/ |
||||
|
||||
#ifdef POWER9 |
||||
#include <altivec.h> |
||||
#include "../../zbuild.h" |
||||
#include "../../zendian.h" |
||||
|
||||
/* Older versions of GCC misimplemented semantics for these bit counting builtins.
|
||||
* https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac */
|
||||
#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 12) |
||||
#if BYTE_ORDER == LITTLE_ENDIAN |
||||
# define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vctzlsbb(vc) |
||||
#else |
||||
# define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vclzlsbb(vc) |
||||
#endif |
||||
#else |
||||
# define zng_vec_vctzlsbb(vc, len) len = vec_cntlz_lsbb(vc) |
||||
#endif |
||||
|
||||
static inline uint32_t compare256_power9_static(const uint8_t *src0, const uint8_t *src1) { |
||||
uint32_t len = 0, cmplen; |
||||
|
||||
do { |
||||
vector unsigned char vsrc0, vsrc1, vc; |
||||
|
||||
vsrc0 = *((vector unsigned char *)src0); |
||||
vsrc1 = *((vector unsigned char *)src1); |
||||
|
||||
/* Compare 16 bytes at a time. Each byte of vc will be either
|
||||
* all ones or all zeroes, depending on the result of the comparison. */ |
||||
vc = (vector unsigned char)vec_cmpne(vsrc0, vsrc1); |
||||
|
||||
/* Since the index of matching bytes will contain only zeroes
|
||||
* on vc (since we used cmpne), counting the number of consecutive |
||||
* bytes where LSB == 0 is the same as counting the length of the match. */ |
||||
zng_vec_vctzlsbb(vc, cmplen); |
||||
if (cmplen != 16) |
||||
return len + cmplen; |
||||
|
||||
src0 += 16, src1 += 16, len += 16; |
||||
} while (len < 256); |
||||
|
||||
return 256; |
||||
} |
||||
|
||||
Z_INTERNAL uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1) { |
||||
return compare256_power9_static(src0, src1); |
||||
} |
||||
|
||||
#define LONGEST_MATCH longest_match_power9 |
||||
#define COMPARE256 compare256_power9_static |
||||
|
||||
#include "match_tpl.h" |
||||
|
||||
#define LONGEST_MATCH_SLOW |
||||
#define LONGEST_MATCH longest_match_slow_power9 |
||||
#define COMPARE256 compare256_power9_static |
||||
|
||||
#include "match_tpl.h" |
||||
|
||||
#endif |
File diff suppressed because it is too large
@ -0,0 +1,589 @@
/* crc32 for POWER8 using VSX instructions
 * Copyright (C) 2021 IBM Corporation
 *
 * Author: Rogerio Alves <rogealve@br.ibm.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 * Calculate the checksum of data that is 16 byte aligned and a multiple of
 * 16 bytes.
 *
 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
 * chunks in order to mask the latency of the vpmsum instructions. If we
 * have more than 32 kB of data to checksum we repeat this step multiple
 * times, passing in the previous 1024 bits.
 *
 * The next step is to reduce the 1024 bits to 64 bits. This step adds
 * 32 bits of 0s to the end - this matches what a CRC does. We just
 * calculate constants that land the data in this 32 bits.
 *
 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
 * for n = CRC using POWER8 instructions. We use x = 32.
 *
 * http://en.wikipedia.org/wiki/Barrett_reduction
 *
 * This code uses gcc vector builtins instead of using assembly directly.
 */

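/* A sketch of the Barrett step referenced above, assuming g(x) is the
 * degree-32 CRC polynomial and mu(x) = floor(x^64 / g(x)) is precomputed:
 *
 *   q(x) = floor((floor(a(x) / x^32) * mu(x)) / x^32)
 *   a(x) mod g(x) = a(x) xor (q(x) * g(x)), keeping the low 32 bits
 *
 * so two carry-less multiplies (vpmsum) and an xor stand in for a
 * polynomial division. */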
#include <altivec.h>
#include "zendian.h"
#include "zbuild.h"

#include "crc32_constants.h"
#include "crc32_braid_tbl.h"

#if defined (__clang__)
#include "fallback_builtins.h"
#endif

#define MAX_SIZE    32768
#define VMX_ALIGN   16
#define VMX_ALIGN_MASK  (VMX_ALIGN-1)

static unsigned int crc32_align(unsigned int crc, const unsigned char *p, unsigned long len) {
    while (len--)
        crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8);
    return crc;
}

static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len);

Z_INTERNAL uint32_t crc32_power8(uint32_t crc, const unsigned char *p, size_t _len) {
    unsigned int prealign;
    unsigned int tail;

    unsigned long len = (unsigned long) _len;

    if (p == (const unsigned char *) 0x0)
        return 0;

    crc ^= 0xffffffff;

    if (len < VMX_ALIGN + VMX_ALIGN_MASK) {
        crc = crc32_align(crc, p, len);
        goto out;
    }

    if ((unsigned long)p & VMX_ALIGN_MASK) {
        prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
        crc = crc32_align(crc, p, prealign);
        len -= prealign;
        p += prealign;
    }

    crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);

    tail = len & VMX_ALIGN_MASK;
    if (tail) {
        p += len & ~VMX_ALIGN_MASK;
        crc = crc32_align(crc, p, tail);
    }

out:
    crc ^= 0xffffffff;

    return crc;
}

/* When we have a load-store in a single-dispatch group and address overlap
 * such that forwarding is not allowed (load-hit-store) the group must be flushed.
 * A group-ending NOP prevents the flush.
 */
#define GROUP_ENDING_NOP __asm__("ori 2,2,0" ::: "memory")

#if BYTE_ORDER == BIG_ENDIAN
#define BYTESWAP_DATA
#endif

#ifdef BYTESWAP_DATA
#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb, (__vector unsigned char) vc)
#if BYTE_ORDER == LITTLE_ENDIAN
/* Byte reverse permute constant LE. */
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x08090A0B0C0D0E0FUL, 0x0001020304050607UL };
#else
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x0F0E0D0C0B0A0908UL, 0x0706050403020100UL };
#endif
#else
#define VEC_PERM(vr, va, vb, vc)
#endif

static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) {

    const __vector unsigned long long vzero = {0,0};
    const __vector unsigned long long vones = {0xffffffffffffffffUL, 0xffffffffffffffffUL};

    const __vector unsigned long long vmask_32bit =
        (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 4);

    const __vector unsigned long long vmask_64bit =
        (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 8);

    __vector unsigned long long vcrc;

    __vector unsigned long long vconst1, vconst2;

    /* vdata0-vdata7 will contain our data (p). */
    __vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, vdata5, vdata6, vdata7;

    /* v0-v7 will contain our checksums */
    __vector unsigned long long v0 = {0,0};
    __vector unsigned long long v1 = {0,0};
    __vector unsigned long long v2 = {0,0};
    __vector unsigned long long v3 = {0,0};
    __vector unsigned long long v4 = {0,0};
    __vector unsigned long long v5 = {0,0};
    __vector unsigned long long v6 = {0,0};
    __vector unsigned long long v7 = {0,0};

    /* Vector auxiliary variables. */
    __vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7;

    unsigned int offset; /* Constant table offset. */

    unsigned long i; /* Counter. */
    unsigned long chunks;

    unsigned long block_size;
    int next_block = 0;

    /* Align by 128 bits. The last 128 bit block will be processed at end. */
    unsigned long length = len & 0xFFFFFFFFFFFFFF80UL;

    vcrc = (__vector unsigned long long)__builtin_pack_vector_int128(0UL, crc);

    /* Short version. */
    if (len < 256) {
        /* Calculate where in the constant table we need to start. */
        offset = 256 - len;

        vconst1 = vec_ld(offset, vcrc_short_const);
        vdata0 = vec_ld(0, (__vector unsigned long long*) p);
        VEC_PERM(vdata0, vdata0, vconst1, vperm_const);

        /* xor initial value */
        vdata0 = vec_xor(vdata0, vcrc);

        vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
        v0 = vec_xor(v0, vdata0);

        for (i = 16; i < len; i += 16) {
            vconst1 = vec_ld(offset + i, vcrc_short_const);
            vdata0 = vec_ld(i, (__vector unsigned long long*) p);
            VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
            vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
                (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
            v0 = vec_xor(v0, vdata0);
        }
    } else {

        /* Load initial values. */
        vdata0 = vec_ld(0, (__vector unsigned long long*) p);
        vdata1 = vec_ld(16, (__vector unsigned long long*) p);

        VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
        VEC_PERM(vdata1, vdata1, vdata1, vperm_const);

        vdata2 = vec_ld(32, (__vector unsigned long long*) p);
        vdata3 = vec_ld(48, (__vector unsigned long long*) p);

        VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
        VEC_PERM(vdata3, vdata3, vdata3, vperm_const);

        vdata4 = vec_ld(64, (__vector unsigned long long*) p);
        vdata5 = vec_ld(80, (__vector unsigned long long*) p);

        VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
        VEC_PERM(vdata5, vdata5, vdata5, vperm_const);

        vdata6 = vec_ld(96, (__vector unsigned long long*) p);
        vdata7 = vec_ld(112, (__vector unsigned long long*) p);

        VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
        VEC_PERM(vdata7, vdata7, vdata7, vperm_const);

        /* xor in initial value */
        vdata0 = vec_xor(vdata0, vcrc);

        p = (char *)p + 128;

        do {
            /* Checksum in blocks of MAX_SIZE. */
            block_size = length;
            if (block_size > MAX_SIZE) {
                block_size = MAX_SIZE;
            }

            length = length - block_size;

            /*
             * Work out the offset into the constants table to start at. Each
             * constant is 16 bytes, and it is used against 128 bytes of input
             * data - 128 / 16 = 8
             */
            offset = (MAX_SIZE/8) - (block_size/8);
            /* We reduce our final 128 bytes in a separate step */
            chunks = (block_size/128)-1;

            vconst1 = vec_ld(offset, vcrc_const);

            va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
                    (__vector unsigned long long)vconst1);
            va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
                    (__vector unsigned long long)vconst1);
            va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
                    (__vector unsigned long long)vconst1);
            va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
                    (__vector unsigned long long)vconst1);
            va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
                    (__vector unsigned long long)vconst1);
            va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
                    (__vector unsigned long long)vconst1);
            va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
                    (__vector unsigned long long)vconst1);
            va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
                    (__vector unsigned long long)vconst1);

            if (chunks > 1) {
                offset += 16;
                vconst2 = vec_ld(offset, vcrc_const);
                GROUP_ENDING_NOP;

                vdata0 = vec_ld(0, (__vector unsigned long long*) p);
                VEC_PERM(vdata0, vdata0, vdata0, vperm_const);

                vdata1 = vec_ld(16, (__vector unsigned long long*) p);
                VEC_PERM(vdata1, vdata1, vdata1, vperm_const);

                vdata2 = vec_ld(32, (__vector unsigned long long*) p);
                VEC_PERM(vdata2, vdata2, vdata2, vperm_const);

                vdata3 = vec_ld(48, (__vector unsigned long long*) p);
                VEC_PERM(vdata3, vdata3, vdata3, vperm_const);

                vdata4 = vec_ld(64, (__vector unsigned long long*) p);
                VEC_PERM(vdata4, vdata4, vdata4, vperm_const);

                vdata5 = vec_ld(80, (__vector unsigned long long*) p);
                VEC_PERM(vdata5, vdata5, vdata5, vperm_const);

                vdata6 = vec_ld(96, (__vector unsigned long long*) p);
                VEC_PERM(vdata6, vdata6, vdata6, vperm_const);

                vdata7 = vec_ld(112, (__vector unsigned long long*) p);
                VEC_PERM(vdata7, vdata7, vdata7, vperm_const);

                p = (char *)p + 128;

                /*
                 * main loop. Each iteration calculates the CRC for a 128-byte
                 * block.
                 */
                for (i = 0; i < chunks-2; i++) {
                    vconst1 = vec_ld(offset, vcrc_const);
                    offset += 16;
                    GROUP_ENDING_NOP;

                    v0 = vec_xor(v0, va0);
                    va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
                            (__vector unsigned long long)vconst2);
                    vdata0 = vec_ld(0, (__vector unsigned long long*) p);
                    VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
                    GROUP_ENDING_NOP;

                    v1 = vec_xor(v1, va1);
                    va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
                            (__vector unsigned long long)vconst2);
                    vdata1 = vec_ld(16, (__vector unsigned long long*) p);
                    VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
                    GROUP_ENDING_NOP;

                    v2 = vec_xor(v2, va2);
                    va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
                            (__vector unsigned long long)vconst2);
                    vdata2 = vec_ld(32, (__vector unsigned long long*) p);
                    VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
                    GROUP_ENDING_NOP;

                    v3 = vec_xor(v3, va3);
                    va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
                            (__vector unsigned long long)vconst2);
                    vdata3 = vec_ld(48, (__vector unsigned long long*) p);
                    VEC_PERM(vdata3, vdata3, vdata3, vperm_const);

                    vconst2 = vec_ld(offset, vcrc_const);
                    GROUP_ENDING_NOP;

                    v4 = vec_xor(v4, va4);
                    va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
                            (__vector unsigned long long)vconst1);
                    vdata4 = vec_ld(64, (__vector unsigned long long*) p);
                    VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
                    GROUP_ENDING_NOP;

                    v5 = vec_xor(v5, va5);
                    va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
                            (__vector unsigned long long)vconst1);
                    vdata5 = vec_ld(80, (__vector unsigned long long*) p);
                    VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
                    GROUP_ENDING_NOP;

                    v6 = vec_xor(v6, va6);
                    va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
                            (__vector unsigned long long)vconst1);
                    vdata6 = vec_ld(96, (__vector unsigned long long*) p);
                    VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
                    GROUP_ENDING_NOP;

                    v7 = vec_xor(v7, va7);
                    va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
                            (__vector unsigned long long)vconst1);
                    vdata7 = vec_ld(112, (__vector unsigned long long*) p);
                    VEC_PERM(vdata7, vdata7, vdata7, vperm_const);

                    p = (char *)p + 128;
                }

                /* First cool down */
                vconst1 = vec_ld(offset, vcrc_const);
                offset += 16;

                v0 = vec_xor(v0, va0);
                va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
                        (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v1 = vec_xor(v1, va1);
                va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
                        (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v2 = vec_xor(v2, va2);
                va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
                        (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v3 = vec_xor(v3, va3);
                va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
                        (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v4 = vec_xor(v4, va4);
                va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
                        (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v5 = vec_xor(v5, va5);
                va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
                        (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v6 = vec_xor(v6, va6);
                va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
                        (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v7 = vec_xor(v7, va7);
                va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
                        (__vector unsigned long long)vconst1);
||||
}/* else */ |
||||
|
||||
/* Second cool down. */ |
||||
v0 = vec_xor(v0, va0); |
||||
v1 = vec_xor(v1, va1); |
||||
v2 = vec_xor(v2, va2); |
||||
v3 = vec_xor(v3, va3); |
||||
v4 = vec_xor(v4, va4); |
||||
v5 = vec_xor(v5, va5); |
||||
v6 = vec_xor(v6, va6); |
||||
v7 = vec_xor(v7, va7); |
||||
|
||||
/*
|
||||
* vpmsumd produces a 96 bit result in the least significant bits |
||||
* of the register. Since we are bit reflected we have to shift it |
||||
* left 32 bits so it occupies the least significant bits in the |
||||
* bit reflected domain. |
||||
*/ |
||||
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, |
||||
(__vector unsigned char)vzero, 4); |
||||
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1, |
||||
(__vector unsigned char)vzero, 4); |
||||
v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2, |
||||
(__vector unsigned char)vzero, 4); |
||||
v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3, |
||||
(__vector unsigned char)vzero, 4); |
||||
v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4, |
||||
(__vector unsigned char)vzero, 4); |
||||
v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5, |
||||
(__vector unsigned char)vzero, 4); |
||||
v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6, |
||||
(__vector unsigned char)vzero, 4); |
||||
v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7, |
||||
(__vector unsigned char)vzero, 4); |
||||
|
||||
/* xor with the last 1024 bits. */ |
||||
va0 = vec_ld(0, (__vector unsigned long long*) p); |
||||
VEC_PERM(va0, va0, va0, vperm_const); |
||||
|
||||
va1 = vec_ld(16, (__vector unsigned long long*) p); |
||||
VEC_PERM(va1, va1, va1, vperm_const); |
||||
|
||||
va2 = vec_ld(32, (__vector unsigned long long*) p); |
||||
VEC_PERM(va2, va2, va2, vperm_const); |
||||
|
||||
va3 = vec_ld(48, (__vector unsigned long long*) p); |
||||
VEC_PERM(va3, va3, va3, vperm_const); |
||||
|
||||
va4 = vec_ld(64, (__vector unsigned long long*) p); |
||||
VEC_PERM(va4, va4, va4, vperm_const); |
||||
|
||||
va5 = vec_ld(80, (__vector unsigned long long*) p); |
||||
VEC_PERM(va5, va5, va5, vperm_const); |
||||
|
||||
va6 = vec_ld(96, (__vector unsigned long long*) p); |
||||
VEC_PERM(va6, va6, va6, vperm_const); |
||||
|
||||
va7 = vec_ld(112, (__vector unsigned long long*) p); |
||||
VEC_PERM(va7, va7, va7, vperm_const); |
||||
|
||||
p = (char *)p + 128; |
||||
|
||||
vdata0 = vec_xor(v0, va0); |
||||
vdata1 = vec_xor(v1, va1); |
||||
vdata2 = vec_xor(v2, va2); |
||||
vdata3 = vec_xor(v3, va3); |
||||
vdata4 = vec_xor(v4, va4); |
||||
vdata5 = vec_xor(v5, va5); |
||||
vdata6 = vec_xor(v6, va6); |
||||
vdata7 = vec_xor(v7, va7); |
||||
|
||||
/* Check if we have more blocks to process */ |
||||
next_block = 0; |
||||
if (length != 0) { |
||||
next_block = 1; |
||||
|
||||
/* zero v0-v7 */ |
||||
v0 = vec_xor(v0, v0); |
||||
v1 = vec_xor(v1, v1); |
||||
v2 = vec_xor(v2, v2); |
||||
v3 = vec_xor(v3, v3); |
||||
v4 = vec_xor(v4, v4); |
||||
v5 = vec_xor(v5, v5); |
||||
v6 = vec_xor(v6, v6); |
||||
v7 = vec_xor(v7, v7); |
||||
} |
||||
length = length + 128; |
||||
|
||||
} while (next_block); |
||||
|
||||
/* Calculate how many bytes we have left. */ |
||||
length = (len & 127); |
||||
|
||||
/* Calculate where in (short) constant table we need to start. */ |
||||
offset = 128 - length; |
||||
|
||||
v0 = vec_ld(offset, vcrc_short_const); |
||||
v1 = vec_ld(offset + 16, vcrc_short_const); |
||||
v2 = vec_ld(offset + 32, vcrc_short_const); |
||||
v3 = vec_ld(offset + 48, vcrc_short_const); |
||||
v4 = vec_ld(offset + 64, vcrc_short_const); |
||||
v5 = vec_ld(offset + 80, vcrc_short_const); |
||||
v6 = vec_ld(offset + 96, vcrc_short_const); |
||||
v7 = vec_ld(offset + 112, vcrc_short_const); |
||||
|
||||
offset += 128; |
||||
|
||||
v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw( |
||||
(__vector unsigned int)vdata0, (__vector unsigned int)v0); |
||||
v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw( |
||||
(__vector unsigned int)vdata1, (__vector unsigned int)v1); |
||||
v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw( |
||||
(__vector unsigned int)vdata2, (__vector unsigned int)v2); |
||||
v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw( |
||||
(__vector unsigned int)vdata3, (__vector unsigned int)v3); |
||||
v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw( |
||||
(__vector unsigned int)vdata4, (__vector unsigned int)v4); |
||||
v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw( |
||||
(__vector unsigned int)vdata5, (__vector unsigned int)v5); |
||||
v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw( |
||||
(__vector unsigned int)vdata6, (__vector unsigned int)v6); |
||||
v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw( |
||||
(__vector unsigned int)vdata7, (__vector unsigned int)v7); |
||||
|
||||
/* Now reduce the tail (0-112 bytes). */ |
||||
for (i = 0; i < length; i+=16) { |
||||
vdata0 = vec_ld(i,(__vector unsigned long long*)p); |
||||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const); |
||||
va0 = vec_ld(offset + i,vcrc_short_const); |
||||
va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw( |
||||
(__vector unsigned int)vdata0, (__vector unsigned int)va0); |
||||
v0 = vec_xor(v0, va0); |
||||
} |
||||
|
||||
/* xor all parallel chunks together. */ |
||||
v0 = vec_xor(v0, v1); |
||||
v2 = vec_xor(v2, v3); |
||||
v4 = vec_xor(v4, v5); |
||||
v6 = vec_xor(v6, v7); |
||||
|
||||
v0 = vec_xor(v0, v2); |
||||
v4 = vec_xor(v4, v6); |
||||
|
||||
v0 = vec_xor(v0, v4); |
||||
} |
||||
|
||||
/* Barrett Reduction */ |
||||
vconst1 = vec_ld(0, v_Barrett_const); |
||||
vconst2 = vec_ld(16, v_Barrett_const); |
||||
|
||||
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, |
||||
(__vector unsigned char)v0, 8); |
||||
v0 = vec_xor(v1,v0); |
||||
|
||||
/* shift left one bit */ |
||||
__vector unsigned char vsht_splat = vec_splat_u8 (1); |
||||
v0 = (__vector unsigned long long)vec_sll((__vector unsigned char)v0, vsht_splat); |
||||
|
||||
v0 = vec_and(v0, vmask_64bit); |
||||
|
||||
/*
|
||||
* The reflected version of Barrett reduction. Instead of bit |
||||
* reflecting our data (which is expensive to do), we bit reflect our |
||||
* constants and our algorithm, which means the intermediate data in |
||||
* our vector registers goes from 0-63 instead of 63-0. We can reflect |
||||
* the algorithm because we don't carry in mod 2 arithmetic. |
||||
*/ |
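
    /*
     * Illustrative restatement of the steps below (not from the original
     * source): with a the folded 64-bit value and vconst1/vconst2 the two
     * Barrett constants,
     *   ma = (a & 0xffffffff) * vconst1    (carry-less multiply)
     *   qn = (ma & 0xffffffff) * vconst2   (carry-less multiply)
     *   result = a ^ qn                    (subtraction is xor in GF(2))
     */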

    /* bottom 32 bits of a */
    v1 = vec_and(v0, vmask_32bit);

    /* ma */
    v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
            (__vector unsigned long long)vconst1);

    /* bottom 32 bits of ma */
    v1 = vec_and(v1, vmask_32bit);
    /* qn */
    v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
            (__vector unsigned long long)vconst2);
    /* a - qn, subtraction is xor in GF(2) */
    v0 = vec_xor(v0, v1);

    /*
     * Since we are bit reflected, the result (ie the low 32 bits) is in
     * the high 32 bits. We just need to shift it left 4 bytes
     * V0 [ 0 1 X 3 ]
     * V0 [ 0 X 2 3 ]
     */

    /* shift result into top 64 bits of the register */
    v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
            (__vector unsigned char)vzero, 4);

#if BYTE_ORDER == BIG_ENDIAN
    return v0[0];
#else
    return v0[1];
#endif
}
@ -0,0 +1,31 @@ |
/* Helper functions to work around issues with clang builtins
 * Copyright (C) 2021 IBM Corporation
 *
 * Authors:
 *   Daniel Black <daniel@linux.vnet.ibm.com>
 *   Rogerio Alves <rogealve@br.ibm.com>
 *   Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef POWER_BUILTINS_H
#define POWER_BUILTINS_H

/*
 * These stubs fix clang incompatibilities with GCC builtins.
 */

#ifndef __builtin_crypto_vpmsumw
#define __builtin_crypto_vpmsumw __builtin_crypto_vpmsumb
#endif
#ifndef __builtin_crypto_vpmsumd
#define __builtin_crypto_vpmsumd __builtin_crypto_vpmsumb
#endif

static inline __vector unsigned long long __attribute__((overloadable))
vec_ld(int __a, const __vector unsigned long long* __b) {
    return (__vector unsigned long long)__builtin_altivec_lvx(__a, __b);
}

#endif
@ -0,0 +1,46 @@ |
/* power_features.c - POWER feature check
 * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
 * Copyright (C) 2021-2022 Mika T. Lindqvist <postmaster@raasu.org>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef HAVE_SYS_AUXV_H
#  include <sys/auxv.h>
#endif
#ifdef __FreeBSD__
#  include <machine/cpu.h>
#endif
#include "../../zbuild.h"
#include "power_features.h"

void Z_INTERNAL power_check_features(struct power_cpu_features *features) {
#ifdef PPC_FEATURES
    unsigned long hwcap;
#ifdef __FreeBSD__
    elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
#else
    hwcap = getauxval(AT_HWCAP);
#endif

    if (hwcap & PPC_FEATURE_HAS_ALTIVEC)
        features->has_altivec = 1;
#endif

#ifdef POWER_FEATURES
    unsigned long hwcap2;
#ifdef __FreeBSD__
    elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
#else
    hwcap2 = getauxval(AT_HWCAP2);
#endif

#ifdef POWER8_VSX
    if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
        features->has_arch_2_07 = 1;
#endif
#ifdef POWER9
    if (hwcap2 & PPC_FEATURE2_ARCH_3_00)
        features->has_arch_3_00 = 1;
#endif
#endif
}
@ -0,0 +1,18 @@ |
/* power_features.h -- check for POWER CPU features
 * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
 * Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef POWER_H_
#define POWER_H_

struct power_cpu_features {
    int has_altivec;
    int has_arch_2_07;
    int has_arch_3_00;
};

void Z_INTERNAL power_check_features(struct power_cpu_features *features);

#endif /* POWER_H_ */
@ -0,0 +1,12 @@ |
/* Optimized slide_hash for POWER processors
 * Copyright (C) 2019-2020 IBM Corporation
 * Author: Matheus Castanho <msc@linux.ibm.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef POWER8_VSX

#define SLIDE_PPC slide_hash_power8
#include "slide_ppc_tpl.h"

#endif /* POWER8_VSX */
@ -0,0 +1,10 @@ |
/* Optimized slide_hash for PowerPC processors with VMX instructions
 * Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#ifdef PPC_VMX

#define SLIDE_PPC slide_hash_vmx
#include "slide_ppc_tpl.h"

#endif /* PPC_VMX */
@ -0,0 +1,31 @@ |
/* Optimized slide_hash for PowerPC processors
 * Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include <altivec.h>
#include "zbuild.h"
#include "deflate.h"

static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
    const vector unsigned short vmx_wsize = vec_splats(wsize);
    Pos *p = table;

    do {
        vector unsigned short value, result;

        value = vec_ld(0, p);
        result = vec_subs(value, vmx_wsize);
        vec_st(result, 0, p);

        p += 8;
        entries -= 8;
    } while (entries > 0);
}

void Z_INTERNAL SLIDE_PPC(deflate_state *s) {
    uint16_t wsize = s->w_size;

    slide_hash_chain(s->head, HASH_SIZE, wsize);
    slide_hash_chain(s->prev, wsize, wsize);
}
@ -0,0 +1,45 @@ |
# Building RISC-V Target with CMake #

> **Warning**
> Runtime RVV detection (using `hwcap`) requires Linux kernel 6.5 or newer.
>
> When running on older kernels, we fall back to compile-time detection. This can potentially cause crashes if RVV is enabled at compile time but not supported by the target CPU.
> Therefore, if older kernel support is needed, RVV should be disabled when the target CPU does not support it.
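As a quick sanity check before enabling RVV (a hedged sketch, not part of the zlib-ng sources; the 6.5 threshold comes from the warning above, and the `isa` line is the usual riscv64 `/proc/cpuinfo` layout):

```bash
# Kernel release: runtime RVV detection needs 6.5 or newer.
uname -r
# ISA string: look for a 'v' in e.g. rv64imafdcv to confirm vector support.
grep -m1 '^isa' /proc/cpuinfo
```
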
## Prerequisite: Build RISC-V Clang Toolchain and QEMU ##

If you don't have a prebuilt clang and riscv64 QEMU, you can refer to the [script](https://github.com/sifive/prepare-riscv-toolchain-qemu/blob/main/prepare_riscv_toolchain_qemu.sh) to get the source. Copy the script to the zlib-ng root directory and run it to download the sources and build them. Modify the content according to your needs (e.g., toolchain version).

```bash
./prepare_riscv_toolchain_qemu.sh
```

After running the script, clang & QEMU are built in `build-toolchain-qemu/riscv-clang/` & `build-toolchain-qemu/riscv-qemu/`.

`build-toolchain-qemu/riscv-clang/` is your `TOOLCHAIN_PATH`.
`build-toolchain-qemu/riscv-qemu/bin/qemu-riscv64` is your `QEMU_PATH`.

You can also download the prebuilt toolchain & QEMU from [the release page](https://github.com/sifive/prepare-riscv-toolchain-qemu/releases).

## Cross-Compile for RISC-V Target ##

```bash
cmake -G Ninja -B ./build-riscv \
  -D CMAKE_TOOLCHAIN_FILE=./cmake/toolchain-riscv.cmake \
  -D CMAKE_INSTALL_PREFIX=./build-riscv/install \
  -D TOOLCHAIN_PATH={TOOLCHAIN_PATH} \
  -D QEMU_PATH={QEMU_PATH} \
  .

cmake --build ./build-riscv
```

Disable the option if there is no RVV support:
```
-D WITH_RVV=OFF
```

## Run Unittests on User Mode QEMU ##

```bash
cd ./build-riscv && ctest --verbose
```
@ -0,0 +1,132 @@ |
/* adler32_rvv.c - RVV version of adler32
 * Copyright (C) 2023 SiFive, Inc. All rights reserved.
 * Contributed by Alex Chiang <alex.chiang@sifive.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef RISCV_RVV

#include <riscv_vector.h>
#include <stdint.h>

#include "../../zbuild.h"
#include "../../adler32_p.h"

static inline uint32_t adler32_rvv_impl(uint32_t adler, uint8_t* restrict dst, const uint8_t *src, size_t len, int COPY) {
    /* split Adler-32 into component sums */
    uint32_t sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;

    /* in case user likes doing a byte at a time, keep it fast */
    if (len == 1) {
        if (COPY) memcpy(dst, src, 1);
        return adler32_len_1(adler, src, sum2);
    }

    /* initial Adler-32 value (deferred check for len == 1 speed) */
    if (src == NULL)
        return 1L;

    /* in case short lengths are provided, keep it somewhat fast */
    if (len < 16) {
        if (COPY) memcpy(dst, src, len);
        return adler32_len_16(adler, src, len, sum2);
    }

    size_t left = len;
    size_t vl = __riscv_vsetvlmax_e8m1();
    vl = vl > 256 ? 256 : vl;
    vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl);
    vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl);
    vuint16m2_t v_buf16_accu;

    /*
     * We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator.
     * However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit
     * accumulators to boost performance.
     *
     * The block_size is the largest multiple of vl that is <= 256, because overflow would occur
     * for larger blocks (255 * 256 <= UINT16_MAX).
     *
     * We accumulate 8-bit data into a 16-bit accumulator and then
     * move the data into the 32-bit accumulator at the last iteration.
     */
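    /*
     * Worked bound (illustrative, not from the original source): each 16-bit
     * lane performs at most block_size / vl <= 256 additions of byte values
     * <= 255, so a lane never exceeds 255 * 256 = 65280 < UINT16_MAX.
     */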
    size_t block_size = (256 / vl) * vl;
    size_t nmax_limit = (NMAX / block_size);
    size_t cnt = 0;
    while (left >= block_size) {
        v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
        size_t subprob = block_size;
        while (subprob > 0) {
            vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl);
            if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl);
            v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
            v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
            src += vl;
            if (COPY) dst += vl;
            subprob -= vl;
        }
        v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
        v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
        left -= block_size;
        /* do modulo once each block of NMAX size */
        if (++cnt >= nmax_limit) {
            v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
            cnt = 0;
        }
    }
    /* the remaining length is <= 256 now, so we can use the 16-bit accumulator safely */
    v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
    size_t res = left;
    while (left >= vl) {
        vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl);
        if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl);
        v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
        v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
        src += vl;
        if (COPY) dst += vl;
        left -= vl;
    }
    v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
    v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
    v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);

    vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);
    vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl);
    vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl);

    v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl);

    vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);
    v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);
    uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum);

    sum2 += (sum2_sum + adler * (len - left));

    vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);
    v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);
    uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum);

    adler += adler_sum;

    while (left--) {
        if (COPY) *dst++ = *src;
        adler += *src++;
        sum2 += adler;
    }

    sum2 %= BASE;
    adler %= BASE;

    return adler | (sum2 << 16);
}

Z_INTERNAL uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    return adler32_rvv_impl(adler, dst, src, len, 1);
}

Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len) {
    return adler32_rvv_impl(adler, NULL, buf, len, 0);
}

#endif // RISCV_RVV
@ -0,0 +1,121 @@ |
/* chunkset_rvv.c - RVV version of chunkset
 * Copyright (C) 2023 SiFive, Inc. All rights reserved.
 * Contributed by Alex Chiang <alex.chiang@sifive.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#include <riscv_vector.h>
#include "zbuild.h"

/*
 * RISC-V glibc can enable an RVV-optimized memcpy at runtime via IFUNC,
 * so we prefer a large chunk size and copying as much memory as possible at once.
 */
#define CHUNK_SIZE 32

#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8

#define CHUNK_MEMSET_RVV_IMPL(elen) \
do { \
    size_t vl, len = CHUNK_SIZE / sizeof(uint##elen##_t); \
    uint##elen##_t val = *(uint##elen##_t*)from; \
    uint##elen##_t* chunk_p = (uint##elen##_t*)chunk; \
    do { \
        vl = __riscv_vsetvl_e##elen##m4(len); \
        vuint##elen##m4_t v_val = __riscv_vmv_v_x_u##elen##m4(val, vl); \
        __riscv_vse##elen##_v_u##elen##m4(chunk_p, v_val, vl); \
        len -= vl; chunk_p += vl; \
    } while (len > 0); \
} while (0)

/* We don't have a 32-byte datatype for the RISC-V arch. */
typedef struct chunk_s {
    uint64_t data[4];
} chunk_t;

static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    CHUNK_MEMSET_RVV_IMPL(16);
}

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    CHUNK_MEMSET_RVV_IMPL(32);
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    CHUNK_MEMSET_RVV_IMPL(64);
}

static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    memcpy(chunk->data, (uint8_t *)s, CHUNK_SIZE);
}

static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    memcpy(out, chunk->data, CHUNK_SIZE);
}

#define CHUNKSIZE chunksize_rvv
#define CHUNKCOPY chunkcopy_rvv
#define CHUNKUNROLL chunkunroll_rvv
#define CHUNKMEMSET chunkmemset_rvv
#define CHUNKMEMSET_SAFE chunkmemset_safe_rvv

#define HAVE_CHUNKCOPY

/*
 * The length is assumed non-zero, and `from` lags `out` by at least
 * sizeof(chunk_t) bytes; see the comments in chunkset_tpl.h.
 *
 * We load/store a single chunk once in `CHUNKCOPY`.
 * However, RISC-V glibc can enable an RVV-optimized memcpy at runtime via IFUNC,
 * so we prefer to copy large blocks at once to make good use of the RVV advantage.
 *
 * To stay aligned with the other platforms, we didn't modify the `CHUNKCOPY` method much,
 * but we still copy as much memory as possible under some conditions.
 *
 * case 1: out - from >= len (no overlap)
 *     We can use memcpy to copy `len` bytes at once
 *     because the memory layout would be the same.
 *
 * case 2: overlap
 *     We copy N chunks using memcpy at once, aiming to achieve our goal:
 *     to copy as much memory as possible.
 *
 *     After using a single memcpy to copy N chunks, we have to use a series of
 *     loadchunk and storechunk calls to ensure the result is correct.
 */
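/*
 * Illustrative trace with assumed values (not from the original source):
 * len = 100, out - from = 40, sizeof(chunk_t) = 32.
 * The head copy uses align = ((100 - 1) % 32) + 1 = 4, leaving len = 96
 * and dist = 40. Since dist < len we are in case 2: dist is rounded down
 * to 32, one 32-byte memcpy runs (len = 64), and two chunk-sized copies
 * finish the remaining bytes.
 */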
static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
    Assert(len > 0, "chunkcopy should never have a length 0");
    int32_t align = ((len - 1) % sizeof(chunk_t)) + 1;
    memcpy(out, from, sizeof(chunk_t));
    out += align;
    from += align;
    len -= align;
    ptrdiff_t dist = out - from;
    if (dist >= len) {
        memcpy(out, from, len);
        out += len;
        from += len;
        return out;
    }
    if (dist >= sizeof(chunk_t)) {
        dist = (dist / sizeof(chunk_t)) * sizeof(chunk_t);
        memcpy(out, from, dist);
        out += dist;
        from += dist;
        len -= dist;
    }
    while (len > 0) {
        memcpy(out, from, sizeof(chunk_t));
        out += sizeof(chunk_t);
        from += sizeof(chunk_t);
        len -= sizeof(chunk_t);
    }
    return out;
}

#include "chunkset_tpl.h"

#define INFLATE_FAST inflate_fast_rvv

#include "inffast_tpl.h"
@ -0,0 +1,47 @@ |
/* compare256_rvv.c - RVV version of compare256
 * Copyright (C) 2023 SiFive, Inc. All rights reserved.
 * Contributed by Alex Chiang <alex.chiang@sifive.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef RISCV_RVV

#include "../../zbuild.h"
#include "fallback_builtins.h"

#include <riscv_vector.h>

static inline uint32_t compare256_rvv_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;
    size_t vl;
    long found_diff;
    do {
        vl = __riscv_vsetvl_e8m4(256 - len);
        vuint8m4_t v_src0 = __riscv_vle8_v_u8m4(src0, vl);
        vuint8m4_t v_src1 = __riscv_vle8_v_u8m4(src1, vl);
        vbool2_t v_mask = __riscv_vmsne_vv_u8m4_b2(v_src0, v_src1, vl);
        found_diff = __riscv_vfirst_m_b2(v_mask, vl);
        if (found_diff >= 0)
            return len + (uint32_t)found_diff;
        src0 += vl, src1 += vl, len += vl;
    } while (len < 256);

    return 256;
}

Z_INTERNAL uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1) {
    return compare256_rvv_static(src0, src1);
}

#define LONGEST_MATCH longest_match_rvv
#define COMPARE256 compare256_rvv_static

#include "match_tpl.h"

#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_rvv
#define COMPARE256 compare256_rvv_static

#include "match_tpl.h"

#endif // RISCV_RVV
@ -0,0 +1,45 @@ |
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/auxv.h>
#include <sys/utsname.h>

#include "../../zbuild.h"
#include "riscv_features.h"

#define ISA_V_HWCAP (1 << ('v' - 'a'))

int Z_INTERNAL is_kernel_version_greater_or_equal_to_6_5(void) {
    struct utsname buffer;
    uname(&buffer);

    int major, minor;
    if (sscanf(buffer.release, "%d.%d", &major, &minor) != 2) {
        // Something went wrong with uname(); assume an old kernel.
        return 0;
    }

    if (major > 6 || (major == 6 && minor >= 5))
        return 1;
    return 0;
}

void Z_INTERNAL riscv_check_features_compile_time(struct riscv_cpu_features *features) {
#if defined(__riscv_v) && defined(__linux__)
    features->has_rvv = 1;
#else
    features->has_rvv = 0;
#endif
}

void Z_INTERNAL riscv_check_features_runtime(struct riscv_cpu_features *features) {
    unsigned long hw_cap = getauxval(AT_HWCAP);
    features->has_rvv = hw_cap & ISA_V_HWCAP;
}

void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features) {
    if (is_kernel_version_greater_or_equal_to_6_5())
        riscv_check_features_runtime(features);
    else
        riscv_check_features_compile_time(features);
}
@ -0,0 +1,18 @@ |
/* riscv_features.h -- check for riscv features.
 *
 * Copyright (C) 2023 SiFive, Inc. All rights reserved.
 * Contributed by Alex Chiang <alex.chiang@sifive.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef RISCV_H_
#define RISCV_H_

struct riscv_cpu_features {
    int has_rvv;
};

void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features);

#endif /* RISCV_H_ */
@ -0,0 +1,34 @@ |
/* slide_hash_rvv.c - RVV version of slide_hash
 * Copyright (C) 2023 SiFive, Inc. All rights reserved.
 * Contributed by Alex Chiang <alex.chiang@sifive.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef RISCV_RVV

#include <riscv_vector.h>

#include "../../zbuild.h"
#include "../../deflate.h"

static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
    size_t vl;
    while (entries > 0) {
        vl = __riscv_vsetvl_e16m4(entries);
        vuint16m4_t v_tab = __riscv_vle16_v_u16m4(table, vl);
        vuint16m4_t v_diff = __riscv_vsub_vx_u16m4(v_tab, wsize, vl);
        vbool4_t mask = __riscv_vmsltu_vx_u16m4_b4(v_tab, wsize, vl);
        v_tab = __riscv_vmerge_vxm_u16m4(v_diff, 0, mask, vl);
        __riscv_vse16_v_u16m4(table, v_tab, vl);
        table += vl, entries -= vl;
    }
}
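/*
 * Illustrative values for slide_hash_chain above (not from the original
 * source): with wsize = 32768, an entry of 40000 becomes 40000 - 32768 =
 * 7232, while an entry of 5 is below wsize, so the masked merge stores 0
 * instead of letting the unsigned subtraction wrap around.
 */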

Z_INTERNAL void slide_hash_rvv(deflate_state *s) {
    uint16_t wsize = (uint16_t)s->w_size;

    slide_hash_chain(s->head, HASH_SIZE, wsize);
    slide_hash_chain(s->prev, wsize, wsize);
}

#endif // RISCV_RVV
@ -0,0 +1,147 @@ |
# Makefile for zlib
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
# For conditions of distribution and use, see copyright notice in zlib.h

CC=
CFLAGS=
SFLAGS=
INCLUDES=
SUFFIX=

AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw
AVX512VNNIFLAG=-mavx512vnni
AVX2FLAG=-mavx2
SSE2FLAG=-msse2
SSSE3FLAG=-mssse3
SSE42FLAG=-msse4.2
PCLMULFLAG=-mpclmul
VPCLMULFLAG=-mvpclmulqdq
XSAVEFLAG=-mxsave
NOLTOFLAG=

SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)

all: \
	x86_features.o x86_features.lo \
	adler32_avx2.o adler32_avx2.lo \
	adler32_avx512.o adler32_avx512.lo \
	adler32_avx512_vnni.o adler32_avx512_vnni.lo \
	adler32_sse42.o adler32_sse42.lo \
	adler32_ssse3.o adler32_ssse3.lo \
	chunkset_avx2.o chunkset_avx2.lo \
	chunkset_sse2.o chunkset_sse2.lo \
	chunkset_ssse3.o chunkset_ssse3.lo \
	compare256_avx2.o compare256_avx2.lo \
	compare256_sse2.o compare256_sse2.lo \
	insert_string_sse42.o insert_string_sse42.lo \
	crc32_pclmulqdq.o crc32_pclmulqdq.lo \
	crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \
	slide_hash_avx2.o slide_hash_avx2.lo \
	slide_hash_sse2.o slide_hash_sse2.lo

x86_features.o:
	$(CC) $(CFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c

x86_features.lo:
	$(CC) $(SFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c

chunkset_avx2.o:
	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c

chunkset_avx2.lo:
	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c

chunkset_sse2.o:
	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c

chunkset_sse2.lo:
	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c

chunkset_ssse3.o:
	$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c

chunkset_ssse3.lo:
	$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c

compare256_avx2.o:
	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c

compare256_avx2.lo:
	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c

compare256_sse2.o:
	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c

compare256_sse2.lo:
	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c

insert_string_sse42.o:
	$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c

insert_string_sse42.lo:
	$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c

crc32_pclmulqdq.o:
	$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c

crc32_pclmulqdq.lo:
	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c

crc32_vpclmulqdq.o:
	$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c

crc32_vpclmulqdq.lo:
	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c

slide_hash_avx2.o:
	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c

slide_hash_avx2.lo:
	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c

slide_hash_sse2.o:
	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c

slide_hash_sse2.lo:
	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c

adler32_avx2.o: $(SRCDIR)/adler32_avx2.c
	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c

adler32_avx2.lo: $(SRCDIR)/adler32_avx2.c
	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c

adler32_avx512.o: $(SRCDIR)/adler32_avx512.c
	$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c

adler32_avx512.lo: $(SRCDIR)/adler32_avx512.c
	$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c

adler32_avx512_vnni.o: $(SRCDIR)/adler32_avx512_vnni.c
	$(CC) $(CFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c

adler32_avx512_vnni.lo: $(SRCDIR)/adler32_avx512_vnni.c
	$(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c

adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
	$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c

adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c
	$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c

adler32_sse42.o: $(SRCDIR)/adler32_sse42.c
	$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c

adler32_sse42.lo: $(SRCDIR)/adler32_sse42.c
	$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c

mostlyclean: clean
clean:
	rm -f *.o *.lo *~
	rm -rf objs
	rm -f *.gcda *.gcno *.gcov

distclean: clean
	rm -f Makefile
@ -0,0 +1,154 @@ |
/* adler32_avx2.c -- compute the Adler-32 checksum of a data stream
 * Copyright (C) 1995-2011 Mark Adler
 * Copyright (C) 2022 Adam Stylinski
 * Authors:
 *   Brian Bockelman <bockelman@gmail.com>
 *   Adam Stylinski <kungfujesus06@gmail.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef X86_AVX2

#include "../../zbuild.h"
#include <immintrin.h>
#include "../../adler32_fold.h"
#include "../../adler32_p.h"
#include "adler32_avx2_p.h"
#include "x86_intrins.h"

#ifdef X86_SSE42
extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);

#define copy_sub32(a, b, c, d) adler32_fold_copy_sse42(a, b, c, d)
#define sub32(a, b, c) adler32_ssse3(a, b, c)
#else
#define copy_sub32(a, b, c, d) adler32_copy_len_16(adler0, c, b, d, adler1)
#define sub32(a, b, c) adler32_len_16(adler0, b, c, adler1)
#endif

static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
    if (src == NULL) return 1L;
    if (len == 0) return adler;

    uint32_t adler0, adler1;
    adler1 = (adler >> 16) & 0xffff;
    adler0 = adler & 0xffff;

rem_peel:
    if (len < 16) {
        if (COPY) {
            return adler32_copy_len_16(adler0, src, dst, len, adler1);
        } else {
            return adler32_len_16(adler0, src, len, adler1);
        }
    } else if (len < 32) {
        if (COPY) {
            return copy_sub32(adler, dst, src, len);
        } else {
            return sub32(adler, src, len);
        }
    }

    __m256i vs1, vs2;

    const __m256i dot2v = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
                                           14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
    const __m256i dot3v = _mm256_set1_epi16(1);
    const __m256i zero = _mm256_setzero_si256();

    while (len >= 32) {
        vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
        vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
        __m256i vs1_0 = vs1;
        __m256i vs3 = _mm256_setzero_si256();

        size_t k = MIN(len, NMAX);
        k -= k % 32;
        len -= k;

        while (k >= 32) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
            */
            __m256i vbuf = _mm256_loadu_si256((__m256i*)src);
            src += 32;
            k -= 32;

            __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); // Sum of abs diff, resulting in 2 x int32's

            if (COPY) {
                _mm256_storeu_si256((__m256i*)dst, vbuf);
                dst += 32;
            }

            vs1 = _mm256_add_epi32(vs1, vs1_sad);
            vs3 = _mm256_add_epi32(vs3, vs1_0);
            __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // sum 32 uint8s to 16 shorts
            __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v);   // sum 16 shorts to 8 uint32s
            vs2 = _mm256_add_epi32(vsum2, vs2);
            vs1_0 = vs1;
        }

        /* Defer the multiplication with 32 to outside of the loop */
        vs3 = _mm256_slli_epi32(vs3, 5);
        vs2 = _mm256_add_epi32(vs2, vs3);

        /* The compiler generates the following sequence for this integer modulus
         * when done the scalar way, in GPRs:

         adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
                 (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);

         mov    $0x80078071,%edi      // move magic constant into 32 bit register %edi
         ...
         vmovd  %xmm1,%esi            // move vector lane 0 to 32 bit register %esi
         mov    %rsi,%rax             // zero-extend this value to 64 bit precision in %rax
         imul   %rdi,%rsi             // do a signed multiplication with magic constant and vector element
         shr    $0x2f,%rsi            // shift right by 47
         imul   $0xfff1,%esi,%esi     // do a signed multiplication with value truncated to 32 bits with 0xfff1
         sub    %esi,%eax             // subtract lower 32 bits of original vector value from modified one above
         ...
         // repeats for each element with vpextract instructions

         This is tricky with AVX2 for a number of reasons:
             1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
             2.) There are ways to extend vectors to 64 bit precision, but no simple way to truncate
                 back down to 32 bit precision later (there is in AVX512)
             3.) Full width integer multiplications aren't cheap

         We can, however, do a relatively cheap sequence for horizontal sums.
         Then, we simply do the integer modulus on the resulting 64 bit GPR, on a scalar value. It was
         previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but
         that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be
         performed on the maximum possible inputs before overflow
         */
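        /* Worked instance of the sequence above (illustrative, not from the
         * original source): for x = 100000 and BASE = 65521 (0xfff1),
         * q = (100000 * 0x80078071) >> 47 = 1 and 100000 - q * 65521 = 34479,
         * which matches 100000 % 65521. */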

        /* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy
         * conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
         * This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
         * what the compiler is doing to avoid integer divisions. */
        adler0 = partial_hsum256(vs1) % BASE;
        adler1 = hsum256(vs2) % BASE;
    }

    adler = adler0 | (adler1 << 16);

    if (len) {
        goto rem_peel;
    }

    return adler;
}

Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) {
    return adler32_fold_copy_impl(adler, NULL, src, len, 0);
}

Z_INTERNAL uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    return adler32_fold_copy_impl(adler, dst, src, len, 1);
}

#endif
@ -0,0 +1,32 @@ |
/* adler32_avx2_p.h -- adler32 avx2 utility functions
 * Copyright (C) 2022 Adam Stylinski
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef ADLER32_AVX2_P_H_
#define ADLER32_AVX2_P_H_

#if defined(X86_AVX2) || defined(X86_AVX512VNNI)

/* 32 bit horizontal sum, adapted from Agner Fog's vector library. */
static inline uint32_t hsum256(__m256i x) {
    __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(x, 1),
                                 _mm256_castsi256_si128(x));
    __m128i sum2 = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1));
    __m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
    return (uint32_t)_mm_cvtsi128_si32(sum3);
}

static inline uint32_t partial_hsum256(__m256i x) {
    /* We need a permutation vector to extract every other integer. The
     * rest are going to be zeros */
    const __m256i perm_vec = _mm256_setr_epi32(0, 2, 4, 6, 1, 1, 1, 1);
    __m256i non_zero = _mm256_permutevar8x32_epi32(x, perm_vec);
    __m128i non_zero_sse = _mm256_castsi256_si128(non_zero);
    __m128i sum2 = _mm_add_epi32(non_zero_sse, _mm_unpackhi_epi64(non_zero_sse, non_zero_sse));
    __m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
    return (uint32_t)_mm_cvtsi128_si32(sum3);
}
#endif

#endif
@ -0,0 +1,115 @@ |
/* adler32_avx512.c -- compute the Adler-32 checksum of a data stream
 * Copyright (C) 1995-2011 Mark Adler
 * Authors:
 *   Adam Stylinski <kungfujesus06@gmail.com>
 *   Brian Bockelman <bockelman@gmail.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef X86_AVX512

#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../adler32_fold.h"
#include "../../cpu_features.h"
#include <immintrin.h>
#include "x86_intrins.h"
#include "adler32_avx512_p.h"

static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
    if (src == NULL) return 1L;
    if (len == 0) return adler;

    uint32_t adler0, adler1;
    adler1 = (adler >> 16) & 0xffff;
    adler0 = adler & 0xffff;

rem_peel:
    if (len < 64) {
        /* This handles the remaining copies, just call normal adler checksum after this */
        if (COPY) {
            __mmask64 storemask = (0xFFFFFFFFFFFFFFFFUL >> (64 - len));
            __m512i copy_vec = _mm512_maskz_loadu_epi8(storemask, src);
            _mm512_mask_storeu_epi8(dst, storemask, copy_vec);
        }

#ifdef X86_AVX2
        return adler32_avx2(adler, src, len);
#elif defined(X86_SSSE3)
        return adler32_ssse3(adler, src, len);
#else
        return adler32_len_16(adler0, src, len, adler1);
#endif
    }

    __m512i vbuf, vs1_0, vs3;

    const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
                                          38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
                                          56, 57, 58, 59, 60, 61, 62, 63, 64);
    const __m512i dot3v = _mm512_set1_epi16(1);
    const __m512i zero = _mm512_setzero_si512();
    size_t k;

    while (len >= 64) {
        __m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
        __m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
        vs1_0 = vs1;
        vs3 = _mm512_setzero_si512();

        k = MIN(len, NMAX);
        k -= k % 64;
        len -= k;

        while (k >= 64) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
            */
            vbuf = _mm512_loadu_si512(src);

            if (COPY) {
                _mm512_storeu_si512(dst, vbuf);
                dst += 64;
            }

            src += 64;
            k -= 64;

            __m512i vs1_sad = _mm512_sad_epu8(vbuf, zero);
            __m512i v_short_sum2 = _mm512_maddubs_epi16(vbuf, dot2v);
            vs1 = _mm512_add_epi32(vs1_sad, vs1);
            vs3 = _mm512_add_epi32(vs3, vs1_0);
            __m512i vsum2 = _mm512_madd_epi16(v_short_sum2, dot3v);
            vs2 = _mm512_add_epi32(vsum2, vs2);
            vs1_0 = vs1;
        }

        vs3 = _mm512_slli_epi32(vs3, 6);
        vs2 = _mm512_add_epi32(vs2, vs3);

        adler0 = partial_hsum(vs1) % BASE;
        adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
    }

    adler = adler0 | (adler1 << 16);

    /* Process tail (len < 64). */
    if (len) {
        goto rem_peel;
    }

    return adler;
}

Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    return adler32_fold_copy_impl(adler, dst, src, len, 1);
}

Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) {
    return adler32_fold_copy_impl(adler, NULL, src, len, 0);
}

#endif
@ -0,0 +1,46 @@ |
#ifndef AVX512_FUNCS_H
#define AVX512_FUNCS_H

#include <immintrin.h>
#include <stdint.h>

/* Written because *_add_epi32(a) sets off ubsan */
static inline uint32_t _mm512_reduce_add_epu32(__m512i x) {
    __m256i a = _mm512_extracti64x4_epi64(x, 1);
    __m256i b = _mm512_extracti64x4_epi64(x, 0);

    __m256i a_plus_b = _mm256_add_epi32(a, b);
    __m128i c = _mm256_extracti128_si256(a_plus_b, 1);
    __m128i d = _mm256_extracti128_si256(a_plus_b, 0);
    __m128i c_plus_d = _mm_add_epi32(c, d);

    __m128i sum1 = _mm_unpackhi_epi64(c_plus_d, c_plus_d);
    __m128i sum2 = _mm_add_epi32(sum1, c_plus_d);
    __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
    __m128i sum4 = _mm_add_epi32(sum2, sum3);

    return _mm_cvtsi128_si32(sum4);
}

static inline uint32_t partial_hsum(__m512i x) {
    /* We need a permutation vector to extract every other integer. The
     * rest are going to be zeros. Marking this const so the compiler stands
     * a better chance of keeping this resident in a register through entire
     * loop execution. We certainly have enough zmm registers (32) */
    const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
                                               1, 1, 1, 1, 1, 1, 1, 1);

    __m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x);

    /* From here, it's a simple 256 bit wide reduction sum */
    __m256i non_zero_avx = _mm512_castsi512_si256(non_zero);

    /* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
     * pretty slow, much slower than the longer instruction sequence below */
    __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
                                 _mm256_castsi256_si128(non_zero_avx));
    __m128i sum2 = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1));
    __m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
    return (uint32_t)_mm_cvtsi128_si32(sum3);
}
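
/*
 * Note (illustrative, not from the original source): the odd 32-bit lanes
 * can be dropped because _mm512_sad_epu8 zero-extends each 16-bit sum into
 * a 64-bit lane, and the subsequent 32-bit adds only ever add zero into
 * those odd lanes, so they remain zero.
 */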

#endif
@ -0,0 +1,225 @@ |
||||
/* adler32_avx512_vnni.c -- compute the Adler-32 checksum of a data stream
|
||||
* Based on Brian Bockelman's AVX2 version |
||||
* Copyright (C) 1995-2011 Mark Adler |
||||
* Authors: |
||||
* Adam Stylinski <kungfujesus06@gmail.com> |
||||
* Brian Bockelman <bockelman@gmail.com> |
||||
* For conditions of distribution and use, see copyright notice in zlib.h |
||||
*/ |
||||
|
||||
#ifdef X86_AVX512VNNI |
||||
|
||||
#include "../../zbuild.h" |
||||
#include "../../adler32_p.h" |
||||
#include "../../cpu_features.h" |
||||
#include <immintrin.h> |
||||
#include "../../adler32_fold.h" |
||||
#include "x86_intrins.h" |
||||
#include "adler32_avx512_p.h" |
||||
#include "adler32_avx2_p.h" |
||||
|
||||
Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) { |
||||
if (src == NULL) return 1L; |
||||
if (len == 0) return adler; |
||||
|
||||
uint32_t adler0, adler1; |
||||
adler1 = (adler >> 16) & 0xffff; |
    adler0 = adler & 0xffff;

rem_peel:
    if (len < 32)
#if defined(X86_SSSE3)
        return adler32_ssse3(adler, src, len);
#else
        return adler32_len_16(adler0, src, len, adler1);
#endif

    if (len < 64)
#ifdef X86_AVX2
        return adler32_avx2(adler, src, len);
#elif defined(X86_SSSE3)
        return adler32_ssse3(adler, src, len);
#else
        return adler32_len_16(adler0, src, len, adler1);
#endif

    const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
                                          38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
                                          56, 57, 58, 59, 60, 61, 62, 63, 64);

    const __m512i zero = _mm512_setzero_si512();
    __m512i vs1, vs2;

    while (len >= 64) {
        vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
        vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
        size_t k = MIN(len, NMAX);
        k -= k % 64;
        len -= k;
        __m512i vs1_0 = vs1;
        __m512i vs3 = _mm512_setzero_si512();
        /* We might get a tad bit more ILP here if we sum to a second register in the loop */
        __m512i vs2_1 = _mm512_setzero_si512();
        __m512i vbuf0, vbuf1;

        /* Remainder peeling */
        if (k % 128) {
            vbuf1 = _mm512_loadu_si512((__m512i*)src);

            src += 64;
            k -= 64;

            __m512i vs1_sad = _mm512_sad_epu8(vbuf1, zero);
            vs1 = _mm512_add_epi32(vs1, vs1_sad);
            vs3 = _mm512_add_epi32(vs3, vs1_0);
            vs2 = _mm512_dpbusd_epi32(vs2, vbuf1, dot2v);
            vs1_0 = vs1;
        }

        /* Manually unrolled this loop by 2 for a decent amount of ILP */
        while (k >= 128) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
            */
            vbuf0 = _mm512_loadu_si512((__m512i*)src);
            vbuf1 = _mm512_loadu_si512((__m512i*)(src + 64));
            src += 128;
            k -= 128;

            __m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero);
            vs1 = _mm512_add_epi32(vs1, vs1_sad);
            vs3 = _mm512_add_epi32(vs3, vs1_0);
            /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp
             * instructions to eliminate them */
            vs2 = _mm512_dpbusd_epi32(vs2, vbuf0, dot2v);

            vs3 = _mm512_add_epi32(vs3, vs1);
            vs1_sad = _mm512_sad_epu8(vbuf1, zero);
            vs1 = _mm512_add_epi32(vs1, vs1_sad);
            vs2_1 = _mm512_dpbusd_epi32(vs2_1, vbuf1, dot2v);
            vs1_0 = vs1;
        }

        vs3 = _mm512_slli_epi32(vs3, 6);
        vs2 = _mm512_add_epi32(vs2, vs3);
        vs2 = _mm512_add_epi32(vs2, vs2_1);

        adler0 = partial_hsum(vs1) % BASE;
        adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
    }

    adler = adler0 | (adler1 << 16);

    /* Process tail (len < 64). */
    if (len) {
        goto rem_peel;
    }

    return adler;
}

Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    if (src == NULL) return 1L;
    if (len == 0) return adler;

    uint32_t adler0, adler1;
    adler1 = (adler >> 16) & 0xffff;
    adler0 = adler & 0xffff;

rem_peel_copy:
    if (len < 32) {
        /* This handles the remaining copies, just call normal adler checksum after this */
        __mmask32 storemask = (0xFFFFFFFFUL >> (32 - len));
        __m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
        _mm256_mask_storeu_epi8(dst, storemask, copy_vec);

#if defined(X86_SSSE3)
        return adler32_ssse3(adler, src, len);
#else
        return adler32_len_16(adler0, src, len, adler1);
#endif
    }

    const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);

    const __m256i zero = _mm256_setzero_si256();
    __m256i vs1, vs2;

    while (len >= 32) {
        vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
        vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
        size_t k = MIN(len, NMAX);
        k -= k % 32;
        len -= k;
        __m256i vs1_0 = vs1;
        __m256i vs3 = _mm256_setzero_si256();
        /* We might get a tad bit more ILP here if we sum to a second register in the loop */
        __m256i vs2_1 = _mm256_setzero_si256();
        __m256i vbuf0, vbuf1;

        /* Remainder peeling */
        if (k % 64) {
            vbuf1 = _mm256_loadu_si256((__m256i*)src);
            _mm256_storeu_si256((__m256i*)dst, vbuf1);
            dst += 32;

            src += 32;
            k -= 32;

            __m256i vs1_sad = _mm256_sad_epu8(vbuf1, zero);
            vs1 = _mm256_add_epi32(vs1, vs1_sad);
            vs3 = _mm256_add_epi32(vs3, vs1_0);
            vs2 = _mm256_dpbusd_epi32(vs2, vbuf1, dot2v);
            vs1_0 = vs1;
        }

        /* Manually unrolled this loop by 2 for a decent amount of ILP */
        while (k >= 64) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
            */
            vbuf0 = _mm256_loadu_si256((__m256i*)src);
            vbuf1 = _mm256_loadu_si256((__m256i*)(src + 32));
            _mm256_storeu_si256((__m256i*)dst, vbuf0);
            _mm256_storeu_si256((__m256i*)(dst + 32), vbuf1);
            dst += 64;
            src += 64;
            k -= 64;

            __m256i vs1_sad = _mm256_sad_epu8(vbuf0, zero);
            vs1 = _mm256_add_epi32(vs1, vs1_sad);
            vs3 = _mm256_add_epi32(vs3, vs1_0);
            /* multiply-add, resulting in 8 ints. Fuse with sum stage from prior versions, as we now have the dp
             * instructions to eliminate them */
            vs2 = _mm256_dpbusd_epi32(vs2, vbuf0, dot2v);

            vs3 = _mm256_add_epi32(vs3, vs1);
            vs1_sad = _mm256_sad_epu8(vbuf1, zero);
            vs1 = _mm256_add_epi32(vs1, vs1_sad);
            vs2_1 = _mm256_dpbusd_epi32(vs2_1, vbuf1, dot2v);
            vs1_0 = vs1;
        }

        vs3 = _mm256_slli_epi32(vs3, 5);
        vs2 = _mm256_add_epi32(vs2, vs3);
        vs2 = _mm256_add_epi32(vs2, vs2_1);

        adler0 = partial_hsum256(vs1) % BASE;
        adler1 = hsum256(vs2) % BASE;
    }

    adler = adler0 | (adler1 << 16);

    /* Process tail (len < 32). */
    if (len) {
        goto rem_peel_copy;
    }

    return adler;
}

#endif
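For reference, the vs1/vs2 recurrences accumulated by the vector code above mirror the scalar definition of Adler-32. A minimal scalar sketch (not part of the patch; BASE is the Adler-32 modulus, as in zlib-ng):

#include <stddef.h>
#include <stdint.h>

#define BASE 65521U  /* largest prime smaller than 65536 */

/* Scalar reference: s1 accumulates the bytes, s2 accumulates the running s1.
 * The vector code computes the same sums 64 bytes at a time via psadbw/vpdpbusd. */
static uint32_t adler32_scalar_ref(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t s1 = adler & 0xffff;
    uint32_t s2 = (adler >> 16) & 0xffff;
    for (size_t i = 0; i < len; i++) {
        s1 = (s1 + buf[i]) % BASE;
        s2 = (s2 + s1) % BASE;
    }
    return s1 | (s2 << 16);
}

The real code defers the expensive modulo to once per NMAX bytes, which is the purpose of the `MIN(len, NMAX)` chunking above.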
@ -0,0 +1,121 @@
/* adler32_sse42.c -- compute the Adler-32 checksum of a data stream
 * Copyright (C) 1995-2011 Mark Adler
 * Authors:
 *   Adam Stylinski <kungfujesus06@gmail.com>
 *   Brian Bockelman <bockelman@gmail.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../adler32_fold.h"
#include "adler32_ssse3_p.h"
#include <immintrin.h>

#ifdef X86_SSE42

Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    uint32_t adler0, adler1;
    adler1 = (adler >> 16) & 0xffff;
    adler0 = adler & 0xffff;

rem_peel:
    if (len < 16) {
        return adler32_copy_len_16(adler0, src, dst, len, adler1);
    }

    __m128i vbuf, vbuf_0;
    __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
            v_sad_sum2, vsum2, vsum2_0;
    __m128i zero = _mm_setzero_si128();
    const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
    const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
    const __m128i dot3v = _mm_set1_epi16(1);
    size_t k;

    while (len >= 16) {

        k = MIN(len, NMAX);
        k -= k % 16;
        len -= k;

        vs1 = _mm_cvtsi32_si128(adler0);
        vs2 = _mm_cvtsi32_si128(adler1);

        vs3 = _mm_setzero_si128();
        vs2_0 = _mm_setzero_si128();
        vs1_0 = vs1;

        while (k >= 32) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
            */
            vbuf = _mm_loadu_si128((__m128i*)src);
            vbuf_0 = _mm_loadu_si128((__m128i*)(src + 16));
            src += 32;
            k -= 32;

            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
            v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
            _mm_storeu_si128((__m128i*)dst, vbuf);
            _mm_storeu_si128((__m128i*)(dst + 16), vbuf_0);
            dst += 32;

            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
            v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);

            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
            vs3 = _mm_add_epi32(vs1_0, vs3);

            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
            vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
            vs1 = _mm_add_epi32(v_sad_sum2, vs1);
            vs2 = _mm_add_epi32(vsum2, vs2);
            vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
            vs1_0 = vs1;
        }

        vs2 = _mm_add_epi32(vs2_0, vs2);
        vs3 = _mm_slli_epi32(vs3, 5);
        vs2 = _mm_add_epi32(vs3, vs2);
        vs3 = _mm_setzero_si128();

        while (k >= 16) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
            */
            vbuf = _mm_loadu_si128((__m128i*)src);
            src += 16;
            k -= 16;

            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);

            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
            vs3 = _mm_add_epi32(vs1_0, vs3);
            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
            vs2 = _mm_add_epi32(vsum2, vs2);
            vs1_0 = vs1;

            _mm_storeu_si128((__m128i*)dst, vbuf);
            dst += 16;
        }

        vs3 = _mm_slli_epi32(vs3, 4);
        vs2 = _mm_add_epi32(vs2, vs3);

        adler0 = partial_hsum(vs1) % BASE;
        adler1 = hsum(vs2) % BASE;
    }

    /* If this is true, there are fewer than 16 elements remaining */
    if (len) {
        goto rem_peel;
    }

    return adler0 | (adler1 << 16);
}

#endif
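The dot2v/dot2v_0 constants above encode the positional weights (32-i) and (16-i) for the weighted sum2 term. The pipeline is: _mm_maddubs_epi16 multiplies unsigned bytes by the weights and pairwise-adds to 16-bit lanes, then _mm_madd_epi16 against a vector of ones widens to 32-bit partial sums. A standalone illustration of this two-step reduction (assumed example, compiled with SSSE3 support, not part of the patch):

#include <stdint.h>
#include <stdio.h>
#include <tmmintrin.h>  /* SSSE3: _mm_maddubs_epi16 (also pulls in SSE2) */

int main(void) {
    uint8_t buf[16];
    for (int i = 0; i < 16; i++) buf[i] = (uint8_t)(i + 1);

    const __m128i weights = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
    const __m128i ones = _mm_set1_epi16(1);

    __m128i v = _mm_loadu_si128((const __m128i *)buf);
    __m128i w16 = _mm_maddubs_epi16(v, weights); /* byte*weight pairs summed to 16 bits */
    __m128i w32 = _mm_madd_epi16(w16, ones);     /* widened to four 32-bit partial sums */

    uint32_t lanes[4];
    _mm_storeu_si128((__m128i *)lanes, w32);
    uint32_t simd = lanes[0] + lanes[1] + lanes[2] + lanes[3];

    uint32_t ref = 0;
    for (int i = 0; i < 16; i++) ref += (uint32_t)(16 - i) * buf[i];

    printf("simd=%u ref=%u\n", simd, ref); /* both print 816 for this input */
    return 0;
}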
@ -0,0 +1,156 @@
/* adler32_ssse3.c -- compute the Adler-32 checksum of a data stream
 * Copyright (C) 1995-2011 Mark Adler
 * Authors:
 *   Adam Stylinski <kungfujesus06@gmail.com>
 *   Brian Bockelman <bockelman@gmail.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "adler32_ssse3_p.h"

#ifdef X86_SSSE3

#include <immintrin.h>

Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t sum2;

    /* split Adler-32 into component sums */
    sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;

    /* in case user likes doing a byte at a time, keep it fast */
    if (UNLIKELY(len == 1))
        return adler32_len_1(adler, buf, sum2);

    /* initial Adler-32 value (deferred check for len == 1 speed) */
    if (UNLIKELY(buf == NULL))
        return 1L;

    /* in case short lengths are provided, keep it somewhat fast */
    if (UNLIKELY(len < 16))
        return adler32_len_16(adler, buf, len, sum2);

    const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
    const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
    const __m128i dot3v = _mm_set1_epi16(1);
    const __m128i zero = _mm_setzero_si128();

    __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
            vbuf_0, v_sad_sum2, vsum2, vsum2_0;

    /* If our buffer is unaligned (likely), determine whether there's enough
     * of a buffer left to make the scalar, aligning additions worthwhile, or
     * whether it's better to just eat the cost of an unaligned load. The test
     * is simple: is len < 16 + (16 - remainder)? */
    size_t max_iters = NMAX;
    size_t rem = (uintptr_t)buf & 15;
    size_t align_offset = 16 - rem;
    size_t k = 0;
    if (rem) {
        if (len < 16 + align_offset) {
            /* Let's eat the cost of this one unaligned load so that
             * we don't completely skip over the vectorization. Doing
             * 16 bytes at a time unaligned is better than 16 + <= 15
             * sums */
            vbuf = _mm_loadu_si128((__m128i*)buf);
            len -= 16;
            buf += 16;
            vs1 = _mm_cvtsi32_si128(adler);
            vs2 = _mm_cvtsi32_si128(sum2);
            vs3 = _mm_setzero_si128();
            vs1_0 = vs1;
            goto unaligned_jmp;
        }

        for (size_t i = 0; i < align_offset; ++i) {
            adler += *(buf++);
            sum2 += adler;
        }

        /* lop off the max number of sums based on the scalar sums done
         * above */
        len -= align_offset;
        max_iters -= align_offset;
    }


    while (len >= 16) {
        vs1 = _mm_cvtsi32_si128(adler);
        vs2 = _mm_cvtsi32_si128(sum2);
        vs3 = _mm_setzero_si128();
        vs2_0 = _mm_setzero_si128();
        vs1_0 = vs1;

        k = (len < max_iters ? len : max_iters);
        k -= k % 16;
        len -= k;

        while (k >= 32) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
            */
            vbuf = _mm_load_si128((__m128i*)buf);
            vbuf_0 = _mm_load_si128((__m128i*)(buf + 16));
            buf += 32;
            k -= 32;

            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
            v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
            vs3 = _mm_add_epi32(vs1_0, vs3);

            vs1 = _mm_add_epi32(v_sad_sum2, vs1);
            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
            v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
            vs2 = _mm_add_epi32(vsum2, vs2);
            vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
            vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
            vs1_0 = vs1;
        }

        vs2 = _mm_add_epi32(vs2_0, vs2);
        vs3 = _mm_slli_epi32(vs3, 5);
        vs2 = _mm_add_epi32(vs3, vs2);
        vs3 = _mm_setzero_si128();

        while (k >= 16) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
            */
            vbuf = _mm_load_si128((__m128i*)buf);
            buf += 16;
            k -= 16;

unaligned_jmp:
            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
            vs3 = _mm_add_epi32(vs1_0, vs3);
            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
            vs2 = _mm_add_epi32(vsum2, vs2);
            vs1_0 = vs1;
        }

        vs3 = _mm_slli_epi32(vs3, 4);
        vs2 = _mm_add_epi32(vs2, vs3);

        /* We don't actually need to do a full horizontal sum, since psadbw is actually doing
         * a partial reduction sum implicitly and only summing to integers in vector positions
         * 0 and 2. This saves us some contention on the shuffle port(s) */
        adler = partial_hsum(vs1) % BASE;
        sum2 = hsum(vs2) % BASE;
        max_iters = NMAX;
    }

    /* Process tail (len < 16). */
    return adler32_len_16(adler, buf, len, sum2);
}

#endif
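The partial_hsum shortcut mentioned in the closing comment relies on _mm_sad_epu8 depositing its two 8-byte sums into 64-bit lanes 0 and 1, i.e. 32-bit positions 0 and 2, with the other positions zero. A small standalone check of that behavior (illustrative, not library code):

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>  /* SSE2: _mm_sad_epu8 */

int main(void) {
    uint8_t buf[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
    __m128i v = _mm_loadu_si128((const __m128i *)buf);
    __m128i sad = _mm_sad_epu8(v, _mm_setzero_si128());

    uint32_t lanes[4];
    _mm_storeu_si128((__m128i *)lanes, sad);
    /* lanes[0] = 1+...+8 = 36, lanes[2] = 9+...+16 = 100, lanes[1] = lanes[3] = 0 */
    printf("%u %u %u %u\n", lanes[0], lanes[1], lanes[2], lanes[3]);
    return 0;
}

Because positions 1 and 3 stay zero, summing lane 0 and lane 2 (one shift and one add) is a complete reduction for vs1.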
@ -0,0 +1,29 @@
/* adler32_ssse3_p.h -- adler32 ssse3 utility functions
 * Copyright (C) 2022 Adam Stylinski
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef ADLER32_SSSE3_P_H_
#define ADLER32_SSSE3_P_H_

#ifdef X86_SSSE3

#include <immintrin.h>
#include <stdint.h>

static inline uint32_t partial_hsum(__m128i x) {
    __m128i second_int = _mm_srli_si128(x, 8);
    __m128i sum = _mm_add_epi32(x, second_int);
    return _mm_cvtsi128_si32(sum);
}

static inline uint32_t hsum(__m128i x) {
    __m128i sum1 = _mm_unpackhi_epi64(x, x);
    __m128i sum2 = _mm_add_epi32(x, sum1);
    __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
    __m128i sum4 = _mm_add_epi32(sum2, sum3);
    return _mm_cvtsi128_si32(sum4);
}
#endif

#endif
@ -0,0 +1,133 @@
/* chunkset_avx2.c -- AVX2 inline functions to copy small data chunks.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#include "zbuild.h"

#ifdef X86_AVX2
#include <immintrin.h>
#include "../generic/chunk_permute_table.h"

typedef __m256i chunk_t;

#define CHUNK_SIZE 32

#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNK_MAG

/* Populate don't cares so that this is a direct lookup (with some indirection into the
 * permute table). Because dist can never be 0, 1, or 2, we start with an offset,
 * subtracting 3 from the input. */
static const lut_rem_pair perm_idx_lut[29] = {
    { 0, 2},                /* 3 */
    { 0, 0},                /* don't care */
    { 1 * 32, 2},           /* 5 */
    { 2 * 32, 2},           /* 6 */
    { 3 * 32, 4},           /* 7 */
    { 0 * 32, 0},           /* don't care */
    { 4 * 32, 5},           /* 9 */
    { 5 * 32, 22},          /* 10 */
    { 6 * 32, 21},          /* 11 */
    { 7 * 32, 20},          /* 12 */
    { 8 * 32, 6},           /* 13 */
    { 9 * 32, 4},           /* 14 */
    {10 * 32, 2},           /* 15 */
    { 0 * 32, 0},           /* don't care */
    {11 * 32, 15},          /* 17 */
    {11 * 32 + 16, 14},     /* 18 */
    {11 * 32 + 16 * 2, 13}, /* 19 */
    {11 * 32 + 16 * 3, 12}, /* 20 */
    {11 * 32 + 16 * 4, 11}, /* 21 */
    {11 * 32 + 16 * 5, 10}, /* 22 */
    {11 * 32 + 16 * 6, 9},  /* 23 */
    {11 * 32 + 16 * 7, 8},  /* 24 */
    {11 * 32 + 16 * 8, 7},  /* 25 */
    {11 * 32 + 16 * 9, 6},  /* 26 */
    {11 * 32 + 16 * 10, 5}, /* 27 */
    {11 * 32 + 16 * 11, 4}, /* 28 */
    {11 * 32 + 16 * 12, 3}, /* 29 */
    {11 * 32 + 16 * 13, 2}, /* 30 */
    {11 * 32 + 16 * 14, 1}  /* 31 */
};

static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    int16_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = _mm256_set1_epi16(tmp);
}

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    int32_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = _mm256_set1_epi32(tmp);
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    int64_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = _mm256_set1_epi64x(tmp);
}

static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    *chunk = _mm256_loadu_si256((__m256i *)s);
}

static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    _mm256_storeu_si256((__m256i *)out, *chunk);
}

static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
    __m256i ret_vec;
    /* While technically we only need to read 4 or 8 bytes into this vector register for a lot of cases, GCC is
     * compiling this to a shared load for all branches, preferring the simpler code. Given that the buf value isn't
     * in GPRs to begin with, the 256 bit load is _probably_ just as inexpensive */
    *chunk_rem = lut_rem.remval;

    /* See note in chunkset_ssse3.c for why this is ok */
    __msan_unpoison(buf + dist, 32 - dist);

    if (dist < 16) {
        /* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after
         * broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate
         * shuffles and combining the halves later */
        const __m256i permute_xform =
            _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                             16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
        __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
        __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
        perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
        ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
        ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
    } else if (dist == 16) {
        __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
        return _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
    } else {
        __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
        __m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16));
        /* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */
        __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
        __m128i xlane_permutes = _mm_cmpgt_epi8(_mm_set1_epi8(16), perm_vec1);
        __m128i xlane_res = _mm_shuffle_epi8(ret_vec0, perm_vec1);
        /* Since we can't wrap twice, we can simply keep the later half exactly how it is instead of having to _also_
         * shuffle those values */
        __m128i latter_half = _mm_blendv_epi8(ret_vec1, xlane_res, xlane_permutes);
        ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
    }

    return ret_vec;
}

#define CHUNKSIZE        chunksize_avx2
#define CHUNKCOPY        chunkcopy_avx2
#define CHUNKUNROLL      chunkunroll_avx2
#define CHUNKMEMSET      chunkmemset_avx2
#define CHUNKMEMSET_SAFE chunkmemset_safe_avx2

#include "chunkset_tpl.h"

#define INFLATE_FAST     inflate_fast_avx2

#include "inffast_tpl.h"

#endif
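What GET_CHUNK_MAG computes is easiest to see scalar-side: for a back-reference distance dist < 32, it builds a 32-byte register whose contents repeat the dist-byte pattern at buf, plus a remainder telling the caller where the next chunk picks up in the pattern. A simplified scalar sketch of the replication (hypothetical helper, not part of the patch; note the real table's remval bookkeeping is biased by a multiple of dist for some distances, so this returns only the plain modulo):

#include <stdint.h>

/* Fill out[0..31] with buf[0..dist-1] repeated; return how far into the
 * pattern the next 32-byte chunk would start. */
static uint32_t chunk_mag_scalar(uint8_t out[32], const uint8_t *buf, uint32_t dist) {
    for (uint32_t i = 0; i < 32; i++)
        out[i] = buf[i % dist];
    return 32 % dist;
}

The vector version replaces the `i % dist` indexing with one or two pshufb-style permutes driven by the precomputed permute_table entries indexed through perm_idx_lut.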
@ -0,0 +1,56 @@
/* chunkset_sse2.c -- SSE2 inline functions to copy small data chunks.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"

#ifdef X86_SSE2
#include <immintrin.h>

typedef __m128i chunk_t;

#define CHUNK_SIZE 16

#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8

static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    int16_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = _mm_set1_epi16(tmp);
}

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    int32_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = _mm_set1_epi32(tmp);
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    int64_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = _mm_set1_epi64x(tmp);
}

static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    *chunk = _mm_loadu_si128((__m128i *)s);
}

static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    _mm_storeu_si128((__m128i *)out, *chunk);
}

#define CHUNKSIZE        chunksize_sse2
#define CHUNKCOPY        chunkcopy_sse2
#define CHUNKUNROLL      chunkunroll_sse2
#define CHUNKMEMSET      chunkmemset_sse2
#define CHUNKMEMSET_SAFE chunkmemset_safe_sse2

#include "chunkset_tpl.h"

#define INFLATE_FAST     inflate_fast_sse2

#include "inffast_tpl.h"

#endif
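The chunkmemset_N helpers broadcast a short repeat period across the whole chunk register, which is how inflate's fast path materializes runs whose distance is 2, 4, or 8 bytes. A scalar equivalent for period 2 (illustrative sketch only):

#include <stdint.h>
#include <string.h>

/* Scalar analogue of chunkmemset_2 followed by storechunk: tile a 2-byte
 * pattern across a 16-byte chunk. */
static void chunkmemset_2_scalar(uint8_t *out, const uint8_t *from) {
    uint8_t pat[2];
    memcpy(pat, from, 2);            /* like the memcpy into tmp */
    for (int i = 0; i < 16; i += 2)  /* like _mm_set1_epi16 + _mm_storeu_si128 */
        memcpy(out + i, pat, 2);
}

The memcpy into a local before broadcasting also sidesteps unaligned-access and strict-aliasing pitfalls, which is why the real code does it the same way.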
@ -0,0 +1,101 @@
/* chunkset_ssse3.c -- SSSE3 inline functions to copy small data chunks.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"

/* This requires SSE2 support. While it's implicit with SSSE3, we can minimize
 * code size by sharing the chunkcopy functions, which will certainly compile
 * to identical machine code */
#if defined(X86_SSSE3) && defined(X86_SSE2)
#include <immintrin.h>
#include "../generic/chunk_permute_table.h"

typedef __m128i chunk_t;

#define CHUNK_SIZE 16

#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNK_MAG
#define HAVE_CHUNKCOPY
#define HAVE_CHUNKUNROLL

static const lut_rem_pair perm_idx_lut[13] = {
    {0, 1},      /* 3 */
    {0, 0},      /* don't care */
    {1 * 32, 1}, /* 5 */
    {2 * 32, 4}, /* 6 */
    {3 * 32, 2}, /* 7 */
    {0 * 32, 0}, /* don't care */
    {4 * 32, 7}, /* 9 */
    {5 * 32, 6}, /* 10 */
    {6 * 32, 5}, /* 11 */
    {7 * 32, 4}, /* 12 */
    {8 * 32, 3}, /* 13 */
    {9 * 32, 2}, /* 14 */
    {10 * 32, 1},/* 15 */
};


static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    int16_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = _mm_set1_epi16(tmp);
}

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    int32_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = _mm_set1_epi32(tmp);
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    int64_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = _mm_set1_epi64x(tmp);
}

static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    *chunk = _mm_loadu_si128((__m128i *)s);
}

static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    _mm_storeu_si128((__m128i *)out, *chunk);
}

static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
    __m128i perm_vec, ret_vec;
    /* Important to note:
     * This is _not_ to subvert the memory sanitizer but to instead unpoison some
     * bytes we willingly and purposefully load uninitialized that we swizzle over
     * in a vector register, anyway. If what we assume is wrong about what is used,
     * the memory sanitizer will still usefully flag it */
    __msan_unpoison(buf + dist, 16 - dist);
    ret_vec = _mm_loadu_si128((__m128i*)buf);
    *chunk_rem = lut_rem.remval;

    perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
    ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);

    return ret_vec;
}

extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);

#define CHUNKSIZE        chunksize_ssse3
#define CHUNKMEMSET      chunkmemset_ssse3
#define CHUNKMEMSET_SAFE chunkmemset_safe_ssse3
#define CHUNKCOPY        chunkcopy_sse2
#define CHUNKUNROLL      chunkunroll_sse2

#include "chunkset_tpl.h"

#define INFLATE_FAST     inflate_fast_ssse3

#include "inffast_tpl.h"

#endif
@ -0,0 +1,63 @@
/* compare256_avx2.c -- AVX2 version of compare256
 * Copyright Mika T. Lindqvist <postmaster@raasu.org>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"

#include "fallback_builtins.h"

#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)

#include <immintrin.h>
#ifdef _MSC_VER
#  include <nmmintrin.h>
#endif

static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;

    do {
        __m256i ymm_src0, ymm_src1, ymm_cmp;
        ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
        ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
        ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
        unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
        if (mask != 0xFFFFFFFF) {
            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */
            return len + match_byte;
        }

        src0 += 32, src1 += 32, len += 32;

        ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
        ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
        ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1);
        mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
        if (mask != 0xFFFFFFFF) {
            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
            return len + match_byte;
        }

        src0 += 32, src1 += 32, len += 32;
    } while (len < 256);

    return 256;
}

Z_INTERNAL uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1) {
    return compare256_avx2_static(src0, src1);
}

#define LONGEST_MATCH       longest_match_avx2
#define COMPARE256          compare256_avx2_static

#include "match_tpl.h"

#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH       longest_match_slow_avx2
#define COMPARE256          compare256_avx2_static

#include "match_tpl.h"

#endif
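A quick way to sanity-check any compare256 variant is to build two 256-byte buffers that first differ at a known index and confirm the return value. A hedged test sketch (assumes compare256_avx2 is linked in and the CPU supports AVX2):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

extern uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);

int main(void) {
    uint8_t a[256], b[256];
    memset(a, 0x5a, sizeof(a));
    memcpy(b, a, sizeof(b));
    b[100] ^= 1;                            /* first mismatch at index 100 */
    printf("%u\n", compare256_avx2(a, b));  /* expected: 100 */
    return 0;
}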
@ -0,0 +1,96 @@
/* compare256_sse2.c -- SSE2 version of compare256
 * Copyright Adam Stylinski <kungfujesus06@gmail.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"

#include "fallback_builtins.h"

#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)

#include <emmintrin.h>

static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;
    int align_offset = ((uintptr_t)src0) & 15;
    const uint8_t *end0 = src0 + 256;
    const uint8_t *end1 = src1 + 256;
    __m128i xmm_src0, xmm_src1, xmm_cmp;

    /* Do the first load unaligned, then for all subsequent iterations we have
     * at least one aligned load. Sadly aligning both loads is probably unrealistic */
    xmm_src0 = _mm_loadu_si128((__m128i*)src0);
    xmm_src1 = _mm_loadu_si128((__m128i*)src1);
    xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);

    unsigned mask = (unsigned)_mm_movemask_epi8(xmm_cmp);

    /* Compiler _may_ turn this branch into a ptest + movemask,
     * since a lot of those uops are shared and fused */
    if (mask != 0xFFFF) {
        uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
        return len + match_byte;
    }

    int align_adv = 16 - align_offset;
    len += align_adv;
    src0 += align_adv;
    src1 += align_adv;

    /* Do a flooring division (should just be a shift right) */
    int num_iter = (256 - len) / 16;

    for (int i = 0; i < num_iter; ++i) {
        xmm_src0 = _mm_load_si128((__m128i*)src0);
        xmm_src1 = _mm_loadu_si128((__m128i*)src1);
        xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);

        mask = (unsigned)_mm_movemask_epi8(xmm_cmp);

        /* Compiler _may_ turn this branch into a ptest + movemask,
         * since a lot of those uops are shared and fused */
        if (mask != 0xFFFF) {
            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
            return len + match_byte;
        }

        len += 16, src0 += 16, src1 += 16;
    }

    if (align_offset) {
        src0 = end0 - 16;
        src1 = end1 - 16;
        len = 256 - 16;

        xmm_src0 = _mm_loadu_si128((__m128i*)src0);
        xmm_src1 = _mm_loadu_si128((__m128i*)src1);
        xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);

        mask = (unsigned)_mm_movemask_epi8(xmm_cmp);

        if (mask != 0xFFFF) {
            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
            return len + match_byte;
        }
    }

    return 256;
}

Z_INTERNAL uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1) {
    return compare256_sse2_static(src0, src1);
}

#define LONGEST_MATCH       longest_match_sse2
#define COMPARE256          compare256_sse2_static

#include "match_tpl.h"

#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH       longest_match_slow_sse2
#define COMPARE256          compare256_sse2_static

#include "match_tpl.h"

#endif
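Note the tail handling above: when the input was unaligned, the function re-reads the final 16 bytes unaligned rather than falling back to a byte loop. Since both buffers are known to span 256 bytes, overlapping the last vector with already-compared bytes is harmless and cheaper. The same overlap idea in scalar form (illustrative sketch, not library code):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Copy len >= 8 bytes in 8-byte blocks; the last block overlaps the previous
 * one instead of degenerating into a byte-at-a-time tail. */
static void copy_overlapped_tail(uint8_t *dst, const uint8_t *src, size_t len) {
    size_t i;
    for (i = 0; i + 8 <= len; i += 8)
        memcpy(dst + i, src + i, 8);
    if (i < len)  /* redo the final, overlapping block */
        memcpy(dst + len - 8, src + len - 8, 8);
}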
@ -0,0 +1,186 @@
/*
 * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
 * instruction.
 *
 * A white paper describing this algorithm can be found at:
 *     doc/crc-pclmulqdq.pdf
 *
 * Copyright (C) 2013 Intel Corporation. All rights reserved.
 * Copyright (C) 2016 Marian Beermann (support for initial value)
 * Authors:
 *     Wajdi Feghali   <wajdi.k.feghali@intel.com>
 *     Jim Guilford    <james.guilford@intel.com>
 *     Vinodh Gopal    <vinodh.gopal@intel.com>
 *     Erdinc Ozturk   <erdinc.ozturk@intel.com>
 *     Jim Kukunas     <james.t.kukunas@linux.intel.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef COPY
Z_INTERNAL void CRC32_FOLD_COPY(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
#else
Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
#endif
    unsigned long algn_diff;
    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
    __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
    __m128i xmm_crc_part = _mm_setzero_si128();
#ifdef COPY
    char ALIGNED_(16) partial_buf[16] = { 0 };
#else
    __m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
    int32_t first = init_crc != 0;

    /* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31
     * bytes of input is needed for the aligning load that occurs. If there's an initial CRC, to
     * carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which
     * by definition can be up to 15 bytes + one full vector load. */
    assert(len >= 31 || first == 0);
#endif
    crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

    if (len < 16) {
#ifdef COPY
        if (len == 0)
            return;

        memcpy(partial_buf, src, len);
        xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
        memcpy(dst, partial_buf, len);
#endif
        goto partial;
    }

    algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
    if (algn_diff) {
        xmm_crc_part = _mm_loadu_si128((__m128i *)src);
#ifdef COPY
        _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
        dst += algn_diff;
#else
        XOR_INITIAL128(xmm_crc_part);

        if (algn_diff < 4 && init_crc != 0) {
            xmm_t0 = xmm_crc_part;
            xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
            fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
            xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
            src += 16;
            len -= 16;
        }
#endif

        partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);

        src += algn_diff;
        len -= algn_diff;
    }

#ifdef X86_VPCLMULQDQ
    if (len >= 256) {
#ifdef COPY
        size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
        dst += n;
#else
        size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len,
            xmm_initial, first);
        first = 0;
#endif
        len -= n;
        src += n;
    }
#endif

    while (len >= 64) {
        len -= 64;
        xmm_t0 = _mm_load_si128((__m128i *)src);
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
        xmm_t3 = _mm_load_si128((__m128i *)src + 3);
        src += 64;

        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
#ifdef COPY
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
        _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
        dst += 64;
#else
        XOR_INITIAL128(xmm_t0);
#endif

        xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
    }

    /*
     * len = num bytes left - 64
     */
    if (len >= 48) {
        len -= 48;

        xmm_t0 = _mm_load_si128((__m128i *)src);
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
        src += 48;
#ifdef COPY
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
        dst += 48;
#else
        XOR_INITIAL128(xmm_t0);
#endif
        fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
    } else if (len >= 32) {
        len -= 32;

        xmm_t0 = _mm_load_si128((__m128i *)src);
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
        src += 32;
#ifdef COPY
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
        dst += 32;
#else
        XOR_INITIAL128(xmm_t0);
#endif
        fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
    } else if (len >= 16) {
        len -= 16;
        xmm_t0 = _mm_load_si128((__m128i *)src);
        src += 16;
#ifdef COPY
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
        dst += 16;
#else
        XOR_INITIAL128(xmm_t0);
#endif
        fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
    }

partial:
    if (len) {
        memcpy(&xmm_crc_part, src, len);
#ifdef COPY
        _mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);
        memcpy(dst, partial_buf, len);
#endif
        partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
    }

    crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
}
@ -0,0 +1,107 @@
/* crc32_fold_vpclmulqdq_tpl.h -- VPCLMULQDQ-based CRC32 folding template.
 * Copyright Wangyang Guo (wangyang.guo@intel.com)
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef COPY
static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
    __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) {
#else
static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
    __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len,
    __m128i init_crc, int32_t first) {
    __m512i zmm_initial = _mm512_zextsi128_si512(init_crc);
#endif
    __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
    __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
    __m512i z0, z1, z2, z3;
    size_t len_tmp = len;
    const __m512i zmm_fold4 = _mm512_set4_epi32(
        0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
    const __m512i zmm_fold16 = _mm512_set4_epi32(
        0x00000001, 0x1542778a, 0x00000001, 0x322d1430);

    // zmm register init
    zmm_crc0 = _mm512_setzero_si512();
    zmm_t0 = _mm512_loadu_si512((__m512i *)src);
#ifndef COPY
    XOR_INITIAL512(zmm_t0);
#endif
    zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
    zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
    zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);

    /* already have intermediate CRC in xmm registers
     * fold4 with 4 xmm_crc to get zmm_crc0
     */
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);

#ifdef COPY
    _mm512_storeu_si512((__m512i *)dst, zmm_t0);
    _mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1);
    _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2);
    _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3);
    dst += 256;
#endif
    len -= 256;
    src += 256;

    // fold-16 loops
    while (len >= 256) {
        zmm_t0 = _mm512_loadu_si512((__m512i *)src);
        zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
        zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
        zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);

        z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
        z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
        z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
        z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);

        zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
        zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
        zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
        zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);

        zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
        zmm_crc1 = _mm512_ternarylogic_epi32(zmm_crc1, z1, zmm_t1, 0x96);
        zmm_crc2 = _mm512_ternarylogic_epi32(zmm_crc2, z2, zmm_t2, 0x96);
        zmm_crc3 = _mm512_ternarylogic_epi32(zmm_crc3, z3, zmm_t3, 0x96);

#ifdef COPY
        _mm512_storeu_si512((__m512i *)dst, zmm_t0);
        _mm512_storeu_si512((__m512i *)dst + 1, zmm_t1);
        _mm512_storeu_si512((__m512i *)dst + 2, zmm_t2);
        _mm512_storeu_si512((__m512i *)dst + 3, zmm_t3);
        dst += 256;
#endif
        len -= 256;
        src += 256;
    }
    // zmm_crc[0,1,2,3] -> zmm_crc0
    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc1, 0x96);

    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc2, 0x96);

    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc3, 0x96);

    // zmm_crc0 -> xmm_crc[0, 1, 2, 3]
    *xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
    *xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
    *xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
    *xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);

    return (len_tmp - len);  // return n bytes processed
}
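The fold loop leans on _mm512_ternarylogic_epi32 with immediate 0x96, which is exactly a three-way XOR (a ^ b ^ c), merging the two carry-less products with the freshly loaded data in one instruction. The truth-table reading of that immediate, as a plain C check (standalone illustration, not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* imm 0x96: for each bit position, the result bit is bit (4a + 2b + c) of
 * the immediate. 0x96 = 0b10010110, the truth table of a ^ b ^ c. */
static uint32_t ternlog_0x96(uint32_t a, uint32_t b, uint32_t c) {
    uint32_t r = 0;
    for (int i = 0; i < 32; i++) {
        unsigned idx = (((a >> i) & 1u) << 2) | (((b >> i) & 1u) << 1) | ((c >> i) & 1u);
        r |= (uint32_t)((0x96u >> idx) & 1u) << i;
    }
    return r;
}

int main(void) {
    uint32_t a = 0x12345678, b = 0x9abcdef0, c = 0x0f0f0f0f;
    printf("%d\n", ternlog_0x96(a, b, c) == (a ^ b ^ c)); /* prints 1 */
    return 0;
}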
@ -0,0 +1,30 @@
/*
 * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
 * instruction.
 *
 * A white paper describing this algorithm can be found at:
 *     doc/crc-pclmulqdq.pdf
 *
 * Copyright (C) 2013 Intel Corporation. All rights reserved.
 * Copyright (C) 2016 Marian Beermann (support for initial value)
 * Authors:
 *     Wajdi Feghali   <wajdi.k.feghali@intel.com>
 *     Jim Guilford    <james.guilford@intel.com>
 *     Vinodh Gopal    <vinodh.gopal@intel.com>
 *     Erdinc Ozturk   <erdinc.ozturk@intel.com>
 *     Jim Kukunas     <james.t.kukunas@linux.intel.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef X86_PCLMULQDQ_CRC

#define CRC32_FOLD_COPY  crc32_fold_pclmulqdq_copy
#define CRC32_FOLD       crc32_fold_pclmulqdq
#define CRC32_FOLD_RESET crc32_fold_pclmulqdq_reset
#define CRC32_FOLD_FINAL crc32_fold_pclmulqdq_final
#define CRC32            crc32_pclmulqdq

#include "crc32_pclmulqdq_tpl.h"

#endif
@ -0,0 +1,363 @@
/*
 * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
 * instruction.
 *
 * A white paper describing this algorithm can be found at:
 *     doc/crc-pclmulqdq.pdf
 *
 * Copyright (C) 2013 Intel Corporation. All rights reserved.
 * Copyright (C) 2016 Marian Beermann (support for initial value)
 * Authors:
 *     Wajdi Feghali   <wajdi.k.feghali@intel.com>
 *     Jim Guilford    <james.guilford@intel.com>
 *     Vinodh Gopal    <vinodh.gopal@intel.com>
 *     Erdinc Ozturk   <erdinc.ozturk@intel.com>
 *     Jim Kukunas     <james.t.kukunas@linux.intel.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"

#include <immintrin.h>
#include <wmmintrin.h>
#include <smmintrin.h> // _mm_extract_epi32
#ifdef X86_VPCLMULQDQ
#  include <immintrin.h>
#endif

#include "../../crc32_fold.h"
#include "../../crc32_braid_p.h"
#include "x86_intrins.h"
#include <assert.h>

#ifdef X86_VPCLMULQDQ
static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
    __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len, __m128i init_crc,
    int32_t first);
static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
    __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len);
#endif

static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
    const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4,
                                            0x00000001, 0xc6e41596);
    __m128i x_tmp3;
    __m128 ps_crc0, ps_crc3, ps_res;

    x_tmp3 = *xmm_crc3;

    *xmm_crc3 = *xmm_crc0;
    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    ps_res = _mm_xor_ps(ps_crc0, ps_crc3);

    *xmm_crc0 = *xmm_crc1;
    *xmm_crc1 = *xmm_crc2;
    *xmm_crc2 = x_tmp3;
    *xmm_crc3 = _mm_castps_si128(ps_res);
}

static void fold_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
    const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4,
                                            0x00000001, 0xc6e41596);
    __m128i x_tmp3, x_tmp2;
    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;

    x_tmp3 = *xmm_crc3;
    x_tmp2 = *xmm_crc2;

    *xmm_crc3 = *xmm_crc1;
    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
    ps_res31 = _mm_xor_ps(ps_crc3, ps_crc1);

    *xmm_crc2 = *xmm_crc0;
    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
    ps_res20 = _mm_xor_ps(ps_crc0, ps_crc2);

    *xmm_crc0 = x_tmp2;
    *xmm_crc1 = x_tmp3;
    *xmm_crc2 = _mm_castps_si128(ps_res20);
    *xmm_crc3 = _mm_castps_si128(ps_res31);
}

static void fold_3(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
    const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4,
                                            0x00000001, 0xc6e41596);
    __m128i x_tmp3;
    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;

    x_tmp3 = *xmm_crc3;

    *xmm_crc3 = *xmm_crc2;
    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);

    *xmm_crc2 = *xmm_crc1;
    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
    ps_res21 = _mm_xor_ps(ps_crc1, ps_crc2);

    *xmm_crc1 = *xmm_crc0;
    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
    ps_res10 = _mm_xor_ps(ps_crc0, ps_crc1);

    *xmm_crc0 = x_tmp3;
    *xmm_crc1 = _mm_castps_si128(ps_res10);
    *xmm_crc2 = _mm_castps_si128(ps_res21);
    *xmm_crc3 = _mm_castps_si128(ps_res32);
}

static void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
    const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4,
                                            0x00000001, 0xc6e41596);
    __m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
    __m128 ps_t0, ps_t1, ps_t2, ps_t3;
    __m128 ps_res0, ps_res1, ps_res2, ps_res3;

    x_tmp0 = *xmm_crc0;
    x_tmp1 = *xmm_crc1;
    x_tmp2 = *xmm_crc2;
    x_tmp3 = *xmm_crc3;

    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
    ps_t0 = _mm_castsi128_ps(x_tmp0);
    ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);

    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
    x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
    ps_t1 = _mm_castsi128_ps(x_tmp1);
    ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);

    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
    x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
    ps_t2 = _mm_castsi128_ps(x_tmp2);
    ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);

    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
    x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    ps_t3 = _mm_castsi128_ps(x_tmp3);
    ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);

    *xmm_crc0 = _mm_castps_si128(ps_res0);
    *xmm_crc1 = _mm_castps_si128(ps_res1);
    *xmm_crc2 = _mm_castps_si128(ps_res2);
    *xmm_crc3 = _mm_castps_si128(ps_res3);
}

static const unsigned ALIGNED_(32) pshufb_shf_table[60] = {
    0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1  */
    0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 2)/shr2  */
    0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f, /* shl 13 (16 - 3)/shr3  */
    0x87868584, 0x8b8a8988, 0x8f8e8d8c, 0x03020100, /* shl 12 (16 - 4)/shr4  */
    0x88878685, 0x8c8b8a89, 0x008f8e8d, 0x04030201, /* shl 11 (16 - 5)/shr5  */
    0x89888786, 0x8d8c8b8a, 0x01008f8e, 0x05040302, /* shl 10 (16 - 6)/shr6  */
    0x8a898887, 0x8e8d8c8b, 0x0201008f, 0x06050403, /* shl  9 (16 - 7)/shr7  */
    0x8b8a8988, 0x8f8e8d8c, 0x03020100, 0x07060504, /* shl  8 (16 - 8)/shr8  */
    0x8c8b8a89, 0x008f8e8d, 0x04030201, 0x08070605, /* shl  7 (16 - 9)/shr9  */
    0x8d8c8b8a, 0x01008f8e, 0x05040302, 0x09080706, /* shl  6 (16 -10)/shr10 */
    0x8e8d8c8b, 0x0201008f, 0x06050403, 0x0a090807, /* shl  5 (16 -11)/shr11 */
    0x8f8e8d8c, 0x03020100, 0x07060504, 0x0b0a0908, /* shl  4 (16 -12)/shr12 */
    0x008f8e8d, 0x04030201, 0x08070605, 0x0c0b0a09, /* shl  3 (16 -13)/shr13 */
    0x01008f8e, 0x05040302, 0x09080706, 0x0d0c0b0a, /* shl  2 (16 -14)/shr14 */
    0x0201008f, 0x06050403, 0x0a090807, 0x0e0d0c0b  /* shl  1 (16 -15)/shr15 */
};

static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2,
                         __m128i *xmm_crc3, __m128i *xmm_crc_part) {
    const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4,
                                            0x00000001, 0xc6e41596);
    const __m128i xmm_mask3 = _mm_set1_epi32((int32_t)0x80808080);

    __m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
    __m128i xmm_a0_0, xmm_a0_1;
    __m128 ps_crc3, psa0_0, psa0_1, ps_res;

    xmm_shl = _mm_load_si128((__m128i *)(pshufb_shf_table + (4 * (len - 1))));
    xmm_shr = xmm_shl;
    xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);

    xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);

    *xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
    xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
    *xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);

    *xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
    xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
    *xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);

    *xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
    xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
    *xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);

    *xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
    *xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
    *xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);

    xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
    xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);

    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    psa0_0 = _mm_castsi128_ps(xmm_a0_0);
    psa0_1 = _mm_castsi128_ps(xmm_a0_1);

    ps_res = _mm_xor_ps(ps_crc3, psa0_0);
    ps_res = _mm_xor_ps(ps_res, psa0_1);

    *xmm_crc3 = _mm_castps_si128(ps_res);
}

static inline void crc32_fold_load(__m128i *fold, __m128i *fold0, __m128i *fold1, __m128i *fold2, __m128i *fold3) {
    *fold0 = _mm_load_si128(fold + 0);
    *fold1 = _mm_load_si128(fold + 1);
    *fold2 = _mm_load_si128(fold + 2);
    *fold3 = _mm_load_si128(fold + 3);
}

static inline void crc32_fold_save(__m128i *fold, const __m128i *fold0, const __m128i *fold1,
                                   const __m128i *fold2, const __m128i *fold3) {
    _mm_storeu_si128(fold + 0, *fold0);
    _mm_storeu_si128(fold + 1, *fold1);
    _mm_storeu_si128(fold + 2, *fold2);
    _mm_storeu_si128(fold + 3, *fold3);
}

Z_INTERNAL uint32_t CRC32_FOLD_RESET(crc32_fold *crc) {
    __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
    __m128i xmm_zero = _mm_setzero_si128();
    crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_zero, &xmm_zero, &xmm_zero);
    return 0;
}

#define ONCE(op) if (first) { first = 0; op; }
#define XOR_INITIAL128(where) ONCE(where = _mm_xor_si128(where, xmm_initial))
#ifdef X86_VPCLMULQDQ
#  define XOR_INITIAL512(where) ONCE(where = _mm512_xor_si512(where, zmm_initial))
#endif

#ifdef X86_VPCLMULQDQ
#  include "crc32_fold_vpclmulqdq_tpl.h"
#endif
#include "crc32_fold_pclmulqdq_tpl.h"
#define COPY
#ifdef X86_VPCLMULQDQ
#  include "crc32_fold_vpclmulqdq_tpl.h"
#endif
#include "crc32_fold_pclmulqdq_tpl.h"

static const unsigned ALIGNED_(16) crc_k[] = {
    0xccaa009e, 0x00000000, /* rk1 */
    0x751997d0, 0x00000001, /* rk2 */
    0xccaa009e, 0x00000000, /* rk5 */
    0x63cd6124, 0x00000001, /* rk6 */
    0xf7011640, 0x00000001, /* rk7 */
    0xdb710640, 0x00000001  /* rk8 */
};

static const unsigned ALIGNED_(16) crc_mask[4] = {
    0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
};

static const unsigned ALIGNED_(16) crc_mask2[4] = {
    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
};

Z_INTERNAL uint32_t CRC32_FOLD_FINAL(crc32_fold *crc) {
    const __m128i xmm_mask  = _mm_load_si128((__m128i *)crc_mask);
    const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
    __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
    __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;

    crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

    /*
     * k1
     */
    crc_fold = _mm_load_si128((__m128i *)crc_k);

    x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
    xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
    xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
    xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);

    x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
    xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
    xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
    xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);

    x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
    xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);

    /*
     * k5
     */
    crc_fold = _mm_load_si128((__m128i *)(crc_k + 4));

    xmm_crc0 = xmm_crc3;
    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
    xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);

    xmm_crc0 = xmm_crc3;
    xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
    xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);

    /*
     * k7
     */
    xmm_crc1 = xmm_crc3;
    xmm_crc2 = xmm_crc3;
    crc_fold = _mm_load_si128((__m128i *)(crc_k + 8));

    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
    xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);

    xmm_crc2 = xmm_crc3;
    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);

    crc->value = ~((uint32_t)_mm_extract_epi32(xmm_crc3, 2));

    return crc->value;
}

Z_INTERNAL uint32_t CRC32(uint32_t crc32, const uint8_t *buf, size_t len) {
    /* For lens < 64, the crc32_braid method is faster. The CRC32 instruction for
     * these short lengths might also prove to be effective */
    if (len < 64)
        return PREFIX(crc32_braid)(crc32, buf, len);

    crc32_fold ALIGNED_(16) crc_state;
    CRC32_FOLD_RESET(&crc_state);
    CRC32_FOLD(&crc_state, buf, len, crc32);
    return CRC32_FOLD_FINAL(&crc_state);
}
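When validating the folded implementation, it helps to have the plain bit-serial CRC-32 (reflected polynomial 0xEDB88320, the same CRC zlib computes) on hand. A minimal reference, written as an assumed standalone check rather than part of the patch:

#include <stddef.h>
#include <stdint.h>

/* Bit-serial CRC-32 (IEEE, reflected). Matches zlib's crc32() output;
 * useful as a ground truth when testing the pclmulqdq path. */
static uint32_t crc32_bitwise(uint32_t crc, const uint8_t *buf, size_t len) {
    crc = ~crc;
    for (size_t i = 0; i < len; i++) {
        crc ^= buf[i];
        for (int k = 0; k < 8; k++)
            crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1u)));
    }
    return ~crc;
}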
@ -0,0 +1,17 @@
/* crc32_vpclmulqdq.c -- VPCLMULQDQ-based CRC32 folding implementation.
 * Copyright Wangyang Guo (wangyang.guo@intel.com)
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)

#define X86_VPCLMULQDQ
#define CRC32_FOLD_COPY  crc32_fold_vpclmulqdq_copy
#define CRC32_FOLD       crc32_fold_vpclmulqdq
#define CRC32_FOLD_RESET crc32_fold_vpclmulqdq_reset
#define CRC32_FOLD_FINAL crc32_fold_vpclmulqdq_final
#define CRC32            crc32_vpclmulqdq

#include "crc32_pclmulqdq_tpl.h"

#endif
@ -0,0 +1,24 @@
/* insert_string_sse42.c -- insert_string integer hash variant using SSE4.2's CRC instructions
 *
 * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 */

#ifdef X86_SSE42
#include "../../zbuild.h"
#include <nmmintrin.h>
#include "../../deflate.h"

#define HASH_CALC(s, h, val)\
    h = _mm_crc32_u32(h, val)

#define HASH_CALC_VAR h
#define HASH_CALC_VAR_INIT uint32_t h = 0

#define UPDATE_HASH update_hash_sse42
#define INSERT_STRING insert_string_sse42
#define QUICK_INSERT_STRING quick_insert_string_sse42

#include "../../insert_string_tpl.h"
#endif
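
/* Illustrative sketch (not part of the template): HASH_CALC above folds the
 * next 4 window bytes into the running hash with a single hardware CRC32
 * step. A hypothetical standalone equivalent would be: */
#if 0
static inline uint32_t hash4_sse42(const uint8_t *p) {
    uint32_t val;
    memcpy(&val, p, sizeof(val));   /* read 4 bytes at the insert position */
    return _mm_crc32_u32(0, val);   /* one SSE4.2 CRC step mixes them */
}
#endif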
@ -0,0 +1,39 @@
/*
 * AVX2 optimized hash slide, based on Intel's slide_sse implementation
 *
 * Copyright (C) 2017 Intel Corporation
 * Authors:
 *   Arjan van de Ven   <arjan@linux.intel.com>
 *   Jim Kukunas        <james.t.kukunas@linux.intel.com>
 *   Mika T. Lindqvist  <postmaster@raasu.org>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#include "../../zbuild.h"
#include "../../deflate.h"

#include <immintrin.h>

static inline void slide_hash_chain(Pos *table, uint32_t entries, const __m256i wsize) {
    table += entries;
    table -= 16;

    do {
        __m256i value, result;

        value = _mm256_loadu_si256((__m256i *)table);
        result = _mm256_subs_epu16(value, wsize);
        _mm256_storeu_si256((__m256i *)table, result);

        table -= 16;
        entries -= 16;
    } while (entries > 0);
}
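
/* Scalar equivalent of the loop above (illustrative): each table entry has
 * wsize subtracted with saturation, so positions that fall out of the sliding
 * window collapse to zero instead of wrapping around:
 *
 *   for (uint32_t i = 0; i < entries; i++)
 *       table[i] = (table[i] > wsize) ? (Pos)(table[i] - wsize) : 0;
 */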

Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
    uint16_t wsize = (uint16_t)s->w_size;
    const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);

    slide_hash_chain(s->head, HASH_SIZE, ymm_wsize);
    slide_hash_chain(s->prev, wsize, ymm_wsize);
}
@ -0,0 +1,62 @@
/*
 * SSE optimized hash slide
 *
 * Copyright (C) 2017 Intel Corporation
 * Authors:
 *   Arjan van de Ven    <arjan@linux.intel.com>
 *   Jim Kukunas         <james.t.kukunas@linux.intel.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#include "../../zbuild.h"
#include "../../deflate.h"

#include <immintrin.h>
#include <assert.h>

static inline void slide_hash_chain(Pos *table0, Pos *table1, uint32_t entries0,
                                    uint32_t entries1, const __m128i wsize) {
    uint32_t entries;
    Pos *table;
    __m128i value0, value1, result0, result1;

    int on_chain = 0;

next_chain:
    table = (on_chain) ? table1 : table0;
    entries = (on_chain) ? entries1 : entries0;

    table += entries;
    table -= 16;

    /* ZALLOC allocates this pointer unless the user chose a custom allocator.
     * Our alloc function is aligned to 64 byte boundaries */
    do {
        value0 = _mm_load_si128((__m128i *)table);
        value1 = _mm_load_si128((__m128i *)(table + 8));
        result0 = _mm_subs_epu16(value0, wsize);
        result1 = _mm_subs_epu16(value1, wsize);
        _mm_store_si128((__m128i *)table, result0);
        _mm_store_si128((__m128i *)(table + 8), result1);

        table -= 16;
        entries -= 16;
    } while (entries > 0);

    ++on_chain;
    if (on_chain > 1) {
        return;
    } else {
        goto next_chain;
    }
}

Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
    uint16_t wsize = (uint16_t)s->w_size;
    const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);

    assert(((uintptr_t)s->head & 15) == 0);
    assert(((uintptr_t)s->prev & 15) == 0);

    slide_hash_chain(s->head, s->prev, HASH_SIZE, wsize, xmm_wsize);
}
@ -0,0 +1,97 @@
/* x86_features.c - x86 feature check
 *
 * Copyright (C) 2013 Intel Corporation. All rights reserved.
 * Author:
 *  Jim Kukunas
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"
#include "x86_features.h"

#ifdef _MSC_VER
#  include <intrin.h>
#else
// Newer versions of GCC and clang come with cpuid.h
#  include <cpuid.h>
#endif

#include <string.h>

static inline void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#ifdef _MSC_VER
    unsigned int registers[4];
    __cpuid((int *)registers, info);

    *eax = registers[0];
    *ebx = registers[1];
    *ecx = registers[2];
    *edx = registers[3];
#else
    __cpuid(info, *eax, *ebx, *ecx, *edx);
#endif
}

static inline void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#ifdef _MSC_VER
    unsigned int registers[4];
    __cpuidex((int *)registers, info, subinfo);

    *eax = registers[0];
    *ebx = registers[1];
    *ecx = registers[2];
    *edx = registers[3];
#else
    __cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx);
#endif
}

static inline uint64_t xgetbv(unsigned int xcr) {
#ifdef _MSC_VER
    return _xgetbv(xcr);
#else
    uint32_t eax, edx;
    __asm__ ( ".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(xcr));
    return (uint64_t)(edx) << 32 | eax;
#endif
}

void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
    unsigned eax, ebx, ecx, edx;
    unsigned maxbasic;

    cpuid(0, &maxbasic, &ebx, &ecx, &edx);
    cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);

    features->has_sse2 = edx & 0x4000000;
    features->has_ssse3 = ecx & 0x200;
    features->has_sse42 = ecx & 0x100000;
    features->has_pclmulqdq = ecx & 0x2;

    if (ecx & 0x08000000) {
        uint64_t xfeature = xgetbv(0);
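        /* XCR0 bit 1 = XMM state, bit 2 = YMM state, bits 5-7 = opmask/ZMM
         * state; 0x06 means the OS saves AVX registers across context
         * switches, 0xe6 that it saves AVX-512 state as well. */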
        features->has_os_save_ymm = ((xfeature & 0x06) == 0x06);
        features->has_os_save_zmm = ((xfeature & 0xe6) == 0xe6);
    }

    if (maxbasic >= 7) {
        cpuidex(7, 0, &eax, &ebx, &ecx, &edx);

        // check VPCLMULQDQ bit (leaf 7, ECX bit 10)
        // Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
        features->has_vpclmulqdq = ecx & 0x400;

        // check AVX2 bit if the OS supports saving YMM registers
        if (features->has_os_save_ymm) {
            features->has_avx2 = ebx & 0x20;
        }

        // check AVX512 bits if the OS supports saving ZMM registers
        if (features->has_os_save_zmm) {
            features->has_avx512 = ebx & 0x00010000;
            features->has_avx512vnni = ecx & 0x800;
        }
    }
}
@ -0,0 +1,24 @@
/* x86_features.h -- check for CPU features
 * Copyright (C) 2013 Intel Corporation Jim Kukunas
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef X86_FEATURES_H_
#define X86_FEATURES_H_

struct x86_cpu_features {
    int has_avx2;
    int has_avx512;
    int has_avx512vnni;
    int has_sse2;
    int has_ssse3;
    int has_sse42;
    int has_pclmulqdq;
    int has_vpclmulqdq;
    int has_os_save_ymm;
    int has_os_save_zmm;
};

void Z_INTERNAL x86_check_features(struct x86_cpu_features *features);

#endif /* X86_FEATURES_H_ */
@ -0,0 +1,87 @@
#ifndef X86_INTRINS_H
#define X86_INTRINS_H

/* Unfortunately GCC didn't support these things until version 10.
 * Similarly, AppleClang didn't support them in Xcode 9.2 but did in 9.3.
 */
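/* Note: the zero-extending casts must clear the upper bits of the wider
 * register, while a plain _mm256_castsi128_si256 leaves them undefined.
 * The fallbacks below route through a VEX-encoded vmovdqa, which is
 * defined to zero-extend. */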
#ifdef __AVX2__
#include <immintrin.h>

#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10) \
    || (defined(__apple_build_version__) && __apple_build_version__ < 9020039)
static inline __m256i _mm256_zextsi128_si256(__m128i a) {
    __m128i r;
    __asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
    return _mm256_castsi128_si256(r);
}

#ifdef __AVX512F__
static inline __m512i _mm512_zextsi128_si512(__m128i a) {
    __m128i r;
    __asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
    return _mm512_castsi128_si512(r);
}
#endif // __AVX512F__
#endif // gcc/AppleClang version test

#endif // __AVX2__

/* GCC <9 is missing some AVX512 intrinsics.
 */
#ifdef __AVX512F__
#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 9)
#include <immintrin.h>

#define PACK(c0, c1, c2, c3) (((int)(unsigned char)(c0) << 24) | ((int)(unsigned char)(c1) << 16) | \
                              ((int)(unsigned char)(c2) << 8) | ((int)(unsigned char)(c3)))

static inline __m512i _mm512_set_epi8(char __q63, char __q62, char __q61, char __q60,
                                      char __q59, char __q58, char __q57, char __q56,
                                      char __q55, char __q54, char __q53, char __q52,
                                      char __q51, char __q50, char __q49, char __q48,
                                      char __q47, char __q46, char __q45, char __q44,
                                      char __q43, char __q42, char __q41, char __q40,
                                      char __q39, char __q38, char __q37, char __q36,
                                      char __q35, char __q34, char __q33, char __q32,
                                      char __q31, char __q30, char __q29, char __q28,
                                      char __q27, char __q26, char __q25, char __q24,
                                      char __q23, char __q22, char __q21, char __q20,
                                      char __q19, char __q18, char __q17, char __q16,
                                      char __q15, char __q14, char __q13, char __q12,
                                      char __q11, char __q10, char __q09, char __q08,
                                      char __q07, char __q06, char __q05, char __q04,
                                      char __q03, char __q02, char __q01, char __q00) {
    return _mm512_set_epi32(PACK(__q63, __q62, __q61, __q60), PACK(__q59, __q58, __q57, __q56),
                            PACK(__q55, __q54, __q53, __q52), PACK(__q51, __q50, __q49, __q48),
                            PACK(__q47, __q46, __q45, __q44), PACK(__q43, __q42, __q41, __q40),
                            PACK(__q39, __q38, __q37, __q36), PACK(__q35, __q34, __q33, __q32),
                            PACK(__q31, __q30, __q29, __q28), PACK(__q27, __q26, __q25, __q24),
                            PACK(__q23, __q22, __q21, __q20), PACK(__q19, __q18, __q17, __q16),
                            PACK(__q15, __q14, __q13, __q12), PACK(__q11, __q10, __q09, __q08),
                            PACK(__q07, __q06, __q05, __q04), PACK(__q03, __q02, __q01, __q00));
}

#undef PACK

#endif // gcc version test
#endif // __AVX512F__

/* Missing zero-extension AVX and AVX512 intrinsics.
 * Fixed in Microsoft Visual Studio 2017 version 15.7
 * https://developercommunity.visualstudio.com/t/missing-zero-extension-avx-and-avx512-intrinsics/175737
 */
#if defined(_MSC_VER) && _MSC_VER < 1914
#ifdef __AVX2__
static inline __m256i _mm256_zextsi128_si256(__m128i a) {
    return _mm256_inserti128_si256(_mm256_setzero_si256(), a, 0);
}
#endif // __AVX2__

#ifdef __AVX512F__
static inline __m512i _mm512_zextsi128_si512(__m128i a) {
    return _mm512_inserti32x4(_mm512_setzero_si512(), a, 0);
}
#endif // __AVX512F__
#endif // defined(_MSC_VER) && _MSC_VER < 1914

#endif // include guard X86_INTRINS_H
@ -0,0 +1,42 @@
/* chunkset.c -- inline functions to copy small data chunks.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"

typedef uint64_t chunk_t;

#define CHUNK_SIZE 8

#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    uint8_t *dest = (uint8_t *)chunk;
    memcpy(dest, from, sizeof(uint32_t));
    memcpy(dest+4, from, sizeof(uint32_t));
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    memcpy(chunk, from, sizeof(uint64_t));
}

static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    memcpy(chunk, (uint8_t *)s, sizeof(uint64_t));
}

static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    memcpy(out, chunk, sizeof(uint64_t));
}

#define CHUNKSIZE chunksize_c
#define CHUNKCOPY chunkcopy_c
#define CHUNKUNROLL chunkunroll_c
#define CHUNKMEMSET chunkmemset_c
#define CHUNKMEMSET_SAFE chunkmemset_safe_c

#include "chunkset_tpl.h"

#define INFLATE_FAST inflate_fast_c

#include "inffast_tpl.h"
@ -0,0 +1,200 @@
/* chunkset_tpl.h -- inline functions to copy small data chunks.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include <stdlib.h>

#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
extern uint8_t* chunkmemset_ssse3(uint8_t *out, unsigned dist, unsigned len);
#endif

/* Returns the chunk size */
Z_INTERNAL uint32_t CHUNKSIZE(void) {
    return sizeof(chunk_t);
}

/* Behave like memcpy, but assume that it's OK to overwrite at least
   chunk_t bytes of output even if the length is shorter than this,
   that the length is non-zero, and that `from` lags `out` by at least
   sizeof chunk_t bytes (or that they don't overlap at all or simply that
   the distance is less than the length of the copy).

   Aside from better memory bus utilisation, this means that short copies
   (chunk_t bytes or fewer) will fall straight through the loop
   without iteration, which will hopefully make the branch prediction more
   reliable. */
#ifndef HAVE_CHUNKCOPY
Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
    Assert(len > 0, "chunkcopy should never have a length 0");
    chunk_t chunk;
    int32_t align = ((len - 1) % sizeof(chunk_t)) + 1;
    loadchunk(from, &chunk);
    storechunk(out, &chunk);
    out += align;
    from += align;
    len -= align;
    while (len > 0) {
        loadchunk(from, &chunk);
        storechunk(out, &chunk);
        out += sizeof(chunk_t);
        from += sizeof(chunk_t);
        len -= sizeof(chunk_t);
    }
    return out;
}
#endif
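
/* Worked example (illustrative): with sizeof(chunk_t) == 8 and len == 5,
 * align == 5, so the single store above writes 8 bytes (the 5 wanted plus
 * 3 bytes of permitted slack) and the while loop is skipped entirely. */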

/* Perform short copies until distance can be rewritten as being at least
   sizeof chunk_t.

   This assumes that it's OK to overwrite at least the first
   2*sizeof(chunk_t) bytes of output even if the copy is shorter than this.
   This assumption holds because inflate_fast() starts every iteration with at
   least 258 bytes of output space available (258 being the maximum length
   output from a single token; see inflate_fast()'s assumptions below). */
#ifndef HAVE_CHUNKUNROLL
Z_INTERNAL uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) {
    unsigned char const *from = out - *dist;
    chunk_t chunk;
    while (*dist < *len && *dist < sizeof(chunk_t)) {
        loadchunk(from, &chunk);
        storechunk(out, &chunk);
        out += *dist;
        *len -= *dist;
        *dist += *dist;
    }
    return out;
}
#endif

#ifndef HAVE_CHUNK_MAG
/* Loads a magazine to feed into memory of the pattern */
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
    /* This code takes a string of length dist from "from" and repeats
     * it for as many times as can fit in a chunk_t (vector register) */
    uint32_t cpy_dist;
    uint32_t bytes_remaining = sizeof(chunk_t);
    chunk_t chunk_load;
    uint8_t *cur_chunk = (uint8_t *)&chunk_load;
    while (bytes_remaining) {
        cpy_dist = MIN(dist, bytes_remaining);
        memcpy(cur_chunk, buf, cpy_dist);
        bytes_remaining -= cpy_dist;
        cur_chunk += cpy_dist;
        /* This allows us to bypass an expensive integer division since we're effectively
         * counting in this loop, anyway */
        *chunk_rem = cpy_dist;
    }

    return chunk_load;
}
#endif
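
/* Worked example (illustrative): dist == 3 with sizeof(chunk_t) == 8 loads
 * the pattern "abcabcab" and leaves *chunk_rem == 2: the size of the final,
 * partial copy of the pattern that spills past the last full repeat. */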

/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST.
   Return OUT + LEN. */
Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
    /* Debug performance related issues when len < sizeof(uint64_t):
       Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
    Assert(dist > 0, "chunkmemset cannot have a distance 0");
    /* Only AVX2 */
#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
    if (len <= 16) {
        return chunkmemset_ssse3(out, dist, len);
    }
#endif

    uint8_t *from = out - dist;

    if (dist == 1) {
        memset(out, *from, len);
        return out + len;
    } else if (dist > sizeof(chunk_t)) {
        return CHUNKCOPY(out, out - dist, len);
    }

    chunk_t chunk_load;
    uint32_t chunk_mod = 0;

    /* TODO: possibly build up a permutation table for this if not an even modulus */
#ifdef HAVE_CHUNKMEMSET_2
    if (dist == 2) {
        chunkmemset_2(from, &chunk_load);
    } else
#endif
#ifdef HAVE_CHUNKMEMSET_4
    if (dist == 4) {
        chunkmemset_4(from, &chunk_load);
    } else
#endif
#ifdef HAVE_CHUNKMEMSET_8
    if (dist == 8) {
        chunkmemset_8(from, &chunk_load);
    } else if (dist == sizeof(chunk_t)) {
        loadchunk(from, &chunk_load);
    } else
#endif
    {
        chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist);
    }

    /* If we're lucky enough and dist happens to be an even modulus of our vector length,
     * we can do two stores per loop iteration, which for most ISAs, especially x86, is beneficial */
    if (chunk_mod == 0) {
        while (len >= (2 * sizeof(chunk_t))) {
            storechunk(out, &chunk_load);
            storechunk(out + sizeof(chunk_t), &chunk_load);
            out += 2 * sizeof(chunk_t);
            len -= 2 * sizeof(chunk_t);
        }
    }

    /* If we don't have a "dist" length that divides evenly into a vector
     * register, we can write the whole vector register but we need only
     * advance by the amount of the whole string that fits in our chunk_t.
     * If we do divide evenly into the vector length, adv_amount = chunk_t size */
    uint32_t adv_amount = sizeof(chunk_t) - chunk_mod;
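    /* Continuing the example above: chunk_mod == 2 gives adv_amount == 6, so
     * each full-width store advances by only the 6 bytes (two whole repeats
     * of the dist == 3 pattern) that it contributes. */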
    while (len >= sizeof(chunk_t)) {
        storechunk(out, &chunk_load);
        len -= adv_amount;
        out += adv_amount;
    }

    if (len) {
        memcpy(out, &chunk_load, len);
        out += len;
    }

    return out;
}

Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
#if !defined(UNALIGNED64_OK)
#  if !defined(UNALIGNED_OK)
    static const uint32_t align_mask = 7;
#  else
    static const uint32_t align_mask = 3;
#  endif
#endif

    len = MIN(len, left);
    uint8_t *from = out - dist;
#if !defined(UNALIGNED64_OK)
    while (((uintptr_t)out & align_mask) && (len > 0)) {
        *out++ = *from++;
        --len;
        --left;
    }
#endif
    if (left < (unsigned)(3 * sizeof(chunk_t))) {
        while (len > 0) {
            *out++ = *from++;
            --len;
        }
        return out;
    }
    if (len)
        return CHUNKMEMSET(out, dist, len);

    return out;
}
@ -0,0 +1,543 @@
# detect-intrinsics.cmake -- Detect compiler intrinsics support
# Licensed under the Zlib license, see LICENSE.md for details

macro(check_acle_compiler_flag)
    if(MSVC)
        # Both ARM and ARM64-targeting msvc support intrinsics, but
        # ARM msvc is missing some intrinsics introduced with ARMv8, e.g. crc32
        if(MSVC_C_ARCHITECTURE_ID STREQUAL "ARM64")
            set(HAVE_ACLE_FLAG TRUE)
        endif()
    else()
        if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
            if(NOT NATIVEFLAG)
                set(ACLEFLAG "-march=armv8-a+crc" CACHE INTERNAL "Compiler option to enable ACLE support")
            endif()
        endif()
        # Check whether compiler supports ACLE flag
        set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
        check_c_source_compiles(
            "int main() { return 0; }"
            HAVE_ACLE_FLAG FAIL_REGEX "not supported")
        if(NOT NATIVEFLAG AND NOT HAVE_ACLE_FLAG)
            set(ACLEFLAG "-march=armv8-a+crc+simd" CACHE INTERNAL "Compiler option to enable ACLE support" FORCE)
            # Check whether compiler supports ACLE flag
            set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG}")
            check_c_source_compiles(
                "int main() { return 0; }"
                HAVE_ACLE_FLAG2 FAIL_REGEX "not supported")
            set(HAVE_ACLE_FLAG ${HAVE_ACLE_FLAG2} CACHE INTERNAL "Have compiler option to enable ACLE intrinsics" FORCE)
            unset(HAVE_ACLE_FLAG2 CACHE) # Don't cache this internal variable
        endif()
        set(CMAKE_REQUIRED_FLAGS)
    endif()
endmacro()

macro(check_armv6_compiler_flag)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            check_c_compiler_flag("-march=armv6" HAVE_MARCH_ARMV6)
            if(HAVE_MARCH_ARMV6)
                set(ARMV6FLAG "-march=armv6" CACHE INTERNAL "Compiler option to enable ARMv6 support")
            endif()
        endif()
    endif()
    # Check whether compiler supports ARMv6 inline asm
    set(CMAKE_REQUIRED_FLAGS "${ARMV6FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "unsigned int f(unsigned int a, unsigned int b) {
            unsigned int c;
            __asm__ __volatile__ ( \"uqsub16 %0, %1, %2\" : \"=r\" (c) : \"r\" (a), \"r\" (b) );
            return (int)c;
        }
        int main(void) { return f(1,2); }"
        HAVE_ARMV6_INLINE_ASM
    )
    # Check whether compiler supports ARMv6 intrinsics
    check_c_source_compiles(
        "#if defined(_MSC_VER)
        #include <intrin.h>
        #else
        #include <arm_acle.h>
        #endif
        unsigned int f(unsigned int a, unsigned int b) {
        #if defined(_MSC_VER)
            return _arm_uqsub16(a, b);
        #else
            return __uqsub16(a, b);
        #endif
        }
        int main(void) { return 0; }"
        HAVE_ARMV6_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_avx512_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
        if(CMAKE_HOST_UNIX OR APPLE)
            set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
        else()
            set(AVX512FLAG "/arch:AVX512")
        endif()
    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            # For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
            # instruction scheduling unless you specify a reasonable -mtune= target
            set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
            if(NOT MSVC)
                check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
                if(HAVE_CASCADE_LAKE)
                    set(AVX512FLAG "${AVX512FLAG} -mtune=cascadelake")
                else()
                    set(AVX512FLAG "${AVX512FLAG} -mtune=skylake-avx512")
                endif()
                unset(HAVE_CASCADE_LAKE)
            endif()
        endif()
    elseif(MSVC)
        set(AVX512FLAG "/arch:AVX512")
    endif()
    # Check whether compiler supports AVX512 intrinsics
    set(CMAKE_REQUIRED_FLAGS "${AVX512FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <immintrin.h>
        __m512i f(__m512i y) {
            __m512i x = _mm512_set1_epi8(2);
            return _mm512_sub_epi8(x, y);
        }
        int main(void) { return 0; }"
        HAVE_AVX512_INTRIN
    )

    # Evidently both GCC and clang were late to implementing these
    check_c_source_compiles(
        "#include <immintrin.h>
        __mmask16 f(__mmask16 x) { return _knot_mask16(x); }
        int main(void) { return 0; }"
        HAVE_MASK_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_avx512vnni_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
        if(CMAKE_HOST_UNIX OR APPLE)
            set(AVX512VNNIFLAG "-mavx512f -mavx512bw -mavx512dq -mavx512vl -mavx512vnni")
        else()
            set(AVX512VNNIFLAG "/arch:AVX512")
        endif()
    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni")
            if(NOT MSVC)
                check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
                if(HAVE_CASCADE_LAKE)
                    set(AVX512VNNIFLAG "${AVX512VNNIFLAG} -mtune=cascadelake")
                else()
                    set(AVX512VNNIFLAG "${AVX512VNNIFLAG} -mtune=skylake-avx512")
                endif()
                unset(HAVE_CASCADE_LAKE)
            endif()
        endif()
    elseif(MSVC)
        set(AVX512VNNIFLAG "/arch:AVX512")
    endif()

    # Check whether compiler supports AVX512vnni intrinsics
    set(CMAKE_REQUIRED_FLAGS "${AVX512VNNIFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <immintrin.h>
        __m512i f(__m512i x, __m512i y) {
            __m512i z = _mm512_setzero_epi32();
            return _mm512_dpbusd_epi32(z, x, y);
        }
        int main(void) { return 0; }"
        HAVE_AVX512VNNI_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_avx2_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
        if(CMAKE_HOST_UNIX OR APPLE)
            set(AVX2FLAG "-mavx2")
        else()
            set(AVX2FLAG "/arch:AVX2")
        endif()
    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            set(AVX2FLAG "-mavx2")
        endif()
    elseif(MSVC)
        set(AVX2FLAG "/arch:AVX2")
    endif()
    # Check whether compiler supports AVX2 intrinsics
    set(CMAKE_REQUIRED_FLAGS "${AVX2FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <immintrin.h>
        __m256i f(__m256i x) {
            const __m256i y = _mm256_set1_epi16(1);
            return _mm256_subs_epu16(x, y);
        }
        int main(void) { return 0; }"
        HAVE_AVX2_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_neon_compiler_flag)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            if("${ARCH}" MATCHES "aarch64")
                set(NEONFLAG "-march=armv8-a+simd")
            else()
                set(NEONFLAG "-mfpu=neon")
            endif()
        endif()
    endif()
    # Check whether compiler supports NEON flag
    set(CMAKE_REQUIRED_FLAGS "${NEONFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#if defined(_M_ARM64) || defined(_M_ARM64EC)
        #  include <arm64_neon.h>
        #else
        #  include <arm_neon.h>
        #endif
        int main() { return 0; }"
        NEON_AVAILABLE FAIL_REGEX "not supported")
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_neon_ld4_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            if("${ARCH}" MATCHES "aarch64")
                set(NEONFLAG "-march=armv8-a+simd")
            else()
                set(NEONFLAG "-mfpu=neon")
            endif()
        endif()
    endif()
    # Check whether compiler supports loading 4 neon vecs into a register range
    set(CMAKE_REQUIRED_FLAGS "${NEONFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#if defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC))
        #  include <arm64_neon.h>
        #else
        #  include <arm_neon.h>
        #endif
        int32x4x4_t f(int var[16]) { return vld1q_s32_x4(var); }
        int main(void) { return 0; }"
        NEON_HAS_LD4)
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_pclmulqdq_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            set(PCLMULFLAG "-mpclmul")
        endif()
    endif()
    # Check whether compiler supports PCLMULQDQ intrinsics
    if(NOT (APPLE AND "${ARCH}" MATCHES "i386"))
        # The pclmul code currently crashes on Mac in 32bit mode. Avoid for now.
        set(CMAKE_REQUIRED_FLAGS "${PCLMULFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
        check_c_source_compiles(
            "#include <immintrin.h>
            #include <wmmintrin.h>
            __m128i f(__m128i a, __m128i b) { return _mm_clmulepi64_si128(a, b, 0x10); }
            int main(void) { return 0; }"
            HAVE_PCLMULQDQ_INTRIN
        )
        set(CMAKE_REQUIRED_FLAGS)
    else()
        set(HAVE_PCLMULQDQ_INTRIN OFF)
    endif()
endmacro()

macro(check_vpclmulqdq_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            set(VPCLMULFLAG "-mvpclmulqdq -mavx512f")
        endif()
    endif()
    # Check whether compiler supports VPCLMULQDQ intrinsics
    if(NOT (APPLE AND "${ARCH}" MATCHES "i386"))
        set(CMAKE_REQUIRED_FLAGS "${VPCLMULFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
        check_c_source_compiles(
            "#include <immintrin.h>
            #include <wmmintrin.h>
            __m512i f(__m512i a) {
                __m512i b = _mm512_setzero_si512();
                return _mm512_clmulepi64_epi128(a, b, 0x10);
            }
            int main(void) { return 0; }"
            HAVE_VPCLMULQDQ_INTRIN
        )
        set(CMAKE_REQUIRED_FLAGS)
    else()
        set(HAVE_VPCLMULQDQ_INTRIN OFF)
    endif()
endmacro()

macro(check_ppc_intrinsics)
    # Check if compiler supports AltiVec
    set(CMAKE_REQUIRED_FLAGS "-maltivec ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <altivec.h>
        int main(void)
        {
            vector int a = vec_splats(0);
            vector int b = vec_splats(0);
            a = vec_add(a, b);
            return 0;
        }"
        HAVE_ALTIVEC
    )
    set(CMAKE_REQUIRED_FLAGS)

    if(HAVE_ALTIVEC)
        set(PPCFLAGS "-maltivec")
    endif()

    set(CMAKE_REQUIRED_FLAGS "-maltivec -mno-vsx ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <altivec.h>
        int main(void)
        {
            vector int a = vec_splats(0);
            vector int b = vec_splats(0);
            a = vec_add(a, b);
            return 0;
        }"
        HAVE_NOVSX
    )
    set(CMAKE_REQUIRED_FLAGS)

    if(HAVE_NOVSX)
        set(PPCFLAGS "${PPCFLAGS} -mno-vsx")
    endif()

    # Check if we have what we need for AltiVec optimizations
    set(CMAKE_REQUIRED_FLAGS "${PPCFLAGS} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <sys/auxv.h>
        #ifdef __FreeBSD__
        #include <machine/cpu.h>
        #endif
        int main() {
        #ifdef __FreeBSD__
            unsigned long hwcap;
            elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
            return (hwcap & PPC_FEATURE_HAS_ALTIVEC);
        #else
            return (getauxval(AT_HWCAP) & PPC_FEATURE_HAS_ALTIVEC);
        #endif
        }"
        HAVE_VMX
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_power8_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            set(POWER8FLAG "-mcpu=power8")
        endif()
    endif()
    # Check if we have what we need for POWER8 optimizations
    set(CMAKE_REQUIRED_FLAGS "${POWER8FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <sys/auxv.h>
        #ifdef __FreeBSD__
        #include <machine/cpu.h>
        #endif
        int main() {
        #ifdef __FreeBSD__
            unsigned long hwcap;
            elf_aux_info(AT_HWCAP2, &hwcap, sizeof(hwcap));
            return (hwcap & PPC_FEATURE2_ARCH_2_07);
        #else
            return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07);
        #endif
        }"
        HAVE_POWER8_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_rvv_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            set(RISCVFLAG "-march=rv64gcv")
        endif()
    endif()
    # Check whether compiler supports RVV
    set(CMAKE_REQUIRED_FLAGS "${RISCVFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <riscv_vector.h>
        int main() {
            return 0;
        }"
        HAVE_RVV_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_s390_intrinsics)
    check_c_source_compiles(
        "#include <sys/auxv.h>
        #ifndef HWCAP_S390_VXRS
        #define HWCAP_S390_VXRS HWCAP_S390_VX
        #endif
        int main() {
            return (getauxval(AT_HWCAP) & HWCAP_S390_VXRS);
        }"
        HAVE_S390_INTRIN
    )
endmacro()

macro(check_power9_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            set(POWER9FLAG "-mcpu=power9")
        endif()
    endif()
    # Check if we have what we need for POWER9 optimizations
    set(CMAKE_REQUIRED_FLAGS "${POWER9FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <sys/auxv.h>
        #ifdef __FreeBSD__
        #include <machine/cpu.h>
        #endif
        int main() {
        #ifdef __FreeBSD__
            unsigned long hwcap;
            elf_aux_info(AT_HWCAP2, &hwcap, sizeof(hwcap));
            return (hwcap & PPC_FEATURE2_ARCH_3_00);
        #else
            return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_3_00);
        #endif
        }"
        HAVE_POWER9_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_sse2_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
        if(CMAKE_HOST_UNIX OR APPLE)
            set(SSE2FLAG "-msse2")
        else()
            set(SSE2FLAG "/arch:SSE2")
        endif()
    elseif(MSVC)
        if(NOT "${ARCH}" MATCHES "x86_64")
            set(SSE2FLAG "/arch:SSE2")
        endif()
    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            set(SSE2FLAG "-msse2")
        endif()
    endif()
    # Check whether compiler supports SSE2 intrinsics
    set(CMAKE_REQUIRED_FLAGS "${SSE2FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <immintrin.h>
        __m128i f(__m128i x, __m128i y) { return _mm_sad_epu8(x, y); }
        int main(void) { return 0; }"
        HAVE_SSE2_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_ssse3_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
        if(CMAKE_HOST_UNIX OR APPLE)
            set(SSSE3FLAG "-mssse3")
        else()
            set(SSSE3FLAG "/arch:SSSE3")
        endif()
    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            set(SSSE3FLAG "-mssse3")
        endif()
    endif()
    # Check whether compiler supports SSSE3 intrinsics
    set(CMAKE_REQUIRED_FLAGS "${SSSE3FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <immintrin.h>
        __m128i f(__m128i u) {
            __m128i v = _mm_set1_epi32(1);
            return _mm_hadd_epi32(u, v);
        }
        int main(void) { return 0; }"
        HAVE_SSSE3_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_sse42_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
        if(CMAKE_HOST_UNIX OR APPLE)
            set(SSE42FLAG "-msse4.2")
        else()
            set(SSE42FLAG "/arch:SSE4.2")
        endif()
    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            set(SSE42FLAG "-msse4.2")
        endif()
    endif()
    # Check whether compiler supports SSE4.2 intrinsics
    set(CMAKE_REQUIRED_FLAGS "${SSE42FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <nmmintrin.h>
        unsigned int f(unsigned int a, unsigned int b) { return _mm_crc32_u32(a, b); }
        int main(void) { return 0; }"
        HAVE_SSE42_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_vgfma_intrinsics)
    if(NOT NATIVEFLAG)
        set(VGFMAFLAG "-march=z13")
        if(CMAKE_C_COMPILER_ID MATCHES "GNU")
            set(VGFMAFLAG "${VGFMAFLAG} -mzarch")
        endif()
        if(CMAKE_C_COMPILER_ID MATCHES "Clang")
            set(VGFMAFLAG "${VGFMAFLAG} -fzvector")
        endif()
    endif()
    # Check whether compiler supports "VECTOR GALOIS FIELD MULTIPLY SUM AND ACCUMULATE" intrinsic
    set(CMAKE_REQUIRED_FLAGS "${VGFMAFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <vecintrin.h>
        int main(void) {
            unsigned long long a __attribute__((vector_size(16))) = { 0 };
            unsigned long long b __attribute__((vector_size(16))) = { 0 };
            unsigned char c __attribute__((vector_size(16))) = { 0 };
            c = vec_gfmsum_accum_128(a, b, c);
            return c[0];
        }"
        HAVE_VGFMA_INTRIN FAIL_REGEX "not supported")
    set(CMAKE_REQUIRED_FLAGS)
endmacro()

macro(check_xsave_intrinsics)
    if(NOT NATIVEFLAG AND NOT MSVC)
        set(XSAVEFLAG "-mxsave")
    endif()
    set(CMAKE_REQUIRED_FLAGS "${XSAVEFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#ifdef _MSC_VER
        #  include <intrin.h>
        #else
        #  include <x86gprintrin.h>
        #endif
        unsigned int f(unsigned int a) { return (int) _xgetbv(a); }
        int main(void) { return 0; }"
        HAVE_XSAVE_INTRIN FAIL_REGEX "not supported")
    set(CMAKE_REQUIRED_FLAGS)
endmacro()
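
# Usage note (illustrative): callers are expected to invoke these macros and
# then branch on the cached results (e.g. HAVE_AVX2_INTRIN, HAVE_PCLMULQDQ_INTRIN),
# adding the matching arch sources and compile definitions to the zlib-ng
# target; the consuming logic lives in the main CMakeLists, not in this file.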
@ -0,0 +1,19 @@
# fallback-macros.cmake -- CMake fallback macros
# Copyright (C) 2022 Nathan Moinvaziri
# Licensed under the Zlib license, see LICENSE.md for details

# CMake less than version 3.5.2
if(NOT COMMAND add_compile_options)
    macro(add_compile_options options)
        string(APPEND CMAKE_C_FLAGS ${options})
        string(APPEND CMAKE_CXX_FLAGS ${options})
    endmacro()
endif()

# CMake less than version 3.14
if(NOT COMMAND add_link_options)
    macro(add_link_options options)
        string(APPEND CMAKE_EXE_LINKER_FLAGS ${options})
        string(APPEND CMAKE_SHARED_LINKER_FLAGS ${options})
    endmacro()
endif()
@ -0,0 +1,180 @@
/* compare256.c -- 256 byte memory comparison with match length return
 * Copyright (C) 2020 Nathan Moinvaziri
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "zutil_p.h"
#include "fallback_builtins.h"

/* ALIGNED, byte comparison */
static inline uint32_t compare256_c_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;

    do {
        if (*src0 != *src1)
            return len;
        src0 += 1, src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src0 += 1, src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src0 += 1, src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src0 += 1, src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src0 += 1, src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src0 += 1, src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src0 += 1, src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src0 += 1, src1 += 1, len += 1;
    } while (len < 256);

    return 256;
}

Z_INTERNAL uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1) {
    return compare256_c_static(src0, src1);
}

#define LONGEST_MATCH longest_match_c
#define COMPARE256 compare256_c_static

#include "match_tpl.h"

#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_c
#define COMPARE256 compare256_c_static

#include "match_tpl.h"

#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
/* 16-bit unaligned integer comparison */
static inline uint32_t compare256_unaligned_16_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;

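    /* If two 16-bit units differ, the match may still extend one byte into
     * the pair, hence the "+ (*src0 == *src1)" on each early return below. */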
    do {
        if (zng_memcmp_2(src0, src1) != 0)
            return len + (*src0 == *src1);
        src0 += 2, src1 += 2, len += 2;

        if (zng_memcmp_2(src0, src1) != 0)
            return len + (*src0 == *src1);
        src0 += 2, src1 += 2, len += 2;

        if (zng_memcmp_2(src0, src1) != 0)
            return len + (*src0 == *src1);
        src0 += 2, src1 += 2, len += 2;

        if (zng_memcmp_2(src0, src1) != 0)
            return len + (*src0 == *src1);
        src0 += 2, src1 += 2, len += 2;
    } while (len < 256);

    return 256;
}

Z_INTERNAL uint32_t compare256_unaligned_16(const uint8_t *src0, const uint8_t *src1) {
    return compare256_unaligned_16_static(src0, src1);
}

#define LONGEST_MATCH longest_match_unaligned_16
#define COMPARE256 compare256_unaligned_16_static

#include "match_tpl.h"

#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_unaligned_16
#define COMPARE256 compare256_unaligned_16_static

#include "match_tpl.h"

#ifdef HAVE_BUILTIN_CTZ
/* 32-bit unaligned integer comparison */
static inline uint32_t compare256_unaligned_32_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;

    do {
        uint32_t sv, mv, diff;

        memcpy(&sv, src0, sizeof(sv));
        memcpy(&mv, src1, sizeof(mv));

        diff = sv ^ mv;
        if (diff) {
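            /* on little-endian, the lowest set bit of the XOR marks the
             * first differing byte, so ctz/8 yields its byte index */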
            uint32_t match_byte = __builtin_ctz(diff) / 8;
            return len + match_byte;
        }

        src0 += 4, src1 += 4, len += 4;
    } while (len < 256);

    return 256;
}

Z_INTERNAL uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1) {
    return compare256_unaligned_32_static(src0, src1);
}

#define LONGEST_MATCH longest_match_unaligned_32
#define COMPARE256 compare256_unaligned_32_static

#include "match_tpl.h"

#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_unaligned_32
#define COMPARE256 compare256_unaligned_32_static

#include "match_tpl.h"

#endif

#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
/* UNALIGNED64_OK, 64-bit integer comparison */
static inline uint32_t compare256_unaligned_64_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;

    do {
        uint64_t sv, mv, diff;

        memcpy(&sv, src0, sizeof(sv));
        memcpy(&mv, src1, sizeof(mv));

        diff = sv ^ mv;
        if (diff) {
            uint64_t match_byte = __builtin_ctzll(diff) / 8;
            return len + (uint32_t)match_byte;
        }

        src0 += 8, src1 += 8, len += 8;
    } while (len < 256);

    return 256;
}

Z_INTERNAL uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1) {
    return compare256_unaligned_64_static(src0, src1);
}

#define LONGEST_MATCH longest_match_unaligned_64
#define COMPARE256 compare256_unaligned_64_static

#include "match_tpl.h"

#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_unaligned_64
#define COMPARE256 compare256_unaligned_64_static

#include "match_tpl.h"

#endif

#endif
@ -0,0 +1,134 @@
/* compare256_rle.h -- 256 byte run-length encoding comparison
 * Copyright (C) 2022 Nathan Moinvaziri
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "fallback_builtins.h"

typedef uint32_t (*compare256_rle_func)(const uint8_t* src0, const uint8_t* src1);

/* ALIGNED, byte comparison */
static inline uint32_t compare256_rle_c(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;

    do {
        if (*src0 != *src1)
            return len;
        src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src1 += 1, len += 1;
    } while (len < 256);

    return 256;
}

#ifdef UNALIGNED_OK
/* 16-bit unaligned integer comparison */
static inline uint32_t compare256_rle_unaligned_16(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;
    uint16_t src0_cmp, src1_cmp;

    memcpy(&src0_cmp, src0, sizeof(src0_cmp));

    do {
        memcpy(&src1_cmp, src1, sizeof(src1_cmp));
        if (src0_cmp != src1_cmp)
            return len + (*src0 == *src1);
        src1 += 2, len += 2;
        memcpy(&src1_cmp, src1, sizeof(src1_cmp));
        if (src0_cmp != src1_cmp)
            return len + (*src0 == *src1);
        src1 += 2, len += 2;
        memcpy(&src1_cmp, src1, sizeof(src1_cmp));
        if (src0_cmp != src1_cmp)
            return len + (*src0 == *src1);
        src1 += 2, len += 2;
        memcpy(&src1_cmp, src1, sizeof(src1_cmp));
        if (src0_cmp != src1_cmp)
            return len + (*src0 == *src1);
        src1 += 2, len += 2;
    } while (len < 256);

    return 256;
}

#ifdef HAVE_BUILTIN_CTZ
/* 32-bit unaligned integer comparison */
static inline uint32_t compare256_rle_unaligned_32(const uint8_t *src0, const uint8_t *src1) {
    uint32_t sv, len = 0;
    uint16_t src0_cmp;

    memcpy(&src0_cmp, src0, sizeof(src0_cmp));
    sv = ((uint32_t)src0_cmp << 16) | src0_cmp;
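    /* e.g. the two-byte run pattern {0x61, 0x62} loads as 0x6261 on
     * little-endian and replicates to sv == 0x62616261, four bytes of the
     * repeating pattern compared per iteration below */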

    do {
        uint32_t mv, diff;

        memcpy(&mv, src1, sizeof(mv));

        diff = sv ^ mv;
        if (diff) {
            uint32_t match_byte = __builtin_ctz(diff) / 8;
            return len + match_byte;
        }

        src1 += 4, len += 4;
    } while (len < 256);

    return 256;
}

#endif

#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
/* 64-bit unaligned integer comparison */
static inline uint32_t compare256_rle_unaligned_64(const uint8_t *src0, const uint8_t *src1) {
    uint32_t src0_cmp32, len = 0;
    uint16_t src0_cmp;
    uint64_t sv;

    memcpy(&src0_cmp, src0, sizeof(src0_cmp));
    src0_cmp32 = ((uint32_t)src0_cmp << 16) | src0_cmp;
    sv = ((uint64_t)src0_cmp32 << 32) | src0_cmp32;

    do {
        uint64_t mv, diff;

        memcpy(&mv, src1, sizeof(mv));

        diff = sv ^ mv;
        if (diff) {
            uint64_t match_byte = __builtin_ctzll(diff) / 8;
            return len + (uint32_t)match_byte;
        }

        src1 += 8, len += 8;
    } while (len < 256);

    return 256;
}

#endif

#endif

@ -0,0 +1,98 @@
/* compress.c -- compress a memory buffer
 * Copyright (C) 1995-2005, 2014, 2016 Jean-loup Gailly, Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "zutil.h"

/* ===========================================================================
 *  Architecture-specific hooks.
 */
#ifdef S390_DFLTCC_DEFLATE
#  include "arch/s390/dfltcc_common.h"
#else
/* Returns the upper bound on compressed data length based on uncompressed data length, assuming default settings.
 * Zero means that arch-specific deflation code behaves identically to the regular zlib-ng algorithms. */
#  define DEFLATE_BOUND_COMPLEN(source_len) 0
#endif

/* ===========================================================================
     Compresses the source buffer into the destination buffer. The level
   parameter has the same meaning as in deflateInit. sourceLen is the byte
   length of the source buffer. Upon entry, destLen is the total size of the
   destination buffer, which must be at least 0.1% larger than sourceLen plus
   12 bytes. Upon exit, destLen is the actual size of the compressed buffer.

     compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
   memory, Z_BUF_ERROR if there was not enough room in the output buffer,
   Z_STREAM_ERROR if the level parameter is invalid.
*/
int Z_EXPORT PREFIX(compress2)(unsigned char *dest, z_uintmax_t *destLen, const unsigned char *source,
                               z_uintmax_t sourceLen, int level) {
    PREFIX3(stream) stream;
    int err;
    const unsigned int max = (unsigned int)-1;
    z_size_t left;

    left = *destLen;
    *destLen = 0;

    stream.zalloc = NULL;
    stream.zfree = NULL;
    stream.opaque = NULL;

    err = PREFIX(deflateInit)(&stream, level);
    if (err != Z_OK)
        return err;

    stream.next_out = dest;
    stream.avail_out = 0;
    stream.next_in = (z_const unsigned char *)source;
    stream.avail_in = 0;

    do {
        if (stream.avail_out == 0) {
            stream.avail_out = left > (unsigned long)max ? max : (unsigned int)left;
            left -= stream.avail_out;
        }
        if (stream.avail_in == 0) {
            stream.avail_in = sourceLen > (unsigned long)max ? max : (unsigned int)sourceLen;
            sourceLen -= stream.avail_in;
        }
        err = PREFIX(deflate)(&stream, sourceLen ? Z_NO_FLUSH : Z_FINISH);
    } while (err == Z_OK);

    *destLen = stream.total_out;
    PREFIX(deflateEnd)(&stream);
    return err == Z_STREAM_END ? Z_OK : err;
}

/* ===========================================================================
 */
int Z_EXPORT PREFIX(compress)(unsigned char *dest, z_uintmax_t *destLen, const unsigned char *source, z_uintmax_t sourceLen) {
    return PREFIX(compress2)(dest, destLen, source, sourceLen, Z_DEFAULT_COMPRESSION);
}

/* ===========================================================================
     If the default memLevel or windowBits for deflateInit() is changed, then
   this function needs to be updated.
 */
z_uintmax_t Z_EXPORT PREFIX(compressBound)(z_uintmax_t sourceLen) {
    z_uintmax_t complen = DEFLATE_BOUND_COMPLEN(sourceLen);

    if (complen > 0)
        /* Architecture-specific code provided an upper bound. */
        return complen + ZLIB_WRAPLEN;

#ifndef NO_QUICK_STRATEGY
    return sourceLen                       /* The source size itself */
      + (sourceLen == 0 ? 1 : 0)           /* Always at least one byte for any input */
      + (sourceLen < 9 ? 1 : 0)            /* One extra byte for lengths less than 9 */
      + DEFLATE_QUICK_OVERHEAD(sourceLen)  /* Source encoding overhead, padded to next full byte */
      + DEFLATE_BLOCK_OVERHEAD             /* Deflate block overhead bytes */
      + ZLIB_WRAPLEN;                      /* zlib wrapper */
#else
    return sourceLen + (sourceLen >> 4) + 7 + ZLIB_WRAPLEN;
#endif
}
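
/* Usage sketch (illustrative, not part of this file): sizing the destination
 * with compressBound() before calling compress2(). Names follow the
 * ZLIB_COMPAT build, where PREFIX() adds no prefix. */
#if 0
    z_uintmax_t dst_len = compressBound(src_len);
    unsigned char *dst = (unsigned char *)malloc((size_t)dst_len);
    if (dst && compress2(dst, &dst_len, src, src_len, Z_DEFAULT_COMPRESSION) == Z_OK) {
        /* dst_len now holds the compressed size */
    }
#endif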
@ -0,0 +1,23 @@
/* cpu_features.c -- CPU architecture feature check
 * Copyright (C) 2017 Hans Kristian Rosbach
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "cpu_features.h"
#include <string.h>

Z_INTERNAL void cpu_check_features(struct cpu_features *features) {
    memset(features, 0, sizeof(struct cpu_features));
#if defined(X86_FEATURES)
    x86_check_features(&features->x86);
#elif defined(ARM_FEATURES)
    arm_check_features(&features->arm);
#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
    power_check_features(&features->power);
#elif defined(S390_FEATURES)
    s390_check_features(&features->s390);
#elif defined(RISCV_FEATURES)
    riscv_check_features(&features->riscv);
#endif
}
@ -0,0 +1,303 @@ |
||||
/* cpu_features.h -- CPU architecture feature check
|
||||
* Copyright (C) 2017 Hans Kristian Rosbach |
||||
* For conditions of distribution and use, see copyright notice in zlib.h |
||||
*/ |
||||
|
||||
#ifndef CPU_FEATURES_H_ |
||||
#define CPU_FEATURES_H_ |
||||
|
||||
#include "adler32_fold.h" |
||||
#include "crc32_fold.h" |
||||
|
||||
#if defined(X86_FEATURES) |
||||
# include "arch/x86/x86_features.h" |
||||
# include "fallback_builtins.h" |
||||
#elif defined(ARM_FEATURES) |
||||
# include "arch/arm/arm_features.h" |
||||
#elif defined(PPC_FEATURES) || defined(POWER_FEATURES) |
||||
# include "arch/power/power_features.h" |
||||
#elif defined(S390_FEATURES) |
||||
# include "arch/s390/s390_features.h" |
||||
#elif defined(RISCV_FEATURES) |
||||
# include "arch/riscv/riscv_features.h" |
||||
#endif |
||||
|
||||
struct cpu_features { |
||||
#if defined(X86_FEATURES) |
||||
struct x86_cpu_features x86; |
||||
#elif defined(ARM_FEATURES) |
||||
struct arm_cpu_features arm; |
||||
#elif defined(PPC_FEATURES) || defined(POWER_FEATURES) |
||||
struct power_cpu_features power; |
||||
#elif defined(S390_FEATURES) |
||||
struct s390_cpu_features s390; |
||||
#elif defined(RISCV_FEATURES) |
||||
struct riscv_cpu_features riscv; |
||||
#else |
||||
char empty; |
||||
#endif |
||||
}; |
||||
|
||||
extern void cpu_check_features(struct cpu_features *features); |

/* adler32 */
typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len);

extern uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);
#ifdef ARM_NEON
extern uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef PPC_VMX
extern uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef RISCV_RVV
extern uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef X86_SSSE3
extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef X86_AVX2
extern uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef X86_AVX512
extern uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef X86_AVX512VNNI
extern uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef POWER8_VSX
extern uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
#endif

/* adler32 folding */
#ifdef RISCV_RVV
extern uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_SSE42
extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX2
extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX512
extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX512VNNI
extern uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif

/* CRC32 folding */
#ifdef X86_PCLMULQDQ_CRC
extern uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc);
extern void crc32_fold_pclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
extern void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
extern uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc);
extern uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
#endif
#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
extern uint32_t crc32_fold_vpclmulqdq_reset(crc32_fold *crc);
extern void crc32_fold_vpclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
extern void crc32_fold_vpclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
extern uint32_t crc32_fold_vpclmulqdq_final(crc32_fold *crc);
extern uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
#endif

/* memory chunking */
extern uint32_t chunksize_c(void);
extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#ifdef X86_SSE2
extern uint32_t chunksize_sse2(void);
extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef X86_SSSE3
extern uint8_t* chunkmemset_safe_ssse3(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef X86_AVX2
extern uint32_t chunksize_avx2(void);
extern uint8_t* chunkmemset_safe_avx2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef ARM_NEON
extern uint32_t chunksize_neon(void);
extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef POWER8_VSX
extern uint32_t chunksize_power8(void);
extern uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef RISCV_RVV
extern uint32_t chunksize_rvv(void);
extern uint8_t* chunkmemset_safe_rvv(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif

#ifdef ZLIB_COMPAT
typedef struct z_stream_s z_stream;
#else
typedef struct zng_stream_s zng_stream;
#endif

/* inflate fast loop */
extern void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
#ifdef X86_SSE2
extern void inflate_fast_sse2(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef X86_SSSE3
extern void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef X86_AVX2
extern void inflate_fast_avx2(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef ARM_NEON
extern void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef POWER8_VSX
extern void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef RISCV_RVV
extern void inflate_fast_rvv(PREFIX3(stream) *strm, uint32_t start);
#endif

/* CRC32 */
typedef uint32_t (*crc32_func)(uint32_t crc32, const uint8_t *buf, size_t len);

extern uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
#ifdef ARM_ACLE
extern uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len);
#elif defined(POWER8_VSX)
extern uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len);
#elif defined(S390_CRC32_VX)
extern uint32_t crc32_s390_vx(uint32_t crc, const uint8_t *buf, size_t len);
#endif

/* compare256 */
typedef uint32_t (*compare256_func)(const uint8_t *src0, const uint8_t *src1);

extern uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
extern uint32_t compare256_unaligned_16(const uint8_t *src0, const uint8_t *src1);
#ifdef HAVE_BUILTIN_CTZ
extern uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1);
#endif
#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
extern uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1);
#endif
#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
#endif
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
extern uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
#endif
#ifdef POWER9
extern uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1);
#endif
#ifdef RISCV_RVV
extern uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1);
#endif

#ifdef DEFLATE_H_
/* insert_string */
extern void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count);
#ifdef X86_SSE42
extern void insert_string_sse42(deflate_state *const s, const uint32_t str, uint32_t count);
#elif defined(ARM_ACLE)
extern void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count);
#endif

/* longest_match */
extern uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
extern uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
#ifdef HAVE_BUILTIN_CTZ
extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
#endif
#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
#endif
#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match);
#endif
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
extern uint32_t longest_match_neon(deflate_state *const s, Pos cur_match);
#endif
#ifdef POWER9
extern uint32_t longest_match_power9(deflate_state *const s, Pos cur_match);
#endif
#ifdef RISCV_RVV
extern uint32_t longest_match_rvv(deflate_state *const s, Pos cur_match);
#endif

/* longest_match_slow */
extern uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
extern uint32_t longest_match_slow_unaligned_16(deflate_state *const s, Pos cur_match);
extern uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_match);
#ifdef UNALIGNED64_OK
extern uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match);
#endif
#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match);
#endif
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
extern uint32_t longest_match_slow_neon(deflate_state *const s, Pos cur_match);
#endif
#ifdef POWER9
extern uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match);
#endif
#ifdef RISCV_RVV
extern uint32_t longest_match_slow_rvv(deflate_state *const s, Pos cur_match);
#endif

/* quick_insert_string */
extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str);
#ifdef X86_SSE42
extern Pos quick_insert_string_sse42(deflate_state *const s, const uint32_t str);
#elif defined(ARM_ACLE)
extern Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str);
#endif

/* slide_hash */
typedef void (*slide_hash_func)(deflate_state *s);

#ifdef X86_SSE2
extern void slide_hash_sse2(deflate_state *s);
#endif
#if defined(ARM_SIMD)
extern void slide_hash_armv6(deflate_state *s);
#endif
#if defined(ARM_NEON)
extern void slide_hash_neon(deflate_state *s);
#endif
#if defined(PPC_VMX)
extern void slide_hash_vmx(deflate_state *s);
#endif
#if defined(POWER8_VSX)
extern void slide_hash_power8(deflate_state *s);
#endif
#if defined(RISCV_RVV)
extern void slide_hash_rvv(deflate_state *s);
#endif
#ifdef X86_AVX2
extern void slide_hash_avx2(deflate_state *s);
#endif

/* update_hash */
extern uint32_t update_hash_c(deflate_state *const s, uint32_t h, uint32_t val);
#ifdef X86_SSE42
extern uint32_t update_hash_sse42(deflate_state *const s, uint32_t h, uint32_t val);
#elif defined(ARM_ACLE)
extern uint32_t update_hash_acle(deflate_state *const s, uint32_t h, uint32_t val);
#endif
#endif

#endif
@ -0,0 +1,267 @@
/* crc32_braid.c -- compute the CRC-32 of a data stream
 * Copyright (C) 1995-2022 Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 * This interleaved implementation of a CRC makes use of pipelined multiple
 * arithmetic-logic units, commonly found in modern CPU cores. It is due to
 * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
 */

#include "zbuild.h"
#include "zutil.h"
#include "functable.h"
#include "crc32_braid_p.h"
#include "crc32_braid_tbl.h"

/* ========================================================================= */

const uint32_t * Z_EXPORT PREFIX(get_crc_table)(void) {
    return (const uint32_t *)crc_table;
}

#ifdef ZLIB_COMPAT
unsigned long Z_EXPORT PREFIX(crc32_z)(unsigned long crc, const unsigned char *buf, size_t len) {
    if (buf == NULL) return 0;

    return (unsigned long)functable.crc32((uint32_t)crc, buf, len);
}
#else
uint32_t Z_EXPORT PREFIX(crc32_z)(uint32_t crc, const unsigned char *buf, size_t len) {
    if (buf == NULL) return 0;

    return functable.crc32(crc, buf, len);
}
#endif

#ifdef ZLIB_COMPAT
unsigned long Z_EXPORT PREFIX(crc32)(unsigned long crc, const unsigned char *buf, unsigned int len) {
    return (unsigned long)PREFIX(crc32_z)((uint32_t)crc, buf, len);
}
#else
uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t len) {
    return PREFIX(crc32_z)(crc, buf, len);
}
#endif

/* ========================================================================= */

/*
  A CRC of a message is computed on N braids of words in the message, where
  each word consists of W bytes (4 or 8). If N is 3, for example, then three
  running sparse CRCs are calculated respectively on each braid, at these
  indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
  This is done starting at a word boundary, and continues until as many blocks
  of N * W bytes as are available have been processed. The results are combined
  into a single CRC at the end. For this code, N must be in the range 1..6 and
  W must be 4 or 8. The upper limit on N can be increased if desired by adding
  more #if blocks, extending the patterns apparent in the code. In addition,
  the crc32 tables would need to be regenerated if the maximum N value is
  increased.

  N and W are chosen empirically by benchmarking the execution time on a given
  processor. The choices for N and W below were based on testing on Intel Kaby
  Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
  Octeon II processors. The Intel, AMD, and ARM processors were all fastest
  with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
  They were all tested with either gcc or clang, all using the -O3 optimization
  level. Your mileage may vary.
 */
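/* Worked example (added commentary, an illustrative sketch rather than part
   of the upstream sources): with N = 5 and W = 8 each block covers
   N * W = 40 bytes, so a 4096-byte word-aligned buffer yields
   blks = 4096 / 40 = 102 blocks, leaving 16 bytes for the byte-at-a-time
   loops at the end of crc32_braid() below. Braid 0 sees words 0, 5, 10, ...,
   braid 1 sees words 1, 6, 11, ..., and so on; the five running CRCs only
   meet again in the final combination step. */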

/* ========================================================================= */

#if BYTE_ORDER == LITTLE_ENDIAN
#  define ZSWAPWORD(word) (word)
#  define BRAID_TABLE crc_braid_table
#elif BYTE_ORDER == BIG_ENDIAN
#  if W == 8
#    define ZSWAPWORD(word) ZSWAP64(word)
#  elif W == 4
#    define ZSWAPWORD(word) ZSWAP32(word)
#  endif
#  define BRAID_TABLE crc_braid_big_table
#else
#  error "No endian defined"
#endif
#define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8)
#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
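/* Clarifying note (added commentary): DO1 is one step of the classic
   byte-at-a-time, table-driven CRC (the Sarwate algorithm): fold the next
   input byte into the low byte of the running CRC, then look up the
   resulting 8-bit index. DO8 simply unrolls eight of those steps; both are
   used below for the bytes that fall outside the braided blocks. */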

/* ========================================================================= */
#ifdef W
/*
  Return the CRC of the W bytes in the word_t data, taking the
  least-significant byte of the word as the first byte of data, without any pre
  or post conditioning. This is used to combine the CRCs of each braid.
 */
#if BYTE_ORDER == LITTLE_ENDIAN
static uint32_t crc_word(z_word_t data) {
    int k;
    for (k = 0; k < W; k++)
        data = (data >> 8) ^ crc_table[data & 0xff];
    return (uint32_t)data;
}
#elif BYTE_ORDER == BIG_ENDIAN
static z_word_t crc_word(z_word_t data) {
    int k;
    for (k = 0; k < W; k++)
        data = (data << 8) ^
            crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
    return data;
}
#endif /* BYTE_ORDER */

#endif /* W */

/* ========================================================================= */
Z_INTERNAL uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len) {
    Z_REGISTER uint32_t c;

    /* Pre-condition the CRC */
    c = (~crc) & 0xffffffff;

#ifdef W
    /* If provided enough bytes, do a braided CRC calculation. */
    if (len >= N * W + W - 1) {
        size_t blks;
        z_word_t const *words;
        int k;

        /* Compute the CRC up to a z_word_t boundary. */
        while (len && ((uintptr_t)buf & (W - 1)) != 0) {
            len--;
            DO1;
        }

        /* Compute the CRC on as many N z_word_t blocks as are available. */
        blks = len / (N * W);
        len -= blks * N * W;
        words = (z_word_t const *)buf;

        z_word_t crc0, word0, comb;
#if N > 1
        z_word_t crc1, word1;
#if N > 2
        z_word_t crc2, word2;
#if N > 3
        z_word_t crc3, word3;
#if N > 4
        z_word_t crc4, word4;
#if N > 5
        z_word_t crc5, word5;
#endif
#endif
#endif
#endif
#endif
        /* Initialize the CRC for each braid. */
        crc0 = ZSWAPWORD(c);
#if N > 1
        crc1 = 0;
#if N > 2
        crc2 = 0;
#if N > 3
        crc3 = 0;
#if N > 4
        crc4 = 0;
#if N > 5
        crc5 = 0;
#endif
#endif
#endif
#endif
#endif
        /* Process the first blks-1 blocks, computing the CRCs on each braid independently. */
        while (--blks) {
            /* Load the word for each braid into registers. */
            word0 = crc0 ^ words[0];
#if N > 1
            word1 = crc1 ^ words[1];
#if N > 2
            word2 = crc2 ^ words[2];
#if N > 3
            word3 = crc3 ^ words[3];
#if N > 4
            word4 = crc4 ^ words[4];
#if N > 5
            word5 = crc5 ^ words[5];
#endif
#endif
#endif
#endif
#endif
            words += N;

            /* Compute and update the CRC for each word. The loop should get unrolled. */
            crc0 = BRAID_TABLE[0][word0 & 0xff];
#if N > 1
            crc1 = BRAID_TABLE[0][word1 & 0xff];
#if N > 2
            crc2 = BRAID_TABLE[0][word2 & 0xff];
#if N > 3
            crc3 = BRAID_TABLE[0][word3 & 0xff];
#if N > 4
            crc4 = BRAID_TABLE[0][word4 & 0xff];
#if N > 5
            crc5 = BRAID_TABLE[0][word5 & 0xff];
#endif
#endif
#endif
#endif
#endif
            for (k = 1; k < W; k++) {
                crc0 ^= BRAID_TABLE[k][(word0 >> (k << 3)) & 0xff];
#if N > 1
                crc1 ^= BRAID_TABLE[k][(word1 >> (k << 3)) & 0xff];
#if N > 2
                crc2 ^= BRAID_TABLE[k][(word2 >> (k << 3)) & 0xff];
#if N > 3
                crc3 ^= BRAID_TABLE[k][(word3 >> (k << 3)) & 0xff];
#if N > 4
                crc4 ^= BRAID_TABLE[k][(word4 >> (k << 3)) & 0xff];
#if N > 5
                crc5 ^= BRAID_TABLE[k][(word5 >> (k << 3)) & 0xff];
#endif
#endif
#endif
#endif
#endif
            }
        }

        /* Process the last block, combining the CRCs of the N braids at the same time. */
        comb = crc_word(crc0 ^ words[0]);
#if N > 1
        comb = crc_word(crc1 ^ words[1] ^ comb);
#if N > 2
        comb = crc_word(crc2 ^ words[2] ^ comb);
#if N > 3
        comb = crc_word(crc3 ^ words[3] ^ comb);
#if N > 4
        comb = crc_word(crc4 ^ words[4] ^ comb);
#if N > 5
        comb = crc_word(crc5 ^ words[5] ^ comb);
#endif
#endif
#endif
#endif
#endif
        words += N;
        c = ZSWAPWORD(comb);

        /* Update the pointer to the remaining bytes to process. */
        buf = (const unsigned char *)words;
    }

#endif /* W */

    /* Complete the computation of the CRC on any remaining bytes. */
    while (len >= 8) {
        len -= 8;
        DO8;
    }
    while (len) {
        len--;
        DO1;
    }

    /* Return the CRC, post-conditioned. */
    return c ^ 0xffffffff;
}
@ -0,0 +1,57 @@
/* crc32_braid_comb.c -- compute the CRC-32 of a data stream
 * Copyright (C) 1995-2022 Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 * This interleaved implementation of a CRC makes use of pipelined multiple
 * arithmetic-logic units, commonly found in modern CPU cores. It is due to
 * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
 */

#include "zbuild.h"
#include "zutil.h"
#include "crc32_braid_p.h"
#include "crc32_braid_tbl.h"
#include "crc32_braid_comb_p.h"

/* ========================================================================= */
static uint32_t crc32_combine_(uint32_t crc1, uint32_t crc2, z_off64_t len2) {
    return multmodp(x2nmodp(len2, 3), crc1) ^ crc2;
}
static uint32_t crc32_combine_gen_(z_off64_t len2) {
    return x2nmodp(len2, 3);
}
static uint32_t crc32_combine_op_(uint32_t crc1, uint32_t crc2, const uint32_t op) {
    return multmodp(op, crc1) ^ crc2;
}
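/* Usage sketch (added commentary, not part of the upstream sources): the
   combine functions let two independently computed CRCs be merged as if the
   buffers had been hashed in one pass. Given buffers a (len_a bytes) and
   b (len_b bytes):

       uint32_t crc_a  = crc32(0, a, len_a);
       uint32_t crc_b  = crc32(0, b, len_b);
       uint32_t crc_ab = crc32_combine(crc_a, crc_b, len_b);

   crc_ab then equals crc32(crc32(0, a, len_a), b, len_b). crc32_combine_gen
   and crc32_combine_op split the work so the len2-dependent operator can be
   computed once and reused across many combines of same-sized pieces. */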

/* ========================================================================= */

#ifdef ZLIB_COMPAT
unsigned long Z_EXPORT PREFIX(crc32_combine)(unsigned long crc1, unsigned long crc2, z_off_t len2) {
    return (unsigned long)crc32_combine_((uint32_t)crc1, (uint32_t)crc2, len2);
}
unsigned long Z_EXPORT PREFIX4(crc32_combine)(unsigned long crc1, unsigned long crc2, z_off64_t len2) {
    return (unsigned long)crc32_combine_((uint32_t)crc1, (uint32_t)crc2, len2);
}
unsigned long Z_EXPORT PREFIX(crc32_combine_gen)(z_off_t len2) {
    return crc32_combine_gen_(len2);
}
unsigned long Z_EXPORT PREFIX4(crc32_combine_gen)(z_off64_t len2) {
    return crc32_combine_gen_(len2);
}
unsigned long Z_EXPORT PREFIX(crc32_combine_op)(unsigned long crc1, unsigned long crc2, const unsigned long op) {
    return (unsigned long)crc32_combine_op_((uint32_t)crc1, (uint32_t)crc2, (uint32_t)op);
}
#else
uint32_t Z_EXPORT PREFIX4(crc32_combine)(uint32_t crc1, uint32_t crc2, z_off64_t len2) {
    return crc32_combine_(crc1, crc2, len2);
}
uint32_t Z_EXPORT PREFIX(crc32_combine_gen)(z_off64_t len2) {
    return crc32_combine_gen_(len2);
}
uint32_t Z_EXPORT PREFIX(crc32_combine_op)(uint32_t crc1, uint32_t crc2, const uint32_t op) {
    return crc32_combine_op_(crc1, crc2, op);
}
#endif

/* ========================================================================= */
@ -0,0 +1,42 @@
#ifndef CRC32_BRAID_COMB_P_H_
#define CRC32_BRAID_COMB_P_H_

/*
  Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
  reflected. For speed, this requires that a not be zero.
 */
static uint32_t multmodp(uint32_t a, uint32_t b) {
    uint32_t m, p;

    m = (uint32_t)1 << 31;
    p = 0;
    for (;;) {
        if (a & m) {
            p ^= b;
            if ((a & (m - 1)) == 0)
                break;
        }
        m >>= 1;
        b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
    }
    return p;
}
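/* Clarifying note (added commentary): this is schoolbook carry-less
   multiplication in GF(2)[x] modulo p(x), in the bit-reflected representation:
   the loop walks a from its high bit down, while b is repeatedly "multiplied
   by x" (a right shift here, folding in POLY on carry-out). In
   crc32_combine_() above, multmodp(x2nmodp(len2, 3), crc1) multiplies crc1 by
   x^(8*len2), i.e. it advances crc1 past len2 bytes of zeros; the constant 3
   is log2(8) for the 8 bits per byte. */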

/*
  Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
  initialized.
 */
static uint32_t x2nmodp(z_off64_t n, unsigned k) {
    uint32_t p;

    p = (uint32_t)1 << 31;  /* x^0 == 1 */
    while (n) {
        if (n & 1)
            p = multmodp(x2n_table[k & 31], p);
        n >>= 1;
        k++;
    }
    return p;
}

#endif /* CRC32_BRAID_COMB_P_H_ */
@ -0,0 +1,50 @@
#ifndef CRC32_BRAID_P_H_
#define CRC32_BRAID_P_H_

#include "zbuild.h"
#include "zendian.h"

/* Define N */
#ifdef Z_TESTN
#  define N Z_TESTN
#else
#  define N 5
#endif
#if N < 1 || N > 6
#  error N must be in 1..6
#endif

/*
  Define W and the associated z_word_t type. If W is not defined, then a
  braided calculation is not used, and the associated tables and code are not
  compiled.
 */
#ifdef Z_TESTW
#  if Z_TESTW-1 != -1
#    define W Z_TESTW
#  endif
#else
#  ifndef W
#    if defined(__x86_64__) || defined(__aarch64__) || defined(__powerpc64__)
#      define W 8
#    else
#      define W 4
#    endif
#  endif
#endif
#ifdef W
#  if W == 8
typedef uint64_t z_word_t;
#  else
#    undef W
#    define W 4
typedef uint32_t z_word_t;
#  endif
#endif
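/* Clarifying note (added commentary): on the 64-bit targets matched above
   (x86_64, AArch64, ppc64) W is 8 and z_word_t is uint64_t, so each braid
   consumes 8 bytes per step and one block covers N * W = 40 bytes; other
   targets fall back to 4-byte words. Defining Z_TESTW as 0 makes the
   Z_TESTW-1 != -1 check fail and leaves W undefined, which disables the
   braided path entirely and keeps only the byte-at-a-time loops in
   crc32_braid.c. */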

/* CRC polynomial. */
#define POLY 0xedb88320  /* p(x) reflected, with x^32 implied */

extern uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);

#endif /* CRC32_BRAID_P_H_ */
File diff suppressed because it is too large
@ -0,0 +1,33 @@
/* crc32_fold.c -- crc32 folding interface
 * Copyright (C) 2021 Nathan Moinvaziri
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#include "zbuild.h"
#include "functable.h"

#include "crc32_fold.h"

#include <limits.h>

Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) {
    crc->value = CRC32_INITIAL_VALUE;
    return crc->value;
}

Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
    crc->value = functable.crc32(crc->value, src, len);
    memcpy(dst, src, len);
}

Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
    /* Note: while this is basically the same thing as the vanilla CRC function, we still need
     * a functable entry for it so that we can generically dispatch to this function with the
     * same arguments for the versions that _do_ do a folding CRC but we don't want a copy. The
     * init_crc is an unused argument in this context */
    Z_UNUSED(init_crc);
    crc->value = functable.crc32(crc->value, src, len);
}

Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc) {
    return crc->value;
}
@ -0,0 +1,21 @@
/* crc32_fold.h -- crc32 folding interface
 * Copyright (C) 2021 Nathan Moinvaziri
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#ifndef CRC32_FOLD_H_
#define CRC32_FOLD_H_

#define CRC32_FOLD_BUFFER_SIZE (16 * 4)
/* sizeof(__m128i) * (4 folds) */

typedef struct crc32_fold_s {
    uint8_t fold[CRC32_FOLD_BUFFER_SIZE];
    uint32_t value;
} crc32_fold;

Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc);
Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc);

#endif
File diff suppressed because it is too large
@ -0,0 +1,408 @@
#ifndef DEFLATE_H_
#define DEFLATE_H_
/* deflate.h -- internal compression state
 * Copyright (C) 1995-2016 Jean-loup Gailly
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

/* WARNING: this file should *not* be used by applications. It is
   part of the implementation of the compression library and is
   subject to change. Applications should only use zlib.h.
 */

#include "zutil.h"
#include "zendian.h"
#include "adler32_fold.h"
#include "crc32_fold.h"

/* define NO_GZIP when compiling if you want to disable gzip header and
   trailer creation by deflate(). NO_GZIP would be used to avoid linking in
   the crc code when it is not needed. For shared libraries, gzip encoding
   should be left enabled. */
#ifndef NO_GZIP
#  define GZIP
#endif

/* ===========================================================================
 * Internal compression state.
 */

#define LENGTH_CODES 29
/* number of length codes, not counting the special END_BLOCK code */

#define LITERALS 256
/* number of literal bytes 0..255 */

#define L_CODES (LITERALS+1+LENGTH_CODES)
/* number of Literal or Length codes, including the END_BLOCK code */

#define D_CODES 30
/* number of distance codes */

#define BL_CODES 19
/* number of codes used to transfer the bit lengths */

#define HEAP_SIZE (2*L_CODES+1)
/* maximum heap size */

#define BIT_BUF_SIZE 64
/* size of bit buffer in bi_buf */

#define END_BLOCK 256
/* end of block literal code */

#define INIT_STATE      1  /* zlib header -> BUSY_STATE */
#ifdef GZIP
#  define GZIP_STATE    4  /* gzip header -> BUSY_STATE | EXTRA_STATE */
#  define EXTRA_STATE   5  /* gzip extra block -> NAME_STATE */
#  define NAME_STATE    6  /* gzip file name -> COMMENT_STATE */
#  define COMMENT_STATE 7  /* gzip comment -> HCRC_STATE */
#  define HCRC_STATE    8  /* gzip header CRC -> BUSY_STATE */
#endif
#define BUSY_STATE      2  /* deflate -> FINISH_STATE */
#define FINISH_STATE    3  /* stream complete */
#ifdef GZIP
#  define MAX_STATE HCRC_STATE
#else
#  define MAX_STATE FINISH_STATE
#endif
/* Stream status */

#define HASH_BITS 16u              /* log2(HASH_SIZE) */
#ifndef HASH_SIZE
#  define HASH_SIZE 65536u         /* number of elements in hash table */
#endif
#define HASH_MASK (HASH_SIZE - 1u) /* HASH_SIZE-1 */


/* Data structure describing a single value and its code string. */
typedef struct ct_data_s {
    union {
        uint16_t freq;  /* frequency count */
        uint16_t code;  /* bit string */
    } fc;
    union {
        uint16_t dad;   /* father node in Huffman tree */
        uint16_t len;   /* length of bit string */
    } dl;
} ct_data;

#define Freq fc.freq
#define Code fc.code
#define Dad  dl.dad
#define Len  dl.len

typedef struct static_tree_desc_s static_tree_desc;

typedef struct tree_desc_s {
    ct_data *dyn_tree;                  /* the dynamic tree */
    int max_code;                       /* largest code with non zero frequency */
    const static_tree_desc *stat_desc;  /* the corresponding static tree */
} tree_desc;

typedef uint16_t Pos;

/* A Pos is an index in the character window. We use short instead of int to
 * save space in the various tables.
 */
/* Type definitions for hash callbacks */
typedef struct internal_state deflate_state;

typedef uint32_t (* update_hash_cb)        (deflate_state *const s, uint32_t h, uint32_t val);
typedef void     (* insert_string_cb)      (deflate_state *const s, uint32_t str, uint32_t count);
typedef Pos      (* quick_insert_string_cb)(deflate_state *const s, uint32_t str);

struct internal_state {
    PREFIX3(stream) *strm;            /* pointer back to this zlib stream */
    unsigned char *pending_buf;       /* output still pending */
    unsigned char *pending_out;       /* next pending byte to output to the stream */
    uint32_t pending_buf_size;        /* size of pending_buf */
    uint32_t pending;                 /* nb of bytes in the pending buffer */
    int wrap;                         /* bit 0 true for zlib, bit 1 true for gzip */
    uint32_t gzindex;                 /* where in extra, name, or comment */
    PREFIX(gz_headerp) gzhead;        /* gzip header information to write */
    int status;                       /* as the name implies */
    int last_flush;                   /* value of flush param for previous deflate call */
    int reproducible;                 /* Whether reproducible compression results are required. */

    int block_open;
    /* Whether or not a block is currently open for the QUICK deflation scheme.
     * This is set to 1 if there is an active block, or 0 if the block was just closed.
     */

    /* used by deflate.c: */

    unsigned int w_size;              /* LZ77 window size (32K by default) */
    unsigned int w_bits;              /* log2(w_size)  (8..16) */
    unsigned int w_mask;              /* w_size - 1 */
    unsigned int lookahead;           /* number of valid bytes ahead in window */

    unsigned int high_water;
    /* High water mark offset in window for initialized bytes -- bytes above
     * this are set to zero in order to avoid memory check warnings when
     * longest match routines access bytes past the input. This is then
     * updated to the new high water mark.
     */

    unsigned int window_size;
    /* Actual size of window: 2*wSize, except when the user input buffer
     * is directly used as sliding window.
     */

    unsigned char *window;
    /* Sliding window. Input bytes are read into the second half of the window,
     * and move to the first half later to keep a dictionary of at least wSize
     * bytes. With this organization, matches are limited to a distance of
     * wSize-STD_MAX_MATCH bytes, but this ensures that IO is always
     * performed with a length multiple of the block size. Also, it limits
     * the window size to 64K, which is quite useful on MSDOS.
     * To do: use the user input buffer as sliding window.
     */

    Pos *prev;
    /* Link to older string with same hash index. To limit the size of this
     * array to 64K, this link is maintained only for the last 32K strings.
     * An index in this array is thus a window index modulo 32K.
     */

    Pos *head; /* Heads of the hash chains or 0. */

    uint32_t ins_h; /* hash index of string to be inserted */

    int block_start;
    /* Window position at the beginning of the current output block. Gets
     * negative when the window is moved backwards.
     */

    unsigned int match_length;  /* length of best match */
    Pos prev_match;             /* previous match */
    int match_available;        /* set if previous match exists */
    unsigned int strstart;      /* start of string to insert */
    unsigned int match_start;   /* start of matching string */

    unsigned int prev_length;
    /* Length of the best match at previous step. Matches not greater than this
     * are discarded. This is used in the lazy match evaluation.
     */

    unsigned int max_chain_length;
    /* To speed up deflation, hash chains are never searched beyond this length.
     * A higher limit improves compression ratio but degrades the speed.
     */

    unsigned int max_lazy_match;
    /* Attempt to find a better match only when the current match is strictly smaller
     * than this value. This mechanism is used only for compression levels >= 4.
     */
#   define max_insert_length max_lazy_match
    /* Insert new strings in the hash table only if the match length is not
     * greater than this length. This saves time but degrades compression.
     * max_insert_length is used only for compression levels <= 3.
     */

    update_hash_cb update_hash;
    insert_string_cb insert_string;
    quick_insert_string_cb quick_insert_string;
    /* Hash function callbacks that can be configured depending on the deflate
     * algorithm being used */

    int level;    /* compression level (1..9) */
    int strategy; /* favor or force Huffman coding */

    unsigned int good_match;
    /* Use a faster search when the previous match is longer than this */

    int nice_match; /* Stop searching when current match exceeds this */

    struct crc32_fold_s ALIGNED_(16) crc_fold;

    /* used by trees.c: */
    /* Didn't use ct_data typedef below to suppress compiler warning */
    struct ct_data_s dyn_ltree[HEAP_SIZE];   /* literal and length tree */
    struct ct_data_s dyn_dtree[2*D_CODES+1]; /* distance tree */
    struct ct_data_s bl_tree[2*BL_CODES+1];  /* Huffman tree for bit lengths */

    struct tree_desc_s l_desc;  /* desc. for literal tree */
    struct tree_desc_s d_desc;  /* desc. for distance tree */
    struct tree_desc_s bl_desc; /* desc. for bit length tree */

    uint16_t bl_count[MAX_BITS+1];
    /* number of codes at each bit length for an optimal tree */

    int heap[2*L_CODES+1]; /* heap used to build the Huffman trees */
    int heap_len;          /* number of elements in the heap */
    int heap_max;          /* element of largest frequency */
    /* The sons of heap[n] are heap[2*n] and heap[2*n+1]. heap[0] is not used.
     * The same heap array is used to build all trees.
     */

    unsigned char depth[2*L_CODES+1];
    /* Depth of each subtree used as tie breaker for trees of equal frequency
     */

    unsigned int lit_bufsize;
    /* Size of match buffer for literals/lengths. There are 4 reasons for
     * limiting lit_bufsize to 64K:
     *   - frequencies can be kept in 16 bit counters
     *   - if compression is not successful for the first block, all input
     *     data is still in the window so we can still emit a stored block even
     *     when input comes from standard input. (This can also be done for
     *     all blocks if lit_bufsize is not greater than 32K.)
     *   - if compression is not successful for a file smaller than 64K, we can
     *     even emit a stored file instead of a stored block (saving 5 bytes).
     *     This is applicable only for zip (not gzip or zlib).
     *   - creating new Huffman trees less frequently may not provide fast
     *     adaptation to changes in the input data statistics. (Take for
     *     example a binary file with poorly compressible code followed by
     *     a highly compressible string table.) Smaller buffer sizes give
     *     fast adaptation but have of course the overhead of transmitting
     *     trees more frequently.
     *   - I can't count above 4
     */

    unsigned char *sym_buf; /* buffer for distances and literals/lengths */
    unsigned int sym_next;  /* running index in sym_buf */
    unsigned int sym_end;   /* symbol table full when sym_next reaches this */

    unsigned long opt_len;    /* bit length of current block with optimal trees */
    unsigned long static_len; /* bit length of current block with static trees */
    unsigned int matches;     /* number of string matches in current block */
    unsigned int insert;      /* bytes at end of window left to insert */

    /* compressed_len and bits_sent are only used if ZLIB_DEBUG is defined */
    unsigned long compressed_len; /* total bit length of compressed file mod 2^32 */
    unsigned long bits_sent;      /* bit length of compressed data sent mod 2^32 */

    /* Reserved for future use and alignment purposes */
    char *reserved_p;

    uint64_t bi_buf;
    /* Output buffer. bits are inserted starting at the bottom (least significant bits). */

    int32_t bi_valid;
    /* Number of valid bits in bi_buf. All bits above the last valid bit are always zero. */

    /* Reserved for future use and alignment purposes */
    int32_t reserved[11];
} ALIGNED_(8);

typedef enum {
    need_more,      /* block not completed, need more input or more output */
    block_done,     /* block flush performed */
    finish_started, /* finish started, need only more output at next deflate */
    finish_done     /* finish done, accept no more input or output */
} block_state;

/* Output a byte on the stream.
 * IN assertion: there is enough room in pending_buf.
 */
#define put_byte(s, c) { \
    s->pending_buf[s->pending++] = (unsigned char)(c); \
}
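/* Clarifying note (added commentary): the put_short/put_uint32/put_uint64
   helpers below write through memcpy plus an explicit byte swap on
   mismatched-endian hosts, rather than shifting bytes out one at a time.
   memcpy keeps the stores legal on strict-alignment targets (compilers turn
   it into a single unaligned store where that is allowed), and the ZSWAP*
   macros keep the byte order on the wire independent of the host. */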

/* ===========================================================================
 * Output a short LSB first on the stream.
 * IN assertion: there is enough room in pending_buf.
 */
static inline void put_short(deflate_state *s, uint16_t w) {
#if BYTE_ORDER == BIG_ENDIAN
    w = ZSWAP16(w);
#endif
    memcpy(&s->pending_buf[s->pending], &w, sizeof(w));
    s->pending += 2;
}

/* ===========================================================================
 * Output a short MSB first on the stream.
 * IN assertion: there is enough room in pending_buf.
 */
static inline void put_short_msb(deflate_state *s, uint16_t w) {
#if BYTE_ORDER == LITTLE_ENDIAN
    w = ZSWAP16(w);
#endif
    memcpy(&s->pending_buf[s->pending], &w, sizeof(w));
    s->pending += 2;
}

/* ===========================================================================
 * Output a 32-bit unsigned int LSB first on the stream.
 * IN assertion: there is enough room in pending_buf.
 */
static inline void put_uint32(deflate_state *s, uint32_t dw) {
#if BYTE_ORDER == BIG_ENDIAN
    dw = ZSWAP32(dw);
#endif
    memcpy(&s->pending_buf[s->pending], &dw, sizeof(dw));
    s->pending += 4;
}

/* ===========================================================================
 * Output a 32-bit unsigned int MSB first on the stream.
 * IN assertion: there is enough room in pending_buf.
 */
static inline void put_uint32_msb(deflate_state *s, uint32_t dw) {
#if BYTE_ORDER == LITTLE_ENDIAN
    dw = ZSWAP32(dw);
#endif
    memcpy(&s->pending_buf[s->pending], &dw, sizeof(dw));
    s->pending += 4;
}

/* ===========================================================================
 * Output a 64-bit unsigned int LSB first on the stream.
 * IN assertion: there is enough room in pending_buf.
 */
static inline void put_uint64(deflate_state *s, uint64_t lld) {
#if BYTE_ORDER == BIG_ENDIAN
    lld = ZSWAP64(lld);
#endif
    memcpy(&s->pending_buf[s->pending], &lld, sizeof(lld));
    s->pending += 8;
}

#define MIN_LOOKAHEAD (STD_MAX_MATCH + STD_MIN_MATCH + 1)
/* Minimum amount of lookahead, except at the end of the input file.
 * See deflate.c for comments about the STD_MIN_MATCH+1.
 */

#define MAX_DIST(s) ((s)->w_size - MIN_LOOKAHEAD)
/* In order to simplify the code, particularly on 16 bit machines, match
 * distances are limited to MAX_DIST instead of WSIZE.
 */
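/* Worked numbers (added commentary, assuming zlib's usual constants
   STD_MAX_MATCH = 258 and STD_MIN_MATCH = 3): MIN_LOOKAHEAD is
   258 + 3 + 1 = 262, so with the default 32 KiB window (w_size = 32768)
   MAX_DIST(s) is 32768 - 262 = 32506. */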

#define WIN_INIT STD_MAX_MATCH
/* Number of bytes after end of data in window to initialize in order to avoid
   memory checker errors from longest match routines */


void Z_INTERNAL PREFIX(fill_window)(deflate_state *s);
void Z_INTERNAL slide_hash_c(deflate_state *s);

/* in trees.c */
void Z_INTERNAL zng_tr_init(deflate_state *s);
void Z_INTERNAL zng_tr_flush_block(deflate_state *s, char *buf, uint32_t stored_len, int last);
void Z_INTERNAL zng_tr_flush_bits(deflate_state *s);
void Z_INTERNAL zng_tr_align(deflate_state *s);
void Z_INTERNAL zng_tr_stored_block(deflate_state *s, char *buf, uint32_t stored_len, int last);
uint16_t Z_INTERNAL PREFIX(bi_reverse)(unsigned code, int len);
void Z_INTERNAL PREFIX(flush_pending)(PREFIX3(streamp) strm);
#define d_code(dist) ((dist) < 256 ? zng_dist_code[dist] : zng_dist_code[256+((dist)>>7)])
/* Mapping from a distance to a distance code. dist is the distance - 1 and
 * must not have side effects. zng_dist_code[256] and zng_dist_code[257] are never
 * used.
 */
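/* Example (added commentary): for an actual match distance of 300 the macro
   is invoked with dist = 299; since 299 >= 256 it indexes
   zng_dist_code[256 + (299 >> 7)] = zng_dist_code[258], the entry for the
   128-byte bucket containing that distance. Distances 1..256 (dist 0..255)
   are looked up directly. */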

/* Bit buffer and compress bits calculation debugging */
#ifdef ZLIB_DEBUG
#  define cmpr_bits_add(s, len)  s->compressed_len += (len)
#  define cmpr_bits_align(s)     s->compressed_len = (s->compressed_len + 7) & ~7L
#  define sent_bits_add(s, bits) s->bits_sent += (bits)
#  define sent_bits_align(s)     s->bits_sent = (s->bits_sent + 7) & ~7L
#else
#  define cmpr_bits_add(s, len)  Z_UNUSED(len)
#  define cmpr_bits_align(s)
#  define sent_bits_add(s, bits) Z_UNUSED(bits)
#  define sent_bits_align(s)
#endif

#endif /* DEFLATE_H_ */
@ -0,0 +1,102 @@
/* deflate_fast.c -- compress data using the fast strategy of deflation algorithm
 *
 * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"

/* ===========================================================================
 * Compress as much as possible from the input stream, return the current
 * block state.
 * This function does not perform lazy evaluation of matches and inserts
 * new strings in the dictionary only for unmatched strings or for short
 * matches. It is used only for the fast compression options.
 */
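/* Clarifying note (added commentary): zlib-ng distinguishes STD_MIN_MATCH,
   the shortest match the deflate format can encode (3 bytes), from
   WANT_MIN_MATCH, the shortest match this implementation considers worth
   emitting. Matches below WANT_MIN_MATCH fall through to the literal path
   below, which tends to be faster and rarely hurts the compression ratio. */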
Z_INTERNAL block_state deflate_fast(deflate_state *s, int flush) {
    Pos hash_head;          /* head of the hash chain */
    int bflush = 0;         /* set if current block must be flushed */
    int64_t dist;
    uint32_t match_len = 0;

    for (;;) {
        /* Make sure that we always have enough lookahead, except
         * at the end of the input file. We need STD_MAX_MATCH bytes
         * for the next match, plus WANT_MIN_MATCH bytes to insert the
         * string following the next match.
         */
        if (s->lookahead < MIN_LOOKAHEAD) {
            PREFIX(fill_window)(s);
            if (UNLIKELY(s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH)) {
                return need_more;
            }
            if (UNLIKELY(s->lookahead == 0))
                break; /* flush the current block */
        }

        /* Insert the string window[strstart .. strstart+2] in the
         * dictionary, and set hash_head to the head of the hash chain:
         */
        if (s->lookahead >= WANT_MIN_MATCH) {
            hash_head = functable.quick_insert_string(s, s->strstart);
            dist = (int64_t)s->strstart - hash_head;

            /* Find the longest match, discarding those <= prev_length.
             * At this point we have always match length < WANT_MIN_MATCH
             */
            if (dist <= MAX_DIST(s) && dist > 0 && hash_head != 0) {
                /* To simplify the code, we prevent matches with the string
                 * of window index 0 (in particular we have to avoid a match
                 * of the string with itself at the start of the input file).
                 */
                match_len = functable.longest_match(s, hash_head);
                /* longest_match() sets match_start */
            }
        }

        if (match_len >= WANT_MIN_MATCH) {
            check_match(s, s->strstart, s->match_start, match_len);

            bflush = zng_tr_tally_dist(s, s->strstart - s->match_start, match_len - STD_MIN_MATCH);

            s->lookahead -= match_len;

            /* Insert new strings in the hash table only if the match length
             * is not too large. This saves time but degrades compression.
             */
            if (match_len <= s->max_insert_length && s->lookahead >= WANT_MIN_MATCH) {
                match_len--; /* string at strstart already in table */
                s->strstart++;

                functable.insert_string(s, s->strstart, match_len);
                s->strstart += match_len;
            } else {
                s->strstart += match_len;
                functable.quick_insert_string(s, s->strstart + 2 - STD_MIN_MATCH);

                /* If lookahead < STD_MIN_MATCH, ins_h is garbage, but it does not
                 * matter since it will be recomputed at next deflate call.
                 */
            }
            match_len = 0;
        } else {
            /* No match, output a literal byte */
            bflush = zng_tr_tally_lit(s, s->window[s->strstart]);
            s->lookahead--;
            s->strstart++;
        }
        if (UNLIKELY(bflush))
            FLUSH_BLOCK(s, 0);
    }
    s->insert = s->strstart < (STD_MIN_MATCH - 1) ? s->strstart : (STD_MIN_MATCH - 1);
    if (UNLIKELY(flush == Z_FINISH)) {
        FLUSH_BLOCK(s, 1);
        return finish_done;
    }
    if (UNLIKELY(s->sym_next))
        FLUSH_BLOCK(s, 0);
    return block_done;
}
@ -0,0 +1,45 @@
/* deflate_huff.c -- compress data using huffman encoding only strategy
 *
 * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"

/* ===========================================================================
 * For Z_HUFFMAN_ONLY, do not look for matches. Do not maintain a hash table.
 * (It will be regenerated if this run of deflate switches away from Huffman.)
 */
Z_INTERNAL block_state deflate_huff(deflate_state *s, int flush) {
    int bflush = 0; /* set if current block must be flushed */

    for (;;) {
        /* Make sure that we have a literal to write. */
        if (s->lookahead == 0) {
            PREFIX(fill_window)(s);
            if (s->lookahead == 0) {
                if (flush == Z_NO_FLUSH)
                    return need_more;
                break; /* flush the current block */
            }
        }

        /* Output a literal byte */
        bflush = zng_tr_tally_lit(s, s->window[s->strstart]);
        s->lookahead--;
        s->strstart++;
        if (bflush)
            FLUSH_BLOCK(s, 0);
    }
    s->insert = 0;
    if (flush == Z_FINISH) {
        FLUSH_BLOCK(s, 1);
        return finish_done;
    }
    if (s->sym_next)
        FLUSH_BLOCK(s, 0);
    return block_done;
}
@ -0,0 +1,293 @@
/* deflate_medium.c -- The deflate_medium deflate strategy
 *
 * Copyright (C) 2013 Intel Corporation. All rights reserved.
 * Authors:
 *  Arjan van de Ven <arjan@linux.intel.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#ifndef NO_MEDIUM_STRATEGY
#include "zbuild.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"

struct match {
    uint16_t match_start;
    uint16_t match_length;
    uint16_t strstart;
    uint16_t orgstart;
};

static int emit_match(deflate_state *s, struct match match) {
    int bflush = 0;

    /* Matches that are not long enough need to be emitted as literals. */
    if (match.match_length < WANT_MIN_MATCH) {
        while (match.match_length) {
            bflush += zng_tr_tally_lit(s, s->window[match.strstart]);
            s->lookahead--;
            match.strstart++;
            match.match_length--;
        }
        return bflush;
    }

    check_match(s, match.strstart, match.match_start, match.match_length);

    bflush += zng_tr_tally_dist(s, match.strstart - match.match_start, match.match_length - STD_MIN_MATCH);

    s->lookahead -= match.match_length;
    return bflush;
}

static void insert_match(deflate_state *s, struct match match) {
    if (UNLIKELY(s->lookahead <= (unsigned int)(match.match_length + WANT_MIN_MATCH)))
        return;

    /* Matches that are not long enough need to be emitted as literals. */
    if (LIKELY(match.match_length < WANT_MIN_MATCH)) {
        match.strstart++;
        match.match_length--;
        if (UNLIKELY(match.match_length > 0)) {
            if (match.strstart >= match.orgstart) {
                if (match.strstart + match.match_length - 1 >= match.orgstart) {
                    functable.insert_string(s, match.strstart, match.match_length);
                } else {
                    functable.insert_string(s, match.strstart, match.orgstart - match.strstart + 1);
                }
                match.strstart += match.match_length;
                match.match_length = 0;
            }
        }
        return;
    }

    /* Insert new strings in the hash table only if the match length
     * is not too large. This saves time but degrades compression.
     */
    if (match.match_length <= 16 * s->max_insert_length && s->lookahead >= WANT_MIN_MATCH) {
        match.match_length--; /* string at strstart already in table */
        match.strstart++;

        if (LIKELY(match.strstart >= match.orgstart)) {
            if (LIKELY(match.strstart + match.match_length - 1 >= match.orgstart)) {
                functable.insert_string(s, match.strstart, match.match_length);
            } else {
                functable.insert_string(s, match.strstart, match.orgstart - match.strstart + 1);
            }
        } else if (match.orgstart < match.strstart + match.match_length) {
            functable.insert_string(s, match.orgstart, match.strstart + match.match_length - match.orgstart);
        }
        match.strstart += match.match_length;
        match.match_length = 0;
    } else {
        match.strstart += match.match_length;
        match.match_length = 0;

        if (match.strstart >= (STD_MIN_MATCH - 2))
            functable.quick_insert_string(s, match.strstart + 2 - STD_MIN_MATCH);

        /* If lookahead < WANT_MIN_MATCH, ins_h is garbage, but it does not
         * matter since it will be recomputed at next deflate call.
         */
    }
}

static void fizzle_matches(deflate_state *s, struct match *current, struct match *next) {
    Pos limit;
    unsigned char *match, *orig;
    int changed = 0;
    struct match c, n;
    /* step zero: sanity checks */

    if (current->match_length <= 1)
        return;

    if (UNLIKELY(current->match_length > 1 + next->match_start))
        return;

    if (UNLIKELY(current->match_length > 1 + next->strstart))
        return;

    match = s->window - current->match_length + 1 + next->match_start;
    orig  = s->window - current->match_length + 1 + next->strstart;

    /* quick exit check.. if this fails then don't bother with anything else */
    if (LIKELY(*match != *orig))
        return;

    c = *current;
    n = *next;

    /* step one: try to move the "next" match to the left as much as possible */
    limit = next->strstart > MAX_DIST(s) ? next->strstart - (Pos)MAX_DIST(s) : 0;

    match = s->window + n.match_start - 1;
    orig = s->window + n.strstart - 1;

    while (*match == *orig) {
        if (UNLIKELY(c.match_length < 1))
            break;
        if (UNLIKELY(n.strstart <= limit))
            break;
        if (UNLIKELY(n.match_length >= 256))
            break;
        if (UNLIKELY(n.match_start <= 1))
            break;

        n.strstart--;
        n.match_start--;
        n.match_length++;
        c.match_length--;
        match--;
        orig--;
        changed++;
    }

    if (!changed)
        return;

    if (c.match_length <= 1 && n.match_length != 2) {
        n.orgstart++;
        *current = c;
        *next = n;
    } else {
        return;
    }
}
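/* Clarifying note (added commentary): fizzle_matches trades bytes between a
   pair of adjacent matches. Working on local copies c and n, it extends the
   next match leftward one byte at a time (shrinking the current match by the
   same amount) for as long as the byte preceding next's source also matches.
   The trade is committed only when it reduced the current match to a bare
   literal (length <= 1) without leaving next at the awkward length of 2;
   otherwise the copies are discarded and both matches stay as found. */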
||||
|
||||
Z_INTERNAL block_state deflate_medium(deflate_state *s, int flush) { |
||||
/* Align the first struct to start on a new cacheline, this allows us to fit both structs in one cacheline */ |
||||
ALIGNED_(16) struct match current_match; |
||||
struct match next_match; |
||||
|
||||
/* For levels below 5, don't check the next position for a better match */ |
||||
int early_exit = s->level < 5; |
||||
|
||||
memset(¤t_match, 0, sizeof(struct match)); |
||||
memset(&next_match, 0, sizeof(struct match)); |
||||
|
||||
for (;;) { |
||||
Pos hash_head = 0; /* head of the hash chain */ |
||||
int bflush = 0; /* set if current block must be flushed */ |
||||
int64_t dist; |
||||
|
||||
/* Make sure that we always have enough lookahead, except
|
||||
* at the end of the input file. We need STD_MAX_MATCH bytes |
||||
* for the next match, plus WANT_MIN_MATCH bytes to insert the |
||||
* string following the next current_match. |
||||
*/ |
||||
if (s->lookahead < MIN_LOOKAHEAD) { |
||||
PREFIX(fill_window)(s); |
||||
if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) { |
||||
return need_more; |
||||
} |
||||
if (UNLIKELY(s->lookahead == 0)) |
||||
break; /* flush the current block */ |
||||
next_match.match_length = 0; |
||||
} |
||||
|
||||
/* Insert the string window[strstart .. strstart+2] in the
|
||||
* dictionary, and set hash_head to the head of the hash chain: |
||||
*/ |
||||
|
||||
/* If we already have a future match from a previous round, just use that */ |
||||
if (!early_exit && next_match.match_length > 0) { |
||||
current_match = next_match; |
||||
next_match.match_length = 0; |
||||
} else { |
||||
hash_head = 0; |
||||
if (s->lookahead >= WANT_MIN_MATCH) { |
||||
hash_head = functable.quick_insert_string(s, s->strstart); |
||||
} |
||||
|
||||
current_match.strstart = (uint16_t)s->strstart; |
||||
current_match.orgstart = current_match.strstart; |
||||
|
||||
/* Find the longest match, discarding those <= prev_length.
|
||||
* At this point we have always match_length < WANT_MIN_MATCH |
||||
*/ |
||||
|
||||
dist = (int64_t)s->strstart - hash_head; |
||||
if (dist <= MAX_DIST(s) && dist > 0 && hash_head != 0) { |
||||
/* To simplify the code, we prevent matches with the string
|
||||
* of window index 0 (in particular we have to avoid a match |
||||
* of the string with itself at the start of the input file). |
||||
*/ |
||||
current_match.match_length = (uint16_t)functable.longest_match(s, hash_head); |
||||
current_match.match_start = (uint16_t)s->match_start; |
||||
if (UNLIKELY(current_match.match_length < WANT_MIN_MATCH)) |
||||
current_match.match_length = 1; |
||||
if (UNLIKELY(current_match.match_start >= current_match.strstart)) { |
||||
/* this can happen due to some restarts */ |
||||
current_match.match_length = 1; |
||||
} |
||||
} else { |
||||
/* Set up the match to be a 1 byte literal */ |
||||
current_match.match_start = 0; |
||||
current_match.match_length = 1; |
||||
} |
||||
} |
||||
|
||||
insert_match(s, current_match); |
||||
|
||||
/* now, look ahead one */ |
||||
if (LIKELY(!early_exit && s->lookahead > MIN_LOOKAHEAD && (uint32_t)(current_match.strstart + current_match.match_length) < (s->window_size - MIN_LOOKAHEAD))) { |
||||
s->strstart = current_match.strstart + current_match.match_length; |
||||
hash_head = functable.quick_insert_string(s, s->strstart); |
||||
|
||||
next_match.strstart = (uint16_t)s->strstart; |
||||
next_match.orgstart = next_match.strstart; |
||||
|
||||
/* Find the longest match, discarding those <= prev_length.
|
||||
* At this point we have always match_length < WANT_MIN_MATCH |
||||
*/ |
||||
|
||||
dist = (int64_t)s->strstart - hash_head; |
||||
if (dist <= MAX_DIST(s) && dist > 0 && hash_head != 0) { |
||||
/* To simplify the code, we prevent matches with the string
|
||||
* of window index 0 (in particular we have to avoid a match |
||||
* of the string with itself at the start of the input file). |
||||
*/ |
||||
next_match.match_length = (uint16_t)functable.longest_match(s, hash_head); |
||||
next_match.match_start = (uint16_t)s->match_start; |
||||
if (UNLIKELY(next_match.match_start >= next_match.strstart)) { |
||||
/* this can happen due to some restarts */ |
||||
next_match.match_length = 1; |
||||
} |
||||
if (next_match.match_length < WANT_MIN_MATCH) |
||||
next_match.match_length = 1; |
||||
else |
||||
fizzle_matches(s, ¤t_match, &next_match); |
||||
} else { |
||||
/* Set up the match to be a 1 byte literal */ |
||||
next_match.match_start = 0; |
||||
next_match.match_length = 1; |
||||
} |
||||
|
||||
s->strstart = current_match.strstart; |
||||
} else { |
||||
next_match.match_length = 0; |
||||
} |
||||
|
||||
/* now emit the current match */ |
||||
bflush = emit_match(s, current_match); |
||||
|
||||
/* move the "cursor" forward */ |
||||
s->strstart += current_match.match_length; |
||||
|
||||
if (UNLIKELY(bflush)) |
||||
FLUSH_BLOCK(s, 0); |
||||
} |
||||
s->insert = s->strstart < (STD_MIN_MATCH - 1) ? s->strstart : (STD_MIN_MATCH - 1); |
||||
if (flush == Z_FINISH) { |
||||
FLUSH_BLOCK(s, 1); |
||||
return finish_done; |
||||
} |
||||
if (UNLIKELY(s->sym_next)) |
||||
FLUSH_BLOCK(s, 0); |
||||
|
||||
return block_done; |
||||
} |
||||
#endif |
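For orientation, a minimal sketch of reaching this strategy through the public API. That a mid compression level such as 5 routes to deflate_medium follows zlib-ng's configuration table and is assumed here, not shown in this hunk; the helper name is hypothetical.

#include <string.h>
#include "zlib.h"  /* ZLIB_COMPAT build; use "zlib-ng.h" and zng_ prefixes otherwise */

/* Hypothetical one-shot compress at level 5 (assumed to select deflate_medium). */
static int compress_level5(unsigned char *dst, unsigned long *dst_len,
                           const unsigned char *src, unsigned long src_len) {
    z_stream strm;
    memset(&strm, 0, sizeof(strm));
    if (deflateInit2(&strm, 5, Z_DEFLATED, 15, 8, Z_DEFAULT_STRATEGY) != Z_OK)
        return -1;
    strm.next_in = (unsigned char *)src;
    strm.avail_in = (unsigned int)src_len;
    strm.next_out = dst;
    strm.avail_out = (unsigned int)*dst_len;
    int ret = deflate(&strm, Z_FINISH);
    *dst_len = strm.total_out;
    deflateEnd(&strm);
    return ret == Z_STREAM_END ? 0 : -1;
}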
@ -0,0 +1,116 @@
/* deflate_p.h -- Private inline functions and macros shared with more than
 * one deflate method
 *
 * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 */

#ifndef DEFLATE_P_H
#define DEFLATE_P_H

/* Forward declare common non-inlined functions declared in deflate.c */

#ifdef ZLIB_DEBUG
/* ===========================================================================
 * Check that the match at match_start is indeed a match.
 */
static inline void check_match(deflate_state *s, Pos start, Pos match, int length) {
    /* check that the match length is valid */
    if (length < STD_MIN_MATCH || length > STD_MAX_MATCH) {
        fprintf(stderr, " start %u, match %u, length %d\n", start, match, length);
        z_error("invalid match length");
    }
    /* check that the match isn't at the same position as the start string */
    if (match == start) {
        fprintf(stderr, " start %u, match %u, length %d\n", start, match, length);
        z_error("invalid match position");
    }
    /* check that the match is indeed a match */
    if (memcmp(s->window + match, s->window + start, length) != 0) {
        int32_t i = 0;
        fprintf(stderr, " start %u, match %u, length %d\n", start, match, length);
        do {
            fprintf(stderr, "  %03d: match [%02x] start [%02x]\n", i++,
                    s->window[match++], s->window[start++]);
        } while (--length != 0);
        z_error("invalid match");
    }
    if (z_verbose > 1) {
        fprintf(stderr, "\\[%u,%d]", start-match, length);
        do {
            putc(s->window[start++], stderr);
        } while (--length != 0);
    }
}
#else
#define check_match(s, start, match, length)
#endif

Z_INTERNAL void PREFIX(flush_pending)(PREFIX3(stream) *strm);
Z_INTERNAL unsigned PREFIX(read_buf)(PREFIX3(stream) *strm, unsigned char *buf, unsigned size);

/* ===========================================================================
 * Save the match info and tally the frequency counts. Return true if
 * the current block must be flushed.
 */

extern const unsigned char Z_INTERNAL zng_length_code[];
extern const unsigned char Z_INTERNAL zng_dist_code[];

static inline int zng_tr_tally_lit(deflate_state *s, unsigned char c) {
    /* c is the unmatched char */
    s->sym_buf[s->sym_next++] = 0;
    s->sym_buf[s->sym_next++] = 0;
    s->sym_buf[s->sym_next++] = c;
    s->dyn_ltree[c].Freq++;
    Tracevv((stderr, "%c", c));
    Assert(c <= (STD_MAX_MATCH-STD_MIN_MATCH), "zng_tr_tally: bad literal");
    return (s->sym_next == s->sym_end);
}

static inline int zng_tr_tally_dist(deflate_state *s, uint32_t dist, uint32_t len) {
    /* dist: distance of matched string */
    /* len: match length-STD_MIN_MATCH */
    s->sym_buf[s->sym_next++] = (uint8_t)(dist);
    s->sym_buf[s->sym_next++] = (uint8_t)(dist >> 8);
    s->sym_buf[s->sym_next++] = (uint8_t)len;
    s->matches++;
    dist--;
    Assert(dist < MAX_DIST(s) && (uint16_t)d_code(dist) < (uint16_t)D_CODES,
           "zng_tr_tally: bad match");

    s->dyn_ltree[zng_length_code[len]+LITERALS+1].Freq++;
    s->dyn_dtree[d_code(dist)].Freq++;
    return (s->sym_next == s->sym_end);
}
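To make the three-byte sym_buf layout above concrete, a small sketch of the inverse operation; the function name is hypothetical, and only the packing is taken from the tally helpers (the real reader lives on the trees side).

/* Hypothetical decode of one tallied entry, mirroring the packing above:
 * two zero bytes mark a literal, otherwise a (distance, length) pair. */
static void read_sym(const unsigned char *sym_buf, unsigned idx,
                     unsigned *dist, unsigned *lc) {
    *dist = sym_buf[idx] | ((unsigned)sym_buf[idx + 1] << 8);
    *lc = sym_buf[idx + 2];    /* literal byte, or match length - STD_MIN_MATCH */
}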

/* ===========================================================================
 * Flush the current block, with given end-of-file flag.
 * IN assertion: strstart is set to the end of the current match.
 */
#define FLUSH_BLOCK_ONLY(s, last) { \
    zng_tr_flush_block(s, (s->block_start >= 0 ? \
        (char *)&s->window[(unsigned)s->block_start] : \
        NULL), \
        (uint32_t)((int)s->strstart - s->block_start), \
        (last)); \
    s->block_start = (int)s->strstart; \
    PREFIX(flush_pending)(s->strm); \
}

/* Same but force premature exit if necessary. */
#define FLUSH_BLOCK(s, last) { \
    FLUSH_BLOCK_ONLY(s, last); \
    if (s->strm->avail_out == 0) return (last) ? finish_started : need_more; \
}

/* Maximum stored block length in deflate format (not including header). */
#define MAX_STORED 65535

/* Compression function. Returns the block state after the call. */
typedef block_state (*compress_func) (deflate_state *s, int flush);
/* Match function. Returns the longest match. */
typedef uint32_t (*match_func) (deflate_state *const s, Pos cur_match);

#endif
@ -0,0 +1,129 @@
/*
 * The deflate_quick deflate strategy, designed to be used when cycles are
 * at a premium.
 *
 * Copyright (C) 2013 Intel Corporation. All rights reserved.
 * Authors:
 *  Wajdi Feghali   <wajdi.k.feghali@intel.com>
 *  Jim Guilford    <james.guilford@intel.com>
 *  Vinodh Gopal    <vinodh.gopal@intel.com>
 *  Erdinc Ozturk   <erdinc.ozturk@intel.com>
 *  Jim Kukunas     <james.t.kukunas@linux.intel.com>
 *
 * Portions are Copyright (C) 2016 12Sided Technology, LLC.
 * Author:
 *  Phil Vachon     <pvachon@12sidedtech.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "zutil_p.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"
#include "trees_emit.h"

extern const ct_data static_ltree[L_CODES+2];
extern const ct_data static_dtree[D_CODES];

#define QUICK_START_BLOCK(s, last) { \
    zng_tr_emit_tree(s, STATIC_TREES, last); \
    s->block_open = 1 + (int)last; \
    s->block_start = (int)s->strstart; \
}

#define QUICK_END_BLOCK(s, last) { \
    if (s->block_open) { \
        zng_tr_emit_end_block(s, static_ltree, last); \
        s->block_open = 0; \
        s->block_start = (int)s->strstart; \
        PREFIX(flush_pending)(s->strm); \
        if (s->strm->avail_out == 0) \
            return (last) ? finish_started : need_more; \
    } \
}

Z_INTERNAL block_state deflate_quick(deflate_state *s, int flush) {
    Pos hash_head;
    int64_t dist;
    unsigned match_len, last;

    last = (flush == Z_FINISH) ? 1 : 0;
    if (UNLIKELY(last && s->block_open != 2)) {
        /* Emit end of previous block */
        QUICK_END_BLOCK(s, 0);
        /* Emit start of last block */
        QUICK_START_BLOCK(s, last);
    } else if (UNLIKELY(s->block_open == 0 && s->lookahead > 0)) {
        /* Start new block only when we have lookahead data, so that if no
           input data is given an empty block will not be written */
        QUICK_START_BLOCK(s, last);
    }

    for (;;) {
        if (UNLIKELY(s->pending + ((BIT_BUF_SIZE + 7) >> 3) >= s->pending_buf_size)) {
            PREFIX(flush_pending)(s->strm);
            if (s->strm->avail_out == 0) {
                return (last && s->strm->avail_in == 0 && s->bi_valid == 0 && s->block_open == 0) ? finish_started : need_more;
            }
        }

        if (UNLIKELY(s->lookahead < MIN_LOOKAHEAD)) {
            PREFIX(fill_window)(s);
            if (UNLIKELY(s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH)) {
                return need_more;
            }
            if (UNLIKELY(s->lookahead == 0))
                break;

            if (UNLIKELY(s->block_open == 0)) {
                /* Start new block when we have lookahead data, so that if no
                   input data is given an empty block will not be written */
                QUICK_START_BLOCK(s, last);
            }
        }

        if (LIKELY(s->lookahead >= WANT_MIN_MATCH)) {
            hash_head = functable.quick_insert_string(s, s->strstart);
            dist = (int64_t)s->strstart - hash_head;

            if (dist <= MAX_DIST(s) && dist > 0) {
                const uint8_t *str_start = s->window + s->strstart;
                const uint8_t *match_start = s->window + hash_head;

                if (zng_memcmp_2(str_start, match_start) == 0) {
                    match_len = functable.compare256(str_start+2, match_start+2) + 2;

                    if (match_len >= WANT_MIN_MATCH) {
                        if (UNLIKELY(match_len > s->lookahead))
                            match_len = s->lookahead;
                        if (UNLIKELY(match_len > STD_MAX_MATCH))
                            match_len = STD_MAX_MATCH;

                        check_match(s, s->strstart, hash_head, match_len);

                        zng_tr_emit_dist(s, static_ltree, static_dtree, match_len - STD_MIN_MATCH, (uint32_t)dist);
                        s->lookahead -= match_len;
                        s->strstart += match_len;
                        continue;
                    }
                }
            }
        }

        zng_tr_emit_lit(s, static_ltree, s->window[s->strstart]);
        s->strstart++;
        s->lookahead--;
    }

    s->insert = s->strstart < (STD_MIN_MATCH - 1) ? s->strstart : (STD_MIN_MATCH - 1);
    if (UNLIKELY(last)) {
        QUICK_END_BLOCK(s, 1);
        return finish_done;
    }

    QUICK_END_BLOCK(s, 0);
    return block_done;
}
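A short sketch of selecting this path from user code. That level 1 is wired to deflate_quick in zlib-ng's level table is an assumption consistent with the "cycles at a premium" design, not something this file declares; the helper name is hypothetical and the includes match the sketch after deflate_medium.

/* Hypothetical selector (assumes zlib-ng maps level 1 to deflate_quick). */
static int init_quick(z_stream *strm) {
    memset(strm, 0, sizeof(*strm));
    return deflateInit2(strm, 1, Z_DEFLATED, 15, 8, Z_DEFAULT_STRATEGY);
}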
@ -0,0 +1,85 @@
/* deflate_rle.c -- compress data using RLE strategy of deflation algorithm
 *
 * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "compare256_rle.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"

#ifdef UNALIGNED_OK
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
#   define compare256_rle compare256_rle_unaligned_64
# elif defined(HAVE_BUILTIN_CTZ)
#   define compare256_rle compare256_rle_unaligned_32
# else
#   define compare256_rle compare256_rle_unaligned_16
# endif
#else
# define compare256_rle compare256_rle_c
#endif

/* ===========================================================================
 * For Z_RLE, simply look for runs of bytes, generate matches only of distance
 * one. Do not maintain a hash table. (It will be regenerated if this run of
 * deflate switches away from Z_RLE.)
 */
Z_INTERNAL block_state deflate_rle(deflate_state *s, int flush) {
    int bflush = 0;          /* set if current block must be flushed */
    unsigned char *scan;     /* scan goes up to strend for length of run */
    uint32_t match_len = 0;

    for (;;) {
        /* Make sure that we always have enough lookahead, except
         * at the end of the input file. We need STD_MAX_MATCH bytes
         * for the longest run, plus one for the unrolled loop.
         */
        if (s->lookahead <= STD_MAX_MATCH) {
            PREFIX(fill_window)(s);
            if (s->lookahead <= STD_MAX_MATCH && flush == Z_NO_FLUSH)
                return need_more;
            if (s->lookahead == 0)
                break; /* flush the current block */
        }

        /* See how many times the previous byte repeats */
        if (s->lookahead >= STD_MIN_MATCH && s->strstart > 0) {
            scan = s->window + s->strstart - 1;
            if (scan[0] == scan[1] && scan[1] == scan[2]) {
                match_len = compare256_rle(scan, scan+3)+2;
                match_len = MIN(match_len, s->lookahead);
                match_len = MIN(match_len, STD_MAX_MATCH);
            }
            Assert(scan+match_len <= s->window + s->window_size - 1, "wild scan");
        }

        /* Emit match if have run of STD_MIN_MATCH or longer, else emit literal */
        if (match_len >= STD_MIN_MATCH) {
            check_match(s, s->strstart, s->strstart - 1, match_len);

            bflush = zng_tr_tally_dist(s, 1, match_len - STD_MIN_MATCH);

            s->lookahead -= match_len;
            s->strstart += match_len;
            match_len = 0;
        } else {
            /* No match, output a literal byte */
            bflush = zng_tr_tally_lit(s, s->window[s->strstart]);
            s->lookahead--;
            s->strstart++;
        }
        if (bflush)
            FLUSH_BLOCK(s, 0);
    }
    s->insert = 0;
    if (flush == Z_FINISH) {
        FLUSH_BLOCK(s, 1);
        return finish_done;
    }
    if (s->sym_next)
        FLUSH_BLOCK(s, 0);
    return block_done;
}
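Unlike the level-based strategies, this path is selected explicitly through the strategy argument rather than the level; a minimal sketch (helper name hypothetical, includes as in the earlier sketches):

/* Sketch: Z_RLE is reached via the strategy argument; distance-1 matches
 * as produced by deflate_rle() are cheap for e.g. scanline-style data. */
static int init_rle(z_stream *strm) {
    memset(strm, 0, sizeof(*strm));
    return deflateInit2(strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED, 15, 8, Z_RLE);
}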
@ -0,0 +1,143 @@
/* deflate_slow.c -- compress data using the slow strategy of deflation algorithm
 *
 * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"

/* ===========================================================================
 * Same as deflate_medium, but achieves better compression. We use a lazy
 * evaluation for matches: a match is finally adopted only if there is
 * no better match at the next window position.
 */
Z_INTERNAL block_state deflate_slow(deflate_state *s, int flush) {
    Pos hash_head;          /* head of hash chain */
    int bflush;             /* set if current block must be flushed */
    int64_t dist;
    uint32_t match_len;
    match_func *longest_match;

    if (s->max_chain_length <= 1024)
        longest_match = &functable.longest_match;
    else
        longest_match = &functable.longest_match_slow;

    /* Process the input block. */
    for (;;) {
        /* Make sure that we always have enough lookahead, except
         * at the end of the input file. We need STD_MAX_MATCH bytes
         * for the next match, plus WANT_MIN_MATCH bytes to insert the
         * string following the next match.
         */
        if (s->lookahead < MIN_LOOKAHEAD) {
            PREFIX(fill_window)(s);
            if (UNLIKELY(s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH)) {
                return need_more;
            }
            if (UNLIKELY(s->lookahead == 0))
                break; /* flush the current block */
        }

        /* Insert the string window[strstart .. strstart+2] in the
         * dictionary, and set hash_head to the head of the hash chain:
         */
        hash_head = 0;
        if (LIKELY(s->lookahead >= WANT_MIN_MATCH)) {
            hash_head = s->quick_insert_string(s, s->strstart);
        }

        /* Find the longest match, discarding those <= prev_length.
         */
        s->prev_match = (Pos)s->match_start;
        match_len = STD_MIN_MATCH - 1;
        dist = (int64_t)s->strstart - hash_head;

        if (dist <= MAX_DIST(s) && dist > 0 && s->prev_length < s->max_lazy_match && hash_head != 0) {
            /* To simplify the code, we prevent matches with the string
             * of window index 0 (in particular we have to avoid a match
             * of the string with itself at the start of the input file).
             */
            match_len = (*longest_match)(s, hash_head);
            /* longest_match() sets match_start */

            if (match_len <= 5 && (s->strategy == Z_FILTERED)) {
                /* If prev_match is also WANT_MIN_MATCH, match_start is garbage
                 * but we will ignore the current match anyway.
                 */
                match_len = STD_MIN_MATCH - 1;
            }
        }
        /* If there was a match at the previous step and the current
         * match is not better, output the previous match:
         */
        if (s->prev_length >= STD_MIN_MATCH && match_len <= s->prev_length) {
            unsigned int max_insert = s->strstart + s->lookahead - STD_MIN_MATCH;
            /* Do not insert strings in hash table beyond this. */

            check_match(s, s->strstart-1, s->prev_match, s->prev_length);

            bflush = zng_tr_tally_dist(s, s->strstart - 1 - s->prev_match, s->prev_length - STD_MIN_MATCH);

            /* Insert in hash table all strings up to the end of the match.
             * strstart-1 and strstart are already inserted. If there is not
             * enough lookahead, the last two strings are not inserted in
             * the hash table.
             */
            s->prev_length -= 1;
            s->lookahead -= s->prev_length;

            unsigned int mov_fwd = s->prev_length - 1;
            if (max_insert > s->strstart) {
                unsigned int insert_cnt = mov_fwd;
                if (UNLIKELY(insert_cnt > max_insert - s->strstart))
                    insert_cnt = max_insert - s->strstart;
                s->insert_string(s, s->strstart + 1, insert_cnt);
            }
            s->prev_length = 0;
            s->match_available = 0;
            s->strstart += mov_fwd + 1;

            if (UNLIKELY(bflush))
                FLUSH_BLOCK(s, 0);

        } else if (s->match_available) {
            /* If there was no match at the previous position, output a
             * single literal. If there was a match but the current match
             * is longer, truncate the previous match to a single literal.
             */
            bflush = zng_tr_tally_lit(s, s->window[s->strstart-1]);
            if (UNLIKELY(bflush))
                FLUSH_BLOCK_ONLY(s, 0);
            s->prev_length = match_len;
            s->strstart++;
            s->lookahead--;
            if (UNLIKELY(s->strm->avail_out == 0))
                return need_more;
        } else {
            /* There is no previous match to compare with, wait for
             * the next step to decide.
             */
            s->prev_length = match_len;
            s->match_available = 1;
            s->strstart++;
            s->lookahead--;
        }
    }
    Assert(flush != Z_NO_FLUSH, "no flush?");
    if (UNLIKELY(s->match_available)) {
        (void) zng_tr_tally_lit(s, s->window[s->strstart-1]);
        s->match_available = 0;
    }
    s->insert = s->strstart < (STD_MIN_MATCH - 1) ? s->strstart : (STD_MIN_MATCH - 1);
    if (UNLIKELY(flush == Z_FINISH)) {
        FLUSH_BLOCK(s, 1);
        return finish_done;
    }
    if (UNLIKELY(s->sym_next))
        FLUSH_BLOCK(s, 0);
    return block_done;
}
@ -0,0 +1,186 @@
/* deflate_stored.c -- store data without compression using deflation algorithm
 *
 * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"

/* ===========================================================================
 * Copy without compression as much as possible from the input stream, return
 * the current block state.
 *
 * In case deflateParams() is used to later switch to a non-zero compression
 * level, s->matches (otherwise unused when storing) keeps track of the number
 * of hash table slides to perform. If s->matches is 1, then one hash table
 * slide will be done when switching. If s->matches is 2, the maximum value
 * allowed here, then the hash table will be cleared, since two or more slides
 * is the same as a clear.
 *
 * deflate_stored() is written to minimize the number of times an input byte is
 * copied. It is most efficient with large input and output buffers, which
 * maximizes the opportunities to have a single copy from next_in to next_out.
 */
Z_INTERNAL block_state deflate_stored(deflate_state *s, int flush) {
    /* Smallest worthy block size when not flushing or finishing. By default
     * this is 32K. This can be as small as 507 bytes for memLevel == 1. For
     * large input and output buffers, the stored block size will be larger.
     */
    unsigned min_block = MIN(s->pending_buf_size - 5, s->w_size);

    /* Copy as many min_block or larger stored blocks directly to next_out as
     * possible. If flushing, copy the remaining available input to next_out as
     * stored blocks, if there is enough space.
     */
    unsigned len, left, have, last = 0;
    unsigned used = s->strm->avail_in;
    do {
        /* Set len to the maximum size block that we can copy directly with the
         * available input data and output space. Set left to how much of that
         * would be copied from what's left in the window.
         */
        len = MAX_STORED;       /* maximum deflate stored block length */
        have = (s->bi_valid + 42) >> 3;     /* number of header bytes */
        if (s->strm->avail_out < have)      /* need room for header */
            break;
        /* maximum stored block length that will fit in avail_out: */
        have = s->strm->avail_out - have;
        left = (int)s->strstart - s->block_start;   /* bytes left in window */
        if (len > (unsigned long)left + s->strm->avail_in)
            len = left + s->strm->avail_in; /* limit len to the input */
        len = MIN(len, have);               /* limit len to the output */

        /* If the stored block would be less than min_block in length, or if
         * unable to copy all of the available input when flushing, then try
         * copying to the window and the pending buffer instead. Also don't
         * write an empty block when flushing -- deflate() does that.
         */
        if (len < min_block && ((len == 0 && flush != Z_FINISH) || flush == Z_NO_FLUSH || len != left + s->strm->avail_in))
            break;

        /* Make a dummy stored block in pending to get the header bytes,
         * including any pending bits. This also updates the debugging counts.
         */
        last = flush == Z_FINISH && len == left + s->strm->avail_in ? 1 : 0;
        zng_tr_stored_block(s, (char *)0, 0L, last);

        /* Replace the lengths in the dummy stored block with len. */
        s->pending -= 4;
        put_short(s, (uint16_t)len);
        put_short(s, (uint16_t)~len);

        /* Write the stored block header bytes. */
        PREFIX(flush_pending)(s->strm);

        /* Update debugging counts for the data about to be copied. */
        cmpr_bits_add(s, len << 3);
        sent_bits_add(s, len << 3);

        /* Copy uncompressed bytes from the window to next_out. */
        if (left) {
            left = MIN(left, len);
            memcpy(s->strm->next_out, s->window + s->block_start, left);
            s->strm->next_out += left;
            s->strm->avail_out -= left;
            s->strm->total_out += left;
            s->block_start += (int)left;
            len -= left;
        }

        /* Copy uncompressed bytes directly from next_in to next_out, updating
         * the check value.
         */
        if (len) {
            PREFIX(read_buf)(s->strm, s->strm->next_out, len);
            s->strm->next_out += len;
            s->strm->avail_out -= len;
            s->strm->total_out += len;
        }
    } while (last == 0);

    /* Update the sliding window with the last s->w_size bytes of the copied
     * data, or append all of the copied data to the existing window if less
     * than s->w_size bytes were copied. Also update the number of bytes to
     * insert in the hash tables, in the event that deflateParams() switches to
     * a non-zero compression level.
     */
    used -= s->strm->avail_in;  /* number of input bytes directly copied */
    if (used) {
        /* If any input was used, then no unused input remains in the window,
         * therefore s->block_start == s->strstart.
         */
        if (used >= s->w_size) {    /* supplant the previous history */
            s->matches = 2;         /* clear hash */
            memcpy(s->window, s->strm->next_in - s->w_size, s->w_size);
            s->strstart = s->w_size;
            s->insert = s->strstart;
        } else {
            if (s->window_size - s->strstart <= used) {
                /* Slide the window down. */
                s->strstart -= s->w_size;
                memcpy(s->window, s->window + s->w_size, s->strstart);
                if (s->matches < 2)
                    s->matches++;   /* add a pending slide_hash() */
                s->insert = MIN(s->insert, s->strstart);
            }
            memcpy(s->window + s->strstart, s->strm->next_in - used, used);
            s->strstart += used;
            s->insert += MIN(used, s->w_size - s->insert);
        }
        s->block_start = (int)s->strstart;
    }
    s->high_water = MAX(s->high_water, s->strstart);

    /* If the last block was written to next_out, then done. */
    if (last)
        return finish_done;

    /* If flushing and all input has been consumed, then done. */
    if (flush != Z_NO_FLUSH && flush != Z_FINISH && s->strm->avail_in == 0 && (int)s->strstart == s->block_start)
        return block_done;

    /* Fill the window with any remaining input. */
    have = s->window_size - s->strstart;
    if (s->strm->avail_in > have && s->block_start >= (int)s->w_size) {
        /* Slide the window down. */
        s->block_start -= (int)s->w_size;
        s->strstart -= s->w_size;
        memcpy(s->window, s->window + s->w_size, s->strstart);
        if (s->matches < 2)
            s->matches++;           /* add a pending slide_hash() */
        have += s->w_size;          /* more space now */
        s->insert = MIN(s->insert, s->strstart);
    }

    have = MIN(have, s->strm->avail_in);
    if (have) {
        PREFIX(read_buf)(s->strm, s->window + s->strstart, have);
        s->strstart += have;
        s->insert += MIN(have, s->w_size - s->insert);
    }
    s->high_water = MAX(s->high_water, s->strstart);

    /* There was not enough avail_out to write a complete worthy or flushed
     * stored block to next_out. Write a stored block to pending instead, if we
     * have enough input for a worthy block, or if flushing and there is enough
     * room for the remaining input as a stored block in the pending buffer.
     */
    have = (s->bi_valid + 42) >> 3;     /* number of header bytes */
    /* maximum stored block length that will fit in pending: */
    have = MIN(s->pending_buf_size - have, MAX_STORED);
    min_block = MIN(have, s->w_size);
    left = (int)s->strstart - s->block_start;
    if (left >= min_block || ((left || flush == Z_FINISH) && flush != Z_NO_FLUSH && s->strm->avail_in == 0 && left <= have)) {
        len = MIN(left, have);
        last = flush == Z_FINISH && s->strm->avail_in == 0 && len == left ? 1 : 0;
        zng_tr_stored_block(s, (char *)s->window + s->block_start, len, last);
        s->block_start += (int)len;
        PREFIX(flush_pending)(s->strm);
    }

    /* We've done all we can with the available input and output. */
    return last ? finish_started : need_more;
}
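deflate_stored() is the level 0 path; a minimal sketch of reaching it, and of the mid-stream level switch that the s->matches bookkeeping above exists to support (helper name hypothetical):

/* Sketch: level 0 stores; a later deflateParams() switch applies the
 * hash slides/clears tracked in s->matches. */
static int store_then_compress(z_stream *strm) {
    memset(strm, 0, sizeof(*strm));
    if (deflateInit2(strm, Z_NO_COMPRESSION, Z_DEFLATED, 15, 8, Z_DEFAULT_STRATEGY) != Z_OK)
        return -1;
    /* ... deflate() calls here emit stored blocks ... */
    return deflateParams(strm, 6, Z_DEFAULT_STRATEGY);
}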
@ -0,0 +1,50 @@
#ifndef FALLBACK_BUILTINS_H
#define FALLBACK_BUILTINS_H

#if defined(_MSC_VER) && !defined(__clang__)
#if defined(_M_IX86) || defined(_M_AMD64) || defined(_M_IA64) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC)

#include <intrin.h>
#ifdef X86_FEATURES
# include "arch/x86/x86_features.h"
#endif

/* This is not a general purpose replacement for __builtin_ctz. The function expects that value is != 0.
 * Because of that assumption trailing_zero is not initialized and the return value is not checked.
 * Tzcnt and bsf give identical results except when the input value is 0, so a zero input must not be allowed.
 * If the tzcnt instruction is not supported, the cpu will itself execute bsf instead.
 * Performance of tzcnt and bsf is identical on Intel cpus; tzcnt is faster than bsf on AMD cpus.
 */
static __forceinline int __builtin_ctz(unsigned int value) {
    Assert(value != 0, "Invalid input value: 0");
# if defined(X86_FEATURES) && !(_MSC_VER < 1700)
    return (int)_tzcnt_u32(value);
# else
    unsigned long trailing_zero;
    _BitScanForward(&trailing_zero, value);
    return (int)trailing_zero;
# endif
}
#define HAVE_BUILTIN_CTZ

#ifdef _M_AMD64
/* This is not a general purpose replacement for __builtin_ctzll. The function expects that value is != 0.
 * Because of that assumption trailing_zero is not initialized and the return value is not checked.
 */
static __forceinline int __builtin_ctzll(unsigned long long value) {
    Assert(value != 0, "Invalid input value: 0");
# if defined(X86_FEATURES) && !(_MSC_VER < 1700)
    return (int)_tzcnt_u64(value);
# else
    unsigned long trailing_zero;
    _BitScanForward64(&trailing_zero, value);
    return (int)trailing_zero;
# endif
}
#define HAVE_BUILTIN_CTZLL
#endif // Microsoft AMD64

#endif // Microsoft AMD64/IA64/x86/ARM/ARM64 test
#endif // _MSC_VER & !clang

#endif // include guard FALLBACK_BUILTINS_H
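A quick sketch of the contract these fallbacks implement: the argument must be non-zero and the result is the index of the lowest set bit (the helper name below is illustrative only).

/* Illustrative check of the contract (argument must be non-zero): */
static int lowest_set_bit(unsigned int v) {
    return __builtin_ctz(v);    /* lowest_set_bit(0x28) == 3, since 0x28 == 0b101000 */
}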
@ -0,0 +1,403 @@
/* functable.c -- Choose relevant optimized functions at runtime
 * Copyright (C) 2017 Hans Kristian Rosbach
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "zendian.h"
#include "crc32_braid_p.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"
#include "cpu_features.h"

#if defined(_MSC_VER)
# include <intrin.h>
#endif

/* Platform has pointer size atomic store */
#if defined(__GNUC__) || defined(__clang__)
# define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    __atomic_store(&(functable.FUNC_NAME), &(VAR.FUNC_NAME), __ATOMIC_SEQ_CST)
# define FUNCTABLE_BARRIER() __atomic_thread_fence(__ATOMIC_SEQ_CST)
#elif defined(_MSC_VER)
# define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    _InterlockedExchangePointer((void * volatile *)&(functable.FUNC_NAME), (void *)(VAR.FUNC_NAME))
# if defined(_M_ARM) || defined(_M_ARM64)
#   define FUNCTABLE_BARRIER() do { \
    _ReadWriteBarrier(); \
    __dmb(0xB); /* _ARM_BARRIER_ISH */ \
    _ReadWriteBarrier(); \
} while (0)
# else
#   define FUNCTABLE_BARRIER() _ReadWriteBarrier()
# endif
#else
# warning Unable to detect atomic intrinsic support.
# define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    *((void * volatile *)&(functable.FUNC_NAME)) = (void *)(VAR.FUNC_NAME)
# define FUNCTABLE_BARRIER() do { /* Empty */ } while (0)
#endif

static void force_init_empty(void) {
    // empty
}

static void init_functable(void) {
    struct functable_s ft;
    struct cpu_features cf;

    cpu_check_features(&cf);

    // Generic code
    ft.force_init = &force_init_empty;
    ft.adler32 = &adler32_c;
    ft.adler32_fold_copy = &adler32_fold_copy_c;
    ft.chunkmemset_safe = &chunkmemset_safe_c;
    ft.chunksize = &chunksize_c;
    ft.crc32 = &PREFIX(crc32_braid);
    ft.crc32_fold = &crc32_fold_c;
    ft.crc32_fold_copy = &crc32_fold_copy_c;
    ft.crc32_fold_final = &crc32_fold_final_c;
    ft.crc32_fold_reset = &crc32_fold_reset_c;
    ft.inflate_fast = &inflate_fast_c;
    ft.insert_string = &insert_string_c;
    ft.quick_insert_string = &quick_insert_string_c;
    ft.slide_hash = &slide_hash_c;
    ft.update_hash = &update_hash_c;

#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    ft.longest_match = &longest_match_unaligned_64;
    ft.longest_match_slow = &longest_match_slow_unaligned_64;
    ft.compare256 = &compare256_unaligned_64;
# elif defined(HAVE_BUILTIN_CTZ)
    ft.longest_match = &longest_match_unaligned_32;
    ft.longest_match_slow = &longest_match_slow_unaligned_32;
    ft.compare256 = &compare256_unaligned_32;
# else
    ft.longest_match = &longest_match_unaligned_16;
    ft.longest_match_slow = &longest_match_slow_unaligned_16;
    ft.compare256 = &compare256_unaligned_16;
# endif
#else
    ft.longest_match = &longest_match_c;
    ft.longest_match_slow = &longest_match_slow_c;
    ft.compare256 = &compare256_c;
#endif

    // Select arch-optimized functions

    // X86 - SSE2
#ifdef X86_SSE2
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (cf.x86.has_sse2)
# endif
    {
        ft.chunkmemset_safe = &chunkmemset_safe_sse2;
        ft.chunksize = &chunksize_sse2;
        ft.inflate_fast = &inflate_fast_sse2;
        ft.slide_hash = &slide_hash_sse2;
# ifdef HAVE_BUILTIN_CTZ
        ft.compare256 = &compare256_sse2;
        ft.longest_match = &longest_match_sse2;
        ft.longest_match_slow = &longest_match_slow_sse2;
# endif
    }
#endif
    // X86 - SSSE3
#ifdef X86_SSSE3
    if (cf.x86.has_ssse3) {
        ft.adler32 = &adler32_ssse3;
# ifdef X86_SSE2
        ft.chunkmemset_safe = &chunkmemset_safe_ssse3;
        ft.inflate_fast = &inflate_fast_ssse3;
# endif
    }
#endif
    // X86 - SSE4.2
#ifdef X86_SSE42
    if (cf.x86.has_sse42) {
        ft.adler32_fold_copy = &adler32_fold_copy_sse42;
        ft.insert_string = &insert_string_sse42;
        ft.quick_insert_string = &quick_insert_string_sse42;
        ft.update_hash = &update_hash_sse42;
    }
#endif
    // X86 - PCLMUL
#ifdef X86_PCLMULQDQ_CRC
    if (cf.x86.has_pclmulqdq) {
        ft.crc32 = &crc32_pclmulqdq;
        ft.crc32_fold = &crc32_fold_pclmulqdq;
        ft.crc32_fold_copy = &crc32_fold_pclmulqdq_copy;
        ft.crc32_fold_final = &crc32_fold_pclmulqdq_final;
        ft.crc32_fold_reset = &crc32_fold_pclmulqdq_reset;
    }
#endif
    // X86 - AVX
#ifdef X86_AVX2
    if (cf.x86.has_avx2) {
        ft.adler32 = &adler32_avx2;
        ft.adler32_fold_copy = &adler32_fold_copy_avx2;
        ft.chunkmemset_safe = &chunkmemset_safe_avx2;
        ft.chunksize = &chunksize_avx2;
        ft.inflate_fast = &inflate_fast_avx2;
        ft.slide_hash = &slide_hash_avx2;
# ifdef HAVE_BUILTIN_CTZ
        ft.compare256 = &compare256_avx2;
        ft.longest_match = &longest_match_avx2;
        ft.longest_match_slow = &longest_match_slow_avx2;
# endif
    }
#endif
#ifdef X86_AVX512
    if (cf.x86.has_avx512) {
        ft.adler32 = &adler32_avx512;
        ft.adler32_fold_copy = &adler32_fold_copy_avx512;
    }
#endif
#ifdef X86_AVX512VNNI
    if (cf.x86.has_avx512vnni) {
        ft.adler32 = &adler32_avx512_vnni;
        ft.adler32_fold_copy = &adler32_fold_copy_avx512_vnni;
    }
#endif
    // X86 - VPCLMULQDQ
#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
    if (cf.x86.has_pclmulqdq && cf.x86.has_avx512 && cf.x86.has_vpclmulqdq) {
        ft.crc32 = &crc32_vpclmulqdq;
        ft.crc32_fold = &crc32_fold_vpclmulqdq;
        ft.crc32_fold_copy = &crc32_fold_vpclmulqdq_copy;
        ft.crc32_fold_final = &crc32_fold_vpclmulqdq_final;
        ft.crc32_fold_reset = &crc32_fold_vpclmulqdq_reset;
    }
#endif

    // ARM - SIMD
#ifdef ARM_SIMD
# ifndef ARM_NOCHECK_SIMD
    if (cf.arm.has_simd)
# endif
    {
        ft.slide_hash = &slide_hash_armv6;
    }
#endif
    // ARM - NEON
#ifdef ARM_NEON
# ifndef ARM_NOCHECK_NEON
    if (cf.arm.has_neon)
# endif
    {
        ft.adler32 = &adler32_neon;
        ft.chunkmemset_safe = &chunkmemset_safe_neon;
        ft.chunksize = &chunksize_neon;
        ft.inflate_fast = &inflate_fast_neon;
        ft.slide_hash = &slide_hash_neon;
# ifdef HAVE_BUILTIN_CTZLL
        ft.compare256 = &compare256_neon;
        ft.longest_match = &longest_match_neon;
        ft.longest_match_slow = &longest_match_slow_neon;
# endif
    }
#endif
    // ARM - ACLE
#ifdef ARM_ACLE
    if (cf.arm.has_crc32) {
        ft.crc32 = &crc32_acle;
        ft.insert_string = &insert_string_acle;
        ft.quick_insert_string = &quick_insert_string_acle;
        ft.update_hash = &update_hash_acle;
    }
#endif

    // Power - VMX
#ifdef PPC_VMX
    if (cf.power.has_altivec) {
        ft.adler32 = &adler32_vmx;
        ft.slide_hash = &slide_hash_vmx;
    }
#endif
    // Power8 - VSX
#ifdef POWER8_VSX
    if (cf.power.has_arch_2_07) {
        ft.adler32 = &adler32_power8;
        ft.chunkmemset_safe = &chunkmemset_safe_power8;
        ft.chunksize = &chunksize_power8;
        ft.inflate_fast = &inflate_fast_power8;
        ft.slide_hash = &slide_hash_power8;
    }
#endif
#ifdef POWER8_VSX_CRC32
    if (cf.power.has_arch_2_07)
        ft.crc32 = &crc32_power8;
#endif
    // Power9
#ifdef POWER9
    if (cf.power.has_arch_3_00) {
        ft.compare256 = &compare256_power9;
        ft.longest_match = &longest_match_power9;
        ft.longest_match_slow = &longest_match_slow_power9;
    }
#endif

    // RISCV - RVV
#ifdef RISCV_RVV
    if (cf.riscv.has_rvv) {
        ft.adler32 = &adler32_rvv;
        ft.adler32_fold_copy = &adler32_fold_copy_rvv;
        ft.chunkmemset_safe = &chunkmemset_safe_rvv;
        ft.chunksize = &chunksize_rvv;
        ft.compare256 = &compare256_rvv;
        ft.inflate_fast = &inflate_fast_rvv;
        ft.longest_match = &longest_match_rvv;
        ft.longest_match_slow = &longest_match_slow_rvv;
        ft.slide_hash = &slide_hash_rvv;
    }
#endif

    // S390
#ifdef S390_CRC32_VX
    if (cf.s390.has_vx)
        ft.crc32 = crc32_s390_vx;
#endif

    // Assign function pointers individually for atomic operation
    FUNCTABLE_ASSIGN(ft, force_init);
    FUNCTABLE_ASSIGN(ft, adler32);
    FUNCTABLE_ASSIGN(ft, adler32_fold_copy);
    FUNCTABLE_ASSIGN(ft, chunkmemset_safe);
    FUNCTABLE_ASSIGN(ft, chunksize);
    FUNCTABLE_ASSIGN(ft, compare256);
    FUNCTABLE_ASSIGN(ft, crc32);
    FUNCTABLE_ASSIGN(ft, crc32_fold);
    FUNCTABLE_ASSIGN(ft, crc32_fold_copy);
    FUNCTABLE_ASSIGN(ft, crc32_fold_final);
    FUNCTABLE_ASSIGN(ft, crc32_fold_reset);
    FUNCTABLE_ASSIGN(ft, inflate_fast);
    FUNCTABLE_ASSIGN(ft, insert_string);
    FUNCTABLE_ASSIGN(ft, longest_match);
    FUNCTABLE_ASSIGN(ft, longest_match_slow);
    FUNCTABLE_ASSIGN(ft, quick_insert_string);
    FUNCTABLE_ASSIGN(ft, slide_hash);
    FUNCTABLE_ASSIGN(ft, update_hash);

    // Memory barrier for weak memory order CPUs
    FUNCTABLE_BARRIER();
}

/* stub functions */
static void force_init_stub(void) {
    init_functable();
}

static uint32_t adler32_stub(uint32_t adler, const uint8_t* buf, size_t len) {
    init_functable();
    return functable.adler32(adler, buf, len);
}

static uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t* dst, const uint8_t* src, size_t len) {
    init_functable();
    return functable.adler32_fold_copy(adler, dst, src, len);
}

static uint8_t* chunkmemset_safe_stub(uint8_t* out, unsigned dist, unsigned len, unsigned left) {
    init_functable();
    return functable.chunkmemset_safe(out, dist, len, left);
}

static uint32_t chunksize_stub(void) {
    init_functable();
    return functable.chunksize();
}

static uint32_t compare256_stub(const uint8_t* src0, const uint8_t* src1) {
    init_functable();
    return functable.compare256(src0, src1);
}

static uint32_t crc32_stub(uint32_t crc, const uint8_t* buf, size_t len) {
    init_functable();
    return functable.crc32(crc, buf, len);
}

static void crc32_fold_stub(crc32_fold* crc, const uint8_t* src, size_t len, uint32_t init_crc) {
    init_functable();
    functable.crc32_fold(crc, src, len, init_crc);
}

static void crc32_fold_copy_stub(crc32_fold* crc, uint8_t* dst, const uint8_t* src, size_t len) {
    init_functable();
    functable.crc32_fold_copy(crc, dst, src, len);
}

static uint32_t crc32_fold_final_stub(crc32_fold* crc) {
    init_functable();
    return functable.crc32_fold_final(crc);
}

static uint32_t crc32_fold_reset_stub(crc32_fold* crc) {
    init_functable();
    return functable.crc32_fold_reset(crc);
}

static void inflate_fast_stub(PREFIX3(stream) *strm, uint32_t start) {
    init_functable();
    functable.inflate_fast(strm, start);
}

static void insert_string_stub(deflate_state* const s, uint32_t str, uint32_t count) {
    init_functable();
    functable.insert_string(s, str, count);
}

static uint32_t longest_match_stub(deflate_state* const s, Pos cur_match) {
    init_functable();
    return functable.longest_match(s, cur_match);
}

static uint32_t longest_match_slow_stub(deflate_state* const s, Pos cur_match) {
    init_functable();
    return functable.longest_match_slow(s, cur_match);
}

static Pos quick_insert_string_stub(deflate_state* const s, const uint32_t str) {
    init_functable();
    return functable.quick_insert_string(s, str);
}

static void slide_hash_stub(deflate_state* s) {
    init_functable();
    functable.slide_hash(s);
}

static uint32_t update_hash_stub(deflate_state* const s, uint32_t h, uint32_t val) {
    init_functable();
    return functable.update_hash(s, h, val);
}

/* functable init */
Z_INTERNAL struct functable_s functable = {
    force_init_stub,
    adler32_stub,
    adler32_fold_copy_stub,
    chunkmemset_safe_stub,
    chunksize_stub,
    compare256_stub,
    crc32_stub,
    crc32_fold_stub,
    crc32_fold_copy_stub,
    crc32_fold_final_stub,
    crc32_fold_reset_stub,
    inflate_fast_stub,
    insert_string_stub,
    longest_match_stub,
    longest_match_slow_stub,
    quick_insert_string_stub,
    slide_hash_stub,
    update_hash_stub
};
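The net effect is lazy dispatch: the very first call through any functable member lands in a stub, which runs init_functable() and then forwards, while every later call jumps straight to the resolved implementation through the atomically updated pointer. A minimal sketch (buf and len are placeholders):

/* Sketch: first call resolves the table, later calls go direct. */
static uint32_t crc_twice(const uint8_t *buf, size_t len) {
    uint32_t crc = functable.crc32(0, buf, len);   /* crc32_stub -> init_functable() -> real impl */
    return functable.crc32(crc, buf, len);         /* direct through the updated pointer */
}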
@ -0,0 +1,42 @@
/* functable.h -- Struct containing function pointers to optimized functions
 * Copyright (C) 2017 Hans Kristian Rosbach
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef FUNCTABLE_H_
#define FUNCTABLE_H_

#include "deflate.h"
#include "crc32_fold.h"
#include "adler32_fold.h"

#ifdef ZLIB_COMPAT
typedef struct z_stream_s z_stream;
#else
typedef struct zng_stream_s zng_stream;
#endif

struct functable_s {
    void     (* force_init)         (void);
    uint32_t (* adler32)            (uint32_t adler, const uint8_t *buf, size_t len);
    uint32_t (* adler32_fold_copy)  (uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
    uint8_t* (* chunkmemset_safe)   (uint8_t *out, unsigned dist, unsigned len, unsigned left);
    uint32_t (* chunksize)          (void);
    uint32_t (* compare256)         (const uint8_t *src0, const uint8_t *src1);
    uint32_t (* crc32)              (uint32_t crc, const uint8_t *buf, size_t len);
    void     (* crc32_fold)         (struct crc32_fold_s *crc, const uint8_t *src, size_t len, uint32_t init_crc);
    void     (* crc32_fold_copy)    (struct crc32_fold_s *crc, uint8_t *dst, const uint8_t *src, size_t len);
    uint32_t (* crc32_fold_final)   (struct crc32_fold_s *crc);
    uint32_t (* crc32_fold_reset)   (struct crc32_fold_s *crc);
    void     (* inflate_fast)       (PREFIX3(stream) *strm, uint32_t start);
    void     (* insert_string)      (deflate_state *const s, uint32_t str, uint32_t count);
    uint32_t (* longest_match)      (deflate_state *const s, Pos cur_match);
    uint32_t (* longest_match_slow) (deflate_state *const s, Pos cur_match);
    Pos      (* quick_insert_string)(deflate_state *const s, uint32_t str);
    void     (* slide_hash)         (deflate_state *s);
    uint32_t (* update_hash)        (deflate_state *const s, uint32_t h, uint32_t val);
};

Z_INTERNAL extern struct functable_s functable;

#endif
@ -0,0 +1,144 @@
#ifndef GZGUTS_H_
#define GZGUTS_H_
/* gzguts.h -- zlib internal header definitions for gz* operations
 * Copyright (C) 2004-2019 Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef _LARGEFILE64_SOURCE
# ifndef _LARGEFILE_SOURCE
#   define _LARGEFILE_SOURCE 1
# endif
# undef _FILE_OFFSET_BITS
# undef _TIME_BITS
#endif

#if defined(HAVE_VISIBILITY_INTERNAL)
# define Z_INTERNAL __attribute__((visibility ("internal")))
#elif defined(HAVE_VISIBILITY_HIDDEN)
# define Z_INTERNAL __attribute__((visibility ("hidden")))
#else
# define Z_INTERNAL
#endif

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <limits.h>
#include <fcntl.h>

#if defined(ZLIB_COMPAT)
# include "zlib.h"
#else
# include "zlib-ng.h"
#endif

#ifdef _WIN32
# include <stddef.h>
#endif

#if defined(_WIN32)
# include <io.h>
# define WIDECHAR
#endif

#ifdef WINAPI_FAMILY
# define open _open
# define read _read
# define write _write
# define close _close
#endif

/* In Win32, vsnprintf is available as the "non-ANSI" _vsnprintf. */
#if !defined(STDC99) && !defined(__CYGWIN__) && !defined(__MINGW__) && defined(_WIN32)
# if !defined(vsnprintf)
#   if !defined(_MSC_VER) || ( defined(_MSC_VER) && _MSC_VER < 1500 )
#     define vsnprintf _vsnprintf
#   endif
# endif
#endif

/* unlike snprintf (which is required in C99), _snprintf does not guarantee
   null termination of the result -- however this is only used in gzlib.c
   where the result is assured to fit in the space provided */
#if defined(_MSC_VER) && _MSC_VER < 1900
# define snprintf _snprintf
#endif

/* get errno and strerror definition */
#ifndef NO_STRERROR
# include <errno.h>
# define zstrerror() strerror(errno)
#else
# define zstrerror() "stdio error (consult errno)"
#endif

/* default memLevel */
#if MAX_MEM_LEVEL >= 8
# define DEF_MEM_LEVEL 8
#else
# define DEF_MEM_LEVEL MAX_MEM_LEVEL
#endif

/* default i/o buffer size -- double this for output when reading (this and
   twice this must be able to fit in an unsigned type) */
#ifndef GZBUFSIZE
# define GZBUFSIZE 131072
#endif

/* gzip modes, also provide a little integrity check on the passed structure */
#define GZ_NONE 0
#define GZ_READ 7247
#define GZ_WRITE 31153
#define GZ_APPEND 1     /* mode set to GZ_WRITE after the file is opened */

/* values for gz_state how */
#define LOOK 0      /* look for a gzip header */
#define COPY 1      /* copy input directly */
#define GZIP 2      /* decompress a gzip stream */

/* internal gzip file state data structure */
typedef struct {
        /* exposed contents for gzgetc() macro */
    struct gzFile_s x;      /* "x" for exposed */
                            /* x.have: number of bytes available at x.next */
                            /* x.next: next output data to deliver or write */
                            /* x.pos: current position in uncompressed data */
        /* used for both reading and writing */
    int mode;               /* see gzip modes above */
    int fd;                 /* file descriptor */
    char *path;             /* path or fd for error messages */
    unsigned size;          /* buffer size, zero if not allocated yet */
    unsigned want;          /* requested buffer size, default is GZBUFSIZE */
    unsigned char *in;      /* input buffer (double-sized when writing) */
    unsigned char *out;     /* output buffer (double-sized when reading) */
    int direct;             /* 0 if processing gzip, 1 if transparent */
        /* just for reading */
    int how;                /* 0: get header, 1: copy, 2: decompress */
    z_off64_t start;        /* where the gzip data started, for rewinding */
    int eof;                /* true if end of input file reached */
    int past;               /* true if read requested past end */
        /* just for writing */
    int level;              /* compression level */
    int strategy;           /* compression strategy */
    int reset;              /* true if a reset is pending after a Z_FINISH */
        /* seek request */
    z_off64_t skip;         /* amount to skip (already rewound if backwards) */
    int seek;               /* true if seek request pending */
        /* error information */
    int err;                /* error code */
    char *msg;              /* error message */
        /* zlib inflate or deflate stream */
    PREFIX3(stream) strm;   /* stream structure in-place (not a pointer) */
} gz_state;
typedef gz_state *gz_statep;

/* shared functions */
void Z_INTERNAL gz_error(gz_state *, int, const char *);

/* GT_OFF(x), where x is an unsigned value, is true if x > maximum z_off64_t
   value -- needed when comparing unsigned to z_off64_t, which is signed
   (possible z_off64_t types off_t, off64_t, and long are all signed) */
#define GT_OFF(x) (sizeof(int) == sizeof(z_off64_t) && (x) > INT_MAX)

#endif /* GZGUTS_H_ */
@ -0,0 +1,525 @@
/* gzlib.c -- zlib functions common to reading and writing gzip files
 * Copyright (C) 2004-2019 Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "zutil_p.h"
#include "gzguts.h"

#if defined(_WIN32)
# define LSEEK _lseeki64
#else
#if defined(_LARGEFILE64_SOURCE) && _LFS64_LARGEFILE-0
# define LSEEK lseek64
#else
# define LSEEK lseek
#endif
#endif

/* Local functions */
static void gz_reset(gz_state *);
static gzFile gz_open(const void *, int, const char *);

/* Reset gzip file state */
static void gz_reset(gz_state *state) {
    state->x.have = 0;              /* no output data available */
    if (state->mode == GZ_READ) {   /* for reading ... */
        state->eof = 0;             /* not at end of file */
        state->past = 0;            /* have not read past end yet */
        state->how = LOOK;          /* look for gzip header */
    }
    else                            /* for writing ... */
        state->reset = 0;           /* no deflateReset pending */
    state->seek = 0;                /* no seek request pending */
    gz_error(state, Z_OK, NULL);    /* clear error */
    state->x.pos = 0;               /* no uncompressed data yet */
    state->strm.avail_in = 0;       /* no input data yet */
}

/* Open a gzip file either by name or file descriptor. */
static gzFile gz_open(const void *path, int fd, const char *mode) {
    gz_state *state;
    size_t len;
    int oflag;
#ifdef O_CLOEXEC
    int cloexec = 0;
#endif
#ifdef O_EXCL
    int exclusive = 0;
#endif

    /* check input */
    if (path == NULL)
        return NULL;

    /* allocate gzFile structure to return */
    state = (gz_state *)zng_alloc(sizeof(gz_state));
    if (state == NULL)
        return NULL;
    state->size = 0;            /* no buffers allocated yet */
    state->want = GZBUFSIZE;    /* requested buffer size */
    state->msg = NULL;          /* no error message yet */

    /* interpret mode */
    state->mode = GZ_NONE;
    state->level = Z_DEFAULT_COMPRESSION;
    state->strategy = Z_DEFAULT_STRATEGY;
    state->direct = 0;
    while (*mode) {
        if (*mode >= '0' && *mode <= '9') {
            state->level = *mode - '0';
        } else {
            switch (*mode) {
            case 'r':
                state->mode = GZ_READ;
                break;
#ifndef NO_GZCOMPRESS
            case 'w':
                state->mode = GZ_WRITE;
                break;
            case 'a':
                state->mode = GZ_APPEND;
                break;
#endif
            case '+':       /* can't read and write at the same time */
                zng_free(state);
                return NULL;
            case 'b':       /* ignore -- will request binary anyway */
                break;
#ifdef O_CLOEXEC
            case 'e':
                cloexec = 1;
                break;
#endif
#ifdef O_EXCL
            case 'x':
                exclusive = 1;
                break;
#endif
            case 'f':
                state->strategy = Z_FILTERED;
                break;
            case 'h':
                state->strategy = Z_HUFFMAN_ONLY;
                break;
            case 'R':
                state->strategy = Z_RLE;
                break;
            case 'F':
                state->strategy = Z_FIXED;
                break;
            case 'T':
                state->direct = 1;
                break;
            default:        /* could consider as an error, but just ignore */
                {}
            }
        }
        mode++;
    }

    /* must provide an "r", "w", or "a" */
    if (state->mode == GZ_NONE) {
        zng_free(state);
        return NULL;
    }

    /* can't force transparent read */
    if (state->mode == GZ_READ) {
        if (state->direct) {
            zng_free(state);
            return NULL;
        }
        state->direct = 1;      /* for empty file */
    }

    /* save the path name for error messages */
#ifdef WIDECHAR
    if (fd == -2) {
        len = wcstombs(NULL, (const wchar_t *)path, 0);
        if (len == (size_t)-1)
            len = 0;
    } else
#endif
        len = strlen((const char *)path);
    state->path = (char *)malloc(len + 1);
    if (state->path == NULL) {
        zng_free(state);
        return NULL;
    }
#ifdef WIDECHAR
    if (fd == -2)
        if (len) {
            wcstombs(state->path, (const wchar_t *)path, len + 1);
        } else {
            *(state->path) = 0;
        }
    else
#endif
        (void)snprintf(state->path, len + 1, "%s", (const char *)path);

    /* compute the flags for open() */
    oflag =
#ifdef O_LARGEFILE
        O_LARGEFILE |
#endif
#ifdef O_BINARY
        O_BINARY |
#endif
#ifdef O_CLOEXEC
        (cloexec ? O_CLOEXEC : 0) |
#endif
        (state->mode == GZ_READ ?
            O_RDONLY :
            (O_WRONLY | O_CREAT |
#ifdef O_EXCL
                (exclusive ? O_EXCL : 0) |
#endif
                (state->mode == GZ_WRITE ?
                    O_TRUNC :
                    O_APPEND)));

    /* open the file with the appropriate flags (or just use fd) */
    state->fd = fd > -1 ? fd : (
#if defined(_WIN32)
        fd == -2 ? _wopen((const wchar_t *)path, oflag, 0666) :
#elif __CYGWIN__
        fd == -2 ? open(state->path, oflag, 0666) :
#endif
        open((const char *)path, oflag, 0666));
    if (state->fd == -1) {
        free(state->path);
        zng_free(state);
        return NULL;
    }
    if (state->mode == GZ_APPEND) {
        LSEEK(state->fd, 0, SEEK_END);  /* so gzoffset() is correct */
        state->mode = GZ_WRITE;         /* simplify later checks */
    }

    /* save the current position for rewinding (only if reading) */
    if (state->mode == GZ_READ) {
        state->start = LSEEK(state->fd, 0, SEEK_CUR);
        if (state->start == -1) state->start = 0;
    }

    /* initialize stream */
    gz_reset(state);

    /* return stream */
    return (gzFile)state;
}
||||

/* -- see zlib.h -- */
gzFile Z_EXPORT PREFIX(gzopen)(const char *path, const char *mode) {
    return gz_open(path, -1, mode);
}

#ifdef ZLIB_COMPAT
gzFile Z_EXPORT PREFIX4(gzopen)(const char *path, const char *mode) {
    return gz_open(path, -1, mode);
}
#endif

/* -- see zlib.h -- */
gzFile Z_EXPORT PREFIX(gzdopen)(int fd, const char *mode) {
    char *path;         /* identifier for error messages */
    gzFile gz;

    if (fd == -1 || (path = (char *)malloc(7 + 3 * sizeof(int))) == NULL)
        return NULL;
    (void)snprintf(path, 7 + 3 * sizeof(int), "<fd:%d>", fd);  /* for debugging */
    gz = gz_open(path, fd, mode);
    free(path);
    return gz;
}

/* -- see zlib.h -- */
#ifdef WIDECHAR
gzFile Z_EXPORT PREFIX(gzopen_w)(const wchar_t *path, const char *mode) {
    return gz_open(path, -2, mode);
}
#endif
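
/* Editor's note -- illustrative usage sketch, not part of the zlib-ng
   sources. It shows how the mode string parsed by gz_open() above maps
   onto typical gzopen()/gzdopen() calls in a ZLIB_COMPAT build; the file
   name and descriptor are arbitrary and error handling is minimal. */
#if 0
#include "zlib.h"

static void gz_open_example(void) {
    /* "wb9F": write, binary, compression level 9, Z_FIXED strategy */
    gzFile out = gzopen("example.gz", "wb9F");
    if (out != NULL)
        gzclose(out);

    /* wrap an already-open descriptor; "rb" looks for a gzip header */
    gzFile in = gzdopen(0 /* fd */, "rb");
    if (in != NULL)
        gzclose(in);
}
#endif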

int Z_EXPORT PREFIX(gzclose)(gzFile file) {
#ifndef NO_GZCOMPRESS
    gz_state *state;

    if (file == NULL)
        return Z_STREAM_ERROR;
    state = (gz_state *)file;

    return state->mode == GZ_READ ? PREFIX(gzclose_r)(file) : PREFIX(gzclose_w)(file);
#else
    return PREFIX(gzclose_r)(file);
#endif
}

/* -- see zlib.h -- */
int Z_EXPORT PREFIX(gzbuffer)(gzFile file, unsigned size) {
    gz_state *state;

    /* get internal structure and check integrity */
    if (file == NULL)
        return -1;
    state = (gz_state *)file;
    if (state->mode != GZ_READ && state->mode != GZ_WRITE)
        return -1;

    /* make sure we haven't already allocated memory */
    if (state->size != 0)
        return -1;

    /* check and set requested size */
    if ((size << 1) < size)
        return -1;              /* need to be able to double it */
    if (size < 8)
        size = 8;               /* needed to behave well with flushing */
    state->want = size;
    return 0;
}
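
/* Editor's note -- illustrative sketch, not part of the zlib-ng sources.
   gzbuffer() only succeeds before the internal buffers are allocated,
   i.e. right after gzopen() and before the first read or write; the
   256 KiB size below is an arbitrary example value. */
#if 0
#include "zlib.h"

static void gz_buffer_example(void) {
    gzFile f = gzopen("example.gz", "rb");
    if (f == NULL)
        return;
    (void)gzbuffer(f, 256 * 1024);  /* larger buffer for bulk reads */
    /* ... reads happen here ... */
    gzclose(f);
}
#endif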

/* -- see zlib.h -- */
int Z_EXPORT PREFIX(gzrewind)(gzFile file) {
    gz_state *state;

    /* get internal structure */
    if (file == NULL)
        return -1;
    state = (gz_state *)file;

    /* check that we're reading and that there's no error */
    if (state->mode != GZ_READ || (state->err != Z_OK && state->err != Z_BUF_ERROR))
        return -1;

    /* back up and start over */
    if (LSEEK(state->fd, state->start, SEEK_SET) == -1)
        return -1;
    gz_reset(state);
    return 0;
}

/* -- see zlib.h -- */
z_off64_t Z_EXPORT PREFIX4(gzseek)(gzFile file, z_off64_t offset, int whence) {
    unsigned n;
    z_off64_t ret;
    gz_state *state;

    /* get internal structure and check integrity */
    if (file == NULL)
        return -1;
    state = (gz_state *)file;
    if (state->mode != GZ_READ && state->mode != GZ_WRITE)
        return -1;

    /* check that there's no error */
    if (state->err != Z_OK && state->err != Z_BUF_ERROR)
        return -1;

    /* can only seek from start or relative to current position */
    if (whence != SEEK_SET && whence != SEEK_CUR)
        return -1;

    /* normalize offset to a SEEK_CUR specification */
    if (whence == SEEK_SET)
        offset -= state->x.pos;
    else if (state->seek)
        offset += state->skip;
    state->seek = 0;

    /* if within raw area while reading, just go there */
    if (state->mode == GZ_READ && state->how == COPY && state->x.pos + offset >= 0) {
        ret = LSEEK(state->fd, offset - (z_off64_t)state->x.have, SEEK_CUR);
        if (ret == -1)
            return -1;
        state->x.have = 0;
        state->eof = 0;
        state->past = 0;
        state->seek = 0;
        gz_error(state, Z_OK, NULL);
        state->strm.avail_in = 0;
        state->x.pos += offset;
        return state->x.pos;
    }

    /* calculate skip amount, rewinding if needed for back seek when reading */
    if (offset < 0) {
        if (state->mode != GZ_READ)         /* writing -- can't go backwards */
            return -1;
        offset += state->x.pos;
        if (offset < 0)                     /* before start of file! */
            return -1;
        if (PREFIX(gzrewind)(file) == -1)   /* rewind, then skip to offset */
            return -1;
    }

    /* if reading, skip what's in output buffer (one less gzgetc() check) */
    if (state->mode == GZ_READ) {
        n = GT_OFF(state->x.have) || (z_off64_t)state->x.have > offset ? (unsigned)offset : state->x.have;
        state->x.have -= n;
        state->x.next += n;
        state->x.pos += n;
        offset -= n;
    }

    /* request skip (if not zero) */
    if (offset) {
        state->seek = 1;
        state->skip = offset;
    }
    return state->x.pos + offset;
}

/* -- see zlib.h -- */
#ifdef ZLIB_COMPAT
z_off_t Z_EXPORT PREFIX(gzseek)(gzFile file, z_off_t offset, int whence) {
    z_off64_t ret;

    ret = PREFIX4(gzseek)(file, (z_off64_t)offset, whence);
    return ret == (z_off_t)ret ? (z_off_t)ret : -1;
}
#endif

/* -- see zlib.h -- */
z_off64_t Z_EXPORT PREFIX4(gztell)(gzFile file) {
    gz_state *state;

    /* get internal structure and check integrity */
    if (file == NULL)
        return -1;
    state = (gz_state *)file;
    if (state->mode != GZ_READ && state->mode != GZ_WRITE)
        return -1;

    /* return position */
    return state->x.pos + (state->seek ? state->skip : 0);
}

/* -- see zlib.h -- */
#ifdef ZLIB_COMPAT
z_off_t Z_EXPORT PREFIX(gztell)(gzFile file) {
    z_off64_t ret;

    ret = PREFIX4(gztell)(file);
    return ret == (z_off_t)ret ? (z_off_t)ret : -1;
}
#endif
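
/* Editor's note -- illustrative sketch, not part of the zlib-ng sources.
   gzseek()/gztell() work on *uncompressed* offsets; a forward seek on a
   compressed stream is satisfied by decompress-and-discard, so the
   deferred "skip" recorded above only runs on the next read. */
#if 0
#include "zlib.h"

static void gz_seek_example(void) {
    gzFile f = gzopen("example.gz", "rb");
    if (f == NULL)
        return;
    if (gzseek(f, 1024, SEEK_SET) != -1) {  /* skip first 1 KiB of output */
        char byte;
        (void)gzread(f, &byte, 1);          /* pending skip performed here */
        (void)gztell(f);                    /* now reports 1025 */
    }
    gzclose(f);
}
#endif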

/* -- see zlib.h -- */
z_off64_t Z_EXPORT PREFIX4(gzoffset)(gzFile file) {
    z_off64_t offset;
    gz_state *state;

    /* get internal structure and check integrity */
    if (file == NULL)
        return -1;
    state = (gz_state *)file;
    if (state->mode != GZ_READ && state->mode != GZ_WRITE)
        return -1;

    /* compute and return effective offset in file */
    offset = LSEEK(state->fd, 0, SEEK_CUR);
    if (offset == -1)
        return -1;
    if (state->mode == GZ_READ)             /* reading */
        offset -= state->strm.avail_in;     /* don't count buffered input */
    return offset;
}

/* -- see zlib.h -- */
#ifdef ZLIB_COMPAT
z_off_t Z_EXPORT PREFIX(gzoffset)(gzFile file) {
    z_off64_t ret;

    ret = PREFIX4(gzoffset)(file);
    return ret == (z_off_t)ret ? (z_off_t)ret : -1;
}
#endif

/* -- see zlib.h -- */
int Z_EXPORT PREFIX(gzeof)(gzFile file) {
    gz_state *state;

    /* get internal structure and check integrity */
    if (file == NULL)
        return 0;
    state = (gz_state *)file;
    if (state->mode != GZ_READ && state->mode != GZ_WRITE)
        return 0;

    /* return end-of-file state */
    return state->mode == GZ_READ ? state->past : 0;
}

/* -- see zlib.h -- */
const char * Z_EXPORT PREFIX(gzerror)(gzFile file, int *errnum) {
    gz_state *state;

    /* get internal structure and check integrity */
    if (file == NULL)
        return NULL;
    state = (gz_state *)file;
    if (state->mode != GZ_READ && state->mode != GZ_WRITE)
        return NULL;

    /* return error information */
    if (errnum != NULL)
        *errnum = state->err;
    return state->err == Z_MEM_ERROR ? "out of memory" : (state->msg == NULL ? "" : state->msg);
}
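
/* Editor's note -- illustrative sketch, not part of the zlib-ng sources.
   The usual pattern after a short read: distinguish end-of-file from a
   real error with gzeof()/gzerror() before giving up. */
#if 0
#include <stdio.h>
#include "zlib.h"

static void gz_error_example(gzFile f) {
    char buf[4096];
    int n = gzread(f, buf, sizeof(buf));
    if (n < (int)sizeof(buf) && !gzeof(f)) {
        int errnum;
        const char *msg = gzerror(f, &errnum);
        if (errnum != Z_OK && errnum != Z_BUF_ERROR)
            fprintf(stderr, "gzread failed: %s\n", msg);
    }
}
#endif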

/* -- see zlib.h -- */
void Z_EXPORT PREFIX(gzclearerr)(gzFile file) {
    gz_state *state;

    /* get internal structure and check integrity */
    if (file == NULL)
        return;
    state = (gz_state *)file;
    if (state->mode != GZ_READ && state->mode != GZ_WRITE)
        return;

    /* clear error and end-of-file */
    if (state->mode == GZ_READ) {
        state->eof = 0;
        state->past = 0;
    }
    gz_error(state, Z_OK, NULL);
}

/* Create an error message in allocated memory and set state->err and
   state->msg accordingly. Free any previous error message already there. Do
   not try to free or allocate space if the error is Z_MEM_ERROR (out of
   memory). Simply save the error message as a static string. If there is an
   allocation failure constructing the error message, then convert the error to
   out of memory. */
void Z_INTERNAL gz_error(gz_state *state, int err, const char *msg) {
    /* free previously allocated message and clear */
    if (state->msg != NULL) {
        if (state->err != Z_MEM_ERROR)
            free(state->msg);
        state->msg = NULL;
    }

    /* if fatal, set state->x.have to 0 so that the gzgetc() macro fails */
    if (err != Z_OK && err != Z_BUF_ERROR)
        state->x.have = 0;

    /* set error code, and if no message, then done */
    state->err = err;
    if (msg == NULL)
        return;

    /* for an out of memory error, return literal string when requested */
    if (err == Z_MEM_ERROR)
        return;

    /* construct error message with path */
    if ((state->msg = (char *)malloc(strlen(state->path) + strlen(msg) + 3)) == NULL) {
        state->err = Z_MEM_ERROR;
        return;
    }
    (void)snprintf(state->msg, strlen(state->path) + strlen(msg) + 3, "%s%s%s", state->path, ": ", msg);
}
@ -0,0 +1,606 @@
/* gzread.c -- zlib functions for reading gzip files
 * Copyright (C) 2004-2017 Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "zutil_p.h"
#include "gzguts.h"

/* Local functions */
static int gz_load(gz_state *, unsigned char *, unsigned, unsigned *);
static int gz_avail(gz_state *);
static int gz_look(gz_state *);
static int gz_decomp(gz_state *);
static int gz_fetch(gz_state *);
static int gz_skip(gz_state *, z_off64_t);
static size_t gz_read(gz_state *, void *, size_t);

/* Use read() to load a buffer -- return -1 on error, otherwise 0.  Read from
   state->fd, and update state->eof, state->err, and state->msg as appropriate.
   This function needs to loop on read(), since read() is not guaranteed to
   read the number of bytes requested, depending on the type of descriptor. */
static int gz_load(gz_state *state, unsigned char *buf, unsigned len, unsigned *have) {
    ssize_t ret;

    *have = 0;
    do {
        ret = read(state->fd, buf + *have, len - *have);
        if (ret <= 0)
            break;
        *have += (unsigned)ret;
    } while (*have < len);
    if (ret < 0) {
        gz_error(state, Z_ERRNO, zstrerror());
        return -1;
    }
    if (ret == 0)
        state->eof = 1;
    return 0;
}

/* Load up input buffer and set eof flag if last data loaded -- return -1 on
   error, 0 otherwise.  Note that the eof flag is set when the end of the input
   file is reached, even though there may be unused data in the buffer.  Once
   that data has been used, no more attempts will be made to read the file.
   If strm->avail_in != 0, then the current data is moved to the beginning of
   the input buffer, and then the remainder of the buffer is loaded with the
   available data from the input file. */
static int gz_avail(gz_state *state) {
    unsigned got;
    PREFIX3(stream) *strm = &(state->strm);

    if (state->err != Z_OK && state->err != Z_BUF_ERROR)
        return -1;
    if (state->eof == 0) {
        if (strm->avail_in) {       /* copy what's there to the start */
            unsigned char *p = state->in;
            unsigned const char *q = strm->next_in;
            unsigned n = strm->avail_in;
            do {
                *p++ = *q++;
            } while (--n);
        }
        if (gz_load(state, state->in + strm->avail_in, state->size - strm->avail_in, &got) == -1)
            return -1;
        strm->avail_in += got;
        strm->next_in = state->in;
    }
    return 0;
}

/* Look for gzip header, set up for inflate or copy.  state->x.have must be 0.
   If this is the first time in, allocate required memory.  state->how will be
   left unchanged if there is no more input data available, will be set to COPY
   if there is no gzip header and direct copying will be performed, or it will
   be set to GZIP for decompression.  If direct copying, then leftover input
   data from the input buffer will be copied to the output buffer.  In that
   case, all further file reads will be directly to either the output buffer or
   a user buffer.  If decompressing, the inflate state will be initialized.
   gz_look() will return 0 on success or -1 on failure. */
static int gz_look(gz_state *state) {
    PREFIX3(stream) *strm = &(state->strm);

    /* allocate read buffers and inflate memory */
    if (state->size == 0) {
        /* allocate buffers */
        state->in = (unsigned char *)zng_alloc(state->want);
        state->out = (unsigned char *)zng_alloc(state->want << 1);
        if (state->in == NULL || state->out == NULL) {
            zng_free(state->out);
            zng_free(state->in);
            gz_error(state, Z_MEM_ERROR, "out of memory");
            return -1;
        }
        state->size = state->want;

        /* allocate inflate memory */
        state->strm.zalloc = NULL;
        state->strm.zfree = NULL;
        state->strm.opaque = NULL;
        state->strm.avail_in = 0;
        state->strm.next_in = NULL;
        if (PREFIX(inflateInit2)(&(state->strm), MAX_WBITS + 16) != Z_OK) {    /* gunzip */
            zng_free(state->out);
            zng_free(state->in);
            state->size = 0;
            gz_error(state, Z_MEM_ERROR, "out of memory");
            return -1;
        }
    }

    /* get at least the magic bytes in the input buffer */
    if (strm->avail_in < 2) {
        if (gz_avail(state) == -1)
            return -1;
        if (strm->avail_in == 0)
            return 0;
    }

    /* look for gzip magic bytes -- if there, do gzip decoding (note: there is
       a logical dilemma here when considering the case of a partially written
       gzip file, to wit, if a single 31 byte is written, then we cannot tell
       whether this is a single-byte file, or just a partially written gzip
       file -- for here we assume that if a gzip file is being written, then
       the header will be written in a single operation, so that reading a
       single byte is sufficient indication that it is not a gzip file) */
    if (strm->avail_in > 1 &&
            strm->next_in[0] == 31 && strm->next_in[1] == 139) {
        PREFIX(inflateReset)(strm);
        state->how = GZIP;
        state->direct = 0;
        return 0;
    }

    /* no gzip header -- if we were decoding gzip before, then this is trailing
       garbage.  Ignore the trailing garbage and finish. */
    if (state->direct == 0) {
        strm->avail_in = 0;
        state->eof = 1;
        state->x.have = 0;
        return 0;
    }

    /* doing raw i/o, copy any leftover input to output -- this assumes that
       the output buffer is larger than the input buffer, which also assures
       space for gzungetc() */
    state->x.next = state->out;
    memcpy(state->x.next, strm->next_in, strm->avail_in);
    state->x.have = strm->avail_in;
    strm->avail_in = 0;
    state->how = COPY;
    state->direct = 1;
    return 0;
}

/* Decompress from input to the provided next_out and avail_out in the state.
   On return, state->x.have and state->x.next point to the just decompressed
   data.  If the gzip stream completes, state->how is reset to LOOK to look for
   the next gzip stream or raw data, once state->x.have is depleted.  Returns 0
   on success, -1 on failure. */
static int gz_decomp(gz_state *state) {
    int ret = Z_OK;
    unsigned had;
    PREFIX3(stream) *strm = &(state->strm);

    /* fill output buffer up to end of deflate stream */
    had = strm->avail_out;
    do {
        /* get more input for inflate() */
        if (strm->avail_in == 0 && gz_avail(state) == -1)
            return -1;
        if (strm->avail_in == 0) {
            gz_error(state, Z_BUF_ERROR, "unexpected end of file");
            break;
        }

        /* decompress and handle errors */
        ret = PREFIX(inflate)(strm, Z_NO_FLUSH);
        if (ret == Z_STREAM_ERROR || ret == Z_NEED_DICT) {
            gz_error(state, Z_STREAM_ERROR, "internal error: inflate stream corrupt");
            return -1;
        }
        if (ret == Z_MEM_ERROR) {
            gz_error(state, Z_MEM_ERROR, "out of memory");
            return -1;
        }
        if (ret == Z_DATA_ERROR) {              /* deflate stream invalid */
            gz_error(state, Z_DATA_ERROR, strm->msg == NULL ? "compressed data error" : strm->msg);
            return -1;
        }
    } while (strm->avail_out && ret != Z_STREAM_END);

    /* update available output */
    state->x.have = had - strm->avail_out;
    state->x.next = strm->next_out - state->x.have;

    /* if the gzip stream completed successfully, look for another */
    if (ret == Z_STREAM_END)
        state->how = LOOK;

    /* good decompression */
    return 0;
}

/* Fetch data and put it in the output buffer.  Assumes state->x.have is 0.
   Data is either copied from the input file or decompressed from the input
   file depending on state->how.  If state->how is LOOK, then a gzip header is
   looked for to determine whether to copy or decompress.  Returns -1 on error,
   otherwise 0.  gz_fetch() will leave state->how as COPY or GZIP unless the
   end of the input file has been reached and all data has been processed. */
static int gz_fetch(gz_state *state) {
    PREFIX3(stream) *strm = &(state->strm);

    do {
        switch (state->how) {
        case LOOK:      /* -> LOOK, COPY (only if never GZIP), or GZIP */
            if (gz_look(state) == -1)
                return -1;
            if (state->how == LOOK)
                return 0;
            break;
        case COPY:      /* -> COPY */
            if (gz_load(state, state->out, state->size << 1, &(state->x.have)) == -1)
                return -1;
            state->x.next = state->out;
            return 0;
        case GZIP:      /* -> GZIP or LOOK (if end of gzip stream) */
            strm->avail_out = state->size << 1;
            strm->next_out = state->out;
            if (gz_decomp(state) == -1)
                return -1;
        }
    } while (state->x.have == 0 && (!state->eof || strm->avail_in));
    return 0;
}

/* Skip len uncompressed bytes of output.  Return -1 on error, 0 on success. */
static int gz_skip(gz_state *state, z_off64_t len) {
    unsigned n;

    /* skip over len bytes or reach end-of-file, whichever comes first */
    while (len)
        /* skip over whatever is in output buffer */
        if (state->x.have) {
            n = GT_OFF(state->x.have) || (z_off64_t)state->x.have > len ?
                (unsigned)len : state->x.have;
            state->x.have -= n;
            state->x.next += n;
            state->x.pos += n;
            len -= n;
        } else if (state->eof && state->strm.avail_in == 0) {
            /* output buffer empty -- return if we're at the end of the input */
            break;
        } else {
            /* need more data to skip -- load up output buffer */
            /* get more output, looking for header if required */
            if (gz_fetch(state) == -1)
                return -1;
        }
    return 0;
}

/* Read len bytes into buf from file, or less than len up to the end of the
   input.  Return the number of bytes read.  If zero is returned, either the
   end of file was reached, or there was an error.  state->err must be
   consulted in that case to determine which. */
static size_t gz_read(gz_state *state, void *buf, size_t len) {
    size_t got;
    unsigned n;

    /* if len is zero, avoid unnecessary operations */
    if (len == 0)
        return 0;

    /* process a skip request */
    if (state->seek) {
        state->seek = 0;
        if (gz_skip(state, state->skip) == -1)
            return 0;
    }

    /* get len bytes to buf, or less than len if at the end */
    got = 0;
    do {
        /* set n to the maximum amount of len that fits in an unsigned int */
        n = (unsigned)-1;
        if (n > len)
            n = (unsigned)len;

        /* first just try copying data from the output buffer */
        if (state->x.have) {
            if (state->x.have < n)
                n = state->x.have;
            memcpy(buf, state->x.next, n);
            state->x.next += n;
            state->x.have -= n;
        }

        /* output buffer empty -- return if we're at the end of the input */
        else if (state->eof && state->strm.avail_in == 0) {
            state->past = 1;        /* tried to read past end */
            break;
        }

        /* need output data -- for small len or new stream load up our output
           buffer */
        else if (state->how == LOOK || n < (state->size << 1)) {
            /* get more output, looking for header if required */
            if (gz_fetch(state) == -1)
                return 0;
            continue;       /* no progress yet -- go back to copy above */
            /* the copy above assures that we will leave with space in the
               output buffer, allowing at least one gzungetc() to succeed */
        }

        /* large len -- read directly into user buffer */
        else if (state->how == COPY) {      /* read directly */
            if (gz_load(state, (unsigned char *)buf, n, &n) == -1)
                return 0;
        }

        /* large len -- decompress directly into user buffer */
        else {  /* state->how == GZIP */
            state->strm.avail_out = n;
            state->strm.next_out = (unsigned char *)buf;
            if (gz_decomp(state) == -1)
                return 0;
            n = state->x.have;
            state->x.have = 0;
        }

        /* update progress */
        len -= n;
        buf = (char *)buf + n;
        got += n;
        state->x.pos += n;
    } while (len);

    /* return number of bytes read into user buffer */
    return got;
}

/* -- see zlib.h -- */
int Z_EXPORT PREFIX(gzread)(gzFile file, void *buf, unsigned len) {
    gz_state *state;

    /* get internal structure */
    if (file == NULL)
        return -1;
    state = (gz_state *)file;

    /* check that we're reading and that there's no (serious) error */
    if (state->mode != GZ_READ ||
            (state->err != Z_OK && state->err != Z_BUF_ERROR))
        return -1;

    /* since an int is returned, make sure len fits in one, otherwise return
       with an error (this avoids a flaw in the interface) */
    if ((int)len < 0) {
        gz_error(state, Z_STREAM_ERROR, "request does not fit in an int");
        return -1;
    }

    /* read len or fewer bytes to buf */
    len = (unsigned)gz_read(state, buf, len);

    /* check for an error */
    if (len == 0 && state->err != Z_OK && state->err != Z_BUF_ERROR)
        return -1;

    /* return the number of bytes read (this is assured to fit in an int) */
    return (int)len;
}
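
/* Editor's note -- illustrative sketch, not part of the zlib-ng sources.
   The canonical read loop: gzread() may return fewer bytes than requested,
   so loop until 0 (end of file) or -1 (error). */
#if 0
#include <stdio.h>
#include "zlib.h"

static void gz_read_example(void) {
    char buf[8192];
    int n;
    gzFile f = gzopen("example.gz", "rb");
    if (f == NULL)
        return;
    while ((n = gzread(f, buf, sizeof(buf))) > 0)
        fwrite(buf, 1, (size_t)n, stdout);  /* consume decompressed bytes */
    if (n < 0)
        fprintf(stderr, "read error\n");
    gzclose(f);
}
#endif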

/* -- see zlib.h -- */
size_t Z_EXPORT PREFIX(gzfread)(void *buf, size_t size, size_t nitems, gzFile file) {
    size_t len;
    gz_state *state;

    /* Exit early if size is zero, also prevents potential division by zero */
    if (size == 0)
        return 0;

    /* get internal structure */
    if (file == NULL)
        return 0;
    state = (gz_state *)file;

    /* check that we're reading and that there's no (serious) error */
    if (state->mode != GZ_READ ||
            (state->err != Z_OK && state->err != Z_BUF_ERROR))
        return 0;

    /* compute bytes to read -- error on overflow */
    if (size && SIZE_MAX / size < nitems) {
        gz_error(state, Z_STREAM_ERROR, "request does not fit in a size_t");
        return 0;
    }
    len = nitems * size;

    /* read len or fewer bytes to buf, return the number of full items read */
    return len ? gz_read(state, buf, len) / size : 0;
}
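
/* Editor's note -- illustrative sketch, not part of the zlib-ng sources.
   Like fread(), gzfread() returns the number of *complete* items read; a
   trailing partial item is still stored in buf but not counted, so a short
   count alone does not say how many bytes arrived. The record layout here
   is an arbitrary example. */
#if 0
#include "zlib.h"

struct record { int id; double value; };

static size_t gz_fread_example(gzFile f, struct record *recs, size_t count) {
    size_t got = gzfread(recs, sizeof(struct record), count, f);
    /* got < count means EOF or error -- check gzeof()/gzerror() to tell */
    return got;
}
#endif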

/* -- see zlib.h -- */
#undef @ZLIB_SYMBOL_PREFIX@gzgetc
#undef @ZLIB_SYMBOL_PREFIX@zng_gzgetc
int Z_EXPORT PREFIX(gzgetc)(gzFile file) {
    unsigned char buf[1];
    gz_state *state;

    /* get internal structure */
    if (file == NULL)
        return -1;
    state = (gz_state *)file;

    /* check that we're reading and that there's no (serious) error */
    if (state->mode != GZ_READ || (state->err != Z_OK && state->err != Z_BUF_ERROR))
        return -1;

    /* try output buffer (no need to check for skip request) */
    if (state->x.have) {
        state->x.have--;
        state->x.pos++;
        return *(state->x.next)++;
    }

    /* nothing there -- try gz_read() */
    return gz_read(state, buf, 1) < 1 ? -1 : buf[0];
}

#ifdef ZLIB_COMPAT
int Z_EXPORT PREFIX(gzgetc_)(gzFile file) {
    return PREFIX(gzgetc)(file);
}
#endif

/* -- see zlib.h -- */
int Z_EXPORT PREFIX(gzungetc)(int c, gzFile file) {
    gz_state *state;

    /* get internal structure */
    if (file == NULL)
        return -1;
    state = (gz_state *)file;

    /* in case this was just opened, set up the input buffer */
    if (state->mode == GZ_READ && state->how == LOOK && state->x.have == 0)
        (void)gz_look(state);

    /* check that we're reading and that there's no (serious) error */
    if (state->mode != GZ_READ || (state->err != Z_OK && state->err != Z_BUF_ERROR))
        return -1;

    /* process a skip request */
    if (state->seek) {
        state->seek = 0;
        if (gz_skip(state, state->skip) == -1)
            return -1;
    }

    /* can't push EOF */
    if (c < 0)
        return -1;

    /* if output buffer empty, put byte at end (allows more pushing) */
    if (state->x.have == 0) {
        state->x.have = 1;
        state->x.next = state->out + (state->size << 1) - 1;
        state->x.next[0] = (unsigned char)c;
        state->x.pos--;
        state->past = 0;
        return c;
    }

    /* if no room, give up (must have already done a gzungetc()) */
    if (state->x.have == (state->size << 1)) {
        gz_error(state, Z_DATA_ERROR, "out of room to push characters");
        return -1;
    }

    /* slide output data if needed and insert byte before existing data */
    if (state->x.next == state->out) {
        unsigned char *src = state->out + state->x.have;
        unsigned char *dest = state->out + (state->size << 1);
        while (src > state->out)
            *--dest = *--src;
        state->x.next = dest;
    }
    state->x.have++;
    state->x.next--;
    state->x.next[0] = (unsigned char)c;
    state->x.pos--;
    state->past = 0;
    return c;
}
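
/* Editor's note -- illustrative sketch, not part of the zlib-ng sources.
   One-character lookahead with gzgetc()/gzungetc(), e.g. to peek at a
   record tag before dispatching; the pushed-back byte is returned by the
   next read. */
#if 0
#include "zlib.h"

static int gz_peek_example(gzFile f) {
    int c = gzgetc(f);
    if (c != -1)
        (void)gzungetc(c, f);   /* next gzgetc()/gzread() sees c again */
    return c;                   /* -1 on EOF or error */
}
#endif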

/* -- see zlib.h -- */
char * Z_EXPORT PREFIX(gzgets)(gzFile file, char *buf, int len) {
    unsigned left, n;
    char *str;
    unsigned char *eol;
    gz_state *state;

    /* check parameters and get internal structure */
    if (file == NULL || buf == NULL || len < 1)
        return NULL;
    state = (gz_state *)file;

    /* check that we're reading and that there's no (serious) error */
    if (state->mode != GZ_READ || (state->err != Z_OK && state->err != Z_BUF_ERROR))
        return NULL;

    /* process a skip request */
    if (state->seek) {
        state->seek = 0;
        if (gz_skip(state, state->skip) == -1)
            return NULL;
    }

    /* copy output bytes up to new line or len - 1, whichever comes first --
       append a terminating zero to the string (we don't check for a zero in
       the contents, let the user worry about that) */
    str = buf;
    left = (unsigned)len - 1;
    if (left) {
        do {
            /* assure that something is in the output buffer */
            if (state->x.have == 0 && gz_fetch(state) == -1)
                return NULL;            /* error */
            if (state->x.have == 0) {   /* end of file */
                state->past = 1;        /* read past end */
                break;                  /* return what we have */
            }

            /* look for end-of-line in current output buffer */
            n = state->x.have > left ? left : state->x.have;
            eol = (unsigned char *)memchr(state->x.next, '\n', n);
            if (eol != NULL)
                n = (unsigned)(eol - state->x.next) + 1;

            /* copy through end-of-line, or remainder if not found */
            memcpy(buf, state->x.next, n);
            state->x.have -= n;
            state->x.next += n;
            state->x.pos += n;
            left -= n;
            buf += n;
        } while (left && eol == NULL);
    }

    /* return terminated string, or if nothing, end of file */
    if (buf == str)
        return NULL;
    buf[0] = 0;
    return str;
}
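
/* Editor's note -- illustrative sketch, not part of the zlib-ng sources.
   Line-by-line reading of a gzipped text file; as with fgets(), the
   newline is kept in the buffer when it fits, and the file name is an
   arbitrary example. */
#if 0
#include <stdio.h>
#include "zlib.h"

static void gz_gets_example(void) {
    char line[1024];
    gzFile f = gzopen("example.txt.gz", "rb");
    if (f == NULL)
        return;
    while (gzgets(f, line, (int)sizeof(line)) != NULL)
        fputs(line, stdout);    /* line is always zero-terminated */
    gzclose(f);
}
#endif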

/* -- see zlib.h -- */
int Z_EXPORT PREFIX(gzdirect)(gzFile file) {
    gz_state *state;

    /* get internal structure */
    if (file == NULL)
        return 0;
    state = (gz_state *)file;

    /* if the state is not known, but we can find out, then do so (this is
       mainly for right after a gzopen() or gzdopen()) */
    if (state->mode == GZ_READ && state->how == LOOK && state->x.have == 0)
        (void)gz_look(state);

    /* return 1 if transparent, 0 if processing a gzip stream */
    return state->direct;
}

/* -- see zlib.h -- */
int Z_EXPORT PREFIX(gzclose_r)(gzFile file) {
    int ret, err;
    gz_state *state;

    /* get internal structure */
    if (file == NULL)
        return Z_STREAM_ERROR;
    state = (gz_state *)file;

    /* check that we're reading */
    if (state->mode != GZ_READ)
        return Z_STREAM_ERROR;

    /* free memory and close file */
    if (state->size) {
        PREFIX(inflateEnd)(&(state->strm));
        zng_free(state->out);
        zng_free(state->in);
    }
    err = state->err == Z_BUF_ERROR ? Z_BUF_ERROR : Z_OK;
    gz_error(state, Z_OK, NULL);
    free(state->path);
    ret = close(state->fd);
    zng_free(state);
    return ret ? Z_ERRNO : err;
}