Release the CRC library

This implementation can take advantage of hardware acceleration available
on common CPUs when built with GCC and Clang. A future update may enable
this on MSVC as well.

PiperOrigin-RevId: 487327024
Change-Id: I99a8f1bcbdf25297e776537e23bd0a902e0818a1
pull/1272/merge
Derek Mauro, committed by Copybara-Service
parent 8cfc1500f8
commit 1687dbf814
Changed files (lines changed):
1. CMake/AbseilDll.cmake (23)
2. README.md (2)
3. absl/CMakeLists.txt (1)
4. absl/crc/BUILD.bazel (174)
5. absl/crc/CMakeLists.txt (146)
6. absl/crc/crc32c.cc (100)
7. absl/crc/crc32c.h (176)
8. absl/crc/crc32c_benchmark.cc (162)
9. absl/crc/crc32c_test.cc (186)
10. absl/crc/internal/cpu_detect.cc (247)
11. absl/crc/internal/cpu_detect.h (59)
12. absl/crc/internal/crc.cc (468)
13. absl/crc/internal/crc.h (91)
14. absl/crc/internal/crc32_x86_arm_combined_simd.h (260)
15. absl/crc/internal/crc32c.h (39)
16. absl/crc/internal/crc32c_inline.h (72)
17. absl/crc/internal/crc_internal.h (177)
18. absl/crc/internal/crc_memcpy.h (112)
19. absl/crc/internal/crc_memcpy_fallback.cc (75)
20. absl/crc/internal/crc_memcpy_test.cc (169)
21. absl/crc/internal/crc_memcpy_x86_64.cc (435)
22. absl/crc/internal/crc_non_temporal_memcpy.cc (93)
23. absl/crc/internal/crc_x86_arm_combined.cc (691)
24. absl/crc/internal/non_temporal_arm_intrinsics.h (77)
25. absl/crc/internal/non_temporal_memcpy.h (172)
26. absl/crc/internal/non_temporal_memcpy_test.cc (88)

@@ -91,6 +91,23 @@ set(ABSL_INTERNAL_DLL_FILES
"container/internal/tracked.h"
"container/node_hash_map.h"
"container/node_hash_set.h"
"crc/crc32c.cc"
"crc/crc32c.h"
"crc/internal/cpu_detect.cc"
"crc/internal/cpu_detect.h"
"crc/internal/crc32c.h"
"crc/internal/crc32c_inline.h"
"crc/internal/crc32_x86_arm_combined_simd.h"
"crc/internal/crc.cc"
"crc/internal/crc.h"
"crc/internal/crc_internal.h"
"crc/internal/crc_x86_arm_combined.cc"
"crc/internal/crc_memcpy_fallback.cc"
"crc/internal/crc_memcpy.h"
"crc/internal/crc_memcpy_x86_64.cc"
"crc/internal/crc_non_temporal_memcpy.cc"
"crc/internal/crc_x86_arm_combined.cc"
"crc/internal/non_temporal_arm_intrinsics.h"
"crc/internal/non_temporal_memcpy.h"
"debugging/failure_signal_handler.cc"
"debugging/failure_signal_handler.h"
"debugging/leak_check.h"
@@ -386,6 +404,9 @@ set(ABSL_INTERNAL_DLL_TARGETS
"cord"
"core_headers"
"counting_allocator"
"crc_cpu_detect",
"crc_internal",
"crc32c",
"debugging"
"debugging_internal"
"demangle_internal"
@@ -418,6 +439,8 @@ set(ABSL_INTERNAL_DLL_TARGETS
"node_hash_map"
"node_hash_set"
"node_slot_policy"
"non_temporal_arm_intrinsics",
"non_temporal_memcpy",
"numeric"
"optional"
"periodic_sampler"

@@ -80,6 +80,8 @@ Abseil contains the following C++ library components:
* [`container`](absl/container/)
<br /> The `container` library contains additional STL-style containers,
including Abseil's unordered "Swiss table" containers.
* [`crc`](absl/crc/)
<br /> The `crc` library contains code for computing error-detecting cyclic redundancy checks on data.
* [`debugging`](absl/debugging/)
<br /> The `debugging` library contains code useful for enabling leak
checks, and stacktrace and symbolization utilities.

@@ -18,6 +18,7 @@ add_subdirectory(base)
add_subdirectory(algorithm)
add_subdirectory(cleanup)
add_subdirectory(container)
add_subdirectory(crc)
add_subdirectory(debugging)
add_subdirectory(flags)
add_subdirectory(functional)

@@ -0,0 +1,174 @@
# Copyright 2022 The Abseil Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
load(
"//absl:copts/configure_copts.bzl",
"ABSL_DEFAULT_COPTS",
"ABSL_DEFAULT_LINKOPTS",
"ABSL_TEST_COPTS",
)
package(default_visibility = ["//visibility:private"])
licenses(["notice"])
cc_library(
name = "cpu_detect",
srcs = [
"internal/cpu_detect.cc",
],
hdrs = ["internal/cpu_detect.h"],
copts = ABSL_DEFAULT_COPTS,
linkopts = ABSL_DEFAULT_LINKOPTS,
visibility = ["//visibility:private"],
deps = [
"//absl/base",
"//absl/base:config",
],
)
cc_library(
name = "crc_internal",
srcs = [
"internal/crc.cc",
"internal/crc_internal.h",
"internal/crc_x86_arm_combined.cc",
],
hdrs = [
"internal/crc.h",
"internal/crc32_x86_arm_combined_simd.h",
],
copts = ABSL_DEFAULT_COPTS,
linkopts = ABSL_DEFAULT_LINKOPTS,
visibility = ["//visibility:private"],
deps = [
":cpu_detect",
"//absl/base",
"//absl/base:config",
"//absl/base:core_headers",
"//absl/base:dynamic_annotations",
"//absl/base:endian",
"//absl/base:prefetch",
"//absl/base:raw_logging_internal",
"//absl/memory",
"//absl/numeric:bits",
],
)
cc_library(
name = "crc32c",
srcs = [
"crc32c.cc",
"internal/crc32c_inline.h",
"internal/crc_memcpy_fallback.cc",
"internal/crc_memcpy_x86_64.cc",
"internal/crc_non_temporal_memcpy.cc",
],
hdrs = [
"crc32c.h",
"internal/crc32c.h",
"internal/crc_memcpy.h",
],
copts = ABSL_DEFAULT_COPTS,
linkopts = ABSL_DEFAULT_LINKOPTS,
visibility = ["//visibility:public"],
deps = [
":cpu_detect",
":crc_internal",
":non_temporal_memcpy",
"//absl/base:config",
"//absl/base:core_headers",
"//absl/base:dynamic_annotations",
"//absl/base:endian",
"//absl/base:prefetch",
"//absl/strings",
],
)
cc_test(
name = "crc32c_test",
srcs = ["crc32c_test.cc"],
copts = ABSL_TEST_COPTS,
linkopts = ABSL_DEFAULT_LINKOPTS,
visibility = ["//visibility:private"],
deps = [
":crc32c",
"//absl/strings",
"@com_google_googletest//:gtest_main",
],
)
cc_library(
name = "non_temporal_arm_intrinsics",
hdrs = ["internal/non_temporal_arm_intrinsics.h"],
copts = ABSL_DEFAULT_COPTS,
linkopts = ABSL_DEFAULT_LINKOPTS,
visibility = ["//visibility:private"],
)
cc_library(
name = "non_temporal_memcpy",
hdrs = ["internal/non_temporal_memcpy.h"],
copts = ABSL_DEFAULT_COPTS,
linkopts = ABSL_DEFAULT_LINKOPTS,
visibility = ["//visibility:private"],
deps = [
":non_temporal_arm_intrinsics",
"//absl/base:config",
"//absl/base:core_headers",
],
)
cc_test(
name = "crc_memcpy_test",
size = "large",
srcs = ["internal/crc_memcpy_test.cc"],
shard_count = 3,
visibility = ["//visibility:private"],
deps = [
":crc32c",
"//absl/memory",
"//absl/random",
"//absl/random:distributions",
"//absl/strings",
"@com_google_googletest//:gtest_main",
],
)
cc_test(
name = "non_temporal_memcpy_test",
srcs = ["internal/non_temporal_memcpy_test.cc"],
visibility = ["//visibility:private"],
deps = [
":non_temporal_memcpy",
"@com_google_googletest//:gtest_main",
],
)
cc_binary(
name = "crc32c_benchmark",
testonly = 1,
srcs = ["crc32c_benchmark.cc"],
copts = ABSL_TEST_COPTS,
linkopts = ABSL_DEFAULT_LINKOPTS,
tags = [
"benchmark",
],
visibility = ["//visibility:private"],
deps = [
":crc32c",
"//absl/memory",
"@com_github_google_benchmark//:benchmark_main",
],
)

@@ -0,0 +1,146 @@
# Copyright 2022 The Abseil Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Internal-only target, do not depend on directly.
absl_cc_library(
NAME
crc_cpu_detect
HDRS
"internal/cpu_detect.h"
SRCS
"internal/cpu_detect.cc"
COPTS
${ABSL_DEFAULT_COPTS}
DEPS
absl::base
absl::config
)
# Internal-only target, do not depend on directly.
absl_cc_library(
NAME
crc_internal
HDRS
"internal/crc.h"
"internal/crc32_x86_arm_combined_simd.h"
SRCS
"internal/crc.cc"
"internal/crc_internal.h"
"internal/crc_x86_arm_combined.cc"
COPTS
${ABSL_DEFAULT_COPTS}
DEPS
absl::crc_cpu_detect
absl::base
absl::config
absl::core_headers
absl::dynamic_annotations
absl::endian
absl::prefetch
absl::raw_logging_internal
absl::memory
absl::bits
)
absl_cc_library(
NAME
crc32c
HDRS
"crc32c.h"
"internal/crc32c.h"
"internal/crc_memcpy.h"
SRCS
"crc32c.cc"
"internal/crc32c_inline.h"
"internal/crc_memcpy_fallback.cc"
"internal/crc_memcpy_x86_64.cc"
"internal/crc_non_temporal_memcpy.cc"
COPTS
${ABSL_DEFAULT_COPTS}
DEPS
absl::crc_cpu_detect
absl::crc_internal
absl::non_temporal_memcpy
absl::config
absl::core_headers
absl::dynamic_annotations
absl::endian
absl::prefetch
absl::strings
)
absl_cc_test(
NAME
crc32c_test
SRCS
"crc32c_test.cc"
COPTS
${ABSL_DEFAULT_COPTS}
DEPS
absl::crc32c
absl::strings
GTest::gtest_main
)
# Internal-only target, do not depend on directly.
absl_cc_library(
NAME
non_temporal_arm_intrinsics
HDRS
"internal/non_temporal_arm_intrinsics.h"
COPTS
${ABSL_DEFAULT_COPTS}
)
# Internal-only target, do not depend on directly.
absl_cc_library(
NAME
non_temporal_memcpy
HDRS
"internal/non_temporal_memcpy.h"
COPTS
${ABSL_DEFAULT_COPTS}
DEPS
absl::non_temporal_arm_intrinsics
absl::config
absl::core_headers
)
absl_cc_test(
NAME
crc_memcpy_test
SRCS
"internal/crc_memcpy_test.cc"
COPTS
${ABSL_DEFAULT_COPTS}
DEPS
absl::crc32c
absl::memory
absl::random_random
absl::random_distributions
absl::strings
GTest::gtest_main
)
absl_cc_test(
NAME
non_temporal_memcpy_test
SRCS
"internal/non_temporal_memcpy_test.cc"
COPTS
${ABSL_DEFAULT_COPTS}
DEPS
absl::non_temporal_memcpy
GTest::gtest_main
)

@@ -0,0 +1,100 @@
// Copyright 2022 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "absl/crc/crc32c.h"
#include <cstdint>
#include "absl/crc/internal/crc.h"
#include "absl/crc/internal/crc32c.h"
#include "absl/crc/internal/crc_memcpy.h"
#include "absl/strings/string_view.h"
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace {
const crc_internal::CRC* CrcEngine() {
static const crc_internal::CRC* engine = crc_internal::CRC::Crc32c();
return engine;
}
constexpr uint32_t kCRC32Xor = 0xffffffffU;
} // namespace
namespace crc_internal {
crc32c_t UnextendCrc32cByZeroes(crc32c_t initial_crc, size_t length) {
uint32_t crc = static_cast<uint32_t>(initial_crc) ^ kCRC32Xor;
CrcEngine()->UnextendByZeroes(&crc, length);
return static_cast<crc32c_t>(crc ^ kCRC32Xor);
}
// Called by `absl::ExtendCrc32c()` on strings with size > 64 or when hardware
// CRC32C support is missing.
crc32c_t ExtendCrc32cInternal(crc32c_t initial_crc,
absl::string_view buf_to_add) {
uint32_t crc = static_cast<uint32_t>(initial_crc) ^ kCRC32Xor;
CrcEngine()->Extend(&crc, buf_to_add.data(), buf_to_add.size());
return static_cast<crc32c_t>(crc ^ kCRC32Xor);
}
} // namespace crc_internal
crc32c_t ComputeCrc32c(absl::string_view buf) {
return ExtendCrc32c(ToCrc32c(0), buf);
}
crc32c_t ExtendCrc32cByZeroes(crc32c_t initial_crc, size_t length) {
uint32_t crc = static_cast<uint32_t>(initial_crc) ^ kCRC32Xor;
CrcEngine()->ExtendByZeroes(&crc, length);
return static_cast<crc32c_t>(crc ^ kCRC32Xor);
}
crc32c_t ConcatCrc32c(crc32c_t lhs_crc, crc32c_t rhs_crc, size_t rhs_len) {
uint32_t result = static_cast<uint32_t>(lhs_crc);
CrcEngine()->ExtendByZeroes(&result, rhs_len);
return static_cast<crc32c_t>(result) ^ rhs_crc;
}
crc32c_t RemoveCrc32cPrefix(crc32c_t crc_a, crc32c_t crc_ab, size_t length_b) {
return ConcatCrc32c(crc_a, crc_ab, length_b);
}
crc32c_t MemcpyCrc32c(void* dest, const void* src, size_t count,
crc32c_t initial_crc) {
return static_cast<crc32c_t>(
crc_internal::Crc32CAndCopy(dest, src, count, initial_crc, false));
}
// Removes a suffix of given size from a buffer.
//
// Given the CRC32C of an existing buffer, `full_string_crc`; the CRC32C of a
// suffix of that buffer to remove, `suffix_crc`; and the length of that
// suffix, `suffix_len`; returns the CRC32C of the buffer with the suffix
// removed.
//
// This operation has a runtime cost of O(log(`suffix_len`)).
crc32c_t RemoveCrc32cSuffix(crc32c_t full_string_crc, crc32c_t suffix_crc,
size_t suffix_len) {
crc32c_t crc_with_suffix_zeroed =
suffix_crc ^ full_string_crc ^
ExtendCrc32cByZeroes(ToCrc32c(0), suffix_len);
return crc_internal::UnextendCrc32cByZeroes(
crc_with_suffix_zeroed, suffix_len);
}
ABSL_NAMESPACE_END
} // namespace absl

@@ -0,0 +1,176 @@
// Copyright 2022 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// -----------------------------------------------------------------------------
// File: crc32c.h
// -----------------------------------------------------------------------------
//
// This header file defines the API for computing CRC32C values as checksums
// for arbitrary sequences of bytes provided as a string buffer.
//
// The API includes the basic functions for computing such CRC32C values and
// some utility functions for performing more efficient mathematical
// computations using an existing checksum.
#ifndef ABSL_CRC_CRC32C_H_
#define ABSL_CRC_CRC32C_H_
#include <cstddef>
#include <cstdint>
#include <ostream>
#include "absl/crc/internal/crc32c_inline.h"
#include "absl/strings/string_view.h"
namespace absl {
ABSL_NAMESPACE_BEGIN
//-----------------------------------------------------------------------------
// crc32c_t
//-----------------------------------------------------------------------------
// `crc32c_t` defines a strongly typed integer type for holding a CRC32C value.
enum class crc32c_t : uint32_t {};
// ToCrc32c()
//
// Converts a uint32_t value to crc32c_t. This API is necessary in C++14
// and earlier. Code targeting C++17 or later can instead use `crc32c_t{n}`.
inline crc32c_t ToCrc32c(uint32_t n) {
return static_cast<crc32c_t>(n);
}
// operator^
//
// Performs a bitwise XOR on two CRC32C values
inline crc32c_t operator^(crc32c_t lhs, crc32c_t rhs) {
const auto lhs_int = static_cast<uint32_t>(lhs);
const auto rhs_int = static_cast<uint32_t>(rhs);
return ToCrc32c(lhs_int ^ rhs_int);
}
namespace crc_internal {
// Non-inline code path for `absl::ExtendCrc32c()`. Do not call directly.
// Call `absl::ExtendCrc32c()` (defined below) instead.
crc32c_t ExtendCrc32cInternal(crc32c_t initial_crc,
absl::string_view buf_to_add);
} // namespace crc_internal
// -----------------------------------------------------------------------------
// CRC32C Computation Functions
// -----------------------------------------------------------------------------
// ComputeCrc32c()
//
// Returns the CRC32C value of the provided string.
crc32c_t ComputeCrc32c(absl::string_view buf);
// ExtendCrc32c()
//
// Computes a CRC32C value from an `initial_crc` CRC32C value including the
// `buf_to_add` bytes of an additional buffer. Using this function is more
// efficient than computing a CRC32C value for the combined buffer from
// scratch.
//
// Note: `ExtendCrc32c` with an initial_crc of 0 is equivalent to
// `ComputeCrc32c`.
//
// This operation has a runtime cost of O(`buf_to_add.size()`)
inline crc32c_t ExtendCrc32c(crc32c_t initial_crc,
absl::string_view buf_to_add) {
// Approximately 75% of calls have size <= 64.
if (buf_to_add.size() <= 64) {
uint32_t crc = static_cast<uint32_t>(initial_crc);
if (crc_internal::ExtendCrc32cInline(&crc, buf_to_add.data(),
buf_to_add.size())) {
return ToCrc32c(crc);
}
}
return crc_internal::ExtendCrc32cInternal(initial_crc, buf_to_add);
}
// ExtendCrc32cByZeroes()
//
// Computes a CRC32C value for a buffer with an `initial_crc` CRC32C value,
// where `length` bytes with a value of 0 are appended to the buffer. Using this
// function is more efficient than computing a CRC32C value for the combined
// buffer from scratch.
//
// This operation has a runtime cost of O(log(`length`))
crc32c_t ExtendCrc32cByZeroes(crc32c_t initial_crc, size_t length);
// MemcpyCrc32c()
//
// Copies `src` to `dest` using `memcpy()` semantics, returning the CRC32C
// value of the copied buffer.
//
// Using `MemcpyCrc32c()` is potentially faster than performing the `memcpy()`
// and `ComputeCrc32c()` operations separately.
crc32c_t MemcpyCrc32c(void* dest, const void* src, size_t count,
crc32c_t initial_crc = ToCrc32c(0));
// -----------------------------------------------------------------------------
// CRC32C Arithmetic Functions
// -----------------------------------------------------------------------------
// The following functions perform arithmetic on CRC32C values, which are
// generally more efficient than recalculating any given result's CRC32C value.
// ConcatCrc32c()
//
// Calculates the CRC32C value of two buffers with known CRC32C values
// concatenated together.
//
// Given a buffer with CRC32C value `crc1`, and a buffer with CRC32C value
// `crc2` and length `crc2_length`, returns the CRC32C value of the
// concatenation of these two buffers.
//
// This operation has a runtime cost of O(log(`crc2_length`)).
crc32c_t ConcatCrc32c(crc32c_t crc1, crc32c_t crc2, size_t crc2_length);
// RemoveCrc32cPrefix()
//
// Calculates the CRC32C value of an existing buffer with a series of bytes
// (the prefix) removed from the beginning of that buffer.
//
// Given the CRC32C value of an existing buffer, `full_string_crc`; the CRC32C
// value of a prefix of that buffer, `prefix_crc`; and the length of the buffer
// with the prefix removed, `remaining_string_length`; returns the CRC32C
// value of the buffer with the prefix removed.
//
// This operation has a runtime cost of O(log(`remaining_string_length`)).
crc32c_t RemoveCrc32cPrefix(crc32c_t prefix_crc, crc32c_t full_string_crc,
size_t remaining_string_length);
// RemoveCrc32cSuffix()
//
// Calculates the CRC32C value of an existing buffer with a series of bytes
// (the suffix) removed from the end of that buffer.
//
// Given a CRC32C value of an existing buffer `full_string_crc`, the CRC32C
// value of the suffix to remove `suffix_crc`, and the length of that suffix
// `suffix_len`, returns the CRC32C value of the buffer with suffix removed.
//
// This operation has a runtime cost of O(log(`suffix_len`))
crc32c_t RemoveCrc32cSuffix(crc32c_t full_string_crc, crc32c_t suffix_crc,
size_t suffix_len);
// operator<<
//
// Streams the CRC32C value `crc` to the stream `os`.
inline std::ostream& operator<<(std::ostream& os, crc32c_t crc) {
return os << static_cast<uint32_t>(crc);
}
ABSL_NAMESPACE_END
} // namespace absl
#endif // ABSL_CRC_CRC32C_H_
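
A minimal usage sketch of the API declared above (illustrative, not part of
the commit; the checksum value comes from the tests later in this change):

#include <iostream>

#include "absl/crc/crc32c.h"

int main() {
  // One-shot computation.
  absl::crc32c_t whole = absl::ComputeCrc32c("hello world");

  // Incremental computation: extending from the CRC of the empty string
  // (zero) is equivalent to ComputeCrc32c().
  absl::crc32c_t inc = absl::ExtendCrc32c(absl::ToCrc32c(0), "hello ");
  inc = absl::ExtendCrc32c(inc, "world");

  // Combining independently computed CRCs without re-reading the bytes.
  absl::crc32c_t concat = absl::ConcatCrc32c(absl::ComputeCrc32c("hello "),
                                             absl::ComputeCrc32c("world"),
                                             /*crc2_length=*/5);

  // All three hold the same value (0xc99465aa, per crc32c_test.cc).
  std::cout << whole << " " << inc << " " << concat << "\n";
  return 0;
}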

@@ -0,0 +1,162 @@
// Copyright 2022 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include "absl/crc/crc32c.h"
#include "absl/crc/internal/crc32c.h"
#include "absl/memory/memory.h"
#include "benchmark/benchmark.h"
namespace {
std::string TestString(size_t len) {
std::string result;
result.reserve(len);
for (size_t i = 0; i < len; ++i) {
result.push_back(static_cast<char>(i % 256));
}
return result;
}
void BM_Calculate(benchmark::State& state) {
int len = state.range(0);
std::string data = TestString(len);
for (auto s : state) {
benchmark::DoNotOptimize(data);
absl::crc32c_t crc = absl::ComputeCrc32c(data);
benchmark::DoNotOptimize(crc);
}
}
BENCHMARK(BM_Calculate)->Arg(0)->Arg(1)->Arg(100)->Arg(10000)->Arg(500000);
void BM_Extend(benchmark::State& state) {
int len = state.range(0);
std::string extension = TestString(len);
absl::crc32c_t base = absl::ToCrc32c(0xC99465AA); // CRC32C of "hello world"
for (auto s : state) {
benchmark::DoNotOptimize(base);
benchmark::DoNotOptimize(extension);
absl::crc32c_t crc = absl::ExtendCrc32c(base, extension);
benchmark::DoNotOptimize(crc);
}
}
BENCHMARK(BM_Extend)->Arg(0)->Arg(1)->Arg(100)->Arg(10000)->Arg(500000);
void BM_ExtendByZeroes(benchmark::State& state) {
absl::crc32c_t base = absl::ToCrc32c(0xC99465AA); // CRC32C of "hello world"
int num_zeroes = state.range(0);
for (auto s : state) {
benchmark::DoNotOptimize(base);
absl::crc32c_t crc = absl::ExtendCrc32cByZeroes(base, num_zeroes);
benchmark::DoNotOptimize(crc);
}
}
BENCHMARK(BM_ExtendByZeroes)
->RangeMultiplier(10)
->Range(1, 1000000)
->RangeMultiplier(32)
->Range(1, 1 << 20);
void BM_UnextendByZeroes(benchmark::State& state) {
absl::crc32c_t base = absl::ToCrc32c(0xdeadbeef);
int num_zeroes = state.range(0);
for (auto s : state) {
benchmark::DoNotOptimize(base);
absl::crc32c_t crc =
absl::crc_internal::UnextendCrc32cByZeroes(base, num_zeroes);
benchmark::DoNotOptimize(crc);
}
}
BENCHMARK(BM_UnextendByZeroes)
->RangeMultiplier(10)
->Range(1, 1000000)
->RangeMultiplier(32)
->Range(1, 1 << 20);
void BM_Concat(benchmark::State& state) {
int string_b_len = state.range(0);
std::string string_b = TestString(string_b_len);
// CRC32C of "hello world"
absl::crc32c_t crc_a = absl::ToCrc32c(0xC99465AA);
absl::crc32c_t crc_b = absl::ComputeCrc32c(string_b);
for (auto s : state) {
benchmark::DoNotOptimize(crc_a);
benchmark::DoNotOptimize(crc_b);
benchmark::DoNotOptimize(string_b_len);
absl::crc32c_t crc_ab = absl::ConcatCrc32c(crc_a, crc_b, string_b_len);
benchmark::DoNotOptimize(crc_ab);
}
}
BENCHMARK(BM_Concat)
->RangeMultiplier(10)
->Range(1, 1000000)
->RangeMultiplier(32)
->Range(1, 1 << 20);
void BM_Memcpy(benchmark::State& state) {
int string_len = state.range(0);
std::string source = TestString(string_len);
auto dest = absl::make_unique<char[]>(string_len);
for (auto s : state) {
benchmark::DoNotOptimize(source);
absl::crc32c_t crc =
absl::MemcpyCrc32c(dest.get(), source.data(), source.size());
benchmark::DoNotOptimize(crc);
benchmark::DoNotOptimize(dest);
benchmark::DoNotOptimize(dest.get());
benchmark::DoNotOptimize(dest[0]);
}
state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) *
state.range(0));
}
BENCHMARK(BM_Memcpy)->Arg(0)->Arg(1)->Arg(100)->Arg(10000)->Arg(500000);
void BM_RemoveSuffix(benchmark::State& state) {
int full_string_len = state.range(0);
int suffix_len = state.range(1);
std::string full_string = TestString(full_string_len);
std::string suffix = full_string.substr(full_string_len - suffix_len);
absl::crc32c_t full_string_crc = absl::ComputeCrc32c(full_string);
absl::crc32c_t suffix_crc = absl::ComputeCrc32c(suffix);
for (auto s : state) {
benchmark::DoNotOptimize(full_string_crc);
benchmark::DoNotOptimize(suffix_crc);
benchmark::DoNotOptimize(suffix_len);
absl::crc32c_t crc = absl::RemoveCrc32cSuffix(full_string_crc, suffix_crc,
suffix_len);
benchmark::DoNotOptimize(crc);
}
}
BENCHMARK(BM_RemoveSuffix)
->ArgPair(1, 1)
->ArgPair(100, 10)
->ArgPair(100, 100)
->ArgPair(10000, 1)
->ArgPair(10000, 100)
->ArgPair(10000, 10000)
->ArgPair(500000, 1)
->ArgPair(500000, 100)
->ArgPair(500000, 10000)
->ArgPair(500000, 500000);
} // namespace

@@ -0,0 +1,186 @@
// Copyright 2022 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "absl/crc/crc32c.h"
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <string>
#include "gtest/gtest.h"
#include "absl/crc/internal/crc32c.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
namespace {
TEST(CRC32C, RFC3720) {
// Test the results of the vectors from
// https://www.rfc-editor.org/rfc/rfc3720#appendix-B.4
char data[32];
// 32 bytes of zeroes.
memset(data, 0, sizeof(data));
EXPECT_EQ(absl::ComputeCrc32c(absl::string_view(data, sizeof(data))),
absl::ToCrc32c(0x8a9136aa));
// 32 bytes of ones.
memset(data, 0xff, sizeof(data));
EXPECT_EQ(absl::ComputeCrc32c(absl::string_view(data, sizeof(data))),
absl::ToCrc32c(0x62a8ab43));
// 32 incrementing bytes.
for (int i = 0; i < 32; ++i) data[i] = static_cast<char>(i);
EXPECT_EQ(absl::ComputeCrc32c(absl::string_view(data, sizeof(data))),
absl::ToCrc32c(0x46dd794e));
// 32 decrementing bytes.
for (int i = 0; i < 32; ++i) data[i] = static_cast<char>(31 - i);
EXPECT_EQ(absl::ComputeCrc32c(absl::string_view(data, sizeof(data))),
absl::ToCrc32c(0x113fdb5c));
// An iSCSI - SCSI Read (10) Command PDU.
constexpr uint8_t cmd[48] = {
0x01, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00,
0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x18, 0x28, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
};
EXPECT_EQ(absl::ComputeCrc32c(absl::string_view(
reinterpret_cast<const char*>(cmd), sizeof(cmd))),
absl::ToCrc32c(0xd9963a56));
}
std::string TestString(size_t len) {
std::string result;
result.reserve(len);
for (size_t i = 0; i < len; ++i) {
result.push_back(static_cast<char>(i % 256));
}
return result;
}
TEST(CRC32C, Compute) {
EXPECT_EQ(absl::ComputeCrc32c(""), absl::ToCrc32c(0));
EXPECT_EQ(absl::ComputeCrc32c("hello world"), absl::ToCrc32c(0xc99465aa));
}
TEST(CRC32C, Extend) {
uint32_t base = 0xC99465AA; // CRC32C of "hello world"
std::string extension = "Extension String";
EXPECT_EQ(
absl::ExtendCrc32c(absl::ToCrc32c(base), extension),
absl::ToCrc32c(0xD2F65090)); // CRC32C of "hello worldExtension String"
}
TEST(CRC32C, ExtendByZeroes) {
std::string base = "hello world";
absl::crc32c_t base_crc = absl::ToCrc32c(0xc99465aa);
for (const size_t extend_by : {100, 10000, 100000}) {
SCOPED_TRACE(extend_by);
absl::crc32c_t crc2 = absl::ExtendCrc32cByZeroes(base_crc, extend_by);
EXPECT_EQ(crc2, absl::ComputeCrc32c(base + std::string(extend_by, '\0')));
}
}
TEST(CRC32C, UnextendByZeroes) {
for (auto seed_crc : {absl::ToCrc32c(0), absl::ToCrc32c(0xc99465aa)}) {
SCOPED_TRACE(seed_crc);
for (const size_t size_1 : {2, 200, 20000, 200000, 20000000}) {
for (const size_t size_2 : {0, 100, 10000, 100000, 10000000}) {
size_t extend_size = std::max(size_1, size_2);
size_t unextend_size = std::min(size_1, size_2);
SCOPED_TRACE(extend_size);
SCOPED_TRACE(unextend_size);
// Extending by A zeroes and unextending by B < A zeroes should be
// identical to extending by A - B zeroes.
absl::crc32c_t crc1 = seed_crc;
crc1 = absl::ExtendCrc32cByZeroes(crc1, extend_size);
crc1 = absl::crc_internal::UnextendCrc32cByZeroes(crc1, unextend_size);
absl::crc32c_t crc2 = seed_crc;
crc2 = absl::ExtendCrc32cByZeroes(crc2, extend_size - unextend_size);
EXPECT_EQ(crc1, crc2);
}
}
}
for (const size_t size : {0, 1, 100, 10000}) {
SCOPED_TRACE(size);
std::string string_before = TestString(size);
std::string string_after = string_before + std::string(size, '\0');
absl::crc32c_t crc_before = absl::ComputeCrc32c(string_before);
absl::crc32c_t crc_after = absl::ComputeCrc32c(string_after);
EXPECT_EQ(crc_before,
absl::crc_internal::UnextendCrc32cByZeroes(crc_after, size));
}
}
TEST(CRC32C, Concat) {
std::string hello = "Hello, ";
std::string world = "world!";
std::string hello_world = absl::StrCat(hello, world);
absl::crc32c_t crc_a = absl::ComputeCrc32c(hello);
absl::crc32c_t crc_b = absl::ComputeCrc32c(world);
absl::crc32c_t crc_ab = absl::ComputeCrc32c(hello_world);
EXPECT_EQ(absl::ConcatCrc32c(crc_a, crc_b, world.size()), crc_ab);
}
TEST(CRC32C, Memcpy) {
for (size_t bytes : {0, 1, 20, 500, 100000}) {
SCOPED_TRACE(bytes);
std::string sample_string = TestString(bytes);
std::string target_buffer = std::string(bytes, '\0');
absl::crc32c_t memcpy_crc =
absl::MemcpyCrc32c(&(target_buffer[0]), sample_string.data(), bytes);
absl::crc32c_t compute_crc = absl::ComputeCrc32c(sample_string);
EXPECT_EQ(memcpy_crc, compute_crc);
EXPECT_EQ(sample_string, target_buffer);
}
}
TEST(CRC32C, RemovePrefix) {
std::string hello = "Hello, ";
std::string world = "world!";
std::string hello_world = absl::StrCat(hello, world);
absl::crc32c_t crc_a = absl::ComputeCrc32c(hello);
absl::crc32c_t crc_b = absl::ComputeCrc32c(world);
absl::crc32c_t crc_ab = absl::ComputeCrc32c(hello_world);
EXPECT_EQ(absl::RemoveCrc32cPrefix(crc_a, crc_ab, world.size()), crc_b);
}
TEST(CRC32C, RemoveSuffix) {
std::string hello = "Hello, ";
std::string world = "world!";
std::string hello_world = absl::StrCat(hello, world);
absl::crc32c_t crc_a = absl::ComputeCrc32c(hello);
absl::crc32c_t crc_b = absl::ComputeCrc32c(world);
absl::crc32c_t crc_ab = absl::ComputeCrc32c(hello_world);
EXPECT_EQ(absl::RemoveCrc32cSuffix(crc_ab, crc_b, world.size()), crc_a);
}
} // namespace

@@ -0,0 +1,247 @@
// Copyright 2022 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "absl/crc/internal/cpu_detect.h"
#include <cstdint>
#include <string>
#include "absl/base/config.h"
#if defined(__aarch64__) && defined(__linux__)
#include <asm/hwcap.h>
#include <sys/auxv.h>
#endif
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {
#if defined(__x86_64__)
// Inline cpuid instruction. %rbx is occasionally used to address stack
// variables in presence of dynamic allocas. Preserve the %rbx register via
// %rdi to work around a clang bug https://bugs.llvm.org/show_bug.cgi?id=17907
// (%rbx in an output constraint is not considered a clobbered register).
//
// a_inp and c_inp are the input parameters eax and ecx of the CPUID
// instruction.
// a, b, c, and d contain the contents of eax, ebx, ecx, and edx as returned by
// the CPUID instruction
#define ABSL_INTERNAL_GETCPUID(a, b, c, d, a_inp, c_inp) \
asm("mov %%rbx, %%rdi\n" \
"cpuid\n" \
"xchg %%rdi, %%rbx\n" \
: "=a"(a), "=D"(b), "=c"(c), "=d"(d) \
: "a"(a_inp), "2"(c_inp))
namespace {
enum class Vendor {
kUnknown,
kIntel,
kAmd,
};
Vendor GetVendor() {
uint32_t eax, ebx, ecx, edx;
// Get vendor string (issue CPUID with eax = 0)
ABSL_INTERNAL_GETCPUID(eax, ebx, ecx, edx, 0, 0);
std::string vendor;
vendor.append(reinterpret_cast<char*>(&ebx), 4);
vendor.append(reinterpret_cast<char*>(&edx), 4);
vendor.append(reinterpret_cast<char*>(&ecx), 4);
if (vendor == "GenuineIntel") {
return Vendor::kIntel;
} else if (vendor == "AuthenticAMD") {
return Vendor::kAmd;
} else {
return Vendor::kUnknown;
}
}
CpuType GetIntelCpuType() {
uint32_t eax, ebx, ecx, edx;
// To get general information and extended features, we send eax = 1 and
// ecx = 0 to cpuid. The response is returned in eax, ebx, ecx and edx.
// (See Intel 64 and IA-32 Architectures Software Developer's Manual
// Volume 2A: Instruction Set Reference, A-M CPUID).
// https://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-vol-2a-manual.html
ABSL_INTERNAL_GETCPUID(eax, ebx, ecx, edx, 1, 0);
// Response in eax bits as follows:
// 0-3 (stepping id)
// 4-7 (model number),
// 8-11 (family code),
// 12-13 (processor type),
// 16-19 (extended model)
// 20-27 (extended family)
int family = (eax >> 8) & 0x0f;
int model_num = (eax >> 4) & 0x0f;
int ext_family = (eax >> 20) & 0xff;
int ext_model_num = (eax >> 16) & 0x0f;
int brand_id = ebx & 0xff;
// Process the extended family and model info if necessary
if (family == 0x0f) {
family += ext_family;
}
if (family == 0x0f || family == 0x6) {
model_num += (ext_model_num << 4);
}
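// Worked example (values are illustrative, not from this commit): a Skylake
// client part reports eax = 0x000506e3 in leaf 1, giving
//   family        = (eax >> 8) & 0x0f  = 0x6
//   model_num     = (eax >> 4) & 0x0f  = 0xe
//   ext_model_num = (eax >> 16) & 0x0f = 0x5
// Since family == 0x6, model_num becomes 0xe + (0x5 << 4) = 0x5e, which the
// switch below maps to CpuType::kIntelSkylake.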
switch (brand_id) {
case 0: // no brand ID, so parse CPU family/model
switch (family) {
case 6: // Most PentiumIII processors are in this category
switch (model_num) {
case 0x2c: // Westmere: Gulftown
return CpuType::kIntelWestmere;
case 0x2d: // Sandybridge
return CpuType::kIntelSandybridge;
case 0x3e: // Ivybridge
return CpuType::kIntelIvybridge;
case 0x3c: // Haswell (client)
case 0x3f: // Haswell
return CpuType::kIntelHaswell;
case 0x4f: // Broadwell
case 0x56: // BroadwellDE
return CpuType::kIntelBroadwell;
case 0x55: // Skylake Xeon
if ((eax & 0x0f) < 5) { // stepping < 5 is skylake
return CpuType::kIntelSkylakeXeon;
} else { // stepping >= 5 is cascadelake
return CpuType::kIntelCascadelakeXeon;
}
case 0x5e: // Skylake (client)
return CpuType::kIntelSkylake;
default:
return CpuType::kUnknown;
}
default:
return CpuType::kUnknown;
}
default:
return CpuType::kUnknown;
}
}
CpuType GetAmdCpuType() {
uint32_t eax, ebx, ecx, edx;
// To get general information and extended features, we send eax = 1 and
// ecx = 0 to cpuid. The response is returned in eax, ebx, ecx and edx.
// (See Intel 64 and IA-32 Architectures Software Developer's Manual
// Volume 2A: Instruction Set Reference, A-M CPUID).
ABSL_INTERNAL_GETCPUID(eax, ebx, ecx, edx, 1, 0);
// Response in eax bits as follows:
// 0-3 (stepping id)
// 4-7 (model number),
// 8-11 (family code),
// 12-13 (processor type),
// 16-19 (extended model)
// 20-27 (extended family)
int family = (eax >> 8) & 0x0f;
int model_num = (eax >> 4) & 0x0f;
int ext_family = (eax >> 20) & 0xff;
int ext_model_num = (eax >> 16) & 0x0f;
if (family == 0x0f) {
family += ext_family;
model_num += (ext_model_num << 4);
}
switch (family) {
case 0x17:
switch (model_num) {
case 0x0: // Stepping Ax
case 0x1: // Stepping Bx
return CpuType::kAmdNaples;
case 0x30: // Stepping Ax
case 0x31: // Stepping Bx
return CpuType::kAmdRome;
default:
return CpuType::kUnknown;
}
break;
case 0x19:
switch (model_num) {
case 0x1: // Stepping B0
return CpuType::kAmdMilan;
default:
return CpuType::kUnknown;
}
break;
default:
return CpuType::kUnknown;
}
}
} // namespace
CpuType GetCpuType() {
switch (GetVendor()) {
case Vendor::kIntel:
return GetIntelCpuType();
case Vendor::kAmd:
return GetAmdCpuType();
default:
return CpuType::kUnknown;
}
}
#elif defined(__aarch64__) && defined(__linux__)
#define ABSL_INTERNAL_AARCH64_ID_REG_READ(id, val) \
asm("mrs %0, " #id : "=r"(val))
CpuType GetCpuType() {
// MIDR_EL1 is not visible to EL0, but the access will be emulated by Linux
// if AT_HWCAP has HWCAP_CPUID set.
//
// This method will be unreliable on heterogeneous computing systems (ex:
// big.LITTLE) since the value of MIDR_EL1 will change based on the calling
// thread.
uint64_t hwcaps = getauxval(AT_HWCAP);
if (hwcaps & HWCAP_CPUID) {
uint64_t midr = 0;
ABSL_INTERNAL_AARCH64_ID_REG_READ(MIDR_EL1, midr);
uint32_t implementer = (midr >> 24) & 0xff;
uint32_t part_number = (midr >> 4) & 0xfff;
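// Implementer 0x41 is Arm Ltd; part number 0xd0c is the Neoverse N1.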
if (implementer == 0x41 && part_number == 0xd0c) {
return CpuType::kArmNeoverseN1;
}
}
return CpuType::kUnknown;
}
bool SupportsArmCRC32PMULL() {
uint64_t hwcaps = getauxval(AT_HWCAP);
return (hwcaps & HWCAP_CRC32) && (hwcaps & HWCAP_PMULL);
}
#else
CpuType GetCpuType() { return CpuType::kUnknown; }
#endif
} // namespace crc_internal
ABSL_NAMESPACE_END
} // namespace absl

@@ -0,0 +1,59 @@
// Copyright 2022 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef ABSL_CRC_INTERNAL_CPU_DETECT_H_
#define ABSL_CRC_INTERNAL_CPU_DETECT_H_
#include "absl/base/config.h"
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {
// Enumeration of architectures that we have special-case tuning parameters for.
// This set may change over time.
enum class CpuType {
kUnknown,
kIntelHaswell,
kAmdRome,
kAmdNaples,
kAmdMilan,
kIntelCascadelakeXeon,
kIntelSkylakeXeon,
kIntelBroadwell,
kIntelSkylake,
kIntelIvybridge,
kIntelSandybridge,
kIntelWestmere,
kArmNeoverseN1,
};
// Returns the type of host CPU this code is running on. Returns kUnknown if
// the host CPU is of unknown type, or if detection otherwise fails.
CpuType GetCpuType();
#if defined(__aarch64__)
// Returns whether the host CPU supports the CPU features needed for our
// accelerated implementations. All CpuTypes enumerated above, apart from
// kUnknown, support the required features. On unknown CPUs, this lets us
// check whether hardware acceleration is safe to use, though without any
// tuning.
bool SupportsArmCRC32PMULL();
#endif
} // namespace crc_internal
ABSL_NAMESPACE_END
} // namespace absl
#endif // ABSL_CRC_INTERNAL_CPU_DETECT_H_
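
For context, a hedged sketch of how a caller might gate an accelerated code
path on this detection API; the helper below is hypothetical and not part of
this commit:

#include "absl/crc/internal/cpu_detect.h"

namespace {

// Hypothetical helper (illustrative only): decide whether an accelerated
// CRC path is usable on the host CPU.
bool CanUseAcceleratedCrc() {
  using absl::crc_internal::CpuType;
  if (absl::crc_internal::GetCpuType() != CpuType::kUnknown) {
    // Every enumerated CPU type supports the required instructions.
    return true;
  }
#if defined(__aarch64__)
  // On unknown ARM CPUs, the hwcap bits still tell us whether the required
  // CRC32 and PMULL instructions are present (safe to use, but untuned).
  return absl::crc_internal::SupportsArmCRC32PMULL();
#else
  return false;
#endif
}

}  // namespace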

@@ -0,0 +1,468 @@
// Copyright 2022 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Implementation of CRCs (aka Rabin Fingerprints).
// Treats the input as a polynomial with coefficients in Z(2),
// and finds the remainder when divided by an irreducible polynomial
// of the appropriate length.
// It handles all CRC sizes from 8 to 128 bits.
// It's somewhat complicated by having separate implementations optimized for
// CRCs <= 32 bits, <= 64 bits, and <= 128 bits.
// The input string is prefixed with a "1" bit, and has "degree" "0" bits
// appended to it before the remainder is found. This ensures that
// short strings are scrambled somewhat and that strings consisting
// of all nulls have a non-zero CRC.
//
// Uses the "interleaved word-by-word" method from
// "Everything we know about CRC but afraid to forget" by Andrew Kadatch
// and Bob Jenkins,
// http://crcutil.googlecode.com/files/crc-doc.1.0.pdf
//
// The idea is to compute kStride CRCs simultaneously, allowing the
// processor to more effectively use multiple execution units. Each of
// the CRCs is calculated on one word of data followed by kStride - 1
// words of zeroes; the CRC starting points are staggered by one word.
// Assuming a stride of 4 with data words "ABCDABCDABCD", the first
// CRC is over A000A000A, the second over 0B000B000B, and so on.
// The CRC of the whole data is then calculated by properly aligning the
// CRCs by appending zeroes until the data lengths agree then XORing
// the CRCs.
#include "absl/crc/internal/crc.h"
#include <cstdint>
#include "absl/base/internal/endian.h"
#include "absl/base/internal/prefetch.h"
#include "absl/base/internal/raw_logging.h"
#include "absl/crc/internal/crc_internal.h"
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {
namespace {
// Constants
#if defined(__i386__) || defined(__x86_64__)
constexpr bool kNeedAlignedLoads = false;
#else
constexpr bool kNeedAlignedLoads = true;
#endif
// We express the number of zeroes as a number in base ZEROES_BASE. By
// pre-computing the zero extensions for all possible components of such an
// expression (numbers in a form a*ZEROES_BASE**b), we can calculate the
// resulting extension by multiplying the extensions for individual components
// using log_{ZEROES_BASE}(num_zeroes) polynomial multiplications. The tables of
// zero extensions contain (ZEROES_BASE - 1) * log_{ZEROES_BASE}(2^64) entries.
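// For example (illustrative): with ZEROES_BASE = 16, extending by 1000003
// zeroes uses the base-16 digits of 1000003 = 0xF4243, i.e. the pre-computed
// entries for 3*16^0, 4*16^1, 2*16^2, 4*16^3 and 15*16^4, for a total of
// five polynomial multiplications.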
constexpr int ZEROES_BASE_LG = 4; // log_2(ZEROES_BASE)
constexpr int ZEROES_BASE = (1 << ZEROES_BASE_LG); // must be a power of 2
constexpr uint32_t kCrc32cPoly = 0x82f63b78;
uint32_t ReverseBits(uint32_t bits) {
bits = (bits & 0xaaaaaaaau) >> 1 | (bits & 0x55555555u) << 1;
bits = (bits & 0xccccccccu) >> 2 | (bits & 0x33333333u) << 2;
bits = (bits & 0xf0f0f0f0u) >> 4 | (bits & 0x0f0f0f0fu) << 4;
return absl::gbswap_32(bits);
}
// Polynomial long multiplication mod the polynomial of degree 32.
void PolyMultiply(uint32_t* val, uint32_t m, uint32_t poly) {
uint32_t l = *val;
uint32_t result = 0;
auto onebit = uint32_t{0x80000000u};
for (uint32_t one = onebit; one != 0; one >>= 1) {
if ((l & one) != 0) {
result ^= m;
}
if (m & 1) {
m = (m >> 1) ^ poly;
} else {
m >>= 1;
}
}
*val = result;
}
} // namespace
void CRCImpl::FillWordTable(uint32_t poly, uint32_t last, int word_size,
Uint32By256* t) {
for (int j = 0; j != word_size; j++) { // for each byte of extension....
t[j][0] = 0; // a zero has no effect
for (int i = 128; i != 0; i >>= 1) { // fill in entries for powers of 2
if (j == 0 && i == 128) {
t[j][i] = last; // top bit in last byte is given
} else {
// each successive power of two is derived from the previous
// one, either in this table, or the last table
uint32_t pred;
if (i == 128) {
pred = t[j - 1][1];
} else {
pred = t[j][i << 1];
}
// Advance the CRC by one bit (multiply by X, and take remainder
// through one step of polynomial long division)
if (pred & 1) {
t[j][i] = (pred >> 1) ^ poly;
} else {
t[j][i] = pred >> 1;
}
}
}
// CRCs have the property that CRC(a xor b) == CRC(a) xor CRC(b)
// so we can make all the tables for non-powers of two by
// xoring previously created entries.
for (int i = 2; i != 256; i <<= 1) {
for (int k = i + 1; k != (i << 1); k++) {
t[j][k] = t[j][i] ^ t[j][k - i];
}
}
}
}
int CRCImpl::FillZeroesTable(uint32_t poly, Uint32By256* t) {
uint32_t inc = 1;
inc <<= 31;
// Extend by one zero bit. We know degree > 1 so (inc & 1) == 0.
inc >>= 1;
// Now extend by 2, 4, and 8 bits, so now `inc` is extended by one zero byte.
for (int i = 0; i < 3; ++i) {
PolyMultiply(&inc, inc, poly);
}
int j = 0;
for (uint64_t inc_len = 1; inc_len != 0; inc_len <<= ZEROES_BASE_LG) {
// Every entry in the table adds an additional inc_len zeroes.
uint32_t v = inc;
for (int a = 1; a != ZEROES_BASE; a++) {
t[0][j] = v;
PolyMultiply(&v, inc, poly);
j++;
}
inc = v;
}
ABSL_RAW_CHECK(j <= 256, "");
return j;
}
// Internal version of the "constructor".
CRCImpl* CRCImpl::NewInternal() {
// Find an accelerated implementation first.
CRCImpl* result = TryNewCRC32AcceleratedX86ARMCombined();
// Fall back to generic implementations if no acceleration is available.
if (result == nullptr) {
result = new CRC32();
}
result->InitTables();
return result;
}
// The CRC of the empty string is always the CRC polynomial itself.
void CRCImpl::Empty(uint32_t* crc) const { *crc = kCrc32cPoly; }
// The 32-bit implementation
void CRC32::InitTables() {
// Compute the table for extending a CRC by one byte.
Uint32By256* t = new Uint32By256[4];
FillWordTable(kCrc32cPoly, kCrc32cPoly, 1, t);
for (int i = 0; i != 256; i++) {
this->table0_[i] = t[0][i];
}
// Construct a table for updating the CRC by 4 bytes of data followed by
// 12 bytes of zeroes.
//
// Note: the data word size could be larger than the CRC size; it might
// be slightly faster to use a 64-bit data word, but doing so doubles the
// table size.
uint32_t last = kCrc32cPoly;
const size_t size = 12;
for (size_t i = 0; i < size; ++i) {
last = (last >> 8) ^ this->table0_[last & 0xff];
}
FillWordTable(kCrc32cPoly, last, 4, t);
for (size_t b = 0; b < 4; ++b) {
for (int i = 0; i < 256; ++i) {
this->table_[b][i] = t[b][i];
}
}
int j = FillZeroesTable(kCrc32cPoly, t);
ABSL_RAW_CHECK(j <= static_cast<int>(ABSL_ARRAYSIZE(this->zeroes_)), "");
for (int i = 0; i < j; i++) {
this->zeroes_[i] = t[0][i];
}
delete[] t;
// Build up tables for _reversing_ the operation of doing CRC operations on
// zero bytes.
// In C++, extending `crc` by a single zero bit is done by the following:
// (A) bool low_bit_set = (crc & 1);
// crc >>= 1;
// if (low_bit_set) crc ^= kCrc32cPoly;
//
// In particular note that the high bit of `crc` after this operation will be
// set if and only if the low bit of `crc` was set before it. This means that
// no information is lost, and the operation can be reversed, as follows:
// (B) bool high_bit_set = (crc & 0x80000000u);
// if (high_bit_set) crc ^= kCrc32cPoly;
// crc <<= 1;
// if (high_bit_set) crc ^= 1;
//
// Or, equivalently:
// (C) bool high_bit_set = (crc & 0x80000000u);
// crc <<= 1;
// if (high_bit_set) crc ^= ((kCrc32cPoly << 1) ^ 1);
//
// The last observation is, if we store our checksums in variable `rcrc`,
// with order of the bits reversed, the inverse operation becomes:
// (D) bool low_bit_set = (rcrc & 1);
// rcrc >>= 1;
// if (low_bit_set) rcrc ^= ReverseBits((kCrc32cPoly << 1) ^ 1)
//
// This is the same algorithm (A) that we started with, only with a different
// polynomial bit pattern. This means that by building up our tables with
// this alternate polynomial, we can apply the CRC algorithms to a
// bit-reversed CRC checksum to perform inverse zero-extension.
const uint32_t kCrc32cUnextendPoly =
ReverseBits(static_cast<uint32_t>((kCrc32cPoly << 1) ^ 1));
FillWordTable(kCrc32cUnextendPoly, kCrc32cUnextendPoly, 1, &reverse_table0_);
j = FillZeroesTable(kCrc32cUnextendPoly, &reverse_zeroes_);
ABSL_RAW_CHECK(j <= static_cast<int>(ABSL_ARRAYSIZE(this->reverse_zeroes_)),
"");
}
void CRC32::Extend(uint32_t* crc, const void* bytes, size_t length) const {
const uint8_t* p = static_cast<const uint8_t*>(bytes);
const uint8_t* e = p + length;
uint32_t l = *crc;
auto step_one_byte = [this, &p, &l] () {
int c = (l & 0xff) ^ *p++;
l = this->table0_[c] ^ (l >> 8);
};
if (kNeedAlignedLoads) {
// Point x at the first 4-byte aligned byte in the string. This might be
// past the end of the string.
const uint8_t* x = RoundUp<4>(p);
if (x <= e) {
// Process bytes until finished or p is 4-byte aligned
while (p != x) {
step_one_byte();
}
}
}
const size_t kSwathSize = 16;
if (static_cast<size_t>(e - p) >= kSwathSize) {
// Load one swath of data into the operating buffers.
uint32_t buf0 = absl::little_endian::Load32(p) ^ l;
uint32_t buf1 = absl::little_endian::Load32(p + 4);
uint32_t buf2 = absl::little_endian::Load32(p + 8);
uint32_t buf3 = absl::little_endian::Load32(p + 12);
p += kSwathSize;
// Increment a CRC value by a "swath"; this combines the four bytes
// starting at `ptr` and twelve zero bytes, so that four CRCs can be
// built incrementally and combined at the end.
const auto step_swath = [this](uint32_t crc_in, const std::uint8_t* ptr) {
return absl::little_endian::Load32(ptr) ^
this->table_[3][crc_in & 0xff] ^
this->table_[2][(crc_in >> 8) & 0xff] ^
this->table_[1][(crc_in >> 16) & 0xff] ^
this->table_[0][crc_in >> 24];
};
// Run one CRC calculation step over all swaths in one 16-byte stride
const auto step_stride = [&]() {
buf0 = step_swath(buf0, p);
buf1 = step_swath(buf1, p + 4);
buf2 = step_swath(buf2, p + 8);
buf3 = step_swath(buf3, p + 12);
p += 16;
};
// Process kStride interleaved swaths through the data in parallel.
while ((e - p) > kPrefetchHorizon) {
base_internal::PrefetchNta(
reinterpret_cast<const void*>(p + kPrefetchHorizon));
// Process 64 bytes at a time
step_stride();
step_stride();
step_stride();
step_stride();
}
while (static_cast<size_t>(e - p) >= kSwathSize) {
step_stride();
}
// Now advance one word at a time as far as possible. This isn't worth
// doing if we have word-advance tables.
while (static_cast<size_t>(e - p) >= 4) {
buf0 = step_swath(buf0, p);
uint32_t tmp = buf0;
buf0 = buf1;
buf1 = buf2;
buf2 = buf3;
buf3 = tmp;
p += 4;
}
// Combine the results from the different swaths. This is just a CRC
// on the data values in the bufX words.
auto combine_one_word = [this](uint32_t crc_in, uint32_t w) {
w ^= crc_in;
for (size_t i = 0; i < 4; ++i) {
w = (w >> 8) ^ this->table0_[w & 0xff];
}
return w;
};
l = combine_one_word(0, buf0);
l = combine_one_word(l, buf1);
l = combine_one_word(l, buf2);
l = combine_one_word(l, buf3);
}
// Process the last few bytes
while (p != e) {
step_one_byte();
}
*crc = l;
}
void CRC32::ExtendByZeroesImpl(uint32_t* crc, size_t length,
const uint32_t zeroes_table[256],
const uint32_t poly_table[256]) const {
if (length != 0) {
uint32_t l = *crc;
// For each ZEROES_BASE_LG bits in length
// (after the low-order bits have been removed)
// we lookup the appropriate polynomial in the zeroes_ array
// and do a polynomial long multiplication (mod the CRC polynomial)
// to extend the CRC by the appropriate number of bits.
for (int i = 0; length != 0;
i += ZEROES_BASE - 1, length >>= ZEROES_BASE_LG) {
int c = length & (ZEROES_BASE - 1); // pick next ZEROES_BASE_LG bits
if (c != 0) { // if they are not zero,
// multiply by entry in table
// Build a table to aid in multiplying 2 bits at a time.
// It takes too long to build tables for more bits.
uint64_t m = zeroes_table[c + i - 1];
m <<= 1;
uint64_t m2 = m << 1;
uint64_t mtab[4] = {0, m, m2, m2 ^ m};
// Do the multiply one byte at a time.
uint64_t result = 0;
for (int x = 0; x < 32; x += 8) {
// The carry-less multiply.
result ^= mtab[l & 3] ^ (mtab[(l >> 2) & 3] << 2) ^
(mtab[(l >> 4) & 3] << 4) ^ (mtab[(l >> 6) & 3] << 6);
l >>= 8;
// Reduce modulo the polynomial
result = (result >> 8) ^ poly_table[result & 0xff];
}
l = static_cast<uint32_t>(result);
}
}
*crc = l;
}
}
void CRC32::ExtendByZeroes(uint32_t* crc, size_t length) const {
return CRC32::ExtendByZeroesImpl(crc, length, zeroes_, table0_);
}
void CRC32::UnextendByZeroes(uint32_t* crc, size_t length) const {
// See the comment in CRC32::InitTables() for an explanation of the algorithm
// below.
*crc = ReverseBits(*crc);
ExtendByZeroesImpl(crc, length, reverse_zeroes_, reverse_table0_);
*crc = ReverseBits(*crc);
}
void CRC32::Scramble(uint32_t* crc) const {
// Rotate by near half the word size plus 1. See the scramble comment in
// crc_internal.h for an explanation.
constexpr int scramble_rotate = (32 / 2) + 1;
*crc = RotateRight<uint32_t>(static_cast<unsigned int>(*crc + kScrambleLo),
32, scramble_rotate) &
MaskOfLength<uint32_t>(32);
}
void CRC32::Unscramble(uint32_t* crc) const {
constexpr int scramble_rotate = (32 / 2) + 1;
uint64_t rotated = RotateRight<uint32_t>(static_cast<unsigned int>(*crc), 32,
32 - scramble_rotate);
*crc = (rotated - kScrambleLo) & MaskOfLength<uint32_t>(32);
}
// Constructor and destructor for base class CRC.
CRC::~CRC() {}
CRC::CRC() {}
// The "constructor" for a CRC32C with a standard polynomial.
CRC* CRC::Crc32c() {
static CRC* singleton = CRCImpl::NewInternal();
return singleton;
}
// This Concat implementation works for arbitrary polynomials.
void CRC::Concat(uint32_t* px, uint32_t y, size_t ylen) {
// https://en.wikipedia.org/wiki/Mathematics_of_cyclic_redundancy_checks
// The CRC of a message M is the remainder of polynomial division modulo G,
// where the coefficient arithmetic is performed modulo 2 (so +/- are XOR):
// R(x) = M(x) x**n (mod G)
// (n is the degree of G)
// In practice, we use an initial value A and a bitmask B to get
// R = (A ^ B)x**|M| ^ Mx**n ^ B (mod G)
// If M is the concatenation of two strings S and T, and Z is the string of
// len(T) 0s, then the remainder CRC(ST) can be expressed as:
// R = (A ^ B)x**|ST| ^ STx**n ^ B
// = (A ^ B)x**|SZ| ^ SZx**n ^ B ^ Tx**n
// = CRC(SZ) ^ Tx**n
// CRC(Z) = (A ^ B)x**|T| ^ B
// CRC(T) = (A ^ B)x**|T| ^ Tx**n ^ B
// So R = CRC(SZ) ^ CRC(Z) ^ CRC(T)
//
// And further, since CRC(SZ) = Extend(CRC(S), Z),
// CRC(SZ) ^ CRC(Z) = Extend(CRC(S) ^ CRC(''), Z).
uint32_t z;
uint32_t t;
Empty(&z);
t = *px ^ z;
ExtendByZeroes(&t, ylen);
*px = t ^ y;
}
} // namespace crc_internal
ABSL_NAMESPACE_END
} // namespace absl
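
To make the Concat derivation above concrete, here is a small standalone
check (illustrative; it uses the public absl::crc32c API added by this
commit) that verifies R = CRC(SZ) ^ CRC(Z) ^ CRC(T) numerically:

#include <cassert>
#include <string>

#include "absl/crc/crc32c.h"

int main() {
  const std::string s = "hello ";
  const std::string t = "world";

  // CRC(SZ): CRC of S extended by |T| zero bytes.
  absl::crc32c_t crc_sz =
      absl::ExtendCrc32cByZeroes(absl::ComputeCrc32c(s), t.size());
  // CRC(Z): CRC of |T| zero bytes on their own.
  absl::crc32c_t crc_z =
      absl::ExtendCrc32cByZeroes(absl::ToCrc32c(0), t.size());
  // CRC(T): CRC of the second string by itself.
  absl::crc32c_t crc_t = absl::ComputeCrc32c(t);

  // R = CRC(SZ) ^ CRC(Z) ^ CRC(T) equals the CRC of the concatenation.
  assert((crc_sz ^ crc_z ^ crc_t) == absl::ComputeCrc32c(s + t));
  return 0;
}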

@@ -0,0 +1,91 @@
// Copyright 2022 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef ABSL_CRC_INTERNAL_CRC_H_
#define ABSL_CRC_INTERNAL_CRC_H_
#include <cstddef>
#include <cstdint>
#include "absl/base/config.h"
// This class implements CRCs (aka Rabin Fingerprints).
// Treats the input as a polynomial with coefficients in Z(2),
// and finds the remainder when divided by a primitive polynomial
// of the appropriate length.
// A polynomial is represented by the bit pattern formed by its coefficients,
// but with the highest order bit not stored.
// The highest degree coefficient is stored in the lowest numbered bit
// in the lowest addressed byte. Thus, in what follows, the highest degree
// coefficient that is stored is in the low order bit of "lo" or "*lo".
// Hardware acceleration is used when available.
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {
class CRC {
public:
virtual ~CRC();
// Place the CRC of the empty string in "*crc"
virtual void Empty(uint32_t* crc) const = 0;
// If "*crc" is the CRC of bytestring A, place the CRC of
// the bytestring formed from the concatenation of A and the "length"
// bytes at "bytes" into "*crc".
virtual void Extend(uint32_t* crc, const void* bytes,
size_t length) const = 0;
// Equivalent to Extend(crc, bytes, length) where "bytes"
// points to an array of "length" zero bytes.
virtual void ExtendByZeroes(uint32_t* crc, size_t length) const = 0;
// Inverse operation of ExtendByZeroes. If `crc` is the CRC value of a string
// ending in `length` zero bytes, this returns a CRC value of that string
// with those zero bytes removed.
virtual void UnextendByZeroes(uint32_t* crc, size_t length) const = 0;
// If *px is the CRC (as defined by *crc) of some string X,
// and y is the CRC of some string Y that is ylen bytes long, set
// *px to the CRC of the concatenation of X followed by Y.
virtual void Concat(uint32_t* px, uint32_t y, size_t ylen);
// Apply a non-linear transformation to "*crc" so that
// it is safe to CRC the result with the same polynomial without
// any reduction of error-detection ability in the outer CRC.
// Unscramble() performs the inverse transformation.
// It is strongly recommended that CRCs be scrambled before storage or
// transmission, and unscrambled at the other end before further manipulation.
virtual void Scramble(uint32_t* crc) const = 0;
virtual void Unscramble(uint32_t* crc) const = 0;
// Crc32c() returns the singleton implementation of CRC for the CRC32C
// polynomial. Returns a handle that MUST NOT be destroyed with delete.
static CRC* Crc32c();
protected:
CRC(); // Clients may not call constructor; use Crc32c() instead.
private:
CRC(const CRC&) = delete;
CRC& operator=(const CRC&) = delete;
};
} // namespace crc_internal
ABSL_NAMESPACE_END
} // namespace absl
#endif // ABSL_CRC_INTERNAL_CRC_H_

@@ -0,0 +1,260 @@
// Copyright 2022 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_
#define ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_
#include <cstdint>
#include "absl/base/config.h"
// -------------------------------------------------------------------------
// Many x86 and ARM machines have CRC acceleration hardware.
// We can do a faster version of Extend() on such machines.
// We define a translation layer for both x86 and ARM for ease of use and
// best performance.
// We need CRC (part of SSE4.2) and PCLMULQDQ instructions.
#if defined(__SSE4_2__) && defined(__PCLMUL__)
#include <x86intrin.h>
#define ABSL_CRC_INTERNAL_HAVE_X86_SIMD
#elif defined(__aarch64__) && defined(__LITTLE_ENDIAN__) && \
defined(__ARM_FEATURE_CRC32) && defined(__ARM_NEON)
#include <arm_acle.h>
#include <arm_neon.h>
#define ABSL_CRC_INTERNAL_HAVE_ARM_SIMD
#endif
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {
#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) || \
defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
using V128 = uint64x2_t;
#else
using V128 = __m128i;
#endif
// Starting with the initial value in |crc|, accumulates a CRC32 value for
// unsigned integers of different sizes.
uint32_t CRC32_u8(uint32_t crc, uint8_t v);
uint32_t CRC32_u16(uint32_t crc, uint16_t v);
uint32_t CRC32_u32(uint32_t crc, uint32_t v);
uint32_t CRC32_u64(uint32_t crc, uint64_t v);
// Loads 128 bits of integer data. |src| must be 16-byte aligned.
V128 V128_Load(const V128* src);
// Loads 128 bits of integer data. |src| does not need to be aligned.
V128 V128_LoadU(const V128* src);
// Polynomially multiplies the high 64 bits of |l| and |r|.
V128 V128_PMulHi(const V128 l, const V128 r);
// Polynomially multiplies the low 64 bits of |l| and |r|.
V128 V128_PMulLow(const V128 l, const V128 r);
// Polynomially multiplies the low 64 bits of |r| and high 64 bits of |l|.
V128 V128_PMul01(const V128 l, const V128 r);
// Polynomially multiplies the low 64 bits of |l| and high 64 bits of |r|.
V128 V128_PMul10(const V128 l, const V128 r);
// Produces an XOR operation of |l| and |r|.
V128 V128_Xor(const V128 l, const V128 r);
// Produces an AND operation of |l| and |r|.
V128 V128_And(const V128 l, const V128 r);
// Combines two 64-bit integers into one 128-bit vector. Note the reversed
// order:
// dst[63:0] := |r|
// dst[127:64] := |l|
V128 V128_From2x64(const uint64_t l, const uint64_t r);
// Shift |l| right by |imm| bytes while shifting in zeros.
template <int imm>
V128 V128_ShiftRight(const V128 l);
// Extracts a 32-bit integer from |l|, selected with |imm|.
template <int imm>
int V128_Extract32(const V128 l);
// Extracts the low 64 bits from V128.
int64_t V128_Low64(const V128 l);
// Left-shifts packed 64-bit integers in l by r.
V128 V128_ShiftLeft64(const V128 l, const V128 r);
#endif
#if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
inline uint32_t CRC32_u8(uint32_t crc, uint8_t v) {
return _mm_crc32_u8(crc, v);
}
inline uint32_t CRC32_u16(uint32_t crc, uint16_t v) {
return _mm_crc32_u16(crc, v);
}
inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) {
return _mm_crc32_u32(crc, v);
}
inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
return _mm_crc32_u64(crc, v);
}
inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }
inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }
inline V128 V128_PMulHi(const V128 l, const V128 r) {
return _mm_clmulepi64_si128(l, r, 0x11);
}
inline V128 V128_PMulLow(const V128 l, const V128 r) {
return _mm_clmulepi64_si128(l, r, 0x00);
}
inline V128 V128_PMul01(const V128 l, const V128 r) {
return _mm_clmulepi64_si128(l, r, 0x01);
}
inline V128 V128_PMul10(const V128 l, const V128 r) {
return _mm_clmulepi64_si128(l, r, 0x10);
}
inline V128 V128_Xor(const V128 l, const V128 r) { return _mm_xor_si128(l, r); }
inline V128 V128_And(const V128 l, const V128 r) { return _mm_and_si128(l, r); }
inline V128 V128_From2x64(const uint64_t l, const uint64_t r) {
return _mm_set_epi64x(l, r);
}
template <int imm>
inline V128 V128_ShiftRight(const V128 l) {
return _mm_srli_si128(l, imm);
}
template <int imm>
inline int V128_Extract32(const V128 l) {
return _mm_extract_epi32(l, imm);
}
inline int64_t V128_Low64(const V128 l) { return _mm_cvtsi128_si64(l); }
inline V128 V128_ShiftLeft64(const V128 l, const V128 r) {
return _mm_sll_epi64(l, r);
}
#elif defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
inline uint32_t CRC32_u8(uint32_t crc, uint8_t v) { return __crc32cb(crc, v); }
inline uint32_t CRC32_u16(uint32_t crc, uint16_t v) {
return __crc32ch(crc, v);
}
inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) {
return __crc32cw(crc, v);
}
inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
return __crc32cd(crc, v);
}
inline V128 V128_Load(const V128* src) {
return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
}
inline V128 V128_LoadU(const V128* src) {
return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
}
// We use inline assembly because clang does not generate the pmull2
// instruction without it, and performance drops by 15-20%.
// TODO(b/193678732): Investigate why the compiler decides not to generate
// such instructions and why it becomes so much worse.
inline V128 V128_PMulHi(const V128 l, const V128 r) {
uint64x2_t res;
__asm__ __volatile__("pmull2 %0.1q, %1.2d, %2.2d \n\t"
: "=w"(res)
: "w"(l), "w"(r));
return res;
}
inline V128 V128_PMulLow(const V128 l, const V128 r) {
return reinterpret_cast<V128>(vmull_p64(
reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(l))),
reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(r)))));
}
inline V128 V128_PMul01(const V128 l, const V128 r) {
return reinterpret_cast<V128>(vmull_p64(
reinterpret_cast<poly64_t>(vget_high_p64(vreinterpretq_p64_u64(l))),
reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(r)))));
}
inline V128 V128_PMul10(const V128 l, const V128 r) {
return reinterpret_cast<V128>(vmull_p64(
reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(l))),
reinterpret_cast<poly64_t>(vget_high_p64(vreinterpretq_p64_u64(r)))));
}
inline V128 V128_Xor(const V128 l, const V128 r) { return veorq_u64(l, r); }
inline V128 V128_And(const V128 l, const V128 r) { return vandq_u64(l, r); }
inline V128 V128_From2x64(const uint64_t l, const uint64_t r) {
return vcombine_u64(vcreate_u64(r), vcreate_u64(l));
}
template <int imm>
inline V128 V128_ShiftRight(const V128 l) {
return vreinterpretq_u64_s8(
vextq_s8(vreinterpretq_s8_u64(l), vdupq_n_s8(0), imm));
}
template <int imm>
inline int V128_Extract32(const V128 l) {
return vgetq_lane_s32(vreinterpretq_s32_u64(l), imm);
}
inline int64_t V128_Low64(const V128 l) {
return vgetq_lane_s64(vreinterpretq_s64_u64(l), 0);
}
inline V128 V128_ShiftLeft64(const V128 l, const V128 r) {
return vshlq_u64(l, r);
}
#endif
} // namespace crc_internal
ABSL_NAMESPACE_END
} // namespace absl
#endif // ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_

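A small sketch of how the translation layer above is driven (illustrative; `Crc32cOfU64Sketch` is hypothetical and only compiles on targets where one of the `ABSL_CRC_INTERNAL_HAVE_*_SIMD` macros is defined):

#include <cstdint>
#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"

#if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD) || \
    defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
// CRC32C of a single 64-bit word, with the standard pre/post-inversion.
uint32_t Crc32cOfU64Sketch(uint64_t v) {
  uint32_t crc = 0xffffffffU;
  crc = absl::crc_internal::CRC32_u64(crc, v);
  return crc ^ 0xffffffffU;
}
#endif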
@ -0,0 +1,39 @@
// Copyright 2022 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef ABSL_CRC_INTERNAL_CRC32C_H_
#define ABSL_CRC_INTERNAL_CRC32C_H_
#include "absl/base/config.h"
#include "absl/crc/crc32c.h"
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {
// Modifies a CRC32 value by removing `length` bytes with a value of 0 from
// the end of the string.
//
// This is the inverse operation of ExtendCrc32cByZeroes().
//
// This operation has a runtime cost of O(log(`length`))
//
// Internal implementation detail, exposed for testing only.
crc32c_t UnextendCrc32cByZeroes(crc32c_t initial_crc, size_t length);
} // namespace crc_internal
ABSL_NAMESPACE_END
} // namespace absl
#endif // ABSL_CRC_INTERNAL_CRC32C_H_

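A round-trip sketch of what this helper inverts (illustrative; it assumes the public ComputeCrc32c() and ExtendCrc32cByZeroes() from absl/crc/crc32c.h):

#include <cassert>
#include "absl/crc/crc32c.h"
#include "absl/crc/internal/crc32c.h"

void UnextendRoundTripSketch() {
  absl::crc32c_t crc = absl::ComputeCrc32c("hello");
  // Appending 100 zero bytes and then removing them restores the CRC.
  absl::crc32c_t padded = absl::ExtendCrc32cByZeroes(crc, 100);
  assert(absl::crc_internal::UnextendCrc32cByZeroes(padded, 100) == crc);
}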
@ -0,0 +1,72 @@
// Copyright 2022 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef ABSL_CRC_INTERNAL_CRC32C_INLINE_H_
#define ABSL_CRC_INTERNAL_CRC32C_INLINE_H_
#include <cstddef>
#include <cstdint>
#include "absl/base/config.h"
#include "absl/base/internal/endian.h"
#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {
// CRC32C implementation optimized for small inputs.
// Either computes the CRC and returns true, or, if there is no hardware
// support, does nothing and returns false.
inline bool ExtendCrc32cInline(uint32_t* crc, const char* p, size_t n) {
#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) || \
defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
constexpr uint32_t kCrc32Xor = 0xffffffffU;
*crc ^= kCrc32Xor;
if (n & 1) {
*crc = CRC32_u8(*crc, *p);
n--;
p++;
}
if (n & 2) {
*crc = CRC32_u16(*crc, absl::little_endian::Load16(p));
n -= 2;
p += 2;
}
if (n & 4) {
*crc = CRC32_u32(*crc, absl::little_endian::Load32(p));
n -= 4;
p += 4;
}
while (n) {
*crc = CRC32_u64(*crc, absl::little_endian::Load64(p));
n -= 8;
p += 8;
}
*crc ^= kCrc32Xor;
return true;
#else
// No hardware support, signal the need to fallback.
static_cast<void>(crc);
static_cast<void>(p);
static_cast<void>(n);
return false;
#endif // defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) ||
// defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
}
} // namespace crc_internal
ABSL_NAMESPACE_END
} // namespace absl
#endif // ABSL_CRC_INTERNAL_CRC32C_INLINE_H_

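The intended calling pattern, as a sketch (`TableExtend` is a hypothetical table-driven fallback, not a real function in this library):

#include <cstddef>
#include <cstdint>
#include "absl/crc/internal/crc32c_inline.h"

uint32_t TableExtend(uint32_t crc, const char* p, size_t n);  // hypothetical

uint32_t ExtendSmallSketch(uint32_t crc, const char* p, size_t n) {
  // Fast path: hardware CRC when available; otherwise fall back to tables.
  if (!absl::crc_internal::ExtendCrc32cInline(&crc, p, n)) {
    crc = TableExtend(crc, p, n);
  }
  return crc;
}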
@ -0,0 +1,177 @@
// Copyright 2022 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef ABSL_CRC_INTERNAL_CRC_INTERNAL_H_
#define ABSL_CRC_INTERNAL_CRC_INTERNAL_H_
#include <cstdint>
#include <memory>
#include <vector>
#include "absl/base/internal/raw_logging.h"
#include "absl/crc/internal/crc.h"
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {
// Prefetch constants used in some Extend() implementations
constexpr int kPrefetchHorizon = ABSL_CACHELINE_SIZE * 4; // Prefetch this far
static_assert(kPrefetchHorizon >= 64, "CRCPrefetchHorizon less than loop len");
// We require the Scramble() function:
// - to be reversible (Unscramble() must exist)
// - to be non-linear in the polynomial's Galois field (so the CRC of a
// scrambled CRC is not linearly affected by the scrambled CRC, even if
// using the same polynomial)
// - not to be its own inverse. Preferably, if X=Scramble^N(X) and N!=0, then
// N is large.
// - to be fast.
// - not to change once defined.
// We introduce non-linearity in two ways:
// Addition of a constant.
// - The carries introduce non-linearity; we use bits of an irrational
// (phi) to make it unlikely that we introduce no carries.
// Rotate by a constant number of bits.
// - We use floor(degree/2)+1, which does not divide the degree, and
// splits the bits nearly evenly, which makes it less likely the
// halves will be the same or one will be all zeroes.
// We do both things to improve the chances of non-linearity in the face of
// bit patterns with low numbers of bits set, while still being fast.
// Below is the constant that we add. The bits are the first 128 bits of the
// fractional part of phi, with a 1 ored into the bottom bit to maximize the
// cycle length of repeated adds.
constexpr uint64_t kScrambleHi = (static_cast<uint64_t>(0x4f1bbcdcU) << 32) |
static_cast<uint64_t>(0xbfa53e0aU);
constexpr uint64_t kScrambleLo = (static_cast<uint64_t>(0xf9ce6030U) << 32) |
static_cast<uint64_t>(0x2e76e41bU);
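// For illustration only, the recipe above applied to a plain 32-bit value
// would look like the sketch below (the shipped Scramble() lives in crc.cc
// and uses the 64-bit constants above):
//
//   uint32_t ScrambleSketch(uint32_t crc) {
//     constexpr int kRot = 32 / 2 + 1;     // 17, which does not divide 32
//     crc += 0x2e76e41b;                   // low word of kScrambleLo
//     return (crc >> kRot) | (crc << (32 - kRot));  // rotate right by 17
//   }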
class CRCImpl : public CRC {  // Implementation of the abstract class CRC
public:
using Uint32By256 = uint32_t[256];
CRCImpl() {}
~CRCImpl() override = default;
// The internal version of CRC::New().
static CRCImpl* NewInternal();
void Empty(uint32_t* crc) const override;
// Fill in a table for updating a CRC by one word of 'word_size' bytes
// [last_lo, last_hi] contains the answer if the last bit in the word
// is set.
static void FillWordTable(uint32_t poly, uint32_t last, int word_size,
Uint32By256* t);
// Build the table for extending by zeroes, returning the number of entries.
// For a in {1, 2, ..., ZEROES_BASE-1}, b in {0, 1, 2, 3, ...},
// entry j=a-1+(ZEROES_BASE-1)*b
// contains a polynomial Pi such that multiplying
// a CRC by Pi mod P, where P is the CRC polynomial, is equivalent to
// appending a*2**(ZEROES_BASE_LG*b) zero bytes to the original string.
static int FillZeroesTable(uint32_t poly, Uint32By256* t);
virtual void InitTables() = 0;
private:
CRCImpl(const CRCImpl&) = delete;
CRCImpl& operator=(const CRCImpl&) = delete;
};
// This is the 32-bit implementation. It handles all sizes from 8 to 32.
class CRC32 : public CRCImpl {
public:
CRC32() {}
~CRC32() override {}
void Extend(uint32_t* crc, const void* bytes, size_t length) const override;
void ExtendByZeroes(uint32_t* crc, size_t length) const override;
void Scramble(uint32_t* crc) const override;
void Unscramble(uint32_t* crc) const override;
void UnextendByZeroes(uint32_t* crc, size_t length) const override;
void InitTables() override;
private:
// Common implementation guts for ExtendByZeroes and UnextendByZeroes().
//
// zeroes_table is a table as returned by FillZeroesTable(), containing
// polynomials representing CRCs of strings-of-zeros of various lengths,
// and which can be combined by polynomial multiplication. poly_table is
// a table of CRC byte extension values. These tables are determined by
// the generator polynomial.
//
// These will be set to reverse_zeroes_ and reverse_table0_ for Unextend, and
// CRC32::zeroes_ and CRC32::table0_ for Extend.
void ExtendByZeroesImpl(uint32_t* crc, size_t length,
const uint32_t zeroes_table[256],
const uint32_t poly_table[256]) const;
uint32_t table0_[256]; // table of byte extensions
uint32_t zeroes_[256]; // table of zero extensions
// table of 4-byte extensions shifted by 12 bytes of zeroes
uint32_t table_[4][256];
// Reverse lookup tables, using the alternate polynomial used by
// UnextendByZeroes().
uint32_t reverse_table0_[256]; // table of reverse byte extensions
uint32_t reverse_zeroes_[256]; // table of reverse zero extensions
CRC32(const CRC32&) = delete;
CRC32& operator=(const CRC32&) = delete;
};
// Helpers
// Return a bit mask containing len 1-bits.
// Requires 0 < len <= 8 * sizeof(T) (the bit width of T).
template <typename T>
T MaskOfLength(int len) {
// Shift 2 by len-1 rather than 1 by len because shifts by the full word
// width are undefined behavior.
return (T(2) << (len - 1)) - 1;
}
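// For example, MaskOfLength<uint32_t>(3) == 0b111.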
// Rotate low-order "width" bits of "in" right by "r" bits,
// setting other bits in word to arbitrary values.
template <typename T>
T RotateRight(T in, int width, int r) {
return (in << (width - r)) | ((in >> r) & MaskOfLength<T>(width - r));
}
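// For example, RotateRight<uint32_t>(0b011, /*width=*/3, /*r=*/1) yields
// 0b101 in its low 3 bits.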
// RoundUp<N>(p) returns the lowest address >= p aligned to an N-byte
// boundary. Requires that N is a power of 2.
template <int alignment>
const uint8_t* RoundUp(const uint8_t* p) {
static_assert((alignment & (alignment - 1)) == 0, "alignment is not 2^n");
constexpr uintptr_t mask = alignment - 1;
const uintptr_t as_uintptr = reinterpret_cast<uintptr_t>(p);
return reinterpret_cast<const uint8_t*>((as_uintptr + mask) & ~mask);
}
// Return a newly created CRC32AcceleratedX86ARMCombined if we can use Intel's
// or ARM's CRC acceleration for a given polynomial. Return nullptr otherwise.
CRCImpl* TryNewCRC32AcceleratedX86ARMCombined();
// Return all possible hardware accelerated implementations. For testing only.
std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll();
} // namespace crc_internal
ABSL_NAMESPACE_END
} // namespace absl
#endif // ABSL_CRC_INTERNAL_CRC_INTERNAL_H_

@ -0,0 +1,112 @@
// Copyright 2022 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef ABSL_CRC_INTERNAL_CRC_MEMCPY_H_
#define ABSL_CRC_INTERNAL_CRC_MEMCPY_H_
#include <cstddef>
#include <memory>
#include "absl/base/config.h"
#include "absl/crc/crc32c.h"
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {
class CrcMemcpyEngine {
public:
virtual ~CrcMemcpyEngine() = default;
virtual crc32c_t Compute(void* __restrict dst, const void* __restrict src,
std::size_t length, crc32c_t initial_crc) const = 0;
protected:
CrcMemcpyEngine() = default;
};
class CrcMemcpy {
public:
static crc32c_t CrcAndCopy(void* __restrict dst, const void* __restrict src,
std::size_t length,
crc32c_t initial_crc = ToCrc32c(0),
bool non_temporal = false) {
static const ArchSpecificEngines engines = GetArchSpecificEngines();
auto* engine = non_temporal ? engines.non_temporal : engines.temporal;
return engine->Compute(dst, src, length, initial_crc);
}
// For testing only: get an architecture-specific engine for tests.
static std::unique_ptr<CrcMemcpyEngine> GetTestEngine(int vector,
int integer);
private:
struct ArchSpecificEngines {
CrcMemcpyEngine* temporal;
CrcMemcpyEngine* non_temporal;
};
static ArchSpecificEngines GetArchSpecificEngines();
};
// Fallback CRC-memcpy engine.
class FallbackCrcMemcpyEngine : public CrcMemcpyEngine {
public:
FallbackCrcMemcpyEngine() = default;
FallbackCrcMemcpyEngine(const FallbackCrcMemcpyEngine&) = delete;
FallbackCrcMemcpyEngine operator=(const FallbackCrcMemcpyEngine&) = delete;
crc32c_t Compute(void* __restrict dst, const void* __restrict src,
std::size_t length, crc32c_t initial_crc) const override;
};
// CRC Non-Temporal-Memcpy engine.
class CrcNonTemporalMemcpyEngine : public CrcMemcpyEngine {
public:
CrcNonTemporalMemcpyEngine() = default;
CrcNonTemporalMemcpyEngine(const CrcNonTemporalMemcpyEngine&) = delete;
CrcNonTemporalMemcpyEngine operator=(const CrcNonTemporalMemcpyEngine&) =
delete;
crc32c_t Compute(void* __restrict dst, const void* __restrict src,
std::size_t length, crc32c_t initial_crc) const override;
};
// CRC Non-Temporal-Memcpy AVX engine.
class CrcNonTemporalMemcpyAVXEngine : public CrcMemcpyEngine {
public:
CrcNonTemporalMemcpyAVXEngine() = default;
CrcNonTemporalMemcpyAVXEngine(const CrcNonTemporalMemcpyAVXEngine&) = delete;
CrcNonTemporalMemcpyAVXEngine operator=(
const CrcNonTemporalMemcpyAVXEngine&) = delete;
crc32c_t Compute(void* __restrict dst, const void* __restrict src,
std::size_t length, crc32c_t initial_crc) const override;
};
// Copy source to destination and return the CRC32C of the data copied. If an
// accelerated version is available, use the accelerated version, otherwise use
// the generic fallback version.
inline crc32c_t Crc32CAndCopy(void* __restrict dst, const void* __restrict src,
std::size_t length,
crc32c_t initial_crc = ToCrc32c(0),
bool non_temporal = false) {
return CrcMemcpy::CrcAndCopy(dst, src, length, initial_crc, non_temporal);
}
} // namespace crc_internal
ABSL_NAMESPACE_END
} // namespace absl
#endif // ABSL_CRC_INTERNAL_CRC_MEMCPY_H_

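A sketch of the one-pass copy-and-checksum call (illustrative; `CopyWithCrcSketch` and its buffers are placeholders):

#include <cstddef>
#include "absl/crc/crc32c.h"
#include "absl/crc/internal/crc_memcpy.h"

absl::crc32c_t CopyWithCrcSketch(char* dst, const char* src, size_t n) {
  // One pass over `src`: bytes are copied to `dst` and folded into the CRC.
  // Defaults: zero initial CRC, temporal (cache-filling) stores.
  return absl::crc_internal::CrcMemcpy::CrcAndCopy(dst, src, n);
}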
@ -0,0 +1,75 @@
// Copyright 2022 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>
#include "absl/base/config.h"
#include "absl/crc/crc32c.h"
#include "absl/crc/internal/crc_memcpy.h"
#include "absl/strings/string_view.h"
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {
absl::crc32c_t FallbackCrcMemcpyEngine::Compute(void* __restrict dst,
const void* __restrict src,
std::size_t length,
crc32c_t initial_crc) const {
constexpr size_t kBlockSize = 8192;
absl::crc32c_t crc = initial_crc;
const char* src_bytes = reinterpret_cast<const char*>(src);
char* dst_bytes = reinterpret_cast<char*>(dst);
// Copy + CRC loop - run 8k chunks until we are out of full chunks. CRC
// then copy was found to be slightly more efficient in our test cases.
std::size_t offset = 0;
for (; offset + kBlockSize < length; offset += kBlockSize) {
crc = absl::ExtendCrc32c(crc,
absl::string_view(src_bytes + offset, kBlockSize));
memcpy(dst_bytes + offset, src_bytes + offset, kBlockSize);
}
// Save some work if length is 0.
if (offset < length) {
std::size_t final_copy_size = length - offset;
crc = absl::ExtendCrc32c(
crc, absl::string_view(src_bytes + offset, final_copy_size));
memcpy(dst_bytes + offset, src_bytes + offset, final_copy_size);
}
return crc;
}
// Compile the following only if we don't have hardware-accelerated
// implementations available (i.e. no SSE4.2 support).
#ifndef __SSE4_2__
CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() {
CrcMemcpy::ArchSpecificEngines engines;
engines.temporal = new FallbackCrcMemcpyEngine();
engines.non_temporal = new FallbackCrcMemcpyEngine();
return engines;
}
std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int /*vector*/,
int /*integer*/) {
return std::make_unique<FallbackCrcMemcpyEngine>();
}
#endif
} // namespace crc_internal
ABSL_NAMESPACE_END
} // namespace absl

@ -0,0 +1,169 @@
// Copyright 2022 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "absl/crc/internal/crc_memcpy.h"
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <string>
#include <utility>
#include "gtest/gtest.h"
#include "absl/crc/crc32c.h"
#include "absl/memory/memory.h"
#include "absl/random/distributions.h"
#include "absl/random/random.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
namespace {
enum CrcEngine {
X86 = 0,
NONTEMPORAL = 1,
FALLBACK = 2,
};
// Correctness tests:
// - Every source/destination byte alignment 0-15, every size 0-511 bytes
// - Arbitrarily aligned source, large size
template <size_t max_size>
class CrcMemcpyTest : public testing::Test {
protected:
CrcMemcpyTest() {
source_ = std::make_unique<char[]>(kSize);
destination_ = std::make_unique<char[]>(kSize);
}
static constexpr size_t kAlignment = 16;
static constexpr size_t kMaxCopySize = max_size;
static constexpr size_t kSize = kAlignment + kMaxCopySize;
std::unique_ptr<char[]> source_;
std::unique_ptr<char[]> destination_;
absl::BitGen gen_;
};
// The small test is slightly larger than 4096 bytes, to allow coverage of the
// "large" copy function. The minimum size to exercise all code paths in that
// function would be around 256 consecutive tests (getting every possible tail
// value and 0-2 small copy loops after the main block), so testing from
// 4096-4500 will cover all of those code paths multiple times.
typedef CrcMemcpyTest<4500> CrcSmallTest;
typedef CrcMemcpyTest<(1 << 24)> CrcLargeTest;
// Parametrize the small test so that it can be done with all configurations.
template <typename ParamsT>
class x86ParamTestTemplate : public CrcSmallTest,
public ::testing::WithParamInterface<ParamsT> {
protected:
x86ParamTestTemplate() {
if (GetParam().crc_engine_selector == FALLBACK) {
engine_ = std::make_unique<absl::crc_internal::FallbackCrcMemcpyEngine>();
} else if (GetParam().crc_engine_selector == NONTEMPORAL) {
engine_ =
std::make_unique<absl::crc_internal::CrcNonTemporalMemcpyEngine>();
} else {
engine_ = absl::crc_internal::CrcMemcpy::GetTestEngine(
GetParam().vector_lanes, GetParam().integer_lanes);
}
}
// Convenience method.
ParamsT GetParam() const {
return ::testing::WithParamInterface<ParamsT>::GetParam();
}
std::unique_ptr<absl::crc_internal::CrcMemcpyEngine> engine_;
};
struct TestParams {
CrcEngine crc_engine_selector = X86;
int vector_lanes = 0;
int integer_lanes = 0;
};
using x86ParamTest = x86ParamTestTemplate<TestParams>;
// SmallCorrectness is designed to exercise every possible set of code paths
// in the memcpy code, not including the loop.
TEST_P(x86ParamTest, SmallCorrectnessCheckSourceAlignment) {
constexpr size_t kTestSizes[] = {0, 100, 255, 512, 1024, 4000, kMaxCopySize};
for (size_t source_alignment = 0; source_alignment < kAlignment;
source_alignment++) {
for (auto size : kTestSizes) {
char* base_data = static_cast<char*>(source_.get()) + source_alignment;
for (size_t i = 0; i < size; i++) {
*(base_data + i) =
static_cast<char>(absl::Uniform<unsigned char>(gen_));
}
absl::crc32c_t initial_crc =
absl::ToCrc32c(absl::Uniform<uint32_t>(gen_));
absl::crc32c_t experiment_crc =
engine_->Compute(destination_.get(), source_.get() + source_alignment,
size, initial_crc);
// Check the memory region to make sure it is the same
int mem_comparison =
memcmp(destination_.get(), source_.get() + source_alignment, size);
SCOPED_TRACE(absl::StrCat("Error in memcpy of size: ", size,
" with source alignment: ", source_alignment));
ASSERT_EQ(mem_comparison, 0);
absl::crc32c_t baseline_crc = absl::ExtendCrc32c(
initial_crc,
absl::string_view(
static_cast<char*>(source_.get()) + source_alignment, size));
ASSERT_EQ(baseline_crc, experiment_crc);
}
}
}
TEST_P(x86ParamTest, SmallCorrectnessCheckDestAlignment) {
constexpr size_t kTestSizes[] = {0, 100, 255, 512, 1024, 4000, kMaxCopySize};
for (size_t dest_alignment = 0; dest_alignment < kAlignment;
dest_alignment++) {
for (auto size : kTestSizes) {
char* base_data = static_cast<char*>(source_.get());
for (size_t i = 0; i < size; i++) {
*(base_data + i) =
static_cast<char>(absl::Uniform<unsigned char>(gen_));
}
absl::crc32c_t initial_crc =
absl::ToCrc32c(absl::Uniform<uint32_t>(gen_));
absl::crc32c_t experiment_crc =
engine_->Compute(destination_.get() + dest_alignment, source_.get(),
size, initial_crc);
// Check the memory region to make sure it is the same
int mem_comparison =
memcmp(destination_.get() + dest_alignment, source_.get(), size);
SCOPED_TRACE(absl::StrCat("Error in memcpy of size: ", size,
" with dest alignment: ", dest_alignment));
ASSERT_EQ(mem_comparison, 0);
absl::crc32c_t baseline_crc = absl::ExtendCrc32c(
initial_crc,
absl::string_view(static_cast<char*>(source_.get()), size));
ASSERT_EQ(baseline_crc, experiment_crc);
}
}
}
INSTANTIATE_TEST_SUITE_P(x86ParamTest, x86ParamTest,
::testing::Values(
// Tests for configurations that may occur in prod.
TestParams{X86, 3, 0}, TestParams{X86, 1, 2},
// Fallback test.
TestParams{FALLBACK, 0, 0},
// Non Temporal
TestParams{NONTEMPORAL, 0, 0}));
} // namespace

@ -0,0 +1,435 @@
// Copyright 2022 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Simultaneous memcopy and CRC-32C for x86-64. Uses integer registers because
// XMM registers do not support the CRC instruction (yet). While copying,
// compute the running CRC of the data being copied.
//
// It is assumed that any CPU running this code has SSE4.2 instructions
// available (for CRC32C). This file will do nothing if that is not true.
//
// The CRC instruction has a 3-cycle latency, and we are stressing the ALU ports
// here (unlike a traditional memcopy, which has almost no ALU use), so we will
// need to copy in such a way that the CRC unit is used efficiently. We have two
// regimes in this code:
// 1. For operations of size < kCrcSmallSize, do the CRC then the memcpy
// 2. For operations of size > kCrcSmallSize:
// a) compute an initial CRC + copy on a small amount of data to align the
// destination pointer on a 16-byte boundary.
// b) Split the data into 3 main regions and a tail (smaller than 48 bytes)
// c) Do the copy and CRC of the 3 main regions, interleaving (start with
// full cache line copies for each region, then move to single 16 byte
// pieces per region).
// d) Combine the CRCs with CRC32C::Concat.
// e) Copy the tail and extend the CRC with the CRC of the tail.
// This method is not ideal for op sizes between ~1k and ~8k because CRC::Concat
// takes a significant amount of time. A medium-sized approach could be added
// using 3 CRCs over fixed-size blocks where the zero-extensions required for
// CRC32C::Concat can be precomputed.
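// Worked example of the large-copy regime (illustrative numbers): for a
// 96 KiB copy with 3 regions, step (b) gives copy_rounds = 96 KiB / 48 = 2048,
// so each region is 2048 * 16 B = 32 KiB and the tail is empty; step (c)
// interleaves copy+CRC across the three 32 KiB regions, and step (d) merges
// the three partial CRCs with two Concat operations.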
#include <cstddef>
#include <cstdint>
#include "absl/crc/crc32c.h"
#include "absl/strings/string_view.h"
#ifdef __SSE4_2__
#include <emmintrin.h>
#include <x86intrin.h>
#include <type_traits>
#include "absl/base/dynamic_annotations.h"
#include "absl/base/internal/prefetch.h"
#include "absl/base/optimization.h"
#include "absl/crc/internal/cpu_detect.h"
#include "absl/crc/internal/crc_memcpy.h"
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {
namespace {
inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
crc32c_t crc) {
// Small copy: just go 1 byte at a time: being nice to the branch predictor
// is more important here than anything else
uint32_t crc_uint32 = static_cast<uint32_t>(crc);
for (std::size_t i = 0; i < length; i++) {
uint8_t data = *reinterpret_cast<const uint8_t*>(src);
crc_uint32 = _mm_crc32_u8(crc_uint32, data);
*reinterpret_cast<uint8_t*>(dst) = data;
++src;
++dst;
}
return ToCrc32c(crc_uint32);
}
constexpr int kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t);
// Common function for copying the tails of multiple large regions.
template <int vec_regions, int int_regions>
inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
size_t region_size, size_t copy_rounds) {
__m128i data[vec_regions];
uint64_t int_data[kIntLoadsPerVec * int_regions];
while (copy_rounds > 0) {
#pragma unroll_completely
for (int i = 0; i < vec_regions; i++) {
int region = i;
auto* vsrc =
reinterpret_cast<const __m128i_u*>(*src + region_size * region);
auto* vdst = reinterpret_cast<__m128i*>(*dst + region_size * region);
// Load the blocks, unaligned
data[i] = _mm_loadu_si128(vsrc);
// Store the blocks, aligned
_mm_store_si128(vdst, data[i]);
// Compute the running CRC
crcs[region] = ToCrc32c(_mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
_mm_extract_epi64(data[i], 0)));
crcs[region] = ToCrc32c(_mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
_mm_extract_epi64(data[i], 1)));
}
#pragma unroll_completely
for (int i = 0; i < int_regions; i++) {
int region = vec_regions + i;
auto* usrc =
reinterpret_cast<const uint64_t*>(*src + region_size * region);
auto* udst = reinterpret_cast<uint64_t*>(*dst + region_size * region);
#pragma unroll_completely
for (int j = 0; j < kIntLoadsPerVec; j++) {
int data_index = i * kIntLoadsPerVec + j;
int_data[data_index] = *(usrc + j);
crcs[region] = ToCrc32c(_mm_crc32_u64(
static_cast<uint32_t>(crcs[region]), int_data[data_index]));
*(udst + j) = int_data[data_index];
}
}
// Increment pointers
*src += sizeof(__m128i);
*dst += sizeof(__m128i);
--copy_rounds;
}
}
} // namespace
template <int vec_regions, int int_regions>
class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine {
public:
AcceleratedCrcMemcpyEngine() = default;
AcceleratedCrcMemcpyEngine(const AcceleratedCrcMemcpyEngine&) = delete;
AcceleratedCrcMemcpyEngine operator=(const AcceleratedCrcMemcpyEngine&) =
delete;
crc32c_t Compute(void* __restrict dst, const void* __restrict src,
std::size_t length, crc32c_t initial_crc) const override;
};
template <int vec_regions, int int_regions>
crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
void* __restrict dst, const void* __restrict src, std::size_t length,
crc32c_t initial_crc) const {
constexpr std::size_t kRegions = vec_regions + int_regions;
constexpr crc32c_t kCrcDataXor = crc32c_t{0xffffffff};
constexpr std::size_t kBlockSize = sizeof(__m128i);
constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize;
// Number of blocks per cacheline.
constexpr std::size_t kBlocksPerCacheLine = ABSL_CACHELINE_SIZE / kBlockSize;
char* dst_bytes = static_cast<char*>(dst);
const char* src_bytes = static_cast<const char*>(src);
// Make sure that one prefetch per big block is enough to cover the whole
// dataset, and we don't prefetch too much.
static_assert(ABSL_CACHELINE_SIZE % kBlockSize == 0,
"Cache lines are not divided evenly into blocks, may have "
"unintended behavior!");
// Experimentally-determined boundary between a small and large copy.
// Below this number, spin-up and concatenation of CRCs takes enough time that
// it kills the throughput gains of using 3 regions and wide vectors.
constexpr size_t kCrcSmallSize = 256;
// Experimentally-determined prefetch distance. Main loop copies will
// prefetch data 2 cache lines ahead.
constexpr std::size_t kPrefetchAhead = 2 * ABSL_CACHELINE_SIZE;
// Small-size CRC-memcpy : just do CRC + memcpy
if (length < kCrcSmallSize) {
crc32c_t crc =
ExtendCrc32c(initial_crc, absl::string_view(src_bytes, length));
memcpy(dst, src, length);
return crc;
}
// Start work on the CRC: undo the XOR from the previous calculation or set up
// the initial value of the CRC.
initial_crc = initial_crc ^ kCrcDataXor;
// Do an initial alignment copy, so we can use aligned store instructions to
// the destination pointer. We align the destination pointer because the
// penalty for an unaligned load is small compared to the penalty of an
// unaligned store on modern CPUs.
std::size_t bytes_from_last_aligned =
reinterpret_cast<uintptr_t>(dst) & (kBlockSize - 1);
if (bytes_from_last_aligned != 0) {
std::size_t bytes_for_alignment = kBlockSize - bytes_from_last_aligned;
// Do the short-sized copy and CRC.
initial_crc =
ShortCrcCopy(dst_bytes, src_bytes, bytes_for_alignment, initial_crc);
src_bytes += bytes_for_alignment;
dst_bytes += bytes_for_alignment;
length -= bytes_for_alignment;
}
// We are going to do the copy and CRC in kRegions regions to make sure that
// we can saturate the CRC unit. The CRCs will be combined at the end of the
// run. Copying will use the SSE registers, and we will extract words from
// the SSE registers to add to the CRC. Initially, we run the loop one full
// cache line per region at a time, in order to insert prefetches.
// Initialize CRCs for kRegions regions.
crc32c_t crcs[kRegions];
crcs[0] = initial_crc;
for (int i = 1; i < kRegions; i++) {
crcs[i] = kCrcDataXor;
}
// Find the number of rounds to copy and the region size. Also compute the
// tail size here.
int64_t copy_rounds = length / kCopyRoundSize;
// Find the size of each region and the size of the tail.
const std::size_t region_size = copy_rounds * kBlockSize;
const std::size_t tail_size = length - (kRegions * region_size);
// Holding registers for data in each region.
__m128i vec_data[vec_regions];
uint64_t int_data[int_regions * kIntLoadsPerVec];
// Main loop.
while (copy_rounds > kBlocksPerCacheLine) {
// Prefetch kPrefetchAhead bytes ahead of each pointer.
#pragma unroll_completely
for (int i = 0; i < kRegions; i++) {
absl::base_internal::PrefetchT0(src_bytes + kPrefetchAhead +
region_size * i);
absl::base_internal::PrefetchT0(dst_bytes + kPrefetchAhead +
region_size * i);
}
// Load and store data, computing CRC on the way.
#pragma unroll_completely
for (int i = 0; i < kBlocksPerCacheLine; i++) {
// Copy and CRC the data for the CRC regions.
#pragma unroll_completely
for (int j = 0; j < vec_regions; j++) {
// Cycle which regions get vector load/store and integer load/store, to
// engage prefetching logic around vector load/stores and save issue
// slots by using the integer registers.
int region = (j + i) % kRegions;
auto* src = reinterpret_cast<const __m128i_u*>(src_bytes +
region_size * region);
auto* dst =
reinterpret_cast<__m128i*>(dst_bytes + region_size * region);
// Load and CRC data.
vec_data[j] = _mm_loadu_si128(src + i);
crcs[region] =
ToCrc32c(_mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
_mm_extract_epi64(vec_data[j], 0)));
crcs[region] =
ToCrc32c(_mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
_mm_extract_epi64(vec_data[j], 1)));
// Store the data.
_mm_store_si128(dst + i, vec_data[j]);
}
// Copy and CRC the data for the integer regions.
#pragma unroll_completely
for (int j = 0; j < int_regions; j++) {
// Cycle which regions get vector load/store and integer load/store, to
// engage prefetching logic around vector load/stores and save issue
// slots by using the integer registers.
int region = (j + vec_regions + i) % kRegions;
auto* usrc =
reinterpret_cast<const uint64_t*>(src_bytes + region_size * region);
auto* udst =
reinterpret_cast<uint64_t*>(dst_bytes + region_size * region);
#pragma unroll_completely
for (int k = 0; k < kIntLoadsPerVec; k++) {
int data_index = j * kIntLoadsPerVec + k;
// Load and CRC the data.
int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k);
crcs[region] = ToCrc32c(_mm_crc32_u64(
static_cast<uint32_t>(crcs[region]), int_data[data_index]));
// Store the data.
*(udst + i * kIntLoadsPerVec + k) = int_data[data_index];
}
}
}
// Increment pointers
src_bytes += kBlockSize * kBlocksPerCacheLine;
dst_bytes += kBlockSize * kBlocksPerCacheLine;
copy_rounds -= kBlocksPerCacheLine;
}
// Copy and CRC the tails of each region.
LargeTailCopy<vec_regions, int_regions>(crcs, &dst_bytes, &src_bytes,
region_size, copy_rounds);
// Move the source and destination pointers to the end of the region
src_bytes += region_size * (kRegions - 1);
dst_bytes += region_size * (kRegions - 1);
// Finalize the first CRCs: XOR the internal CRCs by the XOR mask to undo the
// XOR done before doing block copy + CRCs.
for (int i = 0; i < kRegions - 1; i++) {
crcs[i] = crcs[i] ^ kCrcDataXor;
}
// Build a CRC of the first kRegions - 1 regions.
crc32c_t full_crc = crcs[0];
for (int i = 1; i < kRegions - 1; i++) {
full_crc = ConcatCrc32c(full_crc, crcs[i], region_size);
}
// Copy and CRC the tail through the XMM registers.
std::size_t tail_blocks = tail_size / kBlockSize;
LargeTailCopy<0, 1>(&crcs[kRegions - 1], &dst_bytes, &src_bytes, 0,
tail_blocks);
// Final tail copy for under 16 bytes.
crcs[kRegions - 1] =
ShortCrcCopy(dst_bytes, src_bytes, tail_size - tail_blocks * kBlockSize,
crcs[kRegions - 1]);
// Finalize and concatenate the final CRC, then return.
crcs[kRegions - 1] = crcs[kRegions - 1] ^ kCrcDataXor;
return ConcatCrc32c(full_crc, crcs[kRegions - 1], region_size + tail_size);
}
CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() {
#ifdef UNDEFINED_BEHAVIOR_SANITIZER
// UBSAN does not play nicely with unaligned loads (which we use a lot).
// Get the underlying architecture.
CpuType cpu_type = GetCpuType();
switch (cpu_type) {
case CpuType::kUnknown:
case CpuType::kAmdRome:
case CpuType::kAmdNaples:
case CpuType::kIntelCascadelakeXeon:
case CpuType::kIntelSkylakeXeon:
case CpuType::kIntelSkylake:
case CpuType::kIntelBroadwell:
case CpuType::kIntelHaswell:
case CpuType::kIntelIvybridge:
return {
.temporal = new FallbackCrcMemcpyEngine(),
.non_temporal = new CrcNonTemporalMemcpyAVXEngine(),
};
// INTEL_SANDYBRIDGE performs better with SSE than AVX.
case CpuType::kIntelSandybridge:
return {
.temporal = new FallbackCrcMemcpyEngine(),
.non_temporal = new CrcNonTemporalMemcpyEngine(),
};
default:
return {.temporal = new FallbackCrcMemcpyEngine(),
.non_temporal = new FallbackCrcMemcpyEngine()};
}
#else
// Get the underlying architecture.
CpuType cpu_type = GetCpuType();
switch (cpu_type) {
// On Zen 2, PEXTRQ uses 2 micro-ops, including one on the vector store port,
// which causes data movement from the vector registers to the integer
// registers (where CRC32C happens) to crowd the same units as vector stores.
// As a result, using that path exclusively causes bottlenecking on this port.
// We can avoid this bottleneck by using the integer side of the CPU for
// most operations rather than the vector side. We keep a vector region to
// engage some of the prefetching logic in the cache hierarchy which seems
// to give vector instructions special treatment. These prefetch units see
// strided access to each region, and do the right thing.
case CpuType::kAmdRome:
case CpuType::kAmdNaples:
return {
.temporal = new AcceleratedCrcMemcpyEngine<1, 2>(),
.non_temporal = new CrcNonTemporalMemcpyAVXEngine(),
};
// PCLMULQDQ is slow and we don't have wide enough issue width to take
// advantage of it. For an unknown architecture, don't risk using CLMULs.
case CpuType::kIntelCascadelakeXeon:
case CpuType::kIntelSkylakeXeon:
case CpuType::kIntelSkylake:
case CpuType::kIntelBroadwell:
case CpuType::kIntelHaswell:
case CpuType::kIntelIvybridge:
return {
.temporal = new AcceleratedCrcMemcpyEngine<3, 0>(),
.non_temporal = new CrcNonTemporalMemcpyAVXEngine(),
};
// INTEL_SANDYBRIDGE performs better with SSE than AVX.
case CpuType::kIntelSandybridge:
return {
.temporal = new AcceleratedCrcMemcpyEngine<3, 0>(),
.non_temporal = new CrcNonTemporalMemcpyEngine(),
};
default:
return {.temporal = new FallbackCrcMemcpyEngine(),
.non_temporal = new FallbackCrcMemcpyEngine()};
}
#endif // UNDEFINED_BEHAVIOR_SANITIZER
}
// For testing, allow the user to specify which engine they want.
std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector,
int integer) {
if (vector == 3 && integer == 0) {
return std::make_unique<AcceleratedCrcMemcpyEngine<3, 0>>();
} else if (vector == 1 && integer == 2) {
return std::make_unique<AcceleratedCrcMemcpyEngine<1, 2>>();
}
return nullptr;
}
} // namespace crc_internal
ABSL_NAMESPACE_END
} // namespace absl
#endif // __SSE4_2__

@ -0,0 +1,93 @@
// Copyright 2022 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cstddef>
#include <cstdint>
#include "absl/base/config.h"
#include "absl/crc/crc32c.h"
#include "absl/crc/internal/crc_memcpy.h"
#include "absl/crc/internal/non_temporal_memcpy.h"
#include "absl/strings/string_view.h"
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {
crc32c_t CrcNonTemporalMemcpyEngine::Compute(void* __restrict dst,
const void* __restrict src,
std::size_t length,
crc32c_t initial_crc) const {
constexpr size_t kBlockSize = 8192;
crc32c_t crc = initial_crc;
const char* src_bytes = reinterpret_cast<const char*>(src);
char* dst_bytes = reinterpret_cast<char*>(dst);
// Copy + CRC loop - run 8k chunks until we are out of full chunks.
std::size_t offset = 0;
for (; offset + kBlockSize < length; offset += kBlockSize) {
crc = absl::ExtendCrc32c(crc,
absl::string_view(src_bytes + offset, kBlockSize));
non_temporal_store_memcpy(dst_bytes + offset, src_bytes + offset,
kBlockSize);
}
// Save some work if length is 0.
if (offset < length) {
std::size_t final_copy_size = length - offset;
crc = ExtendCrc32c(crc,
absl::string_view(src_bytes + offset, final_copy_size));
non_temporal_store_memcpy(dst_bytes + offset, src_bytes + offset,
final_copy_size);
}
return crc;
}
crc32c_t CrcNonTemporalMemcpyAVXEngine::Compute(void* __restrict dst,
const void* __restrict src,
std::size_t length,
crc32c_t initial_crc) const {
constexpr size_t kBlockSize = 8192;
crc32c_t crc = initial_crc;
const char* src_bytes = reinterpret_cast<const char*>(src);
char* dst_bytes = reinterpret_cast<char*>(dst);
// Copy + CRC loop - run 8k chunks until we are out of full chunks.
std::size_t offset = 0;
for (; offset + kBlockSize < length; offset += kBlockSize) {
crc = ExtendCrc32c(crc, absl::string_view(src_bytes + offset, kBlockSize));
non_temporal_store_memcpy_avx(dst_bytes + offset, src_bytes + offset,
kBlockSize);
}
// Save some work if length is 0.
if (offset < length) {
std::size_t final_copy_size = length - offset;
crc = ExtendCrc32c(crc,
absl::string_view(src_bytes + offset, final_copy_size));
non_temporal_store_memcpy_avx(dst_bytes + offset, src_bytes + offset,
final_copy_size);
}
return crc;
}
} // namespace crc_internal
ABSL_NAMESPACE_END
} // namespace absl

@ -0,0 +1,691 @@
// Copyright 2022 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Hardware accelerated CRC32 computation on Intel and ARM architectures.
#include <stddef.h>
#include <cstdint>
#include "absl/base/attributes.h"
#include "absl/base/call_once.h"
#include "absl/base/dynamic_annotations.h"
#include "absl/base/internal/endian.h"
#include "absl/base/internal/prefetch.h"
#include "absl/crc/internal/cpu_detect.h"
#include "absl/crc/internal/crc.h"
#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
#include "absl/crc/internal/crc_internal.h"
#include "absl/memory/memory.h"
#include "absl/numeric/bits.h"
#if defined(__aarch64__) && defined(__LITTLE_ENDIAN__) && \
defined(__ARM_FEATURE_CRC32) && defined(__ARM_NEON)
#define ABSL_INTERNAL_CAN_USE_SIMD_CRC32C
#elif defined(__SSE4_2__) && defined(__PCLMUL__)
#define ABSL_INTERNAL_CAN_USE_SIMD_CRC32C
#endif
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {
#if defined(ABSL_INTERNAL_CAN_USE_SIMD_CRC32C)
// Implementation details not exported outside of file
namespace {
// Some machines have CRC acceleration hardware.
// We can do a faster version of Extend() on such machines.
class CRC32AcceleratedX86ARMCombined : public CRC32 {
public:
CRC32AcceleratedX86ARMCombined() {}
~CRC32AcceleratedX86ARMCombined() override {}
void ExtendByZeroes(uint32_t* crc, size_t length) const override;
uint32_t ComputeZeroConstant(size_t length) const;
private:
CRC32AcceleratedX86ARMCombined(const CRC32AcceleratedX86ARMCombined&) =
delete;
CRC32AcceleratedX86ARMCombined& operator=(
const CRC32AcceleratedX86ARMCombined&) = delete;
};
// Constants for switching between algorithms.
// Chosen by comparing speed at different powers of 2.
constexpr int kSmallCutoff = 256;
constexpr int kMediumCutoff = 2048;
#define ABSL_INTERNAL_STEP1(crc) \
do { \
crc = CRC32_u8(crc, *p++); \
} while (0)
#define ABSL_INTERNAL_STEP2(crc) \
do { \
crc = CRC32_u16(crc, absl::little_endian::Load16(p)); \
p += 2; \
} while (0)
#define ABSL_INTERNAL_STEP4(crc) \
do { \
crc = CRC32_u32(crc, absl::little_endian::Load32(p)); \
p += 4; \
} while (0)
#define ABSL_INTERNAL_STEP8(crc, data) \
do { \
crc = CRC32_u64(crc, absl::little_endian::Load64(data)); \
data += 8; \
} while (0)
#define ABSL_INTERNAL_STEP8BY2(crc0, crc1, p0, p1) \
do { \
ABSL_INTERNAL_STEP8(crc0, p0); \
ABSL_INTERNAL_STEP8(crc1, p1); \
} while (0)
#define ABSL_INTERNAL_STEP8BY3(crc0, crc1, crc2, p0, p1, p2) \
do { \
ABSL_INTERNAL_STEP8(crc0, p0); \
ABSL_INTERNAL_STEP8(crc1, p1); \
ABSL_INTERNAL_STEP8(crc2, p2); \
} while (0)
// Multiplies two polynomials in the bit-reflected CRC representation, modulo
// the CRC32C polynomial. Used to combine independently computed CRC values.
uint32_t multiply(uint32_t a, uint32_t b) {
  V128 shifts = V128_From2x64(0, 1);
  V128 power = V128_From2x64(0, a);
  V128 crc = V128_From2x64(0, b);
  // Carry-less multiply; the 63-bit product lands in the low 64 bits.
  V128 res = V128_PMulLow(power, crc);
  // Shift left by one to compensate for the bit-reflected representation,
  // then reduce: the low 32 bits are advanced by 32 zero bits via the CRC
  // instruction and folded into bits [63:32].
  res = V128_ShiftLeft64(res, shifts);
  return V128_Extract32<1>(res) ^ CRC32_u32(0, V128_Low64(res));
}
namespace {
// Powers of crc32c polynomial, for faster ExtendByZeros.
// Verified against folly:
// folly/hash/detail/Crc32CombineDetail.cpp
constexpr uint32_t kCRC32CPowers[] = {
0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955, 0xb8fdb1e7,
0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62, 0x28461564,
0xbf455269, 0xe2ea32dc, 0xfe7740e6, 0xf946610b, 0x3c204f8f, 0x538586e3,
0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc, 0xd289cabe, 0xe94ca9bc,
0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000, 0x00800000,
0x00008000, 0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955,
0xb8fdb1e7, 0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62,
0x28461564, 0xbf455269, 0xe2ea32dc, 0xfe7740e6, 0xf946610b, 0x3c204f8f,
0x538586e3, 0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc, 0xd289cabe,
0xe94ca9bc, 0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000,
0x00800000, 0x00008000,
};
} // namespace
// Compute a magic constant, so that multiplying by it is the same as
// extending crc by length zeros.
uint32_t CRC32AcceleratedX86ARMCombined::ComputeZeroConstant(
size_t length) const {
// Lowest 2 bits are handled separately in ExtendByZeroes
length >>= 2;
int index = absl::countr_zero(length);
uint32_t prev = kCRC32CPowers[index];
length &= length - 1;
while (length) {
// For each bit of length, extend by 2**n zeros.
index = absl::countr_zero(length);
prev = multiply(prev, kCRC32CPowers[index]);
length &= length - 1;
}
return prev;
}
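// Example: length == 40 zero bytes is 10 four-byte words (binary 1010), so
// the loop above computes multiply(kCRC32CPowers[1], kCRC32CPowers[3]): one
// factor per set bit, each extending by 2**n words of zeroes.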
void CRC32AcceleratedX86ARMCombined::ExtendByZeroes(uint32_t* crc,
size_t length) const {
uint32_t val = *crc;
// Don't bother with multiplication for small length.
switch (length & 3) {
case 0:
break;
case 1:
val = CRC32_u8(val, 0);
break;
case 2:
val = CRC32_u16(val, 0);
break;
case 3:
val = CRC32_u8(val, 0);
val = CRC32_u16(val, 0);
break;
}
if (length > 3) {
val = multiply(val, ComputeZeroConstant(length));
}
*crc = val;
}
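// Example: for length == 7, the switch above feeds one zero byte and one zero
// uint16 through the CRC instruction (3 bytes), and multiply() with
// ComputeZeroConstant(7) extends by the remaining 4 bytes (one word).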
// Taken from Intel paper "Fast CRC Computation for iSCSI Polynomial Using CRC32
// Instruction"
// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
// We only need every 4th value, because we unroll the loop by 4.
constexpr uint64_t kClmulConstants[] = {
0x09e4addf8, 0x0ba4fc28e, 0x00d3b6092, 0x09e4addf8, 0x0ab7aff2a,
0x102f9b8a2, 0x0b9e02b86, 0x00d3b6092, 0x1bf2e8b8a, 0x18266e456,
0x0d270f1a2, 0x0ab7aff2a, 0x11eef4f8e, 0x083348832, 0x0dd7e3b0c,
0x0b9e02b86, 0x0271d9844, 0x1b331e26a, 0x06b749fb2, 0x1bf2e8b8a,
0x0e6fc4e6a, 0x0ce7f39f4, 0x0d7a4825c, 0x0d270f1a2, 0x026f6a60a,
0x12ed0daac, 0x068bce87a, 0x11eef4f8e, 0x1329d9f7e, 0x0b3e32c28,
0x0170076fa, 0x0dd7e3b0c, 0x1fae1cc66, 0x010746f3c, 0x086d8e4d2,
0x0271d9844, 0x0b3af077a, 0x093a5f730, 0x1d88abd4a, 0x06b749fb2,
0x0c9c8b782, 0x0cec3662e, 0x1ddffc5d4, 0x0e6fc4e6a, 0x168763fa6,
0x0b0cd4768, 0x19b1afbc4, 0x0d7a4825c, 0x123888b7a, 0x00167d312,
0x133d7a042, 0x026f6a60a, 0x000bcf5f6, 0x19d34af3a, 0x1af900c24,
0x068bce87a, 0x06d390dec, 0x16cba8aca, 0x1f16a3418, 0x1329d9f7e,
0x19fb2a8b0, 0x02178513a, 0x1a0f717c4, 0x0170076fa,
};
enum class CutoffStrategy {
// Use 3 CRC streams to fold into 1.
Fold3,
// Unroll CRC instructions for 64 bytes.
Unroll64CRC,
};
template <int num_crc_streams, int num_pclmul_streams, CutoffStrategy strategy>
class CRC32AcceleratedX86ARMCombinedMultipleStreams
: public CRC32AcceleratedX86ARMCombined {
ABSL_ATTRIBUTE_HOT
void Extend(uint32_t* crc, const void* bytes, size_t length) const override {
static_assert(num_crc_streams >= 1 && num_crc_streams <= kMaxStreams,
"Invalid number of crc streams");
static_assert(num_pclmul_streams >= 0 && num_pclmul_streams <= kMaxStreams,
"Invalid number of pclmul streams");
const uint8_t* p = static_cast<const uint8_t*>(bytes);
const uint8_t* e = p + length;
uint32_t l = *crc;
uint64_t l64;
// We have dedicated instructions for 1, 2, 4, and 8 bytes.
if (length & 8) {
ABSL_INTERNAL_STEP8(l, p);
length &= ~8LL;
}
if (length & 4) {
ABSL_INTERNAL_STEP4(l);
length &= ~4LL;
}
if (length & 2) {
ABSL_INTERNAL_STEP2(l);
length &= ~2LL;
}
if (length & 1) {
ABSL_INTERNAL_STEP1(l);
length &= ~1LL;
}
if (length == 0) {
*crc = l;
return;
}
// length is now a multiple of 16.
// For small blocks just run a simple loop, because the cost of combining
// multiple streams is significant.
if (strategy != CutoffStrategy::Unroll64CRC) {
if (length < kSmallCutoff) {
while (length >= 16) {
ABSL_INTERNAL_STEP8(l, p);
ABSL_INTERNAL_STEP8(l, p);
length -= 16;
}
*crc = l;
return;
}
}
// For medium blocks we run 3 crc streams and combine them as described in
// the Intel paper above. Running a 4th stream doesn't help, because the crc
// instruction has latency 3 and throughput 1.
if (length < kMediumCutoff) {
l64 = l;
if (strategy == CutoffStrategy::Fold3) {
uint64_t l641 = 0;
uint64_t l642 = 0;
const int blockSize = 32;
int64_t bs = (e - p) / kGroupsSmall / blockSize;
const uint8_t* p1 = p + bs * blockSize;
const uint8_t* p2 = p1 + bs * blockSize;
for (int64_t i = 0; i < bs - 1; ++i) {
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
}
// Don't run crc on last 8 bytes.
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
ABSL_INTERNAL_STEP8BY2(l64, l641, p, p1);
V128 magic = *(reinterpret_cast<const V128*>(kClmulConstants) + bs - 1);
V128 tmp = V128_From2x64(0, l64);
V128 res1 = V128_PMulLow(tmp, magic);
tmp = V128_From2x64(0, l641);
V128 res2 = V128_PMul10(tmp, magic);
V128 x = V128_Xor(res1, res2);
l64 = V128_Low64(x) ^ absl::little_endian::Load64(p2);
l64 = CRC32_u64(l642, l64);
p = p2 + 8;
} else if (strategy == CutoffStrategy::Unroll64CRC) {
while ((e - p) >= 64) {
l64 = Process64BytesCRC(p, l64);
p += 64;
}
}
} else {
// There is a lot of data, so we can ignore combine costs and run all
// requested streams (num_crc_streams + num_pclmul_streams),
// using prefetch. CRC and PCLMULQDQ use different cpu execution units,
// so on some cpus it makes sense to execute both of them for different
// streams.
// Point x at first 8-byte aligned byte in string.
const uint8_t* x = RoundUp<8>(p);
// Process bytes until p is 8-byte aligned, if that isn't past the end.
while (p != x) {
ABSL_INTERNAL_STEP1(l);
}
int64_t bs = (e - p) / (num_crc_streams + num_pclmul_streams) / 64;
const uint8_t* crc_streams[kMaxStreams];
const uint8_t* pclmul_streams[kMaxStreams];
// We are guaranteed to have at least one crc stream.
crc_streams[0] = p;
for (int i = 1; i < num_crc_streams; i++) {
crc_streams[i] = crc_streams[i - 1] + bs * 64;
}
pclmul_streams[0] = crc_streams[num_crc_streams - 1] + bs * 64;
for (int i = 1; i < num_pclmul_streams; i++) {
pclmul_streams[i] = pclmul_streams[i - 1] + bs * 64;
}
// Per stream crc sums.
uint64_t l64_crc[kMaxStreams] = {l};
uint64_t l64_pclmul[kMaxStreams] = {0};
// Peel the first iteration, because the PCLMULQDQ streams need setup.
for (int i = 0; i < num_crc_streams; i++) {
l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
crc_streams[i] += 16 * 4;
}
V128 partialCRC[kMaxStreams][4];
for (int i = 0; i < num_pclmul_streams; i++) {
partialCRC[i][0] = V128_LoadU(
reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 0));
partialCRC[i][1] = V128_LoadU(
reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 1));
partialCRC[i][2] = V128_LoadU(
reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 2));
partialCRC[i][3] = V128_LoadU(
reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 3));
pclmul_streams[i] += 16 * 4;
}
for (int64_t i = 1; i < bs; i++) {
// Prefetch data for the next iterations.
for (int j = 0; j < num_crc_streams; j++) {
base_internal::PrefetchT0(
reinterpret_cast<const char*>(crc_streams[j] + kPrefetchHorizon));
}
for (int j = 0; j < num_pclmul_streams; j++) {
base_internal::PrefetchT0(reinterpret_cast<const char*>(
pclmul_streams[j] + kPrefetchHorizon));
}
// We process each stream in 64-byte blocks. This can be written as
// for (int i = 0; i < num_pclmul_streams; i++) {
// Process64BytesPclmul(pclmul_streams[i], partialCRC[i]);
// pclmul_streams[i] += 16 * 4;
// }
// for (int i = 0; i < num_crc_streams; i++) {
// l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
// crc_streams[i] += 16*4;
// }
// But unrolling and interleaving the PCLMULQDQ and CRC blocks manually
// gives a ~2% performance boost.
l64_crc[0] = Process64BytesCRC(crc_streams[0], l64_crc[0]);
crc_streams[0] += 16 * 4;
if (num_pclmul_streams > 0) {
Process64BytesPclmul(pclmul_streams[0], partialCRC[0]);
pclmul_streams[0] += 16 * 4;
}
if (num_crc_streams > 1) {
l64_crc[1] = Process64BytesCRC(crc_streams[1], l64_crc[1]);
crc_streams[1] += 16 * 4;
}
if (num_pclmul_streams > 1) {
Process64BytesPclmul(pclmul_streams[1], partialCRC[1]);
pclmul_streams[1] += 16 * 4;
}
if (num_crc_streams > 2) {
l64_crc[2] = Process64BytesCRC(crc_streams[2], l64_crc[2]);
crc_streams[2] += 16 * 4;
}
if (num_pclmul_streams > 2) {
Process64BytesPclmul(pclmul_streams[2], partialCRC[2]);
pclmul_streams[2] += 16 * 4;
}
}
// PCLMULQDQ-based streams require a special final step;
// CRC-based ones don't.
for (int i = 0; i < num_pclmul_streams; i++) {
l64_pclmul[i] = FinalizePclmulStream(partialCRC[i]);
}
// Combine all streams into a single result.
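// Conceptually, magic is the CRC operator for bs * 64 zero bytes:
// carry-less-multiplying a stream's CRC by it (mod the polynomial) is
// equivalent to extending that stream with the bytes consumed by the next
// stream, after which the two CRCs can simply be XORed.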
uint32_t magic = ComputeZeroConstant(bs * 64);
l64 = l64_crc[0];
for (int i = 1; i < num_crc_streams; i++) {
l64 = multiply(l64, magic);
l64 ^= l64_crc[i];
}
for (int i = 0; i < num_pclmul_streams; i++) {
l64 = multiply(l64, magic);
l64 ^= l64_pclmul[i];
}
// Update p.
if (num_pclmul_streams > 0) {
p = pclmul_streams[num_pclmul_streams - 1];
} else {
p = crc_streams[num_crc_streams - 1];
}
}
l = l64;
while ((e - p) >= 16) {
ABSL_INTERNAL_STEP8(l, p);
ABSL_INTERNAL_STEP8(l, p);
}
// Process the last few bytes.
while (p != e) {
ABSL_INTERNAL_STEP1(l);
}
#undef ABSL_INTERNAL_STEP8BY3
#undef ABSL_INTERNAL_STEP8BY2
#undef ABSL_INTERNAL_STEP8
#undef ABSL_INTERNAL_STEP4
#undef ABSL_INTERNAL_STEP2
#undef ABSL_INTERNAL_STEP1
*crc = l;
}
private:
// Update partialCRC with the crc of a 64-byte block. Calling
// FinalizePclmulStream would produce a single crc checksum, but it is
// expensive. PCLMULQDQ has a high latency, so we run 4 128-bit partial
// checksums that can be reduced to a single value by FinalizePclmulStream
// later. Computing a crc for arbitrary polynomials with PCLMULQDQ is
// described in the Intel paper "Fast CRC Computation for Generic
// Polynomials Using PCLMULQDQ Instruction"
// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
// We apply it to the CRC32C polynomial.
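// Each fold step below computes, per 128-bit lane,
//   partialCRC = (lo64(partialCRC) clmul k1) xor
//                (hi64(partialCRC) clmul k2) xor data
// which advances the lane's running value by 64 bytes of input.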
ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul(
const uint8_t* p, V128* partialCRC) const {
V128 loopMultiplicands = V128_Load(reinterpret_cast<const V128*>(k1k2));
V128 partialCRC1 = partialCRC[0];
V128 partialCRC2 = partialCRC[1];
V128 partialCRC3 = partialCRC[2];
V128 partialCRC4 = partialCRC[3];
V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands);
V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands);
V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands);
V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands);
V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0));
V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1));
V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2));
V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3));
partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands);
partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands);
partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands);
partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands);
partialCRC1 = V128_Xor(tmp1, partialCRC1);
partialCRC2 = V128_Xor(tmp2, partialCRC2);
partialCRC3 = V128_Xor(tmp3, partialCRC3);
partialCRC4 = V128_Xor(tmp4, partialCRC4);
partialCRC1 = V128_Xor(partialCRC1, data1);
partialCRC2 = V128_Xor(partialCRC2, data2);
partialCRC3 = V128_Xor(partialCRC3, data3);
partialCRC4 = V128_Xor(partialCRC4, data4);
partialCRC[0] = partialCRC1;
partialCRC[1] = partialCRC2;
partialCRC[2] = partialCRC3;
partialCRC[3] = partialCRC4;
}
// Reduce the partialCRC values produced by Process64BytesPclmul into a
// single value that represents the crc checksum of all the processed bytes.
ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t
FinalizePclmulStream(V128* partialCRC) const {
V128 partialCRC1 = partialCRC[0];
V128 partialCRC2 = partialCRC[1];
V128 partialCRC3 = partialCRC[2];
V128 partialCRC4 = partialCRC[3];
// Combine 4 vectors of partial crc into a single vector.
V128 reductionMultiplicands =
V128_Load(reinterpret_cast<const V128*>(k5k6));
V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1);
V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1);
partialCRC1 = V128_Xor(low, high);
partialCRC1 = V128_Xor(partialCRC1, partialCRC2);
low = V128_PMulLow(reductionMultiplicands, partialCRC3);
high = V128_PMulHi(reductionMultiplicands, partialCRC3);
partialCRC3 = V128_Xor(low, high);
partialCRC3 = V128_Xor(partialCRC3, partialCRC4);
reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k3k4));
low = V128_PMulLow(reductionMultiplicands, partialCRC1);
high = V128_PMulHi(reductionMultiplicands, partialCRC1);
V128 fullCRC = V128_Xor(low, high);
fullCRC = V128_Xor(fullCRC, partialCRC3);
// Reduce fullCRC to a scalar value.
reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k5k6));
V128 mask = V128_Load(reinterpret_cast<const V128*>(kMask));
V128 tmp = V128_PMul01(reductionMultiplicands, fullCRC);
fullCRC = V128_ShiftRight<8>(fullCRC);
fullCRC = V128_Xor(fullCRC, tmp);
reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k7k0));
tmp = V128_ShiftRight<4>(fullCRC);
fullCRC = V128_And(fullCRC, mask);
fullCRC = V128_PMulLow(reductionMultiplicands, fullCRC);
fullCRC = V128_Xor(tmp, fullCRC);
reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(kPoly));
tmp = V128_And(fullCRC, mask);
tmp = V128_PMul01(reductionMultiplicands, tmp);
tmp = V128_And(tmp, mask);
tmp = V128_PMulLow(reductionMultiplicands, tmp);
fullCRC = V128_Xor(tmp, fullCRC);
return V128_Extract32<1>(fullCRC);
}
// Update crc with 64 bytes of data from p.
ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p,
uint64_t crc) const {
for (int i = 0; i < 8; i++) {
crc = CRC32_u64(crc, absl::little_endian::Load64(p));
p += 8;
}
return crc;
}
// Generated by crc32c_x86_test --crc32c_generate_constants=true
// and verified against the constants in the Linux kernel for S390:
// https://github.com/torvalds/linux/blob/master/arch/s390/crypto/crc32le-vx.S
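// Conceptually, k1k2 through k7k0 are precomputed x^n mod P folding
// constants for the CRC32C polynomial, and kPoly holds the (bit-reflected)
// polynomial together with its Barrett-reduction constant; see the Intel
// paper cited in Process64BytesPclmul.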
alignas(16) static constexpr uint64_t k1k2[2] = {0x0740eef02, 0x09e4addf8};
alignas(16) static constexpr uint64_t k3k4[2] = {0x1384aa63a, 0x0ba4fc28e};
alignas(16) static constexpr uint64_t k5k6[2] = {0x0f20c0dfe, 0x14cd00bd6};
alignas(16) static constexpr uint64_t k7k0[2] = {0x0dd45aab8, 0x000000000};
alignas(16) static constexpr uint64_t kPoly[2] = {0x105ec76f0, 0x0dea713f1};
alignas(16) static constexpr uint32_t kMask[4] = {~0u, 0u, ~0u, 0u};
// Medium runs of bytes are broken into groups of kGroupsSmall blocks of the
// same size. Each group is CRCed in parallel, then the groups are combined
// at the end.
static constexpr int kGroupsSmall = 3;
// For large runs we use up to kMaxStreams blocks computed with the CRC
// instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which
// are combined at the end.
static constexpr int kMaxStreams = 3;
};
} // namespace
// Intel processors with SSE4.2 have an instruction for one particular
// 32-bit CRC polynomial: crc32c.
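// (The crc32 instruction implements the Castagnoli polynomial 0x1EDC6F41,
// the same polynomial this library exposes as crc32c.)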
CRCImpl* TryNewCRC32AcceleratedX86ARMCombined() {
CpuType type = GetCpuType();
switch (type) {
case CpuType::kIntelHaswell:
case CpuType::kAmdRome:
case CpuType::kAmdNaples:
case CpuType::kAmdMilan:
return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
3, 1, CutoffStrategy::Fold3>();
// PCLMULQDQ is fast; use the combined PCLMULQDQ + CRC implementation.
case CpuType::kIntelCascadelakeXeon:
case CpuType::kIntelSkylakeXeon:
case CpuType::kIntelBroadwell:
case CpuType::kIntelSkylake:
return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
3, 2, CutoffStrategy::Fold3>();
// PCLMULQDQ is slow, don't use it.
case CpuType::kIntelIvybridge:
case CpuType::kIntelSandybridge:
case CpuType::kIntelWestmere:
return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
3, 0, CutoffStrategy::Fold3>();
case CpuType::kArmNeoverseN1:
return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
1, 1, CutoffStrategy::Unroll64CRC>();
#if defined(__aarch64__)
default:
// Not all ARM processors support the needed instructions, so check here
// before trying to use an accelerated implementation.
if (SupportsArmCRC32PMULL()) {
return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
1, 1, CutoffStrategy::Unroll64CRC>();
} else {
return nullptr;
}
#else
default:
// Something else; play it safe and assume slow PCLMULQDQ.
return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
3, 0, CutoffStrategy::Fold3>();
#endif
}
}
std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll() {
auto ret = std::vector<std::unique_ptr<CRCImpl>>();
ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
1, 0, CutoffStrategy::Fold3>>());
ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
1, 1, CutoffStrategy::Fold3>>());
ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
1, 2, CutoffStrategy::Fold3>>());
ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
1, 3, CutoffStrategy::Fold3>>());
ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
2, 0, CutoffStrategy::Fold3>>());
ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
2, 1, CutoffStrategy::Fold3>>());
ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
2, 2, CutoffStrategy::Fold3>>());
ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
2, 3, CutoffStrategy::Fold3>>());
ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
3, 0, CutoffStrategy::Fold3>>());
ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
3, 1, CutoffStrategy::Fold3>>());
ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
3, 2, CutoffStrategy::Fold3>>());
ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
3, 3, CutoffStrategy::Fold3>>());
ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
1, 0, CutoffStrategy::Unroll64CRC>>());
ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
1, 1, CutoffStrategy::Unroll64CRC>>());
ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
1, 2, CutoffStrategy::Unroll64CRC>>());
ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
1, 3, CutoffStrategy::Unroll64CRC>>());
ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
2, 0, CutoffStrategy::Unroll64CRC>>());
ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
2, 1, CutoffStrategy::Unroll64CRC>>());
ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
2, 2, CutoffStrategy::Unroll64CRC>>());
ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
2, 3, CutoffStrategy::Unroll64CRC>>());
ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
3, 0, CutoffStrategy::Unroll64CRC>>());
ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
3, 1, CutoffStrategy::Unroll64CRC>>());
ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
3, 2, CutoffStrategy::Unroll64CRC>>());
ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
3, 3, CutoffStrategy::Unroll64CRC>>());
return ret;
}
#else // !ABSL_INTERNAL_CAN_USE_SIMD_CRC32C
std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll() {
return std::vector<std::unique_ptr<CRCImpl>>();
}
// No hardware acceleration available.
CRCImpl* TryNewCRC32AcceleratedX86ARMCombined() { return nullptr; }
#endif
} // namespace crc_internal
ABSL_NAMESPACE_END
} // namespace absl

@ -0,0 +1,77 @@
// Copyright 2022 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef ABSL_CRC_INTERNAL_NON_TEMPORAL_ARM_INTRINSICS_H_
#define ABSL_CRC_INTERNAL_NON_TEMPORAL_ARM_INTRINSICS_H_
#ifdef __aarch64__
#include <arm_neon.h>
typedef int64x2_t __m128i; /* 128-bit vector containing integers */
#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
#define vreinterpretq_s64_m128i(x) (x)
// Guarantees that every preceding store is globally visible before any
// subsequent store.
// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
static inline __attribute__((always_inline)) void _mm_sfence(void) {
__sync_synchronize();
}
// Load 128 bits of integer data from unaligned memory into dst. This intrinsic
// may perform better than _mm_loadu_si128 when the data crosses a cache line
// boundary.
//
// dst[127:0] := MEM[mem_addr+127:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
#define _mm_lddqu_si128 _mm_loadu_si128
// Loads a 128-bit value.
// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
static inline __attribute__((always_inline)) __m128i _mm_loadu_si128(
const __m128i *p) {
return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *)p));
}
// Stores the data in a to the address p without polluting the caches. If the
// cache line containing address p is already in the cache, the cache will be
// updated.
// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
static inline __attribute__((always_inline)) void _mm_stream_si128(__m128i *p,
__m128i a) {
#if __has_builtin(__builtin_nontemporal_store)
__builtin_nontemporal_store(a, p);
#else
vst1q_s64((int64_t *)p, vreinterpretq_s64_m128i(a));
#endif
}
// Sets the 16 signed 8-bit integer values.
// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
static inline __attribute__((always_inline)) __m128i _mm_set_epi8(
signed char b15, signed char b14, signed char b13, signed char b12,
signed char b11, signed char b10, signed char b9, signed char b8,
signed char b7, signed char b6, signed char b5, signed char b4,
signed char b3, signed char b2, signed char b1, signed char b0) {
int8_t __attribute__((aligned(16)))
data[16] = {(int8_t)b0, (int8_t)b1, (int8_t)b2, (int8_t)b3,
(int8_t)b4, (int8_t)b5, (int8_t)b6, (int8_t)b7,
(int8_t)b8, (int8_t)b9, (int8_t)b10, (int8_t)b11,
(int8_t)b12, (int8_t)b13, (int8_t)b14, (int8_t)b15};
return (__m128i)vld1q_s8(data);
}
#endif // __aarch64__
#endif // ABSL_CRC_INTERNAL_NON_TEMPORAL_ARM_INTRINSICS_H_

@ -0,0 +1,172 @@
// Copyright 2022 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_
#define ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include "absl/base/config.h"
#include "absl/base/optimization.h"
#ifdef __SSE__
// Only include when targeting a CPU with SSE support; needed for
// _mm_sfence.
#include <immintrin.h> // IWYU pragma: keep
#endif
#ifdef __SSE2__
// Only include when targeting a CPU with SSE2 support; needed for
// movdqa, movdqu, and movntdq.
#include <emmintrin.h> // IWYU pragma: keep
#endif
#ifdef __aarch64__
// Only include when targeting AArch64; provides NEON-based shims for the
// x86-style intrinsics used below.
#include "absl/crc/internal/non_temporal_arm_intrinsics.h"
#endif
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {
// This non-temporal memcpy uses regular loads and non-temporal stores. It
// works with both 16-byte-aligned and unaligned addresses. If the data at
// the destination is not accessed immediately, a non-temporal memcpy can
// save one DRAM load of the destination cacheline.
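// For example, a producer filling a large buffer that will next be consumed
// by another core or by DMA can use non_temporal_store_memcpy to avoid
// evicting its own working set; if the destination is read back immediately,
// a plain memcpy is usually the better choice.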
constexpr int kCacheLineSize = ABSL_CACHELINE_SIZE;
// If the objects overlap, the behavior is undefined.
// MSVC does not have proper header support for some of these intrinsics,
// so it falls back to plain memcpy.
inline void *non_temporal_store_memcpy(void *__restrict dst,
const void *__restrict src, size_t len) {
#if (defined(__SSE3__) || defined(__aarch64__)) && !defined(_MSC_VER)
uint8_t *d = reinterpret_cast<uint8_t *>(dst);
const uint8_t *s = reinterpret_cast<const uint8_t *>(src);
// memcpy() the misaligned header. At the end of this if block, <d> is
// aligned to a 64-byte cacheline boundary or <len> == 0.
if (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1)) {
uintptr_t bytes_before_alignment_boundary =
kCacheLineSize -
(reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1));
size_t header_len = (std::min)(bytes_before_alignment_boundary, len);
assert(bytes_before_alignment_boundary < kCacheLineSize);
memcpy(d, s, header_len);
d += header_len;
s += header_len;
len -= header_len;
}
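// Example: with a 64-byte cacheline, if d % 64 == 40 then
// bytes_before_alignment_boundary == 24, so up to 24 bytes are copied with
// a plain memcpy before the streaming loop below takes over.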
if (len >= kCacheLineSize) {
_mm_sfence();
__m128i *dst_cacheline = reinterpret_cast<__m128i *>(d);
const __m128i *src_cacheline = reinterpret_cast<const __m128i *>(s);
constexpr int kOpsPerCacheLine = kCacheLineSize / sizeof(__m128i);
uint64_t loops = len / kCacheLineSize;
while (len >= kCacheLineSize) {
__m128i temp1, temp2, temp3, temp4;
temp1 = _mm_lddqu_si128(src_cacheline + 0);
temp2 = _mm_lddqu_si128(src_cacheline + 1);
temp3 = _mm_lddqu_si128(src_cacheline + 2);
temp4 = _mm_lddqu_si128(src_cacheline + 3);
_mm_stream_si128(dst_cacheline + 0, temp1);
_mm_stream_si128(dst_cacheline + 1, temp2);
_mm_stream_si128(dst_cacheline + 2, temp3);
_mm_stream_si128(dst_cacheline + 3, temp4);
src_cacheline += kOpsPerCacheLine;
dst_cacheline += kOpsPerCacheLine;
len -= kCacheLineSize;
}
d += loops * kCacheLineSize;
s += loops * kCacheLineSize;
_mm_sfence();
}
// memcpy the tail.
if (len) {
memcpy(d, s, len);
}
return dst;
#else
// Fall back to regular memcpy when neither SSE3 nor AArch64 is available.
return memcpy(dst, src, len);
#endif // __SSE3__ || __aarch64__
}
// MSVC does not have proper header support for some of these intrinsics,
// so it falls back to plain memcpy.
inline void *non_temporal_store_memcpy_avx(void *__restrict dst,
const void *__restrict src,
size_t len) {
#if defined(__AVX__) && !defined(_MSC_VER)
uint8_t *d = reinterpret_cast<uint8_t *>(dst);
const uint8_t *s = reinterpret_cast<const uint8_t *>(src);
// memcpy() the misaligned header. At the end of this if block, <d> is
// aligned to a 64-byte cacheline boundary or <len> == 0.
if (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1)) {
uintptr_t bytes_before_alignment_boundary =
kCacheLineSize -
(reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1));
size_t header_len = (std::min)(bytes_before_alignment_boundary, len);
assert(bytes_before_alignment_boundary < kCacheLineSize);
memcpy(d, s, header_len);
d += header_len;
s += header_len;
len -= header_len;
}
if (len >= kCacheLineSize) {
_mm_sfence();
__m256i *dst_cacheline = reinterpret_cast<__m256i *>(d);
const __m256i *src_cacheline = reinterpret_cast<const __m256i *>(s);
constexpr int kOpsPerCacheLine = kCacheLineSize / sizeof(__m256i);
size_t loops = len / kCacheLineSize;
while (len >= kCacheLineSize) {
__m256i temp1, temp2;
temp1 = _mm256_lddqu_si256(src_cacheline + 0);
temp2 = _mm256_lddqu_si256(src_cacheline + 1);
_mm256_stream_si256(dst_cacheline + 0, temp1);
_mm256_stream_si256(dst_cacheline + 1, temp2);
src_cacheline += kOpsPerCacheLine;
dst_cacheline += kOpsPerCacheLine;
len -= kCacheLineSize;
}
d += loops * kCacheLineSize;
s += loops * kCacheLineSize;
_mm_sfence();
}
// memcpy the tail.
if (len) {
memcpy(d, s, len);
}
return dst;
#else
// Fall back to regular memcpy when AVX is not available.
return memcpy(dst, src, len);
#endif // __AVX__
}
} // namespace crc_internal
ABSL_NAMESPACE_END
} // namespace absl
#endif // ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_

@ -0,0 +1,88 @@
// Copyright 2022 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "absl/crc/internal/non_temporal_memcpy.h"
#include <algorithm>
#include <cstdint>
#include <vector>
#include "gtest/gtest.h"
namespace {
struct TestParam {
size_t copy_size;
uint32_t src_offset;
uint32_t dst_offset;
};
class NonTemporalMemcpyTest : public testing::TestWithParam<TestParam> {
protected:
void SetUp() override {
// Make buf_size a multiple of 16 bytes.
size_t buf_size = ((std::max(GetParam().src_offset, GetParam().dst_offset) +
GetParam().copy_size) +
15) /
16 * 16;
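// E.g. src_offset = 9, dst_offset = 3, copy_size = 4096:
// 9 + 4096 = 4105, rounded up to the next multiple of 16 gives
// buf_size = 4112.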
a_.resize(buf_size);
b_.resize(buf_size);
for (size_t i = 0; i < buf_size; i++) {
a_[i] = i % 256;
b_[i] = ~a_[i];
}
}
std::vector<uint8_t> a_, b_;
};
TEST_P(NonTemporalMemcpyTest, SSEEquality) {
uint8_t *src = a_.data() + GetParam().src_offset;
uint8_t *dst = b_.data() + GetParam().dst_offset;
absl::crc_internal::non_temporal_store_memcpy(dst, src, GetParam().copy_size);
for (size_t i = 0; i < GetParam().copy_size; i++) {
EXPECT_EQ(src[i], dst[i]);
}
}
TEST_P(NonTemporalMemcpyTest, AVXEquality) {
uint8_t* src = a_.data() + GetParam().src_offset;
uint8_t* dst = b_.data() + GetParam().dst_offset;
absl::crc_internal::non_temporal_store_memcpy_avx(dst, src,
GetParam().copy_size);
for (size_t i = 0; i < GetParam().copy_size; i++) {
EXPECT_EQ(src[i], dst[i]);
}
}
// 63B is smaller than one cacheline operation, so the non-temporal routine
// will not be invoked.
// The 4096B cases, with room for offsets, exercise the full cacheline copy
// path.
constexpr TestParam params[] = {
{63, 0, 0}, {58, 5, 5}, {61, 2, 0}, {61, 0, 2},
{58, 5, 2}, {4096, 0, 0}, {4096, 0, 1}, {4096, 0, 2},
{4096, 0, 3}, {4096, 0, 4}, {4096, 0, 5}, {4096, 0, 6},
{4096, 0, 7}, {4096, 0, 8}, {4096, 0, 9}, {4096, 0, 10},
{4096, 0, 11}, {4096, 0, 12}, {4096, 0, 13}, {4096, 0, 14},
{4096, 0, 15}, {4096, 7, 7}, {4096, 3, 0}, {4096, 1, 0},
{4096, 9, 3}, {4096, 9, 11}, {8192, 0, 0}, {8192, 5, 2},
{1024768, 7, 11}, {1, 0, 0}, {1, 0, 1}, {1, 1, 0},
{1, 1, 1}};
INSTANTIATE_TEST_SUITE_P(ParameterizedNonTemporalMemcpyTest,
NonTemporalMemcpyTest, testing::ValuesIn(params));
} // namespace