From 1e15682f1a4bb64c48b84884976a2b5c4201e878 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Mon, 27 Dec 2021 13:27:22 -0500 Subject: [PATCH] Enable SHA-512 ARM acceleration when available. This imports the changes to sha512-armv8.pl from upstream's af0fcf7b4668218b24d9250b95e0b96939ccb4d1. Tweaks needed: - Add an explicit .text because we put .LK$BITS in .rodata for XOM - .LK$bits and code are in separate sections, so use adrp/add instead of plain adr - Where glibc needs feature flags to *enable* pthread_rwlock, Apple interprets _XOPEN_SOURCE as a request to *disable* Apple extensions. Tighten the condition on the _XOPEN_SOURCE check. Added support for macOS and Linux, tested manually on an ARM Mac and a VM, respectively. Fuchsia and Windows do not currently have APIs to expose this bit, so I've left in TODOs. Benchmarks from an Apple M1 Max: Before: Did 4647000 SHA-512 (16 bytes) operations in 1000103us (74.3 MB/sec) Did 1614000 SHA-512 (256 bytes) operations in 1000379us (413.0 MB/sec) Did 439000 SHA-512 (1350 bytes) operations in 1001694us (591.6 MB/sec) Did 76000 SHA-512 (8192 bytes) operations in 1011821us (615.3 MB/sec) Did 39000 SHA-512 (16384 bytes) operations in 1024311us (623.8 MB/sec) After: Did 10369000 SHA-512 (16 bytes) operations in 1000088us (165.9 MB/sec) [+123.1%] Did 3650000 SHA-512 (256 bytes) operations in 1000079us (934.3 MB/sec) [+126.2%] Did 1029000 SHA-512 (1350 bytes) operations in 1000521us (1388.4 MB/sec) [+134.7%] Did 175000 SHA-512 (8192 bytes) operations in 1001874us (1430.9 MB/sec) [+132.5%] Did 89000 SHA-512 (16384 bytes) operations in 1010314us (1443.3 MB/sec) [+131.4%] (This doesn't seem to change the overall SHA-256 vs SHA-512 performance question on ARM, when hashing perf matters. SHA-256 on the same chip gets up to 2454.6 MB/s.) In terms of build coverage, for now, we'll have build coverage everywhere and test coverage on Chromium, which runs this code on macOS CI. 
We should request a macOS ARM64 bot for our standalone CI. Longer term, we need a QEMU-based builder to test various features. QEMU seems to have pretty good coverage of all this, which will at least give us Linux. I haven't added an OPENSSL_STATIC_ARMCAP_SHA512 for now. Instead, we just look at the standard __ARM_FEATURE_SHA512 define. Strangely, the corresponding -march tag is not sha512. Neither GCC nor Clang has -march=armv8-a+sha512. Instead, -march=armv8-a+sha3 implies both __ARM_FEATURE_SHA3 and __ARM_FEATURE_SHA512! Yet everything else seems to describe the SHA512 extension as separate from SHA3. https://developer.arm.com/architectures/system-architectures/software-standards/acle Update-Note: Consumers with a different build setup may need to limit -D_XOPEN_SOURCE=700 to Linux or non-Apple platforms. Otherwise, `sys/types.h` won't define some typedef needed by `sys/sysctl.h`. If you see a build error about u_char, etc., being undefined in some system header, that is probably the cause. Change-Id: Ia213d3796b84c71b7966bb68e0aec92e5d7d26f0 Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/50807 Reviewed-by: Adam Langley Commit-Queue: David Benjamin --- BUILDING.md | 35 +++--- CMakeLists.txt | 6 +- crypto/CMakeLists.txt | 1 + crypto/cpu-aarch64-apple.c | 73 ++++++++++++ crypto/cpu-aarch64-fuchsia.c | 5 +- crypto/cpu-aarch64-linux.c | 6 +- crypto/cpu-aarch64-win.c | 4 +- crypto/crypto.c | 3 + crypto/fipsmodule/sha/asm/sha512-armv8.pl | 132 +++++++++++++++++++++- include/openssl/arm_arch.h | 3 + include/openssl/cpu.h | 5 +- util/BUILD.toplevel | 16 ++- 12 files changed, 257 insertions(+), 32 deletions(-) create mode 100644 crypto/cpu-aarch64-apple.c diff --git a/BUILDING.md b/BUILDING.md index 08f004c84..64b152005 100644 --- a/BUILDING.md +++ b/BUILDING.md @@ -163,22 +163,17 @@ BoringSSL maintainers if making use of it. don't have steps for assembling the assembly language source files, so they currently cannot be used to build BoringSSL. 
-## Embedded ARM +## ARM CPU Capabilities -ARM, unlike Intel, does not have an instruction that allows applications to -discover the capabilities of the processor. Instead, the capability information -has to be provided by the operating system somehow. +ARM, unlike Intel, does not have a userspace instruction that allows +applications to discover the capabilities of the processor. Instead, the +capability information has to be provided by a combination of compile-time +information and the operating system. -By default, on Linux-based systems, BoringSSL will try to use `getauxval` and -`/proc` to discover the capabilities. But some environments don't support that -sort of thing and, for them, it's possible to configure the CPU capabilities at -compile time. - -On iOS or builds which define `OPENSSL_STATIC_ARMCAP`, features will be -determined based on the `__ARM_NEON__` and `__ARM_FEATURE_CRYPTO` preprocessor -symbols reported by the compiler. These values are usually controlled by the -`-march` flag. You can also define any of the following to enable the -corresponding ARM feature. +BoringSSL determines capabilities at compile-time based on `__ARM_NEON__`, +`__ARM_FEATURE_CRYPTO`, and other preprocessor symbols reported by the compiler. +These values are usually controlled by the `-march` flag. You can also define +any of the following to enable the corresponding ARM feature. * `OPENSSL_STATIC_ARMCAP_NEON` * `OPENSSL_STATIC_ARMCAP_AES` @@ -186,8 +181,16 @@ corresponding ARM feature. * `OPENSSL_STATIC_ARMCAP_SHA256` * `OPENSSL_STATIC_ARMCAP_PMULL` -Note that if a feature is enabled in this way, but not actually supported at -run-time, BoringSSL will likely crash. +The resulting binary will assume all such features are always present. This can +reduce code size, by allowing the compiler to omit fallbacks. However, if the +feature is not actually supported at runtime, BoringSSL will likely crash. 
+ +BoringSSL will additionally query the operating system at runtime for additional +features, e.g. with `getauxval` on Linux. This allows a single binary to use +newer instructions when present, but still function on CPUs without them. But +some environments don't support runtime queries. If building for those, define +`OPENSSL_STATIC_ARMCAP` to limit BoringSSL to compile-time capabilities. If not +defined, the target operating system must be known to BoringSSL. ## Binary Size diff --git a/CMakeLists.txt b/CMakeLists.txt index f3fc7bcd7..6c70b55f9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -257,8 +257,10 @@ if(CMAKE_COMPILER_IS_GNUCXX) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c11") endif() -# pthread_rwlock_t requires a feature flag. -if(NOT WIN32) +# pthread_rwlock_t on Linux requires a feature flag. However, it should not be +# set on Apple platforms, where it instead disables APIs we use. See compat(5) +# and sys/cdefs.h. +if(NOT WIN32 AND NOT APPLE) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D_XOPEN_SOURCE=700") endif() diff --git a/crypto/CMakeLists.txt b/crypto/CMakeLists.txt index d73ce1e3b..31ccfc142 100644 --- a/crypto/CMakeLists.txt +++ b/crypto/CMakeLists.txt @@ -263,6 +263,7 @@ add_library( cipher_extra/tls_cbc.c cmac/cmac.c conf/conf.c + cpu-aarch64-apple.c cpu-aarch64-fuchsia.c cpu-aarch64-linux.c cpu-aarch64-win.c diff --git a/crypto/cpu-aarch64-apple.c b/crypto/cpu-aarch64-apple.c new file mode 100644 index 000000000..56012d640 --- /dev/null +++ b/crypto/cpu-aarch64-apple.c @@ -0,0 +1,73 @@ +/* Copyright (c) 2021, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#include + +#if defined(OPENSSL_AARCH64) && defined(OPENSSL_APPLE) && \ + !defined(OPENSSL_STATIC_ARMCAP) + +#include +#include + +#include + +#include "internal.h" + + +extern uint32_t OPENSSL_armcap_P; + +static int has_hw_feature(const char *name) { + int value; + size_t len = sizeof(value); + if (sysctlbyname(name, &value, &len, NULL, 0) != 0) { + return 0; + } + if (len != sizeof(int)) { + // This should not happen. All the values queried should be integer-valued. + assert(0); + return 0; + } + + // Per sys/sysctl.h: + // + // Selectors that return errors are not support on the system. Supported + // features will return 1 if they are recommended or 0 if they are supported + // but are not expected to help performance. Future versions of these + // selectors may return larger values as necessary so it is best to test for + // non zero. + return value != 0; +} + +void OPENSSL_cpuid_setup(void) { + // Apple ARM64 platforms have NEON and cryptography extensions available + // statically, so we do not need to query them. In particular, there sometimes + // are no sysctls corresponding to such features. See below. +#if !defined(__ARM_NEON) || !defined(__ARM_FEATURE_CRYPTO) +#error "NEON and crypto extensions should be statically available." +#endif + OPENSSL_armcap_P = + ARMV7_NEON | ARMV8_AES | ARMV8_PMULL | ARMV8_SHA1 | ARMV8_SHA256; + + // macOS has sysctls named both like "hw.optional.arm.FEAT_SHA512" and like + // "hw.optional.armv8_2_sha512". There does not appear to be documentation on + // which to use. The "armv8_2_sha512" style omits statically-available + // features, while the "FEAT_SHA512" style includes them. 
However, the + // "FEAT_SHA512" style was added in macOS 12, so we use the older style for + // better compatibility and handle static features above. + if (has_hw_feature("hw.optional.armv8_2_sha512")) { + OPENSSL_armcap_P |= ARMV8_SHA512; + } +} + +#endif // OPENSSL_AARCH64 && OPENSSL_APPLE && !OPENSSL_STATIC_ARMCAP diff --git a/crypto/cpu-aarch64-fuchsia.c b/crypto/cpu-aarch64-fuchsia.c index 98303a033..5c6d115cb 100644 --- a/crypto/cpu-aarch64-fuchsia.c +++ b/crypto/cpu-aarch64-fuchsia.c @@ -50,6 +50,9 @@ void OPENSSL_cpuid_setup(void) { if (hwcap & ZX_ARM64_FEATURE_ISA_SHA2) { OPENSSL_armcap_P |= ARMV8_SHA256; } + // As of writing, Fuchsia does not have a flag for ARMv8.2 SHA-512 + // extensions. When it does, add it here. See + // https://bugs.fuchsia.dev/p/fuchsia/issues/detail?id=90759. } -#endif // OPENSSL_AARCH64 && !OPENSSL_STATIC_ARMCAP +#endif // OPENSSL_AARCH64 && OPENSSL_FUCHSIA && !OPENSSL_STATIC_ARMCAP diff --git a/crypto/cpu-aarch64-linux.c b/crypto/cpu-aarch64-linux.c index 0184dd4ff..6ae870a31 100644 --- a/crypto/cpu-aarch64-linux.c +++ b/crypto/cpu-aarch64-linux.c @@ -36,6 +36,7 @@ void OPENSSL_cpuid_setup(void) { static const unsigned long kPMULL = 1 << 4; static const unsigned long kSHA1 = 1 << 5; static const unsigned long kSHA256 = 1 << 6; + static const unsigned long kSHA512 = 1 << 21; if ((hwcap & kNEON) == 0) { // Matching OpenSSL, if NEON is missing, don't report other features @@ -57,6 +58,9 @@ void OPENSSL_cpuid_setup(void) { if (hwcap & kSHA256) { OPENSSL_armcap_P |= ARMV8_SHA256; } + if (hwcap & kSHA512) { + OPENSSL_armcap_P |= ARMV8_SHA512; + } } -#endif // OPENSSL_AARCH64 && !OPENSSL_STATIC_ARMCAP +#endif // OPENSSL_AARCH64 && OPENSSL_LINUX && !OPENSSL_STATIC_ARMCAP diff --git a/crypto/cpu-aarch64-win.c b/crypto/cpu-aarch64-win.c index ee7f8e02a..3d0014ea0 100644 --- a/crypto/cpu-aarch64-win.c +++ b/crypto/cpu-aarch64-win.c @@ -36,6 +36,8 @@ void OPENSSL_cpuid_setup(void) { OPENSSL_armcap_P |= ARMV8_SHA1; OPENSSL_armcap_P |= 
ARMV8_SHA256; } + // As of writing, Windows does not have a |PF_*| value for ARMv8.2 SHA-512 + // extensions. When it does, add it here. } -#endif +#endif // OPENSSL_AARCH64 && OPENSSL_WINDOWS && !OPENSSL_STATIC_ARMCAP diff --git a/crypto/crypto.c b/crypto/crypto.c index 6886aa4e1..b78b12227 100644 --- a/crypto/crypto.c +++ b/crypto/crypto.c @@ -104,6 +104,9 @@ HIDDEN uint32_t OPENSSL_armcap_P = #endif #if defined(OPENSSL_STATIC_ARMCAP_PMULL) || defined(__ARM_FEATURE_CRYPTO) ARMV8_PMULL | +#endif +#if defined(__ARM_FEATURE_SHA512) + ARMV8_SHA512 | #endif 0; diff --git a/crypto/fipsmodule/sha/asm/sha512-armv8.pl b/crypto/fipsmodule/sha/asm/sha512-armv8.pl index e96131213..8cb312fac 100644 --- a/crypto/fipsmodule/sha/asm/sha512-armv8.pl +++ b/crypto/fipsmodule/sha/asm/sha512-armv8.pl @@ -185,8 +185,6 @@ $code.=<<___; .type $func,%function .align 6 $func: -___ -$code.=<<___ if ($SZ==4); AARCH64_VALID_CALL_TARGET #ifndef __KERNEL__ #if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10 @@ -195,11 +193,17 @@ $code.=<<___ if ($SZ==4); adrp x16,:pg_hi21:OPENSSL_armcap_P #endif ldr w16,[x16,:lo12:OPENSSL_armcap_P] +___ +$code.=<<___ if ($SZ==4); tst w16,#ARMV8_SHA256 b.ne .Lv8_entry -#endif +___ +$code.=<<___ if ($SZ==8); + tst w16,#ARMV8_SHA512 + b.ne .Lv8_entry ___ $code.=<<___; +#endif AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! add x29,sp,#0 @@ -425,6 +429,110 @@ $code.=<<___; ___ } +if ($SZ==8) { +my $Ktbl="x3"; + +my @H = map("v$_.16b",(0..4)); +my ($fg,$de,$m9_10)=map("v$_.16b",(5..7)); +my @MSG=map("v$_.16b",(16..23)); +my ($W0,$W1)=("v24.2d","v25.2d"); +my ($AB,$CD,$EF,$GH)=map("v$_.16b",(26..29)); + +$code.=<<___; +.text +#ifndef __KERNEL__ +.type sha512_block_armv8,%function +.align 6 +sha512_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {@MSG[0]-@MSG[3]},[$inp],#64 // load input + ld1 {@MSG[4]-@MSG[7]},[$inp],#64 + + ld1.64 {@H[0]-@H[3]},[$ctx] // load context + adrp $Ktbl,:pg_hi21:.LK512 + add $Ktbl,$Ktbl,:lo12:.LK512 + + rev64 @MSG[0],@MSG[0] + rev64 @MSG[1],@MSG[1] + rev64 @MSG[2],@MSG[2] + rev64 @MSG[3],@MSG[3] + rev64 @MSG[4],@MSG[4] + rev64 @MSG[5],@MSG[5] + rev64 @MSG[6],@MSG[6] + rev64 @MSG[7],@MSG[7] + b .Loop_hw + +.align 4 +.Loop_hw: + ld1.64 {$W0},[$Ktbl],#16 + subs $num,$num,#1 + sub x4,$inp,#128 + orr $AB,@H[0],@H[0] // offload + orr $CD,@H[1],@H[1] + orr $EF,@H[2],@H[2] + orr $GH,@H[3],@H[3] + csel $inp,$inp,x4,ne // conditional rewind +___ +for($i=0;$i<32;$i++) { +$code.=<<___; + add.i64 $W0,$W0,@MSG[0] + ld1.64 {$W1},[$Ktbl],#16 + ext $W0,$W0,$W0,#8 + ext $fg,@H[2],@H[3],#8 + ext $de,@H[1],@H[2],#8 + add.i64 @H[3],@H[3],$W0 // "T1 + H + K512[i]" + sha512su0 @MSG[0],@MSG[1] + ext $m9_10,@MSG[4],@MSG[5],#8 + sha512h @H[3],$fg,$de + sha512su1 @MSG[0],@MSG[7],$m9_10 + add.i64 @H[4],@H[1],@H[3] // "D + T1" + sha512h2 @H[3],$H[1],@H[0] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); + @H = (@H[3],@H[0],@H[4],@H[2],@H[1]); +} +for(;$i<40;$i++) { +$code.=<<___ if ($i<39); + ld1.64 {$W1},[$Ktbl],#16 +___ +$code.=<<___ if ($i==39); + sub $Ktbl,$Ktbl,#$rounds*$SZ // rewind +___ +$code.=<<___; + add.i64 $W0,$W0,@MSG[0] + ld1 {@MSG[0]},[$inp],#16 // load next input + ext $W0,$W0,$W0,#8 + ext $fg,@H[2],@H[3],#8 + ext $de,@H[1],@H[2],#8 + add.i64 @H[3],@H[3],$W0 // "T1 + H + K512[i]" + sha512h @H[3],$fg,$de + rev64 @MSG[0],@MSG[0] + add.i64 @H[4],@H[1],@H[3] // "D + T1" + sha512h2 @H[3],$H[1],@H[0] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); + @H = (@H[3],@H[0],@H[4],@H[2],@H[1]); +} +$code.=<<___; + add.i64 @H[0],@H[0],$AB // accumulate + add.i64 @H[1],@H[1],$CD + add.i64 @H[2],@H[2],$EF + add.i64 @H[3],@H[3],$GH + + cbnz $num,.Loop_hw + + st1.64 {@H[0]-@H[3]},[$ctx] // store context + + ldr x29,[sp],#16 + ret +.size sha512_block_armv8,.-sha512_block_armv8 
+#endif +___ +} + { my %opcode = ( "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); @@ -440,6 +548,21 @@ ___ } } +{ my %opcode = ( + "sha512h" => 0xce608000, "sha512h2" => 0xce608400, + "sha512su0" => 0xcec08000, "sha512su1" => 0xce608800 ); + + sub unsha512 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; + } +} + open SELF,$0; while() { next if (/^#!/); @@ -452,12 +575,15 @@ foreach(split("\n",$code)) { s/\`([^\`]*)\`/eval($1)/ge; + s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge or s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers s/\.[ui]?8(\s)/$1/; + s/\.\w?64\b// and s/\.16b/\.2d/g or s/\.\w?32\b// and s/\.16b/\.4s/g; + m/\bext\b/ and s/\.2d/\.16b/g or m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; print $_,"\n"; diff --git a/include/openssl/arm_arch.h b/include/openssl/arm_arch.h index 81dc79656..13f5b4aa8 100644 --- a/include/openssl/arm_arch.h +++ b/include/openssl/arm_arch.h @@ -117,6 +117,9 @@ // ARMV8_PMULL indicates support for carryless multiplication. #define ARMV8_PMULL (1 << 5) +// ARMV8_SHA512 indicates support for hardware SHA-512 instructions. +#define ARMV8_SHA512 (1 << 6) + #if defined(__ASSEMBLER__) // Support macros for diff --git a/include/openssl/cpu.h b/include/openssl/cpu.h index 91cf95e14..e71fbecd0 100644 --- a/include/openssl/cpu.h +++ b/include/openssl/cpu.h @@ -105,8 +105,9 @@ OPENSSL_INLINE const uint32_t *OPENSSL_ia32cap_get(void) { #if defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) -#if defined(OPENSSL_APPLE) -// iOS builds use the static ARM configuration. +#if defined(OPENSSL_APPLE) && defined(OPENSSL_ARM) +// We do not detect any features at runtime for Apple's 32-bit ARM platforms. On +// 64-bit ARM, we detect some post-ARMv8.0 features. 
#define OPENSSL_STATIC_ARMCAP #endif diff --git a/util/BUILD.toplevel b/util/BUILD.toplevel index 65e0cdc2e..462a24f68 100644 --- a/util/BUILD.toplevel +++ b/util/BUILD.toplevel @@ -89,9 +89,6 @@ posix_copts = [ # ensure that binaries can be built with non-executable stack. "-Wa,--noexecstack", - # This is needed on Linux systems (at least) to get rwlock in pthread. - "-D_XOPEN_SOURCE=700", - # This list of warnings should match those in the top-level CMakeLists.txt. "-Wall", "-Werror", @@ -108,10 +105,17 @@ posix_copts = [ # "-DOPENSSL_C11_ATOMIC", ] +linux_copts = posix_copts + [ + # This is needed on Linux systems (at least) to get rwlock in pthread, but + # it should not be set on Apple platforms, where it instead disables APIs + # we use. See compat(5) and sys/cdefs.h. + "-D_XOPEN_SOURCE=700", +] + boringssl_copts = select({ - ":linux_aarch64": posix_copts, - ":linux_ppc64le": posix_copts, - ":linux_x86_64": posix_copts, + ":linux_aarch64": linux_copts, + ":linux_ppc64le": linux_copts, + ":linux_x86_64": linux_copts, ":mac_x86_64": posix_copts, ":windows_x86_64": [ "-DWIN32_LEAN_AND_MEAN",