|
|
|
/* Copyright (c) 2018, Google Inc.
|
|
|
|
* Copyright (c) 2020, Arm Ltd.
|
|
|
|
*
|
|
|
|
* Permission to use, copy, modify, and/or distribute this software for any
|
|
|
|
* purpose with or without fee is hereby granted, provided that the above
|
|
|
|
* copyright notice and this permission notice appear in all copies.
|
|
|
|
*
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
|
|
|
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
|
|
|
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
|
|
|
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
|
|
|
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
|
|
|
|
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
|
|
|
|
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
|
|
|
|
|
|
|
|
#include "internal.h"
|
|
|
|
|
|
|
|
#if defined(OPENSSL_AARCH64) && defined(OPENSSL_WINDOWS) && \
|
|
|
|
!defined(OPENSSL_STATIC_ARMCAP)
|
|
|
|
|
|
|
|
#include <windows.h>
|
|
|
|
|
|
|
|
#include <openssl/arm_arch.h>
|
|
|
|
|
|
|
|
extern uint32_t OPENSSL_armcap_P;
|
|
|
|
void OPENSSL_cpuid_setup(void) {
|
|
|
|
// We do not need to check for the presence of NEON, as Armv8-A always has it
|
|
|
|
OPENSSL_armcap_P |= ARMV7_NEON;
|
|
|
|
|
|
|
|
if (IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)) {
|
|
|
|
// These are all covered by one call in Windows
|
|
|
|
OPENSSL_armcap_P |= ARMV8_AES;
|
|
|
|
OPENSSL_armcap_P |= ARMV8_PMULL;
|
|
|
|
OPENSSL_armcap_P |= ARMV8_SHA1;
|
|
|
|
OPENSSL_armcap_P |= ARMV8_SHA256;
|
|
|
|
}
|
Enable SHA-512 ARM acceleration when available.
This imports the changes to sha512-armv8.pl from
upstream's af0fcf7b4668218b24d9250b95e0b96939ccb4d1.
Tweaks needed:
- Add an explicit .text because we put .LK$BITS in .rodata for XOM
- .LK$bits and code are in separate sections, so use adrp/add instead of
plain adr
- Where glibc needs feature flags to *enable* pthread_rwlock, Apple
interprets _XOPEN_SOURCE as a request to *disable* Apple extensions.
Tighten the condition on the _XOPEN_SOURCE check.
Added support for macOS and Linux, tested manually on an ARM Mac and a
VM, respectively. Fuchsia and Windows do not currently have APIs to
expose this bit, so I've left in TODOs. Benchmarks from an Apple M1 Max:
Before:
Did 4647000 SHA-512 (16 bytes) operations in 1000103us (74.3 MB/sec)
Did 1614000 SHA-512 (256 bytes) operations in 1000379us (413.0 MB/sec)
Did 439000 SHA-512 (1350 bytes) operations in 1001694us (591.6 MB/sec)
Did 76000 SHA-512 (8192 bytes) operations in 1011821us (615.3 MB/sec)
Did 39000 SHA-512 (16384 bytes) operations in 1024311us (623.8 MB/sec)
After:
Did 10369000 SHA-512 (16 bytes) operations in 1000088us (165.9 MB/sec) [+123.1%]
Did 3650000 SHA-512 (256 bytes) operations in 1000079us (934.3 MB/sec) [+126.2%]
Did 1029000 SHA-512 (1350 bytes) operations in 1000521us (1388.4 MB/sec) [+134.7%]
Did 175000 SHA-512 (8192 bytes) operations in 1001874us (1430.9 MB/sec) [+132.5%]
Did 89000 SHA-512 (16384 bytes) operations in 1010314us (1443.3 MB/sec) [+131.4%]
(This doesn't seem to change the overall SHA-256 vs SHA-512 performance
question on ARM, when hashing perf matters. SHA-256 on the same chip
gets up to 2454.6 MB/s.)
In terms of build coverage, for now, we'll have build coverage
everywhere and test coverage on Chromium, which runs this code on macOS
CI. We should request a macOS ARM64 bot for our standalone CI. Longer
term, we need a QEMU-based builder to test various features. QEMU seems
to have pretty good coverage of all this, which will at least give us
Linux.
I haven't added an OPENSSL_STATIC_ARMCAP_SHA512 for now. Instead, we
just look at the standard __ARM_FEATURE_SHA512 define. Strangely, the
corresponding -march tag is not sha512. Neither GCC and nor Clang have
-march=armv8-a+sha512. Instead, -march=armv8-a+sha3 implies both
__ARM_FEATURE_SHA3 and __ARM_FEATURE_SHA512! Yet everything else seems
to describe the SHA512 extension as separate from SHA3.
https://developer.arm.com/architectures/system-architectures/software-standards/acle
Update-Note: Consumers with a different build setup may need to
limit -D_XOPEN_SOURCE=700 to Linux or non-Apple platforms. Otherwise,
<sys/types.h> won't define some typedef needed by <sys/sysctl.h>. If you
see a build error about u_char, etc., being undefined in some system
header, that is probably the cause.
Change-Id: Ia213d3796b84c71b7966bb68e0aec92e5d7d26f0
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/50807
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
3 years ago
|
|
|
// As of writing, Windows does not have a |PF_*| value for ARMv8.2 SHA-512
|
|
|
|
// extensions. When it does, add it here.
|
|
|
|
}
|
|
|
|
|
Enable SHA-512 ARM acceleration when available.
This imports the changes to sha512-armv8.pl from
upstream's af0fcf7b4668218b24d9250b95e0b96939ccb4d1.
Tweaks needed:
- Add an explicit .text because we put .LK$BITS in .rodata for XOM
- .LK$bits and code are in separate sections, so use adrp/add instead of
plain adr
- Where glibc needs feature flags to *enable* pthread_rwlock, Apple
interprets _XOPEN_SOURCE as a request to *disable* Apple extensions.
Tighten the condition on the _XOPEN_SOURCE check.
Added support for macOS and Linux, tested manually on an ARM Mac and a
VM, respectively. Fuchsia and Windows do not currently have APIs to
expose this bit, so I've left in TODOs. Benchmarks from an Apple M1 Max:
Before:
Did 4647000 SHA-512 (16 bytes) operations in 1000103us (74.3 MB/sec)
Did 1614000 SHA-512 (256 bytes) operations in 1000379us (413.0 MB/sec)
Did 439000 SHA-512 (1350 bytes) operations in 1001694us (591.6 MB/sec)
Did 76000 SHA-512 (8192 bytes) operations in 1011821us (615.3 MB/sec)
Did 39000 SHA-512 (16384 bytes) operations in 1024311us (623.8 MB/sec)
After:
Did 10369000 SHA-512 (16 bytes) operations in 1000088us (165.9 MB/sec) [+123.1%]
Did 3650000 SHA-512 (256 bytes) operations in 1000079us (934.3 MB/sec) [+126.2%]
Did 1029000 SHA-512 (1350 bytes) operations in 1000521us (1388.4 MB/sec) [+134.7%]
Did 175000 SHA-512 (8192 bytes) operations in 1001874us (1430.9 MB/sec) [+132.5%]
Did 89000 SHA-512 (16384 bytes) operations in 1010314us (1443.3 MB/sec) [+131.4%]
(This doesn't seem to change the overall SHA-256 vs SHA-512 performance
question on ARM, when hashing perf matters. SHA-256 on the same chip
gets up to 2454.6 MB/s.)
In terms of build coverage, for now, we'll have build coverage
everywhere and test coverage on Chromium, which runs this code on macOS
CI. We should request a macOS ARM64 bot for our standalone CI. Longer
term, we need a QEMU-based builder to test various features. QEMU seems
to have pretty good coverage of all this, which will at least give us
Linux.
I haven't added an OPENSSL_STATIC_ARMCAP_SHA512 for now. Instead, we
just look at the standard __ARM_FEATURE_SHA512 define. Strangely, the
corresponding -march tag is not sha512. Neither GCC and nor Clang have
-march=armv8-a+sha512. Instead, -march=armv8-a+sha3 implies both
__ARM_FEATURE_SHA3 and __ARM_FEATURE_SHA512! Yet everything else seems
to describe the SHA512 extension as separate from SHA3.
https://developer.arm.com/architectures/system-architectures/software-standards/acle
Update-Note: Consumers with a different build setup may need to
limit -D_XOPEN_SOURCE=700 to Linux or non-Apple platforms. Otherwise,
<sys/types.h> won't define some typedef needed by <sys/sysctl.h>. If you
see a build error about u_char, etc., being undefined in some system
header, that is probably the cause.
Change-Id: Ia213d3796b84c71b7966bb68e0aec92e5d7d26f0
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/50807
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
3 years ago
|
|
|
#endif // OPENSSL_AARCH64 && OPENSSL_WINDOWS && !OPENSSL_STATIC_ARMCAP
|