Add table-independent x86+adx asm for P-256

With -march=haswell -DOPENSSL_SMALL=1 on cascadelake:
Did 9999 ECDH P-256 operations in 1062469us (9411.1 ops/sec) [+63.5%]
Did 25000 ECDSA P-256 signing operations in 1028302us (24311.9 ops/sec) [+48.9%]
Did 11004 ECDSA P-256 verify operations in 1072646us (10258.7 ops/sec) [+58.8%]

Same configuration measured no performance difference on haswell.

The added assembly code occupies 1352 bytes.

Change-Id: I42635b7a9bf24d942817976a5d4ce269f642251c
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/63185
Reviewed-by: David Benjamin <davidben@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
chromium-stable
Andres Erbsen 1 year ago committed by Boringssl LUCI CQ
parent 81ed2b3f6a
commit 20c9406971
  1. 2
      crypto/CMakeLists.txt
  2. 47
      crypto/fipsmodule/ec/p256_test.cc
  3. 1
      sources.cmake
  4. 178
      third_party/fiat/asm/fiat_p256_adx_mul.S
  5. 167
      third_party/fiat/asm/fiat_p256_adx_sqr.S
  6. 20
      third_party/fiat/p256_64.h

@ -18,6 +18,8 @@ set(
poly1305/poly1305_arm_asm.S
../third_party/fiat/asm/fiat_curve25519_adx_mul.S
../third_party/fiat/asm/fiat_curve25519_adx_square.S
../third_party/fiat/asm/fiat_p256_adx_mul.S
../third_party/fiat/asm/fiat_p256_adx_sqr.S
)
perlasm(CRYPTO_SOURCES aarch64 chacha/chacha-armv8 chacha/asm/chacha-armv8.pl)
perlasm(CRYPTO_SOURCES aarch64 cipher_extra/chacha20_poly1305_armv8 cipher_extra/asm/chacha20_poly1305_armv8.pl)

@ -0,0 +1,47 @@
/* Copyright (c) 2023, Google Inc.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
#include <gtest/gtest.h>
#include "../../internal.h"
#include "../../test/abi_test.h"
#if !defined(OPENSSL_NO_ASM) && defined(__GNUC__) && defined(__x86_64__) && \
defined(SUPPORTS_ABI_TEST)
extern "C" {
#include "../../../third_party/fiat/p256_64.h"
}
TEST(P256Test, AdxMulABI) {
static const uint64_t in1[4] = {0}, in2[4] = {0};
uint64_t out[4];
if (CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable() &&
CRYPTO_is_ADX_capable()) {
CHECK_ABI(fiat_p256_adx_mul, out, in1, in2);
} else {
GTEST_SKIP() << "Can't test ABI of ADX code without ADX";
}
}
#include <assert.h>
TEST(P256Test, AdxSquareABI) {
static const uint64_t in[4] = {0};
uint64_t out[4];
if (CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable() &&
CRYPTO_is_ADX_capable()) {
CHECK_ABI(fiat_p256_adx_sqr, out, in);
} else {
GTEST_SKIP() << "Can't test ABI of ADX code without ADX";
}
}
#endif

@ -38,6 +38,7 @@ set(
crypto/fipsmodule/cmac/cmac_test.cc
crypto/fipsmodule/ec/ec_test.cc
crypto/fipsmodule/ec/p256-nistz_test.cc
crypto/fipsmodule/ec/p256_test.cc
crypto/fipsmodule/ecdsa/ecdsa_test.cc
crypto/fipsmodule/hkdf/hkdf_test.cc
crypto/fipsmodule/md5/md5_test.cc

@ -0,0 +1,178 @@
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
(defined(__APPLE__) || defined(__ELF__))
.intel_syntax noprefix
.text
#if defined(__APPLE__)
.private_extern _fiat_p256_adx_mul
.global _fiat_p256_adx_mul
_fiat_p256_adx_mul:
#else
.type fiat_p256_adx_mul, @function
.hidden fiat_p256_adx_mul
.global fiat_p256_adx_mul
fiat_p256_adx_mul:
#endif
.cfi_startproc
_CET_ENDBR
push rbp
.cfi_adjust_cfa_offset 8
.cfi_offset rbp, -16
mov rbp, rsp
mov rax, rdx
mov rdx, [ rsi + 0x0 ]
test al, al
mulx r8, rcx, [ rax + 0x0 ]
mov [ rsp - 0x80 ], rbx
.cfi_offset rbx, -16-0x80
mulx rbx, r9, [ rax + 0x8 ]
mov [ rsp - 0x68 ], r14
.cfi_offset r14, -16-0x68
adc r9, r8
mov [ rsp - 0x60 ], r15
.cfi_offset r15, -16-0x60
mulx r15, r14, [ rax + 0x10 ]
mov [ rsp - 0x78 ], r12
.cfi_offset r12, -16-0x78
adc r14, rbx
mulx r11, r10, [ rax + 0x18 ]
mov [ rsp - 0x70 ], r13
.cfi_offset r13, -16-0x70
adc r10, r15
mov rdx, [ rsi + 0x8 ]
mulx rbx, r8, [ rax + 0x0 ]
adc r11, 0x0
xor r15, r15
adcx r8, r9
adox rbx, r14
mov [ rsp - 0x58 ], rdi
mulx rdi, r9, [ rax + 0x8 ]
adcx r9, rbx
adox rdi, r10
mulx rbx, r14, [ rax + 0x10 ]
adcx r14, rdi
adox rbx, r11
mulx r13, r12, [ rax + 0x18 ]
adcx r12, rbx
mov rdx, 0x100000000
mulx r11, r10, rcx
adox r13, r15
adcx r13, r15
xor rdi, rdi
adox r10, r8
mulx r8, rbx, r10
adox r11, r9
adcx rbx, r11
adox r8, r14
mov rdx, 0xffffffff00000001
mulx r9, r15, rcx
adcx r15, r8
adox r9, r12
mulx r14, rcx, r10
mov rdx, [ rsi + 0x10 ]
mulx r10, r12, [ rax + 0x8 ]
adcx rcx, r9
adox r14, r13
mulx r11, r13, [ rax + 0x0 ]
mov r9, rdi
adcx r14, r9
adox rdi, rdi
adc rdi, 0x0
xor r9, r9
adcx r13, rbx
adox r11, r15
mov rdx, [ rsi + 0x10 ]
mulx r15, r8, [ rax + 0x10 ]
adox r10, rcx
mulx rcx, rbx, [ rax + 0x18 ]
mov rdx, [ rsi + 0x18 ]
adcx r12, r11
mulx rsi, r11, [ rax + 0x8 ]
adcx r8, r10
adox r15, r14
adcx rbx, r15
adox rcx, r9
adcx rcx, r9
mulx r15, r10, [ rax + 0x0 ]
add rcx, rdi
mov r14, r9
adc r14, 0
xor r9, r9
adcx r10, r12
adox r15, r8
adcx r11, r15
adox rsi, rbx
mulx r8, r12, [ rax + 0x10 ]
adox r8, rcx
mulx rcx, rbx, [ rax + 0x18 ]
adcx r12, rsi
adox rcx, r9
mov rdx, 0x100000000
adcx rbx, r8
adc rcx, 0
mulx rdi, r15, r13
xor rax, rax
adcx rcx, r14
adc rax, 0
xor r9, r9
adox r15, r10
mulx r14, r10, r15
adox rdi, r11
mov rdx, 0xffffffff00000001
adox r14, r12
adcx r10, rdi
mulx r12, r11, r13
adcx r11, r14
adox r12, rbx
mulx rbx, r13, r15
adcx r13, r12
adox rbx, rcx
mov r8, r9
adox rax, r9
adcx r8, rbx
adc rax, 0x0
mov rcx, rax
mov r15, 0xffffffffffffffff
mov rdi, r10
sub rdi, r15
mov r14, 0xffffffff
mov r12, r11
sbb r12, r14
mov rbx, r13
sbb rbx, r9
mov rax, rax
mov rax, r8
sbb rax, rdx
sbb rcx, r9
cmovc rdi, r10
mov r10, [ rsp - 0x58 ]
cmovc rbx, r13
mov r13, [ rsp - 0x70 ]
.cfi_restore r13
cmovc r12, r11
cmovc rax, r8
mov [ r10 + 0x10 ], rbx
mov rbx, [ rsp - 0x80 ]
.cfi_restore rbx
mov [ r10 + 0x0 ], rdi
mov [ r10 + 0x8 ], r12
mov [ r10 + 0x18 ], rax
mov r12, [ rsp - 0x78 ]
.cfi_restore r12
mov r14, [ rsp - 0x68 ]
.cfi_restore r14
mov r15, [ rsp - 0x60 ]
.cfi_restore r15
pop rbp
.cfi_restore rbp
.cfi_adjust_cfa_offset -8
ret
.cfi_endproc
#if defined(__ELF__)
.size fiat_p256_adx_mul, .-fiat_p256_adx_mul
#endif
#endif

@ -0,0 +1,167 @@
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
(defined(__APPLE__) || defined(__ELF__))
.intel_syntax noprefix
.text
#if defined(__APPLE__)
.private_extern _fiat_p256_adx_sqr
.global _fiat_p256_adx_sqr
_fiat_p256_adx_sqr:
#else
.type fiat_p256_adx_sqr, @function
.hidden fiat_p256_adx_sqr
.global fiat_p256_adx_sqr
fiat_p256_adx_sqr:
#endif
.cfi_startproc
_CET_ENDBR
push rbp
.cfi_adjust_cfa_offset 8
.cfi_offset rbp, -16
mov rbp, rsp
mov rdx, [ rsi + 0x0 ]
mulx r10, rax, [ rsi + 0x18 ]
mulx rcx, r11, rdx
mulx r9, r8, [ rsi + 0x8 ]
mov [ rsp - 0x80 ], rbx
.cfi_offset rbx, -16-0x80
xor rbx, rbx
adox r8, r8
mov [ rsp - 0x78 ], r12
.cfi_offset r12, -16-0x78
mulx r12, rbx, [ rsi + 0x10 ]
mov rdx, [ rsi + 0x8 ]
mov [ rsp - 0x70 ], r13
.cfi_offset r13, -16-0x70
mov [ rsp - 0x68 ], r14
.cfi_offset r14, -16-0x68
mulx r14, r13, rdx
mov [ rsp - 0x60 ], r15
.cfi_offset r15, -16-0x60
mov [ rsp - 0x58 ], rdi
mulx rdi, r15, [ rsi + 0x10 ]
adcx r12, r15
mov [ rsp - 0x50 ], r11
mulx r11, r15, [ rsi + 0x18 ]
adcx r10, rdi
mov rdi, 0x0
adcx r11, rdi
clc
adcx rbx, r9
adox rbx, rbx
adcx rax, r12
adox rax, rax
adcx r15, r10
adox r15, r15
mov rdx, [ rsi + 0x10 ]
mulx r12, r9, [ rsi + 0x18 ]
adcx r9, r11
adcx r12, rdi
mulx r11, r10, rdx
clc
adcx rcx, r8
adcx r13, rbx
adcx r14, rax
adox r9, r9
adcx r10, r15
mov rdx, [ rsi + 0x18 ]
mulx rbx, r8, rdx
adox r12, r12
adcx r11, r9
mov rsi, [ rsp - 0x50 ]
adcx r8, r12
mov rax, 0x100000000
mov rdx, rax
mulx r15, rax, rsi
adcx rbx, rdi
adox rbx, rdi
xor r9, r9
adox rax, rcx
adox r15, r13
mulx rcx, rdi, rax
adcx rdi, r15
adox rcx, r14
mov rdx, 0xffffffff00000001
mulx r14, r13, rsi
adox r14, r10
adcx r13, rcx
mulx r12, r10, rax
adox r12, r11
mov r11, r9
adox r11, r8
adcx r10, r14
mov r8, r9
adcx r8, r12
mov rax, r9
adcx rax, r11
mov r15, r9
adox r15, rbx
mov rdx, 0x100000000
mulx rcx, rbx, rdi
mov r14, r9
adcx r14, r15
mov r12, r9
adox r12, r12
adcx r12, r9
adox rbx, r13
mulx r11, r13, rbx
mov r15, 0xffffffff00000001
mov rdx, r15
mulx rsi, r15, rbx
adox rcx, r10
adox r11, r8
mulx r8, r10, rdi
adcx r13, rcx
adox r8, rax
adcx r10, r11
adox rsi, r14
mov rdi, r12
mov rax, r9
adox rdi, rax
adcx r15, r8
mov r14, rax
adcx r14, rsi
adcx rdi, r9
dec r9
mov rbx, r13
sub rbx, r9
mov rcx, 0xffffffff
mov r11, r10
sbb r11, rcx
mov r8, r15
sbb r8, rax
mov rsi, r14
sbb rsi, rdx
sbb rdi, rax
cmovc rbx, r13
cmovc r8, r15
cmovc r11, r10
cmovc rsi, r14
mov rdi, [ rsp - 0x58 ]
mov [ rdi + 0x18 ], rsi
mov [ rdi + 0x0 ], rbx
mov [ rdi + 0x8 ], r11
mov [ rdi + 0x10 ], r8
mov rbx, [ rsp - 0x80 ]
.cfi_restore rbx
mov r12, [ rsp - 0x78 ]
.cfi_restore r12
mov r13, [ rsp - 0x70 ]
.cfi_restore r13
mov r14, [ rsp - 0x68 ]
.cfi_restore r14
mov r15, [ rsp - 0x60 ]
.cfi_restore r15
pop rbp
.cfi_restore rbp
.cfi_adjust_cfa_offset -8
ret
.cfi_endproc
#if defined(__ELF__)
.size fiat_p256_adx_sqr, .-fiat_p256_adx_sqr
#endif
#endif

@ -1,3 +1,9 @@
#include "../../crypto/internal.h"
#if !defined(OPENSSL_NO_ASM) && defined(__GNUC__) && defined(__x86_64__)
void fiat_p256_adx_mul(uint64_t*, const uint64_t*, const uint64_t*);
void fiat_p256_adx_sqr(uint64_t*, const uint64_t*);
#endif
/* Autogenerated: 'src/ExtractionOCaml/word_by_word_montgomery' --inline --static --use-value-barrier p256 64 '2^256 - 2^224 + 2^192 + 2^96 - 1' mul square add sub opp from_montgomery to_montgomery nonzero selectznz to_bytes from_bytes one msat divstep divstep_precomp */
/* curve description: p256 */
/* machine_wordsize = 64 (from "64") */
@ -165,6 +171,13 @@ static FIAT_P256_FIAT_INLINE void fiat_p256_cmovznz_u64(uint64_t* out1, fiat_p25
*
*/
static FIAT_P256_FIAT_INLINE void fiat_p256_mul(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) {
#if !defined(OPENSSL_NO_ASM) && defined(__GNUC__) && defined(__x86_64__)
if (CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable() &&
CRYPTO_is_ADX_capable()) {
fiat_p256_adx_mul(out1, arg1, arg2);
return;
}
#endif
uint64_t x1;
uint64_t x2;
uint64_t x3;
@ -472,6 +485,13 @@ static FIAT_P256_FIAT_INLINE void fiat_p256_mul(fiat_p256_montgomery_domain_fiel
*
*/
static FIAT_P256_FIAT_INLINE void fiat_p256_square(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) {
#if !defined(OPENSSL_NO_ASM) && defined(__GNUC__) && defined(__x86_64__)
if (CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable() &&
CRYPTO_is_ADX_capable()) {
fiat_p256_adx_sqr(out1, arg1);
return;
}
#endif
uint64_t x1;
uint64_t x2;
uint64_t x3;

Loading…
Cancel
Save