With -march=haswell -DOPENSSL_SMALL=1 on cascadelake:

Did 9999 ECDH P-256 operations in 1062469us (9411.1 ops/sec) [+63.5%]
Did 25000 ECDSA P-256 signing operations in 1028302us (24311.9 ops/sec) [+48.9%]
Did 11004 ECDSA P-256 verify operations in 1072646us (10258.7 ops/sec) [+58.8%]

The same configuration measured no performance difference on haswell.
The added assembly code occupies 1352 bytes.

Change-Id: I42635b7a9bf24d942817976a5d4ce269f642251c
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/63185
Reviewed-by: David Benjamin <davidben@google.com>
Commit-Queue: David Benjamin <davidben@google.com>

chromium-stable
parent
81ed2b3f6a
commit
20c9406971
6 changed files with 415 additions and 0 deletions
@ -0,0 +1,47 @@ |
||||
/* Copyright (c) 2023, Google Inc.
|
||||
* |
||||
* Permission to use, copy, modify, and/or distribute this software for any |
||||
* purpose with or without fee is hereby granted, provided that the above |
||||
* copyright notice and this permission notice appear in all copies. |
||||
* |
||||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
||||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
||||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY |
||||
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION |
||||
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
||||
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ |
||||
|
||||
#include <gtest/gtest.h> |
||||
#include "../../internal.h" |
||||
#include "../../test/abi_test.h" |
||||
|
||||
#if !defined(OPENSSL_NO_ASM) && defined(__GNUC__) && defined(__x86_64__) && \ |
||||
defined(SUPPORTS_ABI_TEST) |
||||
extern "C" { |
||||
#include "../../../third_party/fiat/p256_64.h" |
||||
} |
||||
|
||||
// Runs fiat_p256_adx_mul under CHECK_ABI. The call is only made when the CPU
// reports BMI1, BMI2 and ADX support; otherwise the test is skipped.
TEST(P256Test, AdxMulABI) {
  const bool cpu_has_required_features = CRYPTO_is_BMI1_capable() &&
                                         CRYPTO_is_BMI2_capable() &&
                                         CRYPTO_is_ADX_capable();
  if (!cpu_has_required_features) {
    GTEST_SKIP() << "Can't test ABI of ADX code without ADX";
  }

  // All-zero operands: this test does not inspect the product, it only drives
  // the function through the ABI checker.
  static const uint64_t lhs[4] = {0}, rhs[4] = {0};
  uint64_t product[4];
  CHECK_ABI(fiat_p256_adx_mul, product, lhs, rhs);
}
||||
|
||||
#include <assert.h> |
||||
// Runs fiat_p256_adx_sqr under CHECK_ABI. The call is only made when the CPU
// reports BMI1, BMI2 and ADX support; otherwise the test is skipped.
TEST(P256Test, AdxSquareABI) {
  const bool cpu_has_required_features = CRYPTO_is_BMI1_capable() &&
                                         CRYPTO_is_BMI2_capable() &&
                                         CRYPTO_is_ADX_capable();
  if (!cpu_has_required_features) {
    GTEST_SKIP() << "Can't test ABI of ADX code without ADX";
  }

  // An all-zero operand: this test does not inspect the square, it only
  // drives the function through the ABI checker.
  static const uint64_t operand[4] = {0};
  uint64_t square[4];
  CHECK_ABI(fiat_p256_adx_sqr, square, operand);
}
||||
#endif |
@ -0,0 +1,178 @@ |
||||
#include <openssl/asm_base.h> |
||||
|
||||
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \ |
||||
(defined(__APPLE__) || defined(__ELF__)) |
||||
|
||||
.intel_syntax noprefix
|
||||
.text |
||||
// fiat_p256_adx_mul(out, in1, in2)
//
// Syntax: GAS, .intel_syntax noprefix. Built only for Mach-O and ELF x86-64
// targets, i.e. the System V calling convention:
//   rdi = out  (four 64-bit limbs are stored at [out + 0x0 .. 0x18])
//   rsi = in1  (four 64-bit limbs are loaded from [in1 + 0x0 .. 0x18])
//   rdx = in2  (four 64-bit limbs are loaded from [in2 + 0x0 .. 0x18])
// Uses MULX together with the ADCX (CF) and ADOX (OF) carry chains.
// Leaf function: rsp is never adjusted after the push; rbx, r12-r15 and the
// out pointer are spilled to [rsp - 0x80 .. rsp - 0x58] and restored before
// returning.
// NOTE(review): the 0x100000000 / 0xffffffff00000001 multiplications look
// like Montgomery reduction steps for the P-256 prime - confirm against the
// fiat-crypto source this was generated from.
#if defined(__APPLE__) |
||||
.private_extern _fiat_p256_adx_mul
|
||||
.global _fiat_p256_adx_mul
|
||||
_fiat_p256_adx_mul: |
||||
#else |
||||
.type fiat_p256_adx_mul, @function
|
||||
.hidden fiat_p256_adx_mul
|
||||
.global fiat_p256_adx_mul
|
||||
fiat_p256_adx_mul: |
||||
#endif |
||||
|
||||
.cfi_startproc |
||||
_CET_ENDBR |
||||
// Establish a frame pointer; the CFI directives track the pushed rbp.
push rbp |
||||
.cfi_adjust_cfa_offset 8
|
||||
.cfi_offset rbp, -16 |
||||
mov rbp, rsp |
||||
// MULX takes one multiplicand implicitly from rdx, so the in2 pointer is
// moved to rax and rdx is loaded with in1[0].
mov rax, rdx |
||||
mov rdx, [ rsi + 0x0 ] |
||||
// TEST clears CF, so the ADC chain below starts without a carry-in.
test al, al |
||||
mulx r8, rcx, [ rax + 0x0 ] |
||||
// Callee-saved registers are spilled below rsp, interleaved with the
// in1[0] * in2[0..3] products.
mov [ rsp - 0x80 ], rbx |
||||
.cfi_offset rbx, -16-0x80 |
||||
mulx rbx, r9, [ rax + 0x8 ] |
||||
mov [ rsp - 0x68 ], r14 |
||||
.cfi_offset r14, -16-0x68 |
||||
adc r9, r8 |
||||
mov [ rsp - 0x60 ], r15 |
||||
.cfi_offset r15, -16-0x60 |
||||
mulx r15, r14, [ rax + 0x10 ] |
||||
mov [ rsp - 0x78 ], r12 |
||||
.cfi_offset r12, -16-0x78 |
||||
adc r14, rbx |
||||
mulx r11, r10, [ rax + 0x18 ] |
||||
mov [ rsp - 0x70 ], r13 |
||||
.cfi_offset r13, -16-0x70 |
||||
adc r10, r15 |
||||
// rdx = in1[1]; accumulate in1[1] * in2[0..3] using both carry chains.
mov rdx, [ rsi + 0x8 ] |
||||
mulx rbx, r8, [ rax + 0x0 ] |
||||
adc r11, 0x0 |
||||
// XOR zeroes r15 and clears both CF and OF for the ADCX/ADOX chains.
xor r15, r15 |
||||
adcx r8, r9 |
||||
adox rbx, r14 |
||||
// Stash the out pointer so rdi can be used as a scratch register.
mov [ rsp - 0x58 ], rdi |
||||
mulx rdi, r9, [ rax + 0x8 ] |
||||
adcx r9, rbx |
||||
adox rdi, r10 |
||||
mulx rbx, r14, [ rax + 0x10 ] |
||||
adcx r14, rdi |
||||
adox rbx, r11 |
||||
mulx r13, r12, [ rax + 0x18 ] |
||||
adcx r12, rbx |
||||
// Multiply accumulated low limbs by the constants 0x100000000 and
// 0xffffffff00000001 and fold the products back into the accumulator.
mov rdx, 0x100000000 |
||||
mulx r11, r10, rcx |
||||
adox r13, r15 |
||||
adcx r13, r15 |
||||
xor rdi, rdi |
||||
adox r10, r8 |
||||
mulx r8, rbx, r10 |
||||
adox r11, r9 |
||||
adcx rbx, r11 |
||||
adox r8, r14 |
||||
mov rdx, 0xffffffff00000001 |
||||
mulx r9, r15, rcx |
||||
adcx r15, r8 |
||||
adox r9, r12 |
||||
mulx r14, rcx, r10 |
||||
// rdx = in1[2]; start the in1[2] * in2[0..3] products.
mov rdx, [ rsi + 0x10 ] |
||||
mulx r10, r12, [ rax + 0x8 ] |
||||
adcx rcx, r9 |
||||
adox r14, r13 |
||||
mulx r11, r13, [ rax + 0x0 ] |
||||
mov r9, rdi |
||||
adcx r14, r9 |
||||
adox rdi, rdi |
||||
adc rdi, 0x0 |
||||
xor r9, r9 |
||||
adcx r13, rbx |
||||
adox r11, r15 |
||||
mov rdx, [ rsi + 0x10 ] |
||||
mulx r15, r8, [ rax + 0x10 ] |
||||
adox r10, rcx |
||||
mulx rcx, rbx, [ rax + 0x18 ] |
||||
// rdx = in1[3]. rsi is overwritten by the next MULX; in1 is not read again.
mov rdx, [ rsi + 0x18 ] |
||||
adcx r12, r11 |
||||
mulx rsi, r11, [ rax + 0x8 ] |
||||
adcx r8, r10 |
||||
adox r15, r14 |
||||
adcx rbx, r15 |
||||
adox rcx, r9 |
||||
adcx rcx, r9 |
||||
mulx r15, r10, [ rax + 0x0 ] |
||||
add rcx, rdi |
||||
mov r14, r9 |
||||
adc r14, 0 |
||||
xor r9, r9 |
||||
adcx r10, r12 |
||||
adox r15, r8 |
||||
adcx r11, r15 |
||||
adox rsi, rbx |
||||
mulx r8, r12, [ rax + 0x10 ] |
||||
adox r8, rcx |
||||
mulx rcx, rbx, [ rax + 0x18 ] |
||||
adcx r12, rsi |
||||
adox rcx, r9 |
||||
// Second round of multiplications by the two reduction constants.
mov rdx, 0x100000000 |
||||
adcx rbx, r8 |
||||
adc rcx, 0 |
||||
mulx rdi, r15, r13 |
||||
// The in2 pointer in rax is no longer needed; rax becomes a carry word.
xor rax, rax |
||||
adcx rcx, r14 |
||||
adc rax, 0 |
||||
xor r9, r9 |
||||
adox r15, r10 |
||||
mulx r14, r10, r15 |
||||
adox rdi, r11 |
||||
mov rdx, 0xffffffff00000001 |
||||
adox r14, r12 |
||||
adcx r10, rdi |
||||
mulx r12, r11, r13 |
||||
adcx r11, r14 |
||||
adox r12, rbx |
||||
mulx rbx, r13, r15 |
||||
adcx r13, r12 |
||||
adox rbx, rcx |
||||
mov r8, r9 |
||||
adox rax, r9 |
||||
adcx r8, rbx |
||||
adc rax, 0x0 |
||||
// Final conditional subtraction. The limbs r10, r11, r13, r8 plus the carry
// word (copied to rcx) have 0xffffffffffffffff, 0xffffffff, 0 (r9) and
// 0xffffffff00000001 (still in rdx) subtracted, with the borrow propagated
// into the carry word. These constants are the limbs of the P-256 prime.
mov rcx, rax |
||||
mov r15, 0xffffffffffffffff |
||||
mov rdi, r10 |
||||
sub rdi, r15 |
||||
mov r14, 0xffffffff |
||||
mov r12, r11 |
||||
sbb r12, r14 |
||||
mov rbx, r13 |
||||
sbb rbx, r9 |
||||
// Register moved onto itself: no effect on state.
mov rax, rax |
||||
mov rax, r8 |
||||
sbb rax, rdx |
||||
sbb rcx, r9 |
||||
// If the subtraction borrowed (CF set), keep the unsubtracted limbs. The
// interleaved MOV loads do not modify flags, so CF stays valid for every
// CMOVC.
cmovc rdi, r10 |
||||
// r10 = saved out pointer.
mov r10, [ rsp - 0x58 ] |
||||
cmovc rbx, r13 |
||||
mov r13, [ rsp - 0x70 ] |
||||
.cfi_restore r13
|
||||
cmovc r12, r11 |
||||
cmovc rax, r8 |
||||
// Store the four result limbs and restore the remaining callee-saved
// registers.
mov [ r10 + 0x10 ], rbx |
||||
mov rbx, [ rsp - 0x80 ] |
||||
.cfi_restore rbx
|
||||
mov [ r10 + 0x0 ], rdi |
||||
mov [ r10 + 0x8 ], r12 |
||||
mov [ r10 + 0x18 ], rax |
||||
mov r12, [ rsp - 0x78 ] |
||||
.cfi_restore r12
|
||||
mov r14, [ rsp - 0x68 ] |
||||
.cfi_restore r14
|
||||
mov r15, [ rsp - 0x60 ] |
||||
.cfi_restore r15
|
||||
pop rbp |
||||
.cfi_restore rbp
|
||||
.cfi_adjust_cfa_offset -8 |
||||
ret |
||||
.cfi_endproc |
||||
#if defined(__ELF__) |
||||
.size fiat_p256_adx_mul, .-fiat_p256_adx_mul |
||||
#endif |
||||
|
||||
#endif |
@ -0,0 +1,167 @@ |
||||
#include <openssl/asm_base.h> |
||||
|
||||
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \ |
||||
(defined(__APPLE__) || defined(__ELF__)) |
||||
|
||||
.intel_syntax noprefix
|
||||
.text |
||||
// fiat_p256_adx_sqr(out, in)
//
// Syntax: GAS, .intel_syntax noprefix. Built only for Mach-O and ELF x86-64
// targets, i.e. the System V calling convention:
//   rdi = out  (four 64-bit limbs are stored at [out + 0x0 .. 0x18])
//   rsi = in   (four 64-bit limbs are loaded from [in + 0x0 .. 0x18])
// Uses MULX together with the ADCX (CF) and ADOX (OF) carry chains. The
// cross products in[i] * in[j] are doubled with "adox x, x" and the squares
// in[i] * in[i] are added on top.
// Leaf function: rsp is never adjusted after the push; rbx, r12-r15, the out
// pointer and one temporary are spilled to [rsp - 0x80 .. rsp - 0x50].
// NOTE(review): the 0x100000000 / 0xffffffff00000001 multiplications look
// like Montgomery reduction steps for the P-256 prime - confirm against the
// fiat-crypto source this was generated from.
#if defined(__APPLE__) |
||||
.private_extern _fiat_p256_adx_sqr
|
||||
.global _fiat_p256_adx_sqr
|
||||
_fiat_p256_adx_sqr: |
||||
#else |
||||
.type fiat_p256_adx_sqr, @function
|
||||
.hidden fiat_p256_adx_sqr
|
||||
.global fiat_p256_adx_sqr
|
||||
fiat_p256_adx_sqr: |
||||
#endif |
||||
|
||||
.cfi_startproc |
||||
_CET_ENDBR |
||||
// Establish a frame pointer; the CFI directives track the pushed rbp.
push rbp |
||||
.cfi_adjust_cfa_offset 8
|
||||
.cfi_offset rbp, -16 |
||||
mov rbp, rsp |
||||
// rdx = in[0]: compute in[0]*in[3], in[0]^2 and in[0]*in[1].
mov rdx, [ rsi + 0x0 ] |
||||
mulx r10, rax, [ rsi + 0x18 ] |
||||
mulx rcx, r11, rdx |
||||
mulx r9, r8, [ rsi + 0x8 ] |
||||
mov [ rsp - 0x80 ], rbx |
||||
.cfi_offset rbx, -16-0x80 |
||||
// XOR clears CF and OF for the carry chains; rbx itself is overwritten by
// the MULX below.
xor rbx, rbx |
||||
// Double the low limb of in[0]*in[1]; the shifted-out bit travels in OF.
adox r8, r8 |
||||
mov [ rsp - 0x78 ], r12 |
||||
.cfi_offset r12, -16-0x78 |
||||
mulx r12, rbx, [ rsi + 0x10 ] |
||||
// rdx = in[1]: compute in[1]^2, in[1]*in[2] and in[1]*in[3].
mov rdx, [ rsi + 0x8 ] |
||||
mov [ rsp - 0x70 ], r13 |
||||
.cfi_offset r13, -16-0x70 |
||||
mov [ rsp - 0x68 ], r14 |
||||
.cfi_offset r14, -16-0x68 |
||||
mulx r14, r13, rdx |
||||
mov [ rsp - 0x60 ], r15 |
||||
.cfi_offset r15, -16-0x60 |
||||
// Stash the out pointer so rdi can be used as a scratch register.
mov [ rsp - 0x58 ], rdi |
||||
mulx rdi, r15, [ rsi + 0x10 ] |
||||
adcx r12, r15 |
||||
// Spill the low limb of in[0]^2; it is reloaded into rsi once the input
// pointer is no longer needed.
mov [ rsp - 0x50 ], r11 |
||||
mulx r11, r15, [ rsi + 0x18 ] |
||||
adcx r10, rdi |
||||
// Zero rdi with MOV rather than XOR so the live CF/OF chains are untouched.
mov rdi, 0x0 |
||||
adcx r11, rdi |
||||
// CLC restarts the CF chain; the OF doubling chain continues.
clc |
||||
adcx rbx, r9 |
||||
adox rbx, rbx |
||||
adcx rax, r12 |
||||
adox rax, rax |
||||
adcx r15, r10 |
||||
adox r15, r15 |
||||
// rdx = in[2]: compute in[2]*in[3] and in[2]^2.
mov rdx, [ rsi + 0x10 ] |
||||
mulx r12, r9, [ rsi + 0x18 ] |
||||
adcx r9, r11 |
||||
adcx r12, rdi |
||||
mulx r11, r10, rdx |
||||
// New CF chain: add the squares onto the doubled cross products.
clc |
||||
adcx rcx, r8 |
||||
adcx r13, rbx |
||||
adcx r14, rax |
||||
adox r9, r9 |
||||
adcx r10, r15 |
||||
// rdx = in[3]: compute in[3]^2. This is the last read through rsi.
mov rdx, [ rsi + 0x18 ] |
||||
mulx rbx, r8, rdx |
||||
adox r12, r12 |
||||
adcx r11, r9 |
||||
// rsi = spilled low limb of in[0]^2.
mov rsi, [ rsp - 0x50 ] |
||||
adcx r8, r12 |
||||
// Multiply low limbs by the constants 0x100000000 and 0xffffffff00000001 and
// fold the products back into the accumulator.
mov rax, 0x100000000 |
||||
mov rdx, rax |
||||
mulx r15, rax, rsi |
||||
adcx rbx, rdi |
||||
adox rbx, rdi |
||||
// r9 = 0 from here on; it is used as the zero operand until the DEC below.
xor r9, r9 |
||||
adox rax, rcx |
||||
adox r15, r13 |
||||
mulx rcx, rdi, rax |
||||
adcx rdi, r15 |
||||
adox rcx, r14 |
||||
mov rdx, 0xffffffff00000001 |
||||
mulx r14, r13, rsi |
||||
adox r14, r10 |
||||
adcx r13, rcx |
||||
mulx r12, r10, rax |
||||
adox r12, r11 |
||||
mov r11, r9 |
||||
adox r11, r8 |
||||
adcx r10, r14 |
||||
mov r8, r9 |
||||
adcx r8, r12 |
||||
mov rax, r9 |
||||
adcx rax, r11 |
||||
mov r15, r9 |
||||
adox r15, rbx |
||||
// Second round of multiplications by the two constants.
mov rdx, 0x100000000 |
||||
mulx rcx, rbx, rdi |
||||
mov r14, r9 |
||||
adcx r14, r15 |
||||
mov r12, r9 |
||||
adox r12, r12 |
||||
adcx r12, r9 |
||||
adox rbx, r13 |
||||
mulx r11, r13, rbx |
||||
mov r15, 0xffffffff00000001 |
||||
mov rdx, r15 |
||||
mulx rsi, r15, rbx |
||||
adox rcx, r10 |
||||
adox r11, r8 |
||||
mulx r8, r10, rdi |
||||
adcx r13, rcx |
||||
adox r8, rax |
||||
adcx r10, r11 |
||||
adox rsi, r14 |
||||
mov rdi, r12 |
||||
mov rax, r9 |
||||
adox rdi, rax |
||||
adcx r15, r8 |
||||
mov r14, rax |
||||
adcx r14, rsi |
||||
adcx rdi, r9 |
||||
// Final conditional subtraction. r9 was zero, so DEC makes it
// 0xffffffffffffffff. The limbs r13, r10, r15, r14 plus the carry word rdi
// have 0xffffffffffffffff, 0xffffffff, 0 (rax) and 0xffffffff00000001 (still
// in rdx) subtracted, with the borrow propagated into rdi. These constants
// are the limbs of the P-256 prime.
dec r9 |
||||
mov rbx, r13 |
||||
sub rbx, r9 |
||||
mov rcx, 0xffffffff |
||||
mov r11, r10 |
||||
sbb r11, rcx |
||||
mov r8, r15 |
||||
sbb r8, rax |
||||
mov rsi, r14 |
||||
sbb rsi, rdx |
||||
sbb rdi, rax |
||||
// If the subtraction borrowed (CF set), keep the unsubtracted limbs.
cmovc rbx, r13 |
||||
cmovc r8, r15 |
||||
cmovc r11, r10 |
||||
cmovc rsi, r14 |
||||
// Reload the out pointer, store the four result limbs, then restore the
// callee-saved registers.
mov rdi, [ rsp - 0x58 ] |
||||
mov [ rdi + 0x18 ], rsi |
||||
mov [ rdi + 0x0 ], rbx |
||||
mov [ rdi + 0x8 ], r11 |
||||
mov [ rdi + 0x10 ], r8 |
||||
mov rbx, [ rsp - 0x80 ] |
||||
.cfi_restore rbx
|
||||
mov r12, [ rsp - 0x78 ] |
||||
.cfi_restore r12
|
||||
mov r13, [ rsp - 0x70 ] |
||||
.cfi_restore r13
|
||||
mov r14, [ rsp - 0x68 ] |
||||
.cfi_restore r14
|
||||
mov r15, [ rsp - 0x60 ] |
||||
.cfi_restore r15
|
||||
pop rbp |
||||
.cfi_restore rbp
|
||||
.cfi_adjust_cfa_offset -8 |
||||
ret |
||||
.cfi_endproc |
||||
#if defined(__ELF__) |
||||
.size fiat_p256_adx_sqr, .-fiat_p256_adx_sqr |
||||
#endif |
||||
|
||||
#endif |
Loading…
Reference in new issue