sha: Move Armv7 dispatching to C

sha256_block_data_order_hw required a bit of wrestling with Arm
immediate limits. PC-relative addressing in 32-bit Arm is a huge mess.
I think I could have avoided the extra load with a lot of effort
(convincing the assembler to evaluate a messy expression), but this is
simpler and there was no measurable performance difference.

Change-Id: I3fab4abc0fa24e0d689581e2c9b9faaa32bd7442
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/64749
Commit-Queue: Bob Beck <bbe@google.com>
Auto-Submit: David Benjamin <davidben@google.com>
Reviewed-by: Bob Beck <bbe@google.com>
Branch: chromium-stable
Author: David Benjamin, committed by Boringssl LUCI CQ
parent fcec1397a4
commit 62f43f5ea5
 crypto/fipsmodule/sha/asm/sha1-armv4-large.pl | 41
 crypto/fipsmodule/sha/asm/sha256-armv4.pl     | 61
 crypto/fipsmodule/sha/asm/sha512-armv4.pl     | 35
 crypto/fipsmodule/sha/internal.h              | 32
 crypto/fipsmodule/sha/sha1.c                  |  6
 crypto/fipsmodule/sha/sha256.c                |  6
 crypto/fipsmodule/sha/sha512.c                |  6
 crypto/fipsmodule/sha/sha_test.cc             | 15
 8 files changed
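
The substance of the change is easiest to see in the C files: the
per-feature branching that the assembly entry points used to perform by
reading OPENSSL_armcap_P now happens in C, guarded by the SHA*_ASM_*
macros from internal.h. A condensed sketch of the resulting shape for
SHA-1 (the NEON branch is visible in the sha1.c hunk below; the _hw
branch is an assumption based on the sha1_hw_capable() helper added in
internal.h):

    // Sketch only; assumes the includes and declarations already present
    // in crypto/fipsmodule/sha/sha1.c and internal.h.
    static void sha1_block_data_order(uint32_t *state, const uint8_t *data,
                                      size_t num) {
    #if defined(SHA1_ASM_HW)
      if (sha1_hw_capable()) {
        sha1_block_data_order_hw(state, data, num);  // Armv8 SHA-1 instructions
        return;
      }
    #endif
    #if defined(SHA1_ASM_NEON)
      if (CRYPTO_is_NEON_capable()) {
        sha1_block_data_order_neon(state, data, num);  // NEON implementation
        return;
      }
    #endif
      sha1_block_data_order_nohw(state, data, num);  // portable Armv4 fallback
    }

The SHA-256 and SHA-512 dispatchers follow the same pattern.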

crypto/fipsmodule/sha/asm/sha1-armv4-large.pl
@@ -197,24 +197,11 @@ $code=<<___;
.code 32
#endif
.global sha1_block_data_order
.type sha1_block_data_order,%function
.global sha1_block_data_order_nohw
.type sha1_block_data_order_nohw,%function
.align 5
sha1_block_data_order:
#if __ARM_MAX_ARCH__>=7
.Lsha1_block:
adr r3,.Lsha1_block
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
#ifdef __APPLE__
ldr r12,[r12]
#endif
tst r12,#ARMV8_SHA1
bne .LARMv8
tst r12,#ARMV7_NEON
bne .LNEON
#endif
sha1_block_data_order_nohw:
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
ldmia $ctx,{$a,$b,$c,$d,$e}
@@ -304,17 +291,13 @@ $code.=<<___;
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha1_block_data_order,.-sha1_block_data_order
.size sha1_block_data_order_nohw,.-sha1_block_data_order_nohw
.align 5
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1
.LK_40_59: .word 0x8f1bbcdc
.LK_60_79: .word 0xca62c1d6
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.Lsha1_block
#endif
.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
___
@@ -530,10 +513,10 @@ $code.=<<___;
.arch armv7-a
.fpu neon
.global sha1_block_data_order_neon
.type sha1_block_data_order_neon,%function
.align 4
sha1_block_data_order_neon:
.LNEON:
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
@ dmb @ errata #451034 on early Cortex A8
@@ -625,10 +608,10 @@ $code.=<<___;
# define INST(a,b,c,d) .byte a,b,c,d|0x10
# endif
.type sha1_block_data_order_armv8,%function
.global sha1_block_data_order_hw
.type sha1_block_data_order_hw,%function
.align 5
sha1_block_data_order_armv8:
.LARMv8:
sha1_block_data_order_hw:
vstmdb sp!,{d8-d15} @ ABI specification says so
veor $E,$E,$E
@@ -693,16 +676,10 @@ $code.=<<___;
vldmia sp!,{d8-d15}
ret @ bx lr
.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
.size sha1_block_data_order_hw,.-sha1_block_data_order_hw
#endif
___
}}}
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif
___
{ my %opcode = (
"sha1c" => 0xf2000c40, "sha1p" => 0xf2100c40,

crypto/fipsmodule/sha/asm/sha256-armv4.pl
@@ -217,34 +217,15 @@ K256:
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
.word 0 @ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.Lsha256_block_data_order
#endif
.align 5
.global sha256_block_data_order
.type sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
adr r3,.Lsha256_block_data_order
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
#ifdef __APPLE__
ldr r12,[r12]
#endif
tst r12,#ARMV8_SHA256
bne .LARMv8
tst r12,#ARMV7_NEON
bne .LNEON
#endif
.global sha256_block_data_order_nohw
.type sha256_block_data_order_nohw,%function
sha256_block_data_order_nohw:
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
@ TODO(davidben): When the OPENSSL_armcap logic above is removed,
@ replace this with a simple ADR.
sub $Ktbl,r3,#256+32 @ K256
adr $Ktbl,K256
sub sp,sp,#16*4 @ alloca(X[16])
.Loop:
# if __ARM_ARCH>=7
@@ -298,7 +279,7 @@ $code.=<<___;
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha256_block_data_order,.-sha256_block_data_order
.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw
___
######################################################################
# NEON stuff
@@ -483,10 +464,12 @@ $code.=<<___;
.align 5
.skip 16
sha256_block_data_order_neon:
.LNEON:
stmdb sp!,{r4-r12,lr}
sub $H,sp,#16*4+16
@ In Arm mode, the following ADR runs up against the limits of encodable
@ offsets. It only fits because the offset, when the ADR is placed here,
@ is a multiple of 16.
adr $Ktbl,K256
bic $H,$H,#15 @ align for 128-bit stores
mov $t2,sp
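
The "limits of encodable offsets" in the comment above are Arm's
modified-immediate rules: in Arm mode, ADR assembles to an ADD of PC and
an 8-bit constant rotated right by an even amount, so the offset to K256
only encodes because it is a multiple of 16 (any multiple of 16 below
4096 fits in that scheme). A small, purely illustrative C model of the
encodability check:

    #include <stdint.h>
    #include <stdio.h>

    // Models Arm's "modified immediate" encoding: a constant is encodable
    // iff some even left-rotation of it fits in 8 bits (equivalently, it
    // is an 8-bit value rotated right by an even amount).
    int arm_imm_encodable(uint32_t imm) {
      for (unsigned rot = 0; rot < 32; rot += 2) {
        // Rotate left by rot; rot == 0 is special-cased because a shift
        // by 32 is undefined in C.
        uint32_t v = rot ? (imm << rot) | (imm >> (32 - rot)) : imm;
        if (v < 256) {
          return 1;
        }
      }
      return 0;
    }

    int main(void) {
      // 0x660 is a multiple of 16 and fits; 0x664 cannot be reached by an
      // even rotation of an 8-bit value.
      printf("%d %d\n", arm_imm_encodable(0x660), arm_imm_encodable(0x664));
      return 0;
    }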
@@ -613,12 +596,26 @@ $code.=<<___;
# define INST(a,b,c,d) .byte a,b,c,d
# endif
.type sha256_block_data_order_armv8,%function
.LK256_shortcut:
@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
#if defined(__thumb2__)
.word K256-(.LK256_add+4)
#else
.word K256-(.LK256_add+8)
#endif
.global sha256_block_data_order_hw
.type sha256_block_data_order_hw,%function
.align 5
sha256_block_data_order_armv8:
.LARMv8:
sha256_block_data_order_hw:
@ K256 is too far to reference from one ADR command in Thumb mode. In
@ Arm mode, we could make it fit by aligning the ADR offset to a 64-byte
@ boundary. For simplicity, just load the offset from .LK256_shortcut.
ldr $Ktbl,.LK256_shortcut
.LK256_add:
add $Ktbl,pc,$Ktbl
vld1.32 {$ABCD,$EFGH},[$ctx]
sub $Ktbl,$Ktbl,#256+32
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
b .Loop_v8
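
This ldr/add pair is the "extra load" mentioned in the commit message.
The stored word compensates for Arm's PC-read semantics: reading PC
yields the current instruction's address plus 8 in Arm mode (plus 4 in
Thumb mode), so .LK256_shortcut holds K256-(.LK256_add+8) and the ADD at
.LK256_add lands exactly on K256. A hypothetical C model of the
arithmetic, for the Arm-mode case (the addresses are invented for
illustration):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
      uintptr_t k256_addr = 0x10000;       // pretend address of K256
      uintptr_t lk256_add_addr = 0x12000;  // pretend address of .LK256_add
      // The assembler stores this displacement at .LK256_shortcut
      // (unsigned wraparound models the negative offset).
      uintptr_t shortcut = k256_addr - (lk256_add_addr + 8);
      // At run time, "add $Ktbl,pc,$Ktbl" computes PC + shortcut, where
      // PC reads as .LK256_add + 8 in Arm mode.
      uintptr_t ktbl = (lk256_add_addr + 8) + shortcut;
      assert(ktbl == k256_addr);
      return 0;
    }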
@@ -680,17 +677,13 @@ $code.=<<___;
vst1.32 {$ABCD,$EFGH},[$ctx]
ret @ bx lr
.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
.size sha256_block_data_order_hw,.-sha256_block_data_order_hw
#endif
___
}}}
$code.=<<___;
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif
___
open SELF,$0;

crypto/fipsmodule/sha/asm/sha512-armv4.pl
@@ -276,33 +276,13 @@ WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.Lsha512_block_data_order
.skip 32-4
#else
.skip 32
#endif
.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
adr r3,.Lsha512_block_data_order
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
#ifdef __APPLE__
ldr r12,[r12]
#endif
tst r12,#ARMV7_NEON
bne .LNEON
#endif
.global sha512_block_data_order_nohw
.type sha512_block_data_order_nohw,%function
sha512_block_data_order_nohw:
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
stmdb sp!,{r4-r12,lr}
@ TODO(davidben): When the OPENSSL_armcap logic above is removed,
@ replace this with a simple ADR.
sub $Ktbl,r3,#672 @ K512
adr $Ktbl,K512
sub sp,sp,#9*8
ldr $Elo,[$ctx,#$Eoff+$lo]
@@ -501,7 +481,7 @@ $code.=<<___;
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha512_block_data_order,.-sha512_block_data_order
.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw
___
{
@@ -612,7 +592,6 @@ $code.=<<___;
.type sha512_block_data_order_neon,%function
.align 4
sha512_block_data_order_neon:
.LNEON:
dmb @ errata #451034 on early Cortex A8
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
adr $Ktbl,K512
@@ -650,10 +629,6 @@ ___
$code.=<<___;
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;

crypto/fipsmodule/sha/internal.h
@@ -26,7 +26,7 @@ extern "C" {
// Define SHA{n}[_{variant}]_ASM if sha{n}_block_data_order[_{variant}] is
// defined in assembly.
#if !defined(OPENSSL_NO_ASM) && (defined(OPENSSL_X86) || defined(OPENSSL_ARM))
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86)
#define SHA1_ASM
#define SHA256_ASM
@@ -39,6 +39,35 @@ void sha256_block_data_order(uint32_t *state, const uint8_t *data,
void sha512_block_data_order(uint64_t *state, const uint8_t *data,
size_t num_blocks);
#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM)
#define SHA1_ASM_NOHW
#define SHA256_ASM_NOHW
#define SHA512_ASM_NOHW
#define SHA1_ASM_HW
OPENSSL_INLINE int sha1_hw_capable(void) {
return CRYPTO_is_ARMv8_SHA1_capable();
}
#define SHA1_ASM_NEON
void sha1_block_data_order_neon(uint32_t *state, const uint8_t *data,
size_t num);
#define SHA256_ASM_HW
OPENSSL_INLINE int sha256_hw_capable(void) {
return CRYPTO_is_ARMv8_SHA256_capable();
}
#define SHA256_ASM_NEON
void sha256_block_data_order_neon(uint32_t *state, const uint8_t *data,
size_t num);
// Armv8.2 SHA-512 instructions are not available in 32-bit.
#define SHA512_ASM_NEON
void sha512_block_data_order_neon(uint64_t *state, const uint8_t *data,
size_t num);
#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64)
#define SHA1_ASM_NOHW
@@ -148,6 +177,7 @@ void sha256_block_data_order_nohw(uint32_t *state, const uint8_t *data,
void sha512_block_data_order_hw(uint64_t *state, const uint8_t *data,
size_t num);
#endif
#if defined(SHA512_ASM_NOHW)
void sha512_block_data_order_nohw(uint64_t *state, const uint8_t *data,
size_t num);

crypto/fipsmodule/sha/sha1.c
@@ -409,6 +409,12 @@ static void sha1_block_data_order(uint32_t *state, const uint8_t *data,
sha1_block_data_order_ssse3(state, data, num);
return;
}
#endif
#if defined(SHA1_ASM_NEON)
if (CRYPTO_is_NEON_capable()) {
sha1_block_data_order_neon(state, data, num);
return;
}
#endif
sha1_block_data_order_nohw(state, data, num);
}

crypto/fipsmodule/sha/sha256.c
@@ -331,6 +331,12 @@ static void sha256_block_data_order(uint32_t *state, const uint8_t *data,
sha256_block_data_order_ssse3(state, data, num);
return;
}
#endif
#if defined(SHA256_ASM_NEON)
if (CRYPTO_is_NEON_capable()) {
sha256_block_data_order_neon(state, data, num);
return;
}
#endif
sha256_block_data_order_nohw(state, data, num);
}

crypto/fipsmodule/sha/sha512.c
@@ -515,6 +515,12 @@ static void sha512_block_data_order(uint64_t *state, const uint8_t *data,
sha512_block_data_order_avx(state, data, num);
return;
}
#endif
#if defined(SHA512_ASM_NEON)
if (CRYPTO_is_NEON_capable()) {
sha512_block_data_order_neon(state, data, num);
return;
}
#endif
sha512_block_data_order_nohw(state, data, num);
}

crypto/fipsmodule/sha/sha_test.cc
@@ -75,6 +75,11 @@ TEST(SHATest, SHA1ABI) {
return;
}
#endif
#if defined(SHA1_ASM_NEON)
if (CRYPTO_is_NEON_capable()) {
CHECK_ABI(sha1_block_data_order_neon, ctx.h, kBuf, blocks);
}
#endif
#if defined(SHA1_ASM_NOHW)
CHECK_ABI(sha1_block_data_order_nohw, ctx.h, kBuf, blocks);
#endif
@@ -107,6 +112,11 @@ TEST(SHATest, SHA256ABI) {
return;
}
#endif
#if defined(SHA256_ASM_NEON)
if (CRYPTO_is_NEON_capable()) {
CHECK_ABI(sha256_block_data_order_neon, ctx.h, kBuf, blocks);
}
#endif
#if defined(SHA256_ASM_NOHW)
CHECK_ABI(sha256_block_data_order_nohw, ctx.h, kBuf, blocks);
#endif
@@ -132,6 +142,11 @@ TEST(SHATest, SHA512ABI) {
CHECK_ABI(sha512_block_data_order_avx, ctx.h, kBuf, blocks);
}
#endif
#if defined(SHA512_ASM_NEON)
if (CRYPTO_is_NEON_capable()) {
CHECK_ABI(sha512_block_data_order_neon, ctx.h, kBuf, blocks);
}
#endif
#if defined(SHA512_ASM_NOHW)
CHECK_ABI(sha512_block_data_order_nohw, ctx.h, kBuf, blocks);
#endif
