sha: Move Armv7 dispatching to C

sha256_block_data_order_hw required a bit of wrestling with Arm
immediate limits. PC-relative addressing in 32-bit Arm is a huge mess.
I think I could have avoided the extra load with a lot of effort
(convincing the assembler to evaluate a messy expression), but this is
simpler and there was no measurable performance difference.

Change-Id: I3fab4abc0fa24e0d689581e2c9b9faaa32bd7442
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/64749
Commit-Queue: Bob Beck <bbe@google.com>
Auto-Submit: David Benjamin <davidben@google.com>
Reviewed-by: Bob Beck <bbe@google.com>
Branch: chromium-stable
Author: David Benjamin, committed by Boringssl LUCI CQ
parent fcec1397a4
commit 62f43f5ea5
 crypto/fipsmodule/sha/asm/sha1-armv4-large.pl | 41
 crypto/fipsmodule/sha/asm/sha256-armv4.pl     | 61
 crypto/fipsmodule/sha/asm/sha512-armv4.pl     | 35
 crypto/fipsmodule/sha/internal.h              | 32
 crypto/fipsmodule/sha/sha1.c                  |  6
 crypto/fipsmodule/sha/sha256.c                |  6
 crypto/fipsmodule/sha/sha512.c                |  6
 crypto/fipsmodule/sha/sha_test.cc             | 15
 8 files changed
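
The substance of the change is easiest to see in the C files: the
per-feature branching that the assembly entry points used to perform by
reading OPENSSL_armcap_P now happens in C, guarded by the SHA*_ASM_*
macros from internal.h. A condensed sketch of the resulting shape for
SHA-1 (the NEON branch is visible in the sha1.c hunk below; the _hw
branch is an assumption based on the sha1_hw_capable() helper added in
internal.h):

    // Sketch only; assumes the includes and declarations already present
    // in crypto/fipsmodule/sha/sha1.c and internal.h.
    static void sha1_block_data_order(uint32_t *state, const uint8_t *data,
                                      size_t num) {
    #if defined(SHA1_ASM_HW)
      if (sha1_hw_capable()) {
        sha1_block_data_order_hw(state, data, num);  // Armv8 SHA-1 instructions
        return;
      }
    #endif
    #if defined(SHA1_ASM_NEON)
      if (CRYPTO_is_NEON_capable()) {
        sha1_block_data_order_neon(state, data, num);  // NEON implementation
        return;
      }
    #endif
      sha1_block_data_order_nohw(state, data, num);  // portable Armv4 fallback
    }

The SHA-256 and SHA-512 dispatchers follow the same pattern.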

crypto/fipsmodule/sha/asm/sha1-armv4-large.pl
@@ -197,24 +197,11 @@ $code=<<___;
.code 32
#endif
.global sha1_block_data_order
.type sha1_block_data_order,%function
.global sha1_block_data_order_nohw
.type sha1_block_data_order_nohw,%function
.align 5
sha1_block_data_order:
#if __ARM_MAX_ARCH__>=7
.Lsha1_block:
adr r3,.Lsha1_block
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
#ifdef __APPLE__
ldr r12,[r12]
#endif
tst r12,#ARMV8_SHA1
bne .LARMv8
tst r12,#ARMV7_NEON
bne .LNEON
#endif
sha1_block_data_order_nohw:
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
ldmia $ctx,{$a,$b,$c,$d,$e}
@@ -304,17 +291,13 @@ $code.=<<___;
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha1_block_data_order,.-sha1_block_data_order
.size sha1_block_data_order_nohw,.-sha1_block_data_order_nohw
.align 5
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1
.LK_40_59: .word 0x8f1bbcdc
.LK_60_79: .word 0xca62c1d6
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.Lsha1_block
#endif
.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
___
@@ -530,10 +513,10 @@ $code.=<<___;
.arch armv7-a
.fpu neon
.global sha1_block_data_order_neon
.type sha1_block_data_order_neon,%function
.align 4
sha1_block_data_order_neon:
.LNEON:
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
@ dmb @ errata #451034 on early Cortex A8
@@ -625,10 +608,10 @@ $code.=<<___;
# define INST(a,b,c,d) .byte a,b,c,d|0x10
# endif
.type sha1_block_data_order_armv8,%function
.global sha1_block_data_order_hw
.type sha1_block_data_order_hw,%function
.align 5
sha1_block_data_order_armv8:
.LARMv8:
sha1_block_data_order_hw:
vstmdb sp!,{d8-d15} @ ABI specification says so
veor $E,$E,$E
@@ -693,16 +676,10 @@ $code.=<<___;
vldmia sp!,{d8-d15}
ret @ bx lr
.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
.size sha1_block_data_order_hw,.-sha1_block_data_order_hw
#endif
___
}}}
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif
___
{ my %opcode = (
"sha1c" => 0xf2000c40, "sha1p" => 0xf2100c40,

crypto/fipsmodule/sha/asm/sha256-armv4.pl
@@ -217,34 +217,15 @@ K256:
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
.word 0 @ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.Lsha256_block_data_order
#endif
.align 5
.global sha256_block_data_order
.type sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
adr r3,.Lsha256_block_data_order
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
#ifdef __APPLE__
ldr r12,[r12]
#endif
tst r12,#ARMV8_SHA256
bne .LARMv8
tst r12,#ARMV7_NEON
bne .LNEON
#endif
.global sha256_block_data_order_nohw
.type sha256_block_data_order_nohw,%function
sha256_block_data_order_nohw:
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
@ TODO(davidben): When the OPENSSL_armcap logic above is removed,
@ replace this with a simple ADR.
sub $Ktbl,r3,#256+32 @ K256
adr $Ktbl,K256
sub sp,sp,#16*4 @ alloca(X[16])
.Loop:
# if __ARM_ARCH>=7
@@ -298,7 +279,7 @@ $code.=<<___;
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha256_block_data_order,.-sha256_block_data_order
.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw
___
######################################################################
# NEON stuff
@@ -483,10 +464,12 @@ $code.=<<___;
.align 5
.skip 16
sha256_block_data_order_neon:
.LNEON:
stmdb sp!,{r4-r12,lr}
sub $H,sp,#16*4+16
@ In Arm mode, the following ADR runs up against the limits of encodable
@ offsets. It only fits because the offset, when the ADR is placed here,
@ is a multiple of 16.
adr $Ktbl,K256
bic $H,$H,#15 @ align for 128-bit stores
mov $t2,sp
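
The "limits of encodable offsets" in the comment above are Arm's
modified-immediate rules: in Arm mode, ADR assembles to an ADD of PC and
an 8-bit constant rotated right by an even amount, so the offset to K256
only encodes because it is a multiple of 16 (any multiple of 16 below
4096 fits in that scheme). A small, purely illustrative C model of the
encodability check:

    #include <stdint.h>
    #include <stdio.h>

    // Models Arm's "modified immediate" encoding: a constant is encodable
    // iff some even left-rotation of it fits in 8 bits (equivalently, it
    // is an 8-bit value rotated right by an even amount).
    int arm_imm_encodable(uint32_t imm) {
      for (unsigned rot = 0; rot < 32; rot += 2) {
        // Rotate left by rot; rot == 0 is special-cased because a shift
        // by 32 is undefined in C.
        uint32_t v = rot ? (imm << rot) | (imm >> (32 - rot)) : imm;
        if (v < 256) {
          return 1;
        }
      }
      return 0;
    }

    int main(void) {
      // 0x660 is a multiple of 16 and fits; 0x664 cannot be reached by an
      // even rotation of an 8-bit value.
      printf("%d %d\n", arm_imm_encodable(0x660), arm_imm_encodable(0x664));
      return 0;
    }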
@@ -613,12 +596,26 @@ $code.=<<___;
# define INST(a,b,c,d) .byte a,b,c,d
# endif
.type sha256_block_data_order_armv8,%function
.LK256_shortcut:
@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
#if defined(__thumb2__)
.word K256-(.LK256_add+4)
#else
.word K256-(.LK256_add+8)
#endif
.global sha256_block_data_order_hw
.type sha256_block_data_order_hw,%function
.align 5
sha256_block_data_order_armv8:
.LARMv8:
sha256_block_data_order_hw:
@ K256 is too far to reference from one ADR command in Thumb mode. In
@ Arm mode, we could make it fit by aligning the ADR offset to a 64-byte
@ boundary. For simplicity, just load the offset from .LK256_shortcut.
ldr $Ktbl,.LK256_shortcut
.LK256_add:
add $Ktbl,pc,$Ktbl
vld1.32 {$ABCD,$EFGH},[$ctx]
sub $Ktbl,$Ktbl,#256+32
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
b .Loop_v8
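
This ldr/add pair is the "extra load" mentioned in the commit message.
The stored word compensates for Arm's PC-read semantics: reading PC
yields the current instruction's address plus 8 in Arm mode (plus 4 in
Thumb mode), so .LK256_shortcut holds K256-(.LK256_add+8) and the ADD at
.LK256_add lands exactly on K256. A hypothetical C model of the
arithmetic, for the Arm-mode case (the addresses are invented for
illustration):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
      uintptr_t k256_addr = 0x10000;       // pretend address of K256
      uintptr_t lk256_add_addr = 0x12000;  // pretend address of .LK256_add
      // The assembler stores this displacement at .LK256_shortcut
      // (unsigned wraparound models the negative offset).
      uintptr_t shortcut = k256_addr - (lk256_add_addr + 8);
      // At run time, "add $Ktbl,pc,$Ktbl" computes PC + shortcut, where
      // PC reads as .LK256_add + 8 in Arm mode.
      uintptr_t ktbl = (lk256_add_addr + 8) + shortcut;
      assert(ktbl == k256_addr);
      return 0;
    }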
@@ -680,17 +677,13 @@ $code.=<<___;
vst1.32 {$ABCD,$EFGH},[$ctx]
ret @ bx lr
.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
.size sha256_block_data_order_hw,.-sha256_block_data_order_hw
#endif
___
}}}
$code.=<<___;
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif
___
open SELF,$0;

crypto/fipsmodule/sha/asm/sha512-armv4.pl
@@ -276,33 +276,13 @@ WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.Lsha512_block_data_order
.skip 32-4
#else
.skip 32
#endif
.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
adr r3,.Lsha512_block_data_order
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
#ifdef __APPLE__
ldr r12,[r12]
#endif
tst r12,#ARMV7_NEON
bne .LNEON
#endif
.global sha512_block_data_order_nohw
.type sha512_block_data_order_nohw,%function
sha512_block_data_order_nohw:
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
stmdb sp!,{r4-r12,lr}
@ TODO(davidben): When the OPENSSL_armcap logic above is removed,
@ replace this with a simple ADR.
sub $Ktbl,r3,#672 @ K512
adr $Ktbl,K512
sub sp,sp,#9*8
ldr $Elo,[$ctx,#$Eoff+$lo]
@@ -501,7 +481,7 @@ $code.=<<___;
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha512_block_data_order,.-sha512_block_data_order
.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw
___
{
@@ -612,7 +592,6 @@ $code.=<<___;
.type sha512_block_data_order_neon,%function
.align 4
sha512_block_data_order_neon:
.LNEON:
dmb @ errata #451034 on early Cortex A8
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
adr $Ktbl,K512
@@ -650,10 +629,6 @@ ___
$code.=<<___;
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;

crypto/fipsmodule/sha/internal.h
@@ -26,7 +26,7 @@ extern "C" {
// Define SHA{n}[_{variant}]_ASM if sha{n}_block_data_order[_{variant}] is
// defined in assembly.
#if !defined(OPENSSL_NO_ASM) && (defined(OPENSSL_X86) || defined(OPENSSL_ARM))
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86)
#define SHA1_ASM
#define SHA256_ASM
@@ -39,6 +39,35 @@ void sha256_block_data_order(uint32_t *state, const uint8_t *data,
void sha512_block_data_order(uint64_t *state, const uint8_t *data,
size_t num_blocks);
#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM)
#define SHA1_ASM_NOHW
#define SHA256_ASM_NOHW
#define SHA512_ASM_NOHW
#define SHA1_ASM_HW
OPENSSL_INLINE int sha1_hw_capable(void) {
return CRYPTO_is_ARMv8_SHA1_capable();
}
#define SHA1_ASM_NEON
void sha1_block_data_order_neon(uint32_t *state, const uint8_t *data,
size_t num);
#define SHA256_ASM_HW
OPENSSL_INLINE int sha256_hw_capable(void) {
return CRYPTO_is_ARMv8_SHA256_capable();
}
#define SHA256_ASM_NEON
void sha256_block_data_order_neon(uint32_t *state, const uint8_t *data,
size_t num);
// Armv8.2 SHA-512 instructions are not available in 32-bit.
#define SHA512_ASM_NEON
void sha512_block_data_order_neon(uint64_t *state, const uint8_t *data,
size_t num);
#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64)
#define SHA1_ASM_NOHW
@@ -148,6 +177,7 @@ void sha256_block_data_order_nohw(uint32_t *state, const uint8_t *data,
void sha512_block_data_order_hw(uint64_t *state, const uint8_t *data,
size_t num);
#endif
#if defined(SHA512_ASM_NOHW)
void sha512_block_data_order_nohw(uint64_t *state, const uint8_t *data,
size_t num);

crypto/fipsmodule/sha/sha1.c
@@ -409,6 +409,12 @@ static void sha1_block_data_order(uint32_t *state, const uint8_t *data,
sha1_block_data_order_ssse3(state, data, num);
return;
}
#endif
#if defined(SHA1_ASM_NEON)
if (CRYPTO_is_NEON_capable()) {
sha1_block_data_order_neon(state, data, num);
return;
}
#endif
sha1_block_data_order_nohw(state, data, num);
}

crypto/fipsmodule/sha/sha256.c
@@ -331,6 +331,12 @@ static void sha256_block_data_order(uint32_t *state, const uint8_t *data,
sha256_block_data_order_ssse3(state, data, num);
return;
}
#endif
#if defined(SHA256_ASM_NEON)
if (CRYPTO_is_NEON_capable()) {
sha256_block_data_order_neon(state, data, num);
return;
}
#endif
sha256_block_data_order_nohw(state, data, num);
}

crypto/fipsmodule/sha/sha512.c
@@ -515,6 +515,12 @@ static void sha512_block_data_order(uint64_t *state, const uint8_t *data,
sha512_block_data_order_avx(state, data, num);
return;
}
#endif
#if defined(SHA512_ASM_NEON)
if (CRYPTO_is_NEON_capable()) {
sha512_block_data_order_neon(state, data, num);
return;
}
#endif
sha512_block_data_order_nohw(state, data, num);
}

crypto/fipsmodule/sha/sha_test.cc
@@ -75,6 +75,11 @@ TEST(SHATest, SHA1ABI) {
return;
}
#endif
#if defined(SHA1_ASM_NEON)
if (CRYPTO_is_NEON_capable()) {
CHECK_ABI(sha1_block_data_order_neon, ctx.h, kBuf, blocks);
}
#endif
#if defined(SHA1_ASM_NOHW)
CHECK_ABI(sha1_block_data_order_nohw, ctx.h, kBuf, blocks);
#endif
@@ -107,6 +112,11 @@ TEST(SHATest, SHA256ABI) {
return;
}
#endif
#if defined(SHA256_ASM_NEON)
if (CRYPTO_is_NEON_capable()) {
CHECK_ABI(sha256_block_data_order_neon, ctx.h, kBuf, blocks);
}
#endif
#if defined(SHA256_ASM_NOHW)
CHECK_ABI(sha256_block_data_order_nohw, ctx.h, kBuf, blocks);
#endif
@@ -132,6 +142,11 @@ TEST(SHATest, SHA512ABI) {
CHECK_ABI(sha512_block_data_order_avx, ctx.h, kBuf, blocks);
}
#endif
#if defined(SHA512_ASM_NEON)
if (CRYPTO_is_NEON_capable()) {
CHECK_ABI(sha512_block_data_order_neon, ctx.h, kBuf, blocks);
}
#endif
#if defined(SHA512_ASM_NOHW)
CHECK_ABI(sha512_block_data_order_nohw, ctx.h, kBuf, blocks);
#endif
