Drop HRSS assembly.

While this assembly implementation is faster in microbenchmarks, the
cache pressure makes it slightly worse than the C code in larger
benchmarks.

Before:

Did 7686 HRSS generate operations in 1056025us (7278.2 ops/sec)
Did 90000 HRSS encap operations in 1010095us (89100.5 ops/sec)
Did 28000 HRSS decap operations in 1031008us (27157.9 ops/sec)

After:

Did 3523 HRSS generate operations in 1045508us (3369.7 ops/sec)
Did 43000 HRSS encap operations in 1017077us (42278.0 ops/sec)
Did 17000 HRSS decap operations in 1011170us (16812.2 ops/sec)

Change-Id: Ia7745b50393f2d2849867e7c5c0af59d651f243d
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/55885
Reviewed-by: David Benjamin <davidben@google.com>
Commit-Queue: Adam Langley <agl@google.com>
fips-20230428
Adam Langley 2 years ago committed by Boringssl LUCI CQ
parent dbbd79e895
commit 97873cd1a5
  1. crypto/CMakeLists.txt (1 line changed)
  2. crypto/hrss/asm/poly_rq_mul.S (8493 lines changed)
  3. crypto/hrss/hrss.c (12 lines changed)
  4. crypto/hrss/internal.h (20 lines changed)

@@ -149,7 +149,6 @@ if(ARCH STREQUAL "x86_64")
chacha/chacha-x86_64.${ASM_EXT}
cipher_extra/aes128gcmsiv-x86_64.${ASM_EXT}
cipher_extra/chacha20_poly1305_x86_64.${ASM_EXT}
hrss/asm/poly_rq_mul.S
test/trampoline-x86_64.${ASM_EXT}
)
endif()

File diff suppressed because it is too large Load Diff

@@ -971,11 +971,6 @@ struct POLY_MUL_SCRATCH {
vec_t scratch[172];
} vec;
#endif
#if defined(POLY_RQ_MUL_ASM)
// This is the space used by |poly_Rq_mul|.
uint8_t rq[POLY_MUL_RQ_SCRATCH_SPACE];
#endif
} u;
};
@@ -1326,13 +1321,6 @@ static void poly_mul_novec(struct POLY_MUL_SCRATCH *scratch, struct poly *out,
static void poly_mul(struct POLY_MUL_SCRATCH *scratch, struct poly *r,
const struct poly *a, const struct poly *b) {
#if defined(POLY_RQ_MUL_ASM)
if (CRYPTO_is_AVX2_capable()) {
poly_Rq_mul(r->v, a->v, b->v, scratch->u.rq);
poly_normalize(r);
} else
#endif
#if defined(HRSS_HAVE_VECTOR_UNIT)
if (vec_capable()) {
poly_mul_vec(scratch, r, a, b);

@@ -41,26 +41,6 @@ OPENSSL_EXPORT void HRSS_poly3_mul(struct poly3 *out, const struct poly3 *x,
OPENSSL_EXPORT void HRSS_poly3_invert(struct poly3 *out,
const struct poly3 *in);
// On x86-64, we can use the AVX2 code from [HRSS]. (The authors have given
// explicit permission for this and signed a CLA.) However it's 57KB of object
// code, so it's not used if |OPENSSL_SMALL| is defined.
#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_SMALL) && \
defined(OPENSSL_X86_64) && defined(OPENSSL_LINUX)
#define POLY_RQ_MUL_ASM
// POLY_MUL_RQ_SCRATCH_SPACE is the number of bytes of scratch space needed
// by the assembly function poly_Rq_mul.
#define POLY_MUL_RQ_SCRATCH_SPACE (6144 + 6144 + 12288 + 512 + 9408 + 32)
// poly_Rq_mul is defined in assembly. Inputs and outputs must be 16-byte-
// aligned.
extern void poly_Rq_mul(
uint16_t r[N + 3], const uint16_t a[N + 3], const uint16_t b[N + 3],
// The following should be `scratch[POLY_MUL_RQ_SCRATCH_SPACE]` but
// GCC 11.1 has a bug with unions that breaks that.
uint8_t scratch[]);
#endif
#if defined(__cplusplus)
} // extern "C"
#endif

Loading…
Cancel
Save