|
|
|
/*
|
|
|
|
* Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
|
|
|
|
* Copyright (c) 2012, Intel Corporation. All Rights Reserved.
|
|
|
|
*
|
|
|
|
* Licensed under the OpenSSL license (the "License"). You may not use
|
|
|
|
* this file except in compliance with the License. You can obtain a copy
|
|
|
|
* in the file LICENSE in the source distribution or at
|
|
|
|
* https://www.openssl.org/source/license.html
|
|
|
|
*
|
|
|
|
* Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
|
|
|
|
* (1) Intel Corporation, Israel Development Center, Haifa, Israel
|
|
|
|
* (2) University of Haifa, Israel
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef OPENSSL_HEADER_BN_RSAZ_EXP_H
|
|
|
|
#define OPENSSL_HEADER_BN_RSAZ_EXP_H
|
|
|
|
|
|
|
|
#include <openssl/bn.h>
|
|
|
|
|
|
|
|
#include "internal.h"
|
|
|
|
#include "../../internal.h"
|
|
|
|
|
|
|
|
#if defined(__cplusplus)
|
|
|
|
extern "C" {
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)
|
|
|
|
#define RSAZ_ENABLED
|
|
|
|
|
|
|
|
|
|
|
|
// RSAZ_1024_mod_exp_avx2 sets |result| to |base_norm| raised to |exponent|
|
|
|
|
// modulo |m_norm|. |base_norm| must be fully-reduced and |exponent| must have
|
|
|
|
// the high bit set (it is 1024 bits wide). |RR| and |k0| must be |RR| and |n0|,
|
|
|
|
// respectively, extracted from |m_norm|'s |BN_MONT_CTX|. |storage_words| is a
|
|
|
|
// temporary buffer that must be aligned to |MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH|
|
|
|
|
// bytes.
|
|
|
|
void RSAZ_1024_mod_exp_avx2(BN_ULONG result[16], const BN_ULONG base_norm[16],
|
|
|
|
const BN_ULONG exponent[16],
|
|
|
|
const BN_ULONG m_norm[16], const BN_ULONG RR[16],
|
|
|
|
BN_ULONG k0,
|
|
|
|
BN_ULONG storage_words[MOD_EXP_CTIME_STORAGE_LEN]);
|
|
|
|
|
|
|
|
// rsaz_avx2_capable returns the value of |CRYPTO_is_AVX2_capable|, which is
// non-zero when the AVX2-based RSAZ code declared in this header may be used.
//
// The check goes through the shared |CRYPTO_is_*_capable| helpers rather than
// inspecting |OPENSSL_ia32cap_P| directly. Those helpers are intended to also
// consult the corresponding preprocessor symbol, so a build that statically
// enables AVX2 can drop the dynamic check and the fallback code.
OPENSSL_INLINE int rsaz_avx2_capable(void) {
  return CRYPTO_is_AVX2_capable();
}
|
|
|
|
|
|
|
|
// rsaz_avx2_preferred returns non-zero if the AVX2 RSAZ code should be chosen
// over the x86_64-mont5.pl implementation, and zero otherwise. It returns zero
// whenever BMI1, BMI2 and ADX are all available; otherwise it returns the value
// of |CRYPTO_is_AVX2_capable|.
OPENSSL_INLINE int rsaz_avx2_preferred(void) {
  // BMI1 is checked in addition to BMI2 and ADX so that this condition matches
  // the cpuid check in x86_64-mont5.pl: its MULX path (BMI2) also uses the ANDN
  // instruction, which comes from BMI1.
  if (CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable() &&
      CRYPTO_is_ADX_capable()) {
    // If BMI1, BMI2, and ADX are available, x86_64-mont5.pl is faster. See the
    // .Lmulx4x_enter and .Lpowerx5_enter branches.
    return 0;
  }
  return CRYPTO_is_AVX2_capable();
}
|
|
|
|
|
|
|
|
|
|
|
|
// Assembly functions.
|
|
|
|
|
|
|
|
// RSAZ represents 1024-bit integers using unsaturated 29-bit limbs stored in
|
|
|
|
// 64-bit integers. This requires 36 limbs, but is padded up to 40.
|
|
|
|
//
|
|
|
|
// See crypto/bn/asm/rsaz-avx2.pl for further details.
|
|
|
|
|
|
|
|
// rsaz_1024_norm2red_avx2 converts |norm| from |BIGNUM| to RSAZ representation
|
|
|
|
// and writes the result to |red|.
|
|
|
|
void rsaz_1024_norm2red_avx2(BN_ULONG red[40], const BN_ULONG norm[16]);
|
|
|
|
|
|
|
|
// rsaz_1024_mul_avx2 computes |a| * |b| mod |n| and writes the result to |ret|.
|
|
|
|
// Inputs and outputs are in Montgomery form, using RSAZ's representation. |k|
|
|
|
|
// is -|n|^-1 mod 2^64 or |n0| from |BN_MONT_CTX|.
|
|
|
|
void rsaz_1024_mul_avx2(BN_ULONG ret[40], const BN_ULONG a[40],
|
|
|
|
const BN_ULONG b[40], const BN_ULONG n[40], BN_ULONG k);
|
|
|
|
|
|
|
|
// rsaz_1024_mul_avx2 computes |a|^(2*|count|) mod |n| and writes the result to
|
|
|
|
// |ret|. Inputs and outputs are in Montgomery form, using RSAZ's
|
|
|
|
// representation. |k| is -|n|^-1 mod 2^64 or |n0| from |BN_MONT_CTX|.
|
|
|
|
void rsaz_1024_sqr_avx2(BN_ULONG ret[40], const BN_ULONG a[40],
|
|
|
|
const BN_ULONG n[40], BN_ULONG k, int count);
|
|
|
|
|
|
|
|
// rsaz_1024_scatter5_avx2 stores |val| at index |i| of |tbl|. |i| must be
|
|
|
|
// positive and at most 31. Note the table only uses 18 |BN_ULONG|s per entry
|
|
|
|
// instead of 40. It packs two 29-bit limbs into each |BN_ULONG| and only stores
|
|
|
|
// 36 limbs rather than the padded 40.
|
|
|
|
void rsaz_1024_scatter5_avx2(BN_ULONG tbl[32 * 18], const BN_ULONG val[40],
|
|
|
|
int i);
|
|
|
|
|
|
|
|
// rsaz_1024_gather5_avx2 loads index |i| of |tbl| and writes it to |val|.
|
|
|
|
void rsaz_1024_gather5_avx2(BN_ULONG val[40], const BN_ULONG tbl[32 * 18],
|
|
|
|
int i);
|
|
|
|
|
|
|
|
// rsaz_1024_red2norm_avx2 converts |red| from RSAZ to |BIGNUM| representation
|
Add an extra reduction step to the end of RSAZ.
RSAZ has a very similar bug to mont5 from
https://boringssl-review.googlesource.com/c/boringssl/+/52825 and may
return the modulus when it should return zero. As in that CL, there is
no security impact on our cryptographic primitives.
RSAZ is described in the paper "Software Implementation of Modular
Exponentiation, Using Advanced Vector Instructions Architectures".
The bug comes from RSAZ's use of "NRMM" or "Non Reduced Montgomery
Multiplication". This is like normal Montgomery multiplication, but
skips the final subtraction altogether (whereas mont5's AMM still
subtracts, but replaces MM's tigher bound with just the carry bit). This
would normally not be stable, but RSAZ picks a larger R > 4M, and
maintains looser bounds for modular arithmetic, a < 2M.
Lemma 1 from the paper proves that NRMM(a, b) preserves this 2M bound.
It also claims NRMM(a, 1) < M. That is, conversion out of Montgomery
form with NRMM is fully reduced. This second claim is wrong. The proof
shows that NRMM(a, 1) < 1/2 + M, which only implies NRMM(a, 1) <= M, not
NRMM(a, 1) < M. RSAZ relies on this to produce a reduced output (see
Figure 7 in the paper).
Thus, like mont5 with AMM, RSAZ may return the modulus when it should
return zero. Fix this by adding a bn_reduce_once_in_place call at the
end of the operation.
Change-Id: If28bc49ae8dfbfb43bea02af5ea10c4209a1c6e6
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/52827
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
3 years ago
|
|
|
// and writes the result to |norm|. The result will be <= the modulus.
|
|
|
|
//
|
|
|
|
// WARNING: The result of this operation may not be fully reduced. |norm| may be
|
|
|
|
// the modulus instead of zero. This function should be followed by a call to
|
|
|
|
// |bn_reduce_once|.
|
|
|
|
void rsaz_1024_red2norm_avx2(BN_ULONG norm[16], const BN_ULONG red[40]);
|
|
|
|
|
|
|
|
|
|
|
|
#endif // !OPENSSL_NO_ASM && OPENSSL_X86_64
|
|
|
|
|
|
|
|
#if defined(__cplusplus)
|
|
|
|
} // extern "C"
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif // OPENSSL_HEADER_BN_RSAZ_EXP_H
|