boringssl/crypto/fipsmodule/bn/rsaz_exp.h

/*
 * Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
 * Copyright (c) 2012, Intel Corporation. All Rights Reserved.
 *
 * Licensed under the OpenSSL license (the "License").  You may not use
 * this file except in compliance with the License.  You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 *
 * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
 * (1) Intel Corporation, Israel Development Center, Haifa, Israel
 * (2) University of Haifa, Israel
 */

#ifndef OPENSSL_HEADER_BN_RSAZ_EXP_H
#define OPENSSL_HEADER_BN_RSAZ_EXP_H

#include <openssl/bn.h>

#include "internal.h"
#include "../../internal.h"

#if defined(__cplusplus)
extern "C" {
#endif

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)
#define RSAZ_ENABLED


// RSAZ_1024_mod_exp_avx2 sets |result| to |base_norm| raised to |exponent|
// modulo |m_norm|. |base_norm| must be fully-reduced and |exponent| must have
// the high bit set (it is 1024 bits wide). |RR| and |k0| must be |RR| and |n0|,
// respectively, extracted from |m_norm|'s |BN_MONT_CTX|. |storage_words| is a
// temporary buffer that must be aligned to |MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH|
// bytes.
//
// |result|, |base_norm|, |exponent|, |m_norm|, and |RR| are each declared as
// 16-word |BN_ULONG| arrays, and |storage_words| is declared with
// |MOD_EXP_CTIME_STORAGE_LEN| words.
void RSAZ_1024_mod_exp_avx2(BN_ULONG result[16], const BN_ULONG base_norm[16],
                            const BN_ULONG exponent[16],
                            const BN_ULONG m_norm[16], const BN_ULONG RR[16],
                            BN_ULONG k0,
                            BN_ULONG storage_words[MOD_EXP_CTIME_STORAGE_LEN]);

// rsaz_avx2_capable forwards the result of |CRYPTO_is_AVX2_capable|.
OPENSSL_INLINE int rsaz_avx2_capable(void) {
  const int has_avx2 = CRYPTO_is_AVX2_capable();
  return has_avx2;
}

// rsaz_avx2_preferred reports whether the AVX2 RSAZ code should be chosen. It
// returns zero whenever BMI1, BMI2, and ADX are all available, because
// x86_64-mont5.pl is faster in that configuration (see the .Lmulx4x_enter and
// .Lpowerx5_enter branches). Otherwise it returns the result of
// |CRYPTO_is_AVX2_capable|.
OPENSSL_INLINE int rsaz_avx2_preferred(void) {
  // The && chain short-circuits, so the capability checks run in the same
  // order as a nested condition would.
  const int mont5_is_faster = CRYPTO_is_BMI1_capable() &&
                              CRYPTO_is_BMI2_capable() &&
                              CRYPTO_is_ADX_capable();
  return mont5_is_faster ? 0 : CRYPTO_is_AVX2_capable();
}


// Assembly functions.

// RSAZ represents 1024-bit integers using unsaturated 29-bit limbs stored in
// 64-bit integers. This requires 36 limbs, which are padded up to 40.
//
// See crypto/bn/asm/rsaz-avx2.pl for further details.

// rsaz_1024_norm2red_avx2 converts |norm| from |BIGNUM| to RSAZ representation
// and writes the result to |red|. |norm| is declared as a 16-word array and
// |red| as a 40-word array, matching the padded RSAZ limb count.
void rsaz_1024_norm2red_avx2(BN_ULONG red[40], const BN_ULONG norm[16]);

// rsaz_1024_mul_avx2 computes |a| * |b| mod |n| and writes the result to |ret|.
// Inputs and outputs are in Montgomery form, using RSAZ's representation. |k|
// is -|n|^-1 mod 2^64 or |n0| from |BN_MONT_CTX|. |ret|, |a|, |b|, and |n| are
// each declared as 40-word arrays, matching the padded RSAZ limb count.
void rsaz_1024_mul_avx2(BN_ULONG ret[40], const BN_ULONG a[40],
                        const BN_ULONG b[40], const BN_ULONG n[40], BN_ULONG k);

// rsaz_1024_sqr_avx2 computes |a|^(2*|count|) mod |n| and writes the result to
// |ret|. Inputs and outputs are in Montgomery form, using RSAZ's
// representation. |k| is -|n|^-1 mod 2^64 or |n0| from |BN_MONT_CTX|.
//
// NOTE(review): if this performs |count| successive squarings, the exponent
// would be 2^|count| rather than 2*|count| -- confirm against rsaz-avx2.pl.
void rsaz_1024_sqr_avx2(BN_ULONG ret[40], const BN_ULONG a[40],
                        const BN_ULONG n[40], BN_ULONG k, int count);

// rsaz_1024_scatter5_avx2 stores |val| at index |i| of |tbl|. |i| must be
// non-negative and at most 31. Note the table only uses 18 |BN_ULONG|s per
// entry instead of 40. It packs two 29-bit limbs into each |BN_ULONG| and only
// stores 36 limbs rather than the padded 40.
//
// NOTE(review): this comment previously said |i| must be "positive". |tbl| is
// declared with 32 entries of 18 words, so index 0 is presumably valid --
// confirm against the callers.
void rsaz_1024_scatter5_avx2(BN_ULONG tbl[32 * 18], const BN_ULONG val[40],
                             int i);

// rsaz_1024_gather5_avx2 loads index |i| of |tbl| and writes it to |val|.
// |tbl| is declared with the same 32 * 18 word size as the table written by
// |rsaz_1024_scatter5_avx2|, and |val| is declared as a 40-word array.
//
// NOTE(review): presumably |i| has the same valid range as in
// |rsaz_1024_scatter5_avx2| -- confirm.
void rsaz_1024_gather5_avx2(BN_ULONG val[40], const BN_ULONG tbl[32 * 18],
                            int i);

// rsaz_1024_red2norm_avx2 converts |red| from RSAZ to |BIGNUM| representation
// and writes the result to |norm|. The result will be <= the modulus. |red| is
// declared as a 40-word array and |norm| as a 16-word array.
//
// WARNING: The result of this operation may not be fully reduced. |norm| may be
// the modulus instead of zero. This function should be followed by a call to
// |bn_reduce_once|.
void rsaz_1024_red2norm_avx2(BN_ULONG norm[16], const BN_ULONG red[40]);


#endif  // !OPENSSL_NO_ASM && OPENSSL_X86_64

#if defined(__cplusplus)
}  // extern "C"
#endif

#endif  // OPENSSL_HEADER_BN_RSAZ_EXP_H
acvp: add CMAC-AES support. Change by Dan Janni. Change-Id: I3f059e7b1a822c6f97128ca92a693499a3f7fa8f Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/41984 Commit-Queue: Adam Langley <agl@google.com> Reviewed-by: David Benjamin <davidben@google.com> 5 years ago			`/*`
			`* Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.`
			`* Copyright (c) 2012, Intel Corporation. All Rights Reserved.`
			`*`
			`* Licensed under the OpenSSL license (the "License"). You may not use`
			`* this file except in compliance with the License. You can obtain a copy`
			`* in the file LICENSE in the source distribution or at`
			`* https://www.openssl.org/source/license.html`
			`*`
			`* Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)`
			`* (1) Intel Corporation, Israel Development Center, Haifa, Israel`
			`* (2) University of Haifa, Israel`
			`*/`

			`#ifndef OPENSSL_HEADER_BN_RSAZ_EXP_H`
			`#define OPENSSL_HEADER_BN_RSAZ_EXP_H`

			`#include <openssl/bn.h>`

			`#include "internal.h"`
Move CPU detection symbols to crypto/internal.h. These symbols were not marked OPENSSL_EXPORT, so they weren't really usable externally anyway. They're also very sensitive to various build configuration toggles, which don't always get reflected into projects that include our headers. Move them to crypto/internal.h. Change-Id: I79a1fcf0b24e398d75a9cc6473bae28ec85cb835 Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/50846 Reviewed-by: Adam Langley <agl@google.com> 3 years ago			`#include "../../internal.h"`
acvp: add CMAC-AES support. Change by Dan Janni. Change-Id: I3f059e7b1a822c6f97128ca92a693499a3f7fa8f Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/41984 Commit-Queue: Adam Langley <agl@google.com> Reviewed-by: David Benjamin <davidben@google.com> 5 years ago
			`#if defined(__cplusplus)`
			`extern "C" {`
			`#endif`

			`#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)`
			`#define RSAZ_ENABLED`


			`// RSAZ_1024_mod_exp_avx2 sets \|result\| to \|base_norm\| raised to \|exponent\|`
			`// modulo \|m_norm\|. \|base_norm\| must be fully-reduced and \|exponent\| must have`
			`// the high bit set (it is 1024 bits wide). \|RR\| and \|k0\| must be \|RR\| and \|n0\|,`
			`// respectively, extracted from \|m_norm\|'s \|BN_MONT_CTX\|. \|storage_words\| is a`
			`// temporary buffer that must be aligned to \|MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH\|`
			`// bytes.`
			`void RSAZ_1024_mod_exp_avx2(BN_ULONG result[16], const BN_ULONG base_norm[16],`
			`const BN_ULONG exponent[16],`
			`const BN_ULONG m_norm[16], const BN_ULONG RR[16],`
			`BN_ULONG k0,`
			`BN_ULONG storage_words[MOD_EXP_CTIME_STORAGE_LEN]);`

			`OPENSSL_INLINE int rsaz_avx2_capable(void) {`
Check static CPU capabilities on x86. On Arm, our CRYPTO_is_*_capable functions check the corresponding preprocessor symbol. This allows us to automatically drop dynamic checks and fallback code when some capability is always available. This CL does the same on x86, as well as consolidates our OPENSSL_ia32cap_P checks in one place. Since this abstraction is incompatible with some optimizations we do around OPENSSL_ia32cap_get() in the FIPS module, I've marked the symbol __attribute__((const)), which is enough to make GCC and Clang do the optimizations for us. (We already do the same to DEFINE_BSS_GET.) Most x86 platforms support a much wider range of capabilities, so this is usually a no-op. But, notably, all x86_64 Mac hardware has SSSE3 available, so this allows us to statically drop an AES implementation. (On macOS with -Wl,-dead_strip, this seems to trim 35080 bytes from the bssl binary.) Configs like -march=native can also drop a bunch of code. Update-Note: This CL may break build environments that incorrectly mark some instruction as statically available. This is unlikely to happen with vector instructions like AVX, where the compiler could freely emit them anyway. However, instructions like AES-NI might be set incorrectly. Change-Id: I44fd715c9887d3fda7cb4519c03bee4d4f2c7ea6 Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/51548 Reviewed-by: Adam Langley <agl@google.com> 3 years ago			`return CRYPTO_is_AVX2_capable();`
acvp: add CMAC-AES support. Change by Dan Janni. Change-Id: I3f059e7b1a822c6f97128ca92a693499a3f7fa8f Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/41984 Commit-Queue: Adam Langley <agl@google.com> Reviewed-by: David Benjamin <davidben@google.com> 5 years ago			`}`

			`OPENSSL_INLINE int rsaz_avx2_preferred(void) {`
Check static CPU capabilities on x86. On Arm, our CRYPTO_is_*_capable functions check the corresponding preprocessor symbol. This allows us to automatically drop dynamic checks and fallback code when some capability is always available. This CL does the same on x86, as well as consolidates our OPENSSL_ia32cap_P checks in one place. Since this abstraction is incompatible with some optimizations we do around OPENSSL_ia32cap_get() in the FIPS module, I've marked the symbol __attribute__((const)), which is enough to make GCC and Clang do the optimizations for us. (We already do the same to DEFINE_BSS_GET.) Most x86 platforms support a much wider range of capabilities, so this is usually a no-op. But, notably, all x86_64 Mac hardware has SSSE3 available, so this allows us to statically drop an AES implementation. (On macOS with -Wl,-dead_strip, this seems to trim 35080 bytes from the bssl binary.) Configs like -march=native can also drop a bunch of code. Update-Note: This CL may break build environments that incorrectly mark some instruction as statically available. This is unlikely to happen with vector instructions like AVX, where the compiler could freely emit them anyway. However, instructions like AES-NI might be set incorrectly. Change-Id: I44fd715c9887d3fda7cb4519c03bee4d4f2c7ea6 Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/51548 Reviewed-by: Adam Langley <agl@google.com> 3 years ago			`if (CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable() &&`
			`CRYPTO_is_ADX_capable()) {`
Align rsaz_avx2_preferred with x86_64-mont5.pl. x86_64-mont5.pl checks for both BMI1 and BMI2, because the MULX path also uses the ANDN instruction. Some history here from upstream: a5bb5bca52f57021a4017521c55a6b3590bbba7a, dated 2013-10-03, added the MULX path to x86_64-mont5.pl. At the time, the cpuid check was BMI2+ADX. (MULX comes from BMI2.) 37de2b5c1e370b493932552556940eb89922b027, dated 2013-10-09, made BN_mod_exp_mont_consttime prefer the MULX mont5 code over the AVX2 rsaz code, with a matching BMI2+ADX cpuid check. 8fc8f486f7fa098c9fbb6a6ae399e3c6856e0d87, dated 2016-01-25, tweaked some code to use the ANDN instruction, from BMI1. Correspondingly, it changed the cpuid check to be BMI1+BMI2+ADX. The BN_mod_exp_mont_consttime check was left unchanged. This CL fixes our version of the BN_mod_exp_mont_consttime check to match the assembly, by also checking BMI1. (This should be a no-op. Presumably any processor with BMI2 also has BMI1.) Change-Id: Ib0cacc7e2be840d970460eef4dd9ded7fb24231c Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/51547 Reviewed-by: Adam Langley <agl@google.com> 3 years ago			`// If BMI1, BMI2, and ADX are available, x86_64-mont5.pl is faster. See the`
			`// .Lmulx4x_enter and .Lpowerx5_enter branches.`
acvp: add CMAC-AES support. Change by Dan Janni. Change-Id: I3f059e7b1a822c6f97128ca92a693499a3f7fa8f Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/41984 Commit-Queue: Adam Langley <agl@google.com> Reviewed-by: David Benjamin <davidben@google.com> 5 years ago			`return 0;`
			`}`
Check static CPU capabilities on x86. On Arm, our CRYPTO_is_*_capable functions check the corresponding preprocessor symbol. This allows us to automatically drop dynamic checks and fallback code when some capability is always available. This CL does the same on x86, as well as consolidates our OPENSSL_ia32cap_P checks in one place. Since this abstraction is incompatible with some optimizations we do around OPENSSL_ia32cap_get() in the FIPS module, I've marked the symbol __attribute__((const)), which is enough to make GCC and Clang do the optimizations for us. (We already do the same to DEFINE_BSS_GET.) Most x86 platforms support a much wider range of capabilities, so this is usually a no-op. But, notably, all x86_64 Mac hardware has SSSE3 available, so this allows us to statically drop an AES implementation. (On macOS with -Wl,-dead_strip, this seems to trim 35080 bytes from the bssl binary.) Configs like -march=native can also drop a bunch of code. Update-Note: This CL may break build environments that incorrectly mark some instruction as statically available. This is unlikely to happen with vector instructions like AVX, where the compiler could freely emit them anyway. However, instructions like AES-NI might be set incorrectly. Change-Id: I44fd715c9887d3fda7cb4519c03bee4d4f2c7ea6 Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/51548 Reviewed-by: Adam Langley <agl@google.com> 3 years ago			`return CRYPTO_is_AVX2_capable();`
acvp: add CMAC-AES support. Change by Dan Janni. Change-Id: I3f059e7b1a822c6f97128ca92a693499a3f7fa8f Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/41984 Commit-Queue: Adam Langley <agl@google.com> Reviewed-by: David Benjamin <davidben@google.com> 5 years ago			`}`


			`// Assembly functions.`

			`// RSAZ represents 1024-bit integers using unsaturated 29-bit limbs stored in`
			`// 64-bit integers. This requires 36 limbs but padded up to 40.`
			`//`
			`// See crypto/bn/asm/rsaz-avx2.pl for further details.`

			`// rsaz_1024_norm2red_avx2 converts \|norm\| from \|BIGNUM\| to RSAZ representation`
			`// and writes the result to \|red\|.`
			`void rsaz_1024_norm2red_avx2(BN_ULONG red[40], const BN_ULONG norm[16]);`

			`// rsaz_1024_mul_avx2 computes \|a\| * \|b\| mod \|n\| and writes the result to \|ret\|.`
			`// Inputs and outputs are in Montgomery form, using RSAZ's representation. \|k\|`
			`// is -\|n\|^-1 mod 2^64 or \|n0\| from \|BN_MONT_CTX\|.`
			`void rsaz_1024_mul_avx2(BN_ULONG ret[40], const BN_ULONG a[40],`
			`const BN_ULONG b[40], const BN_ULONG n[40], BN_ULONG k);`

			`// rsaz_1024_sqr_avx2 computes \|a\|^(2*\|count\|) mod \|n\| and writes the result to`
			`// \|ret\|. Inputs and outputs are in Montgomery form, using RSAZ's`
			`// representation. \|k\| is -\|n\|^-1 mod 2^64 or \|n0\| from \|BN_MONT_CTX\|.`
			`void rsaz_1024_sqr_avx2(BN_ULONG ret[40], const BN_ULONG a[40],`
			`const BN_ULONG n[40], BN_ULONG k, int count);`

			`// rsaz_1024_scatter5_avx2 stores \|val\| at index \|i\| of \|tbl\|. \|i\| must be`
			`// positive and at most 31. Note the table only uses 18 \|BN_ULONG\|s per entry`
			`// instead of 40. It packs two 29-bit limbs into each \|BN_ULONG\| and only stores`
			`// 36 limbs rather than the padded 40.`
			`void rsaz_1024_scatter5_avx2(BN_ULONG tbl[32 * 18], const BN_ULONG val[40],`
			`int i);`

			`// rsaz_1024_gather5_avx2 loads index \|i\| of \|tbl\| and writes it to \|val\|.`
			`void rsaz_1024_gather5_avx2(BN_ULONG val[40], const BN_ULONG tbl[32 * 18],`
			`int i);`

			`// rsaz_1024_red2norm_avx2 converts \|red\| from RSAZ to \|BIGNUM\| representation`
Add an extra reduction step to the end of RSAZ. RSAZ has a very similar bug to mont5 from https://boringssl-review.googlesource.com/c/boringssl/+/52825 and may return the modulus when it should return zero. As in that CL, there is no security impact on our cryptographic primitives. RSAZ is described in the paper "Software Implementation of Modular Exponentiation, Using Advanced Vector Instructions Architectures". The bug comes from RSAZ's use of "NRMM" or "Non Reduced Montgomery Multiplication". This is like normal Montgomery multiplication, but skips the final subtraction altogether (whereas mont5's AMM still subtracts, but replaces MM's tighter bound with just the carry bit). This would normally not be stable, but RSAZ picks a larger R > 4M, and maintains looser bounds for modular arithmetic, a < 2M. Lemma 1 from the paper proves that NRMM(a, b) preserves this 2M bound. It also claims NRMM(a, 1) < M. That is, conversion out of Montgomery form with NRMM is fully reduced. This second claim is wrong. The proof shows that NRMM(a, 1) < 1/2 + M, which only implies NRMM(a, 1) <= M, not NRMM(a, 1) < M. RSAZ relies on this to produce a reduced output (see Figure 7 in the paper). Thus, like mont5 with AMM, RSAZ may return the modulus when it should return zero. Fix this by adding a bn_reduce_once_in_place call at the end of the operation. Change-Id: If28bc49ae8dfbfb43bea02af5ea10c4209a1c6e6 Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/52827 Reviewed-by: Adam Langley <agl@google.com> Commit-Queue: David Benjamin <davidben@google.com> 3 years ago			`// and writes the result to \|norm\|. The result will be <= the modulus.`
			`//`
			`// WARNING: The result of this operation may not be fully reduced. \|norm\| may be`
			`// the modulus instead of zero. This function should be followed by a call to`
			`// \|bn_reduce_once\|.`
acvp: add CMAC-AES support. Change by Dan Janni. Change-Id: I3f059e7b1a822c6f97128ca92a693499a3f7fa8f Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/41984 Commit-Queue: Adam Langley <agl@google.com> Reviewed-by: David Benjamin <davidben@google.com> 5 years ago			`void rsaz_1024_red2norm_avx2(BN_ULONG norm[16], const BN_ULONG red[40]);`


			`#endif // !OPENSSL_NO_ASM && OPENSSL_X86_64`

			`#if defined(__cplusplus)`
			`} // extern "C"`
			`#endif`

			`#endif // OPENSSL_HEADER_BN_RSAZ_EXP_H`