Simplify the Lucky13 mitigation.

Rather than computing kVarianceBlocks, which is hard to reason about,
use a sha1_final_with_secret_suffix abstraction. This lets us separate
reasoning in bytes about the minimum and maximum values of |data_size|
and the interaction with HMAC from the core constant-time
SHA-1 update.

It's also faster, presumably due to the more accurate block counts.

Before:
Did 866000 AES-128-CBC-SHA1 (16 bytes) open operations in 2000697us (6.9 MB/sec)
Did 616000 AES-128-CBC-SHA1 (256 bytes) open operations in 2001403us (78.8 MB/sec)
Did 432000 AES-128-CBC-SHA1 (1350 bytes) open operations in 2003898us (291.0 MB/sec)
Did 148000 AES-128-CBC-SHA1 (8192 bytes) open operations in 2006042us (604.4 MB/sec)
Did 83000 AES-128-CBC-SHA1 (16384 bytes) open operations in 2010885us (676.3 MB/sec)

After:
Did 2089000 AES-128-CBC-SHA1 (16 bytes) open operations in 2000049us (16.7 MB/sec) [+141.3%]
Did 851000 AES-128-CBC-SHA1 (256 bytes) open operations in 2000034us (108.9 MB/sec) [+38.2%]
Did 553000 AES-128-CBC-SHA1 (1350 bytes) open operations in 2002169us (372.9 MB/sec) [+28.1%]
Did 178000 AES-128-CBC-SHA1 (8192 bytes) open operations in 2008596us (726.0 MB/sec) [+20.1%]
Did 98000 AES-128-CBC-SHA1 (16384 bytes) open operations in 2001509us (802.2 MB/sec) [+18.6%]

Confirmed with valgrind tooling that this is still constant-time. In
doing so, I ran into a new nuisance with GCC. In loops where we run
constant_time_lt with a counter value, GCC sometimes offsets the loop
counter by the secret. It cancels it out before dereferencing memory,
etc., but valgrind does not know that x + uninit - uninit = x and gets
upset. I've worked around this with a barrier for now.

Change-Id: Ieff8d2cad1b56c07999002e67ce4e6d6aa59e0d3
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/46686
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
grpc-202302
David Benjamin 4 years ago committed by CQ bot account: commit-bot@chromium.org
parent 00e434d67e
commit 669ffe64a4
  1. 50
      crypto/cipher_extra/cipher_test.cc
  2. 2
      crypto/cipher_extra/e_tls.c
  3. 17
      crypto/cipher_extra/internal.h
  4. 319
      crypto/cipher_extra/tls_cbc.c

@ -65,11 +65,14 @@
#include <openssl/cipher.h>
#include <openssl/err.h>
#include <openssl/nid.h>
#include <openssl/rand.h>
#include <openssl/sha.h>
#include <openssl/span.h>
#include "../test/file_test.h"
#include "../test/test_util.h"
#include "../test/wycheproof_util.h"
#include "./internal.h"
static const EVP_CIPHER *GetCipher(const std::string &name) {
@ -474,3 +477,50 @@ TEST(CipherTest, WycheproofAESCBC) {
}
});
}
TEST(CipherTest, SHA1WithSecretSuffix) {
  uint8_t data[SHA_CBLOCK * 4];
  RAND_bytes(data, sizeof(data));
  // The bytes are secret: hashing must take time independent of their values.
  CONSTTIME_SECRET(data, sizeof(data));

  // Covering every (prefix, max_len, len) triple is cubic in the block size,
  // so step through each range three bytes at a time.
  constexpr size_t kSkip = 3;
  // Keeping the step below eight ensures we still exercise the edge case
  // where the 8-byte length field spills over into the next block.
  static_assert(kSkip < 8, "kSkip is too large");

  // |EVP_sha1_final_with_secret_suffix| is sensitive to the public length of
  // the partial block previously hashed. In TLS this corresponds to the HMAC
  // prefix, the header, and the public minimum padding length.
  for (size_t prefix = 0; prefix < SHA_CBLOCK; prefix += kSkip) {
    SCOPED_TRACE(prefix);
    // The first block is handled specially, so let the secret length range
    // over as many as three blocks.
    for (size_t max_len = 0; max_len < 3 * SHA_CBLOCK; max_len += kSkip) {
      SCOPED_TRACE(max_len);
      for (size_t len = 0; len <= max_len; len += kSkip) {
        SCOPED_TRACE(len);

        // Reference digest, computed without the constant-time machinery.
        uint8_t want[SHA_DIGEST_LENGTH];
        SHA1(data, prefix + len, want);
        CONSTTIME_DECLASSIFY(want, sizeof(want));

        // Copy the length so marking it secret does not disturb the loop
        // counter.
        size_t secret_len = len;
        CONSTTIME_SECRET(&secret_len, sizeof(secret_len));

        SHA_CTX sha;
        SHA1_Init(&sha);
        SHA1_Update(&sha, data, prefix);
        uint8_t got[SHA_DIGEST_LENGTH];
        ASSERT_TRUE(EVP_sha1_final_with_secret_suffix(
            &sha, got, data + prefix, secret_len, max_len));

        CONSTTIME_DECLASSIFY(got, sizeof(got));
        EXPECT_EQ(Bytes(want), Bytes(got));
      }
    }
  }
}

@ -343,7 +343,7 @@ static int aead_tls_open(const EVP_AEAD_CTX *ctx, uint8_t *out, size_t *out_len,
if (EVP_CIPHER_CTX_mode(&tls_ctx->cipher_ctx) == EVP_CIPH_CBC_MODE &&
EVP_tls_cbc_record_digest_supported(tls_ctx->hmac_ctx.md)) {
if (!EVP_tls_cbc_digest_record(tls_ctx->hmac_ctx.md, mac, &mac_len,
ad_fixed, out, data_plus_mac_len, total,
ad_fixed, out, data_len, total,
tls_ctx->mac_key, tls_ctx->mac_key_len)) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT);
return 0;

@ -99,6 +99,17 @@ void EVP_tls_cbc_copy_mac(uint8_t *out, size_t md_size, const uint8_t *in,
// which EVP_tls_cbc_digest_record supports.
int EVP_tls_cbc_record_digest_supported(const EVP_MD *md);
// EVP_sha1_final_with_secret_suffix computes the result of hashing |len| bytes
// from |in| to |ctx| and writes the resulting hash to |out|. |len| is treated
// as secret and must be at most |max_len|, which is treated as public. |in|
// must point to a buffer of at least |max_len| bytes. It returns one on success
// and zero if inputs are too long.
//
// This function is exported for unit tests.
OPENSSL_EXPORT int EVP_sha1_final_with_secret_suffix(
SHA_CTX *ctx, uint8_t out[SHA_DIGEST_LENGTH], const uint8_t *in, size_t len,
size_t max_len);
// EVP_tls_cbc_digest_record computes the MAC of a decrypted, padded TLS
// record.
//
@ -108,8 +119,8 @@ int EVP_tls_cbc_record_digest_supported(const EVP_MD *md);
// md_out_size: the number of output bytes is written here.
// header: the 13-byte, TLS record header.
// data: the record data itself
// data_plus_mac_size: the secret, reported length of the data and MAC
// once the padding has been removed.
// data_size: the secret, reported length of the data once the padding and MAC
// have been removed.
// data_plus_mac_plus_padding_size: the public length of the whole
// record, including padding.
//
@ -119,7 +130,7 @@ int EVP_tls_cbc_record_digest_supported(const EVP_MD *md);
// padding too. )
int EVP_tls_cbc_digest_record(const EVP_MD *md, uint8_t *md_out,
size_t *md_out_size, const uint8_t header[13],
const uint8_t *data, size_t data_plus_mac_size,
const uint8_t *data, size_t data_size,
size_t data_plus_mac_plus_padding_size,
const uint8_t *mac_secret,
unsigned mac_secret_length);

@ -62,15 +62,6 @@
#include "../fipsmodule/cipher/internal.h"
// MAX_HASH_BIT_COUNT_BYTES is the maximum number of bytes in the hash's length
// field. (SHA-384/512 have 128-bit length.)
#define MAX_HASH_BIT_COUNT_BYTES 16
// MAX_HASH_BLOCK_SIZE is the maximum hash block size that we'll support.
// Currently SHA-384/512 has a 128-byte block size and that's the largest
// supported by TLS.)
#define MAX_HASH_BLOCK_SIZE 128
int EVP_tls_cbc_remove_padding(crypto_word_t *out_padding_ok, size_t *out_len,
const uint8_t *in, size_t in_len,
size_t block_size, size_t mac_size) {
@ -183,25 +174,97 @@ void EVP_tls_cbc_copy_mac(uint8_t *out, size_t md_size, const uint8_t *in,
OPENSSL_memcpy(out, rotated_mac, md_size);
}
// u32toBE serialises an unsigned, 32-bit number (n) as four bytes at (p) in
// big-endian order. The value of p is advanced by four.
#define u32toBE(n, p) \
do { \
*((p)++) = (uint8_t)((n) >> 24); \
*((p)++) = (uint8_t)((n) >> 16); \
*((p)++) = (uint8_t)((n) >> 8); \
*((p)++) = (uint8_t)((n)); \
} while (0)
// These functions serialize the state of a hash and thus perform the standard
// "final" operation without adding the padding and length that such a function
// typically does.
static void tls1_sha1_final_raw(SHA_CTX *ctx, uint8_t *md_out) {
u32toBE(ctx->h[0], md_out);
u32toBE(ctx->h[1], md_out);
u32toBE(ctx->h[2], md_out);
u32toBE(ctx->h[3], md_out);
u32toBE(ctx->h[4], md_out);
// Finishes a SHA-1 computation over |ctx|'s buffered partial block plus |len|
// bytes of |in|, in time that depends only on the public bound |max_len|, not
// on the secret |len|. See the declaration in internal.h; this is exported
// for unit tests.
int EVP_sha1_final_with_secret_suffix(SHA_CTX *ctx,
                                      uint8_t out[SHA_DIGEST_LENGTH],
                                      const uint8_t *in, size_t len,
                                      size_t max_len) {
  // Bound the input length so |total_bits| below fits in four bytes. This is
  // redundant with TLS record size limits. This also ensures |input_idx| below
  // does not overflow.
  size_t max_len_bits = max_len << 3;
  if (ctx->Nh != 0 ||
      (max_len_bits >> 3) != max_len ||  // Overflow
      ctx->Nl + max_len_bits < max_len_bits ||
      ctx->Nl + max_len_bits > UINT32_MAX) {
    return 0;
  }

  // We need to hash the following into |ctx|:
  //
  // - ctx->data[:ctx->num]
  // - in[:len]
  // - A 0x80 byte
  // - However many zero bytes are needed to pad up to a block.
  // - Eight bytes of length.
  //
  // SHA_CBLOCK is 64, so |>> 6| divides by the block size.
  size_t num_blocks = (ctx->num + len + 1 + 8 + SHA_CBLOCK - 1) >> 6;
  size_t last_block = num_blocks - 1;
  size_t max_blocks = (ctx->num + max_len + 1 + 8 + SHA_CBLOCK - 1) >> 6;

  // The bounds above imply |total_bits| fits in four bytes, serialized
  // big-endian as in the SHA-1 length encoding.
  size_t total_bits = ctx->Nl + (len << 3);
  uint8_t length_bytes[4];
  length_bytes[0] = (uint8_t)(total_bits >> 24);
  length_bytes[1] = (uint8_t)(total_bits >> 16);
  length_bytes[2] = (uint8_t)(total_bits >> 8);
  length_bytes[3] = (uint8_t)total_bits;

  // We now construct and process each expected block in constant-time. We
  // iterate |max_blocks| times (public), not |num_blocks| times (secret), so
  // the number of |SHA1_Transform| calls does not leak |len|.
  uint8_t block[SHA_CBLOCK] = {0};
  uint32_t result[5] = {0};
  // input_idx is the index into |in| corresponding to the current block.
  // However, we allow this index to overflow beyond |max_len|, to simplify the
  // 0x80 byte.
  size_t input_idx = 0;
  for (size_t i = 0; i < max_blocks; i++) {
    // Fill |block| with data from the partial block in |ctx| and |in|. We copy
    // as if we were hashing up to |max_len| and then zero the excess later.
    size_t block_start = 0;
    if (i == 0) {
      OPENSSL_memcpy(block, ctx->data, ctx->num);
      block_start = ctx->num;
    }
    if (input_idx < max_len) {
      size_t to_copy = SHA_CBLOCK - block_start;
      if (to_copy > max_len - input_idx) {
        to_copy = max_len - input_idx;
      }
      OPENSSL_memcpy(block + block_start, in + input_idx, to_copy);
    }

    // Zero any bytes beyond |len| and add the 0x80 byte (the SHA-1 padding
    // marker), selecting with masks rather than branches.
    for (size_t j = block_start; j < SHA_CBLOCK; j++) {
      // input[idx] corresponds to block[j].
      size_t idx = input_idx + j - block_start;
      // The barriers on |len| are not strictly necessary. However, without
      // them, GCC compiles this code by incorporating |len| into the loop
      // counter and subtracting it out later. This is still constant-time, but
      // it frustrates attempts to validate this.
      uint8_t is_in_bounds = constant_time_lt_8(idx, value_barrier_w(len));
      uint8_t is_padding_byte = constant_time_eq_8(idx, value_barrier_w(len));
      block[j] &= is_in_bounds;
      block[j] |= 0x80 & is_padding_byte;
    }
    input_idx += SHA_CBLOCK - block_start;

    // Fill in the length if this is the last block. The mask is all-ones only
    // when |i == last_block|, which is secret.
    crypto_word_t is_last_block = constant_time_eq_w(i, last_block);
    for (size_t j = 0; j < 4; j++) {
      block[SHA_CBLOCK - 4 + j] |= is_last_block & length_bytes[j];
    }

    // Process the block and save the hash state if it is the final value.
    // Every iteration runs |SHA1_Transform|; only the masked accumulation
    // into |result| differs, so timing does not depend on |last_block|.
    SHA1_Transform(ctx, block);
    for (size_t j = 0; j < 5; j++) {
      result[j] |= is_last_block & ctx->h[j];
    }
  }

  // Write the output.
  for (size_t i = 0; i < 5; i++) {
    CRYPTO_store_u32_be(out + 4 * i, result[i]);
  }
  return 1;
}
int EVP_tls_cbc_record_digest_supported(const EVP_MD *md) {
@ -210,18 +273,10 @@ int EVP_tls_cbc_record_digest_supported(const EVP_MD *md) {
// Computes the HMAC-SHA1 of a decrypted, padded TLS record without leaking
// |data_size| (the secret plaintext length); see the contract on the
// declaration in internal.h. Returns one on success and zero on error.
//
// NOTE(review): the surrounding diff interleaved the old kVarianceBlocks
// implementation with the new one; this is the reconstructed post-change
// function.
int EVP_tls_cbc_digest_record(const EVP_MD *md, uint8_t *md_out,
                              size_t *md_out_size, const uint8_t header[13],
                              const uint8_t *data, size_t data_size,
                              size_t data_plus_mac_plus_padding_size,
                              const uint8_t *mac_secret,
                              unsigned mac_secret_length) {
  if (EVP_MD_type(md) != NID_sha1) {
    // EVP_tls_cbc_record_digest_supported should have been called first to
    // check that the hash function is supported.
    assert(0);
    *md_out_size = 0;
    return 0;
  }

  if (mac_secret_length > SHA_CBLOCK) {
    // HMAC pads small keys with zeros and hashes large keys down. This function
    // should never reach the large key case.
    assert(0);
    return 0;
  }

  // Compute the initial HMAC block: the key zero-padded to a block and XORed
  // with the 0x36 ipad byte (RFC 2104).
  uint8_t hmac_pad[SHA_CBLOCK];
  OPENSSL_memset(hmac_pad, 0, sizeof(hmac_pad));
  OPENSSL_memcpy(hmac_pad, mac_secret, mac_secret_length);
  for (size_t i = 0; i < SHA_CBLOCK; i++) {
    hmac_pad[i] ^= 0x36;
  }

  SHA_CTX ctx;
  SHA1_Init(&ctx);
  SHA1_Update(&ctx, hmac_pad, SHA_CBLOCK);
  SHA1_Update(&ctx, header, 13);

  // There are at most 256 bytes of padding, so we can compute the public
  // minimum length for |data_size|.
  size_t min_data_size = 0;
  if (data_plus_mac_plus_padding_size > SHA_DIGEST_LENGTH + 256) {
    min_data_size = data_plus_mac_plus_padding_size - SHA_DIGEST_LENGTH - 256;
  }

  // Hash the public minimum length directly. This reduces the number of blocks
  // that must be computed in constant-time.
  SHA1_Update(&ctx, data, min_data_size);

  // Hash the remaining data without leaking |data_size|.
  uint8_t mac_out[SHA_DIGEST_LENGTH];
  if (!EVP_sha1_final_with_secret_suffix(
          &ctx, mac_out, data + min_data_size, data_size - min_data_size,
          data_plus_mac_plus_padding_size - min_data_size)) {
    return 0;
  }

  // Complete the HMAC in the standard manner. XORing with 0x6a converts the
  // 0x36 ipad already in |hmac_pad| to the 0x5c opad.
  SHA1_Init(&ctx);
  for (size_t i = 0; i < SHA_CBLOCK; i++) {
    hmac_pad[i] ^= 0x6a;
  }
  SHA1_Update(&ctx, hmac_pad, SHA_CBLOCK);
  SHA1_Update(&ctx, mac_out, SHA_DIGEST_LENGTH);
  SHA1_Final(md_out, &ctx);
  *md_out_size = SHA_DIGEST_LENGTH;
  return 1;
}

Loading…
Cancel
Save