Simplify the Lucky13 mitigation.

Rather than computing kVarianceBlocks, which is hard to reason about,
use a sha1_final_with_secret_suffix abstraction. This lets us separate
reasoning in bytes about the minimum and maximum values of |data_size|
and the interaction with HMAC from the core constant-time
SHA-1 update.

It's also faster, presumably due to the more accurate block counts.

Before:
Did 866000 AES-128-CBC-SHA1 (16 bytes) open operations in 2000697us (6.9 MB/sec)
Did 616000 AES-128-CBC-SHA1 (256 bytes) open operations in 2001403us (78.8 MB/sec)
Did 432000 AES-128-CBC-SHA1 (1350 bytes) open operations in 2003898us (291.0 MB/sec)
Did 148000 AES-128-CBC-SHA1 (8192 bytes) open operations in 2006042us (604.4 MB/sec)
Did 83000 AES-128-CBC-SHA1 (16384 bytes) open operations in 2010885us (676.3 MB/sec)

After:
Did 2089000 AES-128-CBC-SHA1 (16 bytes) open operations in 2000049us (16.7 MB/sec) [+141.3%]
Did 851000 AES-128-CBC-SHA1 (256 bytes) open operations in 2000034us (108.9 MB/sec) [+38.2%]
Did 553000 AES-128-CBC-SHA1 (1350 bytes) open operations in 2002169us (372.9 MB/sec) [+28.1%]
Did 178000 AES-128-CBC-SHA1 (8192 bytes) open operations in 2008596us (726.0 MB/sec) [+20.1%]
Did 98000 AES-128-CBC-SHA1 (16384 bytes) open operations in 2001509us (802.2 MB/sec) [+18.6%]

Confirmed with valgrind tooling that this is still constant-time. In
doing so, I ran into a new nuisance with GCC. In loops where we run
constant_time_lt with a counter value, GCC sometimes offsets the loop
counter by the secret. It cancels it out before dereferencing memory,
etc., but valgrind does not know that x + uninit - uninit = x and gets
upset. I've worked around this with a barrier for now.

Change-Id: Ieff8d2cad1b56c07999002e67ce4e6d6aa59e0d3
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/46686
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
grpc-202302
David Benjamin 4 years ago committed by CQ bot account: commit-bot@chromium.org
parent 00e434d67e
commit 669ffe64a4
  1. 50
      crypto/cipher_extra/cipher_test.cc
  2. 2
      crypto/cipher_extra/e_tls.c
  3. 17
      crypto/cipher_extra/internal.h
  4. 319
      crypto/cipher_extra/tls_cbc.c

@ -65,11 +65,14 @@
#include <openssl/cipher.h>
#include <openssl/err.h>
#include <openssl/nid.h>
#include <openssl/rand.h>
#include <openssl/sha.h>
#include <openssl/span.h>
#include "../test/file_test.h"
#include "../test/test_util.h"
#include "../test/wycheproof_util.h"
#include "./internal.h"
static const EVP_CIPHER *GetCipher(const std::string &name) {
@ -474,3 +477,50 @@ TEST(CipherTest, WycheproofAESCBC) {
}
});
}
TEST(CipherTest, SHA1WithSecretSuffix) {
  uint8_t data[SHA_CBLOCK * 4];
  RAND_bytes(data, sizeof(data));
  // The bytes are secret: hashing must take time independent of their values.
  CONSTTIME_SECRET(data, sizeof(data));

  // Covering every (prefix, max_len, len) triple is cubic in the block size,
  // so step through each range three bytes at a time.
  constexpr size_t kSkip = 3;
  // Keeping the step below eight ensures we still exercise the edge case
  // where the 8-byte length field spills over into the next block.
  static_assert(kSkip < 8, "kSkip is too large");

  // |EVP_sha1_final_with_secret_suffix| is sensitive to the public length of
  // the partial block previously hashed. In TLS this corresponds to the HMAC
  // prefix, the header, and the public minimum padding length.
  for (size_t prefix = 0; prefix < SHA_CBLOCK; prefix += kSkip) {
    SCOPED_TRACE(prefix);
    // The first block is handled specially, so let the secret length range
    // over as many as three blocks.
    for (size_t max_len = 0; max_len < 3 * SHA_CBLOCK; max_len += kSkip) {
      SCOPED_TRACE(max_len);
      for (size_t len = 0; len <= max_len; len += kSkip) {
        SCOPED_TRACE(len);

        // Reference digest, computed without the constant-time machinery.
        uint8_t want[SHA_DIGEST_LENGTH];
        SHA1(data, prefix + len, want);
        CONSTTIME_DECLASSIFY(want, sizeof(want));

        // Copy the length so marking it secret does not disturb the loop
        // counter.
        size_t secret_len = len;
        CONSTTIME_SECRET(&secret_len, sizeof(secret_len));

        SHA_CTX sha;
        SHA1_Init(&sha);
        SHA1_Update(&sha, data, prefix);
        uint8_t got[SHA_DIGEST_LENGTH];
        ASSERT_TRUE(EVP_sha1_final_with_secret_suffix(
            &sha, got, data + prefix, secret_len, max_len));

        CONSTTIME_DECLASSIFY(got, sizeof(got));
        EXPECT_EQ(Bytes(want), Bytes(got));
      }
    }
  }
}

@ -343,7 +343,7 @@ static int aead_tls_open(const EVP_AEAD_CTX *ctx, uint8_t *out, size_t *out_len,
if (EVP_CIPHER_CTX_mode(&tls_ctx->cipher_ctx) == EVP_CIPH_CBC_MODE &&
EVP_tls_cbc_record_digest_supported(tls_ctx->hmac_ctx.md)) {
if (!EVP_tls_cbc_digest_record(tls_ctx->hmac_ctx.md, mac, &mac_len,
ad_fixed, out, data_plus_mac_len, total,
ad_fixed, out, data_len, total,
tls_ctx->mac_key, tls_ctx->mac_key_len)) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT);
return 0;

@ -99,6 +99,17 @@ void EVP_tls_cbc_copy_mac(uint8_t *out, size_t md_size, const uint8_t *in,
// which EVP_tls_cbc_digest_record supports.
int EVP_tls_cbc_record_digest_supported(const EVP_MD *md);
// EVP_sha1_final_with_secret_suffix computes the result of hashing |len| bytes
// from |in| to |ctx| and writes the resulting hash to |out|. |len| is treated
// as secret and must be at most |max_len|, which is treated as public. |in|
// must point to a buffer of at least |max_len| bytes. It returns one on success
// and zero if inputs are too long.
//
// This function is exported for unit tests.
OPENSSL_EXPORT int EVP_sha1_final_with_secret_suffix(
SHA_CTX *ctx, uint8_t out[SHA_DIGEST_LENGTH], const uint8_t *in, size_t len,
size_t max_len);
// EVP_tls_cbc_digest_record computes the MAC of a decrypted, padded TLS
// record.
//
@ -108,8 +119,8 @@ int EVP_tls_cbc_record_digest_supported(const EVP_MD *md);
// md_out_size: the number of output bytes is written here.
// header: the 13-byte, TLS record header.
// data: the record data itself
// data_plus_mac_size: the secret, reported length of the data and MAC
// once the padding has been removed.
// data_size: the secret, reported length of the data once the padding and MAC
// have been removed.
// data_plus_mac_plus_padding_size: the public length of the whole
// record, including padding.
//
@ -119,7 +130,7 @@ int EVP_tls_cbc_record_digest_supported(const EVP_MD *md);
// padding too. )
int EVP_tls_cbc_digest_record(const EVP_MD *md, uint8_t *md_out,
size_t *md_out_size, const uint8_t header[13],
const uint8_t *data, size_t data_plus_mac_size,
const uint8_t *data, size_t data_size,
size_t data_plus_mac_plus_padding_size,
const uint8_t *mac_secret,
unsigned mac_secret_length);

@ -62,15 +62,6 @@
#include "../fipsmodule/cipher/internal.h"
// MAX_HASH_BIT_COUNT_BYTES is the maximum number of bytes in the hash's length
// field. (SHA-384/512 have 128-bit length.)
#define MAX_HASH_BIT_COUNT_BYTES 16
// MAX_HASH_BLOCK_SIZE is the maximum hash block size that we'll support.
// Currently SHA-384/512 has a 128-byte block size and that's the largest
// supported by TLS.)
#define MAX_HASH_BLOCK_SIZE 128
int EVP_tls_cbc_remove_padding(crypto_word_t *out_padding_ok, size_t *out_len,
const uint8_t *in, size_t in_len,
size_t block_size, size_t mac_size) {
@ -183,25 +174,97 @@ void EVP_tls_cbc_copy_mac(uint8_t *out, size_t md_size, const uint8_t *in,
OPENSSL_memcpy(out, rotated_mac, md_size);
}
// u32toBE serialises an unsigned, 32-bit number (n) as four bytes at (p) in
// big-endian order. The value of p is advanced by four.
#define u32toBE(n, p) \
do { \
*((p)++) = (uint8_t)((n) >> 24); \
*((p)++) = (uint8_t)((n) >> 16); \
*((p)++) = (uint8_t)((n) >> 8); \
*((p)++) = (uint8_t)((n)); \
} while (0)
// These functions serialize the state of a hash and thus perform the standard
// "final" operation without adding the padding and length that such a function
// typically does.
static void tls1_sha1_final_raw(SHA_CTX *ctx, uint8_t *md_out) {
u32toBE(ctx->h[0], md_out);
u32toBE(ctx->h[1], md_out);
u32toBE(ctx->h[2], md_out);
u32toBE(ctx->h[3], md_out);
u32toBE(ctx->h[4], md_out);
// Finishes a SHA-1 computation over |ctx|'s buffered partial block plus |len|
// bytes of |in|, in time that depends only on the public bound |max_len|, not
// on the secret |len|. See the declaration in internal.h; this is exported
// for unit tests.
int EVP_sha1_final_with_secret_suffix(SHA_CTX *ctx,
                                      uint8_t out[SHA_DIGEST_LENGTH],
                                      const uint8_t *in, size_t len,
                                      size_t max_len) {
  // Bound the input length so |total_bits| below fits in four bytes. This is
  // redundant with TLS record size limits. This also ensures |input_idx| below
  // does not overflow.
  size_t max_len_bits = max_len << 3;
  if (ctx->Nh != 0 ||
      (max_len_bits >> 3) != max_len ||  // Overflow
      ctx->Nl + max_len_bits < max_len_bits ||
      ctx->Nl + max_len_bits > UINT32_MAX) {
    return 0;
  }

  // We need to hash the following into |ctx|:
  //
  // - ctx->data[:ctx->num]
  // - in[:len]
  // - A 0x80 byte
  // - However many zero bytes are needed to pad up to a block.
  // - Eight bytes of length.
  //
  // SHA_CBLOCK is 64, so |>> 6| divides by the block size.
  size_t num_blocks = (ctx->num + len + 1 + 8 + SHA_CBLOCK - 1) >> 6;
  size_t last_block = num_blocks - 1;
  size_t max_blocks = (ctx->num + max_len + 1 + 8 + SHA_CBLOCK - 1) >> 6;

  // The bounds above imply |total_bits| fits in four bytes, serialized
  // big-endian as in the SHA-1 length encoding.
  size_t total_bits = ctx->Nl + (len << 3);
  uint8_t length_bytes[4];
  length_bytes[0] = (uint8_t)(total_bits >> 24);
  length_bytes[1] = (uint8_t)(total_bits >> 16);
  length_bytes[2] = (uint8_t)(total_bits >> 8);
  length_bytes[3] = (uint8_t)total_bits;

  // We now construct and process each expected block in constant-time. We
  // iterate |max_blocks| times (public), not |num_blocks| times (secret), so
  // the number of |SHA1_Transform| calls does not leak |len|.
  uint8_t block[SHA_CBLOCK] = {0};
  uint32_t result[5] = {0};
  // input_idx is the index into |in| corresponding to the current block.
  // However, we allow this index to overflow beyond |max_len|, to simplify the
  // 0x80 byte.
  size_t input_idx = 0;
  for (size_t i = 0; i < max_blocks; i++) {
    // Fill |block| with data from the partial block in |ctx| and |in|. We copy
    // as if we were hashing up to |max_len| and then zero the excess later.
    size_t block_start = 0;
    if (i == 0) {
      OPENSSL_memcpy(block, ctx->data, ctx->num);
      block_start = ctx->num;
    }
    if (input_idx < max_len) {
      size_t to_copy = SHA_CBLOCK - block_start;
      if (to_copy > max_len - input_idx) {
        to_copy = max_len - input_idx;
      }
      OPENSSL_memcpy(block + block_start, in + input_idx, to_copy);
    }

    // Zero any bytes beyond |len| and add the 0x80 byte (the SHA-1 padding
    // marker), selecting with masks rather than branches.
    for (size_t j = block_start; j < SHA_CBLOCK; j++) {
      // input[idx] corresponds to block[j].
      size_t idx = input_idx + j - block_start;
      // The barriers on |len| are not strictly necessary. However, without
      // them, GCC compiles this code by incorporating |len| into the loop
      // counter and subtracting it out later. This is still constant-time, but
      // it frustrates attempts to validate this.
      uint8_t is_in_bounds = constant_time_lt_8(idx, value_barrier_w(len));
      uint8_t is_padding_byte = constant_time_eq_8(idx, value_barrier_w(len));
      block[j] &= is_in_bounds;
      block[j] |= 0x80 & is_padding_byte;
    }
    input_idx += SHA_CBLOCK - block_start;

    // Fill in the length if this is the last block. The mask is all-ones only
    // when |i == last_block|, which is secret.
    crypto_word_t is_last_block = constant_time_eq_w(i, last_block);
    for (size_t j = 0; j < 4; j++) {
      block[SHA_CBLOCK - 4 + j] |= is_last_block & length_bytes[j];
    }

    // Process the block and save the hash state if it is the final value.
    // Every iteration runs |SHA1_Transform|; only the masked accumulation
    // into |result| differs, so timing does not depend on |last_block|.
    SHA1_Transform(ctx, block);
    for (size_t j = 0; j < 5; j++) {
      result[j] |= is_last_block & ctx->h[j];
    }
  }

  // Write the output.
  for (size_t i = 0; i < 5; i++) {
    CRYPTO_store_u32_be(out + 4 * i, result[i]);
  }
  return 1;
}
int EVP_tls_cbc_record_digest_supported(const EVP_MD *md) {
@ -210,18 +273,10 @@ int EVP_tls_cbc_record_digest_supported(const EVP_MD *md) {
// Computes the HMAC-SHA1 of a decrypted, padded TLS record without leaking
// |data_size| (the secret plaintext length); see the contract on the
// declaration in internal.h. Returns one on success and zero on error.
//
// NOTE(review): the surrounding diff interleaved the old kVarianceBlocks
// implementation with the new one; this is the reconstructed post-change
// function.
int EVP_tls_cbc_digest_record(const EVP_MD *md, uint8_t *md_out,
                              size_t *md_out_size, const uint8_t header[13],
                              const uint8_t *data, size_t data_size,
                              size_t data_plus_mac_plus_padding_size,
                              const uint8_t *mac_secret,
                              unsigned mac_secret_length) {
  if (EVP_MD_type(md) != NID_sha1) {
    // EVP_tls_cbc_record_digest_supported should have been called first to
    // check that the hash function is supported.
    assert(0);
    *md_out_size = 0;
    return 0;
  }

  if (mac_secret_length > SHA_CBLOCK) {
    // HMAC pads small keys with zeros and hashes large keys down. This function
    // should never reach the large key case.
    assert(0);
    return 0;
  }

  // Compute the initial HMAC block: the key zero-padded to a block and XORed
  // with the 0x36 ipad byte (RFC 2104).
  uint8_t hmac_pad[SHA_CBLOCK];
  OPENSSL_memset(hmac_pad, 0, sizeof(hmac_pad));
  OPENSSL_memcpy(hmac_pad, mac_secret, mac_secret_length);
  for (size_t i = 0; i < SHA_CBLOCK; i++) {
    hmac_pad[i] ^= 0x36;
  }

  SHA_CTX ctx;
  SHA1_Init(&ctx);
  SHA1_Update(&ctx, hmac_pad, SHA_CBLOCK);
  SHA1_Update(&ctx, header, 13);

  // There are at most 256 bytes of padding, so we can compute the public
  // minimum length for |data_size|.
  size_t min_data_size = 0;
  if (data_plus_mac_plus_padding_size > SHA_DIGEST_LENGTH + 256) {
    min_data_size = data_plus_mac_plus_padding_size - SHA_DIGEST_LENGTH - 256;
  }

  // Hash the public minimum length directly. This reduces the number of blocks
  // that must be computed in constant-time.
  SHA1_Update(&ctx, data, min_data_size);

  // Hash the remaining data without leaking |data_size|.
  uint8_t mac_out[SHA_DIGEST_LENGTH];
  if (!EVP_sha1_final_with_secret_suffix(
          &ctx, mac_out, data + min_data_size, data_size - min_data_size,
          data_plus_mac_plus_padding_size - min_data_size)) {
    return 0;
  }

  // Complete the HMAC in the standard manner. XORing with 0x6a converts the
  // 0x36 ipad already in |hmac_pad| to the 0x5c opad.
  SHA1_Init(&ctx);
  for (size_t i = 0; i < SHA_CBLOCK; i++) {
    hmac_pad[i] ^= 0x6a;
  }
  SHA1_Update(&ctx, hmac_pad, SHA_CBLOCK);
  SHA1_Update(&ctx, mac_out, SHA_DIGEST_LENGTH);
  SHA1_Final(md_out, &ctx);
  *md_out_size = SHA_DIGEST_LENGTH;
  return 1;
}

Loading…
Cancel
Save