Remove unions in EC_SCALAR and EC_FELEM.

When introducing EC_SCALAR and EC_FELEM, I used unions as convenience for converting to and from the byte representation. However, type-punning with unions is not allowed in C++ and hard to use correctly in C. As I understand the rules, they are: - The abstract machine knows what member of union was last written to. - In C, reading from an inactive member is defined to type-pun. In C++, it is UB though some compilers promise the C behavior anyway. - However, if you read or write from a *pointer* to a union member, the strict aliasing rule applies. (A function passed two pointers of different types otherwise needs to pessimally assume they came from the same union.) That last rule means the type-punning allowance doesn't apply if you take a pointer to an inactive member, and it's common to abstract otherwise direct accesses of members via pointers. https://github.com/openssl/openssl/issues/18225 is an example where similar union tricks have caused problems for OpenSSL. While we don't have that code, EC_SCALAR and EC_FELEM play similar tricks. We do get a second lifeline because our alternate view is a uint8_t, which we require to be unsigned char. Strict aliasing always allows the pointer type to be a character type, so pointer-indirected accesses of EC_SCALAR.bytes aren't necessarily UB. But if we ever write to EC_SCALAR.bytes directly (and we do), we'll switch the active arm and then pointers to EC_SCALAR.words become strict aliasing violations! This is all far too complicated to deal with. Ideally everyone would build with -fno-strict-aliasing because no real C code actually follows these rules. But we don't always control our downstream consumers' CFLAGS, so let's just avoid the union. This also avoids a pitfall if we ever move libcrypto to C++. For p224-64.c, I just converted the representations directly, which avoids worrying about the top 32 bits in p224_felem_to_generic. Most of the rest was words vs. bytes conversions and boils down to a cast (we're still dealing with a character type, at the end of the day). But I took the opportunity to extract some more "words"-based helper functions out of BIGNUM, so the casts would only be in one place. That too saves us from the top bits problem in the bytes-to-words direction. Bug: 301 Change-Id: I3285a86441daaf824a4f6862e825d463a669efdb Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/52505 Commit-Queue: Bob Beck <bbe@google.com> Reviewed-by: Bob Beck <bbe@google.com>
3 years ago · 227ff6e642
parent 3f180b8221
commit 227ff6e642
13 changed files with 219 additions and 201 deletions
--- a/crypto/fipsmodule/bn/bytes.c
+++ b/crypto/fipsmodule/bn/bytes.c
@ -61,19 +61,38 @@
 #include "internal.h"
 void bn_big_endian_to_words(BN_ULONG *out, size_t out_len, const uint8_t *in,
                            size_t in_len) {
  for (size_t i = 0; i < out_len; i++) {
    if (in_len < sizeof(BN_ULONG)) {
      // Load the last partial word.
      BN_ULONG word = 0;
      for (size_t j = 0; j < in_len; j++) {
        word = (word << 8) | in[j];
      }
      in_len = 0;
      out[i] = word;
      // Fill the remainder with zeros.
      OPENSSL_memset(out + i + 1, 0, (out_len - i - 1) * sizeof(BN_ULONG));
      break;
    }
-BIGNUM *BN_bin2bn(const uint8_t *in, size_t len, BIGNUM *ret) {
+    in_len -= sizeof(BN_ULONG);
-  size_t num_words;
+    out[i] = CRYPTO_load_word_be(in + in_len);
  unsigned m;
  BN_ULONG word = 0;
  BIGNUM *bn = NULL;
  if (ret == NULL) {
    ret = bn = BN_new();
  }
  // The caller should have sized the output to avoid truncation.
  assert(in_len == 0);
 }
 BIGNUM *BN_bin2bn(const uint8_t *in, size_t len, BIGNUM *ret) {
  BIGNUM *bn = NULL;
  if (ret == NULL) {
-    return NULL;
+    bn = BN_new();
    if (bn == NULL) {
      return NULL;
    }
    ret = bn;
  }
  if (len == 0) {
@ -81,12 +100,9 @@ BIGNUM *BN_bin2bn(const uint8_t *in, size_t len, BIGNUM *ret) {
    return ret;
  }
-  num_words = ((len - 1) / BN_BYTES) + 1;
+  size_t num_words = ((len - 1) / BN_BYTES) + 1;
  m = (len - 1) % BN_BYTES;
  if (!bn_wexpand(ret, num_words)) {
-    if (bn) {
+    BN_free(bn);
      BN_free(bn);
    }
    return NULL;
  }
@ -96,15 +112,7 @@ BIGNUM *BN_bin2bn(const uint8_t *in, size_t len, BIGNUM *ret) {
  ret->width = (int)num_words;
  ret->neg = 0;
-  while (len--) {
+  bn_big_endian_to_words(ret->d, ret->width, in, len);
    word = (word << 8) | *(in++);
    if (m-- == 0) {
      ret->d[--num_words] = word;
      word = 0;
      m = BN_BYTES - 1;
    }
  }
  return ret;
 }
@ -112,13 +120,12 @@ BIGNUM *BN_le2bn(const uint8_t *in, size_t len, BIGNUM *ret) {
  BIGNUM *bn = NULL;
  if (ret == NULL) {
    bn = BN_new();
    if (bn == NULL) {
      return NULL;
    }
    ret = bn;
  }
  if (ret == NULL) {
    return NULL;
  }
  if (len == 0) {
    ret->width = 0;
    ret->neg = 0;
@ -142,38 +149,58 @@ BIGNUM *BN_le2bn(const uint8_t *in, size_t len, BIGNUM *ret) {
  return ret;
 }
-size_t BN_bn2bin(const BIGNUM *in, uint8_t *out) {
+// fits_in_bytes returns one if the |num_words| words in |words| can be
-  size_t n, i;
+// represented in |num_bytes| bytes.
-  BN_ULONG l;
+static int fits_in_bytes(const BN_ULONG *words, size_t num_words,
-
+                         size_t num_bytes) {
-  n = i = BN_num_bytes(in);
+  const uint8_t *bytes = (const uint8_t *)words;
-  while (i--) {
+  size_t tot_bytes = num_words * sizeof(BN_ULONG);
    l = in->d[i / BN_BYTES];
    *(out++) = (unsigned char)(l >> (8 * (i % BN_BYTES))) & 0xff;
  }
  return n;
 }
 static int fits_in_bytes(const uint8_t *bytes, size_t num_bytes, size_t len) {
  uint8_t mask = 0;
-  for (size_t i = len; i < num_bytes; i++) {
+  for (size_t i = num_bytes; i < tot_bytes; i++) {
    mask |= bytes[i];
  }
  return mask == 0;
 }
 void bn_words_to_big_endian(uint8_t *out, size_t out_len, const BN_ULONG *in,
                            size_t in_len) {
  // The caller should have selected an output length without truncation.
  assert(fits_in_bytes(in, in_len, out_len));
  // We only support little-endian platforms, so the internal representation is
  // also little-endian as bytes. We can simply copy it in reverse.
  const uint8_t *bytes = (const uint8_t *)in;
  size_t num_bytes = in_len * sizeof(BN_ULONG);
  if (out_len < num_bytes) {
    num_bytes = out_len;
  }
  for (size_t i = 0; i < num_bytes; i++) {
    out[out_len - i - 1] = bytes[i];
  }
  // Pad out the rest of the buffer with zeroes.
  OPENSSL_memset(out, 0, out_len - num_bytes);
 }
 size_t BN_bn2bin(const BIGNUM *in, uint8_t *out) {
  size_t n = BN_num_bytes(in);
  bn_words_to_big_endian(out, n, in->d, in->width);
  return n;
 }
 int BN_bn2le_padded(uint8_t *out, size_t len, const BIGNUM *in) {
  if (!fits_in_bytes(in->d, in->width, len)) {
    return 0;
  }
  // We only support little-endian platforms, so we can simply memcpy into the
  // internal representation.
  const uint8_t *bytes = (const uint8_t *)in->d;
  size_t num_bytes = in->width * BN_BYTES;
  if (len < num_bytes) {
    if (!fits_in_bytes(bytes, num_bytes, len)) {
      return 0;
    }
    num_bytes = len;
  }
  // We only support little-endian platforms, so we can simply memcpy into the
  // internal representation.
  OPENSSL_memcpy(out, bytes, num_bytes);
  // Pad out the rest of the buffer with zeroes.
  OPENSSL_memset(out + num_bytes, 0, len - num_bytes);
@ -181,22 +208,11 @@ int BN_bn2le_padded(uint8_t *out, size_t len, const BIGNUM *in) {
 }
 int BN_bn2bin_padded(uint8_t *out, size_t len, const BIGNUM *in) {
-  const uint8_t *bytes = (const uint8_t *)in->d;
+  if (!fits_in_bytes(in->d, in->width, len)) {
-  size_t num_bytes = in->width * BN_BYTES;
+    return 0;
  if (len < num_bytes) {
    if (!fits_in_bytes(bytes, num_bytes, len)) {
      return 0;
    }
    num_bytes = len;
  }
-  // We only support little-endian platforms, so we can simply write the buffer
+  bn_words_to_big_endian(out, len, in->d, in->width);
  // in reverse.
  for (size_t i = 0; i < num_bytes; i++) {
    out[len - i - 1] = bytes[i];
  }
  // Pad out the rest of the buffer with zeroes.
  OPENSSL_memset(out, 0, len - num_bytes);
  return 1;
 }
--- a/crypto/fipsmodule/bn/internal.h
+++ b/crypto/fipsmodule/bn/internal.h
@ -708,6 +708,25 @@ void bn_mod_inverse0_prime_mont_small(BN_ULONG *r, const BN_ULONG *a,
                                      size_t num, const BN_MONT_CTX *mont);
 // Word-based byte conversion functions.
 // bn_big_endian_to_words interprets |in_len| bytes from |in| as a big-endian,
 // unsigned integer and writes the result to |out_len| words in |out|. |out_len|
 // must be large enough to represent any |in_len|-byte value. That is, |out_len|
 // must be at least |BN_BYTES * in_len|.
 void bn_big_endian_to_words(BN_ULONG *out, size_t out_len, const uint8_t *in,
                            size_t in_len);
 // bn_words_to_big_endian represents |in_len| words from |in| as a big-endian,
 // unsigned integer in |out_len| bytes. It writes the result to |out|. |out_len|
 // must be large enough to represent |in| without truncation.
 //
 // Note |out_len| may be less than |BN_BYTES * in_len| if |in| is known to have
 // leading zeros.
 void bn_words_to_big_endian(uint8_t *out, size_t out_len, const BN_ULONG *in,
                            size_t in_len);
 #if defined(__cplusplus)
 }  // extern C
 #endif
--- a/crypto/fipsmodule/ec/ec.c
+++ b/crypto/fipsmodule/ec/ec.c
@ -1175,15 +1175,12 @@ int ec_get_x_coordinate_as_scalar(const EC_GROUP *group, EC_SCALAR *out,
    return 0;
  }
-  // For simplicity, in case of width mismatches between |group->field| and
+  // The x-coordinate is bounded by p, but we need a scalar, bounded by the
-  // |group->order|, zero any untouched words in |out|.
+  // order. These may not have the same size. However, we must have p < 2×order,
-  OPENSSL_memset(out, 0, sizeof(EC_SCALAR));
+  // assuming p is not tiny (p >= 17).
-  for (size_t i = 0; i < len; i++) {
+  //
-    out->bytes[len - i - 1] = bytes[i];
+  // Thus |bytes| will fit in |order.width + 1| words, and we can reduce by
-  }
+  // performing at most one subtraction.
  // We must have p < 2×order, assuming p is not tiny (p >= 17). Thus rather we
  // can reduce by performing at most one subtraction.
  //
  // Proof: We only work with prime order curves, so the number of points on
  // the curve is the order. Thus Hasse's theorem gives:
@ -1197,14 +1194,11 @@ int ec_get_x_coordinate_as_scalar(const EC_GROUP *group, EC_SCALAR *out,
  //
  // Additionally, one can manually check this property for built-in curves. It
  // is enforced for legacy custom curves in |EC_GROUP_set_generator|.
-
+  const BIGNUM *order = &group->order;
-  // The above does not guarantee |group->field| is not one word larger than
+  BN_ULONG words[EC_MAX_WORDS + 1];
-  // |group->order|, so read one extra carry word.
+  bn_big_endian_to_words(words, order->width + 1, bytes, len);
-  BN_ULONG tmp[EC_MAX_WORDS];
+  bn_reduce_once(out->words, words, /*carry=*/words[order->width], order->d,
-  BN_ULONG carry =
+                 order->width);
      group->order.width < EC_MAX_WORDS ? out->words[group->order.width] : 0;
  bn_reduce_once_in_place(out->words, carry, group->order.d, tmp,
                          group->order.width);
  return 1;
 }
--- a/crypto/fipsmodule/ec/internal.h
+++ b/crypto/fipsmodule/ec/internal.h
@ -100,9 +100,7 @@ OPENSSL_STATIC_ASSERT(EC_MAX_WORDS <= BN_SMALL_MAX_WORDS,
 // An EC_SCALAR is an integer fully reduced modulo the order. Only the first
 // |order->width| words are used. An |EC_SCALAR| is specific to an |EC_GROUP|
 // and must not be mixed between groups.
-typedef union {
+typedef struct {
  // bytes is the representation of the scalar in little-endian order.
  uint8_t bytes[EC_MAX_BYTES];
  BN_ULONG words[EC_MAX_WORDS];
 } EC_SCALAR;
@ -192,9 +190,7 @@ void ec_scalar_select(const EC_GROUP *group, EC_SCALAR *out, BN_ULONG mask,
 // are used. An |EC_FELEM| is specific to an |EC_GROUP| and must not be mixed
 // between groups. Additionally, the representation (whether or not elements are
 // represented in Montgomery-form) may vary between |EC_METHOD|s.
-typedef union {
+typedef struct {
  // bytes is the representation of the field element in little-endian order.
  uint8_t bytes[EC_MAX_BYTES];
  BN_ULONG words[EC_MAX_WORDS];
 } EC_FELEM;
--- a/crypto/fipsmodule/ec/p224-64.c
+++ b/crypto/fipsmodule/ec/p224-64.c
@ -52,11 +52,6 @@ typedef uint128_t p224_widelimb;
 typedef p224_limb p224_felem[4];
 typedef p224_widelimb p224_widefelem[7];
 // Field element represented as a byte arrary. 28*8 = 224 bits is also the
 // group order size for the elliptic curve, and we also use this type for
 // scalars for point multiplication.
 typedef uint8_t p224_felem_bytearray[28];
 // Precomputed multiples of the standard generator
 // Points are given in coordinates (X, Y, Z) where Z normally is 1
 // (0 for the point at infinity).
@ -180,31 +175,16 @@ static const p224_felem g_p224_pre_comp[2][16][3] = {
      {0x32477c61b6e8c6, 0xb46a97570f018b, 0x91176d0a7e95d1, 0x3df90fbc4c7d0e},
      {1, 0, 0, 0}}}};
 static uint64_t p224_load_u64(const uint8_t in[8]) {
  uint64_t ret;
  OPENSSL_memcpy(&ret, in, sizeof(ret));
  return ret;
 }
 // Helper functions to convert field elements to/from internal representation
 static void p224_bin28_to_felem(p224_felem out, const uint8_t in[28]) {
  out[0] = p224_load_u64(in) & 0x00ffffffffffffff;
  out[1] = p224_load_u64(in + 7) & 0x00ffffffffffffff;
  out[2] = p224_load_u64(in + 14) & 0x00ffffffffffffff;
  out[3] = p224_load_u64(in + 20) >> 8;
 }
 static void p224_felem_to_bin28(uint8_t out[28], const p224_felem in) {
  for (size_t i = 0; i < 7; ++i) {
    out[i] = in[0] >> (8 * i);
    out[i + 7] = in[1] >> (8 * i);
    out[i + 14] = in[2] >> (8 * i);
    out[i + 21] = in[3] >> (8 * i);
  }
 }
 static void p224_generic_to_felem(p224_felem out, const EC_FELEM *in) {
-  p224_bin28_to_felem(out, in->bytes);
+  // |p224_felem|'s minimal representation uses four 56-bit words. |EC_FELEM|
  // uses four 64-bit words. (The top-most word only has 32 bits.)
  out[0] = in->words[0] & 0x00ffffffffffffff;
  out[1] = ((in->words[0] >> 56) | (in->words[1] << 8)) & 0x00ffffffffffffff;
  out[2] = ((in->words[1] >> 48) | (in->words[2] << 16)) & 0x00ffffffffffffff;
  out[3] = ((in->words[2] >> 40) | (in->words[3] << 24)) & 0x00ffffffffffffff;
 }
 // Requires 0 <= in < 2*p (always call p224_felem_reduce first)
@ -256,9 +236,12 @@ static void p224_felem_to_generic(EC_FELEM *out, const p224_felem in) {
  tmp2[2] = tmp[2];
  tmp2[3] = tmp[3];
-  p224_felem_to_bin28(out->bytes, tmp2);
+  // |p224_felem|'s minimal representation uses four 56-bit words. |EC_FELEM|
-  // 224 is not a multiple of 64, so zero the remaining bytes.
+  // uses four 64-bit words. (The top-most word only has 32 bits.)
-  OPENSSL_memset(out->bytes + 28, 0, 32 - 28);
+  out->words[0] = tmp2[0] | (tmp2[1] << 56);
  out->words[1] = (tmp2[1] >> 8) | (tmp2[2] << 48);
  out->words[2] = (tmp2[2] >> 16) | (tmp2[3] << 40);
  out->words[3] = tmp2[3] >> 24;
 }
@ -865,12 +848,13 @@ static void p224_select_point(const uint64_t idx, size_t size,
  }
 }
-// p224_get_bit returns the |i|th bit in |in|
+// p224_get_bit returns the |i|th bit in |in|.
-static crypto_word_t p224_get_bit(const p224_felem_bytearray in, size_t i) {
+static crypto_word_t p224_get_bit(const EC_SCALAR *in, size_t i) {
  if (i >= 224) {
    return 0;
  }
-  return (in[i >> 3] >> (i & 7)) & 1;
+  static_assert(sizeof(in->words[0]) == 8, "BN_ULONG is not 64-bit");
  return (in->words[i >> 6] >> (i & 63)) & 1;
 }
 // Takes the Jacobian coordinates (X, Y, Z) of a point and returns
@ -977,12 +961,12 @@ static void ec_GFp_nistp224_point_mul(const EC_GROUP *group, EC_RAW_POINT *r,
    // Add every 5 doublings.
    if (i % 5 == 0) {
-      crypto_word_t bits = p224_get_bit(scalar->bytes, i + 4) << 5;
+      crypto_word_t bits = p224_get_bit(scalar, i + 4) << 5;
-      bits |= p224_get_bit(scalar->bytes, i + 3) << 4;
+      bits |= p224_get_bit(scalar, i + 3) << 4;
-      bits |= p224_get_bit(scalar->bytes, i + 2) << 3;
+      bits |= p224_get_bit(scalar, i + 2) << 3;
-      bits |= p224_get_bit(scalar->bytes, i + 1) << 2;
+      bits |= p224_get_bit(scalar, i + 1) << 2;
-      bits |= p224_get_bit(scalar->bytes, i) << 1;
+      bits |= p224_get_bit(scalar, i) << 1;
-      bits |= p224_get_bit(scalar->bytes, i - 1);
+      bits |= p224_get_bit(scalar, i - 1);
      crypto_word_t sign, digit;
      ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
@ -1022,10 +1006,10 @@ static void ec_GFp_nistp224_point_mul_base(const EC_GROUP *group,
    }
    // First, look 28 bits upwards.
-    crypto_word_t bits = p224_get_bit(scalar->bytes, i + 196) << 3;
+    crypto_word_t bits = p224_get_bit(scalar, i + 196) << 3;
-    bits |= p224_get_bit(scalar->bytes, i + 140) << 2;
+    bits |= p224_get_bit(scalar, i + 140) << 2;
-    bits |= p224_get_bit(scalar->bytes, i + 84) << 1;
+    bits |= p224_get_bit(scalar, i + 84) << 1;
-    bits |= p224_get_bit(scalar->bytes, i + 28);
+    bits |= p224_get_bit(scalar, i + 28);
    // Select the point to add, in constant time.
    p224_select_point(bits, 16, g_p224_pre_comp[1], tmp);
@ -1038,10 +1022,10 @@ static void ec_GFp_nistp224_point_mul_base(const EC_GROUP *group,
    }
    // Second, look at the current position/
-    bits = p224_get_bit(scalar->bytes, i + 168) << 3;
+    bits = p224_get_bit(scalar, i + 168) << 3;
-    bits |= p224_get_bit(scalar->bytes, i + 112) << 2;
+    bits |= p224_get_bit(scalar, i + 112) << 2;
-    bits |= p224_get_bit(scalar->bytes, i + 56) << 1;
+    bits |= p224_get_bit(scalar, i + 56) << 1;
-    bits |= p224_get_bit(scalar->bytes, i);
+    bits |= p224_get_bit(scalar, i);
    // Select the point to add, in constant time.
    p224_select_point(bits, 16, g_p224_pre_comp[0], tmp);
    p224_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */,
@ -1080,10 +1064,10 @@ static void ec_GFp_nistp224_point_mul_public(const EC_GROUP *group,
    // Add multiples of the generator.
    if (i <= 27) {
      // First, look 28 bits upwards.
-      crypto_word_t bits = p224_get_bit(g_scalar->bytes, i + 196) << 3;
+      crypto_word_t bits = p224_get_bit(g_scalar, i + 196) << 3;
-      bits |= p224_get_bit(g_scalar->bytes, i + 140) << 2;
+      bits |= p224_get_bit(g_scalar, i + 140) << 2;
-      bits |= p224_get_bit(g_scalar->bytes, i + 84) << 1;
+      bits |= p224_get_bit(g_scalar, i + 84) << 1;
-      bits |= p224_get_bit(g_scalar->bytes, i + 28);
+      bits |= p224_get_bit(g_scalar, i + 28);
      size_t index = (size_t)bits;
      p224_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */,
@ -1092,10 +1076,10 @@ static void ec_GFp_nistp224_point_mul_public(const EC_GROUP *group,
      assert(!skip);
      // Second, look at the current position.
-      bits = p224_get_bit(g_scalar->bytes, i + 168) << 3;
+      bits = p224_get_bit(g_scalar, i + 168) << 3;
-      bits |= p224_get_bit(g_scalar->bytes, i + 112) << 2;
+      bits |= p224_get_bit(g_scalar, i + 112) << 2;
-      bits |= p224_get_bit(g_scalar->bytes, i + 56) << 1;
+      bits |= p224_get_bit(g_scalar, i + 56) << 1;
-      bits |= p224_get_bit(g_scalar->bytes, i);
+      bits |= p224_get_bit(g_scalar, i);
      index = (size_t)bits;
      p224_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */,
                     g_p224_pre_comp[0][index][0], g_p224_pre_comp[0][index][1],
@ -1104,12 +1088,12 @@ static void ec_GFp_nistp224_point_mul_public(const EC_GROUP *group,
    // Incorporate |p_scalar| every 5 doublings.
    if (i % 5 == 0) {
-      crypto_word_t bits = p224_get_bit(p_scalar->bytes, i + 4) << 5;
+      crypto_word_t bits = p224_get_bit(p_scalar, i + 4) << 5;
-      bits |= p224_get_bit(p_scalar->bytes, i + 3) << 4;
+      bits |= p224_get_bit(p_scalar, i + 3) << 4;
-      bits |= p224_get_bit(p_scalar->bytes, i + 2) << 3;
+      bits |= p224_get_bit(p_scalar, i + 2) << 3;
-      bits |= p224_get_bit(p_scalar->bytes, i + 1) << 2;
+      bits |= p224_get_bit(p_scalar, i + 1) << 2;
-      bits |= p224_get_bit(p_scalar->bytes, i) << 1;
+      bits |= p224_get_bit(p_scalar, i) << 1;
-      bits |= p224_get_bit(p_scalar->bytes, i - 1);
+      bits |= p224_get_bit(p_scalar, i - 1);
      crypto_word_t sign, digit;
      ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
--- a/crypto/fipsmodule/ec/p256-nistz.c
+++ b/crypto/fipsmodule/ec/p256-nistz.c
@ -201,7 +201,7 @@ static void ecp_nistz256_windowed_mul(const EC_GROUP *group, P256_POINT *r,
  // ~1599 ((96 * 16) + 63) bytes of stack space.
  alignas(64) P256_POINT table[16];
  uint8_t p_str[33];
-  OPENSSL_memcpy(p_str, p_scalar->bytes, 32);
+  OPENSSL_memcpy(p_str, p_scalar->words, 32);
  p_str[32] = 0;
  // table[0] is implicitly (0,0,0) (the point at infinity), therefore it is
@ -321,7 +321,7 @@ static void ecp_nistz256_point_mul_base(const EC_GROUP *group, EC_RAW_POINT *r,
  alignas(32) p256_point_union_t t, p;
  uint8_t p_str[33];
-  OPENSSL_memcpy(p_str, scalar->bytes, 32);
+  OPENSSL_memcpy(p_str, scalar->words, 32);
  p_str[32] = 0;
  // First window
@ -366,7 +366,7 @@ static void ecp_nistz256_points_mul_public(const EC_GROUP *group,
  alignas(32) p256_point_union_t t, p;
  uint8_t p_str[33];
-  OPENSSL_memcpy(p_str, g_scalar->bytes, 32);
+  OPENSSL_memcpy(p_str, g_scalar->words, 32);
  p_str[32] = 0;
  // First window
--- a/crypto/fipsmodule/ec/p256-nistz_test.cc
+++ b/crypto/fipsmodule/ec/p256-nistz_test.cc
@ -149,8 +149,8 @@ TEST(P256_NistzTest, BEEU) {
    EXPECT_TRUE(bn_less_than_words(out, order_words, P256_LIMBS));
    // Calculate out*in and confirm that it equals one, modulo the order.
-    OPENSSL_memcpy(in_scalar.bytes, in, sizeof(in));
+    OPENSSL_memcpy(in_scalar.words, in, sizeof(in));
-    OPENSSL_memcpy(out_scalar.bytes, out, sizeof(out));
+    OPENSSL_memcpy(out_scalar.words, out, sizeof(out));
    ec_scalar_to_montgomery(group.get(), &in_scalar, &in_scalar);
    ec_scalar_to_montgomery(group.get(), &out_scalar, &out_scalar);
    ec_scalar_mul_montgomery(group.get(), &result, &in_scalar, &out_scalar);
--- a/crypto/fipsmodule/ec/p256.c
+++ b/crypto/fipsmodule/ec/p256.c
@ -81,17 +81,22 @@ static void fiat_p256_cmovznz(fiat_p256_limb_t out[FIAT_P256_NLIMBS],
  fiat_p256_selectznz(out, !!t, z, nz);
 }
 static void fiat_p256_from_words(fiat_p256_felem out,
                                 const BN_ULONG in[32 / sizeof(BN_ULONG)]) {
  // Typically, |BN_ULONG| and |fiat_p256_limb_t| will be the same type, but on
  // 64-bit platforms without |uint128_t|, they are different. However, on
  // little-endian systems, |uint64_t[4]| and |uint32_t[8]| have the same
  // layout.
  OPENSSL_memcpy(out, in, 32);
 }
 static void fiat_p256_from_generic(fiat_p256_felem out, const EC_FELEM *in) {
-  fiat_p256_from_bytes(out, in->bytes);
+  fiat_p256_from_words(out, in->words);
 }
 static void fiat_p256_to_generic(EC_FELEM *out, const fiat_p256_felem in) {
-  // This works because 256 is a multiple of 64, so there are no excess bytes to
+  // See |fiat_p256_from_words|.
-  // zero when rounding up to |BN_ULONG|s.
+  OPENSSL_memcpy(out->words, in, 32);
  OPENSSL_STATIC_ASSERT(
      256 / 8 == sizeof(BN_ULONG) * ((256 + BN_BITS2 - 1) / BN_BITS2),
      "fiat_p256_to_bytes leaves bytes uninitialized");
  fiat_p256_to_bytes(out->bytes, in);
 }
 // fiat_p256_inv_square calculates |out| = |in|^{-2}
@ -394,12 +399,18 @@ static void fiat_p256_select_point(const fiat_p256_limb_t idx, size_t size,
  }
 }
-// fiat_p256_get_bit returns the |i|th bit in |in|
+// fiat_p256_get_bit returns the |i|th bit in |in|.
-static crypto_word_t fiat_p256_get_bit(const uint8_t *in, int i) {
+static crypto_word_t fiat_p256_get_bit(const EC_SCALAR *in, int i) {
  if (i < 0 || i >= 256) {
    return 0;
  }
-  return (in[i >> 3] >> (i & 7)) & 1;
+#if defined(OPENSSL_64_BIT)
  static_assert(sizeof(BN_ULONG) == 8, "BN_ULONG was not 64-bit");
  return (in->words[i >> 6] >> (i & 63)) & 1;
 #else
  static_assert(sizeof(BN_ULONG) == 4, "BN_ULONG was not 32-bit");
  return (in->words[i >> 5] >> (i & 31)) & 1;
 #endif
 }
 // OPENSSL EC_METHOD FUNCTIONS
@ -500,12 +511,12 @@ static void ec_GFp_nistp256_point_mul(const EC_GROUP *group, EC_RAW_POINT *r,
    // do other additions every 5 doublings
    if (i % 5 == 0) {
-      crypto_word_t bits = fiat_p256_get_bit(scalar->bytes, i + 4) << 5;
+      crypto_word_t bits = fiat_p256_get_bit(scalar, i + 4) << 5;
-      bits |= fiat_p256_get_bit(scalar->bytes, i + 3) << 4;
+      bits |= fiat_p256_get_bit(scalar, i + 3) << 4;
-      bits |= fiat_p256_get_bit(scalar->bytes, i + 2) << 3;
+      bits |= fiat_p256_get_bit(scalar, i + 2) << 3;
-      bits |= fiat_p256_get_bit(scalar->bytes, i + 1) << 2;
+      bits |= fiat_p256_get_bit(scalar, i + 1) << 2;
-      bits |= fiat_p256_get_bit(scalar->bytes, i) << 1;
+      bits |= fiat_p256_get_bit(scalar, i) << 1;
-      bits |= fiat_p256_get_bit(scalar->bytes, i - 1);
+      bits |= fiat_p256_get_bit(scalar, i - 1);
      crypto_word_t sign, digit;
      ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
@ -545,10 +556,10 @@ static void ec_GFp_nistp256_point_mul_base(const EC_GROUP *group,
    }
    // First, look 32 bits upwards.
-    crypto_word_t bits = fiat_p256_get_bit(scalar->bytes, i + 224) << 3;
+    crypto_word_t bits = fiat_p256_get_bit(scalar, i + 224) << 3;
-    bits |= fiat_p256_get_bit(scalar->bytes, i + 160) << 2;
+    bits |= fiat_p256_get_bit(scalar, i + 160) << 2;
-    bits |= fiat_p256_get_bit(scalar->bytes, i + 96) << 1;
+    bits |= fiat_p256_get_bit(scalar, i + 96) << 1;
-    bits |= fiat_p256_get_bit(scalar->bytes, i + 32);
+    bits |= fiat_p256_get_bit(scalar, i + 32);
    // Select the point to add, in constant time.
    fiat_p256_select_point_affine((fiat_p256_limb_t)bits, 15,
                                  fiat_p256_g_pre_comp[1], tmp);
@ -564,10 +575,10 @@ static void ec_GFp_nistp256_point_mul_base(const EC_GROUP *group,
    }
    // Second, look at the current position.
-    bits = fiat_p256_get_bit(scalar->bytes, i + 192) << 3;
+    bits = fiat_p256_get_bit(scalar, i + 192) << 3;
-    bits |= fiat_p256_get_bit(scalar->bytes, i + 128) << 2;
+    bits |= fiat_p256_get_bit(scalar, i + 128) << 2;
-    bits |= fiat_p256_get_bit(scalar->bytes, i + 64) << 1;
+    bits |= fiat_p256_get_bit(scalar, i + 64) << 1;
-    bits |= fiat_p256_get_bit(scalar->bytes, i);
+    bits |= fiat_p256_get_bit(scalar, i);
    // Select the point to add, in constant time.
    fiat_p256_select_point_affine((fiat_p256_limb_t)bits, 15,
                                  fiat_p256_g_pre_comp[0], tmp);
@ -617,10 +628,10 @@ static void ec_GFp_nistp256_point_mul_public(const EC_GROUP *group,
    // constant-time lookup.
    if (i <= 31) {
      // First, look 32 bits upwards.
-      crypto_word_t bits = fiat_p256_get_bit(g_scalar->bytes, i + 224) << 3;
+      crypto_word_t bits = fiat_p256_get_bit(g_scalar, i + 224) << 3;
-      bits |= fiat_p256_get_bit(g_scalar->bytes, i + 160) << 2;
+      bits |= fiat_p256_get_bit(g_scalar, i + 160) << 2;
-      bits |= fiat_p256_get_bit(g_scalar->bytes, i + 96) << 1;
+      bits |= fiat_p256_get_bit(g_scalar, i + 96) << 1;
-      bits |= fiat_p256_get_bit(g_scalar->bytes, i + 32);
+      bits |= fiat_p256_get_bit(g_scalar, i + 32);
      if (bits != 0) {
        size_t index = (size_t)(bits - 1);
        fiat_p256_point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2],
@ -631,10 +642,10 @@ static void ec_GFp_nistp256_point_mul_public(const EC_GROUP *group,
      }
      // Second, look at the current position.
-      bits = fiat_p256_get_bit(g_scalar->bytes, i + 192) << 3;
+      bits = fiat_p256_get_bit(g_scalar, i + 192) << 3;
-      bits |= fiat_p256_get_bit(g_scalar->bytes, i + 128) << 2;
+      bits |= fiat_p256_get_bit(g_scalar, i + 128) << 2;
-      bits |= fiat_p256_get_bit(g_scalar->bytes, i + 64) << 1;
+      bits |= fiat_p256_get_bit(g_scalar, i + 64) << 1;
-      bits |= fiat_p256_get_bit(g_scalar->bytes, i);
+      bits |= fiat_p256_get_bit(g_scalar, i);
      if (bits != 0) {
        size_t index = (size_t)(bits - 1);
        fiat_p256_point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2],
@ -687,7 +698,7 @@ static int ec_GFp_nistp256_cmp_x_coordinate(const EC_GROUP *group,
  fiat_p256_mul(Z2_mont, Z2_mont, Z2_mont);
  fiat_p256_felem r_Z2;
-  fiat_p256_from_bytes(r_Z2, r->bytes);  // r < order < p, so this is valid.
+  fiat_p256_from_words(r_Z2, r->words);  // r < order < p, so this is valid.
  fiat_p256_mul(r_Z2, r_Z2, Z2_mont);
  fiat_p256_felem X;
--- a/crypto/fipsmodule/ec/scalar.c
+++ b/crypto/fipsmodule/ec/scalar.c
@ -54,9 +54,7 @@ int ec_random_nonzero_scalar(const EC_GROUP *group, EC_SCALAR *out,
 void ec_scalar_to_bytes(const EC_GROUP *group, uint8_t *out, size_t *out_len,
                        const EC_SCALAR *in) {
  size_t len = BN_num_bytes(&group->order);
-  for (size_t i = 0; i < len; i++) {
+  bn_words_to_big_endian(out, len, in->words, group->order.width);
    out[len - i - 1] = in->bytes[i];
  }
  *out_len = len;
 }
@ -67,11 +65,7 @@ int ec_scalar_from_bytes(const EC_GROUP *group, EC_SCALAR *out,
    return 0;
  }
-  OPENSSL_memset(out, 0, sizeof(EC_SCALAR));
+  bn_big_endian_to_words(out->words, group->order.width, in, len);
  for (size_t i = 0; i < len; i++) {
    out->bytes[i] = in[len - i - 1];
  }
  if (!bn_less_than_words(out->words, group->order.d, group->order.width)) {
    OPENSSL_PUT_ERROR(EC, EC_R_INVALID_SCALAR);
--- a/crypto/fipsmodule/ec/simple.c
+++ b/crypto/fipsmodule/ec/simple.c
@ -330,9 +330,7 @@ int ec_GFp_simple_cmp_x_coordinate(const EC_GROUP *group, const EC_RAW_POINT *p,
 void ec_GFp_simple_felem_to_bytes(const EC_GROUP *group, uint8_t *out,
                                  size_t *out_len, const EC_FELEM *in) {
  size_t len = BN_num_bytes(&group->field);
-  for (size_t i = 0; i < len; i++) {
+  bn_words_to_big_endian(out, len, in->words, group->field.width);
    out[i] = in->bytes[len - 1 - i];
  }
  *out_len = len;
 }
@ -343,10 +341,7 @@ int ec_GFp_simple_felem_from_bytes(const EC_GROUP *group, EC_FELEM *out,
    return 0;
  }
-  OPENSSL_memset(out, 0, sizeof(EC_FELEM));
+  bn_big_endian_to_words(out->words, group->field.width, in, len);
  for (size_t i = 0; i < len; i++) {
    out->bytes[i] = in[len - 1 - i];
  }
  if (!bn_less_than_words(out->words, group->field.d, group->field.width)) {
    OPENSSL_PUT_ERROR(EC, EC_R_DECODE_ERROR);
--- a/crypto/fipsmodule/ecdsa/ecdsa.c
+++ b/crypto/fipsmodule/ecdsa/ecdsa.c
@ -78,10 +78,7 @@ static void digest_to_scalar(const EC_GROUP *group, EC_SCALAR *out,
  if (digest_len > num_bytes) {
    digest_len = num_bytes;
  }
-  OPENSSL_memset(out, 0, sizeof(EC_SCALAR));
+  bn_big_endian_to_words(out->words, order->width, digest, digest_len);
  for (size_t i = 0; i < digest_len; i++) {
    out->bytes[i] = digest[digest_len - 1 - i];
  }
  // If it is still too long, truncate remaining bits with a shift.
  if (8 * digest_len > num_bits) {
--- a/crypto/internal.h
+++ b/crypto/internal.h
@ -901,6 +901,18 @@ static inline void CRYPTO_store_word_le(void *out, crypto_word_t v) {
  OPENSSL_memcpy(out, &v, sizeof(v));
 }
 static inline crypto_word_t CRYPTO_load_word_be(const void *in) {
  crypto_word_t v;
  OPENSSL_memcpy(&v, in, sizeof(v));
 #if defined(OPENSSL_64_BIT)
  static_assert(sizeof(v) == 8, "crypto_word_t has unexpected size");
  return CRYPTO_bswap8(v);
 #else
  static_assert(sizeof(v) == 4, "crypto_word_t has unexpected size");
  return CRYPTO_bswap4(v);
 #endif
 }
 // Bit rotation functions.
 //
--- a/crypto/trust_token/trust_token_test.cc
+++ b/crypto/trust_token/trust_token_test.cc
@ -854,7 +854,7 @@ TEST_P(TrustTokenBadKeyTest, BadKey) {
                          &key->key.y1, &key->key.xs, &key->key.ys};
  // Corrupt private key scalar.
-  scalars[corrupted_key()]->bytes[0] ^= 42;
+  scalars[corrupted_key()]->words[0] ^= 42;
  size_t tokens_issued;
  ASSERT_TRUE(TRUST_TOKEN_ISSUER_issue(