Use packed representation for large Curve25519 table

Did 59000 Ed25519 key generation operations in 1004188us (58753.9 ops/sec) [+8.3%]
Did 57000 Ed25519 signing operations in 1005649us (56679.8 ops/sec) [+7.9%]
Did 19000 Ed25519 verify operations in 1054380us (18020.1 ops/sec) [-2.0%]
Did 61000 Curve25519 base-point multiplication operations in 1007401us (60551.9 ops/sec) [+8.3%]
Did 22000 Curve25519 arbitrary point multiplication operations in 1022882us (21507.9 ops/sec) [+0.5%]

Change-Id: I14668f658b1ae99850cb0f8938f90f988d0edd0b
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/60107
Reviewed-by: David Benjamin <davidben@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
2 years ago · d605df5b6f
parent 4a0393fcf3
commit d605df5b6f
5 changed files with 2962 additions and 7479 deletions
--- a/crypto/constant_time_test.cc
+++ b/crypto/constant_time_test.cc
@@ -52,6 +52,7 @@
 #include <limits>

 #include <gtest/gtest.h>
+#include "test/test_util.h"

 #include <openssl/mem.h>
 #include <openssl/rand.h>
@@ -169,3 +170,71 @@ TEST(ConstantTimeTest, ValueBarrier) {
    EXPECT_EQ(u64, value_barrier_u64(u64));
  }
 }
+
+TEST(ConstantTimeTest, MemCmov) {
+  for (int i = 0; i < 100; i++) {
+    uint8_t out[256], in[256];
+    RAND_bytes(out, sizeof(out));
+    RAND_bytes(in, sizeof(in));
+
+    uint8_t b = 0;
+    RAND_bytes(&b, 1);
+    b = constant_time_is_zero_8(b & 0xf);
+
+    uint8_t ref_in[256];
+    OPENSSL_memcpy(ref_in, in, sizeof(in));
+
+    uint8_t ref_out[256];
+    OPENSSL_memcpy(ref_out, out, sizeof(out));
+    if (b) {
+      OPENSSL_memcpy(ref_out, in, sizeof(in));
+    }
+
+    CONSTTIME_SECRET(out, sizeof(out));
+    CONSTTIME_SECRET(in, sizeof(in));
+    CONSTTIME_SECRET(&b, 1);
+
+    constant_time_conditional_memcpy(out, in, sizeof(out), b);
+
+    CONSTTIME_DECLASSIFY(&in, sizeof(in));
+    CONSTTIME_DECLASSIFY(&out, sizeof(out));
+
+    EXPECT_EQ(Bytes(in), Bytes(ref_in));
+    EXPECT_EQ(Bytes(out), Bytes(ref_out));
+  }
+}
+
+TEST(ConstantTimeTest, MemCxor) {
+  for (int i = 0; i < 100; i++) {
+    uint8_t out[256], in[256];
+    RAND_bytes(out, sizeof(out));
+    RAND_bytes(in, sizeof(in));
+
+    uint8_t b = 0;
+    RAND_bytes(&b, 1);
+    b = constant_time_is_zero_8(b & 0xf);
+
+    uint8_t ref_in[256];
+    OPENSSL_memcpy(ref_in, in, sizeof(in));
+
+    uint8_t ref_out[256];
+    OPENSSL_memcpy(ref_out, out, sizeof(out));
+    if (b) {
+      for (size_t j = 0; j < sizeof(ref_out); ++j) {
+        ref_out[j] ^= in[j];
+      }
+    }
+
+    CONSTTIME_SECRET(out, sizeof(out));
+    CONSTTIME_SECRET(in, sizeof(in));
+    CONSTTIME_SECRET(&b, 1);
+
+    constant_time_conditional_memxor(out, in, sizeof(out), b);
+
+    CONSTTIME_DECLASSIFY(&in, sizeof(in));
+    CONSTTIME_DECLASSIFY(&out, sizeof(out));
+
+    EXPECT_EQ(Bytes(in), Bytes(ref_in));
+    EXPECT_EQ(Bytes(out), Bytes(ref_out));
+  }
+}
--- a/crypto/curve25519/curve25519.c
+++ b/crypto/curve25519/curve25519.c
@@ -315,11 +315,6 @@ static void fe_copy_lt(fe_loose *h, const fe *f) {
  static_assert(sizeof(fe_loose) == sizeof(fe), "fe and fe_loose mismatch");
  OPENSSL_memmove(h, f, sizeof(fe));
 }
-#if !defined(OPENSSL_SMALL)
-static void fe_copy_ll(fe_loose *h, const fe_loose *f) {
-  OPENSSL_memmove(h, f, sizeof(fe_loose));
-}
-#endif // !defined(OPENSSL_SMALL)

 static void fe_loose_invert(fe *out, const fe_loose *z) {
  fe t0;
@@ -698,16 +693,6 @@ void x25519_ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
  fe_add(&r->T, &trZ, &trT);
 }

-static uint8_t equal(signed char b, signed char c) {
-  uint8_t ub = b;
-  uint8_t uc = c;
-  uint8_t x = ub ^ uc;  // 0: yes; 1..255: no
-  uint32_t y = x;       // 0: yes; 1..255: no
-  y -= 1;               // 4294967295: yes; 0..254: no
-  y >>= 31;             // 1: yes; 0: no
-  return y;
-}
-
 static void cmov(ge_precomp *t, const ge_precomp *u, uint8_t b) {
  fe_cmov(&t->yplusx, &u->yplusx, b);
  fe_cmov(&t->yminusx, &u->yminusx, b);
@@ -754,7 +739,7 @@ void x25519_ge_scalarmult_small_precomp(
    ge_precomp_0(&e);

    for (j = 1; j < 16; j++) {
-      cmov(&e, &multiples[j-1], equal(index, j));
+      cmov(&e, &multiples[j-1], 1&constant_time_eq_w(index, j));
    }

    ge_cached cached;
@@ -776,35 +761,36 @@ void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) {

 #else

-static uint8_t negative(signed char b) {
-  uint32_t x = b;
-  x >>= 31;  // 1: yes; 0: no
-  return x;
-}
+static void table_select(ge_precomp *t, const int pos, const signed char b) {
+  uint8_t bnegative = constant_time_msb_w(b);
+  uint8_t babs = b - ((bnegative & b) << 1);

-static void table_select(ge_precomp *t, int pos, signed char b) {
-  ge_precomp minust;
-  uint8_t bnegative = negative(b);
-  uint8_t babs = b - ((uint8_t)((-bnegative) & b) << 1);
+  uint8_t t_bytes[3][32] = {
+      {constant_time_is_zero_w(b) & 1}, {constant_time_is_zero_w(b) & 1}, {0}};
+#if defined(__clang__) // materialize for vectorization, 6% speedup
+  __asm__("" : "+m" (t_bytes) : /*no inputs*/);
+#endif
+  static_assert(sizeof(t_bytes) == sizeof(k25519Precomp[pos][0]), "");
+  for (int i = 0; i < 8; i++) {
+    constant_time_conditional_memxor(t_bytes, k25519Precomp[pos][i],
+                                     sizeof(t_bytes),
+                                     constant_time_eq_w(babs, 1 + i));
+  }

-  ge_precomp_0(t);
-  cmov(t, &k25519Precomp[pos][0], equal(babs, 1));
-  cmov(t, &k25519Precomp[pos][1], equal(babs, 2));
-  cmov(t, &k25519Precomp[pos][2], equal(babs, 3));
-  cmov(t, &k25519Precomp[pos][3], equal(babs, 4));
-  cmov(t, &k25519Precomp[pos][4], equal(babs, 5));
-  cmov(t, &k25519Precomp[pos][5], equal(babs, 6));
-  cmov(t, &k25519Precomp[pos][6], equal(babs, 7));
-  cmov(t, &k25519Precomp[pos][7], equal(babs, 8));
-  fe_copy_ll(&minust.yplusx, &t->yminusx);
-  fe_copy_ll(&minust.yminusx, &t->yplusx);
+  fe yplusx, yminusx, xy2d;
+  fe_frombytes_strict(&yplusx, t_bytes[0]);
+  fe_frombytes_strict(&yminusx, t_bytes[1]);
+  fe_frombytes_strict(&xy2d, t_bytes[2]);

-  // NOTE: the input table is canonical, but types don't encode it
-  fe tmp;
-  fe_carry(&tmp, &t->xy2d);
-  fe_neg(&minust.xy2d, &tmp);
+  fe_copy_lt(&t->yplusx, &yplusx);
+  fe_copy_lt(&t->yminusx, &yminusx);
+  fe_copy_lt(&t->xy2d, &xy2d);

-  cmov(t, &minust, bnegative);
+  ge_precomp minust;
+  fe_copy_lt(&minust.yplusx, &yminusx);
+  fe_copy_lt(&minust.yminusx, &yplusx);
+  fe_neg(&minust.xy2d, &xy2d);
+  cmov(t, &minust, bnegative>>7);
 }

 // h = a * B
@@ -916,7 +902,7 @@ void x25519_ge_scalarmult(ge_p2 *r, const uint8_t *scalar, const ge_p3 *A) {
    ge_cached selected;
    ge_cached_0(&selected);
    for (j = 0; j < 16; j++) {
-      cmov_cached(&selected, &Ai[j], equal(j, index));
+      cmov_cached(&selected, &Ai[j], 1&constant_time_eq_w(index, j));
    }

    x25519_ge_add(&t, &u, &selected);
--- a/crypto/curve25519/curve25519_tables.h
+++ b/crypto/curve25519/curve25519_tables.h
--- a/crypto/curve25519/make_curve25519_tables.py
+++ b/crypto/curve25519/make_curve25519_tables.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # coding=utf-8
 # Copyright (c) 2020, Google Inc.
 #
@@ -14,7 +14,7 @@
 # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

-import StringIO
+from io import StringIO
 import subprocess

 # Base field Z_p
@@ -76,12 +76,7 @@ def point_mul(s, P):
    return Q

 def to_bytes(x):
-    ret = bytearray(32)
-    for i in range(len(ret)):
-        ret[i] = x % 256
-        x >>= 8
-    assert x == 0
-    return ret
+    return x.to_bytes(32, "little")

 def to_ge_precomp(P):
    # typedef struct {
@@ -109,6 +104,9 @@ def to_base_51(x):
    assert x == 0
    return ret

+def to_bytes_literal(x):
+    return "{" + ", ".join(map(hex, to_bytes(x))) + "}"
+
 def to_literal(x):
    ret = "{{\n#if defined(OPENSSL_64_BIT)\n"
    ret += ", ".join(map(str, to_base_51(x)))
@@ -140,7 +138,7 @@ def main():
        bi_precomp.append(to_ge_precomp(P))


-    buf = StringIO.StringIO()
+    buf = StringIO()
    buf.write("""/* Copyright (c) 2020, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
@@ -190,14 +188,14 @@ static const uint8_t k25519SmallPrecomp[15 * 2 * 32] = {""")
 #else

 // k25519Precomp[i][j] = (j+1)*256^i*B
-static const ge_precomp k25519Precomp[32][8] = {
+static const uint8_t k25519Precomp[32][8][3][32] = {
 """)
    for child in large_precomp:
        buf.write("{\n")
        for val in child:
            buf.write("{\n")
            for term in val:
-                buf.write(to_literal(term) + ",\n")
+                buf.write(to_bytes_literal(term) + ",\n")
            buf.write("},\n")
        buf.write("},\n")
    buf.write("""};
@@ -216,7 +214,7 @@ static const ge_precomp Bi[8] = {
 """)

    proc = subprocess.Popen(["clang-format"], stdin=subprocess.PIPE)
-    proc.communicate(buf.getvalue())
+    proc.communicate(buf.getvalue().encode("utf8"))

 if __name__ == "__main__":
    main()
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -265,15 +265,15 @@ OPENSSL_INLINE void OPENSSL_reset_malloc_counter_for_testing(void) {}
 // Pointer utility functions.

 // buffers_alias returns one if |a| and |b| alias and zero otherwise.
-static inline int buffers_alias(const uint8_t *a, size_t a_len,
-                                const uint8_t *b, size_t b_len) {
+static inline int buffers_alias(const void *a, size_t a_bytes,
+                                const void *b, size_t b_bytes) {
  // Cast |a| and |b| to integers. In C, pointer comparisons between unrelated
  // objects are undefined whereas pointer to integer conversions are merely
  // implementation-defined. We assume the implementation defined it in a sane
  // way.
  uintptr_t a_u = (uintptr_t)a;
  uintptr_t b_u = (uintptr_t)b;
-  return a_u + a_len > b_u && b_u + b_len > a_u;
+  return a_u + a_bytes > b_u && b_u + b_bytes > a_u;
 }

 // align_pointer returns |ptr|, advanced to |alignment|. |alignment| must be a
@@ -360,6 +360,9 @@ static inline uint64_t value_barrier_u64(uint64_t a) {
  return a;
 }

+// |value_barrier_u8| could be defined as above, but compilers other than
+// clang seem to still materialize 0x00..00MM instead of reusing 0x??..??MM.
+
 // constant_time_msb_w returns the given value with the MSB copied to all the
 // other bits.
 static inline crypto_word_t constant_time_msb_w(crypto_word_t a) {
@@ -476,16 +479,23 @@ static inline crypto_word_t constant_time_select_w(crypto_word_t mask,
  // to a cmov, it sometimes further transforms it into a branch, which we do
  // not want.
  //
-  // Adding barriers to both |mask| and |~mask| breaks the relationship between
-  // the two, which makes the compiler stick with bitmasks.
-  return (value_barrier_w(mask) & a) | (value_barrier_w(~mask) & b);
+  // Hiding the value of the mask from the compiler evades this transformation.
+  mask = value_barrier_w(mask);
+  return (mask & a) | (~mask & b);
 }

 // constant_time_select_8 acts like |constant_time_select| but operates on
 // 8-bit values.
-static inline uint8_t constant_time_select_8(uint8_t mask, uint8_t a,
+static inline uint8_t constant_time_select_8(crypto_word_t mask, uint8_t a,
                                             uint8_t b) {
-  return (uint8_t)(constant_time_select_w(mask, a, b));
+  // |mask| is a word instead of |uint8_t| to avoid materializing 0x000..0MM
+  // Making both |mask| and its value barrier |uint8_t| would allow the compiler
+  // to materialize 0x????..?MM instead, but only clang is that clever.
+  // However, vectorization of bitwise operations seems to work better on
+  // |uint8_t| than a mix of |uint64_t| and |uint8_t|, so |m| is cast to
+  // |uint8_t| after the value barrier but before the bitwise operations.
+  uint8_t m = value_barrier_w(mask);
+  return (m & a) | (~m & b);
 }

 // constant_time_select_int acts like |constant_time_select| but operates on
@@ -495,6 +505,34 @@ static inline int constant_time_select_int(crypto_word_t mask, int a, int b) {
                                      (crypto_word_t)(b)));
 }

+// constant_time_conditional_memcpy copies |n| bytes from |src| to |dst| if
+// |mask| is 0xff..ff and does nothing if |mask| is 0. The |n|-byte memory
+// ranges at |dst| and |src| must not overlap, as when calling |memcpy|.
+static inline void constant_time_conditional_memcpy(void *dst, const void *src,
+                                                    const size_t n,
+                                                    const crypto_word_t mask) {
+  assert(!buffers_alias(dst, n, src, n));
+  uint8_t *out = (uint8_t *)dst;
+  const uint8_t *in = (const uint8_t *)src;
+  for (size_t i = 0; i < n; i++) {
+    out[i] = constant_time_select_8(mask, in[i], out[i]);
+  }
+}
+
+// constant_time_conditional_memxor xors |n| bytes from |src| to |dst| if
+// |mask| is 0xff..ff and does nothing if |mask| is 0. The |n|-byte memory
+// ranges at |dst| and |src| must not overlap, as when calling |memcpy|.
+static inline void constant_time_conditional_memxor(void *dst, const void *src,
+                                                    const size_t n,
+                                                    const crypto_word_t mask) {
+  assert(!buffers_alias(dst, n, src, n));
+  uint8_t *out = (uint8_t *)dst;
+  const uint8_t *in = (const uint8_t *)src;
+  for (size_t i = 0; i < n; i++) {
+    out[i] ^= value_barrier_w(mask) & in[i];
+  }
+}
+
 #if defined(BORINGSSL_CONSTANT_TIME_VALIDATION)

 // CONSTTIME_SECRET takes a pointer and a number of bytes and marks that region