Use packed representation for large Curve25519 table

Did 59000 Ed25519 key generation operations in 1004188us (58753.9 ops/sec) [+8.3%]
Did 57000 Ed25519 signing operations in 1005649us (56679.8 ops/sec) [+7.9%]
Did 19000 Ed25519 verify operations in 1054380us (18020.1 ops/sec) [-2.0%]
Did 61000 Curve25519 base-point multiplication operations in 1007401us (60551.9 ops/sec) [+8.3%]
Did 22000 Curve25519 arbitrary point multiplication operations in 1022882us (21507.9 ops/sec) [+0.5%]

Change-Id: I14668f658b1ae99850cb0f8938f90f988d0edd0b
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/60107
Reviewed-by: David Benjamin <davidben@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
chromium-stable
Andres Erbsen 2 years ago committed by Boringssl LUCI CQ
parent 4a0393fcf3
commit d605df5b6f
  1. 69
      crypto/constant_time_test.cc
  2. 70
      crypto/curve25519/curve25519.c
  3. 10226
      crypto/curve25519/curve25519_tables.h
  4. 22
      crypto/curve25519/make_curve25519_tables.py
  5. 54
      crypto/internal.h

@ -52,6 +52,7 @@
#include <limits>
#include <gtest/gtest.h>
#include "test/test_util.h"
#include <openssl/mem.h>
#include <openssl/rand.h>
@ -169,3 +170,71 @@ TEST(ConstantTimeTest, ValueBarrier) {
EXPECT_EQ(u64, value_barrier_u64(u64));
}
}
// MemCmov runs 100 randomized trials of |constant_time_conditional_memcpy| and
// compares the result with a reference built from ordinary copies.
TEST(ConstantTimeTest, MemCmov) {
  for (int trial = 0; trial < 100; trial++) {
    // Fresh random destination and source buffers for every trial.
    uint8_t out[256], in[256];
    RAND_bytes(out, sizeof(out));
    RAND_bytes(in, sizeof(in));

    // The condition is derived from the low four bits of one random byte.
    // NOTE(review): |b| is a |uint8_t|, so the value handed to the
    // |crypto_word_t| mask parameter below cannot be an all-ones word; confirm
    // this is the intended way to exercise the documented mask contract.
    uint8_t b = 0;
    RAND_bytes(&b, 1);
    b = constant_time_is_zero_8(b & 0xf);

    // Expected values: the source must be untouched, and the destination must
    // equal the source when |b| is nonzero and its old contents otherwise.
    uint8_t ref_in[256], ref_out[256];
    OPENSSL_memcpy(ref_in, in, sizeof(in));
    OPENSSL_memcpy(ref_out, b ? in : out, sizeof(out));

    // Mark all inputs secret before the call under test.
    CONSTTIME_SECRET(out, sizeof(out));
    CONSTTIME_SECRET(in, sizeof(in));
    CONSTTIME_SECRET(&b, 1);

    constant_time_conditional_memcpy(out, in, sizeof(out), b);

    // Declassify the buffers so the comparisons below may inspect them.
    CONSTTIME_DECLASSIFY(&in, sizeof(in));
    CONSTTIME_DECLASSIFY(&out, sizeof(out));

    EXPECT_EQ(Bytes(in), Bytes(ref_in));
    EXPECT_EQ(Bytes(out), Bytes(ref_out));
  }
}
// MemCxor runs 100 randomized trials of |constant_time_conditional_memxor| and
// compares the result with a reference computed byte by byte.
TEST(ConstantTimeTest, MemCxor) {
  for (int trial = 0; trial < 100; trial++) {
    // Fresh random destination and source buffers for every trial.
    uint8_t out[256], in[256];
    RAND_bytes(out, sizeof(out));
    RAND_bytes(in, sizeof(in));

    // The condition is derived from the low four bits of one random byte.
    // NOTE(review): |b| is a |uint8_t|, so the value handed to the
    // |crypto_word_t| mask parameter below cannot be an all-ones word; confirm
    // this is the intended way to exercise the documented mask contract.
    uint8_t b = 0;
    RAND_bytes(&b, 1);
    b = constant_time_is_zero_8(b & 0xf);

    // Expected values: the source must be untouched, and each destination byte
    // is xored with the matching source byte only when |b| is nonzero.
    uint8_t ref_in[256], ref_out[256];
    OPENSSL_memcpy(ref_in, in, sizeof(in));
    for (size_t j = 0; j < sizeof(ref_out); ++j) {
      ref_out[j] = b ? static_cast<uint8_t>(out[j] ^ in[j]) : out[j];
    }

    // Mark all inputs secret before the call under test.
    CONSTTIME_SECRET(out, sizeof(out));
    CONSTTIME_SECRET(in, sizeof(in));
    CONSTTIME_SECRET(&b, 1);

    constant_time_conditional_memxor(out, in, sizeof(out), b);

    // Declassify the buffers so the comparisons below may inspect them.
    CONSTTIME_DECLASSIFY(&in, sizeof(in));
    CONSTTIME_DECLASSIFY(&out, sizeof(out));

    EXPECT_EQ(Bytes(in), Bytes(ref_in));
    EXPECT_EQ(Bytes(out), Bytes(ref_out));
  }
}

@ -315,11 +315,6 @@ static void fe_copy_lt(fe_loose *h, const fe *f) {
static_assert(sizeof(fe_loose) == sizeof(fe), "fe and fe_loose mismatch");
OPENSSL_memmove(h, f, sizeof(fe));
}
#if !defined(OPENSSL_SMALL)
// fe_copy_ll copies the |fe_loose| value at |f| into |h| using
// |OPENSSL_memmove| over the full size of the destination object.
static void fe_copy_ll(fe_loose *h, const fe_loose *f) {
  OPENSSL_memmove(h, f, sizeof(*h));
}
#endif // !defined(OPENSSL_SMALL)
static void fe_loose_invert(fe *out, const fe_loose *z) {
fe t0;
@ -698,16 +693,6 @@ void x25519_ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
fe_add(&r->T, &trZ, &trT);
}
// equal returns 1 when |b| and |c| hold the same value and 0 otherwise. It uses
// only bitwise and arithmetic operations; there is no comparison or branch on
// the inputs.
static uint8_t equal(signed char b, signed char c) {
  // XOR of the two bytes: 0 when they match, 1..255 when they differ.
  const uint32_t diff = (uint8_t)((uint8_t)b ^ (uint8_t)c);
  // 0 - 1 wraps to 4294967295 (top bit set); 1..255 minus 1 gives 0..254 (top
  // bit clear). Shifting the top bit down leaves exactly 1 or 0.
  const uint32_t wrapped = (uint32_t)(diff - 1);
  return (uint8_t)(wrapped >> 31);
}
static void cmov(ge_precomp *t, const ge_precomp *u, uint8_t b) {
fe_cmov(&t->yplusx, &u->yplusx, b);
fe_cmov(&t->yminusx, &u->yminusx, b);
@ -754,7 +739,7 @@ void x25519_ge_scalarmult_small_precomp(
ge_precomp_0(&e);
for (j = 1; j < 16; j++) {
cmov(&e, &multiples[j-1], equal(index, j));
cmov(&e, &multiples[j-1], 1&constant_time_eq_w(index, j));
}
ge_cached cached;
@ -776,35 +761,36 @@ void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) {
#else
// negative returns 1 when |b| is less than zero and 0 otherwise, without
// comparing or branching on |b|.
static uint8_t negative(signed char b) {
  // Converting to 32 bits sign-extends |b|, so bit 31 is set exactly when |b|
  // is negative; the shift moves that bit down to bit 0.
  const uint32_t widened = (uint32_t)b;
  return (uint8_t)(widened >> 31);
}
static void table_select(ge_precomp *t, const int pos, const signed char b) {
uint8_t bnegative = constant_time_msb_w(b);
uint8_t babs = b - ((bnegative & b) << 1);
static void table_select(ge_precomp *t, int pos, signed char b) {
ge_precomp minust;
uint8_t bnegative = negative(b);
uint8_t babs = b - ((uint8_t)((-bnegative) & b) << 1);
uint8_t t_bytes[3][32] = {
{constant_time_is_zero_w(b) & 1}, {constant_time_is_zero_w(b) & 1}, {0}};
#if defined(__clang__) // materialize for vectorization, 6% speedup
__asm__("" : "+m" (t_bytes) : /*no inputs*/);
#endif
static_assert(sizeof(t_bytes) == sizeof(k25519Precomp[pos][0]), "");
for (int i = 0; i < 8; i++) {
constant_time_conditional_memxor(t_bytes, k25519Precomp[pos][i],
sizeof(t_bytes),
constant_time_eq_w(babs, 1 + i));
}
ge_precomp_0(t);
cmov(t, &k25519Precomp[pos][0], equal(babs, 1));
cmov(t, &k25519Precomp[pos][1], equal(babs, 2));
cmov(t, &k25519Precomp[pos][2], equal(babs, 3));
cmov(t, &k25519Precomp[pos][3], equal(babs, 4));
cmov(t, &k25519Precomp[pos][4], equal(babs, 5));
cmov(t, &k25519Precomp[pos][5], equal(babs, 6));
cmov(t, &k25519Precomp[pos][6], equal(babs, 7));
cmov(t, &k25519Precomp[pos][7], equal(babs, 8));
fe_copy_ll(&minust.yplusx, &t->yminusx);
fe_copy_ll(&minust.yminusx, &t->yplusx);
fe yplusx, yminusx, xy2d;
fe_frombytes_strict(&yplusx, t_bytes[0]);
fe_frombytes_strict(&yminusx, t_bytes[1]);
fe_frombytes_strict(&xy2d, t_bytes[2]);
// NOTE: the input table is canonical, but types don't encode it
fe tmp;
fe_carry(&tmp, &t->xy2d);
fe_neg(&minust.xy2d, &tmp);
fe_copy_lt(&t->yplusx, &yplusx);
fe_copy_lt(&t->yminusx, &yminusx);
fe_copy_lt(&t->xy2d, &xy2d);
cmov(t, &minust, bnegative);
ge_precomp minust;
fe_copy_lt(&minust.yplusx, &yminusx);
fe_copy_lt(&minust.yminusx, &yplusx);
fe_neg(&minust.xy2d, &xy2d);
cmov(t, &minust, bnegative>>7);
}
// h = a * B
@ -916,7 +902,7 @@ void x25519_ge_scalarmult(ge_p2 *r, const uint8_t *scalar, const ge_p3 *A) {
ge_cached selected;
ge_cached_0(&selected);
for (j = 0; j < 16; j++) {
cmov_cached(&selected, &Ai[j], equal(j, index));
cmov_cached(&selected, &Ai[j], 1&constant_time_eq_w(index, j));
}
x25519_ge_add(&t, &u, &selected);

File diff suppressed because it is too large Load Diff

@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
# coding=utf-8
# Copyright (c) 2020, Google Inc.
#
@ -14,7 +14,7 @@
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
import StringIO
from io import StringIO
import subprocess
# Base field Z_p
@ -76,12 +76,7 @@ def point_mul(s, P):
return Q
def to_bytes(x):
ret = bytearray(32)
for i in range(len(ret)):
ret[i] = x % 256
x >>= 8
assert x == 0
return ret
return x.to_bytes(32, "little")
def to_ge_precomp(P):
# typedef struct {
@ -109,6 +104,9 @@ def to_base_51(x):
assert x == 0
return ret
def to_bytes_literal(x):
    """Return a brace-enclosed, comma-separated list of the hex bytes of x.

    The bytes come from to_bytes(x); each one is formatted with hex().
    """
    hex_bytes = [hex(byte) for byte in to_bytes(x)]
    return "{" + ", ".join(hex_bytes) + "}"
def to_literal(x):
ret = "{{\n#if defined(OPENSSL_64_BIT)\n"
ret += ", ".join(map(str, to_base_51(x)))
@ -140,7 +138,7 @@ def main():
bi_precomp.append(to_ge_precomp(P))
buf = StringIO.StringIO()
buf = StringIO()
buf.write("""/* Copyright (c) 2020, Google Inc.
*
* Permission to use, copy, modify, and/or distribute this software for any
@ -190,14 +188,14 @@ static const uint8_t k25519SmallPrecomp[15 * 2 * 32] = {""")
#else
// k25519Precomp[i][j] = (j+1)*256^i*B
static const ge_precomp k25519Precomp[32][8] = {
static const uint8_t k25519Precomp[32][8][3][32] = {
""")
for child in large_precomp:
buf.write("{\n")
for val in child:
buf.write("{\n")
for term in val:
buf.write(to_literal(term) + ",\n")
buf.write(to_bytes_literal(term) + ",\n")
buf.write("},\n")
buf.write("},\n")
buf.write("""};
@ -216,7 +214,7 @@ static const ge_precomp Bi[8] = {
""")
proc = subprocess.Popen(["clang-format"], stdin=subprocess.PIPE)
proc.communicate(buf.getvalue())
proc.communicate(buf.getvalue().encode("utf8"))
if __name__ == "__main__":
main()

@ -265,15 +265,15 @@ OPENSSL_INLINE void OPENSSL_reset_malloc_counter_for_testing(void) {}
// Pointer utility functions.
// buffers_alias returns one if |a| and |b| alias and zero otherwise.
static inline int buffers_alias(const uint8_t *a, size_t a_len,
const uint8_t *b, size_t b_len) {
static inline int buffers_alias(const void *a, size_t a_bytes,
const void *b, size_t b_bytes) {
// Cast |a| and |b| to integers. In C, pointer comparisons between unrelated
// objects are undefined whereas pointer to integer conversions are merely
// implementation-defined. We assume the implementation defined it in a sane
// way.
uintptr_t a_u = (uintptr_t)a;
uintptr_t b_u = (uintptr_t)b;
return a_u + a_len > b_u && b_u + b_len > a_u;
return a_u + a_bytes > b_u && b_u + b_bytes > a_u;
}
// align_pointer returns |ptr|, advanced to |alignment|. |alignment| must be a
@ -360,6 +360,9 @@ static inline uint64_t value_barrier_u64(uint64_t a) {
return a;
}
// |value_barrier_u8| could be defined as above, but compilers other than
// clang seem to still materialize 0x00..00MM instead of reusing 0x??..??MM.
// constant_time_msb_w returns the given value with the MSB copied to all the
// other bits.
static inline crypto_word_t constant_time_msb_w(crypto_word_t a) {
@ -476,16 +479,23 @@ static inline crypto_word_t constant_time_select_w(crypto_word_t mask,
// to a cmov, it sometimes further transforms it into a branch, which we do
// not want.
//
// Adding barriers to both |mask| and |~mask| breaks the relationship between
// the two, which makes the compiler stick with bitmasks.
return (value_barrier_w(mask) & a) | (value_barrier_w(~mask) & b);
// Hiding the value of the mask from the compiler evades this transformation.
mask = value_barrier_w(mask);
return (mask & a) | (~mask & b);
}
// constant_time_select_8 acts like |constant_time_select| but operates on
// 8-bit values.
static inline uint8_t constant_time_select_8(uint8_t mask, uint8_t a,
static inline uint8_t constant_time_select_8(crypto_word_t mask, uint8_t a,
uint8_t b) {
return (uint8_t)(constant_time_select_w(mask, a, b));
// |mask| is a word instead of |uint8_t| to avoid materializing 0x000..0MM.
// Making both |mask| and its value barrier |uint8_t| would allow the compiler
// to materialize 0x????..?MM instead, but only clang is that clever.
// However, vectorization of bitwise operations seems to work better on
// |uint8_t| than a mix of |uint64_t| and |uint8_t|, so |m| is cast to
// |uint8_t| after the value barrier but before the bitwise operations.
uint8_t m = value_barrier_w(mask);
return (m & a) | (~m & b);
}
// constant_time_select_int acts like |constant_time_select| but operates on
@ -495,6 +505,34 @@ static inline int constant_time_select_int(crypto_word_t mask, int a, int b) {
(crypto_word_t)(b)));
}
// constant_time_conditional_memcpy copies |n| bytes from |src| to |dst| if
// |mask| is 0xff..ff and does nothing if |mask| is 0. The |n|-byte memory
// ranges at |dst| and |src| must not overlap, as when calling |memcpy|.
//
// Every one of the |n| bytes of |dst| is read and written back regardless of
// |mask|; only the value stored depends on it.
static inline void constant_time_conditional_memcpy(void *dst, const void *src,
const size_t n,
const crypto_word_t mask) {
// Check the no-overlap precondition (effective only where |assert| is active).
assert(!buffers_alias(dst, n, src, n));
// View both buffers as raw bytes.
uint8_t *out = (uint8_t *)dst;
const uint8_t *in = (const uint8_t *)src;
for (size_t i = 0; i < n; i++) {
// Per-byte select between the source byte and the existing destination byte;
// which one is kept is decided by |constant_time_select_8| from |mask|.
out[i] = constant_time_select_8(mask, in[i], out[i]);
}
}
// constant_time_conditional_memxor xors |n| bytes from |src| to |dst| if
// |mask| is 0xff..ff and does nothing if |mask| is 0. The |n|-byte memory
// ranges at |dst| and |src| must not overlap, as when calling |memcpy|.
//
// Every one of the |n| bytes of |dst| is read and written back regardless of
// |mask|; only the value xored in depends on it.
static inline void constant_time_conditional_memxor(void *dst, const void *src,
const size_t n,
const crypto_word_t mask) {
// Check the no-overlap precondition (effective only where |assert| is active).
assert(!buffers_alias(dst, n, src, n));
// View both buffers as raw bytes.
uint8_t *out = (uint8_t *)dst;
const uint8_t *in = (const uint8_t *)src;
for (size_t i = 0; i < n; i++) {
// |mask| is passed through |value_barrier_w| on every iteration and ANDed with
// the source byte; the word-sized result is narrowed to a byte by the
// compound assignment.
// NOTE(review): since in[i] is a single byte, presumably only the low 8 bits
// of |mask| influence the result here; confirm against |value_barrier_w|.
out[i] ^= value_barrier_w(mask) & in[i];
}
}
#if defined(BORINGSSL_CONSTANT_TIME_VALIDATION)
// CONSTTIME_SECRET takes a pointer and a number of bytes and marks that region

Loading…
Cancel
Save