Mirror of BoringSSL (grpc依赖)
https://boringssl.googlesource.com/boringssl
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
403 lines
9.1 KiB
403 lines
9.1 KiB
# Copyright (c) 2018, Amazon Inc. |
|
# |
|
# Permission to use, copy, modify, and/or distribute this software for any |
|
# purpose with or without fee is hereby granted, provided that the above |
|
# copyright notice and this permission notice appear in all copies. |
|
# |
|
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
|
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
|
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY |
|
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
|
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION |
|
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
|
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
# |
|
# Written by Nir Drucker, and Shay Gueron |
|
# AWS Cryptographic Algorithms Group |
|
# (ndrucker@amazon.com, gueron@amazon.com) |
|
# based on BN_mod_inverse_odd |
|
|
|
# Command line: [flavour] output
#   flavour - assembler flavour handed on to x86_64-xlate.pl.
#   output  - output file name handed on to x86_64-xlate.pl.
# When the first argument contains a dot it is taken to be the output file
# name and the flavour is left undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64 = 0;
$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Directory part of this script's own path, including the trailing separator.
# An empty string is used when the script was invoked without a directory
# component, so the relative lookups below are made from the current directory.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Look for the translator next to this script first, then in the shared
# perlasm directory three levels up.
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# Fail loudly if the child process cannot be started instead of silently
# producing no output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
|
|
|
############################################################################# |
|
# extern int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS], |
|
# BN_ULONG a[P256_LIMBS], |
|
# BN_ULONG n[P256_LIMBS]); |
|
# |
|
# (Binary Extended Euclidean Algorithm. |
|
# See https://en.wikipedia.org/wiki/Binary_GCD_algorithm) |
|
# |
|
# Assumption 1: n is odd for the BEEU |
|
# Assumption 2: 1 < a < n < 2^256 |
|
|
|
# Function arguments.
# These are package globals on purpose: the helper subs and the assembly
# template in this file interpolate them by name.
# Note that $a is also the name Perl's sort() uses for its comparison
# variable; no sort() with a comparison block should be added to this file.
$out = "%rdi";
$a   = "%rsi";
$n   = "%rdx";

# X/Y will hold the inverse parameter
# Assumption: X,Y<2^(256)
# Each value is kept in five limbs (x0..x4 / y0..y4).
$x0 = "%r8";
$x1 = "%r9";
$x2 = "%r10";
$x3 = "%r11";
# borrow from out (out is needed only at the end)
$x4 = "%rdi";
$y0 = "%r12";
$y1 = "%r13";
$y2 = "%r14";
$y3 = "%r15";
$y4 = "%rbp";

# Shift counter and scratch registers.
$shift = "%rcx";
$t0    = "%rax";
$t1    = "%rbx";
# Same register as $a.
$t2    = "%rsi";
# borrow: same register as $shift.
$t3    = "%rcx";

$T0 = "%xmm0";
$T1 = "%xmm1";

# Offsets on the stack (bytes from %rsp, one 8-byte slot each).
$out_rsp   = 0;
$shift_rsp = $out_rsp+0x8;
$a_rsp0    = $shift_rsp+0x8;
$a_rsp1    = $a_rsp0+0x8;
$a_rsp2    = $a_rsp1+0x8;
$a_rsp3    = $a_rsp2+0x8;
$b_rsp0    = $a_rsp3+0x8;
$b_rsp1    = $b_rsp0+0x8;
$b_rsp2    = $b_rsp1+0x8;
$b_rsp3    = $b_rsp2+0x8;

# Borrow when a_rsp/b_rsp are no longer needed.
# The five Y slots overlay a_rsp0..a_rsp3 and b_rsp0.
$y_rsp0 = $a_rsp0;
$y_rsp1 = $y_rsp0+0x8;
$y_rsp2 = $y_rsp1+0x8;
$y_rsp3 = $y_rsp2+0x8;
$y_rsp4 = $y_rsp3+0x8;

# Total size of the stack area described above.
$last_rsp_offset = $b_rsp3+0x8;
|
|
|
# TEST_B_ZERO()
# Returns assembly text that ORs the four stack limbs of B into $t1 and jumps
# to .Lbeeu_loop_end when the result is zero, i.e. when B == 0.
# Takes no arguments; reads the globals $t1 and $b_rsp0..$b_rsp3.
# The emitted code overwrites $t1 and the flags.
sub TEST_B_ZERO {
    return <<___;
    xorq $t1, $t1
    or $b_rsp0(%rsp), $t1
    or $b_rsp1(%rsp), $t1
    or $b_rsp2(%rsp), $t1
    or $b_rsp3(%rsp), $t1
    jz .Lbeeu_loop_end
___
}
|
|
|
# Counter that gives every SHIFT1 expansion its own local label.
$g_next_label = 0;

# SHIFT1(var0, var1, var2, var3, var4)
# Returns assembly text that halves the five-limb value held in the given
# registers (var0 is the least significant limb). When the value is odd, the
# four limbs at ($n) are added first so that the sum is even (the file's
# stated assumption is that n is odd), then the whole value is shifted right
# by one bit.
# Reads the globals $t1 (scratch, overwritten by the emitted code) and $n.
# Each call consumes one value of $g_next_label for its skip label.
sub SHIFT1 {
    die "SHIFT1 expects 5 register names" unless @_ == 5;
    my ($var0, $var1, $var2, $var3, $var4) = @_;
    my $label = ".Lshift1_${g_next_label}";
    $g_next_label++;

    return <<___;
    # Ensure X is even and divide by two.
    movq \$1, $t1
    andq $var0, $t1
    jz $label
    add 0*8($n), $var0
    adc 1*8($n), $var1
    adc 2*8($n), $var2
    adc 3*8($n), $var3
    adc \$0, $var4

$label:
    shrdq \$1, $var1, $var0
    shrdq \$1, $var2, $var1
    shrdq \$1, $var3, $var2
    shrdq \$1, $var4, $var3
    shrq \$1, $var4
___
}
|
|
|
# SHIFT256(var)
# Returns assembly text that shifts the four-limb value stored on the stack at
# offset var (bytes from %rsp, least significant limb first) right by %cl
# bits, in place.
# Reads the globals $t0, $t1 and $t2; the emitted code overwrites all three.
# %rcx is deliberately left untouched because %cl carries the shift count.
sub SHIFT256 {
    die "SHIFT256 expects a stack offset" unless @_ == 1 && defined $_[0];
    my ($var) = @_;
    return <<___;
    # Copy shifted values.
    # Remember not to override t3=rcx
    movq 1*8+$var(%rsp), $t0
    movq 2*8+$var(%rsp), $t1
    movq 3*8+$var(%rsp), $t2

    shrdq %cl, $t0, 0*8+$var(%rsp)
    shrdq %cl, $t1, 1*8+$var(%rsp)
    shrdq %cl, $t2, 2*8+$var(%rsp)

    shrq %cl, $t2
    mov $t2, 3*8+$var(%rsp)
___
}
|
|
|
# Assembly template for beeu_mod_inverse_vartime(out, a, n).
#
# Data placement:
#   - B (initialised from a) and A (initialised from n) are four-limb values
#     on the stack at $b_rsp0.. and $a_rsp0.. respectively.
#   - X and Y are five-limb values kept in registers ($x0..$x4, $y0..$y4).
#   - The out pointer is spilled to $out_rsp(%rsp) right after the prologue,
#     because its register doubles as $x4 until the result is stored.
#
# Flow of the emitted code:
#   1. Prologue: push rbp, r12-r15, rbx and rsi, then reserve
#      $last_rsp_offset bytes of stack.
#   2. Main loop (.Lbeeu_loop): leave when B == 0. Otherwise strip trailing
#      zero bits from B while halving X (SHIFT1/SHIFT256), do the same for
#      A and Y, then replace the larger of A/B by the difference and add the
#      other cofactor into the matching one of X/Y.
#   3. After the loop: if A != 1 return 0 in %rax. Otherwise subtract n from
#      Y until it would borrow, store n - Y to out and return 1 in %rax.
#
# NOTE(review): the 128-bit copies use vmovdqu, which is a VEX-encoded
# instruction; callers presumably check for AVX support before calling this
# routine -- confirm against the C caller.
$code.=<<___;
.text

.type beeu_mod_inverse_vartime,\@function
.hidden beeu_mod_inverse_vartime
.globl  beeu_mod_inverse_vartime
.align 32
beeu_mod_inverse_vartime:
.cfi_startproc
    push %rbp
.cfi_push rbp
    push %r12
.cfi_push r12
    push %r13
.cfi_push r13
    push %r14
.cfi_push r14
    push %r15
.cfi_push r15
    push %rbx
.cfi_push rbx
    push %rsi
.cfi_push rsi

    sub \$$last_rsp_offset, %rsp
.cfi_adjust_cfa_offset  $last_rsp_offset
    movq $out, $out_rsp(%rsp)

    # X=1, Y=0
    movq \$1, $x0
    xorq $x1, $x1
    xorq $x2, $x2
    xorq $x3, $x3
    xorq $x4, $x4

    xorq $y0, $y0
    xorq $y1, $y1
    xorq $y2, $y2
    xorq $y3, $y3
    xorq $y4, $y4

    # Copy a/n into B/A on the stack.
    vmovdqu 0*8($a), $T0
    vmovdqu 2*8($a), $T1
    vmovdqu $T0, $b_rsp0(%rsp)
    vmovdqu $T1, $b_rsp2(%rsp)

    vmovdqu 0*8($n), $T0
    vmovdqu 2*8($n), $T1
    vmovdqu $T0, $a_rsp0(%rsp)
    vmovdqu $T1, $a_rsp2(%rsp)

.Lbeeu_loop:
    ${\TEST_B_ZERO}

    # 0 < B < |n|,
    # 0 < A <= |n|,
    # (1) X*a  ==  B   (mod |n|),
    # (2) (-1)*Y*a  ==  A   (mod |n|)

    # Now divide B by the maximum possible power of two in the
    # integers, and divide X by the same value mod |n|. When we're
    # done, (1) still holds.
    movq \$1, $shift

    # Note that B > 0
.Lbeeu_shift_loop_XB:
    movq $shift, $t1
    andq $b_rsp0(%rsp), $t1
    jnz .Lbeeu_shift_loop_end_XB

    ${\SHIFT1($x0, $x1, $x2, $x3, $x4)}
    shl \$1, $shift

    # Bound the number of single-bit shifts done in one pass: leave the loop
    # once the probe mask reaches 0x8000000. Long runs of zero bits are
    # unlikely, so the exact bound does not affect performance; any remaining
    # trailing zero bits are handled on a later pass through .Lbeeu_loop.
    cmp \$0x8000000, $shift
    jne .Lbeeu_shift_loop_XB

.Lbeeu_shift_loop_end_XB:
    bsf $shift, $shift
    test $shift, $shift
    jz .Lbeeu_no_shift_XB

    ${\SHIFT256($b_rsp0)}

.Lbeeu_no_shift_XB:
    # Same for A and Y.  Afterwards, (2) still holds.
    movq \$1, $shift

    # Note that A > 0
.Lbeeu_shift_loop_YA:
    movq $shift, $t1
    andq $a_rsp0(%rsp), $t1
    jnz .Lbeeu_shift_loop_end_YA

    ${\SHIFT1($y0, $y1, $y2, $y3, $y4)}
    shl \$1, $shift

    # Bound the number of single-bit shifts done in one pass: leave the loop
    # once the probe mask reaches 0x8000000. Long runs of zero bits are
    # unlikely, so the exact bound does not affect performance; any remaining
    # trailing zero bits are handled on a later pass through .Lbeeu_loop.
    cmp \$0x8000000, $shift
    jne .Lbeeu_shift_loop_YA

.Lbeeu_shift_loop_end_YA:
    bsf $shift, $shift
    test $shift, $shift
    jz .Lbeeu_no_shift_YA

    ${\SHIFT256($a_rsp0)}

.Lbeeu_no_shift_YA:
    # T = B-A (A,B < 2^256)
    mov $b_rsp0(%rsp), $t0
    mov $b_rsp1(%rsp), $t1
    mov $b_rsp2(%rsp), $t2
    mov $b_rsp3(%rsp), $t3
    sub $a_rsp0(%rsp), $t0
    sbb $a_rsp1(%rsp), $t1
    sbb $a_rsp2(%rsp), $t2
    sbb $a_rsp3(%rsp), $t3  # borrow from shift
    jnc .Lbeeu_B_bigger_than_A

    # A = A - B
    mov $a_rsp0(%rsp), $t0
    mov $a_rsp1(%rsp), $t1
    mov $a_rsp2(%rsp), $t2
    mov $a_rsp3(%rsp), $t3
    sub $b_rsp0(%rsp), $t0
    sbb $b_rsp1(%rsp), $t1
    sbb $b_rsp2(%rsp), $t2
    sbb $b_rsp3(%rsp), $t3
    mov $t0, $a_rsp0(%rsp)
    mov $t1, $a_rsp1(%rsp)
    mov $t2, $a_rsp2(%rsp)
    mov $t3, $a_rsp3(%rsp)

    # Y = Y + X
    add $x0, $y0
    adc $x1, $y1
    adc $x2, $y2
    adc $x3, $y3
    adc $x4, $y4
    jmp .Lbeeu_loop

.Lbeeu_B_bigger_than_A:
    # B = T = B - A
    mov $t0, $b_rsp0(%rsp)
    mov $t1, $b_rsp1(%rsp)
    mov $t2, $b_rsp2(%rsp)
    mov $t3, $b_rsp3(%rsp)

    # X = Y + X
    add $y0, $x0
    adc $y1, $x1
    adc $y2, $x2
    adc $y3, $x3
    adc $y4, $x4

    jmp .Lbeeu_loop

.Lbeeu_loop_end:
    # The Euclid's algorithm loop ends when A == beeu(a,n);
    # Therefore (-1)*Y*a == A (mod |n|), Y>0

    # Verify that A = 1 ==> (-1)*Y*a = A = 1  (mod |n|)
    mov $a_rsp0(%rsp), $t1
    sub \$1, $t1
    or $a_rsp1(%rsp), $t1
    or $a_rsp2(%rsp), $t1
    or $a_rsp3(%rsp), $t1
    # If not, fail.
    jnz .Lbeeu_err

    # From this point on, we no longer need X
    # Therefore we use it as a temporary storage.
    # X = n
    movq 0*8($n), $x0
    movq 1*8($n), $x1
    movq 2*8($n), $x2
    movq 3*8($n), $x3
    xorq $x4, $x4

.Lbeeu_reduction_loop:
    movq $y0, $y_rsp0(%rsp)
    movq $y1, $y_rsp1(%rsp)
    movq $y2, $y_rsp2(%rsp)
    movq $y3, $y_rsp3(%rsp)
    movq $y4, $y_rsp4(%rsp)

    # If Y>n ==> Y=Y-n
    sub $x0, $y0
    sbb $x1, $y1
    sbb $x2, $y2
    sbb $x3, $y3
    sbb \$0, $y4

    # Choose old Y or new Y
    cmovc $y_rsp0(%rsp), $y0
    cmovc $y_rsp1(%rsp), $y1
    cmovc $y_rsp2(%rsp), $y2
    cmovc $y_rsp3(%rsp), $y3
    jnc .Lbeeu_reduction_loop

    # X = n - Y (n, Y < 2^256), (Cancel the (-1))
    sub $y0, $x0
    sbb $y1, $x1
    sbb $y2, $x2
    sbb $y3, $x3

.Lbeeu_save:
    # Save the inverse(<2^256) to out.
    mov $out_rsp(%rsp), $out

    movq $x0, 0*8($out)
    movq $x1, 1*8($out)
    movq $x2, 2*8($out)
    movq $x3, 3*8($out)

    # Return 1.
    movq \$1, %rax
    jmp .Lbeeu_finish

.Lbeeu_err:
    # Return 0.
    xorq %rax, %rax

.Lbeeu_finish:
    add \$$last_rsp_offset, %rsp
.cfi_adjust_cfa_offset  -$last_rsp_offset
    pop %rsi
.cfi_pop rsi
    pop %rbx
.cfi_pop rbx
    pop %r15
.cfi_pop r15
    pop %r14
.cfi_pop r14
    pop %r13
.cfi_pop r13
    pop %r12
.cfi_pop r12
    pop %rbp
.cfi_pop rbp
    ret
.cfi_endproc

.size beeu_mod_inverse_vartime, .-beeu_mod_inverse_vartime
___
|
|
|
# Write the generated assembly to STDOUT, which was redirected earlier in this
# file into the x86_64-xlate.pl pipe. Buffered write errors and a failing
# translator process only surface at print/close time, so both are checked and
# the system error is included in the message.
print $code or die "error writing to STDOUT: $!";
close STDOUT or die "error closing STDOUT: $!";
|
|
|