P-256 assembly optimisations for Aarch64.

The ARMv8 assembly code in this commit is mostly taken from OpenSSL's `ecp_nistz256-armv8.pl` at 19e277dd19/crypto/ec/asm/ecp_nistz256-armv8.pl (see Note 1), adapting it to the implementation in p256-x86_64.c. Most of the assembly functions found in `crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl` required to support that code have their analogous functions in the imported OpenSSL ARMv8 Perl assembly implementation with the exception of the functions: - ecp_nistz256_select_w5 - ecp_nistz256_select_w7 An implementation for these functions was added. Summary of modifications to the imported code: * Renamed to `p256-armv8-asm.pl` * Modified the location of `arm-xlate.pl` and `arm_arch.h` * Replaced the `scatter-gather subroutines` with `select subroutines`. The `select subroutines` are implemented for ARMv8 similarly to their x86_64 counterparts, `ecp_nistz256_select_w5` and `ecp_nistz256_select_w7`. * `ecp_nistz256_add` is removed because it was conflicting during the static build with the function of the same name in p256-nistz.c. The latter calls another assembly function, `ecp_nistz256_point_add`. * `__ecp_nistz256_add` renamed to `__ecp_nistz256_add_to` to avoid the conflict with the function `ecp_nistz256_add` during the static build. * l. 924 `add sp,sp,#256` the calculation of the constant, 32*(12-4), is not left for the assembler to perform. Other modifications: * `beeu_mod_inverse_vartime()` was implemented for AArch64 in `p256_beeu-armv8-asm.pl` similarly to its implementation in `p256_beeu-x86_64-asm.pl`. * The files containing `p256-x86_64` in their name were renamed to, `p256-nistz` since the functions and tests defined in them are hereby running on ARMv8 as well, if enabled. * Updated `delocate.go` and `delocate.peg` to handle the offset calculation in the assembly instructions. * Regenerated `delocate.peg.go`. 
Notes: 1- The last commit in the history of the file is in master only, the previous commits are in OpenSSL 3.0.1 2- This change focuses on AArch64 (64-bit architecture of ARMv8). It does not support ARMv4 or ARMv7. Testing the performance on Armv8 platform using -DCMAKE_BUILD_TYPE=Release: Before: ``` Did 2596 ECDH P-256 operations in 1093956us (2373.0 ops/sec) Did 6996 ECDSA P-256 signing operations in 1044630us (6697.1 ops/sec) Did 2970 ECDSA P-256 verify operations in 1084848us (2737.7 ops/sec) ``` After: ``` Did 6699 ECDH P-256 operations in 1091684us (6136.4 ops/sec) Did 20000 ECDSA P-256 signing operations in 1012944us (19744.4 ops/sec) Did 7051 ECDSA P-256 verify operations in 1060000us (6651.9 ops/sec) ``` Change-Id: I9fdef12db365967a9264b5b32c07967b55ea48bd Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/51805 Reviewed-by: Adam Langley <agl@google.com> Commit-Queue: Adam Langley <agl@google.com>
3 years ago · fa3fbda07b
parent f7e1a94bd9
commit fa3fbda07b
17 changed files with 2651 additions and 359 deletions
--- a/crypto/CMakeLists.txt
+++ b/crypto/CMakeLists.txt
@ -518,7 +518,7 @@ add_executable(
  fipsmodule/aes/aes_test.cc
  fipsmodule/bn/bn_test.cc
  fipsmodule/ec/ec_test.cc
-  fipsmodule/ec/p256-x86_64_test.cc
+  fipsmodule/ec/p256-nistz_test.cc
  fipsmodule/ecdsa/ecdsa_test.cc
  fipsmodule/md5/md5_test.cc
  fipsmodule/modes/gcm_test.cc
--- a/crypto/fipsmodule/CMakeLists.txt
+++ b/crypto/fipsmodule/CMakeLists.txt
@ -64,6 +64,8 @@ if(ARCH STREQUAL "aarch64")
    armv8-mont.${ASM_EXT}
    ghash-neon-armv8.${ASM_EXT}
    ghashv8-armx.${ASM_EXT}
+    p256-armv8-asm.${ASM_EXT}
+    p256_beeu-armv8-asm.${ASM_EXT}
    sha1-armv8.${ASM_EXT}
    sha256-armv8.${ASM_EXT}
    sha512-armv8.${ASM_EXT}
@ -102,6 +104,8 @@ perlasm(md5-586.${ASM_EXT} md5/asm/md5-586.pl)
 perlasm(md5-x86_64.${ASM_EXT} md5/asm/md5-x86_64.pl)
 perlasm(p256-x86_64-asm.${ASM_EXT} ec/asm/p256-x86_64-asm.pl)
 perlasm(p256_beeu-x86_64-asm.${ASM_EXT} ec/asm/p256_beeu-x86_64-asm.pl)
+perlasm(p256-armv8-asm.${ASM_EXT} ec/asm/p256-armv8-asm.pl)
+perlasm(p256_beeu-armv8-asm.${ASM_EXT} ec/asm/p256_beeu-armv8-asm.pl)
 perlasm(rdrand-x86_64.${ASM_EXT} rand/asm/rdrand-x86_64.pl)
 perlasm(rsaz-avx2.${ASM_EXT} bn/asm/rsaz-avx2.pl)
 perlasm(sha1-586.${ASM_EXT} sha/asm/sha1-586.pl)
--- a/crypto/fipsmodule/bcm.c
+++ b/crypto/fipsmodule/bcm.c
@ -71,7 +71,7 @@
 #include "ec/oct.c"
 #include "ec/p224-64.c"
 #include "ec/p256.c"
-#include "ec/p256-x86_64.c"
+#include "ec/p256-nistz.c"
 #include "ec/scalar.c"
 #include "ec/simple.c"
 #include "ec/simple_mul.c"
--- a/crypto/fipsmodule/ec/asm/p256-armv8-asm.pl
+++ b/crypto/fipsmodule/ec/asm/p256-armv8-asm.pl
--- a/crypto/fipsmodule/ec/asm/p256_beeu-armv8-asm.pl
+++ b/crypto/fipsmodule/ec/asm/p256_beeu-armv8-asm.pl
@ -0,0 +1,455 @@
+# Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+#
+#
+# This code is based on p256_beeu-x86_64-asm.pl (which is based on BN_mod_inverse_odd).
+#
+
+# The first two arguments should always be the flavour and output file path.
+if ($#ARGV < 1) { die "Not enough arguments provided.
+  Two arguments are necessary: the flavour and the output file path."; }
+
+$flavour = shift;
+$output  = shift;
+
+# Locate arm-xlate.pl: first next to this script, then in ../../../perlasm
+# relative to the directory of this script.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+# Everything printed to STDOUT from here on is piped through arm-xlate.pl.
+# The translator and output paths are quoted so that paths containing spaces
+# survive the shell, and a failure to start the pipe is reported instead of
+# being silently ignored.
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+  or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+#############################################################################
+# extern int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS],
+#                                     BN_ULONG a[P256_LIMBS],
+#                                     BN_ULONG n[P256_LIMBS]);
+#
+# (Binary Extended GCD (Euclidean) Algorithm.
+#  See A. Menezes, P. van Oorschot, and S. Vanstone's Handbook of Applied Cryptography,
+#  Chapter 14, Algorithm 14.61 and Note 14.64
+#  http://cacr.uwaterloo.ca/hac/about/chap14.pdf)
+
+# Assumption 1: n is odd for the BEEU
+# Assumption 2: 1 < a < n < 2^256
+
+# Details
+# The inverse of x modulo y can be calculated using Alg. 14.61, where "a" would be that inverse.
+# In other words,
+# ax == 1 (mod y) (where the symbol "==" denotes "congruent")
+#  a == x^{-1} (mod y)
+#
+# It can be shown that throughout all the iterations of the algorithm, the following holds:
+#    u = Ax + By
+#    v = Cx + Dy
+# The values B and D are not of interest in this case, so they need not be computed by the algorithm.
+# This means the following congruences hold through the iterations of the algorithm.
+#    Ax == u (mod y)
+#    Cx == v (mod y)
+
+# Now we will modify the notation to match that of BN_mod_inverse_odd()
+# on which beeu_mod_inverse_vartime() in `p256_beeu-x86_64-asm` is based.
+# In those functions:
+#    x, y -> a, n
+#    u, v -> B, A
+#    A, C -> X, Y’, where Y’ = -Y
+# Hence, the following holds throughout the algorithm iterations
+#    Xa == B (mod n)
+#   -Ya == A (mod n)
+#
+# Same algorithm in Python:
+# def beeu(a, n):
+#     X = 1
+#     Y = 0
+#     B = a
+#     A = n
+#     while (B != 0):
+#         while (B % 2) == 0:
+#             B >>= 1
+#             if (X % 2) == 1:
+#                 X = X + n
+#             X >>= 1
+#         while (A % 2) == 0:
+#             A >>= 1
+#             if (Y % 2) == 1:
+#                 Y = Y + n
+#             Y >>= 1
+#         if (B >= A):
+#             B = B - A
+#             X = X + Y
+#         else:
+#             A = A - B
+#             Y = Y + X
+#     if (A != 1):
+#         # error
+#         return 0
+#     else:
+#         while (Y > n):
+#             Y = Y - n
+#         Y = n - Y
+#         return Y
+
+
+# For the internal variables,
+# x0-x2, x30 are used to hold the modulus n. The input parameters passed in
+# x1,x2 are copied first before corrupting them. x0 (out) is stored on the stack.
+# x3-x7 are used for parameters, which is not the case in this function, so they are corruptible
+# x8 is corruptible here
+# (the function doesn't return a struct, hence x8 doesn't contain a passed-in address
+#  for that struct).
+# x9-x15 are corruptible registers
+# x19-x28 are callee-saved registers
+
+# X/Y will hold the inverse parameter
+# Assumption: a,n,X,Y < 2^(256)
+# Initially, X := 1, Y := 0
+#            A := n, B := a
+
+# Register allocation.
+# Function parameters arrive in x0-x2 (as per the Procedure Call Standard).
+my ($out, $a_in, $n_in) = qw(x0 x1 x2);
+# The modulus n reuses the argument registers, plus x30.
+my ($n0, $n1, $n2, $n3) = qw(x0 x1 x2 x30);
+# X and Y take five limbs each.
+my ($x0, $x1, $x2, $x3, $x4) = qw(x3 x4 x5 x6 x7);
+my ($y0, $y1, $y2, $y3, $y4) = qw(x8 x9 x10 x11 x12);
+# Shift counter and temporaries.
+my $shift = "x13";
+my ($t0, $t1, $t2, $t3) = qw(x14 x15 x19 x20);
+# A and B take four limbs each.
+my ($a0, $a1, $a2, $a3) = qw(x21 x22 x23 x24);
+my ($b0, $b1, $b2, $b3) = qw(x25 x26 x27 x28);
+
+# TEST_B_ZERO: emit the loop-exit test, i.e. branch to .Lbeeu_loop_end when
+# all four limbs of B are zero. As a side effect the bit-reversed low limb
+# of B is left in t1, ready for the clz that follows the macro.
+sub TEST_B_ZERO {
+  my $asm = <<___;
+    orr     $t0, $b0, $b1
+    orr     $t0, $t0, $b2
+
+    // reverse the bit order of $b0. This is needed for clz after this macro
+    rbit     $t1, $b0
+
+    orr     $t0, $t0, $b3
+    cbz     $t0,.Lbeeu_loop_end
+___
+  return $asm;
+}
+
+# SHIFT1(var0, ..., var4): emit code that shifts the 5-limb value var4..var0
+# right by 1 bit, adding the modulus n first if the value is odd:
+#     if least_sig_bit(var0) == 1, add n
+#     shift right by one bit
+# Every expansion gets its own label, .Lshift1_<ctr>, for skipping the addition.
+# Prerequisite: t0 = 0 (it is added, with carry, to the top limb var4).
+$g_next_label = 0;
+sub SHIFT1 {
+  my ($var0, $var1, $var2, $var3, $var4) = @_;
+  my $label = ".Lshift1_" . $g_next_label++;
+  my $asm = <<___;
+    tbz     $var0, #0, $label
+    adds    $var0, $var0, $n0
+    adcs    $var1, $var1, $n1
+    adcs    $var2, $var2, $n2
+    adcs    $var3, $var3, $n3
+    adc     $var4, $var4, $t0
+$label:
+    // var0 := [var1|var0]<64..1>;
+    // i.e. concatenate var1 and var0,
+    //      extract bits <64..1> from the resulting 128-bit value
+    //      and put them in var0
+    extr    $var0, $var1, $var0, #1
+    extr    $var1, $var2, $var1, #1
+    extr    $var2, $var3, $var2, #1
+    extr    $var3, $var4, $var3, #1
+    lsr     $var4, $var4, #1
+___
+  return $asm;
+}
+
+# SHIFT256(var0, ..., var3): emit code that shifts the 4-limb value
+# var3..var0 right by "$shift" bits. The sequence follows the compilation
+# by clang 10.0.0 with -O2/-O3 of
+#      a[0] = (a[0] >> count) | (a[1] << (64-count));
+#      a[1] = (a[1] >> count) | (a[2] << (64-count));
+#      a[2] = (a[2] >> count) | (a[3] << (64-count));
+#      a[3] >>= count;
+# Note: the EXTR instruction used in SHIFT1 is similar to x86_64's SHRDQ,
+# except that EXTR only accepts its shift amount as an immediate;
+# that's why it cannot be used here, where $shift is a variable.
+#
+# In the following,
+# t0 := 0 - shift
+#
+# then var0, for example, will be shifted right as follows:
+# var0 := (var0 >> (uint(shift) mod 64)) | (var1 << (uint(t0) mod 64))
+# "uint() mod 64" is from the definition of the LSL and LSR instructions.
+# Because of that "mod 64", the emitted code is a correct 256-bit shift
+# only for 0 < shift < 64.
+#
+# What matters here is the order of instructions relative to certain other
+# instructions, i.e.
+# - lsr and lsl must precede the orr of the corresponding registers.
+# - the lsl that reads a register must precede the lsr that overwrites it.
+# The overall order of the instructions was chosen to try and maximize
+# the pipeline usage.
+sub SHIFT256 {
+  my ($var0, $var1, $var2, $var3) = @_;
+  my $asm = <<___;
+    neg $t0, $shift
+    lsr $var0, $var0, $shift
+    lsl $t1, $var1, $t0
+
+    lsr $var1, $var1, $shift
+    lsl $t2, $var2, $t0
+
+    orr $var0, $var0, $t1
+
+    lsr $var2, $var2, $shift
+    lsl $t3, $var3, $t0
+
+    orr $var1, $var1, $t2
+
+    lsr $var3, $var3, $shift
+
+    orr $var2, $var2, $t3
+___
+  return $asm;
+}
+
+# Assembly template of beeu_mod_inverse_vartime(out, a, n).
+# It stores the inverse of a modulo n in out and returns 1, or returns 0
+# when the final check A == 1 fails.
+$code.=<<___;
+#include "openssl/arm_arch.h"
+
+.text
+.globl  beeu_mod_inverse_vartime
+.type   beeu_mod_inverse_vartime, %function
+.align  4
+beeu_mod_inverse_vartime:
+    // Reserve enough space for 14 8-byte registers on the stack
+    // in the first stp call for x29, x30.
+    // Then store the remaining callee-saved registers, followed by x0 and x2.
+    //
+    //    | x29 | x30 | x19 | x20 | ... | x27 | x28 |  x0 |  x2 |
+    //    ^                                                     ^
+    //    sp  <------------------- 112 bytes ----------------> old sp
+    //   x29 (FP)
+    //
+    AARCH64_SIGN_LINK_REGISTER
+    stp     x29,x30,[sp,#-112]!
+    add     x29,sp,#0
+    stp     x19,x20,[sp,#16]
+    stp     x21,x22,[sp,#32]
+    stp     x23,x24,[sp,#48]
+    stp     x25,x26,[sp,#64]
+    stp     x27,x28,[sp,#80]
+    stp     x0,x2,[sp,#96]
+
+    // B = b3..b0 := a
+    ldp     $b0,$b1,[$a_in]
+    ldp     $b2,$b3,[$a_in,#16]
+
+    // n3..n0 := n
+    // Note: the registers holding the input params are overwritten in the following.
+    ldp     $n0,$n1,[$n_in]
+    ldp     $n2,$n3,[$n_in,#16]
+
+    // A = a3..a0 := n
+    mov     $a0, $n0
+    mov     $a1, $n1
+    mov     $a2, $n2
+    mov     $a3, $n3
+
+    // X = x4..x0 := 1
+    mov     $x0, #1
+    eor     $x1, $x1, $x1
+    eor     $x2, $x2, $x2
+    eor     $x3, $x3, $x3
+    eor     $x4, $x4, $x4
+
+    // Y = y4..y0 := 0
+    eor     $y0, $y0, $y0
+    eor     $y1, $y1, $y1
+    eor     $y2, $y2, $y2
+    eor     $y3, $y3, $y3
+    eor     $y4, $y4, $y4
+
+.Lbeeu_loop:
+    // if B == 0, jump to .Lbeeu_loop_end
+    ${\TEST_B_ZERO}
+
+    // 0 < B < |n|,
+    // 0 < A <= |n|,
+    // (1)      X*a  ==  B   (mod |n|),
+    // (2) (-1)*Y*a  ==  A   (mod |n|)
+
+    // Now divide B by the maximum possible power of two in the
+    // integers, and divide X by the same value mod |n|.
+    // When we're done, (1) still holds.
+
+    // shift := number of trailing 0s in $b0
+    // (      = number of leading 0s in $t1; see the "rbit" instruction in TEST_B_ZERO)
+    clz     $shift, $t1
+
+    // If there is no shift, goto shift_A_Y
+    cbz     $shift, .Lbeeu_shift_A_Y
+
+    // clz returns 64 when $b0 == 0, which is possible although B != 0.
+    // SHIFT256 relies on register-controlled lsl/lsr, which take the shift
+    // amount modulo 64, so it would not shift at all for a shift of 64.
+    // Cap the shift at 63 bits. B may then still be even, which is fine:
+    // (1) and (2) are preserved by the update step whatever the parity of B.
+    mov     $t0, #63
+    cmp     $shift, #63
+    csel    $shift, $t0, $shift, hi
+
+    // Shift B right by "$shift" bits
+    ${\SHIFT256($b0, $b1, $b2, $b3)}
+
+    // Shift X right by "$shift" bits, adding n whenever X becomes odd.
+    // $shift--;
+    // $t0 := 0; needed in the addition to the most significant word in SHIFT1
+    eor     $t0, $t0, $t0
+.Lbeeu_shift_loop_X:
+    ${\SHIFT1($x0, $x1, $x2, $x3, $x4)}
+    subs    $shift, $shift, #1
+    bne     .Lbeeu_shift_loop_X
+
+    // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl
+    // with the following differences:
+    // - "$shift" is set directly to the number of trailing 0s in B
+    //   (using rbit and clz instructions)
+    // - The loop is only used to call SHIFT1(X)
+    //   and $shift is decreased while executing the X loop.
+    // - SHIFT256(B, $shift) is performed before right-shifting X; they are independent
+
+.Lbeeu_shift_A_Y:
+    // Same for A and Y.
+    // Afterwards, (2) still holds.
+    // Reverse the bit order of $a0
+    // $shift := number of trailing 0s in $a0 (= number of leading 0s in $t1)
+    rbit    $t1, $a0
+    clz     $shift, $t1
+
+    // If there is no shift, goto |B-A|, X+Y update
+    cbz     $shift, .Lbeeu_update_B_X_or_A_Y
+
+    // As for B above: clz returns 64 when $a0 == 0, so cap the shift at 63 bits.
+    mov     $t0, #63
+    cmp     $shift, #63
+    csel    $shift, $t0, $shift, hi
+
+    // Shift A right by "$shift" bits
+    ${\SHIFT256($a0, $a1, $a2, $a3)}
+
+    // Shift Y right by "$shift" bits, adding n whenever Y becomes odd.
+    // $shift--;
+    // $t0 := 0; needed in the addition to the most significant word in SHIFT1
+    eor     $t0, $t0, $t0
+.Lbeeu_shift_loop_Y:
+    ${\SHIFT1($y0, $y1, $y2, $y3, $y4)}
+    subs    $shift, $shift, #1
+    bne     .Lbeeu_shift_loop_Y
+
+.Lbeeu_update_B_X_or_A_Y:
+    // Try T := B - A; if cs, continue with B >= A (cs: carry set = no borrow)
+    // Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words
+    //       without taking a sign bit if generated. The lack of a carry would
+    //       indicate a negative result. See, for example,
+    //       https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes
+    subs    $t0, $b0, $a0
+    sbcs    $t1, $b1, $a1
+    sbcs    $t2, $b2, $a2
+    sbcs    $t3, $b3, $a3
+    bcs     .Lbeeu_B_greater_than_A
+
+    // Else A > B =>
+    // A := A - B; Y := Y + X; goto beginning of the loop
+    subs    $a0, $a0, $b0
+    sbcs    $a1, $a1, $b1
+    sbcs    $a2, $a2, $b2
+    sbcs    $a3, $a3, $b3
+
+    adds    $y0, $y0, $x0
+    adcs    $y1, $y1, $x1
+    adcs    $y2, $y2, $x2
+    adcs    $y3, $y3, $x3
+    adc     $y4, $y4, $x4
+    b       .Lbeeu_loop
+
+.Lbeeu_B_greater_than_A:
+    // Continue with B >= A =>
+    // B := B - A; X := X + Y; goto beginning of the loop
+    mov     $b0, $t0
+    mov     $b1, $t1
+    mov     $b2, $t2
+    mov     $b3, $t3
+
+    adds    $x0, $x0, $y0
+    adcs    $x1, $x1, $y1
+    adcs    $x2, $x2, $y2
+    adcs    $x3, $x3, $y3
+    adc     $x4, $x4, $y4
+    b       .Lbeeu_loop
+
+.Lbeeu_loop_end:
+    // The Euclid's algorithm loop ends when A == gcd(a,n);
+    // this would be 1, when a and n are co-prime (i.e. do not have a common factor).
+    // Since (-1)*Y*a == A (mod |n|), Y>0
+    // then out = -Y mod n
+
+    // Verify that A = 1 ==> (-1)*Y*a = A = 1  (mod |n|)
+    // Is A-1 == 0?
+    // If not, fail.
+    sub     $t0, $a0, #1
+    orr     $t0, $t0, $a1
+    orr     $t0, $t0, $a2
+    orr     $t0, $t0, $a3
+    cbnz    $t0, .Lbeeu_err
+
+    // While Y >= n ==> Y := Y - n
+.Lbeeu_reduction_loop:
+    // x_i := y_i - n_i (X is no longer needed, use it as temp)
+    // ($t0 = 0 from above)
+    subs    $x0, $y0, $n0
+    sbcs    $x1, $y1, $n1
+    sbcs    $x2, $y2, $n2
+    sbcs    $x3, $y3, $n3
+    sbcs    $x4, $y4, $t0
+
+    // If result is non-negative (i.e., cs = carry set = no borrow),
+    // y_i := x_i; goto reduce again
+    // else
+    // y_i := y_i; continue
+    csel    $y0, $x0, $y0, cs
+    csel    $y1, $x1, $y1, cs
+    csel    $y2, $x2, $y2, cs
+    csel    $y3, $x3, $y3, cs
+    csel    $y4, $x4, $y4, cs
+    bcs     .Lbeeu_reduction_loop
+
+    // Now Y < n (Y cannot be equal to n, since the inverse cannot be 0)
+    // out = -Y = n-Y
+    subs    $y0, $n0, $y0
+    sbcs    $y1, $n1, $y1
+    sbcs    $y2, $n2, $y2
+    sbcs    $y3, $n3, $y3
+
+    // Save Y in output (out (x0) was saved on the stack)
+    ldr     x3, [sp,#96]
+    stp     $y0, $y1, [x3]
+    stp     $y2, $y3, [x3,#16]
+    // return 1 (success)
+    mov     x0, #1
+    b       .Lbeeu_finish
+
+.Lbeeu_err:
+    // return 0 (error)
+    eor     x0, x0, x0
+
+.Lbeeu_finish:
+    // Restore callee-saved registers; the saved x0, x2 are not reloaded
+    add     sp,x29,#0
+    ldp     x19,x20,[sp,#16]
+    ldp     x21,x22,[sp,#32]
+    ldp     x23,x24,[sp,#48]
+    ldp     x25,x26,[sp,#64]
+    ldp     x27,x28,[sp,#80]
+    ldp     x29,x30,[sp],#112
+
+    AARCH64_VALIDATE_LINK_REGISTER
+    ret
+.size beeu_mod_inverse_vartime,.-beeu_mod_inverse_vartime
+___
+
+
+# Emit the generated code line by line, evaluating any `...` expression
+# found in a line before printing it.
+for my $line (split("\n",$code)) {
+    $line =~ s/\`([^\`]*)\`/eval $1/ge;
+
+    print $line,"\n";
+}
+close STDOUT or die "error closing STDOUT: $!"; # enforce flush
--- a/crypto/fipsmodule/ec/ec.c
+++ b/crypto/fipsmodule/ec/ec.c
@ -246,7 +246,8 @@ DEFINE_METHOD_FUNCTION(struct built_in_curves, OPENSSL_built_in_curves) {
  out->curves[2].param_len = 32;
  out->curves[2].params = kP256Params;
  out->curves[2].method =
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+#if !defined(OPENSSL_NO_ASM) && \
+    (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) &&   \
    !defined(OPENSSL_SMALL)
      EC_GFp_nistz256_method();
 #else
--- a/crypto/fipsmodule/ec/make_p256-x86_64-tests.go
+++ b/crypto/fipsmodule/ec/make_p256-x86_64-tests.go
@ -69,7 +69,7 @@ func fromMontgomery(z, x *big.Int) *big.Int {

 func isAffineInfinity(x, y *big.Int) bool {
 	// Infinity, in affine coordinates, is represented as (0, 0) by
-	// both Go and p256-x86_64-asm.pl.
+	// Go, p256-x86_64-asm.pl and p256-armv8-asm.pl alike.
 	return x.Sign() == 0 && y.Sign() == 0
 }

@ -107,8 +107,8 @@ func toJacobian(xIn, yIn *big.Int) (x, y, z *big.Int) {
 		// arbitrary X and Y and include the special case. We also have
 		// not verified that add and double preserve this
 		// property. Thus, generate test vectors with unrelated X and Y,
-		// to test that p256-x86_64-asm.pl correctly handles
-		// unconstrained representations of infinity.
+		// to test that p256-x86_64-asm.pl and p256-armv8-asm.pl correctly
+		// handle unconstrained representations of infinity.
 		x = randNonZeroInt(p)
 		y = randNonZeroInt(p)
 		z = zero
--- a/crypto/fipsmodule/ec/make_tables.go
+++ b/crypto/fipsmodule/ec/make_tables.go
@ -23,8 +23,8 @@ import (
 )

 func main() {
-	if err := writeP256X86_64Table("p256-x86_64-table.h"); err != nil {
-		fmt.Fprintf(os.Stderr, "Error writing p256-x86_64-table.h: %s\n", err)
+	if err := writeP256NistzTable("p256-nistz-table.h"); err != nil {
+		fmt.Fprintf(os.Stderr, "Error writing p256-nistz-table.h: %s\n", err)
 		os.Exit(1)
 	}

@ -34,7 +34,7 @@ func main() {
 	}
 }

-func writeP256X86_64Table(path string) error {
+func writeP256NistzTable(path string) error {
 	curve := elliptic.P256()
 	tables := make([][][2]*big.Int, 0, 37)
 	for shift := 0; shift < 256; shift += 7 {
@ -59,7 +59,7 @@ func writeP256X86_64Table(path string) error {
 */

 // This is the precomputed constant time access table for the code in
-// p256-x86_64.c, for the default generator. The table consists of 37
+// p256-nistz.c, for the default generator. The table consists of 37
 // subtables, each subtable contains 64 affine points. The affine points are
 // encoded as eight uint64's, four for the x coordinate and four for the y.
 // Both values are in little-endian order. There are 37 tables because a
--- a/crypto/fipsmodule/ec/p256-x86_64-table.h
+++ b/crypto/fipsmodule/ec/p256-x86_64-table.h
@ -9,7 +9,7 @@
 */

 // This is the precomputed constant time access table for the code in
-// p256-x86_64.c, for the default generator. The table consists of 37
+// p256-nistz.c, for the default generator. The table consists of 37
 // subtables, each subtable contains 64 affine points. The affine points are
 // encoded as eight uint64's, four for the x coordinate and four for the y.
 // Both values are in little-endian order. There are 37 tables because a
--- a/crypto/fipsmodule/ec/p256-x86_64.c
+++ b/crypto/fipsmodule/ec/p256-x86_64.c
@ -30,10 +30,10 @@
 #include "../delocate.h"
 #include "../../internal.h"
 #include "internal.h"
-#include "p256-x86_64.h"
+#include "p256-nistz.h"

-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+#if !defined(OPENSSL_NO_ASM) &&  \
+    (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) &&    \
    !defined(OPENSSL_SMALL)

 typedef P256_POINT_AFFINE PRECOMP256_ROW[64];
@ -45,7 +45,7 @@ static const BN_ULONG ONE[P256_LIMBS] = {
 };

 // Precomputed tables for the default generator
-#include "p256-x86_64-table.h"
+#include "p256-nistz-table.h"

 // Recode window to a signed digit, see |ec_GFp_nistp_recode_scalar_bits| in
 // util.c for details
@ -554,10 +554,12 @@ static void ecp_nistz256_inv0_mod_ord(const EC_GROUP *group, EC_SCALAR *out,
 static int ecp_nistz256_scalar_to_montgomery_inv_vartime(const EC_GROUP *group,
                                                 EC_SCALAR *out,
                                                 const EC_SCALAR *in) {
+#if defined(OPENSSL_X86_64)
  if (!CRYPTO_is_AVX_capable()) {
    // No AVX support; fallback to generic code.
    return ec_simple_scalar_to_montgomery_inv_vartime(group, out, in);
  }
+#endif

  assert(group->order.width == P256_LIMBS);
  if (!beeu_mod_inverse_vartime(out->words, in->words, group->order.d)) {
@ -628,5 +630,6 @@ DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistz256_method) {
  out->cmp_x_coordinate = ecp_nistz256_cmp_x_coordinate;
 }

-#endif /* !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+#endif /* !defined(OPENSSL_NO_ASM) && \
+          (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) &&  \
          !defined(OPENSSL_SMALL) */
--- a/crypto/fipsmodule/ec/p256-x86_64.h
+++ b/crypto/fipsmodule/ec/p256-x86_64.h
@ -30,7 +30,8 @@ extern "C" {
 #endif


-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+#if !defined(OPENSSL_NO_ASM) && \
+    (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) &&   \
    !defined(OPENSSL_SMALL)

 // P-256 field operations.
@ -142,7 +143,8 @@ void ecp_nistz256_point_add(P256_POINT *r, const P256_POINT *a,
 void ecp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a,
                                   const P256_POINT_AFFINE *b);

-#endif /* !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+#endif /* !defined(OPENSSL_NO_ASM) && \
+          (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) &&   \
          !defined(OPENSSL_SMALL) */


--- a/crypto/fipsmodule/ec/p256-x86_64_test.cc
+++ b/crypto/fipsmodule/ec/p256-x86_64_test.cc
@ -30,15 +30,16 @@
 #include "../../test/abi_test.h"
 #include "../../test/file_test.h"
 #include "../../test/test_util.h"
-#include "p256-x86_64.h"
+#include "p256-nistz.h"


 // Disable tests if BORINGSSL_SHARED_LIBRARY is defined. These tests need access
 // to internal functions.
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+#if !defined(OPENSSL_NO_ASM) &&  \
+    (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) &&  \
    !defined(OPENSSL_SMALL) && !defined(BORINGSSL_SHARED_LIBRARY)

-TEST(P256_X86_64Test, SelectW5) {
+TEST(P256_NistzTest, SelectW5) {
  // Fill a table with some garbage input.
  alignas(64) P256_POINT table[16];
  for (size_t i = 0; i < 16; i++) {
@ -68,7 +69,7 @@ TEST(P256_X86_64Test, SelectW5) {
  CHECK_ABI(ecp_nistz256_select_w5, &val, table, 7);
 }

-TEST(P256_X86_64Test, SelectW7) {
+TEST(P256_NistzTest, SelectW7) {
  // Fill a table with some garbage input.
  alignas(64) P256_POINT_AFFINE table[64];
  for (size_t i = 0; i < 64; i++) {
@ -97,11 +98,13 @@ TEST(P256_X86_64Test, SelectW7) {
  CHECK_ABI(ecp_nistz256_select_w7, &val, table, 42);
 }

-TEST(P256_X86_64Test, BEEU) {
+TEST(P256_NistzTest, BEEU) {
+#if defined(OPENSSL_X86_64)
  if (!CRYPTO_is_AVX_capable()) {
    // No AVX support; cannot run the BEEU code.
    return;
  }
+#endif

  bssl::UniquePtr<EC_GROUP> group(
      EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1));
@ -483,8 +486,8 @@ static void TestOrdMulMont(FileTest *t) {
  }
 }

-TEST(P256_X86_64Test, TestVectors) {
-  return FileTestGTest("crypto/fipsmodule/ec/p256-x86_64_tests.txt",
+TEST(P256_NistzTest, TestVectors) {
+  return FileTestGTest("crypto/fipsmodule/ec/p256-nistz_tests.txt",
                       [](FileTest *t) {
    if (t->GetParameter() == "Negate") {
      TestNegate(t);
@ -503,7 +506,7 @@ TEST(P256_X86_64Test, TestVectors) {
 }

 // Instrument the functions covered in TestVectors for ABI checking.
-TEST(P256_X86_64Test, ABI) {
+TEST(P256_NistzTest, ABI) {
  BN_ULONG a[P256_LIMBS], b[P256_LIMBS], c[P256_LIMBS];
  OPENSSL_memset(a, 0x01, sizeof(a));
  // These functions are all constant-time, so it is only necessary to
--- a/crypto/fipsmodule/ec/p256-x86_64_tests.txt
+++ b/crypto/fipsmodule/ec/p256-x86_64_tests.txt
--- a/sources.cmake
+++ b/sources.cmake
@ -49,7 +49,7 @@ set(
  crypto/fipsmodule/bn/bn_tests.txt
  crypto/fipsmodule/bn/miller_rabin_tests.txt
  crypto/fipsmodule/ec/ec_scalar_base_mult_tests.txt
-  crypto/fipsmodule/ec/p256-x86_64_tests.txt
+  crypto/fipsmodule/ec/p256-nistz_tests.txt
  crypto/fipsmodule/ecdsa/ecdsa_sign_tests.txt
  crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt
  crypto/fipsmodule/modes/gcm_tests.txt
--- a/util/fipstools/delocate/delocate.go
+++ b/util/fipstools/delocate/delocate.go
@ -509,7 +509,7 @@ func (d *delocation) processAarch64Instruction(statement, instruction *node32) (
 				// This is a branch. Either the target needs to be written to a local
 				// version of the symbol to ensure that no relocations are emitted, or
 				// it needs to jump to a redirector function.
-				symbol, _, _, didChange, symbolIsLocal, _ := d.parseMemRef(arg.up)
+				symbol, offset, _, didChange, symbolIsLocal, _ := d.parseMemRef(arg.up)
 				changed = didChange

 				if _, knownSymbol := d.symbols[symbol]; knownSymbol {
@ -520,6 +520,13 @@ func (d *delocation) processAarch64Instruction(statement, instruction *node32) (
 					d.redirectors[symbol] = redirector
 					symbol = redirector
 					changed = true
+				} else if didChange && symbolIsLocal && len(offset) > 0 {
+					// didChange is set when the inputFile index is not 0. Index 0 is the
+					// first file copied to the output, i.e. the generated assembly of bcm.c.
+					// In subsequently copied assembly files, local symbols are changed by appending (BCM_ + index)
+					// in order to ensure they don't collide. `index` gets incremented per file.
+					// If there is an offset after the symbol, append the `offset`.
+					symbol = symbol + offset
 				}

 				args = append(args, symbol)
--- a/util/fipstools/delocate/delocate.peg
+++ b/util/fipstools/delocate/delocate.peg
@ -94,7 +94,7 @@ MemoryRef <- (SymbolRef BaseIndexScale /
              BaseIndexScale)
 SymbolRef <- (Offset* '+')? (LocalSymbol / SymbolName) Offset* ('@' Section Offset*)?
 Low12BitsSymbolRef <- ":lo12:" (LocalSymbol / SymbolName) Offset?
-ARMBaseIndexScale <- '[' ARMRegister (',' WS? (('#' Offset ('*' [0-9]+)? ) / ARMGOTLow12 / Low12BitsSymbolRef / ARMRegister) (',' WS? ARMConstantTweak)?)? ']' ARMPostincrement?
+ARMBaseIndexScale <- '[' ARMRegister (',' WS? (('#' Offset (('*' [0-9]+) / ('*' '(' [0-9]+ Operator [0-9]+ ')') / (('+' [0-9]+)*))? ) / ARMGOTLow12 / Low12BitsSymbolRef / ARMRegister) (',' WS? ARMConstantTweak)?)? ']' ARMPostincrement?
 ARMGOTLow12 <- ":got_lo12:" SymbolName
 ARMPostincrement <- '!'
 BaseIndexScale <- '(' RegisterOrConstant? WS? (',' WS? RegisterOrConstant WS? (',' [0-9]+)? )? ')'
--- a/util/fipstools/delocate/delocate.peg.go
+++ b/util/fipstools/delocate/delocate.peg.go