diff --git a/crypto/CMakeLists.txt b/crypto/CMakeLists.txt index 312c08007..1cb68ca13 100644 --- a/crypto/CMakeLists.txt +++ b/crypto/CMakeLists.txt @@ -518,7 +518,7 @@ add_executable( fipsmodule/aes/aes_test.cc fipsmodule/bn/bn_test.cc fipsmodule/ec/ec_test.cc - fipsmodule/ec/p256-x86_64_test.cc + fipsmodule/ec/p256-nistz_test.cc fipsmodule/ecdsa/ecdsa_test.cc fipsmodule/md5/md5_test.cc fipsmodule/modes/gcm_test.cc diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt index 73f8a02ae..b99ebc71c 100644 --- a/crypto/fipsmodule/CMakeLists.txt +++ b/crypto/fipsmodule/CMakeLists.txt @@ -64,6 +64,8 @@ if(ARCH STREQUAL "aarch64") armv8-mont.${ASM_EXT} ghash-neon-armv8.${ASM_EXT} ghashv8-armx.${ASM_EXT} + p256-armv8-asm.${ASM_EXT} + p256_beeu-armv8-asm.${ASM_EXT} sha1-armv8.${ASM_EXT} sha256-armv8.${ASM_EXT} sha512-armv8.${ASM_EXT} @@ -102,6 +104,8 @@ perlasm(md5-586.${ASM_EXT} md5/asm/md5-586.pl) perlasm(md5-x86_64.${ASM_EXT} md5/asm/md5-x86_64.pl) perlasm(p256-x86_64-asm.${ASM_EXT} ec/asm/p256-x86_64-asm.pl) perlasm(p256_beeu-x86_64-asm.${ASM_EXT} ec/asm/p256_beeu-x86_64-asm.pl) +perlasm(p256-armv8-asm.${ASM_EXT} ec/asm/p256-armv8-asm.pl) +perlasm(p256_beeu-armv8-asm.${ASM_EXT} ec/asm/p256_beeu-armv8-asm.pl) perlasm(rdrand-x86_64.${ASM_EXT} rand/asm/rdrand-x86_64.pl) perlasm(rsaz-avx2.${ASM_EXT} bn/asm/rsaz-avx2.pl) perlasm(sha1-586.${ASM_EXT} sha/asm/sha1-586.pl) diff --git a/crypto/fipsmodule/bcm.c b/crypto/fipsmodule/bcm.c index 6f8f5c08f..87618fe21 100644 --- a/crypto/fipsmodule/bcm.c +++ b/crypto/fipsmodule/bcm.c @@ -71,7 +71,7 @@ #include "ec/oct.c" #include "ec/p224-64.c" #include "ec/p256.c" -#include "ec/p256-x86_64.c" +#include "ec/p256-nistz.c" #include "ec/scalar.c" #include "ec/simple.c" #include "ec/simple_mul.c" diff --git a/crypto/fipsmodule/ec/asm/p256-armv8-asm.pl b/crypto/fipsmodule/ec/asm/p256-armv8-asm.pl new file mode 100644 index 000000000..f2926b8ee --- /dev/null +++ b/crypto/fipsmodule/ec/asm/p256-armv8-asm.pl @@ -0,0 +1,1702 @@ +#! /usr/bin/env perl +# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# ECP_NISTZ256 module for ARMv8. +# +# February 2015. +# +# Original ECP_NISTZ256 submission targeting x86_64 is detailed in +# http://eprint.iacr.org/2013/816. +# +# with/without -DECP_NISTZ256_ASM +# Apple A7 +190-360% +# Cortex-A53 +190-400% +# Cortex-A57 +190-350% +# Denver +230-400% +# +# Ranges denote minimum and maximum improvement coefficients depending +# on benchmark. Lower coefficients are for ECDSA sign, server-side +# operation. Keep in mind that +400% means 5x improvement. + +# The first two arguments should always be the flavour and output file path. +if ($#ARGV < 1) { die "Not enough arguments provided. 
+ Two arguments are necessary: the flavour and the output file path."; } + +$flavour = shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +{ +my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3, + $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) = + map("x$_",(0..17,19,20)); + +my ($acc6,$acc7)=($ap,$bp); # used in __ecp_nistz256_sqr_mont + +$code.=<<___; +#include "openssl/arm_arch.h" + +.text +.align 5 +.Lpoly: +.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 +.LRR: // 2^512 mod P precomputed for NIST P256 polynomial +.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd +.Lone_mont: +.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe +.Lone: +.quad 1,0,0,0 +.Lord: +.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 +.LordK: +.quad 0xccd1c8aaee00bc4f +.asciz "ECP_NISTZ256 for ARMv8, CRYPTOGAMS by " + +// void ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_to_mont +.type ecp_nistz256_to_mont,%function +.align 6 +ecp_nistz256_to_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldr $bi,.LRR // bp[0] + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + adr $bp,.LRR // &bp[0] + + bl __ecp_nistz256_mul_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont + +// void ecp_nistz256_from_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_from_mont +.type ecp_nistz256_from_mont,%function +.align 4 +ecp_nistz256_from_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + mov $bi,#1 // bp[0] + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + adr $bp,.Lone // &bp[0] + + bl __ecp_nistz256_mul_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont + +// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl ecp_nistz256_mul_mont +.type ecp_nistz256_mul_mont,%function +.align 4 +ecp_nistz256_mul_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldr $bi,[$bp] // bp[0] + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + + bl __ecp_nistz256_mul_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont + +// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_sqr_mont +.type ecp_nistz256_sqr_mont,%function +.align 4 +ecp_nistz256_sqr_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + + bl __ecp_nistz256_sqr_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont + +// void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_div_by_2 +.type ecp_nistz256_div_by_2,%function +.align 4 +ecp_nistz256_div_by_2: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp $acc0,$acc1,[$ap] + ldp $acc2,$acc3,[$ap,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + + bl __ecp_nistz256_div_by_2 + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 + +// void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_mul_by_2 +.type ecp_nistz256_mul_by_2,%function +.align 4 +ecp_nistz256_mul_by_2: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp $acc0,$acc1,[$ap] + ldp $acc2,$acc3,[$ap,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + mov $t0,$acc0 + mov $t1,$acc1 + mov $t2,$acc2 + mov $t3,$acc3 + + bl __ecp_nistz256_add_to // ret = a+a // 2*a + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 + +// void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_mul_by_3 +.type ecp_nistz256_mul_by_3,%function +.align 4 +ecp_nistz256_mul_by_3: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp $acc0,$acc1,[$ap] + ldp $acc2,$acc3,[$ap,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + mov $t0,$acc0 + mov $t1,$acc1 + mov $t2,$acc2 + mov $t3,$acc3 + mov $a0,$acc0 + mov $a1,$acc1 + mov $a2,$acc2 + mov $a3,$acc3 + + bl __ecp_nistz256_add_to // ret = a+a // 2*a + + mov $t0,$a0 + mov $t1,$a1 + mov $t2,$a2 + mov $t3,$a3 + + bl __ecp_nistz256_add_to // ret += a // 2*a+a=3*a + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 + +// void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl ecp_nistz256_sub +.type ecp_nistz256_sub,%function +.align 4 +ecp_nistz256_sub: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp $acc0,$acc1,[$ap] + ldp $acc2,$acc3,[$ap,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_sub,.-ecp_nistz256_sub + +// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_neg +.type ecp_nistz256_neg,%function +.align 4 +ecp_nistz256_neg: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov $bp,$ap + mov $acc0,xzr // a = 0 + mov $acc1,xzr + mov $acc2,xzr + mov $acc3,xzr + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_neg,.-ecp_nistz256_neg + +// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded +// to $a0-$a3 and b[0] - to $bi +.type __ecp_nistz256_mul_mont,%function +.align 4 +__ecp_nistz256_mul_mont: + mul $acc0,$a0,$bi // a[0]*b[0] + umulh $t0,$a0,$bi + + mul $acc1,$a1,$bi // a[1]*b[0] + umulh $t1,$a1,$bi + + mul $acc2,$a2,$bi // a[2]*b[0] + umulh $t2,$a2,$bi + + mul $acc3,$a3,$bi // a[3]*b[0] + umulh $t3,$a3,$bi + ldr $bi,[$bp,#8] // b[1] + + adds $acc1,$acc1,$t0 // accumulate high parts of multiplication + lsl $t0,$acc0,#32 + adcs $acc2,$acc2,$t1 + lsr $t1,$acc0,#32 + adcs $acc3,$acc3,$t2 + adc $acc4,xzr,$t3 + mov $acc5,xzr +___ +for($i=1;$i<4;$i++) { + # Reduction iteration is normally performed by accumulating + # result of multiplication of modulus by "magic" digit [and + # omitting least significant word, which is guaranteed to + # be 0], but thanks to special form of modulus and "magic" + # digit being equal to least significant word, it can be + # performed with additions and subtractions alone. Indeed: + # + # ffff0001.00000000.0000ffff.ffffffff + # * abcdefgh + # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh + # + # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we + # rewrite above as: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh + # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000 + # - 0000abcd.efgh0000.00000000.00000000.abcdefgh + # + # or marking redundant operations: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.-------- + # + abcdefgh.abcdefgh.0000abcd.efgh0000.-------- + # - 0000abcd.efgh0000.--------.--------.-------- + +$code.=<<___; + subs $t2,$acc0,$t0 // "*0xffff0001" + sbc $t3,$acc0,$t1 + adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] + mul $t0,$a0,$bi // lo(a[0]*b[i]) + adcs $acc1,$acc2,$t1 + mul $t1,$a1,$bi // lo(a[1]*b[i]) + adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 + mul $t2,$a2,$bi // lo(a[2]*b[i]) + adcs $acc3,$acc4,$t3 + mul $t3,$a3,$bi // lo(a[3]*b[i]) + adc $acc4,$acc5,xzr + + adds $acc0,$acc0,$t0 // accumulate low parts of multiplication + umulh $t0,$a0,$bi // hi(a[0]*b[i]) + adcs $acc1,$acc1,$t1 + umulh $t1,$a1,$bi // hi(a[1]*b[i]) + adcs $acc2,$acc2,$t2 + umulh $t2,$a2,$bi // hi(a[2]*b[i]) + adcs $acc3,$acc3,$t3 + umulh $t3,$a3,$bi // hi(a[3]*b[i]) + adc $acc4,$acc4,xzr +___ +$code.=<<___ if ($i<3); + ldr $bi,[$bp,#8*($i+1)] // b[$i+1] +___ +$code.=<<___; + adds $acc1,$acc1,$t0 // accumulate high parts of multiplication + lsl $t0,$acc0,#32 + adcs $acc2,$acc2,$t1 + lsr $t1,$acc0,#32 + adcs $acc3,$acc3,$t2 + adcs $acc4,$acc4,$t3 + adc $acc5,xzr,xzr +___ +} +$code.=<<___; + // last reduction + subs $t2,$acc0,$t0 // "*0xffff0001" + sbc $t3,$acc0,$t1 + adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] + adcs $acc1,$acc2,$t1 + adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 + adcs $acc3,$acc4,$t3 + adc $acc4,$acc5,xzr + + adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus + sbcs $t1,$acc1,$poly1 + sbcs $t2,$acc2,xzr + sbcs $t3,$acc3,$poly3 + sbcs xzr,$acc4,xzr // did it borrow? + + csel $acc0,$acc0,$t0,lo // ret = borrow ? 
ret : ret-modulus + csel $acc1,$acc1,$t1,lo + csel $acc2,$acc2,$t2,lo + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,lo + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont + +// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded +// to $a0-$a3 +.type __ecp_nistz256_sqr_mont,%function +.align 4 +__ecp_nistz256_sqr_mont: + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul $acc1,$a1,$a0 // a[1]*a[0] + umulh $t1,$a1,$a0 + mul $acc2,$a2,$a0 // a[2]*a[0] + umulh $t2,$a2,$a0 + mul $acc3,$a3,$a0 // a[3]*a[0] + umulh $acc4,$a3,$a0 + + adds $acc2,$acc2,$t1 // accumulate high parts of multiplication + mul $t0,$a2,$a1 // a[2]*a[1] + umulh $t1,$a2,$a1 + adcs $acc3,$acc3,$t2 + mul $t2,$a3,$a1 // a[3]*a[1] + umulh $t3,$a3,$a1 + adc $acc4,$acc4,xzr // can't overflow + + mul $acc5,$a3,$a2 // a[3]*a[2] + umulh $acc6,$a3,$a2 + + adds $t1,$t1,$t2 // accumulate high parts of multiplication + mul $acc0,$a0,$a0 // a[0]*a[0] + adc $t2,$t3,xzr // can't overflow + + adds $acc3,$acc3,$t0 // accumulate low parts of multiplication + umulh $a0,$a0,$a0 + adcs $acc4,$acc4,$t1 + mul $t1,$a1,$a1 // a[1]*a[1] + adcs $acc5,$acc5,$t2 + umulh $a1,$a1,$a1 + adc $acc6,$acc6,xzr // can't overflow + + adds $acc1,$acc1,$acc1 // acc[1-6]*=2 + mul $t2,$a2,$a2 // a[2]*a[2] + adcs $acc2,$acc2,$acc2 + umulh $a2,$a2,$a2 + adcs $acc3,$acc3,$acc3 + mul $t3,$a3,$a3 // a[3]*a[3] + adcs $acc4,$acc4,$acc4 + umulh $a3,$a3,$a3 + adcs $acc5,$acc5,$acc5 + adcs $acc6,$acc6,$acc6 + adc $acc7,xzr,xzr + + adds $acc1,$acc1,$a0 // +a[i]*a[i] + adcs $acc2,$acc2,$t1 + adcs $acc3,$acc3,$a1 + adcs $acc4,$acc4,$t2 + adcs $acc5,$acc5,$a2 + lsl $t0,$acc0,#32 + adcs $acc6,$acc6,$t3 + lsr $t1,$acc0,#32 + adc $acc7,$acc7,$a3 +___ +for($i=0;$i<3;$i++) { # reductions, see commentary in + # multiplication for details +$code.=<<___; + subs $t2,$acc0,$t0 // "*0xffff0001" + sbc $t3,$acc0,$t1 + adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] + adcs $acc1,$acc2,$t1 + lsl $t0,$acc0,#32 + adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 + lsr $t1,$acc0,#32 + adc $acc3,$t3,xzr // can't overflow +___ +} +$code.=<<___; + subs $t2,$acc0,$t0 // "*0xffff0001" + sbc $t3,$acc0,$t1 + adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] + adcs $acc1,$acc2,$t1 + adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 + adc $acc3,$t3,xzr // can't overflow + + adds $acc0,$acc0,$acc4 // accumulate upper half + adcs $acc1,$acc1,$acc5 + adcs $acc2,$acc2,$acc6 + adcs $acc3,$acc3,$acc7 + adc $acc4,xzr,xzr + + adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus + sbcs $t1,$acc1,$poly1 + sbcs $t2,$acc2,xzr + sbcs $t3,$acc3,$poly3 + sbcs xzr,$acc4,xzr // did it borrow? + + csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus + csel $acc1,$acc1,$t1,lo + csel $acc2,$acc2,$t2,lo + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,lo + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont + +// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to +// $a0-$a3 and $t0-$t3. This is done because it's used in multiple +// contexts, e.g. in multiplication by 2 and 3... 
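+// (Strictly, the first addend lives in $acc0-$acc3, not $a0-$a3; compare the
+// adds/adcs chain below and the loads done by its callers.)
+// A rough C-style model of the add-then-conditional-subtract pattern used
+// here (an informal sketch only; "carry", "tmp" and "borrow" are
+// illustrative names, not registers in this file):
+//
+//   carry:acc = acc + t          // 257-bit sum
+//   tmp = carry:acc - p          // trial subtraction of the modulus
+//   out = borrow ? acc : tmp     // csel picks the in-range value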
+.type __ecp_nistz256_add_to,%function +.align 4 +__ecp_nistz256_add_to: + adds $acc0,$acc0,$t0 // ret = a+b + adcs $acc1,$acc1,$t1 + adcs $acc2,$acc2,$t2 + adcs $acc3,$acc3,$t3 + adc $ap,xzr,xzr // zap $ap + + adds $t0,$acc0,#1 // subs $t0,$a0,#-1 // tmp = ret-modulus + sbcs $t1,$acc1,$poly1 + sbcs $t2,$acc2,xzr + sbcs $t3,$acc3,$poly3 + sbcs xzr,$ap,xzr // did subtraction borrow? + + csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus + csel $acc1,$acc1,$t1,lo + csel $acc2,$acc2,$t2,lo + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,lo + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_add_to,.-__ecp_nistz256_add_to + +.type __ecp_nistz256_sub_from,%function +.align 4 +__ecp_nistz256_sub_from: + ldp $t0,$t1,[$bp] + ldp $t2,$t3,[$bp,#16] + subs $acc0,$acc0,$t0 // ret = a-b + sbcs $acc1,$acc1,$t1 + sbcs $acc2,$acc2,$t2 + sbcs $acc3,$acc3,$t3 + sbc $ap,xzr,xzr // zap $ap + + subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus + adcs $t1,$acc1,$poly1 + adcs $t2,$acc2,xzr + adc $t3,$acc3,$poly3 + cmp $ap,xzr // did subtraction borrow? + + csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret + csel $acc1,$acc1,$t1,eq + csel $acc2,$acc2,$t2,eq + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,eq + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from + +.type __ecp_nistz256_sub_morf,%function +.align 4 +__ecp_nistz256_sub_morf: + ldp $t0,$t1,[$bp] + ldp $t2,$t3,[$bp,#16] + subs $acc0,$t0,$acc0 // ret = b-a + sbcs $acc1,$t1,$acc1 + sbcs $acc2,$t2,$acc2 + sbcs $acc3,$t3,$acc3 + sbc $ap,xzr,xzr // zap $ap + + subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus + adcs $t1,$acc1,$poly1 + adcs $t2,$acc2,xzr + adc $t3,$acc3,$poly3 + cmp $ap,xzr // did subtraction borrow? + + csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret + csel $acc1,$acc1,$t1,eq + csel $acc2,$acc2,$t2,eq + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,eq + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf + +.type __ecp_nistz256_div_by_2,%function +.align 4 +__ecp_nistz256_div_by_2: + subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = a+modulus + adcs $t1,$acc1,$poly1 + adcs $t2,$acc2,xzr + adcs $t3,$acc3,$poly3 + adc $ap,xzr,xzr // zap $ap + tst $acc0,#1 // is a even? + + csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus + csel $acc1,$acc1,$t1,eq + csel $acc2,$acc2,$t2,eq + csel $acc3,$acc3,$t3,eq + csel $ap,xzr,$ap,eq + + lsr $acc0,$acc0,#1 // ret >>= 1 + orr $acc0,$acc0,$acc1,lsl#63 + lsr $acc1,$acc1,#1 + orr $acc1,$acc1,$acc2,lsl#63 + lsr $acc2,$acc2,#1 + orr $acc2,$acc2,$acc3,lsl#63 + lsr $acc3,$acc3,#1 + stp $acc0,$acc1,[$rp] + orr $acc3,$acc3,$ap,lsl#63 + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 +___ +######################################################################## +# following subroutines are "literal" implementation of those found in +# ecp_nistz256.c +# +######################################################################## +# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); +# +{ +my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3)); +# above map() describes stack layout with 4 temporary +# 256-bit vectors on top. +my ($rp_real,$ap_real) = map("x$_",(21,22)); + +$code.=<<___; +.globl ecp_nistz256_point_double +.type ecp_nistz256_point_double,%function +.align 5 +ecp_nistz256_point_double: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + sub sp,sp,#32*4 + +.Ldouble_shortcut: + ldp $acc0,$acc1,[$ap,#32] + mov $rp_real,$rp + ldp $acc2,$acc3,[$ap,#48] + mov $ap_real,$ap + ldr $poly1,.Lpoly+8 + mov $t0,$acc0 + ldr $poly3,.Lpoly+24 + mov $t1,$acc1 + ldp $a0,$a1,[$ap_real,#64] // forward load for p256_sqr_mont + mov $t2,$acc2 + mov $t3,$acc3 + ldp $a2,$a3,[$ap_real,#64+16] + add $rp,sp,#$S + bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y); + + add $rp,sp,#$Zsqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); + + ldp $t0,$t1,[$ap_real] + ldp $t2,$t3,[$ap_real,#16] + mov $a0,$acc0 // put Zsqr aside for p256_sub + mov $a1,$acc1 + mov $a2,$acc2 + mov $a3,$acc3 + add $rp,sp,#$M + bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x); + + add $bp,$ap_real,#0 + mov $acc0,$a0 // restore Zsqr + mov $acc1,$a1 + ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont + mov $acc2,$a2 + mov $acc3,$a3 + ldp $a2,$a3,[sp,#$S+16] + add $rp,sp,#$Zsqr + bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); + + add $rp,sp,#$S + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); + + ldr $bi,[$ap_real,#32] + ldp $a0,$a1,[$ap_real,#64] + ldp $a2,$a3,[$ap_real,#64+16] + add $bp,$ap_real,#32 + add $rp,sp,#$tmp0 + bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); + + mov $t0,$acc0 + mov $t1,$acc1 + ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont + mov $t2,$acc2 + mov $t3,$acc3 + ldp $a2,$a3,[sp,#$S+16] + add $rp,$rp_real,#64 + bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0); + + add $rp,sp,#$tmp0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); + + ldr $bi,[sp,#$Zsqr] // forward load for p256_mul_mont + ldp $a0,$a1,[sp,#$M] + ldp $a2,$a3,[sp,#$M+16] + add $rp,$rp_real,#32 + bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); + + add $bp,sp,#$Zsqr + add $rp,sp,#$M + bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); + + mov $t0,$acc0 // duplicate M + mov $t1,$acc1 + mov $t2,$acc2 + mov $t3,$acc3 + mov $a0,$acc0 // put M aside + mov $a1,$acc1 + mov $a2,$acc2 + mov $a3,$acc3 + add $rp,sp,#$M + bl __ecp_nistz256_add_to + mov $t0,$a0 // restore M + mov $t1,$a1 + ldr $bi,[$ap_real] // forward load for p256_mul_mont + mov $t2,$a2 + ldp $a0,$a1,[sp,#$S] + mov $t3,$a3 + ldp $a2,$a3,[sp,#$S+16] + bl __ecp_nistz256_add_to // p256_mul_by_3(M, M); + + add $bp,$ap_real,#0 + add $rp,sp,#$S + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); + + mov $t0,$acc0 + mov $t1,$acc1 + ldp $a0,$a1,[sp,#$M] // forward load for p256_sqr_mont + mov $t2,$acc2 + mov $t3,$acc3 + ldp $a2,$a3,[sp,#$M+16] + add $rp,sp,#$tmp0 + bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S); + + add $rp,$rp_real,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); + + add $bp,sp,#$tmp0 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); + + add $bp,sp,#$S + add $rp,sp,#$S + bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); + + ldr $bi,[sp,#$M] + mov $a0,$acc0 // copy S + mov $a1,$acc1 + mov $a2,$acc2 + mov $a3,$acc3 + add $bp,sp,#$M + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); + + add $bp,$rp_real,#32 + add $rp,$rp_real,#32 + bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_point_double,.-ecp_nistz256_point_double +___ +} + +######################################################################## +# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, +# 
const P256_POINT *in2); +{ +my ($res_x,$res_y,$res_z, + $H,$Hsqr,$R,$Rsqr,$Hcub, + $U1,$U2,$S1,$S2)=map(32*$_,(0..11)); +my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); +# above map() describes stack layout with 12 temporary +# 256-bit vectors on top. +my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp0,$temp1,$temp2)=map("x$_",(21..28)); + +$code.=<<___; +.globl ecp_nistz256_point_add +.type ecp_nistz256_point_add,%function +.align 5 +ecp_nistz256_point_add: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#32*12 + + ldp $a0,$a1,[$bp,#64] // in2_z + ldp $a2,$a3,[$bp,#64+16] + mov $rp_real,$rp + mov $ap_real,$ap + mov $bp_real,$bp + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + orr $t0,$a0,$a1 + orr $t2,$a2,$a3 + orr $in2infty,$t0,$t2 + cmp $in2infty,#0 + csetm $in2infty,ne // ~in2infty + add $rp,sp,#$Z2sqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); + + ldp $a0,$a1,[$ap_real,#64] // in1_z + ldp $a2,$a3,[$ap_real,#64+16] + orr $t0,$a0,$a1 + orr $t2,$a2,$a3 + orr $in1infty,$t0,$t2 + cmp $in1infty,#0 + csetm $in1infty,ne // ~in1infty + add $rp,sp,#$Z1sqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + ldr $bi,[$bp_real,#64] + ldp $a0,$a1,[sp,#$Z2sqr] + ldp $a2,$a3,[sp,#$Z2sqr+16] + add $bp,$bp_real,#64 + add $rp,sp,#$S1 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); + + ldr $bi,[$ap_real,#64] + ldp $a0,$a1,[sp,#$Z1sqr] + ldp $a2,$a3,[sp,#$Z1sqr+16] + add $bp,$ap_real,#64 + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr $bi,[$ap_real,#32] + ldp $a0,$a1,[sp,#$S1] + ldp $a2,$a3,[sp,#$S1+16] + add $bp,$ap_real,#32 + add $rp,sp,#$S1 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); + + ldr $bi,[$bp_real,#32] + ldp $a0,$a1,[sp,#$S2] + ldp $a2,$a3,[sp,#$S2+16] + add $bp,$bp_real,#32 + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add $bp,sp,#$S1 + ldr $bi,[sp,#$Z2sqr] // forward load for p256_mul_mont + ldp $a0,$a1,[$ap_real] + ldp $a2,$a3,[$ap_real,#16] + add $rp,sp,#$R + bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); + + orr $acc0,$acc0,$acc1 // see if result is zero + orr $acc2,$acc2,$acc3 + orr $temp0,$acc0,$acc2 // ~is_equal(S1,S2) + + add $bp,sp,#$Z2sqr + add $rp,sp,#$U1 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); + + ldr $bi,[sp,#$Z1sqr] + ldp $a0,$a1,[$bp_real] + ldp $a2,$a3,[$bp_real,#16] + add $bp,sp,#$Z1sqr + add $rp,sp,#$U2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); + + add $bp,sp,#$U1 + ldp $a0,$a1,[sp,#$R] // forward load for p256_sqr_mont + ldp $a2,$a3,[sp,#$R+16] + add $rp,sp,#$H + bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); + + orr $acc0,$acc0,$acc1 // see if result is zero + orr $acc2,$acc2,$acc3 + orr $acc0,$acc0,$acc2 // ~is_equal(U1,U2) + + mvn $temp1,$in1infty // -1/0 -> 0/-1 + mvn $temp2,$in2infty // -1/0 -> 0/-1 + orr $acc0,$acc0,$temp1 + orr $acc0,$acc0,$temp2 + orr $acc0,$acc0,$temp0 + cbnz $acc0,.Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) + +.Ladd_double: + mov $ap,$ap_real + mov $rp,$rp_real + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + add sp,sp,#256 // #256 is from #32*(12-4). 
difference in stack frames
+ b .Ldouble_shortcut
+
+.align 4
+.Ladd_proceed:
+ add $rp,sp,#$Rsqr
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);
+
+ ldr $bi,[$ap_real,#64]
+ ldp $a0,$a1,[sp,#$H]
+ ldp $a2,$a3,[sp,#$H+16]
+ add $bp,$ap_real,#64
+ add $rp,sp,#$res_z
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);
+
+ ldp $a0,$a1,[sp,#$H]
+ ldp $a2,$a3,[sp,#$H+16]
+ add $rp,sp,#$Hsqr
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);
+
+ ldr $bi,[$bp_real,#64]
+ ldp $a0,$a1,[sp,#$res_z]
+ ldp $a2,$a3,[sp,#$res_z+16]
+ add $bp,$bp_real,#64
+ add $rp,sp,#$res_z
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z);
+
+ ldr $bi,[sp,#$H]
+ ldp $a0,$a1,[sp,#$Hsqr]
+ ldp $a2,$a3,[sp,#$Hsqr+16]
+ add $bp,sp,#$H
+ add $rp,sp,#$Hcub
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);
+
+ ldr $bi,[sp,#$Hsqr]
+ ldp $a0,$a1,[sp,#$U1]
+ ldp $a2,$a3,[sp,#$U1+16]
+ add $bp,sp,#$Hsqr
+ add $rp,sp,#$U2
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr);
+
+ mov $t0,$acc0
+ mov $t1,$acc1
+ mov $t2,$acc2
+ mov $t3,$acc3
+ add $rp,sp,#$Hsqr
+ bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2);
+
+ add $bp,sp,#$Rsqr
+ add $rp,sp,#$res_x
+ bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);
+
+ add $bp,sp,#$Hcub
+ bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);
+
+ add $bp,sp,#$U2
+ ldr $bi,[sp,#$Hcub] // forward load for p256_mul_mont
+ ldp $a0,$a1,[sp,#$S1]
+ ldp $a2,$a3,[sp,#$S1+16]
+ add $rp,sp,#$res_y
+ bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);
+
+ add $bp,sp,#$Hcub
+ add $rp,sp,#$S2
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub);
+
+ ldr $bi,[sp,#$R]
+ ldp $a0,$a1,[sp,#$res_y]
+ ldp $a2,$a3,[sp,#$res_y+16]
+ add $bp,sp,#$R
+ add $rp,sp,#$res_y
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);
+
+ add $bp,sp,#$S2
+ bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);
+
+ ldp $a0,$a1,[sp,#$res_x] // res
+ ldp $a2,$a3,[sp,#$res_x+16]
+ ldp $t0,$t1,[$bp_real] // in2
+ ldp $t2,$t3,[$bp_real,#16]
+___
+for($i=0;$i<64;$i+=32) { # conditional moves
+$code.=<<___;
+ ldp $acc0,$acc1,[$ap_real,#$i] // in1
+ cmp $in1infty,#0 // ~$in1infty, remember?
+ ldp $acc2,$acc3,[$ap_real,#$i+16]
+ csel $t0,$a0,$t0,ne
+ csel $t1,$a1,$t1,ne
+ ldp $a0,$a1,[sp,#$res_x+$i+32] // res
+ csel $t2,$a2,$t2,ne
+ csel $t3,$a3,$t3,ne
+ cmp $in2infty,#0 // ~$in2infty, remember?
+ ldp $a2,$a3,[sp,#$res_x+$i+48]
+ csel $acc0,$t0,$acc0,ne
+ csel $acc1,$t1,$acc1,ne
+ ldp $t0,$t1,[$bp_real,#$i+32] // in2
+ csel $acc2,$t2,$acc2,ne
+ csel $acc3,$t3,$acc3,ne
+ ldp $t2,$t3,[$bp_real,#$i+48]
+ stp $acc0,$acc1,[$rp_real,#$i]
+ stp $acc2,$acc3,[$rp_real,#$i+16]
+___
+}
+$code.=<<___;
+ ldp $acc0,$acc1,[$ap_real,#$i] // in1
+ cmp $in1infty,#0 // ~$in1infty, remember?
+ ldp $acc2,$acc3,[$ap_real,#$i+16]
+ csel $t0,$a0,$t0,ne
+ csel $t1,$a1,$t1,ne
+ csel $t2,$a2,$t2,ne
+ csel $t3,$a3,$t3,ne
+ cmp $in2infty,#0 // ~$in2infty, remember?
+ csel $acc0,$t0,$acc0,ne + csel $acc1,$t1,$acc1,ne + csel $acc2,$t2,$acc2,ne + csel $acc3,$t3,$acc3,ne + stp $acc0,$acc1,[$rp_real,#$i] + stp $acc2,$acc3,[$rp_real,#$i+16] + +.Ladd_done: + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_point_add,.-ecp_nistz256_point_add +___ +} + +######################################################################## +# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT_AFFINE *in2); +{ +my ($res_x,$res_y,$res_z, + $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9)); +my $Z1sqr = $S2; +# above map() describes stack layout with 10 temporary +# 256-bit vectors on top. +my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26)); + +$code.=<<___; +.globl ecp_nistz256_point_add_affine +.type ecp_nistz256_point_add_affine,%function +.align 5 +ecp_nistz256_point_add_affine: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + sub sp,sp,#32*10 + + mov $rp_real,$rp + mov $ap_real,$ap + mov $bp_real,$bp + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + + ldp $a0,$a1,[$ap,#64] // in1_z + ldp $a2,$a3,[$ap,#64+16] + orr $t0,$a0,$a1 + orr $t2,$a2,$a3 + orr $in1infty,$t0,$t2 + cmp $in1infty,#0 + csetm $in1infty,ne // ~in1infty + + ldp $acc0,$acc1,[$bp] // in2_x + ldp $acc2,$acc3,[$bp,#16] + ldp $t0,$t1,[$bp,#32] // in2_y + ldp $t2,$t3,[$bp,#48] + orr $acc0,$acc0,$acc1 + orr $acc2,$acc2,$acc3 + orr $t0,$t0,$t1 + orr $t2,$t2,$t3 + orr $acc0,$acc0,$acc2 + orr $t0,$t0,$t2 + orr $in2infty,$acc0,$t0 + cmp $in2infty,#0 + csetm $in2infty,ne // ~in2infty + + add $rp,sp,#$Z1sqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + mov $a0,$acc0 + mov $a1,$acc1 + mov $a2,$acc2 + mov $a3,$acc3 + ldr $bi,[$bp_real] + add $bp,$bp_real,#0 + add $rp,sp,#$U2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); + + add $bp,$ap_real,#0 + ldr $bi,[$ap_real,#64] // forward load for p256_mul_mont + ldp $a0,$a1,[sp,#$Z1sqr] + ldp $a2,$a3,[sp,#$Z1sqr+16] + add $rp,sp,#$H + bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); + + add $bp,$ap_real,#64 + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr $bi,[$ap_real,#64] + ldp $a0,$a1,[sp,#$H] + ldp $a2,$a3,[sp,#$H+16] + add $bp,$ap_real,#64 + add $rp,sp,#$res_z + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldr $bi,[$bp_real,#32] + ldp $a0,$a1,[sp,#$S2] + ldp $a2,$a3,[sp,#$S2+16] + add $bp,$bp_real,#32 + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add $bp,$ap_real,#32 + ldp $a0,$a1,[sp,#$H] // forward load for p256_sqr_mont + ldp $a2,$a3,[sp,#$H+16] + add $rp,sp,#$R + bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); + + add $rp,sp,#$Hsqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldp $a0,$a1,[sp,#$R] + ldp $a2,$a3,[sp,#$R+16] + add $rp,sp,#$Rsqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr $bi,[sp,#$H] + ldp $a0,$a1,[sp,#$Hsqr] + ldp $a2,$a3,[sp,#$Hsqr+16] + add $bp,sp,#$H + add $rp,sp,#$Hcub + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr $bi,[$ap_real] + ldp $a0,$a1,[sp,#$Hsqr] + ldp $a2,$a3,[sp,#$Hsqr+16] + add $bp,$ap_real,#0 + add $rp,sp,#$U2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); + 
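+ // Duplicate U2 into $t0-$t3 so __ecp_nistz256_add_to can compute 2*U2 below.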
+ mov $t0,$acc0
+ mov $t1,$acc1
+ mov $t2,$acc2
+ mov $t3,$acc3
+ add $rp,sp,#$Hsqr
+ bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2);
+
+ add $bp,sp,#$Rsqr
+ add $rp,sp,#$res_x
+ bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);
+
+ add $bp,sp,#$Hcub
+ bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);
+
+ add $bp,sp,#$U2
+ ldr $bi,[$ap_real,#32] // forward load for p256_mul_mont
+ ldp $a0,$a1,[sp,#$Hcub]
+ ldp $a2,$a3,[sp,#$Hcub+16]
+ add $rp,sp,#$res_y
+ bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);
+
+ add $bp,$ap_real,#32
+ add $rp,sp,#$S2
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub);
+
+ ldr $bi,[sp,#$R]
+ ldp $a0,$a1,[sp,#$res_y]
+ ldp $a2,$a3,[sp,#$res_y+16]
+ add $bp,sp,#$R
+ add $rp,sp,#$res_y
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);
+
+ add $bp,sp,#$S2
+ bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);
+
+ ldp $a0,$a1,[sp,#$res_x] // res
+ ldp $a2,$a3,[sp,#$res_x+16]
+ ldp $t0,$t1,[$bp_real] // in2
+ ldp $t2,$t3,[$bp_real,#16]
+___
+for($i=0;$i<64;$i+=32) { # conditional moves
+$code.=<<___;
+ ldp $acc0,$acc1,[$ap_real,#$i] // in1
+ cmp $in1infty,#0 // ~$in1infty, remember?
+ ldp $acc2,$acc3,[$ap_real,#$i+16]
+ csel $t0,$a0,$t0,ne
+ csel $t1,$a1,$t1,ne
+ ldp $a0,$a1,[sp,#$res_x+$i+32] // res
+ csel $t2,$a2,$t2,ne
+ csel $t3,$a3,$t3,ne
+ cmp $in2infty,#0 // ~$in2infty, remember?
+ ldp $a2,$a3,[sp,#$res_x+$i+48]
+ csel $acc0,$t0,$acc0,ne
+ csel $acc1,$t1,$acc1,ne
+ ldp $t0,$t1,[$bp_real,#$i+32] // in2
+ csel $acc2,$t2,$acc2,ne
+ csel $acc3,$t3,$acc3,ne
+ ldp $t2,$t3,[$bp_real,#$i+48]
+ stp $acc0,$acc1,[$rp_real,#$i]
+ stp $acc2,$acc3,[$rp_real,#$i+16]
+___
+$code.=<<___ if ($i == 0);
+ adr $bp_real,.Lone_mont-64
+___
+}
+$code.=<<___;
+ ldp $acc0,$acc1,[$ap_real,#$i] // in1
+ cmp $in1infty,#0 // ~$in1infty, remember?
+ ldp $acc2,$acc3,[$ap_real,#$i+16]
+ csel $t0,$a0,$t0,ne
+ csel $t1,$a1,$t1,ne
+ csel $t2,$a2,$t2,ne
+ csel $t3,$a3,$t3,ne
+ cmp $in2infty,#0 // ~$in2infty, remember?
+ csel $acc0,$t0,$acc0,ne
+ csel $acc1,$t1,$acc1,ne
+ csel $acc2,$t2,$acc2,ne
+ csel $acc3,$t3,$acc3,ne
+ stp $acc0,$acc1,[$rp_real,#$i]
+ stp $acc2,$acc3,[$rp_real,#$i+16]
+
+ add sp,x29,#0 // destroy frame
+ ldp x19,x20,[x29,#16]
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x29,x30,[sp],#80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
+___
+}
+if (1) {
+my ($ord0,$ord1) = ($poly1,$poly3);
+my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24));
+my $acc7 = $bi;
+
+$code.=<<___;
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
+// uint64_t b[4]);
+.globl ecp_nistz256_ord_mul_mont
+.type ecp_nistz256_ord_mul_mont,%function
+.align 4
+ecp_nistz256_ord_mul_mont:
+ AARCH64_VALID_CALL_TARGET
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-64]!
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adr $ordk,.Lord + ldr $bi,[$bp] // bp[0] + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + + ldp $ord0,$ord1,[$ordk,#0] + ldp $ord2,$ord3,[$ordk,#16] + ldr $ordk,[$ordk,#32] + + mul $acc0,$a0,$bi // a[0]*b[0] + umulh $t0,$a0,$bi + + mul $acc1,$a1,$bi // a[1]*b[0] + umulh $t1,$a1,$bi + + mul $acc2,$a2,$bi // a[2]*b[0] + umulh $t2,$a2,$bi + + mul $acc3,$a3,$bi // a[3]*b[0] + umulh $acc4,$a3,$bi + + mul $t4,$acc0,$ordk + + adds $acc1,$acc1,$t0 // accumulate high parts of multiplication + adcs $acc2,$acc2,$t1 + adcs $acc3,$acc3,$t2 + adc $acc4,$acc4,xzr + mov $acc5,xzr +___ +for ($i=1;$i<4;$i++) { + ################################################################ + # ffff0000.ffffffff.yyyyyyyy.zzzzzzzz + # * abcdefgh + # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx + # + # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we + # rewrite above as: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx + # - 0000abcd.efgh0000.abcdefgh.00000000.00000000 + # + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh +$code.=<<___; + ldr $bi,[$bp,#8*$i] // b[i] + + lsl $t0,$t4,#32 + subs $acc2,$acc2,$t4 + lsr $t1,$t4,#32 + sbcs $acc3,$acc3,$t0 + sbcs $acc4,$acc4,$t1 + sbc $acc5,$acc5,xzr + + subs xzr,$acc0,#1 + umulh $t1,$ord0,$t4 + mul $t2,$ord1,$t4 + umulh $t3,$ord1,$t4 + + adcs $t2,$t2,$t1 + mul $t0,$a0,$bi + adc $t3,$t3,xzr + mul $t1,$a1,$bi + + adds $acc0,$acc1,$t2 + mul $t2,$a2,$bi + adcs $acc1,$acc2,$t3 + mul $t3,$a3,$bi + adcs $acc2,$acc3,$t4 + adcs $acc3,$acc4,$t4 + adc $acc4,$acc5,xzr + + adds $acc0,$acc0,$t0 // accumulate low parts + umulh $t0,$a0,$bi + adcs $acc1,$acc1,$t1 + umulh $t1,$a1,$bi + adcs $acc2,$acc2,$t2 + umulh $t2,$a2,$bi + adcs $acc3,$acc3,$t3 + umulh $t3,$a3,$bi + adc $acc4,$acc4,xzr + mul $t4,$acc0,$ordk + adds $acc1,$acc1,$t0 // accumulate high parts + adcs $acc2,$acc2,$t1 + adcs $acc3,$acc3,$t2 + adcs $acc4,$acc4,$t3 + adc $acc5,xzr,xzr +___ +} +$code.=<<___; + lsl $t0,$t4,#32 // last reduction + subs $acc2,$acc2,$t4 + lsr $t1,$t4,#32 + sbcs $acc3,$acc3,$t0 + sbcs $acc4,$acc4,$t1 + sbc $acc5,$acc5,xzr + + subs xzr,$acc0,#1 + umulh $t1,$ord0,$t4 + mul $t2,$ord1,$t4 + umulh $t3,$ord1,$t4 + + adcs $t2,$t2,$t1 + adc $t3,$t3,xzr + + adds $acc0,$acc1,$t2 + adcs $acc1,$acc2,$t3 + adcs $acc2,$acc3,$t4 + adcs $acc3,$acc4,$t4 + adc $acc4,$acc5,xzr + + subs $t0,$acc0,$ord0 // ret -= modulus + sbcs $t1,$acc1,$ord1 + sbcs $t2,$acc2,$ord2 + sbcs $t3,$acc3,$ord3 + sbcs xzr,$acc4,xzr + + csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus + csel $acc1,$acc1,$t1,lo + csel $acc2,$acc2,$t2,lo + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,lo + stp $acc2,$acc3,[$rp,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret +.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], +// int rep); +.globl ecp_nistz256_ord_sqr_mont +.type ecp_nistz256_ord_sqr_mont,%function +.align 4 +ecp_nistz256_ord_sqr_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adr $ordk,.Lord + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + + ldp $ord0,$ord1,[$ordk,#0] + ldp $ord2,$ord3,[$ordk,#16] + ldr $ordk,[$ordk,#32] + b .Loop_ord_sqr + +.align 4 +.Loop_ord_sqr: + sub $bp,$bp,#1 + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul $acc1,$a1,$a0 // a[1]*a[0] + umulh $t1,$a1,$a0 + mul $acc2,$a2,$a0 // a[2]*a[0] + umulh $t2,$a2,$a0 + mul $acc3,$a3,$a0 // a[3]*a[0] + umulh $acc4,$a3,$a0 + + adds $acc2,$acc2,$t1 // accumulate high parts of multiplication + mul $t0,$a2,$a1 // a[2]*a[1] + umulh $t1,$a2,$a1 + adcs $acc3,$acc3,$t2 + mul $t2,$a3,$a1 // a[3]*a[1] + umulh $t3,$a3,$a1 + adc $acc4,$acc4,xzr // can't overflow + + mul $acc5,$a3,$a2 // a[3]*a[2] + umulh $acc6,$a3,$a2 + + adds $t1,$t1,$t2 // accumulate high parts of multiplication + mul $acc0,$a0,$a0 // a[0]*a[0] + adc $t2,$t3,xzr // can't overflow + + adds $acc3,$acc3,$t0 // accumulate low parts of multiplication + umulh $a0,$a0,$a0 + adcs $acc4,$acc4,$t1 + mul $t1,$a1,$a1 // a[1]*a[1] + adcs $acc5,$acc5,$t2 + umulh $a1,$a1,$a1 + adc $acc6,$acc6,xzr // can't overflow + + adds $acc1,$acc1,$acc1 // acc[1-6]*=2 + mul $t2,$a2,$a2 // a[2]*a[2] + adcs $acc2,$acc2,$acc2 + umulh $a2,$a2,$a2 + adcs $acc3,$acc3,$acc3 + mul $t3,$a3,$a3 // a[3]*a[3] + adcs $acc4,$acc4,$acc4 + umulh $a3,$a3,$a3 + adcs $acc5,$acc5,$acc5 + adcs $acc6,$acc6,$acc6 + adc $acc7,xzr,xzr + + adds $acc1,$acc1,$a0 // +a[i]*a[i] + mul $t4,$acc0,$ordk + adcs $acc2,$acc2,$t1 + adcs $acc3,$acc3,$a1 + adcs $acc4,$acc4,$t2 + adcs $acc5,$acc5,$a2 + adcs $acc6,$acc6,$t3 + adc $acc7,$acc7,$a3 +___ +for($i=0; $i<4; $i++) { # reductions +$code.=<<___; + subs xzr,$acc0,#1 + umulh $t1,$ord0,$t4 + mul $t2,$ord1,$t4 + umulh $t3,$ord1,$t4 + + adcs $t2,$t2,$t1 + adc $t3,$t3,xzr + + adds $acc0,$acc1,$t2 + adcs $acc1,$acc2,$t3 + adcs $acc2,$acc3,$t4 + adc $acc3,xzr,$t4 // can't overflow +___ +$code.=<<___ if ($i<3); + mul $t3,$acc0,$ordk +___ +$code.=<<___; + lsl $t0,$t4,#32 + subs $acc1,$acc1,$t4 + lsr $t1,$t4,#32 + sbcs $acc2,$acc2,$t0 + sbc $acc3,$acc3,$t1 // can't borrow +___ + ($t3,$t4) = ($t4,$t3); +} +$code.=<<___; + adds $acc0,$acc0,$acc4 // accumulate upper half + adcs $acc1,$acc1,$acc5 + adcs $acc2,$acc2,$acc6 + adcs $acc3,$acc3,$acc7 + adc $acc4,xzr,xzr + + subs $t0,$acc0,$ord0 // ret -= modulus + sbcs $t1,$acc1,$ord1 + sbcs $t2,$acc2,$ord2 + sbcs $t3,$acc3,$ord3 + sbcs xzr,$acc4,xzr + + csel $a0,$acc0,$t0,lo // ret = borrow ? 
ret : ret-modulus
+ csel $a1,$acc1,$t1,lo
+ csel $a2,$acc2,$t2,lo
+ csel $a3,$acc3,$t3,lo
+
+ cbnz $bp,.Loop_ord_sqr
+
+ stp $a0,$a1,[$rp]
+ stp $a2,$a3,[$rp,#16]
+
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x23,x24,[sp,#48]
+ ldr x29,[sp],#64
+ ret
+.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
+___
+} }
+
+########################################################################
+# select subroutines
+# These select functions are similar to those in p256-x86_64-asm.pl.
+# They load all points in the lookup table, keeping in the output only
+# the one corresponding to the input index.
+{
+my ($val,$in_t)=map("x$_",(0..1));
+my ($index)=("w2");
+my ($Idx_ctr,$Val_in, $Mask_64)=("w9", "x10", "x11");
+my ($Mask)=("v3");
+my ($Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("v$_",(16..21));
+my ($T0a,$T0b,$T0c,$T0d,$T0e,$T0f)=map("v$_",(22..27));
+$code.=<<___;
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
+.globl ecp_nistz256_select_w5
+.type ecp_nistz256_select_w5,%function
+.align 4
+ecp_nistz256_select_w5:
+ AARCH64_VALID_CALL_TARGET
+
+ // $Val_in := $val
+ // $Idx_ctr := 0; loop counter and incremented internal index
+ mov $Val_in, $val
+ mov $Idx_ctr, #0
+
+ // [$Ra-$Rf] := 0
+ movi $Ra.16b, #0
+ movi $Rb.16b, #0
+ movi $Rc.16b, #0
+ movi $Rd.16b, #0
+ movi $Re.16b, #0
+ movi $Rf.16b, #0
+
+.Lselect_w5_loop:
+ // Loop 16 times.
+
+ // Increment index (loop counter); tested at the end of the loop
+ add $Idx_ctr, $Idx_ctr, #1
+
+ // [$T0a-$T0f] := Load a (3*256-bit = 6*128-bit) table entry starting at $in_t
+ // and advance $in_t to point to the next entry
+ ld1 {$T0a.2d, $T0b.2d, $T0c.2d, $T0d.2d}, [$in_t],#64
+
+ // $Mask_64 := ($Idx_ctr == $index)? All 1s : All 0s
+ cmp $Idx_ctr, $index
+ csetm $Mask_64, eq
+
+ // continue loading ...
+ ld1 {$T0e.2d, $T0f.2d}, [$in_t],#32
+
+ // duplicate mask_64 into Mask (all 0s or all 1s)
+ dup $Mask.2d, $Mask_64
+
+ // [$Ra-$Rd] := (Mask == all 1s)? [$T0a-$T0d] : [$Ra-$Rd]
+ // i.e., values in output registers will remain the same if $Idx_ctr != $index
+ bit $Ra.16b, $T0a.16b, $Mask.16b
+ bit $Rb.16b, $T0b.16b, $Mask.16b
+
+ bit $Rc.16b, $T0c.16b, $Mask.16b
+ bit $Rd.16b, $T0d.16b, $Mask.16b
+
+ bit $Re.16b, $T0e.16b, $Mask.16b
+ bit $Rf.16b, $T0f.16b, $Mask.16b
+
+ // If bit #4 is 0 (i.e. idx_ctr < 16), loop back
+ tbz $Idx_ctr, #4, .Lselect_w5_loop
+
+ // Write [$Ra-$Rf] to memory at the output pointer
+ st1 {$Ra.2d, $Rb.2d, $Rc.2d, $Rd.2d}, [$Val_in],#64
+ st1 {$Re.2d, $Rf.2d}, [$Val_in]
+
+ ret
+.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
+
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
+.globl ecp_nistz256_select_w7
+.type ecp_nistz256_select_w7,%function
+.align 4
+ecp_nistz256_select_w7:
+ AARCH64_VALID_CALL_TARGET
+
+ // $Idx_ctr := 0; loop counter and incremented internal index
+ mov $Idx_ctr, #0
+
+ // [$Ra-$Rd] := 0
+ movi $Ra.16b, #0
+ movi $Rb.16b, #0
+ movi $Rc.16b, #0
+ movi $Rd.16b, #0
+
+.Lselect_w7_loop:
+ // Loop 64 times.
+
+ // Increment index (loop counter); tested at the end of the loop
+ add $Idx_ctr, $Idx_ctr, #1
+
+ // [$T0a-$T0d] := Load a (2*256-bit = 4*128-bit) table entry starting at $in_t
+ // and advance $in_t to point to the next entry
+ ld1 {$T0a.2d, $T0b.2d, $T0c.2d, $T0d.2d}, [$in_t],#64
+
+ // $Mask_64 := ($Idx_ctr == $index)? 
All 1s : All 0s
+ cmp $Idx_ctr, $index
+ csetm $Mask_64, eq
+
+ // duplicate mask_64 into Mask (all 0s or all 1s)
+ dup $Mask.2d, $Mask_64
+
+ // [$Ra-$Rd] := (Mask == all 1s)? [$T0a-$T0d] : [$Ra-$Rd]
+ // i.e., values in output registers will remain the same if $Idx_ctr != $index
+ bit $Ra.16b, $T0a.16b, $Mask.16b
+ bit $Rb.16b, $T0b.16b, $Mask.16b
+
+ bit $Rc.16b, $T0c.16b, $Mask.16b
+ bit $Rd.16b, $T0d.16b, $Mask.16b
+
+ // If bit #6 is 0 (i.e. idx_ctr < 64), loop back
+ tbz $Idx_ctr, #6, .Lselect_w7_loop
+
+ // Write [$Ra-$Rd] to memory at the output pointer
+ st1 {$Ra.2d, $Rb.2d, $Rc.2d, $Rd.2d}, [$val]
+
+ ret
+.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
+___
+}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+ print $_,"\n";
+}
+close STDOUT or die "error closing STDOUT: $!"; # enforce flush
diff --git a/crypto/fipsmodule/ec/asm/p256_beeu-armv8-asm.pl b/crypto/fipsmodule/ec/asm/p256_beeu-armv8-asm.pl
new file mode 100644
index 000000000..e259aeff3
--- /dev/null
+++ b/crypto/fipsmodule/ec/asm/p256_beeu-armv8-asm.pl
@@ -0,0 +1,455 @@
+# Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+#
+#
+# This code is based on p256_beeu-x86_64-asm.pl (which is based on BN_mod_inverse_odd).
+#
+
+# The first two arguments should always be the flavour and output file path.
+if ($#ARGV < 1) { die "Not enough arguments provided.
+ Two arguments are necessary: the flavour and the output file path."; }
+
+$flavour = shift;
+$output = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+#############################################################################
+# extern int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS],
+#                                     BN_ULONG a[P256_LIMBS],
+#                                     BN_ULONG n[P256_LIMBS]);
+#
+# (Binary Extended GCD (Euclidean) Algorithm.
+# See A. Menezes, P. van Oorschot, and S. Vanstone's Handbook of Applied Cryptography,
+# Chapter 14, Algorithm 14.61 and Note 14.64
+# http://cacr.uwaterloo.ca/hac/about/chap14.pdf)
+
+# Assumption 1: n is odd for the BEEU
+# Assumption 2: 1 < a < n < 2^256
+
+# Details
+# The inverse of x modulo y can be calculated using Alg. 14.61, where "a" would be that inverse.
+# In other words,
+# ax == 1 (mod y) (where the symbol “==“ denotes ”congruent“)
+# a == x^{-1} (mod y)
+#
+# It can be shown that throughout all the iterations of the algorithm, the following holds:
+# u = Ax + By
+# v = Cx + Dy
+# The values B and D are not of interest in this case, so they need not be computed by the algorithm.
+# This means the following congruences hold through the iterations of the algorithm.
+# Ax == u (mod y) +# Cx == v (mod y) + +# Now we will modify the notation to match that of BN_mod_inverse_odd() +# on which beeu_mod_inverse_vartime() in `p256_beeu-x86_64-asm` is based. +# In those functions: +# x, y -> a, n +# u, v -> B, A +# A, C -> X, Y’, where Y’ = -Y +# Hence, the following holds throughout the algorithm iterations +# Xa == B (mod n) +# -Ya == A (mod n) +# +# Same algorithm in Python: +# def beeu(a, n): +# X = 1 +# Y = 0 +# B = a +# A = n +# while (B != 0): +# while (B % 2) == 0: +# B >>= 1 +# if (X % 2) == 1: +# X = X + n +# X >>= 1 +# while (A % 2) == 0: +# A >>= 1 +# if (Y % 2) == 1: +# Y = Y + n +# Y >>= 1 +# if (B >= A): +# B = B - A +# X = X + Y +# else: +# A = A - B +# Y = Y + X +# if (A != 1): +# # error +# return 0 +# else: +# while (Y > n): +# Y = Y - n +# Y = n - Y +# return Y + + +# For the internal variables, +# x0-x2, x30 are used to hold the modulus n. The input parameters passed in +# x1,x2 are copied first before corrupting them. x0 (out) is stored on the stack. +# x3-x7 are used for parameters, which is not the case in this function, so they are corruptible +# x8 is corruptible here +# (the function doesn't return a struct, hence x8 doesn't contain a passed-in address +# for that struct). +# x9-x15 are corruptible registers +# x19-x28 are callee-saved registers + +# X/Y will hold the inverse parameter +# Assumption: a,n,X,Y < 2^(256) +# Initially, X := 1, Y := 0 +# A := n, B := a + +# Function parameters (as per the Procedure Call Standard) +my($out, $a_in, $n_in)=map("x$_",(0..2)); +# Internal variables +my($n0, $n1, $n2, $n3)=map("x$_",(0..2,30)); +my($x0, $x1, $x2, $x3, $x4)=map("x$_",(3..7)); +my($y0, $y1, $y2, $y3, $y4)=map("x$_",(8..12)); +my($shift)=("x13"); +my($t0, $t1, $t2, $t3)=map("x$_",(14,15,19,20)); +my($a0, $a1, $a2, $a3)=map("x$_",(21..24)); +my($b0, $b1, $b2, $b3)=map("x$_",(25..28)); + +# if B == 0, jump to end of loop +sub TEST_B_ZERO { + return <<___; + orr $t0, $b0, $b1 + orr $t0, $t0, $b2 + + // reverse the bit order of $b0. This is needed for clz after this macro + rbit $t1, $b0 + + orr $t0, $t0, $b3 + cbz $t0,.Lbeeu_loop_end +___ +} + +# Shift right by 1 bit, adding the modulus first if the variable is odd +# if least_sig_bit(var0) == 0, +# goto shift1_ +# else +# add n and goto shift1_ +# Prerequisite: t0 = 0 +$g_next_label = 0; +sub SHIFT1 { + my ($var0, $var1, $var2, $var3, $var4) = @_; + my $label = ".Lshift1_${g_next_label}"; + $g_next_label++; + return <<___; + tbz $var0, #0, $label + adds $var0, $var0, $n0 + adcs $var1, $var1, $n1 + adcs $var2, $var2, $n2 + adcs $var3, $var3, $n3 + adc $var4, $var4, $t0 +$label: + // var0 := [var1|var0]<64..1>; + // i.e. 
concatenate var1 and var0,
+ // extract bits <64..1> from the resulting 128-bit value
+ // and put them in var0
+ extr $var0, $var1, $var0, #1
+ extr $var1, $var2, $var1, #1
+ extr $var2, $var3, $var2, #1
+ extr $var3, $var4, $var3, #1
+ lsr $var4, $var4, #1
+___
+}
+
+# Compilation by clang 10.0.0 with -O2/-O3 of:
+# a[0] = (a[0] >> count) | (a[1] << (64-count));
+# a[1] = (a[1] >> count) | (a[2] << (64-count));
+# a[2] = (a[2] >> count) | (a[3] << (64-count));
+# a[3] >>= count;
+# Note: the EXTR instruction used in SHIFT1 is similar to x86_64's SHRDQ
+# except that the second source operand of EXTR must be an immediate;
+# that's why it cannot be used here, where $shift is a variable.
+#
+# In the following,
+# t0 := 0 - shift
+#
+# then var0, for example, will be shifted right as follows:
+# var0 := (var0 >> (uint(shift) mod 64)) | (var1 << (uint(t0) mod 64))
+# "uint() mod 64" is from the definition of the LSL and LSR instructions.
+#
+# What matters here is the order of instructions relative to certain other
+# instructions, i.e.
+# - lsr and lsl must precede orr of the corresponding registers.
+# - lsl must precede the lsr of the same register afterwards.
+# The chosen order of the instructions overall is to try and maximize
+# the pipeline usage.
+sub SHIFT256 {
+ my ($var0, $var1, $var2, $var3) = @_;
+ return <<___;
+ neg $t0, $shift
+ lsr $var0, $var0, $shift
+ lsl $t1, $var1, $t0
+
+ lsr $var1, $var1, $shift
+ lsl $t2, $var2, $t0
+
+ orr $var0, $var0, $t1
+
+ lsr $var2, $var2, $shift
+ lsl $t3, $var3, $t0
+
+ orr $var1, $var1, $t2
+
+ lsr $var3, $var3, $shift
+
+ orr $var2, $var2, $t3
+___
+}
+
+$code.=<<___;
+#include "openssl/arm_arch.h"
+
+.text
+.globl beeu_mod_inverse_vartime
+.type beeu_mod_inverse_vartime, %function
+.align 4
+beeu_mod_inverse_vartime:
+ // Reserve enough space for 14 8-byte registers on the stack
+ // in the first stp call for x29, x30.
+ // Then store the remaining callee-saved registers.
+ //
+ // | x29 | x30 | x19 | x20 | ... | x27 | x28 | x0 | x2 |
+ // ^ ^
+ // sp <------------------- 112 bytes ----------------> old sp
+ // x29 (FP)
+ //
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-112]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ stp x0,x2,[sp,#96]
+
+ // B = b3..b0 := a
+ ldp $b0,$b1,[$a_in]
+ ldp $b2,$b3,[$a_in,#16]
+
+ // n3..n0 := n
+ // Note: the values of the input params are changed in the following.
+ ldp $n0,$n1,[$n_in]
+ ldp $n2,$n3,[$n_in,#16]
+
+ // A = a3..a0 := n
+ mov $a0, $n0
+ mov $a1, $n1
+ mov $a2, $n2
+ mov $a3, $n3
+
+ // X = x4..x0 := 1
+ mov $x0, #1
+ eor $x1, $x1, $x1
+ eor $x2, $x2, $x2
+ eor $x3, $x3, $x3
+ eor $x4, $x4, $x4
+
+ // Y = y4..y0 := 0
+ eor $y0, $y0, $y0
+ eor $y1, $y1, $y1
+ eor $y2, $y2, $y2
+ eor $y3, $y3, $y3
+ eor $y4, $y4, $y4
+
+.Lbeeu_loop:
+ // if B == 0, jump to .Lbeeu_loop_end
+ ${\TEST_B_ZERO}
+
+ // 0 < B < |n|,
+ // 0 < A <= |n|,
+ // (1) X*a == B (mod |n|),
+ // (2) (-1)*Y*a == A (mod |n|)
+
+ // Now divide B by the maximum possible power of two in the
+ // integers, and divide X by the same value mod |n|.
+ // When we're done, (1) still holds.
+
+ // shift := number of trailing 0s in $b0
+ // ( = number of leading 0s in $t1; see the "rbit" instruction in TEST_B_ZERO)
+ clz $shift, $t1
+
+ // If there is no shift, goto shift_A_Y
+ cbz $shift, .Lbeeu_shift_A_Y
+
+ // Shift B right by "$shift" bits
+ ${\SHIFT256($b0, $b1, $b2, $b3)}
+
+ // Shift X right by "$shift" bits, adding n whenever X becomes odd.
+ // $shift--;
+ // $t0 := 0; needed in the addition to the most significant word in SHIFT1
+ eor $t0, $t0, $t0
+.Lbeeu_shift_loop_X:
+ ${\SHIFT1($x0, $x1, $x2, $x3, $x4)}
+ subs $shift, $shift, #1
+ bne .Lbeeu_shift_loop_X
+
+ // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl
+ // with the following differences:
+ // - "$shift" is set directly to the number of trailing 0s in B
+ // (using rbit and clz instructions)
+ // - The loop is only used to call SHIFT1(X)
+ // and $shift is decreased while executing the X loop.
+ // - SHIFT256(B, $shift) is performed before right-shifting X; they are independent
+
+.Lbeeu_shift_A_Y:
+ // Same for A and Y.
+ // Afterwards, (2) still holds.
+ // Reverse the bit order of $a0
+ // $shift := number of trailing 0s in $a0 (= number of leading 0s in $t1)
+ rbit $t1, $a0
+ clz $shift, $t1
+
+ // If there is no shift, goto |B-A|, X+Y update
+ cbz $shift, .Lbeeu_update_B_X_or_A_Y
+
+ // Shift A right by "$shift" bits
+ ${\SHIFT256($a0, $a1, $a2, $a3)}
+
+ // Shift Y right by "$shift" bits, adding n whenever Y becomes odd.
+ // $shift--;
+ // $t0 := 0; needed in the addition to the most significant word in SHIFT1
+ eor $t0, $t0, $t0
+.Lbeeu_shift_loop_Y:
+ ${\SHIFT1($y0, $y1, $y2, $y3, $y4)}
+ subs $shift, $shift, #1
+ bne .Lbeeu_shift_loop_Y
+
+.Lbeeu_update_B_X_or_A_Y:
+ // Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow)
+ // Note: this is unsigned arithmetic, where T fits in 4 64-bit words and
+ // no sign bit is needed; the lack of a carry would indicate a negative
+ // result. See, for example,
+ // https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes
+ subs $t0, $b0, $a0
+ sbcs $t1, $b1, $a1
+ sbcs $t2, $b2, $a2
+ sbcs $t3, $b3, $a3
+ bcs .Lbeeu_B_greater_than_A
+
+ // Else A > B =>
+ // A := A - B; Y := Y + X; goto beginning of the loop
+ subs $a0, $a0, $b0
+ sbcs $a1, $a1, $b1
+ sbcs $a2, $a2, $b2
+ sbcs $a3, $a3, $b3
+
+ adds $y0, $y0, $x0
+ adcs $y1, $y1, $x1
+ adcs $y2, $y2, $x2
+ adcs $y3, $y3, $x3
+ adc $y4, $y4, $x4
+ b .Lbeeu_loop
+
+.Lbeeu_B_greater_than_A:
+ // Continue with B > A =>
+ // B := B - A; X := X + Y; goto beginning of the loop
+ mov $b0, $t0
+ mov $b1, $t1
+ mov $b2, $t2
+ mov $b3, $t3
+
+ adds $x0, $x0, $y0
+ adcs $x1, $x1, $y1
+ adcs $x2, $x2, $y2
+ adcs $x3, $x3, $y3
+ adc $x4, $x4, $y4
+ b .Lbeeu_loop
+
+.Lbeeu_loop_end:
+ // The Euclidean algorithm loop ends when A == gcd(a,n);
+ // this would be 1, when a and n are co-prime (i.e. do not have a common factor).
+ // Since (-1)*Y*a == A (mod |n|), Y > 0
+ // then out = -Y mod n
+
+ // Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|)
+ // Is A-1 == 0?
+ // If not, fail.
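+ // (This is the "if (A != 1): return 0" step of the Python model above;
+ // A == 1 is checked branch-free: (a0 - 1) | a1 | a2 | a3 is zero iff A == 1.)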
+	sub	$t0, $a0, #1
+	orr	$t0, $t0, $a1
+	orr	$t0, $t0, $a2
+	orr	$t0, $t0, $a3
+	cbnz	$t0, .Lbeeu_err
+
+	// If Y > n, then Y := Y - n
+.Lbeeu_reduction_loop:
+	// x_i := y_i - n_i (X is no longer needed, use it as temp)
+	// ($t0 = 0 from above)
+	subs	$x0, $y0, $n0
+	sbcs	$x1, $y1, $n1
+	sbcs	$x2, $y2, $n2
+	sbcs	$x3, $y3, $n3
+	sbcs	$x4, $y4, $t0
+
+	// If the result is non-negative (i.e., cs = carry set = no borrow),
+	//   y_i := x_i; goto reduce again
+	// else
+	//   y_i := y_i; continue
+	csel	$y0, $x0, $y0, cs
+	csel	$y1, $x1, $y1, cs
+	csel	$y2, $x2, $y2, cs
+	csel	$y3, $x3, $y3, cs
+	csel	$y4, $x4, $y4, cs
+	bcs	.Lbeeu_reduction_loop
+
+	// Now Y < n (Y cannot be equal to n, since the inverse cannot be 0)
+	// out = -Y = n-Y
+	subs	$y0, $n0, $y0
+	sbcs	$y1, $n1, $y1
+	sbcs	$y2, $n2, $y2
+	sbcs	$y3, $n3, $y3
+
+	// Save Y in the output (the out pointer (x0) was saved on the stack)
+	ldr	x3, [sp,#96]
+	stp	$y0, $y1, [x3]
+	stp	$y2, $y3, [x3,#16]
+	// return 1 (success)
+	mov	x0, #1
+	b	.Lbeeu_finish
+
+.Lbeeu_err:
+	// return 0 (error)
+	eor	x0, x0, x0
+
+.Lbeeu_finish:
+	// Restore the callee-saved registers, except x0 and x2
+	add	sp,x29,#0
+	ldp	x19,x20,[sp,#16]
+	ldp	x21,x22,[sp,#32]
+	ldp	x23,x24,[sp,#48]
+	ldp	x25,x26,[sp,#64]
+	ldp	x27,x28,[sp,#80]
+	ldp	x29,x30,[sp],#112
+
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size beeu_mod_inverse_vartime,.-beeu_mod_inverse_vartime
+___
+
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/ge;
+
+	print $_,"\n";
+}
+close STDOUT or die "error closing STDOUT: $!"; # enforce flush
diff --git a/crypto/fipsmodule/ec/ec.c b/crypto/fipsmodule/ec/ec.c
index 93fdcfc61..133f561c8 100644
--- a/crypto/fipsmodule/ec/ec.c
+++ b/crypto/fipsmodule/ec/ec.c
@@ -246,7 +246,8 @@ DEFINE_METHOD_FUNCTION(struct built_in_curves, OPENSSL_built_in_curves) {
   out->curves[2].param_len = 32;
   out->curves[2].params = kP256Params;
   out->curves[2].method =
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+#if !defined(OPENSSL_NO_ASM) && \
+    (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \
     !defined(OPENSSL_SMALL)
       EC_GFp_nistz256_method();
 #else
diff --git a/crypto/fipsmodule/ec/make_p256-x86_64-tests.go b/crypto/fipsmodule/ec/make_p256-nistz-tests.go
similarity index 96%
rename from crypto/fipsmodule/ec/make_p256-x86_64-tests.go
rename to crypto/fipsmodule/ec/make_p256-nistz-tests.go
index 958a97a53..36194e61b 100644
--- a/crypto/fipsmodule/ec/make_p256-x86_64-tests.go
+++ b/crypto/fipsmodule/ec/make_p256-nistz-tests.go
@@ -69,7 +69,7 @@ func fromMontgomery(z, x *big.Int) *big.Int {
 
 func isAffineInfinity(x, y *big.Int) bool {
 	// Infinity, in affine coordinates, is represented as (0, 0) by
-	// both Go and p256-x86_64-asm.pl.
+	// Go, p256-x86_64-asm.pl, and p256-armv8-asm.pl.
 	return x.Sign() == 0 && y.Sign() == 0
 }
 
@@ -107,8 +107,8 @@ func toJacobian(xIn, yIn *big.Int) (x, y, z *big.Int) {
 	// arbitrary X and Y and include the special case. We also have
 	// not verified that add and double preserve this
 	// property. Thus, generate test vectors with unrelated X and Y,
-	// to test that p256-x86_64-asm.pl correctly handles
-	// unconstrained representations of infinity.
+	// to test that p256-x86_64-asm.pl and p256-armv8-asm.pl correctly
+	// handle unconstrained representations of infinity.
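+	// (In Jacobian coordinates, z == 0 marks the point at infinity regardless
+	// of x and y, so the arbitrary nonzero x and y chosen below exercise
+	// exactly that case.)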
x = randNonZeroInt(p) y = randNonZeroInt(p) z = zero diff --git a/crypto/fipsmodule/ec/make_tables.go b/crypto/fipsmodule/ec/make_tables.go index 34e8c23ac..dbcaab06f 100644 --- a/crypto/fipsmodule/ec/make_tables.go +++ b/crypto/fipsmodule/ec/make_tables.go @@ -23,8 +23,8 @@ import ( ) func main() { - if err := writeP256X86_64Table("p256-x86_64-table.h"); err != nil { - fmt.Fprintf(os.Stderr, "Error writing p256-x86_64-table.h: %s\n", err) + if err := writeP256NistzTable("p256-nistz-table.h"); err != nil { + fmt.Fprintf(os.Stderr, "Error writing p256-nistz-table.h: %s\n", err) os.Exit(1) } @@ -34,7 +34,7 @@ func main() { } } -func writeP256X86_64Table(path string) error { +func writeP256NistzTable(path string) error { curve := elliptic.P256() tables := make([][][2]*big.Int, 0, 37) for shift := 0; shift < 256; shift += 7 { @@ -59,7 +59,7 @@ func writeP256X86_64Table(path string) error { */ // This is the precomputed constant time access table for the code in -// p256-x86_64.c, for the default generator. The table consists of 37 +// p256-nistz.c, for the default generator. The table consists of 37 // subtables, each subtable contains 64 affine points. The affine points are // encoded as eight uint64's, four for the x coordinate and four for the y. // Both values are in little-endian order. There are 37 tables because a diff --git a/crypto/fipsmodule/ec/p256-x86_64-table.h b/crypto/fipsmodule/ec/p256-nistz-table.h similarity index 99% rename from crypto/fipsmodule/ec/p256-x86_64-table.h rename to crypto/fipsmodule/ec/p256-nistz-table.h index 3af0b0166..b81480bd1 100644 --- a/crypto/fipsmodule/ec/p256-x86_64-table.h +++ b/crypto/fipsmodule/ec/p256-nistz-table.h @@ -9,7 +9,7 @@ */ // This is the precomputed constant time access table for the code in -// p256-x86_64.c, for the default generator. The table consists of 37 +// p256-nistz.c, for the default generator. The table consists of 37 // subtables, each subtable contains 64 affine points. The affine points are // encoded as eight uint64's, four for the x coordinate and four for the y. // Both values are in little-endian order. There are 37 tables because a diff --git a/crypto/fipsmodule/ec/p256-x86_64.c b/crypto/fipsmodule/ec/p256-nistz.c similarity index 98% rename from crypto/fipsmodule/ec/p256-x86_64.c rename to crypto/fipsmodule/ec/p256-nistz.c index 506b7d2ce..12a4da6e0 100644 --- a/crypto/fipsmodule/ec/p256-x86_64.c +++ b/crypto/fipsmodule/ec/p256-nistz.c @@ -30,10 +30,10 @@ #include "../delocate.h" #include "../../internal.h" #include "internal.h" -#include "p256-x86_64.h" +#include "p256-nistz.h" - -#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \ +#if !defined(OPENSSL_NO_ASM) && \ + (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \ !defined(OPENSSL_SMALL) typedef P256_POINT_AFFINE PRECOMP256_ROW[64]; @@ -45,7 +45,7 @@ static const BN_ULONG ONE[P256_LIMBS] = { }; // Precomputed tables for the default generator -#include "p256-x86_64-table.h" +#include "p256-nistz-table.h" // Recode window to a signed digit, see |ec_GFp_nistp_recode_scalar_bits| in // util.c for details @@ -554,10 +554,12 @@ static void ecp_nistz256_inv0_mod_ord(const EC_GROUP *group, EC_SCALAR *out, static int ecp_nistz256_scalar_to_montgomery_inv_vartime(const EC_GROUP *group, EC_SCALAR *out, const EC_SCALAR *in) { +#if defined(OPENSSL_X86_64) if (!CRYPTO_is_AVX_capable()) { // No AVX support; fallback to generic code. 
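     // (This gate is x86_64-only: the x86_64 BEEU assembly uses AVX
     // instructions, while the AArch64 implementation added in this change
     // has no comparable CPU-feature requirement.)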
return ec_simple_scalar_to_montgomery_inv_vartime(group, out, in); } +#endif assert(group->order.width == P256_LIMBS); if (!beeu_mod_inverse_vartime(out->words, in->words, group->order.d)) { @@ -628,5 +630,6 @@ DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistz256_method) { out->cmp_x_coordinate = ecp_nistz256_cmp_x_coordinate; } -#endif /* !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \ +#endif /* !defined(OPENSSL_NO_ASM) && \ + (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \ !defined(OPENSSL_SMALL) */ diff --git a/crypto/fipsmodule/ec/p256-x86_64.h b/crypto/fipsmodule/ec/p256-nistz.h similarity index 95% rename from crypto/fipsmodule/ec/p256-x86_64.h rename to crypto/fipsmodule/ec/p256-nistz.h index 5deb81a3f..0d0a6bea4 100644 --- a/crypto/fipsmodule/ec/p256-x86_64.h +++ b/crypto/fipsmodule/ec/p256-nistz.h @@ -30,7 +30,8 @@ extern "C" { #endif -#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \ +#if !defined(OPENSSL_NO_ASM) && \ + (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \ !defined(OPENSSL_SMALL) // P-256 field operations. @@ -142,8 +143,9 @@ void ecp_nistz256_point_add(P256_POINT *r, const P256_POINT *a, void ecp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a, const P256_POINT_AFFINE *b); -#endif /* !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \ - !defined(OPENSSL_SMALL) */ +#endif /* !defined(OPENSSL_NO_ASM) && \ + (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \ + !defined(OPENSSL_SMALL) */ #if defined(__cplusplus) diff --git a/crypto/fipsmodule/ec/p256-x86_64_test.cc b/crypto/fipsmodule/ec/p256-nistz_test.cc similarity index 98% rename from crypto/fipsmodule/ec/p256-x86_64_test.cc rename to crypto/fipsmodule/ec/p256-nistz_test.cc index f6f070a30..73944db2c 100644 --- a/crypto/fipsmodule/ec/p256-x86_64_test.cc +++ b/crypto/fipsmodule/ec/p256-nistz_test.cc @@ -30,15 +30,16 @@ #include "../../test/abi_test.h" #include "../../test/file_test.h" #include "../../test/test_util.h" -#include "p256-x86_64.h" +#include "p256-nistz.h" // Disable tests if BORINGSSL_SHARED_LIBRARY is defined. These tests need access // to internal functions. -#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \ +#if !defined(OPENSSL_NO_ASM) && \ + (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \ !defined(OPENSSL_SMALL) && !defined(BORINGSSL_SHARED_LIBRARY) -TEST(P256_X86_64Test, SelectW5) { +TEST(P256_NistzTest, SelectW5) { // Fill a table with some garbage input. alignas(64) P256_POINT table[16]; for (size_t i = 0; i < 16; i++) { @@ -68,7 +69,7 @@ TEST(P256_X86_64Test, SelectW5) { CHECK_ABI(ecp_nistz256_select_w5, &val, table, 7); } -TEST(P256_X86_64Test, SelectW7) { +TEST(P256_NistzTest, SelectW7) { // Fill a table with some garbage input. alignas(64) P256_POINT_AFFINE table[64]; for (size_t i = 0; i < 64; i++) { @@ -97,11 +98,13 @@ TEST(P256_X86_64Test, SelectW7) { CHECK_ABI(ecp_nistz256_select_w7, &val, table, 42); } -TEST(P256_X86_64Test, BEEU) { +TEST(P256_NistzTest, BEEU) { +#if defined(OPENSSL_X86_64) if (!CRYPTO_is_AVX_capable()) { // No AVX support; cannot run the BEEU code. 
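     // (Same x86_64-only AVX gate as in p256-nistz.c; on AArch64 the BEEU
     // assembly is exercised unconditionally.)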
    return;
  }
+#endif
 
  bssl::UniquePtr<EC_GROUP> group(
      EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1));
@@ -483,8 +486,8 @@ static void TestOrdMulMont(FileTest *t) {
   }
 }
 
-TEST(P256_X86_64Test, TestVectors) {
-  return FileTestGTest("crypto/fipsmodule/ec/p256-x86_64_tests.txt",
+TEST(P256_NistzTest, TestVectors) {
+  return FileTestGTest("crypto/fipsmodule/ec/p256-nistz_tests.txt",
                        [](FileTest *t) {
                          if (t->GetParameter() == "Negate") {
                            TestNegate(t);
@@ -503,7 +506,7 @@ TEST(P256_X86_64Test, TestVectors) {
 }
 
 // Instrument the functions covered in TestVectors for ABI checking.
-TEST(P256_X86_64Test, ABI) {
+TEST(P256_NistzTest, ABI) {
   BN_ULONG a[P256_LIMBS], b[P256_LIMBS], c[P256_LIMBS];
   OPENSSL_memset(a, 0x01, sizeof(a));
   // These functions are all constant-time, so it is only necessary to
diff --git a/crypto/fipsmodule/ec/p256-x86_64_tests.txt b/crypto/fipsmodule/ec/p256-nistz_tests.txt
similarity index 100%
rename from crypto/fipsmodule/ec/p256-x86_64_tests.txt
rename to crypto/fipsmodule/ec/p256-nistz_tests.txt
diff --git a/sources.cmake b/sources.cmake
index 3d3465f17..434b3c232 100644
--- a/sources.cmake
+++ b/sources.cmake
@@ -49,7 +49,7 @@ set(
   crypto/fipsmodule/bn/bn_tests.txt
   crypto/fipsmodule/bn/miller_rabin_tests.txt
   crypto/fipsmodule/ec/ec_scalar_base_mult_tests.txt
-  crypto/fipsmodule/ec/p256-x86_64_tests.txt
+  crypto/fipsmodule/ec/p256-nistz_tests.txt
  crypto/fipsmodule/ecdsa/ecdsa_sign_tests.txt
  crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt
  crypto/fipsmodule/modes/gcm_tests.txt
diff --git a/util/fipstools/delocate/delocate.go b/util/fipstools/delocate/delocate.go
index 5d4b1f495..55c86715e 100644
--- a/util/fipstools/delocate/delocate.go
+++ b/util/fipstools/delocate/delocate.go
@@ -509,7 +509,7 @@ func (d *delocation) processAarch64Instruction(statement, instruction *node32) (
 		// This is a branch. Either the target needs to be written to a local
 		// version of the symbol to ensure that no relocations are emitted, or
 		// it needs to jump to a redirector function.
-		symbol, _, _, didChange, symbolIsLocal, _ := d.parseMemRef(arg.up)
+		symbol, offset, _, didChange, symbolIsLocal, _ := d.parseMemRef(arg.up)
 		changed = didChange
 
 		if _, knownSymbol := d.symbols[symbol]; knownSymbol {
@@ -520,6 +520,13 @@ func (d *delocation) processAarch64Instruction(statement, instruction *node32) (
 			d.redirectors[symbol] = redirector
 			symbol = redirector
 			changed = true
+		} else if didChange && symbolIsLocal && len(offset) > 0 {
+			// didChange is set when the inputFile index is not 0. Index 0 belongs to
+			// the first file copied to the output, which is the generated assembly
+			// of bcm.c. In subsequently copied assembly files, local symbols are
+			// renamed by appending (BCM_ + index), where `index` is incremented per
+			// file, so that they do not collide. If an offset follows the symbol,
+			// re-append the `offset` here.
+			symbol = symbol + offset
 		}
 
 		args = append(args, symbol)
diff --git a/util/fipstools/delocate/delocate.peg b/util/fipstools/delocate/delocate.peg
index c253a4822..82670654f 100644
--- a/util/fipstools/delocate/delocate.peg
+++ b/util/fipstools/delocate/delocate.peg
@@ -94,7 +94,7 @@ MemoryRef <- (SymbolRef BaseIndexScale / BaseIndexScale)
 SymbolRef <- (Offset* '+')? (LocalSymbol / SymbolName) Offset* ('@' Section Offset*)?
 Low12BitsSymbolRef <- ":lo12:" (LocalSymbol / SymbolName) Offset?
-ARMBaseIndexScale <- '[' ARMRegister (',' WS? (('#' Offset ('*' [0-9]+)? ) / ARMGOTLow12 / Low12BitsSymbolRef / ARMRegister) (',' WS? ARMConstantTweak)?)? ']' ARMPostincrement?
+ARMBaseIndexScale <- '[' ARMRegister (',' WS? (('#' Offset (('*' [0-9]+) / ('*' '(' [0-9]+ Operator [0-9]+ ')') / (('+' [0-9]+)*))? ) / ARMGOTLow12 / Low12BitsSymbolRef / ARMRegister) (',' WS? ARMConstantTweak)?)? ']' ARMPostincrement? ARMGOTLow12 <- ":got_lo12:" SymbolName ARMPostincrement <- '!' BaseIndexScale <- '(' RegisterOrConstant? WS? (',' WS? RegisterOrConstant WS? (',' [0-9]+)? )? ')' diff --git a/util/fipstools/delocate/delocate.peg.go b/util/fipstools/delocate/delocate.peg.go index ea7c195e6..6f5c65455 100644 --- a/util/fipstools/delocate/delocate.peg.go +++ b/util/fipstools/delocate/delocate.peg.go @@ -1,10 +1,14 @@ package main +// Code generated by ./peg/peg delocate.peg DO NOT EDIT. + import ( "fmt" - "math" + "io" + "os" "sort" "strconv" + "strings" ) const endSymbol rune = 1114112 @@ -142,19 +146,19 @@ type node32 struct { up, next *node32 } -func (node *node32) print(pretty bool, buffer string) { +func (node *node32) print(w io.Writer, pretty bool, buffer string) { var print func(node *node32, depth int) print = func(node *node32, depth int) { for node != nil { for c := 0; c < depth; c++ { - fmt.Printf(" ") + fmt.Fprintf(w, " ") } rule := rul3s[node.pegRule] quote := strconv.Quote(string(([]rune(buffer)[node.begin:node.end]))) if !pretty { - fmt.Printf("%v %v\n", rule, quote) + fmt.Fprintf(w, "%v %v\n", rule, quote) } else { - fmt.Printf("\x1B[34m%v\x1B[m %v\n", rule, quote) + fmt.Fprintf(w, "\x1B[36m%v\x1B[m %v\n", rule, quote) } if node.up != nil { print(node.up, depth+1) @@ -165,12 +169,12 @@ func (node *node32) print(pretty bool, buffer string) { print(node, 0) } -func (node *node32) Print(buffer string) { - node.print(false, buffer) +func (node *node32) Print(w io.Writer, buffer string) { + node.print(w, false, buffer) } -func (node *node32) PrettyPrint(buffer string) { - node.print(true, buffer) +func (node *node32) PrettyPrint(w io.Writer, buffer string) { + node.print(w, true, buffer) } type tokens32 struct { @@ -213,24 +217,24 @@ func (t *tokens32) AST() *node32 { } func (t *tokens32) PrintSyntaxTree(buffer string) { - t.AST().Print(buffer) + t.AST().Print(os.Stdout, buffer) +} + +func (t *tokens32) WriteSyntaxTree(w io.Writer, buffer string) { + t.AST().Print(w, buffer) } func (t *tokens32) PrettyPrintSyntaxTree(buffer string) { - t.AST().PrettyPrint(buffer) + t.AST().PrettyPrint(os.Stdout, buffer) } func (t *tokens32) Add(rule pegRule, begin, end, index uint32) { - if tree := t.tree; int(index) >= len(tree) { - expanded := make([]token32, 2*len(tree)) - copy(expanded, tree) - t.tree = expanded - } - t.tree[index] = token32{ - pegRule: rule, - begin: begin, - end: end, + tree, i := t.tree, int(index) + if i >= len(tree) { + t.tree = append(tree, token32{pegRule: rule, begin: begin, end: end}) + return } + tree[i] = token32{pegRule: rule, begin: begin, end: end} } func (t *tokens32) Tokens() []token32 { @@ -292,7 +296,7 @@ type parseError struct { } func (e *parseError) Error() string { - tokens, error := []token32{e.max}, "\n" + tokens, err := []token32{e.max}, "\n" positions, p := make([]int, 2*len(tokens)), 0 for _, token := range tokens { positions[p], p = int(token.begin), p+1 @@ -305,14 +309,14 @@ func (e *parseError) Error() string { } for _, token := range tokens { begin, end := int(token.begin), int(token.end) - error += fmt.Sprintf(format, + err += fmt.Sprintf(format, rul3s[token.pegRule], translations[begin].line, translations[begin].symbol, translations[end].line, translations[end].symbol, strconv.Quote(string(e.p.buffer[begin:end]))) } - return 
error + return err } func (p *Asm) PrintSyntaxTree() { @@ -323,12 +327,41 @@ func (p *Asm) PrintSyntaxTree() { } } -func (p *Asm) Init() { +func (p *Asm) WriteSyntaxTree(w io.Writer) { + p.tokens32.WriteSyntaxTree(w, p.Buffer) +} + +func (p *Asm) SprintSyntaxTree() string { + var bldr strings.Builder + p.WriteSyntaxTree(&bldr) + return bldr.String() +} + +func Pretty(pretty bool) func(*Asm) error { + return func(p *Asm) error { + p.Pretty = pretty + return nil + } +} + +func Size(size int) func(*Asm) error { + return func(p *Asm) error { + p.tokens32 = tokens32{tree: make([]token32, 0, size)} + return nil + } +} +func (p *Asm) Init(options ...func(*Asm) error) error { var ( max token32 position, tokenIndex uint32 buffer []rune ) + for _, option := range options { + err := option(p) + if err != nil { + return err + } + } p.reset = func() { max = token32{} position, tokenIndex = 0, 0 @@ -342,7 +375,7 @@ func (p *Asm) Init() { p.reset() _rules := p.rules - tree := tokens32{tree: make([]token32, math.MaxInt16)} + tree := p.tokens32 p.parse = func(rule ...int) error { r := 1 if len(rule) > 0 { @@ -5708,7 +5741,7 @@ func (p *Asm) Init() { position, tokenIndex = position727, tokenIndex727 return false }, - /* 46 ARMBaseIndexScale <- <('[' ARMRegister (',' WS? (('#' Offset ('*' [0-9]+)?) / ARMGOTLow12 / Low12BitsSymbolRef / ARMRegister) (',' WS? ARMConstantTweak)?)? ']' ARMPostincrement?)> */ + /* 46 ARMBaseIndexScale <- <('[' ARMRegister (',' WS? (('#' Offset (('*' [0-9]+) / ('*' '(' [0-9]+ Operator [0-9]+ ')') / ('+' [0-9]+)*)?) / ARMGOTLow12 / Low12BitsSymbolRef / ARMRegister) (',' WS? ARMConstantTweak)?)? ']' ARMPostincrement?)> */ func() bool { position737, tokenIndex737 := position, tokenIndex { @@ -5747,27 +5780,108 @@ func (p *Asm) Init() { } { position745, tokenIndex745 := position, tokenIndex - if buffer[position] != rune('*') { - goto l745 - } - position++ - if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l745 - } - position++ - l747: { - position748, tokenIndex748 := position, tokenIndex + position747, tokenIndex747 := position, tokenIndex + if buffer[position] != rune('*') { + goto l748 + } + position++ if c := buffer[position]; c < rune('0') || c > rune('9') { goto l748 } position++ + l749: + { + position750, tokenIndex750 := position, tokenIndex + if c := buffer[position]; c < rune('0') || c > rune('9') { + goto l750 + } + position++ + goto l749 + l750: + position, tokenIndex = position750, tokenIndex750 + } goto l747 l748: - position, tokenIndex = position748, tokenIndex748 + position, tokenIndex = position747, tokenIndex747 + if buffer[position] != rune('*') { + goto l751 + } + position++ + if buffer[position] != rune('(') { + goto l751 + } + position++ + if c := buffer[position]; c < rune('0') || c > rune('9') { + goto l751 + } + position++ + l752: + { + position753, tokenIndex753 := position, tokenIndex + if c := buffer[position]; c < rune('0') || c > rune('9') { + goto l753 + } + position++ + goto l752 + l753: + position, tokenIndex = position753, tokenIndex753 + } + if !_rules[ruleOperator]() { + goto l751 + } + if c := buffer[position]; c < rune('0') || c > rune('9') { + goto l751 + } + position++ + l754: + { + position755, tokenIndex755 := position, tokenIndex + if c := buffer[position]; c < rune('0') || c > rune('9') { + goto l755 + } + position++ + goto l754 + l755: + position, tokenIndex = position755, tokenIndex755 + } + if buffer[position] != rune(')') { + goto l751 + } + position++ + goto l747 + l751: + position, tokenIndex = position747, 
tokenIndex747 + l756: + { + position757, tokenIndex757 := position, tokenIndex + if buffer[position] != rune('+') { + goto l757 + } + position++ + if c := buffer[position]; c < rune('0') || c > rune('9') { + goto l757 + } + position++ + l758: + { + position759, tokenIndex759 := position, tokenIndex + if c := buffer[position]; c < rune('0') || c > rune('9') { + goto l759 + } + position++ + goto l758 + l759: + position, tokenIndex = position759, tokenIndex759 + } + goto l756 + l757: + position, tokenIndex = position757, tokenIndex757 + } } + l747: goto l746 - l745: + position, tokenIndex = position745, tokenIndex745 } l746: @@ -5775,16 +5889,16 @@ func (p *Asm) Init() { l744: position, tokenIndex = position743, tokenIndex743 if !_rules[ruleARMGOTLow12]() { - goto l749 + goto l760 } goto l743 - l749: + l760: position, tokenIndex = position743, tokenIndex743 if !_rules[ruleLow12BitsSymbolRef]() { - goto l750 + goto l761 } goto l743 - l750: + l761: position, tokenIndex = position743, tokenIndex743 if !_rules[ruleARMRegister]() { goto l739 @@ -5792,29 +5906,29 @@ func (p *Asm) Init() { } l743: { - position751, tokenIndex751 := position, tokenIndex + position762, tokenIndex762 := position, tokenIndex if buffer[position] != rune(',') { - goto l751 + goto l762 } position++ { - position753, tokenIndex753 := position, tokenIndex + position764, tokenIndex764 := position, tokenIndex if !_rules[ruleWS]() { - goto l753 + goto l764 } - goto l754 - l753: - position, tokenIndex = position753, tokenIndex753 + goto l765 + l764: + position, tokenIndex = position764, tokenIndex764 } - l754: + l765: if !_rules[ruleARMConstantTweak]() { - goto l751 + goto l762 } - goto l752 - l751: - position, tokenIndex = position751, tokenIndex751 + goto l763 + l762: + position, tokenIndex = position762, tokenIndex762 } - l752: + l763: goto l740 l739: position, tokenIndex = position739, tokenIndex739 @@ -5825,15 +5939,15 @@ func (p *Asm) Init() { } position++ { - position755, tokenIndex755 := position, tokenIndex + position766, tokenIndex766 := position, tokenIndex if !_rules[ruleARMPostincrement]() { - goto l755 + goto l766 } - goto l756 - l755: - position, tokenIndex = position755, tokenIndex755 + goto l767 + l766: + position, tokenIndex = position766, tokenIndex766 } - l756: + l767: add(ruleARMBaseIndexScale, position738) } return true @@ -5843,566 +5957,567 @@ func (p *Asm) Init() { }, /* 47 ARMGOTLow12 <- <(':' ('g' / 'G') ('o' / 'O') ('t' / 'T') '_' ('l' / 'L') ('o' / 'O') '1' '2' ':' SymbolName)> */ func() bool { - position757, tokenIndex757 := position, tokenIndex + position768, tokenIndex768 := position, tokenIndex { - position758 := position + position769 := position if buffer[position] != rune(':') { - goto l757 + goto l768 } position++ { - position759, tokenIndex759 := position, tokenIndex + position770, tokenIndex770 := position, tokenIndex if buffer[position] != rune('g') { - goto l760 + goto l771 } position++ - goto l759 - l760: - position, tokenIndex = position759, tokenIndex759 + goto l770 + l771: + position, tokenIndex = position770, tokenIndex770 if buffer[position] != rune('G') { - goto l757 + goto l768 } position++ } - l759: + l770: { - position761, tokenIndex761 := position, tokenIndex + position772, tokenIndex772 := position, tokenIndex if buffer[position] != rune('o') { - goto l762 + goto l773 } position++ - goto l761 - l762: - position, tokenIndex = position761, tokenIndex761 + goto l772 + l773: + position, tokenIndex = position772, tokenIndex772 if buffer[position] != rune('O') { - goto l757 + goto 
l768 } position++ } - l761: + l772: { - position763, tokenIndex763 := position, tokenIndex + position774, tokenIndex774 := position, tokenIndex if buffer[position] != rune('t') { - goto l764 + goto l775 } position++ - goto l763 - l764: - position, tokenIndex = position763, tokenIndex763 + goto l774 + l775: + position, tokenIndex = position774, tokenIndex774 if buffer[position] != rune('T') { - goto l757 + goto l768 } position++ } - l763: + l774: if buffer[position] != rune('_') { - goto l757 + goto l768 } position++ { - position765, tokenIndex765 := position, tokenIndex + position776, tokenIndex776 := position, tokenIndex if buffer[position] != rune('l') { - goto l766 + goto l777 } position++ - goto l765 - l766: - position, tokenIndex = position765, tokenIndex765 + goto l776 + l777: + position, tokenIndex = position776, tokenIndex776 if buffer[position] != rune('L') { - goto l757 + goto l768 } position++ } - l765: + l776: { - position767, tokenIndex767 := position, tokenIndex + position778, tokenIndex778 := position, tokenIndex if buffer[position] != rune('o') { - goto l768 + goto l779 } position++ - goto l767 - l768: - position, tokenIndex = position767, tokenIndex767 + goto l778 + l779: + position, tokenIndex = position778, tokenIndex778 if buffer[position] != rune('O') { - goto l757 + goto l768 } position++ } - l767: + l778: if buffer[position] != rune('1') { - goto l757 + goto l768 } position++ if buffer[position] != rune('2') { - goto l757 + goto l768 } position++ if buffer[position] != rune(':') { - goto l757 + goto l768 } position++ if !_rules[ruleSymbolName]() { - goto l757 + goto l768 } - add(ruleARMGOTLow12, position758) + add(ruleARMGOTLow12, position769) } return true - l757: - position, tokenIndex = position757, tokenIndex757 + l768: + position, tokenIndex = position768, tokenIndex768 return false }, /* 48 ARMPostincrement <- <'!'> */ func() bool { - position769, tokenIndex769 := position, tokenIndex + position780, tokenIndex780 := position, tokenIndex { - position770 := position + position781 := position if buffer[position] != rune('!') { - goto l769 + goto l780 } position++ - add(ruleARMPostincrement, position770) + add(ruleARMPostincrement, position781) } return true - l769: - position, tokenIndex = position769, tokenIndex769 + l780: + position, tokenIndex = position780, tokenIndex780 return false }, /* 49 BaseIndexScale <- <('(' RegisterOrConstant? WS? (',' WS? RegisterOrConstant WS? (',' [0-9]+)?)? 
')')> */ func() bool { - position771, tokenIndex771 := position, tokenIndex + position782, tokenIndex782 := position, tokenIndex { - position772 := position + position783 := position if buffer[position] != rune('(') { - goto l771 + goto l782 } position++ { - position773, tokenIndex773 := position, tokenIndex + position784, tokenIndex784 := position, tokenIndex if !_rules[ruleRegisterOrConstant]() { - goto l773 + goto l784 } - goto l774 - l773: - position, tokenIndex = position773, tokenIndex773 + goto l785 + l784: + position, tokenIndex = position784, tokenIndex784 } - l774: + l785: { - position775, tokenIndex775 := position, tokenIndex + position786, tokenIndex786 := position, tokenIndex if !_rules[ruleWS]() { - goto l775 + goto l786 } - goto l776 - l775: - position, tokenIndex = position775, tokenIndex775 + goto l787 + l786: + position, tokenIndex = position786, tokenIndex786 } - l776: + l787: { - position777, tokenIndex777 := position, tokenIndex + position788, tokenIndex788 := position, tokenIndex if buffer[position] != rune(',') { - goto l777 + goto l788 } position++ { - position779, tokenIndex779 := position, tokenIndex + position790, tokenIndex790 := position, tokenIndex if !_rules[ruleWS]() { - goto l779 + goto l790 } - goto l780 - l779: - position, tokenIndex = position779, tokenIndex779 + goto l791 + l790: + position, tokenIndex = position790, tokenIndex790 } - l780: + l791: if !_rules[ruleRegisterOrConstant]() { - goto l777 + goto l788 } { - position781, tokenIndex781 := position, tokenIndex + position792, tokenIndex792 := position, tokenIndex if !_rules[ruleWS]() { - goto l781 + goto l792 } - goto l782 - l781: - position, tokenIndex = position781, tokenIndex781 + goto l793 + l792: + position, tokenIndex = position792, tokenIndex792 } - l782: + l793: { - position783, tokenIndex783 := position, tokenIndex + position794, tokenIndex794 := position, tokenIndex if buffer[position] != rune(',') { - goto l783 + goto l794 } position++ if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l783 + goto l794 } position++ - l785: + l796: { - position786, tokenIndex786 := position, tokenIndex + position797, tokenIndex797 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l786 + goto l797 } position++ - goto l785 - l786: - position, tokenIndex = position786, tokenIndex786 + goto l796 + l797: + position, tokenIndex = position797, tokenIndex797 } - goto l784 - l783: - position, tokenIndex = position783, tokenIndex783 + goto l795 + l794: + position, tokenIndex = position794, tokenIndex794 } - l784: - goto l778 - l777: - position, tokenIndex = position777, tokenIndex777 + l795: + goto l789 + l788: + position, tokenIndex = position788, tokenIndex788 } - l778: + l789: if buffer[position] != rune(')') { - goto l771 + goto l782 } position++ - add(ruleBaseIndexScale, position772) + add(ruleBaseIndexScale, position783) } return true - l771: - position, tokenIndex = position771, tokenIndex771 + l782: + position, tokenIndex = position782, tokenIndex782 return false }, /* 50 Operator <- <('+' / '-')> */ func() bool { - position787, tokenIndex787 := position, tokenIndex + position798, tokenIndex798 := position, tokenIndex { - position788 := position + position799 := position { - position789, tokenIndex789 := position, tokenIndex + position800, tokenIndex800 := position, tokenIndex if buffer[position] != rune('+') { - goto l790 + goto l801 } position++ - goto l789 - l790: - position, tokenIndex = position789, tokenIndex789 + goto l800 + l801: + position, 
tokenIndex = position800, tokenIndex800 if buffer[position] != rune('-') { - goto l787 + goto l798 } position++ } - l789: - add(ruleOperator, position788) + l800: + add(ruleOperator, position799) } return true - l787: - position, tokenIndex = position787, tokenIndex787 + l798: + position, tokenIndex = position798, tokenIndex798 return false }, /* 51 Offset <- <('+'? '-'? (('0' ('b' / 'B') ('0' / '1')+) / ('0' ('x' / 'X') ([0-9] / [0-9] / ([a-f] / [A-F]))+) / [0-9]+))> */ func() bool { - position791, tokenIndex791 := position, tokenIndex + position802, tokenIndex802 := position, tokenIndex { - position792 := position + position803 := position { - position793, tokenIndex793 := position, tokenIndex + position804, tokenIndex804 := position, tokenIndex if buffer[position] != rune('+') { - goto l793 + goto l804 } position++ - goto l794 - l793: - position, tokenIndex = position793, tokenIndex793 + goto l805 + l804: + position, tokenIndex = position804, tokenIndex804 } - l794: + l805: { - position795, tokenIndex795 := position, tokenIndex + position806, tokenIndex806 := position, tokenIndex if buffer[position] != rune('-') { - goto l795 + goto l806 } position++ - goto l796 - l795: - position, tokenIndex = position795, tokenIndex795 + goto l807 + l806: + position, tokenIndex = position806, tokenIndex806 } - l796: + l807: { - position797, tokenIndex797 := position, tokenIndex + position808, tokenIndex808 := position, tokenIndex if buffer[position] != rune('0') { - goto l798 + goto l809 } position++ { - position799, tokenIndex799 := position, tokenIndex + position810, tokenIndex810 := position, tokenIndex if buffer[position] != rune('b') { - goto l800 + goto l811 } position++ - goto l799 - l800: - position, tokenIndex = position799, tokenIndex799 + goto l810 + l811: + position, tokenIndex = position810, tokenIndex810 if buffer[position] != rune('B') { - goto l798 + goto l809 } position++ } - l799: + l810: { - position803, tokenIndex803 := position, tokenIndex + position814, tokenIndex814 := position, tokenIndex if buffer[position] != rune('0') { - goto l804 + goto l815 } position++ - goto l803 - l804: - position, tokenIndex = position803, tokenIndex803 + goto l814 + l815: + position, tokenIndex = position814, tokenIndex814 if buffer[position] != rune('1') { - goto l798 + goto l809 } position++ } - l803: - l801: + l814: + l812: { - position802, tokenIndex802 := position, tokenIndex + position813, tokenIndex813 := position, tokenIndex { - position805, tokenIndex805 := position, tokenIndex + position816, tokenIndex816 := position, tokenIndex if buffer[position] != rune('0') { - goto l806 + goto l817 } position++ - goto l805 - l806: - position, tokenIndex = position805, tokenIndex805 + goto l816 + l817: + position, tokenIndex = position816, tokenIndex816 if buffer[position] != rune('1') { - goto l802 + goto l813 } position++ } - l805: - goto l801 - l802: - position, tokenIndex = position802, tokenIndex802 + l816: + goto l812 + l813: + position, tokenIndex = position813, tokenIndex813 } - goto l797 - l798: - position, tokenIndex = position797, tokenIndex797 + goto l808 + l809: + position, tokenIndex = position808, tokenIndex808 if buffer[position] != rune('0') { - goto l807 + goto l818 } position++ { - position808, tokenIndex808 := position, tokenIndex + position819, tokenIndex819 := position, tokenIndex if buffer[position] != rune('x') { - goto l809 + goto l820 } position++ - goto l808 - l809: - position, tokenIndex = position808, tokenIndex808 + goto l819 + l820: + position, tokenIndex = position819, 
tokenIndex819 if buffer[position] != rune('X') { - goto l807 + goto l818 } position++ } - l808: + l819: { - position812, tokenIndex812 := position, tokenIndex + position823, tokenIndex823 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l813 + goto l824 } position++ - goto l812 - l813: - position, tokenIndex = position812, tokenIndex812 + goto l823 + l824: + position, tokenIndex = position823, tokenIndex823 if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l814 + goto l825 } position++ - goto l812 - l814: - position, tokenIndex = position812, tokenIndex812 + goto l823 + l825: + position, tokenIndex = position823, tokenIndex823 { - position815, tokenIndex815 := position, tokenIndex + position826, tokenIndex826 := position, tokenIndex if c := buffer[position]; c < rune('a') || c > rune('f') { - goto l816 + goto l827 } position++ - goto l815 - l816: - position, tokenIndex = position815, tokenIndex815 + goto l826 + l827: + position, tokenIndex = position826, tokenIndex826 if c := buffer[position]; c < rune('A') || c > rune('F') { - goto l807 + goto l818 } position++ } - l815: + l826: } - l812: - l810: + l823: + l821: { - position811, tokenIndex811 := position, tokenIndex + position822, tokenIndex822 := position, tokenIndex { - position817, tokenIndex817 := position, tokenIndex + position828, tokenIndex828 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l818 + goto l829 } position++ - goto l817 - l818: - position, tokenIndex = position817, tokenIndex817 + goto l828 + l829: + position, tokenIndex = position828, tokenIndex828 if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l819 + goto l830 } position++ - goto l817 - l819: - position, tokenIndex = position817, tokenIndex817 + goto l828 + l830: + position, tokenIndex = position828, tokenIndex828 { - position820, tokenIndex820 := position, tokenIndex + position831, tokenIndex831 := position, tokenIndex if c := buffer[position]; c < rune('a') || c > rune('f') { - goto l821 + goto l832 } position++ - goto l820 - l821: - position, tokenIndex = position820, tokenIndex820 + goto l831 + l832: + position, tokenIndex = position831, tokenIndex831 if c := buffer[position]; c < rune('A') || c > rune('F') { - goto l811 + goto l822 } position++ } - l820: + l831: } - l817: - goto l810 - l811: - position, tokenIndex = position811, tokenIndex811 + l828: + goto l821 + l822: + position, tokenIndex = position822, tokenIndex822 } - goto l797 - l807: - position, tokenIndex = position797, tokenIndex797 + goto l808 + l818: + position, tokenIndex = position808, tokenIndex808 if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l791 + goto l802 } position++ - l822: + l833: { - position823, tokenIndex823 := position, tokenIndex + position834, tokenIndex834 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l823 + goto l834 } position++ - goto l822 - l823: - position, tokenIndex = position823, tokenIndex823 + goto l833 + l834: + position, tokenIndex = position834, tokenIndex834 } } - l797: - add(ruleOffset, position792) + l808: + add(ruleOffset, position803) } return true - l791: - position, tokenIndex = position791, tokenIndex791 + l802: + position, tokenIndex = position802, tokenIndex802 return false }, /* 52 Section <- <([a-z] / [A-Z] / '@')+> */ func() bool { - position824, tokenIndex824 := position, tokenIndex + position835, tokenIndex835 := position, tokenIndex { - position825 := position + position836 
:= position { - position828, tokenIndex828 := position, tokenIndex + position839, tokenIndex839 := position, tokenIndex if c := buffer[position]; c < rune('a') || c > rune('z') { - goto l829 + goto l840 } position++ - goto l828 - l829: - position, tokenIndex = position828, tokenIndex828 + goto l839 + l840: + position, tokenIndex = position839, tokenIndex839 if c := buffer[position]; c < rune('A') || c > rune('Z') { - goto l830 + goto l841 } position++ - goto l828 - l830: - position, tokenIndex = position828, tokenIndex828 + goto l839 + l841: + position, tokenIndex = position839, tokenIndex839 if buffer[position] != rune('@') { - goto l824 + goto l835 } position++ } - l828: - l826: + l839: + l837: { - position827, tokenIndex827 := position, tokenIndex + position838, tokenIndex838 := position, tokenIndex { - position831, tokenIndex831 := position, tokenIndex + position842, tokenIndex842 := position, tokenIndex if c := buffer[position]; c < rune('a') || c > rune('z') { - goto l832 + goto l843 } position++ - goto l831 - l832: - position, tokenIndex = position831, tokenIndex831 + goto l842 + l843: + position, tokenIndex = position842, tokenIndex842 if c := buffer[position]; c < rune('A') || c > rune('Z') { - goto l833 + goto l844 } position++ - goto l831 - l833: - position, tokenIndex = position831, tokenIndex831 + goto l842 + l844: + position, tokenIndex = position842, tokenIndex842 if buffer[position] != rune('@') { - goto l827 + goto l838 } position++ } - l831: - goto l826 - l827: - position, tokenIndex = position827, tokenIndex827 + l842: + goto l837 + l838: + position, tokenIndex = position838, tokenIndex838 } - add(ruleSection, position825) + add(ruleSection, position836) } return true - l824: - position, tokenIndex = position824, tokenIndex824 + l835: + position, tokenIndex = position835, tokenIndex835 return false }, /* 53 SegmentRegister <- <('%' ([c-g] / 's') ('s' ':'))> */ func() bool { - position834, tokenIndex834 := position, tokenIndex + position845, tokenIndex845 := position, tokenIndex { - position835 := position + position846 := position if buffer[position] != rune('%') { - goto l834 + goto l845 } position++ { - position836, tokenIndex836 := position, tokenIndex + position847, tokenIndex847 := position, tokenIndex if c := buffer[position]; c < rune('c') || c > rune('g') { - goto l837 + goto l848 } position++ - goto l836 - l837: - position, tokenIndex = position836, tokenIndex836 + goto l847 + l848: + position, tokenIndex = position847, tokenIndex847 if buffer[position] != rune('s') { - goto l834 + goto l845 } position++ } - l836: + l847: if buffer[position] != rune('s') { - goto l834 + goto l845 } position++ if buffer[position] != rune(':') { - goto l834 + goto l845 } position++ - add(ruleSegmentRegister, position835) + add(ruleSegmentRegister, position846) } return true - l834: - position, tokenIndex = position834, tokenIndex834 + l845: + position, tokenIndex = position845, tokenIndex845 return false }, } p.rules = _rules + return nil }
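
For reference: the contract of beeu_mod_inverse_vartime, exercised throughout
this change, is out = a^(-1) mod n, returning 1 on success and 0 when the
inverse does not exist. Below is a minimal Go sketch of that contract using
math/big rather than the binary algorithm above; the name modInverseVartime
and the sample values are illustrative only and are not part of this patch.

package main

import (
	"fmt"
	"math/big"
)

// modInverseVartime mirrors the beeu_mod_inverse_vartime contract: it
// returns a^-1 mod n and reports whether the inverse exists.
func modInverseVartime(a, n *big.Int) (*big.Int, bool) {
	inv := new(big.Int).ModInverse(a, n)
	if inv == nil {
		// gcd(a, n) != 1, so no inverse exists; the assembly returns 0
		// through .Lbeeu_err in this case.
		return nil, false
	}
	return inv, true
}

func main() {
	// The P-256 group order.
	n, _ := new(big.Int).SetString(
		"ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551", 16)
	a := big.NewInt(3)
	if inv, ok := modInverseVartime(a, n); ok {
		// Sanity check: (a * inv) mod n == 1.
		check := new(big.Int).Mul(a, inv)
		fmt.Println(check.Mod(check, n)) // prints 1
	}
}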