// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
#include <openssl/arm_arch.h>
.section .rodata

.align 7
.Lchacha20_consts:
.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
.Linc:
.long 1,2,3,4
.Lrol8:
.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
.Lclamp:
.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC

.text

.type .Lpoly_hash_ad_internal,%function
|
||
|
.align 6
|
||
|
.Lpoly_hash_ad_internal:
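// Helper that absorbs the AAD into the Poly1305 state, 16 bytes at a time.
// As set up by the callers below: x3/x4 hold the AAD pointer and length,
// x8-x10 the 130-bit accumulator, x16/x17 the clamped r key, and x15 the
// constant 1 used to set bit 128 of each full block before it is absorbed.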
|
||
|
.cfi_startproc
|
||
|
cbnz x4, .Lpoly_hash_intro
|
||
|
ret
|
||
|
|
||
|
.Lpoly_hash_intro:
|
||
|
cmp x4, #16
|
||
|
b.lt .Lpoly_hash_ad_tail
|
||
|
ldp x11, x12, [x3], 16
|
||
|
adds x8, x8, x11
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, x15
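// Multiply the accumulator by r and reduce modulo 2^130 - 5: the product
// limbs above 2^130 are folded back using 2^130 == 5 (mod p), with the
// multiplication by 5 done as 4*h + h via an AND mask and a 2-bit shift
// rather than an explicit multiply.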
|
||
|
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
|
||
|
umulh x12, x8, x16
|
||
|
mul x13, x9, x16
|
||
|
umulh x14, x9, x16
|
||
|
adds x12, x12, x13
|
||
|
mul x13, x10, x16
|
||
|
adc x13, x13, x14
|
||
|
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
|
||
|
umulh x8, x8, x17
|
||
|
adds x12, x12, x14
|
||
|
mul x14, x9, x17
|
||
|
umulh x9, x9, x17
|
||
|
adcs x14, x14, x8
|
||
|
mul x10, x10, x17
|
||
|
adc x10, x10, x9
|
||
|
adds x13, x13, x14
|
||
|
adc x14, x10, xzr
|
||
|
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
|
||
|
and x8, x13, #-4
|
||
|
extr x13, x14, x13, #2
|
||
|
adds x8, x8, x11
|
||
|
lsr x11, x14, #2
|
||
|
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
|
||
|
adds x8, x8, x13
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, xzr // At this point acc2 has the value of 4 at most
|
||
|
sub x4, x4, #16
|
||
|
b .Lpoly_hash_ad_internal
|
||
|
|
||
|
.Lpoly_hash_ad_tail:
|
||
|
cbz x4, .Lpoly_hash_ad_ret
|
||
|
|
||
|
eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD
|
||
|
sub x4, x4, #1
|
||
|
|
||
|
.Lpoly_hash_tail_16_compose:
|
||
|
ext v20.16b, v20.16b, v20.16b, #15
|
||
|
ldrb w11, [x3, x4]
|
||
|
mov v20.b[0], w11
|
||
|
subs x4, x4, #1
|
||
|
b.ge .Lpoly_hash_tail_16_compose
|
||
|
mov x11, v20.d[0]
|
||
|
mov x12, v20.d[1]
|
||
|
adds x8, x8, x11
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, x15
|
||
|
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
|
||
|
umulh x12, x8, x16
|
||
|
mul x13, x9, x16
|
||
|
umulh x14, x9, x16
|
||
|
adds x12, x12, x13
|
||
|
mul x13, x10, x16
|
||
|
adc x13, x13, x14
|
||
|
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
|
||
|
umulh x8, x8, x17
|
||
|
adds x12, x12, x14
|
||
|
mul x14, x9, x17
|
||
|
umulh x9, x9, x17
|
||
|
adcs x14, x14, x8
|
||
|
mul x10, x10, x17
|
||
|
adc x10, x10, x9
|
||
|
adds x13, x13, x14
|
||
|
adc x14, x10, xzr
|
||
|
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
|
||
|
and x8, x13, #-4
|
||
|
extr x13, x14, x13, #2
|
||
|
adds x8, x8, x11
|
||
|
lsr x11, x14, #2
|
||
|
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
|
||
|
adds x8, x8, x13
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, xzr // At this point acc2 has the value of 4 at most
|
||
|
|
||
|
.Lpoly_hash_ad_ret:
|
||
|
ret
|
||
|
.cfi_endproc
|
||
|
.size .Lpoly_hash_ad_internal, .-.Lpoly_hash_ad_internal
|
||
|
|
||
|
/////////////////////////////////
|
||
|
//
|
||
|
// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data);
|
||
|
//
|
||
|
.globl chacha20_poly1305_seal
|
||
|
.hidden chacha20_poly1305_seal
|
||
|
.type chacha20_poly1305_seal,%function
|
||
|
.align 6
|
||
|
chacha20_poly1305_seal:
|
||
|
AARCH64_SIGN_LINK_REGISTER
|
||
|
.cfi_startproc
|
||
|
stp x29, x30, [sp, #-80]!
|
||
|
.cfi_def_cfa_offset 80
|
||
|
.cfi_offset w30, -72
|
||
|
.cfi_offset w29, -80
|
||
|
mov x29, sp
|
||
|
// We probably could do .cfi_def_cfa w29, 80 at this point, but since
|
||
|
// we don't actually use the frame pointer like that, it's probably not
|
||
|
// worth bothering.
|
||
|
stp d8, d9, [sp, #16]
|
||
|
stp d10, d11, [sp, #32]
|
||
|
stp d12, d13, [sp, #48]
|
||
|
stp d14, d15, [sp, #64]
|
||
|
.cfi_offset b15, -8
|
||
|
.cfi_offset b14, -16
|
||
|
.cfi_offset b13, -24
|
||
|
.cfi_offset b12, -32
|
||
|
.cfi_offset b11, -40
|
||
|
.cfi_offset b10, -48
|
||
|
.cfi_offset b9, -56
|
||
|
.cfi_offset b8, -64
|
||
|
|
||
|
adrp x11, .Lchacha20_consts
|
||
|
add x11, x11, :lo12:.Lchacha20_consts
|
||
|
|
||
|
ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
|
||
|
ld1 {v28.16b - v30.16b}, [x5]
|
||
|
|
||
|
mov x15, #1 // Prepare the Poly1305 state
|
||
|
mov x8, #0
|
||
|
mov x9, #0
|
||
|
mov x10, #0
|
||
|
|
||
|
ldr x12, [x5, #56] // The total cipher text length includes extra_in_len
|
||
|
add x12, x12, x2
|
||
|
mov v31.d[0], x4 // Store the input and aad lengths
|
||
|
mov v31.d[1], x12
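// v31 is absorbed as the last Poly1305 block in .Lseal_finalize, giving the
// standard AEAD length block of aad_len || total ciphertext length.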
|
||
|
|
||
|
cmp x2, #128
|
||
|
b.le .Lseal_128 // Optimization for smaller buffers
|
||
|
|
||
|
// Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext,
|
||
|
// and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically,
|
||
|
// the fifth block (A4-D4) horizontally.
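// "Vertically" means lane i of v0-v3/v5-v8/v10-v13/v15-v18 belongs to block i:
// ld4r splats each state word across a register and the .Linc vector gives the
// counter word in v15 a distinct value per lane. The fifth block is kept as
// four whole rows in v4/v9/v14/v19 instead.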
|
||
|
ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
|
||
|
mov v4.16b, v24.16b
|
||
|
|
||
|
ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
|
||
|
mov v9.16b, v28.16b
|
||
|
|
||
|
ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
|
||
|
mov v14.16b, v29.16b
|
||
|
|
||
|
ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
|
||
|
add v15.4s, v15.4s, v25.4s
|
||
|
mov v19.16b, v30.16b
|
||
|
|
||
|
sub x5, x5, #32
|
||
|
|
||
|
mov x6, #10
|
||
|
|
||
|
.align 5
|
||
|
.Lseal_init_rounds:
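// One pass of this loop is a full ChaCha20 double round over all five blocks.
// The rotations are done without a rotate instruction: rev32 on .8h lanes is
// the 16-bit rotate, ushr+sli pairs give the 12- and 7-bit rotates, and tbl
// with the .Lrol8 permutation gives the 8-bit rotate.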
|
||
|
add v0.4s, v0.4s, v5.4s
|
||
|
add v1.4s, v1.4s, v6.4s
|
||
|
add v2.4s, v2.4s, v7.4s
|
||
|
add v3.4s, v3.4s, v8.4s
|
||
|
add v4.4s, v4.4s, v9.4s
|
||
|
|
||
|
eor v15.16b, v15.16b, v0.16b
|
||
|
eor v16.16b, v16.16b, v1.16b
|
||
|
eor v17.16b, v17.16b, v2.16b
|
||
|
eor v18.16b, v18.16b, v3.16b
|
||
|
eor v19.16b, v19.16b, v4.16b
|
||
|
|
||
|
rev32 v15.8h, v15.8h
|
||
|
rev32 v16.8h, v16.8h
|
||
|
rev32 v17.8h, v17.8h
|
||
|
rev32 v18.8h, v18.8h
|
||
|
rev32 v19.8h, v19.8h
|
||
|
|
||
|
add v10.4s, v10.4s, v15.4s
|
||
|
add v11.4s, v11.4s, v16.4s
|
||
|
add v12.4s, v12.4s, v17.4s
|
||
|
add v13.4s, v13.4s, v18.4s
|
||
|
add v14.4s, v14.4s, v19.4s
|
||
|
|
||
|
eor v5.16b, v5.16b, v10.16b
|
||
|
eor v6.16b, v6.16b, v11.16b
|
||
|
eor v7.16b, v7.16b, v12.16b
|
||
|
eor v8.16b, v8.16b, v13.16b
|
||
|
eor v9.16b, v9.16b, v14.16b
|
||
|
|
||
|
ushr v20.4s, v5.4s, #20
|
||
|
sli v20.4s, v5.4s, #12
|
||
|
ushr v5.4s, v6.4s, #20
|
||
|
sli v5.4s, v6.4s, #12
|
||
|
ushr v6.4s, v7.4s, #20
|
||
|
sli v6.4s, v7.4s, #12
|
||
|
ushr v7.4s, v8.4s, #20
|
||
|
sli v7.4s, v8.4s, #12
|
||
|
ushr v8.4s, v9.4s, #20
|
||
|
sli v8.4s, v9.4s, #12
|
||
|
|
||
|
add v0.4s, v0.4s, v20.4s
|
||
|
add v1.4s, v1.4s, v5.4s
|
||
|
add v2.4s, v2.4s, v6.4s
|
||
|
add v3.4s, v3.4s, v7.4s
|
||
|
add v4.4s, v4.4s, v8.4s
|
||
|
|
||
|
eor v15.16b, v15.16b, v0.16b
|
||
|
eor v16.16b, v16.16b, v1.16b
|
||
|
eor v17.16b, v17.16b, v2.16b
|
||
|
eor v18.16b, v18.16b, v3.16b
|
||
|
eor v19.16b, v19.16b, v4.16b
|
||
|
|
||
|
tbl v15.16b, {v15.16b}, v26.16b
|
||
|
tbl v16.16b, {v16.16b}, v26.16b
|
||
|
tbl v17.16b, {v17.16b}, v26.16b
|
||
|
tbl v18.16b, {v18.16b}, v26.16b
|
||
|
tbl v19.16b, {v19.16b}, v26.16b
|
||
|
|
||
|
add v10.4s, v10.4s, v15.4s
|
||
|
add v11.4s, v11.4s, v16.4s
|
||
|
add v12.4s, v12.4s, v17.4s
|
||
|
add v13.4s, v13.4s, v18.4s
|
||
|
add v14.4s, v14.4s, v19.4s
|
||
|
|
||
|
eor v20.16b, v20.16b, v10.16b
|
||
|
eor v5.16b, v5.16b, v11.16b
|
||
|
eor v6.16b, v6.16b, v12.16b
|
||
|
eor v7.16b, v7.16b, v13.16b
|
||
|
eor v8.16b, v8.16b, v14.16b
|
||
|
|
||
|
ushr v9.4s, v8.4s, #25
|
||
|
sli v9.4s, v8.4s, #7
|
||
|
ushr v8.4s, v7.4s, #25
|
||
|
sli v8.4s, v7.4s, #7
|
||
|
ushr v7.4s, v6.4s, #25
|
||
|
sli v7.4s, v6.4s, #7
|
||
|
ushr v6.4s, v5.4s, #25
|
||
|
sli v6.4s, v5.4s, #7
|
||
|
ushr v5.4s, v20.4s, #25
|
||
|
sli v5.4s, v20.4s, #7
|
||
|
|
||
|
ext v9.16b, v9.16b, v9.16b, #4
|
||
|
ext v14.16b, v14.16b, v14.16b, #8
|
||
|
ext v19.16b, v19.16b, v19.16b, #12
|
||
|
add v0.4s, v0.4s, v6.4s
|
||
|
add v1.4s, v1.4s, v7.4s
|
||
|
add v2.4s, v2.4s, v8.4s
|
||
|
add v3.4s, v3.4s, v5.4s
|
||
|
add v4.4s, v4.4s, v9.4s
|
||
|
|
||
|
eor v18.16b, v18.16b, v0.16b
|
||
|
eor v15.16b, v15.16b, v1.16b
|
||
|
eor v16.16b, v16.16b, v2.16b
|
||
|
eor v17.16b, v17.16b, v3.16b
|
||
|
eor v19.16b, v19.16b, v4.16b
|
||
|
|
||
|
rev32 v18.8h, v18.8h
|
||
|
rev32 v15.8h, v15.8h
|
||
|
rev32 v16.8h, v16.8h
|
||
|
rev32 v17.8h, v17.8h
|
||
|
rev32 v19.8h, v19.8h
|
||
|
|
||
|
add v12.4s, v12.4s, v18.4s
|
||
|
add v13.4s, v13.4s, v15.4s
|
||
|
add v10.4s, v10.4s, v16.4s
|
||
|
add v11.4s, v11.4s, v17.4s
|
||
|
add v14.4s, v14.4s, v19.4s
|
||
|
|
||
|
eor v6.16b, v6.16b, v12.16b
|
||
|
eor v7.16b, v7.16b, v13.16b
|
||
|
eor v8.16b, v8.16b, v10.16b
|
||
|
eor v5.16b, v5.16b, v11.16b
|
||
|
eor v9.16b, v9.16b, v14.16b
|
||
|
|
||
|
ushr v20.4s, v6.4s, #20
|
||
|
sli v20.4s, v6.4s, #12
|
||
|
ushr v6.4s, v7.4s, #20
|
||
|
sli v6.4s, v7.4s, #12
|
||
|
ushr v7.4s, v8.4s, #20
|
||
|
sli v7.4s, v8.4s, #12
|
||
|
ushr v8.4s, v5.4s, #20
|
||
|
sli v8.4s, v5.4s, #12
|
||
|
ushr v5.4s, v9.4s, #20
|
||
|
sli v5.4s, v9.4s, #12
|
||
|
|
||
|
add v0.4s, v0.4s, v20.4s
|
||
|
add v1.4s, v1.4s, v6.4s
|
||
|
add v2.4s, v2.4s, v7.4s
|
||
|
add v3.4s, v3.4s, v8.4s
|
||
|
add v4.4s, v4.4s, v5.4s
|
||
|
|
||
|
eor v18.16b, v18.16b, v0.16b
|
||
|
eor v15.16b, v15.16b, v1.16b
|
||
|
eor v16.16b, v16.16b, v2.16b
|
||
|
eor v17.16b, v17.16b, v3.16b
|
||
|
eor v19.16b, v19.16b, v4.16b
|
||
|
|
||
|
tbl v18.16b, {v18.16b}, v26.16b
|
||
|
tbl v15.16b, {v15.16b}, v26.16b
|
||
|
tbl v16.16b, {v16.16b}, v26.16b
|
||
|
tbl v17.16b, {v17.16b}, v26.16b
|
||
|
tbl v19.16b, {v19.16b}, v26.16b
|
||
|
|
||
|
add v12.4s, v12.4s, v18.4s
|
||
|
add v13.4s, v13.4s, v15.4s
|
||
|
add v10.4s, v10.4s, v16.4s
|
||
|
add v11.4s, v11.4s, v17.4s
|
||
|
add v14.4s, v14.4s, v19.4s
|
||
|
|
||
|
eor v20.16b, v20.16b, v12.16b
|
||
|
eor v6.16b, v6.16b, v13.16b
|
||
|
eor v7.16b, v7.16b, v10.16b
|
||
|
eor v8.16b, v8.16b, v11.16b
|
||
|
eor v5.16b, v5.16b, v14.16b
|
||
|
|
||
|
ushr v9.4s, v5.4s, #25
|
||
|
sli v9.4s, v5.4s, #7
|
||
|
ushr v5.4s, v8.4s, #25
|
||
|
sli v5.4s, v8.4s, #7
|
||
|
ushr v8.4s, v7.4s, #25
|
||
|
sli v8.4s, v7.4s, #7
|
||
|
ushr v7.4s, v6.4s, #25
|
||
|
sli v7.4s, v6.4s, #7
|
||
|
ushr v6.4s, v20.4s, #25
|
||
|
sli v6.4s, v20.4s, #7
|
||
|
|
||
|
ext v9.16b, v9.16b, v9.16b, #12
|
||
|
ext v14.16b, v14.16b, v14.16b, #8
|
||
|
ext v19.16b, v19.16b, v19.16b, #4
|
||
|
subs x6, x6, #1
|
||
|
b.hi .Lseal_init_rounds
|
||
|
|
||
|
add v15.4s, v15.4s, v25.4s
|
||
|
mov x11, #4
|
||
|
dup v20.4s, w11
|
||
|
add v25.4s, v25.4s, v20.4s
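// The zip1/zip2 ladder below transposes the four vertically-computed states so
// that v0/v5/v10/v15 through v3/v8/v13/v18 each hold one complete block, which
// after the feed-forward additions that follow is XORed directly against the
// input.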
|
||
|
|
||
|
zip1 v20.4s, v0.4s, v1.4s
|
||
|
zip2 v21.4s, v0.4s, v1.4s
|
||
|
zip1 v22.4s, v2.4s, v3.4s
|
||
|
zip2 v23.4s, v2.4s, v3.4s
|
||
|
|
||
|
zip1 v0.2d, v20.2d, v22.2d
|
||
|
zip2 v1.2d, v20.2d, v22.2d
|
||
|
zip1 v2.2d, v21.2d, v23.2d
|
||
|
zip2 v3.2d, v21.2d, v23.2d
|
||
|
|
||
|
zip1 v20.4s, v5.4s, v6.4s
|
||
|
zip2 v21.4s, v5.4s, v6.4s
|
||
|
zip1 v22.4s, v7.4s, v8.4s
|
||
|
zip2 v23.4s, v7.4s, v8.4s
|
||
|
|
||
|
zip1 v5.2d, v20.2d, v22.2d
|
||
|
zip2 v6.2d, v20.2d, v22.2d
|
||
|
zip1 v7.2d, v21.2d, v23.2d
|
||
|
zip2 v8.2d, v21.2d, v23.2d
|
||
|
|
||
|
zip1 v20.4s, v10.4s, v11.4s
|
||
|
zip2 v21.4s, v10.4s, v11.4s
|
||
|
zip1 v22.4s, v12.4s, v13.4s
|
||
|
zip2 v23.4s, v12.4s, v13.4s
|
||
|
|
||
|
zip1 v10.2d, v20.2d, v22.2d
|
||
|
zip2 v11.2d, v20.2d, v22.2d
|
||
|
zip1 v12.2d, v21.2d, v23.2d
|
||
|
zip2 v13.2d, v21.2d, v23.2d
|
||
|
|
||
|
zip1 v20.4s, v15.4s, v16.4s
|
||
|
zip2 v21.4s, v15.4s, v16.4s
|
||
|
zip1 v22.4s, v17.4s, v18.4s
|
||
|
zip2 v23.4s, v17.4s, v18.4s
|
||
|
|
||
|
zip1 v15.2d, v20.2d, v22.2d
|
||
|
zip2 v16.2d, v20.2d, v22.2d
|
||
|
zip1 v17.2d, v21.2d, v23.2d
|
||
|
zip2 v18.2d, v21.2d, v23.2d
|
||
|
|
||
|
add v4.4s, v4.4s, v24.4s
|
||
|
add v9.4s, v9.4s, v28.4s
|
||
|
and v4.16b, v4.16b, v27.16b
|
||
|
|
||
|
add v0.4s, v0.4s, v24.4s
|
||
|
add v5.4s, v5.4s, v28.4s
|
||
|
add v10.4s, v10.4s, v29.4s
|
||
|
add v15.4s, v15.4s, v30.4s
|
||
|
|
||
|
add v1.4s, v1.4s, v24.4s
|
||
|
add v6.4s, v6.4s, v28.4s
|
||
|
add v11.4s, v11.4s, v29.4s
|
||
|
add v16.4s, v16.4s, v30.4s
|
||
|
|
||
|
add v2.4s, v2.4s, v24.4s
|
||
|
add v7.4s, v7.4s, v28.4s
|
||
|
add v12.4s, v12.4s, v29.4s
|
||
|
add v17.4s, v17.4s, v30.4s
|
||
|
|
||
|
add v3.4s, v3.4s, v24.4s
|
||
|
add v8.4s, v8.4s, v28.4s
|
||
|
add v13.4s, v13.4s, v29.4s
|
||
|
add v18.4s, v18.4s, v30.4s
|
||
|
|
||
|
mov x16, v4.d[0] // Move the R key to GPRs
|
||
|
mov x17, v4.d[1]
|
||
|
mov v27.16b, v9.16b // Store the S key
|
||
|
|
||
|
bl .Lpoly_hash_ad_internal
|
||
|
|
||
|
mov x3, x0
|
||
|
cmp x2, #256
|
||
|
b.le .Lseal_tail
|
||
|
|
||
|
ld1 {v20.16b - v23.16b}, [x1], #64
|
||
|
eor v20.16b, v20.16b, v0.16b
|
||
|
eor v21.16b, v21.16b, v5.16b
|
||
|
eor v22.16b, v22.16b, v10.16b
|
||
|
eor v23.16b, v23.16b, v15.16b
|
||
|
st1 {v20.16b - v23.16b}, [x0], #64
|
||
|
|
||
|
ld1 {v20.16b - v23.16b}, [x1], #64
|
||
|
eor v20.16b, v20.16b, v1.16b
|
||
|
eor v21.16b, v21.16b, v6.16b
|
||
|
eor v22.16b, v22.16b, v11.16b
|
||
|
eor v23.16b, v23.16b, v16.16b
|
||
|
st1 {v20.16b - v23.16b}, [x0], #64
|
||
|
|
||
|
ld1 {v20.16b - v23.16b}, [x1], #64
|
||
|
eor v20.16b, v20.16b, v2.16b
|
||
|
eor v21.16b, v21.16b, v7.16b
|
||
|
eor v22.16b, v22.16b, v12.16b
|
||
|
eor v23.16b, v23.16b, v17.16b
|
||
|
st1 {v20.16b - v23.16b}, [x0], #64
|
||
|
|
||
|
ld1 {v20.16b - v23.16b}, [x1], #64
|
||
|
eor v20.16b, v20.16b, v3.16b
|
||
|
eor v21.16b, v21.16b, v8.16b
|
||
|
eor v22.16b, v22.16b, v13.16b
|
||
|
eor v23.16b, v23.16b, v18.16b
|
||
|
st1 {v20.16b - v23.16b}, [x0], #64
|
||
|
|
||
|
sub x2, x2, #256
|
||
|
|
||
|
mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds
|
||
|
mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256
|
||
|
|
||
|
.Lseal_main_loop:
|
||
|
adrp x11, .Lchacha20_consts
|
||
|
add x11, x11, :lo12:.Lchacha20_consts
|
||
|
|
||
|
ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
|
||
|
mov v4.16b, v24.16b
|
||
|
|
||
|
ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
|
||
|
mov v9.16b, v28.16b
|
||
|
|
||
|
ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
|
||
|
mov v14.16b, v29.16b
|
||
|
|
||
|
ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
|
||
|
add v15.4s, v15.4s, v25.4s
|
||
|
mov v19.16b, v30.16b
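// The next four instructions build the vector {last_inc_lane + 1, 0, 0, 0}
// from the increment vector in v25 and add it to the counter word of the fifth
// (horizontal) block, placing its counter just past those of the four vertical
// blocks.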
|
||
|
|
||
|
eor v20.16b, v20.16b, v20.16b //zero
|
||
|
not v21.16b, v20.16b // -1
|
||
|
sub v21.4s, v25.4s, v21.4s // Add +1
|
||
|
ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
|
||
|
add v19.4s, v19.4s, v20.4s
|
||
|
|
||
|
sub x5, x5, #32
|
||
|
.align 5
|
||
|
.Lseal_main_loop_rounds:
|
||
|
add v0.4s, v0.4s, v5.4s
|
||
|
add v1.4s, v1.4s, v6.4s
|
||
|
add v2.4s, v2.4s, v7.4s
|
||
|
add v3.4s, v3.4s, v8.4s
|
||
|
add v4.4s, v4.4s, v9.4s
|
||
|
|
||
|
eor v15.16b, v15.16b, v0.16b
|
||
|
eor v16.16b, v16.16b, v1.16b
|
||
|
eor v17.16b, v17.16b, v2.16b
|
||
|
eor v18.16b, v18.16b, v3.16b
|
||
|
eor v19.16b, v19.16b, v4.16b
|
||
|
|
||
|
rev32 v15.8h, v15.8h
|
||
|
rev32 v16.8h, v16.8h
|
||
|
rev32 v17.8h, v17.8h
|
||
|
rev32 v18.8h, v18.8h
|
||
|
rev32 v19.8h, v19.8h
|
||
|
|
||
|
add v10.4s, v10.4s, v15.4s
|
||
|
add v11.4s, v11.4s, v16.4s
|
||
|
add v12.4s, v12.4s, v17.4s
|
||
|
add v13.4s, v13.4s, v18.4s
|
||
|
add v14.4s, v14.4s, v19.4s
|
||
|
|
||
|
eor v5.16b, v5.16b, v10.16b
|
||
|
eor v6.16b, v6.16b, v11.16b
|
||
|
eor v7.16b, v7.16b, v12.16b
|
||
|
eor v8.16b, v8.16b, v13.16b
|
||
|
eor v9.16b, v9.16b, v14.16b
|
||
|
|
||
|
ushr v20.4s, v5.4s, #20
|
||
|
sli v20.4s, v5.4s, #12
|
||
|
ushr v5.4s, v6.4s, #20
|
||
|
sli v5.4s, v6.4s, #12
|
||
|
ushr v6.4s, v7.4s, #20
|
||
|
sli v6.4s, v7.4s, #12
|
||
|
ushr v7.4s, v8.4s, #20
|
||
|
sli v7.4s, v8.4s, #12
|
||
|
ushr v8.4s, v9.4s, #20
|
||
|
sli v8.4s, v9.4s, #12
|
||
|
|
||
|
add v0.4s, v0.4s, v20.4s
|
||
|
add v1.4s, v1.4s, v5.4s
|
||
|
add v2.4s, v2.4s, v6.4s
|
||
|
add v3.4s, v3.4s, v7.4s
|
||
|
add v4.4s, v4.4s, v8.4s
|
||
|
|
||
|
eor v15.16b, v15.16b, v0.16b
|
||
|
eor v16.16b, v16.16b, v1.16b
|
||
|
eor v17.16b, v17.16b, v2.16b
|
||
|
eor v18.16b, v18.16b, v3.16b
|
||
|
eor v19.16b, v19.16b, v4.16b
|
||
|
|
||
|
tbl v15.16b, {v15.16b}, v26.16b
|
||
|
tbl v16.16b, {v16.16b}, v26.16b
|
||
|
tbl v17.16b, {v17.16b}, v26.16b
|
||
|
tbl v18.16b, {v18.16b}, v26.16b
|
||
|
tbl v19.16b, {v19.16b}, v26.16b
|
||
|
|
||
|
add v10.4s, v10.4s, v15.4s
|
||
|
add v11.4s, v11.4s, v16.4s
|
||
|
add v12.4s, v12.4s, v17.4s
|
||
|
add v13.4s, v13.4s, v18.4s
|
||
|
add v14.4s, v14.4s, v19.4s
|
||
|
|
||
|
eor v20.16b, v20.16b, v10.16b
|
||
|
eor v5.16b, v5.16b, v11.16b
|
||
|
eor v6.16b, v6.16b, v12.16b
|
||
|
eor v7.16b, v7.16b, v13.16b
|
||
|
eor v8.16b, v8.16b, v14.16b
|
||
|
|
||
|
ushr v9.4s, v8.4s, #25
|
||
|
sli v9.4s, v8.4s, #7
|
||
|
ushr v8.4s, v7.4s, #25
|
||
|
sli v8.4s, v7.4s, #7
|
||
|
ushr v7.4s, v6.4s, #25
|
||
|
sli v7.4s, v6.4s, #7
|
||
|
ushr v6.4s, v5.4s, #25
|
||
|
sli v6.4s, v5.4s, #7
|
||
|
ushr v5.4s, v20.4s, #25
|
||
|
sli v5.4s, v20.4s, #7
|
||
|
|
||
|
ext v9.16b, v9.16b, v9.16b, #4
|
||
|
ext v14.16b, v14.16b, v14.16b, #8
|
||
|
ext v19.16b, v19.16b, v19.16b, #12
|
||
|
ldp x11, x12, [x3], 16
|
||
|
adds x8, x8, x11
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, x15
|
||
|
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
|
||
|
umulh x12, x8, x16
|
||
|
mul x13, x9, x16
|
||
|
umulh x14, x9, x16
|
||
|
adds x12, x12, x13
|
||
|
mul x13, x10, x16
|
||
|
adc x13, x13, x14
|
||
|
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
|
||
|
umulh x8, x8, x17
|
||
|
adds x12, x12, x14
|
||
|
mul x14, x9, x17
|
||
|
umulh x9, x9, x17
|
||
|
adcs x14, x14, x8
|
||
|
mul x10, x10, x17
|
||
|
adc x10, x10, x9
|
||
|
adds x13, x13, x14
|
||
|
adc x14, x10, xzr
|
||
|
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
|
||
|
and x8, x13, #-4
|
||
|
extr x13, x14, x13, #2
|
||
|
adds x8, x8, x11
|
||
|
lsr x11, x14, #2
|
||
|
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
|
||
|
adds x8, x8, x13
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, xzr // At this point acc2 has the value of 4 at most
|
||
|
add v0.4s, v0.4s, v6.4s
|
||
|
add v1.4s, v1.4s, v7.4s
|
||
|
add v2.4s, v2.4s, v8.4s
|
||
|
add v3.4s, v3.4s, v5.4s
|
||
|
add v4.4s, v4.4s, v9.4s
|
||
|
|
||
|
eor v18.16b, v18.16b, v0.16b
|
||
|
eor v15.16b, v15.16b, v1.16b
|
||
|
eor v16.16b, v16.16b, v2.16b
|
||
|
eor v17.16b, v17.16b, v3.16b
|
||
|
eor v19.16b, v19.16b, v4.16b
|
||
|
|
||
|
rev32 v18.8h, v18.8h
|
||
|
rev32 v15.8h, v15.8h
|
||
|
rev32 v16.8h, v16.8h
|
||
|
rev32 v17.8h, v17.8h
|
||
|
rev32 v19.8h, v19.8h
|
||
|
|
||
|
add v12.4s, v12.4s, v18.4s
|
||
|
add v13.4s, v13.4s, v15.4s
|
||
|
add v10.4s, v10.4s, v16.4s
|
||
|
add v11.4s, v11.4s, v17.4s
|
||
|
add v14.4s, v14.4s, v19.4s
|
||
|
|
||
|
eor v6.16b, v6.16b, v12.16b
|
||
|
eor v7.16b, v7.16b, v13.16b
|
||
|
eor v8.16b, v8.16b, v10.16b
|
||
|
eor v5.16b, v5.16b, v11.16b
|
||
|
eor v9.16b, v9.16b, v14.16b
|
||
|
|
||
|
ushr v20.4s, v6.4s, #20
|
||
|
sli v20.4s, v6.4s, #12
|
||
|
ushr v6.4s, v7.4s, #20
|
||
|
sli v6.4s, v7.4s, #12
|
||
|
ushr v7.4s, v8.4s, #20
|
||
|
sli v7.4s, v8.4s, #12
|
||
|
ushr v8.4s, v5.4s, #20
|
||
|
sli v8.4s, v5.4s, #12
|
||
|
ushr v5.4s, v9.4s, #20
|
||
|
sli v5.4s, v9.4s, #12
|
||
|
|
||
|
add v0.4s, v0.4s, v20.4s
|
||
|
add v1.4s, v1.4s, v6.4s
|
||
|
add v2.4s, v2.4s, v7.4s
|
||
|
add v3.4s, v3.4s, v8.4s
|
||
|
add v4.4s, v4.4s, v5.4s
|
||
|
|
||
|
eor v18.16b, v18.16b, v0.16b
|
||
|
eor v15.16b, v15.16b, v1.16b
|
||
|
eor v16.16b, v16.16b, v2.16b
|
||
|
eor v17.16b, v17.16b, v3.16b
|
||
|
eor v19.16b, v19.16b, v4.16b
|
||
|
|
||
|
tbl v18.16b, {v18.16b}, v26.16b
|
||
|
tbl v15.16b, {v15.16b}, v26.16b
|
||
|
tbl v16.16b, {v16.16b}, v26.16b
|
||
|
tbl v17.16b, {v17.16b}, v26.16b
|
||
|
tbl v19.16b, {v19.16b}, v26.16b
|
||
|
|
||
|
add v12.4s, v12.4s, v18.4s
|
||
|
add v13.4s, v13.4s, v15.4s
|
||
|
add v10.4s, v10.4s, v16.4s
|
||
|
add v11.4s, v11.4s, v17.4s
|
||
|
add v14.4s, v14.4s, v19.4s
|
||
|
|
||
|
eor v20.16b, v20.16b, v12.16b
|
||
|
eor v6.16b, v6.16b, v13.16b
|
||
|
eor v7.16b, v7.16b, v10.16b
|
||
|
eor v8.16b, v8.16b, v11.16b
|
||
|
eor v5.16b, v5.16b, v14.16b
|
||
|
|
||
|
ushr v9.4s, v5.4s, #25
|
||
|
sli v9.4s, v5.4s, #7
|
||
|
ushr v5.4s, v8.4s, #25
|
||
|
sli v5.4s, v8.4s, #7
|
||
|
ushr v8.4s, v7.4s, #25
|
||
|
sli v8.4s, v7.4s, #7
|
||
|
ushr v7.4s, v6.4s, #25
|
||
|
sli v7.4s, v6.4s, #7
|
||
|
ushr v6.4s, v20.4s, #25
|
||
|
sli v6.4s, v20.4s, #7
|
||
|
|
||
|
ext v9.16b, v9.16b, v9.16b, #12
|
||
|
ext v14.16b, v14.16b, v14.16b, #8
|
||
|
ext v19.16b, v19.16b, v19.16b, #4
|
||
|
subs x6, x6, #1
|
||
|
b.ge .Lseal_main_loop_rounds
|
||
|
ldp x11, x12, [x3], 16
|
||
|
adds x8, x8, x11
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, x15
|
||
|
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
|
||
|
umulh x12, x8, x16
|
||
|
mul x13, x9, x16
|
||
|
umulh x14, x9, x16
|
||
|
adds x12, x12, x13
|
||
|
mul x13, x10, x16
|
||
|
adc x13, x13, x14
|
||
|
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
|
||
|
umulh x8, x8, x17
|
||
|
adds x12, x12, x14
|
||
|
mul x14, x9, x17
|
||
|
umulh x9, x9, x17
|
||
|
adcs x14, x14, x8
|
||
|
mul x10, x10, x17
|
||
|
adc x10, x10, x9
|
||
|
adds x13, x13, x14
|
||
|
adc x14, x10, xzr
|
||
|
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
|
||
|
and x8, x13, #-4
|
||
|
extr x13, x14, x13, #2
|
||
|
adds x8, x8, x11
|
||
|
lsr x11, x14, #2
|
||
|
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
|
||
|
adds x8, x8, x13
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, xzr // At this point acc2 has the value of 4 at most
|
||
|
subs x7, x7, #1
|
||
|
b.gt .Lseal_main_loop_rounds
|
||
|
|
||
|
eor v20.16b, v20.16b, v20.16b //zero
|
||
|
not v21.16b, v20.16b // -1
|
||
|
sub v21.4s, v25.4s, v21.4s // Add +1
|
||
|
ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
|
||
|
add v19.4s, v19.4s, v20.4s
|
||
|
|
||
|
add v15.4s, v15.4s, v25.4s
|
||
|
mov x11, #5
|
||
|
dup v20.4s, w11
|
||
|
add v25.4s, v25.4s, v20.4s
|
||
|
|
||
|
zip1 v20.4s, v0.4s, v1.4s
|
||
|
zip2 v21.4s, v0.4s, v1.4s
|
||
|
zip1 v22.4s, v2.4s, v3.4s
|
||
|
zip2 v23.4s, v2.4s, v3.4s
|
||
|
|
||
|
zip1 v0.2d, v20.2d, v22.2d
|
||
|
zip2 v1.2d, v20.2d, v22.2d
|
||
|
zip1 v2.2d, v21.2d, v23.2d
|
||
|
zip2 v3.2d, v21.2d, v23.2d
|
||
|
|
||
|
zip1 v20.4s, v5.4s, v6.4s
|
||
|
zip2 v21.4s, v5.4s, v6.4s
|
||
|
zip1 v22.4s, v7.4s, v8.4s
|
||
|
zip2 v23.4s, v7.4s, v8.4s
|
||
|
|
||
|
zip1 v5.2d, v20.2d, v22.2d
|
||
|
zip2 v6.2d, v20.2d, v22.2d
|
||
|
zip1 v7.2d, v21.2d, v23.2d
|
||
|
zip2 v8.2d, v21.2d, v23.2d
|
||
|
|
||
|
zip1 v20.4s, v10.4s, v11.4s
|
||
|
zip2 v21.4s, v10.4s, v11.4s
|
||
|
zip1 v22.4s, v12.4s, v13.4s
|
||
|
zip2 v23.4s, v12.4s, v13.4s
|
||
|
|
||
|
zip1 v10.2d, v20.2d, v22.2d
|
||
|
zip2 v11.2d, v20.2d, v22.2d
|
||
|
zip1 v12.2d, v21.2d, v23.2d
|
||
|
zip2 v13.2d, v21.2d, v23.2d
|
||
|
|
||
|
zip1 v20.4s, v15.4s, v16.4s
|
||
|
zip2 v21.4s, v15.4s, v16.4s
|
||
|
zip1 v22.4s, v17.4s, v18.4s
|
||
|
zip2 v23.4s, v17.4s, v18.4s
|
||
|
|
||
|
zip1 v15.2d, v20.2d, v22.2d
|
||
|
zip2 v16.2d, v20.2d, v22.2d
|
||
|
zip1 v17.2d, v21.2d, v23.2d
|
||
|
zip2 v18.2d, v21.2d, v23.2d
|
||
|
|
||
|
add v0.4s, v0.4s, v24.4s
|
||
|
add v5.4s, v5.4s, v28.4s
|
||
|
add v10.4s, v10.4s, v29.4s
|
||
|
add v15.4s, v15.4s, v30.4s
|
||
|
|
||
|
add v1.4s, v1.4s, v24.4s
|
||
|
add v6.4s, v6.4s, v28.4s
|
||
|
add v11.4s, v11.4s, v29.4s
|
||
|
add v16.4s, v16.4s, v30.4s
|
||
|
|
||
|
add v2.4s, v2.4s, v24.4s
|
||
|
add v7.4s, v7.4s, v28.4s
|
||
|
add v12.4s, v12.4s, v29.4s
|
||
|
add v17.4s, v17.4s, v30.4s
|
||
|
|
||
|
add v3.4s, v3.4s, v24.4s
|
||
|
add v8.4s, v8.4s, v28.4s
|
||
|
add v13.4s, v13.4s, v29.4s
|
||
|
add v18.4s, v18.4s, v30.4s
|
||
|
|
||
|
add v4.4s, v4.4s, v24.4s
|
||
|
add v9.4s, v9.4s, v28.4s
|
||
|
add v14.4s, v14.4s, v29.4s
|
||
|
add v19.4s, v19.4s, v30.4s
|
||
|
|
||
|
cmp x2, #320
|
||
|
b.le .Lseal_tail
|
||
|
|
||
|
ld1 {v20.16b - v23.16b}, [x1], #64
|
||
|
eor v20.16b, v20.16b, v0.16b
|
||
|
eor v21.16b, v21.16b, v5.16b
|
||
|
eor v22.16b, v22.16b, v10.16b
|
||
|
eor v23.16b, v23.16b, v15.16b
|
||
|
st1 {v20.16b - v23.16b}, [x0], #64
|
||
|
|
||
|
ld1 {v20.16b - v23.16b}, [x1], #64
|
||
|
eor v20.16b, v20.16b, v1.16b
|
||
|
eor v21.16b, v21.16b, v6.16b
|
||
|
eor v22.16b, v22.16b, v11.16b
|
||
|
eor v23.16b, v23.16b, v16.16b
|
||
|
st1 {v20.16b - v23.16b}, [x0], #64
|
||
|
|
||
|
ld1 {v20.16b - v23.16b}, [x1], #64
|
||
|
eor v20.16b, v20.16b, v2.16b
|
||
|
eor v21.16b, v21.16b, v7.16b
|
||
|
eor v22.16b, v22.16b, v12.16b
|
||
|
eor v23.16b, v23.16b, v17.16b
|
||
|
st1 {v20.16b - v23.16b}, [x0], #64
|
||
|
|
||
|
ld1 {v20.16b - v23.16b}, [x1], #64
|
||
|
eor v20.16b, v20.16b, v3.16b
|
||
|
eor v21.16b, v21.16b, v8.16b
|
||
|
eor v22.16b, v22.16b, v13.16b
|
||
|
eor v23.16b, v23.16b, v18.16b
|
||
|
st1 {v20.16b - v23.16b}, [x0], #64
|
||
|
|
||
|
ld1 {v20.16b - v23.16b}, [x1], #64
|
||
|
eor v20.16b, v20.16b, v4.16b
|
||
|
eor v21.16b, v21.16b, v9.16b
|
||
|
eor v22.16b, v22.16b, v14.16b
|
||
|
eor v23.16b, v23.16b, v19.16b
|
||
|
st1 {v20.16b - v23.16b}, [x0], #64
|
||
|
|
||
|
sub x2, x2, #320
|
||
|
|
||
|
mov x6, #0
|
||
|
mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration
|
||
|
|
||
|
b .Lseal_main_loop
|
||
|
|
||
|
.Lseal_tail:
|
||
|
// This part of the function handles the storage and authentication of the last [0,320) bytes
|
||
|
// We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data.
|
||
|
cmp x2, #64
|
||
|
b.lt .Lseal_tail_64
|
||
|
|
||
|
// Store and authenticate 64B blocks per iteration
|
||
|
ld1 {v20.16b - v23.16b}, [x1], #64
|
||
|
|
||
|
eor v20.16b, v20.16b, v0.16b
|
||
|
eor v21.16b, v21.16b, v5.16b
|
||
|
eor v22.16b, v22.16b, v10.16b
|
||
|
eor v23.16b, v23.16b, v15.16b
|
||
|
mov x11, v20.d[0]
|
||
|
mov x12, v20.d[1]
|
||
|
adds x8, x8, x11
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, x15
|
||
|
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
|
||
|
umulh x12, x8, x16
|
||
|
mul x13, x9, x16
|
||
|
umulh x14, x9, x16
|
||
|
adds x12, x12, x13
|
||
|
mul x13, x10, x16
|
||
|
adc x13, x13, x14
|
||
|
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
|
||
|
umulh x8, x8, x17
|
||
|
adds x12, x12, x14
|
||
|
mul x14, x9, x17
|
||
|
umulh x9, x9, x17
|
||
|
adcs x14, x14, x8
|
||
|
mul x10, x10, x17
|
||
|
adc x10, x10, x9
|
||
|
adds x13, x13, x14
|
||
|
adc x14, x10, xzr
|
||
|
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
|
||
|
and x8, x13, #-4
|
||
|
extr x13, x14, x13, #2
|
||
|
adds x8, x8, x11
|
||
|
lsr x11, x14, #2
|
||
|
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
|
||
|
adds x8, x8, x13
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, xzr // At this point acc2 has the value of 4 at most
|
||
|
mov x11, v21.d[0]
|
||
|
mov x12, v21.d[1]
|
||
|
adds x8, x8, x11
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, x15
|
||
|
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
|
||
|
umulh x12, x8, x16
|
||
|
mul x13, x9, x16
|
||
|
umulh x14, x9, x16
|
||
|
adds x12, x12, x13
|
||
|
mul x13, x10, x16
|
||
|
adc x13, x13, x14
|
||
|
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
|
||
|
umulh x8, x8, x17
|
||
|
adds x12, x12, x14
|
||
|
mul x14, x9, x17
|
||
|
umulh x9, x9, x17
|
||
|
adcs x14, x14, x8
|
||
|
mul x10, x10, x17
|
||
|
adc x10, x10, x9
|
||
|
adds x13, x13, x14
|
||
|
adc x14, x10, xzr
|
||
|
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
|
||
|
and x8, x13, #-4
|
||
|
extr x13, x14, x13, #2
|
||
|
adds x8, x8, x11
|
||
|
lsr x11, x14, #2
|
||
|
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
|
||
|
adds x8, x8, x13
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, xzr // At this point acc2 has the value of 4 at most
|
||
|
mov x11, v22.d[0]
|
||
|
mov x12, v22.d[1]
|
||
|
adds x8, x8, x11
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, x15
|
||
|
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
|
||
|
umulh x12, x8, x16
|
||
|
mul x13, x9, x16
|
||
|
umulh x14, x9, x16
|
||
|
adds x12, x12, x13
|
||
|
mul x13, x10, x16
|
||
|
adc x13, x13, x14
|
||
|
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
|
||
|
umulh x8, x8, x17
|
||
|
adds x12, x12, x14
|
||
|
mul x14, x9, x17
|
||
|
umulh x9, x9, x17
|
||
|
adcs x14, x14, x8
|
||
|
mul x10, x10, x17
|
||
|
adc x10, x10, x9
|
||
|
adds x13, x13, x14
|
||
|
adc x14, x10, xzr
|
||
|
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
|
||
|
and x8, x13, #-4
|
||
|
extr x13, x14, x13, #2
|
||
|
adds x8, x8, x11
|
||
|
lsr x11, x14, #2
|
||
|
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
|
||
|
adds x8, x8, x13
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, xzr // At this point acc2 has the value of 4 at most
|
||
|
mov x11, v23.d[0]
|
||
|
mov x12, v23.d[1]
|
||
|
adds x8, x8, x11
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, x15
|
||
|
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
|
||
|
umulh x12, x8, x16
|
||
|
mul x13, x9, x16
|
||
|
umulh x14, x9, x16
|
||
|
adds x12, x12, x13
|
||
|
mul x13, x10, x16
|
||
|
adc x13, x13, x14
|
||
|
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
|
||
|
umulh x8, x8, x17
|
||
|
adds x12, x12, x14
|
||
|
mul x14, x9, x17
|
||
|
umulh x9, x9, x17
|
||
|
adcs x14, x14, x8
|
||
|
mul x10, x10, x17
|
||
|
adc x10, x10, x9
|
||
|
adds x13, x13, x14
|
||
|
adc x14, x10, xzr
|
||
|
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
|
||
|
and x8, x13, #-4
|
||
|
extr x13, x14, x13, #2
|
||
|
adds x8, x8, x11
|
||
|
lsr x11, x14, #2
|
||
|
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
|
||
|
adds x8, x8, x13
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, xzr // At this point acc2 has the value of 4 at most
|
||
|
st1 {v20.16b - v23.16b}, [x0], #64
|
||
|
sub x2, x2, #64
|
||
|
|
||
|
// Shift the state left by 64 bytes for the next iteration of the loop
|
||
|
mov v0.16b, v1.16b
|
||
|
mov v5.16b, v6.16b
|
||
|
mov v10.16b, v11.16b
|
||
|
mov v15.16b, v16.16b
|
||
|
|
||
|
mov v1.16b, v2.16b
|
||
|
mov v6.16b, v7.16b
|
||
|
mov v11.16b, v12.16b
|
||
|
mov v16.16b, v17.16b
|
||
|
|
||
|
mov v2.16b, v3.16b
|
||
|
mov v7.16b, v8.16b
|
||
|
mov v12.16b, v13.16b
|
||
|
mov v17.16b, v18.16b
|
||
|
|
||
|
mov v3.16b, v4.16b
|
||
|
mov v8.16b, v9.16b
|
||
|
mov v13.16b, v14.16b
|
||
|
mov v18.16b, v19.16b
|
||
|
|
||
|
b .Lseal_tail
|
||
|
|
||
|
.Lseal_tail_64:
|
||
|
ldp x3, x4, [x5, #48] // extra_in_len and extra_in_ptr
|
||
|
|
||
|
// Here we handle the last [0,64) bytes of plaintext
|
||
|
cmp x2, #16
|
||
|
b.lt .Lseal_tail_16
|
||
|
// Each iteration encrypts and authenticates a 16B block
|
||
|
ld1 {v20.16b}, [x1], #16
|
||
|
eor v20.16b, v20.16b, v0.16b
|
||
|
mov x11, v20.d[0]
|
||
|
mov x12, v20.d[1]
|
||
|
adds x8, x8, x11
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, x15
|
||
|
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
|
||
|
umulh x12, x8, x16
|
||
|
mul x13, x9, x16
|
||
|
umulh x14, x9, x16
|
||
|
adds x12, x12, x13
|
||
|
mul x13, x10, x16
|
||
|
adc x13, x13, x14
|
||
|
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
|
||
|
umulh x8, x8, x17
|
||
|
adds x12, x12, x14
|
||
|
mul x14, x9, x17
|
||
|
umulh x9, x9, x17
|
||
|
adcs x14, x14, x8
|
||
|
mul x10, x10, x17
|
||
|
adc x10, x10, x9
|
||
|
adds x13, x13, x14
|
||
|
adc x14, x10, xzr
|
||
|
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
|
||
|
and x8, x13, #-4
|
||
|
extr x13, x14, x13, #2
|
||
|
adds x8, x8, x11
|
||
|
lsr x11, x14, #2
|
||
|
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
|
||
|
adds x8, x8, x13
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, xzr // At this point acc2 has the value of 4 at most
|
||
|
st1 {v20.16b}, [x0], #16
|
||
|
|
||
|
sub x2, x2, #16
|
||
|
|
||
|
// Shift the state left by 16 bytes for the next iteration of the loop
|
||
|
mov v0.16b, v5.16b
|
||
|
mov v5.16b, v10.16b
|
||
|
mov v10.16b, v15.16b
|
||
|
|
||
|
b .Lseal_tail_64
|
||
|
|
||
|
.Lseal_tail_16:
|
||
|
// Here we handle the last [0,16) bytes of ciphertext that require a padded block
|
||
|
cbz x2, .Lseal_hash_extra
|
||
|
|
||
|
eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in
|
||
|
eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes
|
||
|
not v22.16b, v20.16b
|
||
|
|
||
|
mov x6, x2
|
||
|
add x1, x1, x2
|
||
|
|
||
|
cbz x4, .Lseal_tail_16_compose // No extra data to pad with, zero padding
|
||
|
|
||
|
mov x7, #16 // We need to load some extra_in first for padding
|
||
|
sub x7, x7, x2
|
||
|
cmp x4, x7
|
||
|
csel x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register
|
||
|
mov x12, x7
|
||
|
add x3, x3, x7
|
||
|
sub x4, x4, x7
|
||
|
|
||
|
.Lseal_tail16_compose_extra_in:
|
||
|
ext v20.16b, v20.16b, v20.16b, #15
|
||
|
ldrb w11, [x3, #-1]!
|
||
|
mov v20.b[0], w11
|
||
|
subs x7, x7, #1
|
||
|
b.gt .Lseal_tail16_compose_extra_in
|
||
|
|
||
|
add x3, x3, x12
|
||
|
|
||
|
.Lseal_tail_16_compose:
|
||
|
ext v20.16b, v20.16b, v20.16b, #15
|
||
|
ldrb w11, [x1, #-1]!
|
||
|
mov v20.b[0], w11
|
||
|
ext v21.16b, v22.16b, v21.16b, #15
|
||
|
subs x2, x2, #1
|
||
|
b.gt .Lseal_tail_16_compose
|
||
|
|
||
|
and v0.16b, v0.16b, v21.16b
|
||
|
eor v20.16b, v20.16b, v0.16b
|
||
|
mov v21.16b, v20.16b
|
||
|
|
||
|
.Lseal_tail_16_store:
|
||
|
umov w11, v20.b[0]
|
||
|
strb w11, [x0], #1
|
||
|
ext v20.16b, v20.16b, v20.16b, #1
|
||
|
subs x6, x6, #1
|
||
|
b.gt .Lseal_tail_16_store
|
||
|
|
||
|
// Hash in the final ct block concatenated with extra_in
|
||
|
mov x11, v21.d[0]
|
||
|
mov x12, v21.d[1]
|
||
|
adds x8, x8, x11
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, x15
|
||
|
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
|
||
|
umulh x12, x8, x16
|
||
|
mul x13, x9, x16
|
||
|
umulh x14, x9, x16
|
||
|
adds x12, x12, x13
|
||
|
mul x13, x10, x16
|
||
|
adc x13, x13, x14
|
||
|
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
|
||
|
umulh x8, x8, x17
|
||
|
adds x12, x12, x14
|
||
|
mul x14, x9, x17
|
||
|
umulh x9, x9, x17
|
||
|
adcs x14, x14, x8
|
||
|
mul x10, x10, x17
|
||
|
adc x10, x10, x9
|
||
|
adds x13, x13, x14
|
||
|
adc x14, x10, xzr
|
||
|
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
|
||
|
and x8, x13, #-4
|
||
|
extr x13, x14, x13, #2
|
||
|
adds x8, x8, x11
|
||
|
lsr x11, x14, #2
|
||
|
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
|
||
|
adds x8, x8, x13
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, xzr // At this point acc2 has the value of 4 at most
|
||
|
|
||
|
.Lseal_hash_extra:
|
||
|
cbz x4, .Lseal_finalize
|
||
|
|
||
|
.Lseal_hash_extra_loop:
|
||
|
cmp x4, #16
|
||
|
b.lt .Lseal_hash_extra_tail
|
||
|
ld1 {v20.16b}, [x3], #16
|
||
|
mov x11, v20.d[0]
|
||
|
mov x12, v20.d[1]
|
||
|
adds x8, x8, x11
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, x15
|
||
|
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
|
||
|
umulh x12, x8, x16
|
||
|
mul x13, x9, x16
|
||
|
umulh x14, x9, x16
|
||
|
adds x12, x12, x13
|
||
|
mul x13, x10, x16
|
||
|
adc x13, x13, x14
|
||
|
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
|
||
|
umulh x8, x8, x17
|
||
|
adds x12, x12, x14
|
||
|
mul x14, x9, x17
|
||
|
umulh x9, x9, x17
|
||
|
adcs x14, x14, x8
|
||
|
mul x10, x10, x17
|
||
|
adc x10, x10, x9
|
||
|
adds x13, x13, x14
|
||
|
adc x14, x10, xzr
|
||
|
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
|
||
|
and x8, x13, #-4
|
||
|
extr x13, x14, x13, #2
|
||
|
adds x8, x8, x11
|
||
|
lsr x11, x14, #2
|
||
|
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
|
||
|
adds x8, x8, x13
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, xzr // At this point acc2 has the value of 4 at most
|
||
|
sub x4, x4, #16
|
||
|
b .Lseal_hash_extra_loop
|
||
|
|
||
|
.Lseal_hash_extra_tail:
|
||
|
cbz x4, .Lseal_finalize
|
||
|
eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext
|
||
|
add x3, x3, x4
|
||
|
|
||
|
.Lseal_hash_extra_load:
|
||
|
ext v20.16b, v20.16b, v20.16b, #15
|
||
|
ldrb w11, [x3, #-1]!
|
||
|
mov v20.b[0], w11
|
||
|
subs x4, x4, #1
|
||
|
b.gt .Lseal_hash_extra_load
|
||
|
|
||
|
// Hash in the final padded extra_in block
|
||
|
mov x11, v20.d[0]
|
||
|
mov x12, v20.d[1]
|
||
|
adds x8, x8, x11
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, x15
|
||
|
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
|
||
|
umulh x12, x8, x16
|
||
|
mul x13, x9, x16
|
||
|
umulh x14, x9, x16
|
||
|
adds x12, x12, x13
|
||
|
mul x13, x10, x16
|
||
|
adc x13, x13, x14
|
||
|
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
|
||
|
umulh x8, x8, x17
|
||
|
adds x12, x12, x14
|
||
|
mul x14, x9, x17
|
||
|
umulh x9, x9, x17
|
||
|
adcs x14, x14, x8
|
||
|
mul x10, x10, x17
|
||
|
adc x10, x10, x9
|
||
|
adds x13, x13, x14
|
||
|
adc x14, x10, xzr
|
||
|
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
|
||
|
and x8, x13, #-4
|
||
|
extr x13, x14, x13, #2
|
||
|
adds x8, x8, x11
|
||
|
lsr x11, x14, #2
|
||
|
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
|
||
|
adds x8, x8, x13
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, xzr // At this point acc2 has the value of 4 at most
|
||
|
|
||
|
.Lseal_finalize:
|
||
|
mov x11, v31.d[0]
|
||
|
mov x12, v31.d[1]
|
||
|
adds x8, x8, x11
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, x15
|
||
|
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
|
||
|
umulh x12, x8, x16
|
||
|
mul x13, x9, x16
|
||
|
umulh x14, x9, x16
|
||
|
adds x12, x12, x13
|
||
|
mul x13, x10, x16
|
||
|
adc x13, x13, x14
|
||
|
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
|
||
|
umulh x8, x8, x17
|
||
|
adds x12, x12, x14
|
||
|
mul x14, x9, x17
|
||
|
umulh x9, x9, x17
|
||
|
adcs x14, x14, x8
|
||
|
mul x10, x10, x17
|
||
|
adc x10, x10, x9
|
||
|
adds x13, x13, x14
|
||
|
adc x14, x10, xzr
|
||
|
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
|
||
|
and x8, x13, #-4
|
||
|
extr x13, x14, x13, #2
|
||
|
adds x8, x8, x11
|
||
|
lsr x11, x14, #2
|
||
|
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
|
||
|
adds x8, x8, x13
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, xzr // At this point acc2 has the value of 4 at most
|
||
|
// Final reduction step
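// acc + 5 - 2^130 is formed with subs/sbcs; if that does not borrow, the
// accumulator was >= p and the reduced value is kept (csel ... cs). The s half
// of the key (saved in v27) is then added and the tag is stored through x5.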
|
||
|
sub x12, xzr, x15
|
||
|
orr x13, xzr, #3
|
||
|
subs x11, x8, #-5
|
||
|
sbcs x12, x9, x12
|
||
|
sbcs x13, x10, x13
|
||
|
csel x8, x11, x8, cs
|
||
|
csel x9, x12, x9, cs
|
||
|
csel x10, x13, x10, cs
|
||
|
mov x11, v27.d[0]
|
||
|
mov x12, v27.d[1]
|
||
|
adds x8, x8, x11
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, x15
|
||
|
|
||
|
stp x8, x9, [x5]
|
||
|
|
||
|
ldp d8, d9, [sp, #16]
|
||
|
ldp d10, d11, [sp, #32]
|
||
|
ldp d12, d13, [sp, #48]
|
||
|
ldp d14, d15, [sp, #64]
|
||
|
.cfi_restore b15
|
||
|
.cfi_restore b14
|
||
|
.cfi_restore b13
|
||
|
.cfi_restore b12
|
||
|
.cfi_restore b11
|
||
|
.cfi_restore b10
|
||
|
.cfi_restore b9
|
||
|
.cfi_restore b8
|
||
|
ldp x29, x30, [sp], 80
|
||
|
.cfi_restore w29
|
||
|
.cfi_restore w30
|
||
|
.cfi_def_cfa_offset 0
|
||
|
AARCH64_VALIDATE_LINK_REGISTER
|
||
|
ret
|
||
|
|
||
|
.Lseal_128:
|
||
|
// On some architectures preparing 5 blocks for small buffers is wasteful
|
||
|
eor v25.16b, v25.16b, v25.16b
|
||
|
mov x11, #1
|
||
|
mov v25.s[0], w11
|
||
|
mov v0.16b, v24.16b
|
||
|
mov v1.16b, v24.16b
|
||
|
mov v2.16b, v24.16b
|
||
|
mov v5.16b, v28.16b
|
||
|
mov v6.16b, v28.16b
|
||
|
mov v7.16b, v28.16b
|
||
|
mov v10.16b, v29.16b
|
||
|
mov v11.16b, v29.16b
|
||
|
mov v12.16b, v29.16b
|
||
|
mov v17.16b, v30.16b
|
||
|
add v15.4s, v17.4s, v25.4s
|
||
|
add v16.4s, v15.4s, v25.4s
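// Three blocks in total: counters 1 and 2 (v15/v16) cover up to 128 bytes of
// plaintext, and the counter-0 block supplies the Poly1305 r and s keys below.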
|
||
|
|
||
|
mov x6, #10
|
||
|
|
||
|
.Lseal_128_rounds:
|
||
|
add v0.4s, v0.4s, v5.4s
|
||
|
add v1.4s, v1.4s, v6.4s
|
||
|
add v2.4s, v2.4s, v7.4s
|
||
|
eor v15.16b, v15.16b, v0.16b
|
||
|
eor v16.16b, v16.16b, v1.16b
|
||
|
eor v17.16b, v17.16b, v2.16b
|
||
|
rev32 v15.8h, v15.8h
|
||
|
rev32 v16.8h, v16.8h
|
||
|
rev32 v17.8h, v17.8h
|
||
|
|
||
|
add v10.4s, v10.4s, v15.4s
|
||
|
add v11.4s, v11.4s, v16.4s
|
||
|
add v12.4s, v12.4s, v17.4s
|
||
|
eor v5.16b, v5.16b, v10.16b
|
||
|
eor v6.16b, v6.16b, v11.16b
|
||
|
eor v7.16b, v7.16b, v12.16b
|
||
|
ushr v20.4s, v5.4s, #20
|
||
|
sli v20.4s, v5.4s, #12
|
||
|
ushr v5.4s, v6.4s, #20
|
||
|
sli v5.4s, v6.4s, #12
|
||
|
ushr v6.4s, v7.4s, #20
|
||
|
sli v6.4s, v7.4s, #12
|
||
|
|
||
|
add v0.4s, v0.4s, v20.4s
|
||
|
add v1.4s, v1.4s, v5.4s
|
||
|
add v2.4s, v2.4s, v6.4s
|
||
|
eor v15.16b, v15.16b, v0.16b
|
||
|
eor v16.16b, v16.16b, v1.16b
|
||
|
eor v17.16b, v17.16b, v2.16b
|
||
|
tbl v15.16b, {v15.16b}, v26.16b
|
||
|
tbl v16.16b, {v16.16b}, v26.16b
|
||
|
tbl v17.16b, {v17.16b}, v26.16b
|
||
|
|
||
|
add v10.4s, v10.4s, v15.4s
|
||
|
add v11.4s, v11.4s, v16.4s
|
||
|
add v12.4s, v12.4s, v17.4s
|
||
|
eor v20.16b, v20.16b, v10.16b
|
||
|
eor v5.16b, v5.16b, v11.16b
|
||
|
eor v6.16b, v6.16b, v12.16b
|
||
|
ushr v7.4s, v6.4s, #25
|
||
|
sli v7.4s, v6.4s, #7
|
||
|
ushr v6.4s, v5.4s, #25
|
||
|
sli v6.4s, v5.4s, #7
|
||
|
ushr v5.4s, v20.4s, #25
|
||
|
sli v5.4s, v20.4s, #7
|
||
|
|
||
|
ext v5.16b, v5.16b, v5.16b, #4
|
||
|
ext v6.16b, v6.16b, v6.16b, #4
|
||
|
ext v7.16b, v7.16b, v7.16b, #4
|
||
|
|
||
|
ext v10.16b, v10.16b, v10.16b, #8
|
||
|
ext v11.16b, v11.16b, v11.16b, #8
|
||
|
ext v12.16b, v12.16b, v12.16b, #8
|
||
|
|
||
|
ext v15.16b, v15.16b, v15.16b, #12
|
||
|
ext v16.16b, v16.16b, v16.16b, #12
|
||
|
ext v17.16b, v17.16b, v17.16b, #12
|
||
|
add v0.4s, v0.4s, v5.4s
|
||
|
add v1.4s, v1.4s, v6.4s
|
||
|
add v2.4s, v2.4s, v7.4s
|
||
|
eor v15.16b, v15.16b, v0.16b
|
||
|
eor v16.16b, v16.16b, v1.16b
|
||
|
eor v17.16b, v17.16b, v2.16b
|
||
|
rev32 v15.8h, v15.8h
|
||
|
rev32 v16.8h, v16.8h
|
||
|
rev32 v17.8h, v17.8h
|
||
|
|
||
|
add v10.4s, v10.4s, v15.4s
|
||
|
add v11.4s, v11.4s, v16.4s
|
||
|
add v12.4s, v12.4s, v17.4s
|
||
|
eor v5.16b, v5.16b, v10.16b
|
||
|
eor v6.16b, v6.16b, v11.16b
|
||
|
eor v7.16b, v7.16b, v12.16b
|
||
|
ushr v20.4s, v5.4s, #20
|
||
|
sli v20.4s, v5.4s, #12
|
||
|
ushr v5.4s, v6.4s, #20
|
||
|
sli v5.4s, v6.4s, #12
|
||
|
ushr v6.4s, v7.4s, #20
|
||
|
sli v6.4s, v7.4s, #12
|
||
|
|
||
|
add v0.4s, v0.4s, v20.4s
|
||
|
add v1.4s, v1.4s, v5.4s
|
||
|
add v2.4s, v2.4s, v6.4s
|
||
|
eor v15.16b, v15.16b, v0.16b
|
||
|
eor v16.16b, v16.16b, v1.16b
|
||
|
eor v17.16b, v17.16b, v2.16b
|
||
|
tbl v15.16b, {v15.16b}, v26.16b
|
||
|
tbl v16.16b, {v16.16b}, v26.16b
|
||
|
tbl v17.16b, {v17.16b}, v26.16b
|
||
|
|
||
|
add v10.4s, v10.4s, v15.4s
|
||
|
add v11.4s, v11.4s, v16.4s
|
||
|
add v12.4s, v12.4s, v17.4s
|
||
|
eor v20.16b, v20.16b, v10.16b
|
||
|
eor v5.16b, v5.16b, v11.16b
|
||
|
eor v6.16b, v6.16b, v12.16b
|
||
|
ushr v7.4s, v6.4s, #25
|
||
|
sli v7.4s, v6.4s, #7
|
||
|
ushr v6.4s, v5.4s, #25
|
||
|
sli v6.4s, v5.4s, #7
|
||
|
ushr v5.4s, v20.4s, #25
|
||
|
sli v5.4s, v20.4s, #7
|
||
|
|
||
|
ext v5.16b, v5.16b, v5.16b, #12
|
||
|
ext v6.16b, v6.16b, v6.16b, #12
|
||
|
ext v7.16b, v7.16b, v7.16b, #12
|
||
|
|
||
|
ext v10.16b, v10.16b, v10.16b, #8
|
||
|
ext v11.16b, v11.16b, v11.16b, #8
|
||
|
ext v12.16b, v12.16b, v12.16b, #8
|
||
|
|
||
|
ext v15.16b, v15.16b, v15.16b, #4
|
||
|
ext v16.16b, v16.16b, v16.16b, #4
|
||
|
ext v17.16b, v17.16b, v17.16b, #4
|
||
|
subs x6, x6, #1
|
||
|
b.hi .Lseal_128_rounds
|
||
|
|
||
|
add v0.4s, v0.4s, v24.4s
|
||
|
add v1.4s, v1.4s, v24.4s
|
||
|
add v2.4s, v2.4s, v24.4s
|
||
|
|
||
|
add v5.4s, v5.4s, v28.4s
|
||
|
add v6.4s, v6.4s, v28.4s
|
||
|
add v7.4s, v7.4s, v28.4s
|
||
|
|
||
|
// Only the first 32 bytes of the third block (counter = 0) are needed,
|
||
|
// so skip updating v12 and v17.
|
||
|
add v10.4s, v10.4s, v29.4s
|
||
|
add v11.4s, v11.4s, v29.4s
|
||
|
|
||
|
add v30.4s, v30.4s, v25.4s
|
||
|
add v15.4s, v15.4s, v30.4s
|
||
|
add v30.4s, v30.4s, v25.4s
|
||
|
add v16.4s, v16.4s, v30.4s
|
||
|
|
||
|
and v2.16b, v2.16b, v27.16b
|
||
|
mov x16, v2.d[0] // Move the R key to GPRs
|
||
|
mov x17, v2.d[1]
|
||
|
mov v27.16b, v7.16b // Store the S key
|
||
|
|
||
|
bl .Lpoly_hash_ad_internal
|
||
|
b .Lseal_tail
|
||
|
.cfi_endproc
|
||
|
.size chacha20_poly1305_seal,.-chacha20_poly1305_seal
|
||
|
|
||
|
/////////////////////////////////
|
||
|
//
|
||
|
// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data);
|
||
|
//
|
||
|
.globl chacha20_poly1305_open
|
||
|
.hidden chacha20_poly1305_open
|
||
|
.type chacha20_poly1305_open,%function
|
||
|
.align 6
|
||
|
chacha20_poly1305_open:
|
||
|
AARCH64_SIGN_LINK_REGISTER
|
||
|
.cfi_startproc
|
||
|
stp x29, x30, [sp, #-80]!
|
||
|
.cfi_def_cfa_offset 80
|
||
|
.cfi_offset w30, -72
|
||
|
.cfi_offset w29, -80
|
||
|
mov x29, sp
|
||
|
// We probably could do .cfi_def_cfa w29, 80 at this point, but since
|
||
|
// we don't actually use the frame pointer like that, it's probably not
|
||
|
// worth bothering.
|
||
|
stp d8, d9, [sp, #16]
|
||
|
stp d10, d11, [sp, #32]
|
||
|
stp d12, d13, [sp, #48]
|
||
|
stp d14, d15, [sp, #64]
|
||
|
.cfi_offset b15, -8
|
||
|
.cfi_offset b14, -16
|
||
|
.cfi_offset b13, -24
|
||
|
.cfi_offset b12, -32
|
||
|
.cfi_offset b11, -40
|
||
|
.cfi_offset b10, -48
|
||
|
.cfi_offset b9, -56
|
||
|
.cfi_offset b8, -64
|
||
|
|
||
|
adrp x11, .Lchacha20_consts
|
||
|
add x11, x11, :lo12:.Lchacha20_consts
|
||
|
|
||
|
ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
|
||
|
ld1 {v28.16b - v30.16b}, [x5]
|
||
|
|
||
|
mov x15, #1 // Prepare the Poly1305 state
|
||
|
mov x8, #0
|
||
|
mov x9, #0
|
||
|
mov x10, #0
|
||
|
|
||
|
mov v31.d[0], x4 // Store the input and aad lengths
|
||
|
mov v31.d[1], x2
|
||
|
|
||
|
cmp x2, #128
|
||
|
b.le .Lopen_128 // Optimization for smaller buffers
|
||
|
|
||
|
// Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys
|
||
|
mov v0.16b, v24.16b
|
||
|
mov v5.16b, v28.16b
|
||
|
mov v10.16b, v29.16b
|
||
|
mov v15.16b, v30.16b
|
||
|
|
||
|
mov x6, #10
|
||
|
|
||
|
.align 5
|
||
|
.Lopen_init_rounds:
|
||
|
add v0.4s, v0.4s, v5.4s
|
||
|
eor v15.16b, v15.16b, v0.16b
|
||
|
rev32 v15.8h, v15.8h
|
||
|
|
||
|
add v10.4s, v10.4s, v15.4s
|
||
|
eor v5.16b, v5.16b, v10.16b
|
||
|
ushr v20.4s, v5.4s, #20
|
||
|
sli v20.4s, v5.4s, #12
|
||
|
add v0.4s, v0.4s, v20.4s
|
||
|
eor v15.16b, v15.16b, v0.16b
|
||
|
tbl v15.16b, {v15.16b}, v26.16b
|
||
|
|
||
|
add v10.4s, v10.4s, v15.4s
|
||
|
eor v20.16b, v20.16b, v10.16b
|
||
|
ushr v5.4s, v20.4s, #25
|
||
|
sli v5.4s, v20.4s, #7
|
||
|
ext v5.16b, v5.16b, v5.16b, #4
|
||
|
ext v10.16b, v10.16b, v10.16b, #8
|
||
|
ext v15.16b, v15.16b, v15.16b, #12
|
||
|
add v0.4s, v0.4s, v5.4s
|
||
|
eor v15.16b, v15.16b, v0.16b
|
||
|
rev32 v15.8h, v15.8h
|
||
|
|
||
|
add v10.4s, v10.4s, v15.4s
|
||
|
eor v5.16b, v5.16b, v10.16b
|
||
|
ushr v20.4s, v5.4s, #20
|
||
|
sli v20.4s, v5.4s, #12
|
||
|
add v0.4s, v0.4s, v20.4s
|
||
|
eor v15.16b, v15.16b, v0.16b
|
||
|
tbl v15.16b, {v15.16b}, v26.16b
|
||
|
|
||
|
add v10.4s, v10.4s, v15.4s
|
||
|
eor v20.16b, v20.16b, v10.16b
|
||
|
ushr v5.4s, v20.4s, #25
|
||
|
sli v5.4s, v20.4s, #7
|
||
|
ext v5.16b, v5.16b, v5.16b, #12
|
||
|
ext v10.16b, v10.16b, v10.16b, #8
|
||
|
ext v15.16b, v15.16b, v15.16b, #4
|
||
|
subs x6, x6, #1
|
||
|
b.hi .Lopen_init_rounds
|
||
|
|
||
|
add v0.4s, v0.4s, v24.4s
|
||
|
add v5.4s, v5.4s, v28.4s
|
||
|
|
||
|
and v0.16b, v0.16b, v27.16b
|
||
|
mov x16, v0.d[0] // Move the R key to GPRs
|
||
|
mov x17, v0.d[1]
|
||
|
mov v27.16b, v5.16b // Store the S key
|
||
|
|
||
|
bl .Lpoly_hash_ad_internal
|
||
|
|
||
|
.Lopen_ad_done:
|
||
|
mov x3, x1
|
||
|
|
||
|
// Each iteration of the loop hashes 320 bytes and prepares stream for 320 bytes
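// Unlike seal, open authenticates the ciphertext before decrypting it: x3
// walks the ciphertext for Poly1305 while the ChaCha20 key stream for the same
// region is generated, and the XOR/store of the plaintext happens afterwards.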
|
||
|
.Lopen_main_loop:
|
||
|
|
||
|
cmp x2, #192
|
||
|
b.lt .Lopen_tail
|
||
|
|
||
|
adrp x11, .Lchacha20_consts
|
||
|
add x11, x11, :lo12:.Lchacha20_consts
|
||
|
|
||
|
ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
|
||
|
mov v4.16b, v24.16b
|
||
|
|
||
|
ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
|
||
|
mov v9.16b, v28.16b
|
||
|
|
||
|
ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
|
||
|
mov v14.16b, v29.16b
|
||
|
|
||
|
ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
|
||
|
sub x5, x5, #32
|
||
|
add v15.4s, v15.4s, v25.4s
|
||
|
mov v19.16b, v30.16b
|
||
|
|
||
|
eor v20.16b, v20.16b, v20.16b //zero
|
||
|
not v21.16b, v20.16b // -1
|
||
|
sub v21.4s, v25.4s, v21.4s // Add +1
|
||
|
ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
|
||
|
add v19.4s, v19.4s, v20.4s
|
||
|
|
||
|
lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12
|
||
|
sub x4, x4, #10
|
||
|
|
||
|
mov x7, #10
|
||
|
subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash
|
||
|
csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full
|
||
|
|
||
|
cbz x7, .Lopen_main_loop_rounds_short
|
||
|
|
||
|
.align 5
|
||
|
.Lopen_main_loop_rounds:
|
||
|
ldp x11, x12, [x3], 16
|
||
|
adds x8, x8, x11
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, x15
|
||
|
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
|
||
|
umulh x12, x8, x16
|
||
|
mul x13, x9, x16
|
||
|
umulh x14, x9, x16
|
||
|
adds x12, x12, x13
|
||
|
mul x13, x10, x16
|
||
|
adc x13, x13, x14
|
||
|
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
|
||
|
umulh x8, x8, x17
|
||
|
adds x12, x12, x14
|
||
|
mul x14, x9, x17
|
||
|
umulh x9, x9, x17
|
||
|
adcs x14, x14, x8
|
||
|
mul x10, x10, x17
|
||
|
adc x10, x10, x9
|
||
|
adds x13, x13, x14
|
||
|
adc x14, x10, xzr
|
||
|
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
|
||
|
and x8, x13, #-4
|
||
|
extr x13, x14, x13, #2
|
||
|
adds x8, x8, x11
|
||
|
lsr x11, x14, #2
|
||
|
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
|
||
|
adds x8, x8, x13
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, xzr // At this point acc2 has the value of 4 at most
|
||
|
.Lopen_main_loop_rounds_short:
|
||
|
add v0.4s, v0.4s, v5.4s
|
||
|
add v1.4s, v1.4s, v6.4s
|
||
|
add v2.4s, v2.4s, v7.4s
|
||
|
add v3.4s, v3.4s, v8.4s
|
||
|
add v4.4s, v4.4s, v9.4s
|
||
|
|
||
|
eor v15.16b, v15.16b, v0.16b
|
||
|
eor v16.16b, v16.16b, v1.16b
|
||
|
eor v17.16b, v17.16b, v2.16b
|
||
|
eor v18.16b, v18.16b, v3.16b
|
||
|
eor v19.16b, v19.16b, v4.16b
|
||
|
|
||
|
rev32 v15.8h, v15.8h
|
||
|
rev32 v16.8h, v16.8h
|
||
|
rev32 v17.8h, v17.8h
|
||
|
rev32 v18.8h, v18.8h
|
||
|
rev32 v19.8h, v19.8h
|
||
|
|
||
|
add v10.4s, v10.4s, v15.4s
|
||
|
add v11.4s, v11.4s, v16.4s
|
||
|
add v12.4s, v12.4s, v17.4s
|
||
|
add v13.4s, v13.4s, v18.4s
|
||
|
add v14.4s, v14.4s, v19.4s
|
||
|
|
||
|
eor v5.16b, v5.16b, v10.16b
|
||
|
eor v6.16b, v6.16b, v11.16b
|
||
|
eor v7.16b, v7.16b, v12.16b
|
||
|
eor v8.16b, v8.16b, v13.16b
|
||
|
eor v9.16b, v9.16b, v14.16b
|
||
|
|
||
|
ushr v20.4s, v5.4s, #20
|
||
|
sli v20.4s, v5.4s, #12
|
||
|
ushr v5.4s, v6.4s, #20
|
||
|
sli v5.4s, v6.4s, #12
|
||
|
ushr v6.4s, v7.4s, #20
|
||
|
sli v6.4s, v7.4s, #12
|
||
|
ushr v7.4s, v8.4s, #20
|
||
|
sli v7.4s, v8.4s, #12
|
||
|
ushr v8.4s, v9.4s, #20
|
||
|
sli v8.4s, v9.4s, #12
|
||
|
|
||
|
add v0.4s, v0.4s, v20.4s
|
||
|
add v1.4s, v1.4s, v5.4s
|
||
|
add v2.4s, v2.4s, v6.4s
|
||
|
add v3.4s, v3.4s, v7.4s
|
||
|
add v4.4s, v4.4s, v8.4s
|
||
|
|
||
|
eor v15.16b, v15.16b, v0.16b
|
||
|
eor v16.16b, v16.16b, v1.16b
|
||
|
eor v17.16b, v17.16b, v2.16b
|
||
|
eor v18.16b, v18.16b, v3.16b
|
||
|
eor v19.16b, v19.16b, v4.16b
|
||
|
|
||
|
tbl v15.16b, {v15.16b}, v26.16b
|
||
|
tbl v16.16b, {v16.16b}, v26.16b
|
||
|
tbl v17.16b, {v17.16b}, v26.16b
|
||
|
tbl v18.16b, {v18.16b}, v26.16b
|
||
|
tbl v19.16b, {v19.16b}, v26.16b
|
||
|
|
||
|
add v10.4s, v10.4s, v15.4s
|
||
|
add v11.4s, v11.4s, v16.4s
|
||
|
add v12.4s, v12.4s, v17.4s
|
||
|
add v13.4s, v13.4s, v18.4s
|
||
|
add v14.4s, v14.4s, v19.4s
|
||
|
|
||
|
eor v20.16b, v20.16b, v10.16b
|
||
|
eor v5.16b, v5.16b, v11.16b
|
||
|
eor v6.16b, v6.16b, v12.16b
|
||
|
eor v7.16b, v7.16b, v13.16b
|
||
|
eor v8.16b, v8.16b, v14.16b
|
||
|
|
||
|
ushr v9.4s, v8.4s, #25
|
||
|
sli v9.4s, v8.4s, #7
|
||
|
ushr v8.4s, v7.4s, #25
|
||
|
sli v8.4s, v7.4s, #7
|
||
|
ushr v7.4s, v6.4s, #25
|
||
|
sli v7.4s, v6.4s, #7
|
||
|
ushr v6.4s, v5.4s, #25
|
||
|
sli v6.4s, v5.4s, #7
|
||
|
ushr v5.4s, v20.4s, #25
|
||
|
sli v5.4s, v20.4s, #7
|
||
|
|
||
|
ext v9.16b, v9.16b, v9.16b, #4
|
||
|
ext v14.16b, v14.16b, v14.16b, #8
|
||
|
ext v19.16b, v19.16b, v19.16b, #12
|
||
|
ldp x11, x12, [x3], 16
|
||
|
adds x8, x8, x11
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, x15
|
||
|
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
|
||
|
umulh x12, x8, x16
|
||
|
mul x13, x9, x16
|
||
|
umulh x14, x9, x16
|
||
|
adds x12, x12, x13
|
||
|
mul x13, x10, x16
|
||
|
adc x13, x13, x14
|
||
|
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
|
||
|
umulh x8, x8, x17
|
||
|
adds x12, x12, x14
|
||
|
mul x14, x9, x17
|
||
|
umulh x9, x9, x17
|
||
|
adcs x14, x14, x8
|
||
|
mul x10, x10, x17
|
||
|
adc x10, x10, x9
|
||
|
adds x13, x13, x14
|
||
|
adc x14, x10, xzr
|
||
|
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
|
||
|
and x8, x13, #-4
|
||
|
extr x13, x14, x13, #2
|
||
|
adds x8, x8, x11
|
||
|
lsr x11, x14, #2
|
||
|
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
|
||
|
adds x8, x8, x13
|
||
|
adcs x9, x9, x12
|
||
|
adc x10, x10, xzr // At this point acc2 has the value of 4 at most
add v0.4s, v0.4s, v6.4s
add v1.4s, v1.4s, v7.4s
add v2.4s, v2.4s, v8.4s
add v3.4s, v3.4s, v5.4s
add v4.4s, v4.4s, v9.4s

eor v18.16b, v18.16b, v0.16b
eor v15.16b, v15.16b, v1.16b
eor v16.16b, v16.16b, v2.16b
eor v17.16b, v17.16b, v3.16b
eor v19.16b, v19.16b, v4.16b

rev32 v18.8h, v18.8h
rev32 v15.8h, v15.8h
rev32 v16.8h, v16.8h
rev32 v17.8h, v17.8h
rev32 v19.8h, v19.8h

add v12.4s, v12.4s, v18.4s
add v13.4s, v13.4s, v15.4s
add v10.4s, v10.4s, v16.4s
add v11.4s, v11.4s, v17.4s
add v14.4s, v14.4s, v19.4s

eor v6.16b, v6.16b, v12.16b
eor v7.16b, v7.16b, v13.16b
eor v8.16b, v8.16b, v10.16b
eor v5.16b, v5.16b, v11.16b
eor v9.16b, v9.16b, v14.16b

ushr v20.4s, v6.4s, #20
sli v20.4s, v6.4s, #12
ushr v6.4s, v7.4s, #20
sli v6.4s, v7.4s, #12
ushr v7.4s, v8.4s, #20
sli v7.4s, v8.4s, #12
ushr v8.4s, v5.4s, #20
sli v8.4s, v5.4s, #12
ushr v5.4s, v9.4s, #20
sli v5.4s, v9.4s, #12

add v0.4s, v0.4s, v20.4s
add v1.4s, v1.4s, v6.4s
add v2.4s, v2.4s, v7.4s
add v3.4s, v3.4s, v8.4s
add v4.4s, v4.4s, v5.4s

eor v18.16b, v18.16b, v0.16b
eor v15.16b, v15.16b, v1.16b
eor v16.16b, v16.16b, v2.16b
eor v17.16b, v17.16b, v3.16b
eor v19.16b, v19.16b, v4.16b

tbl v18.16b, {v18.16b}, v26.16b
tbl v15.16b, {v15.16b}, v26.16b
tbl v16.16b, {v16.16b}, v26.16b
tbl v17.16b, {v17.16b}, v26.16b
tbl v19.16b, {v19.16b}, v26.16b

add v12.4s, v12.4s, v18.4s
add v13.4s, v13.4s, v15.4s
add v10.4s, v10.4s, v16.4s
add v11.4s, v11.4s, v17.4s
add v14.4s, v14.4s, v19.4s

eor v20.16b, v20.16b, v12.16b
eor v6.16b, v6.16b, v13.16b
eor v7.16b, v7.16b, v10.16b
eor v8.16b, v8.16b, v11.16b
eor v5.16b, v5.16b, v14.16b

ushr v9.4s, v5.4s, #25
sli v9.4s, v5.4s, #7
ushr v5.4s, v8.4s, #25
sli v5.4s, v8.4s, #7
ushr v8.4s, v7.4s, #25
sli v8.4s, v7.4s, #7
ushr v7.4s, v6.4s, #25
sli v7.4s, v6.4s, #7
ushr v6.4s, v20.4s, #25
sli v6.4s, v20.4s, #7

ext v9.16b, v9.16b, v9.16b, #12
ext v14.16b, v14.16b, v14.16b, #8
ext v19.16b, v19.16b, v19.16b, #4
subs x7, x7, #1
b.gt .Lopen_main_loop_rounds
subs x6, x6, #1
b.ge .Lopen_main_loop_rounds_short

eor v20.16b, v20.16b, v20.16b //zero
not v21.16b, v20.16b // -1
sub v21.4s, v25.4s, v21.4s // Add +1
ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
add v19.4s, v19.4s, v20.4s

add v15.4s, v15.4s, v25.4s
mov x11, #5
dup v20.4s, w11
add v25.4s, v25.4s, v20.4s
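
// Transpose the four word-sliced blocks back into byte order: zip1/zip2 on
// 32-bit lanes and then on 64-bit lanes performs a 4x4 transpose of each row
// group, so each register again holds one block's four words of that row.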
zip1 v20.4s, v0.4s, v1.4s
zip2 v21.4s, v0.4s, v1.4s
zip1 v22.4s, v2.4s, v3.4s
zip2 v23.4s, v2.4s, v3.4s

zip1 v0.2d, v20.2d, v22.2d
zip2 v1.2d, v20.2d, v22.2d
zip1 v2.2d, v21.2d, v23.2d
zip2 v3.2d, v21.2d, v23.2d

zip1 v20.4s, v5.4s, v6.4s
zip2 v21.4s, v5.4s, v6.4s
zip1 v22.4s, v7.4s, v8.4s
zip2 v23.4s, v7.4s, v8.4s

zip1 v5.2d, v20.2d, v22.2d
zip2 v6.2d, v20.2d, v22.2d
zip1 v7.2d, v21.2d, v23.2d
zip2 v8.2d, v21.2d, v23.2d

zip1 v20.4s, v10.4s, v11.4s
zip2 v21.4s, v10.4s, v11.4s
zip1 v22.4s, v12.4s, v13.4s
zip2 v23.4s, v12.4s, v13.4s

zip1 v10.2d, v20.2d, v22.2d
zip2 v11.2d, v20.2d, v22.2d
zip1 v12.2d, v21.2d, v23.2d
zip2 v13.2d, v21.2d, v23.2d

zip1 v20.4s, v15.4s, v16.4s
zip2 v21.4s, v15.4s, v16.4s
zip1 v22.4s, v17.4s, v18.4s
zip2 v23.4s, v17.4s, v18.4s

zip1 v15.2d, v20.2d, v22.2d
zip2 v16.2d, v20.2d, v22.2d
zip1 v17.2d, v21.2d, v23.2d
zip2 v18.2d, v21.2d, v23.2d
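
// Add the original input state back into each block: v24 holds the
// "expand 32-byte k" constants, v28/v29 the key words, v30 the counter/nonce row.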
add v0.4s, v0.4s, v24.4s
add v5.4s, v5.4s, v28.4s
add v10.4s, v10.4s, v29.4s
add v15.4s, v15.4s, v30.4s

add v1.4s, v1.4s, v24.4s
add v6.4s, v6.4s, v28.4s
add v11.4s, v11.4s, v29.4s
add v16.4s, v16.4s, v30.4s

add v2.4s, v2.4s, v24.4s
add v7.4s, v7.4s, v28.4s
add v12.4s, v12.4s, v29.4s
add v17.4s, v17.4s, v30.4s

add v3.4s, v3.4s, v24.4s
add v8.4s, v8.4s, v28.4s
add v13.4s, v13.4s, v29.4s
add v18.4s, v18.4s, v30.4s

add v4.4s, v4.4s, v24.4s
add v9.4s, v9.4s, v28.4s
add v14.4s, v14.4s, v29.4s
add v19.4s, v19.4s, v30.4s

// We can always safely store 192 bytes
ld1 {v20.16b - v23.16b}, [x1], #64
eor v20.16b, v20.16b, v0.16b
eor v21.16b, v21.16b, v5.16b
eor v22.16b, v22.16b, v10.16b
eor v23.16b, v23.16b, v15.16b
st1 {v20.16b - v23.16b}, [x0], #64

ld1 {v20.16b - v23.16b}, [x1], #64
eor v20.16b, v20.16b, v1.16b
eor v21.16b, v21.16b, v6.16b
eor v22.16b, v22.16b, v11.16b
eor v23.16b, v23.16b, v16.16b
st1 {v20.16b - v23.16b}, [x0], #64

ld1 {v20.16b - v23.16b}, [x1], #64
eor v20.16b, v20.16b, v2.16b
eor v21.16b, v21.16b, v7.16b
eor v22.16b, v22.16b, v12.16b
eor v23.16b, v23.16b, v17.16b
st1 {v20.16b - v23.16b}, [x0], #64

sub x2, x2, #192
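
// The fourth and fifth blocks are only written if at least 64 more bytes of
// input remain; otherwise the leftover keystream is consumed in the tail store.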
mov v0.16b, v3.16b
mov v5.16b, v8.16b
mov v10.16b, v13.16b
mov v15.16b, v18.16b

cmp x2, #64
b.lt .Lopen_tail_64_store

ld1 {v20.16b - v23.16b}, [x1], #64
eor v20.16b, v20.16b, v3.16b
eor v21.16b, v21.16b, v8.16b
eor v22.16b, v22.16b, v13.16b
eor v23.16b, v23.16b, v18.16b
st1 {v20.16b - v23.16b}, [x0], #64

sub x2, x2, #64

mov v0.16b, v4.16b
mov v5.16b, v9.16b
mov v10.16b, v14.16b
mov v15.16b, v19.16b

cmp x2, #64
b.lt .Lopen_tail_64_store

ld1 {v20.16b - v23.16b}, [x1], #64
eor v20.16b, v20.16b, v4.16b
eor v21.16b, v21.16b, v9.16b
eor v22.16b, v22.16b, v14.16b
eor v23.16b, v23.16b, v19.16b
st1 {v20.16b - v23.16b}, [x0], #64

sub x2, x2, #64
b .Lopen_main_loop

.Lopen_tail:

cbz x2, .Lopen_finalize

lsr x4, x2, #4 // How many whole blocks we have to hash

cmp x2, #64
b.le .Lopen_tail_64
cmp x2, #128
b.le .Lopen_tail_128
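
// More than 128 bytes remain, so three more blocks are needed.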
.Lopen_tail_192:
// We need three more blocks
mov v0.16b, v24.16b
mov v1.16b, v24.16b
mov v2.16b, v24.16b
mov v5.16b, v28.16b
mov v6.16b, v28.16b
mov v7.16b, v28.16b
mov v10.16b, v29.16b
mov v11.16b, v29.16b
mov v12.16b, v29.16b
mov v15.16b, v30.16b
mov v16.16b, v30.16b
mov v17.16b, v30.16b
eor v23.16b, v23.16b, v23.16b
eor v21.16b, v21.16b, v21.16b
ins v23.s[0], v25.s[0]
ins v21.d[0], x15

add v22.4s, v23.4s, v21.4s
add v21.4s, v22.4s, v21.4s

add v15.4s, v15.4s, v21.4s
add v16.4s, v16.4s, v23.4s
add v17.4s, v17.4s, v22.4s

mov x7, #10
subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash
csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing
sub x4, x4, x7
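// x7 counts the double-rounds that also hash a 16-byte block of ciphertext;
// x6 covers whatever is left of the 10 double-rounds once hashing is done.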

cbz x7, .Lopen_tail_192_rounds_no_hash

.Lopen_tail_192_rounds:
ldp x11, x12, [x3], 16
adds x8, x8, x11
adcs x9, x9, x12
adc x10, x10, x15
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
umulh x12, x8, x16
mul x13, x9, x16
umulh x14, x9, x16
adds x12, x12, x13
mul x13, x10, x16
adc x13, x13, x14
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
umulh x8, x8, x17
adds x12, x12, x14
mul x14, x9, x17
umulh x9, x9, x17
adcs x14, x14, x8
mul x10, x10, x17
adc x10, x10, x9
adds x13, x13, x14
adc x14, x10, xzr
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
and x8, x13, #-4
extr x13, x14, x13, #2
adds x8, x8, x11
lsr x11, x14, #2
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
adds x8, x8, x13
adcs x9, x9, x12
adc x10, x10, xzr // At this point acc2 has the value of 4 at most
.Lopen_tail_192_rounds_no_hash:
add v0.4s, v0.4s, v5.4s
add v1.4s, v1.4s, v6.4s
add v2.4s, v2.4s, v7.4s
eor v15.16b, v15.16b, v0.16b
eor v16.16b, v16.16b, v1.16b
eor v17.16b, v17.16b, v2.16b
rev32 v15.8h, v15.8h
rev32 v16.8h, v16.8h
rev32 v17.8h, v17.8h

add v10.4s, v10.4s, v15.4s
add v11.4s, v11.4s, v16.4s
add v12.4s, v12.4s, v17.4s
eor v5.16b, v5.16b, v10.16b
eor v6.16b, v6.16b, v11.16b
eor v7.16b, v7.16b, v12.16b
ushr v20.4s, v5.4s, #20
sli v20.4s, v5.4s, #12
ushr v5.4s, v6.4s, #20
sli v5.4s, v6.4s, #12
ushr v6.4s, v7.4s, #20
sli v6.4s, v7.4s, #12

add v0.4s, v0.4s, v20.4s
add v1.4s, v1.4s, v5.4s
add v2.4s, v2.4s, v6.4s
eor v15.16b, v15.16b, v0.16b
eor v16.16b, v16.16b, v1.16b
eor v17.16b, v17.16b, v2.16b
tbl v15.16b, {v15.16b}, v26.16b
tbl v16.16b, {v16.16b}, v26.16b
tbl v17.16b, {v17.16b}, v26.16b

add v10.4s, v10.4s, v15.4s
add v11.4s, v11.4s, v16.4s
add v12.4s, v12.4s, v17.4s
eor v20.16b, v20.16b, v10.16b
eor v5.16b, v5.16b, v11.16b
eor v6.16b, v6.16b, v12.16b
ushr v7.4s, v6.4s, #25
sli v7.4s, v6.4s, #7
ushr v6.4s, v5.4s, #25
sli v6.4s, v5.4s, #7
ushr v5.4s, v20.4s, #25
sli v5.4s, v20.4s, #7

ext v5.16b, v5.16b, v5.16b, #4
ext v6.16b, v6.16b, v6.16b, #4
ext v7.16b, v7.16b, v7.16b, #4

ext v10.16b, v10.16b, v10.16b, #8
ext v11.16b, v11.16b, v11.16b, #8
ext v12.16b, v12.16b, v12.16b, #8

ext v15.16b, v15.16b, v15.16b, #12
ext v16.16b, v16.16b, v16.16b, #12
ext v17.16b, v17.16b, v17.16b, #12
add v0.4s, v0.4s, v5.4s
add v1.4s, v1.4s, v6.4s
add v2.4s, v2.4s, v7.4s
eor v15.16b, v15.16b, v0.16b
eor v16.16b, v16.16b, v1.16b
eor v17.16b, v17.16b, v2.16b
rev32 v15.8h, v15.8h
rev32 v16.8h, v16.8h
rev32 v17.8h, v17.8h

add v10.4s, v10.4s, v15.4s
add v11.4s, v11.4s, v16.4s
add v12.4s, v12.4s, v17.4s
eor v5.16b, v5.16b, v10.16b
eor v6.16b, v6.16b, v11.16b
eor v7.16b, v7.16b, v12.16b
ushr v20.4s, v5.4s, #20
sli v20.4s, v5.4s, #12
ushr v5.4s, v6.4s, #20
sli v5.4s, v6.4s, #12
ushr v6.4s, v7.4s, #20
sli v6.4s, v7.4s, #12

add v0.4s, v0.4s, v20.4s
add v1.4s, v1.4s, v5.4s
add v2.4s, v2.4s, v6.4s
eor v15.16b, v15.16b, v0.16b
eor v16.16b, v16.16b, v1.16b
eor v17.16b, v17.16b, v2.16b
tbl v15.16b, {v15.16b}, v26.16b
tbl v16.16b, {v16.16b}, v26.16b
tbl v17.16b, {v17.16b}, v26.16b

add v10.4s, v10.4s, v15.4s
add v11.4s, v11.4s, v16.4s
add v12.4s, v12.4s, v17.4s
eor v20.16b, v20.16b, v10.16b
eor v5.16b, v5.16b, v11.16b
eor v6.16b, v6.16b, v12.16b
ushr v7.4s, v6.4s, #25
sli v7.4s, v6.4s, #7
ushr v6.4s, v5.4s, #25
sli v6.4s, v5.4s, #7
ushr v5.4s, v20.4s, #25
sli v5.4s, v20.4s, #7

ext v5.16b, v5.16b, v5.16b, #12
ext v6.16b, v6.16b, v6.16b, #12
ext v7.16b, v7.16b, v7.16b, #12

ext v10.16b, v10.16b, v10.16b, #8
ext v11.16b, v11.16b, v11.16b, #8
ext v12.16b, v12.16b, v12.16b, #8

ext v15.16b, v15.16b, v15.16b, #4
ext v16.16b, v16.16b, v16.16b, #4
ext v17.16b, v17.16b, v17.16b, #4
subs x7, x7, #1
b.gt .Lopen_tail_192_rounds
subs x6, x6, #1
b.ge .Lopen_tail_192_rounds_no_hash

// We hashed 160 bytes at most, may still have 32 bytes left
.Lopen_tail_192_hash:
cbz x4, .Lopen_tail_192_hash_done
ldp x11, x12, [x3], 16
adds x8, x8, x11
adcs x9, x9, x12
adc x10, x10, x15
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
umulh x12, x8, x16
mul x13, x9, x16
umulh x14, x9, x16
adds x12, x12, x13
mul x13, x10, x16
adc x13, x13, x14
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
umulh x8, x8, x17
adds x12, x12, x14
mul x14, x9, x17
umulh x9, x9, x17
adcs x14, x14, x8
mul x10, x10, x17
adc x10, x10, x9
adds x13, x13, x14
adc x14, x10, xzr
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
and x8, x13, #-4
extr x13, x14, x13, #2
adds x8, x8, x11
lsr x11, x14, #2
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
adds x8, x8, x13
adcs x9, x9, x12
adc x10, x10, xzr // At this point acc2 has the value of 4 at most
sub x4, x4, #1
b .Lopen_tail_192_hash

.Lopen_tail_192_hash_done:

add v0.4s, v0.4s, v24.4s
add v1.4s, v1.4s, v24.4s
add v2.4s, v2.4s, v24.4s
add v5.4s, v5.4s, v28.4s
add v6.4s, v6.4s, v28.4s
add v7.4s, v7.4s, v28.4s
add v10.4s, v10.4s, v29.4s
add v11.4s, v11.4s, v29.4s
add v12.4s, v12.4s, v29.4s
add v15.4s, v15.4s, v30.4s
add v16.4s, v16.4s, v30.4s
add v17.4s, v17.4s, v30.4s

add v15.4s, v15.4s, v21.4s
add v16.4s, v16.4s, v23.4s
add v17.4s, v17.4s, v22.4s

ld1 {v20.16b - v23.16b}, [x1], #64

eor v20.16b, v20.16b, v1.16b
eor v21.16b, v21.16b, v6.16b
eor v22.16b, v22.16b, v11.16b
eor v23.16b, v23.16b, v16.16b

st1 {v20.16b - v23.16b}, [x0], #64

ld1 {v20.16b - v23.16b}, [x1], #64

eor v20.16b, v20.16b, v2.16b
eor v21.16b, v21.16b, v7.16b
eor v22.16b, v22.16b, v12.16b
eor v23.16b, v23.16b, v17.16b

st1 {v20.16b - v23.16b}, [x0], #64

sub x2, x2, #128
b .Lopen_tail_64_store

.Lopen_tail_128:
// We need two more blocks
mov v0.16b, v24.16b
mov v1.16b, v24.16b
mov v5.16b, v28.16b
mov v6.16b, v28.16b
mov v10.16b, v29.16b
mov v11.16b, v29.16b
mov v15.16b, v30.16b
mov v16.16b, v30.16b
eor v23.16b, v23.16b, v23.16b
eor v22.16b, v22.16b, v22.16b
ins v23.s[0], v25.s[0]
ins v22.d[0], x15
add v22.4s, v22.4s, v23.4s

add v15.4s, v15.4s, v22.4s
add v16.4s, v16.4s, v23.4s

mov x6, #10
sub x6, x6, x4
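
// 10 ChaCha20 double-rounds in total; the last x4 iterations each also hash
// one 16-byte block of ciphertext.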
.Lopen_tail_128_rounds:
add v0.4s, v0.4s, v5.4s
eor v15.16b, v15.16b, v0.16b
rev32 v15.8h, v15.8h

add v10.4s, v10.4s, v15.4s
eor v5.16b, v5.16b, v10.16b
ushr v20.4s, v5.4s, #20
sli v20.4s, v5.4s, #12
add v0.4s, v0.4s, v20.4s
eor v15.16b, v15.16b, v0.16b
tbl v15.16b, {v15.16b}, v26.16b

add v10.4s, v10.4s, v15.4s
eor v20.16b, v20.16b, v10.16b
ushr v5.4s, v20.4s, #25
sli v5.4s, v20.4s, #7
ext v5.16b, v5.16b, v5.16b, #4
ext v10.16b, v10.16b, v10.16b, #8
ext v15.16b, v15.16b, v15.16b, #12
add v1.4s, v1.4s, v6.4s
eor v16.16b, v16.16b, v1.16b
rev32 v16.8h, v16.8h

add v11.4s, v11.4s, v16.4s
eor v6.16b, v6.16b, v11.16b
ushr v20.4s, v6.4s, #20
sli v20.4s, v6.4s, #12
add v1.4s, v1.4s, v20.4s
eor v16.16b, v16.16b, v1.16b
tbl v16.16b, {v16.16b}, v26.16b

add v11.4s, v11.4s, v16.4s
eor v20.16b, v20.16b, v11.16b
ushr v6.4s, v20.4s, #25
sli v6.4s, v20.4s, #7
ext v6.16b, v6.16b, v6.16b, #4
ext v11.16b, v11.16b, v11.16b, #8
ext v16.16b, v16.16b, v16.16b, #12
add v0.4s, v0.4s, v5.4s
eor v15.16b, v15.16b, v0.16b
rev32 v15.8h, v15.8h

add v10.4s, v10.4s, v15.4s
eor v5.16b, v5.16b, v10.16b
ushr v20.4s, v5.4s, #20
sli v20.4s, v5.4s, #12
add v0.4s, v0.4s, v20.4s
eor v15.16b, v15.16b, v0.16b
tbl v15.16b, {v15.16b}, v26.16b

add v10.4s, v10.4s, v15.4s
eor v20.16b, v20.16b, v10.16b
ushr v5.4s, v20.4s, #25
sli v5.4s, v20.4s, #7
ext v5.16b, v5.16b, v5.16b, #12
ext v10.16b, v10.16b, v10.16b, #8
ext v15.16b, v15.16b, v15.16b, #4
add v1.4s, v1.4s, v6.4s
eor v16.16b, v16.16b, v1.16b
rev32 v16.8h, v16.8h

add v11.4s, v11.4s, v16.4s
eor v6.16b, v6.16b, v11.16b
ushr v20.4s, v6.4s, #20
sli v20.4s, v6.4s, #12
add v1.4s, v1.4s, v20.4s
eor v16.16b, v16.16b, v1.16b
tbl v16.16b, {v16.16b}, v26.16b

add v11.4s, v11.4s, v16.4s
eor v20.16b, v20.16b, v11.16b
ushr v6.4s, v20.4s, #25
sli v6.4s, v20.4s, #7
ext v6.16b, v6.16b, v6.16b, #12
ext v11.16b, v11.16b, v11.16b, #8
ext v16.16b, v16.16b, v16.16b, #4
subs x6, x6, #1
b.gt .Lopen_tail_128_rounds
cbz x4, .Lopen_tail_128_rounds_done
subs x4, x4, #1
ldp x11, x12, [x3], 16
adds x8, x8, x11
adcs x9, x9, x12
adc x10, x10, x15
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
umulh x12, x8, x16
mul x13, x9, x16
umulh x14, x9, x16
adds x12, x12, x13
mul x13, x10, x16
adc x13, x13, x14
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
umulh x8, x8, x17
adds x12, x12, x14
mul x14, x9, x17
umulh x9, x9, x17
adcs x14, x14, x8
mul x10, x10, x17
adc x10, x10, x9
adds x13, x13, x14
adc x14, x10, xzr
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
and x8, x13, #-4
extr x13, x14, x13, #2
adds x8, x8, x11
lsr x11, x14, #2
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
adds x8, x8, x13
adcs x9, x9, x12
adc x10, x10, xzr // At this point acc2 has the value of 4 at most
b .Lopen_tail_128_rounds

.Lopen_tail_128_rounds_done:
add v0.4s, v0.4s, v24.4s
add v1.4s, v1.4s, v24.4s
add v5.4s, v5.4s, v28.4s
add v6.4s, v6.4s, v28.4s
add v10.4s, v10.4s, v29.4s
add v11.4s, v11.4s, v29.4s
add v15.4s, v15.4s, v30.4s
add v16.4s, v16.4s, v30.4s
add v15.4s, v15.4s, v22.4s
add v16.4s, v16.4s, v23.4s

ld1 {v20.16b - v23.16b}, [x1], #64

eor v20.16b, v20.16b, v1.16b
eor v21.16b, v21.16b, v6.16b
eor v22.16b, v22.16b, v11.16b
eor v23.16b, v23.16b, v16.16b

st1 {v20.16b - v23.16b}, [x0], #64
sub x2, x2, #64

b .Lopen_tail_64_store

.Lopen_tail_64:
// We just need a single block
mov v0.16b, v24.16b
mov v5.16b, v28.16b
mov v10.16b, v29.16b
mov v15.16b, v30.16b
eor v23.16b, v23.16b, v23.16b
ins v23.s[0], v25.s[0]
add v15.4s, v15.4s, v23.4s

mov x6, #10
sub x6, x6, x4
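
// Same pattern as above: 10 double-rounds, with one 16-byte block hashed
// during each of the last x4 iterations.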
.Lopen_tail_64_rounds:
add v0.4s, v0.4s, v5.4s
eor v15.16b, v15.16b, v0.16b
rev32 v15.8h, v15.8h

add v10.4s, v10.4s, v15.4s
eor v5.16b, v5.16b, v10.16b
ushr v20.4s, v5.4s, #20
sli v20.4s, v5.4s, #12
add v0.4s, v0.4s, v20.4s
eor v15.16b, v15.16b, v0.16b
tbl v15.16b, {v15.16b}, v26.16b

add v10.4s, v10.4s, v15.4s
eor v20.16b, v20.16b, v10.16b
ushr v5.4s, v20.4s, #25
sli v5.4s, v20.4s, #7
ext v5.16b, v5.16b, v5.16b, #4
ext v10.16b, v10.16b, v10.16b, #8
ext v15.16b, v15.16b, v15.16b, #12
add v0.4s, v0.4s, v5.4s
eor v15.16b, v15.16b, v0.16b
rev32 v15.8h, v15.8h

add v10.4s, v10.4s, v15.4s
eor v5.16b, v5.16b, v10.16b
ushr v20.4s, v5.4s, #20
sli v20.4s, v5.4s, #12
add v0.4s, v0.4s, v20.4s
eor v15.16b, v15.16b, v0.16b
tbl v15.16b, {v15.16b}, v26.16b

add v10.4s, v10.4s, v15.4s
eor v20.16b, v20.16b, v10.16b
ushr v5.4s, v20.4s, #25
sli v5.4s, v20.4s, #7
ext v5.16b, v5.16b, v5.16b, #12
ext v10.16b, v10.16b, v10.16b, #8
ext v15.16b, v15.16b, v15.16b, #4
subs x6, x6, #1
b.gt .Lopen_tail_64_rounds
cbz x4, .Lopen_tail_64_rounds_done
subs x4, x4, #1
ldp x11, x12, [x3], 16
adds x8, x8, x11
adcs x9, x9, x12
adc x10, x10, x15
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
umulh x12, x8, x16
mul x13, x9, x16
umulh x14, x9, x16
adds x12, x12, x13
mul x13, x10, x16
adc x13, x13, x14
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
umulh x8, x8, x17
adds x12, x12, x14
mul x14, x9, x17
umulh x9, x9, x17
adcs x14, x14, x8
mul x10, x10, x17
adc x10, x10, x9
adds x13, x13, x14
adc x14, x10, xzr
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
and x8, x13, #-4
extr x13, x14, x13, #2
adds x8, x8, x11
lsr x11, x14, #2
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
adds x8, x8, x13
adcs x9, x9, x12
adc x10, x10, xzr // At this point acc2 has the value of 4 at most
b .Lopen_tail_64_rounds

.Lopen_tail_64_rounds_done:
add v0.4s, v0.4s, v24.4s
add v5.4s, v5.4s, v28.4s
add v10.4s, v10.4s, v29.4s
add v15.4s, v15.4s, v30.4s
add v15.4s, v15.4s, v23.4s

.Lopen_tail_64_store:
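// XOR out the remaining data 16 bytes at a time; v0, v5, v10 and v15 hold up
// to 64 bytes of keystream and are shifted down after each store.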
cmp x2, #16
b.lt .Lopen_tail_16

ld1 {v20.16b}, [x1], #16
eor v20.16b, v20.16b, v0.16b
st1 {v20.16b}, [x0], #16
mov v0.16b, v5.16b
mov v5.16b, v10.16b
mov v10.16b, v15.16b
sub x2, x2, #16
b .Lopen_tail_64_store

.Lopen_tail_16:
// Here we handle the last [0,16) bytes that require a padded block
cbz x2, .Lopen_finalize

eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext
eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask
not v22.16b, v20.16b

add x7, x1, x2
mov x6, x2

.Lopen_tail_16_compose:
ext v20.16b, v20.16b, v20.16b, #15
ldrb w11, [x7, #-1]!
mov v20.b[0], w11
ext v21.16b, v22.16b, v21.16b, #15
subs x2, x2, #1
b.gt .Lopen_tail_16_compose

and v20.16b, v20.16b, v21.16b
// Hash in the final padded block
mov x11, v20.d[0]
mov x12, v20.d[1]
adds x8, x8, x11
adcs x9, x9, x12
adc x10, x10, x15
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
umulh x12, x8, x16
mul x13, x9, x16
umulh x14, x9, x16
adds x12, x12, x13
mul x13, x10, x16
adc x13, x13, x14
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
umulh x8, x8, x17
adds x12, x12, x14
mul x14, x9, x17
umulh x9, x9, x17
adcs x14, x14, x8
mul x10, x10, x17
adc x10, x10, x9
adds x13, x13, x14
adc x14, x10, xzr
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
and x8, x13, #-4
extr x13, x14, x13, #2
adds x8, x8, x11
lsr x11, x14, #2
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
adds x8, x8, x13
adcs x9, x9, x12
adc x10, x10, xzr // At this point acc2 has the value of 4 at most
eor v20.16b, v20.16b, v0.16b

.Lopen_tail_16_store:
umov w11, v20.b[0]
strb w11, [x0], #1
ext v20.16b, v20.16b, v20.16b, #1
subs x6, x6, #1
b.gt .Lopen_tail_16_store

.Lopen_finalize:
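// Hash the final Poly1305 block kept in v31 (this holds the encoded AAD and
// ciphertext lengths).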
mov x11, v31.d[0]
mov x12, v31.d[1]
adds x8, x8, x11
adcs x9, x9, x12
adc x10, x10, x15
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
umulh x12, x8, x16
mul x13, x9, x16
umulh x14, x9, x16
adds x12, x12, x13
mul x13, x10, x16
adc x13, x13, x14
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
umulh x8, x8, x17
adds x12, x12, x14
mul x14, x9, x17
umulh x9, x9, x17
adcs x14, x14, x8
mul x10, x10, x17
adc x10, x10, x9
adds x13, x13, x14
adc x14, x10, xzr
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
and x8, x13, #-4
extr x13, x14, x13, #2
adds x8, x8, x11
lsr x11, x14, #2
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
adds x8, x8, x13
adcs x9, x9, x12
adc x10, x10, xzr // At this point acc2 has the value of 4 at most
// Final reduction step
sub x12, xzr, x15
orr x13, xzr, #3
subs x11, x8, #-5
sbcs x12, x9, x12
sbcs x13, x10, x13
csel x8, x11, x8, cs
csel x9, x12, x9, cs
csel x10, x13, x10, cs
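// The conditional subtraction above reduces the accumulator mod 2^130 - 5;
// adding the Poly1305 s key (saved in v27) below produces the tag.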
mov x11, v27.d[0]
mov x12, v27.d[1]
adds x8, x8, x11
adcs x9, x9, x12
adc x10, x10, x15

stp x8, x9, [x5]

ldp d8, d9, [sp, #16]
ldp d10, d11, [sp, #32]
ldp d12, d13, [sp, #48]
ldp d14, d15, [sp, #64]
.cfi_restore b15
.cfi_restore b14
.cfi_restore b13
.cfi_restore b12
.cfi_restore b11
.cfi_restore b10
.cfi_restore b9
.cfi_restore b8
ldp x29, x30, [sp], 80
.cfi_restore w29
.cfi_restore w30
.cfi_def_cfa_offset 0
AARCH64_VALIDATE_LINK_REGISTER
ret

.Lopen_128:
// On some architectures preparing 5 blocks for small buffers is wasteful
eor v25.16b, v25.16b, v25.16b
mov x11, #1
mov v25.s[0], w11
mov v0.16b, v24.16b
mov v1.16b, v24.16b
mov v2.16b, v24.16b
mov v5.16b, v28.16b
mov v6.16b, v28.16b
mov v7.16b, v28.16b
mov v10.16b, v29.16b
mov v11.16b, v29.16b
mov v12.16b, v29.16b
mov v17.16b, v30.16b
add v15.4s, v17.4s, v25.4s
add v16.4s, v15.4s, v25.4s
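// Three blocks suffice here: v2/v7/v12/v17 keep the base counter and become
// the Poly1305 key block, while the other two blocks (counter+1 and counter+2)
// cover up to 128 bytes of data.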

mov x6, #10

.Lopen_128_rounds:
add v0.4s, v0.4s, v5.4s
add v1.4s, v1.4s, v6.4s
add v2.4s, v2.4s, v7.4s
eor v15.16b, v15.16b, v0.16b
eor v16.16b, v16.16b, v1.16b
eor v17.16b, v17.16b, v2.16b
rev32 v15.8h, v15.8h
rev32 v16.8h, v16.8h
rev32 v17.8h, v17.8h

add v10.4s, v10.4s, v15.4s
add v11.4s, v11.4s, v16.4s
add v12.4s, v12.4s, v17.4s
eor v5.16b, v5.16b, v10.16b
eor v6.16b, v6.16b, v11.16b
eor v7.16b, v7.16b, v12.16b
ushr v20.4s, v5.4s, #20
sli v20.4s, v5.4s, #12
ushr v5.4s, v6.4s, #20
sli v5.4s, v6.4s, #12
ushr v6.4s, v7.4s, #20
sli v6.4s, v7.4s, #12

add v0.4s, v0.4s, v20.4s
add v1.4s, v1.4s, v5.4s
add v2.4s, v2.4s, v6.4s
eor v15.16b, v15.16b, v0.16b
eor v16.16b, v16.16b, v1.16b
eor v17.16b, v17.16b, v2.16b
tbl v15.16b, {v15.16b}, v26.16b
tbl v16.16b, {v16.16b}, v26.16b
tbl v17.16b, {v17.16b}, v26.16b

add v10.4s, v10.4s, v15.4s
add v11.4s, v11.4s, v16.4s
add v12.4s, v12.4s, v17.4s
eor v20.16b, v20.16b, v10.16b
eor v5.16b, v5.16b, v11.16b
eor v6.16b, v6.16b, v12.16b
ushr v7.4s, v6.4s, #25
sli v7.4s, v6.4s, #7
ushr v6.4s, v5.4s, #25
sli v6.4s, v5.4s, #7
ushr v5.4s, v20.4s, #25
sli v5.4s, v20.4s, #7

ext v5.16b, v5.16b, v5.16b, #4
ext v6.16b, v6.16b, v6.16b, #4
ext v7.16b, v7.16b, v7.16b, #4

ext v10.16b, v10.16b, v10.16b, #8
ext v11.16b, v11.16b, v11.16b, #8
ext v12.16b, v12.16b, v12.16b, #8

ext v15.16b, v15.16b, v15.16b, #12
ext v16.16b, v16.16b, v16.16b, #12
ext v17.16b, v17.16b, v17.16b, #12
add v0.4s, v0.4s, v5.4s
add v1.4s, v1.4s, v6.4s
add v2.4s, v2.4s, v7.4s
eor v15.16b, v15.16b, v0.16b
eor v16.16b, v16.16b, v1.16b
eor v17.16b, v17.16b, v2.16b
rev32 v15.8h, v15.8h
rev32 v16.8h, v16.8h
rev32 v17.8h, v17.8h

add v10.4s, v10.4s, v15.4s
add v11.4s, v11.4s, v16.4s
add v12.4s, v12.4s, v17.4s
eor v5.16b, v5.16b, v10.16b
eor v6.16b, v6.16b, v11.16b
eor v7.16b, v7.16b, v12.16b
ushr v20.4s, v5.4s, #20
sli v20.4s, v5.4s, #12
ushr v5.4s, v6.4s, #20
sli v5.4s, v6.4s, #12
ushr v6.4s, v7.4s, #20
sli v6.4s, v7.4s, #12

add v0.4s, v0.4s, v20.4s
add v1.4s, v1.4s, v5.4s
add v2.4s, v2.4s, v6.4s
eor v15.16b, v15.16b, v0.16b
eor v16.16b, v16.16b, v1.16b
eor v17.16b, v17.16b, v2.16b
tbl v15.16b, {v15.16b}, v26.16b
tbl v16.16b, {v16.16b}, v26.16b
tbl v17.16b, {v17.16b}, v26.16b

add v10.4s, v10.4s, v15.4s
add v11.4s, v11.4s, v16.4s
add v12.4s, v12.4s, v17.4s
eor v20.16b, v20.16b, v10.16b
eor v5.16b, v5.16b, v11.16b
eor v6.16b, v6.16b, v12.16b
ushr v7.4s, v6.4s, #25
sli v7.4s, v6.4s, #7
ushr v6.4s, v5.4s, #25
sli v6.4s, v5.4s, #7
ushr v5.4s, v20.4s, #25
sli v5.4s, v20.4s, #7

ext v5.16b, v5.16b, v5.16b, #12
ext v6.16b, v6.16b, v6.16b, #12
ext v7.16b, v7.16b, v7.16b, #12

ext v10.16b, v10.16b, v10.16b, #8
ext v11.16b, v11.16b, v11.16b, #8
ext v12.16b, v12.16b, v12.16b, #8

ext v15.16b, v15.16b, v15.16b, #4
ext v16.16b, v16.16b, v16.16b, #4
ext v17.16b, v17.16b, v17.16b, #4
subs x6, x6, #1
b.hi .Lopen_128_rounds

add v0.4s, v0.4s, v24.4s
add v1.4s, v1.4s, v24.4s
add v2.4s, v2.4s, v24.4s

add v5.4s, v5.4s, v28.4s
add v6.4s, v6.4s, v28.4s
add v7.4s, v7.4s, v28.4s

add v10.4s, v10.4s, v29.4s
add v11.4s, v11.4s, v29.4s

add v30.4s, v30.4s, v25.4s
add v15.4s, v15.4s, v30.4s
add v30.4s, v30.4s, v25.4s
add v16.4s, v16.4s, v30.4s

and v2.16b, v2.16b, v27.16b
mov x16, v2.d[0] // Move the R key to GPRs
mov x17, v2.d[1]
mov v27.16b, v7.16b // Store the S key
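// With the clamped Poly1305 r key in x16/x17 and the s key saved in v27, hash
// the additional data before decrypting.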

bl .Lpoly_hash_ad_internal

.Lopen_128_store:
cmp x2, #64
b.lt .Lopen_128_store_64

ld1 {v20.16b - v23.16b}, [x1], #64

mov x11, v20.d[0]
mov x12, v20.d[1]
adds x8, x8, x11
adcs x9, x9, x12
adc x10, x10, x15
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
umulh x12, x8, x16
mul x13, x9, x16
umulh x14, x9, x16
adds x12, x12, x13
mul x13, x10, x16
adc x13, x13, x14
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
umulh x8, x8, x17
adds x12, x12, x14
mul x14, x9, x17
umulh x9, x9, x17
adcs x14, x14, x8
mul x10, x10, x17
adc x10, x10, x9
adds x13, x13, x14
adc x14, x10, xzr
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
and x8, x13, #-4
extr x13, x14, x13, #2
adds x8, x8, x11
lsr x11, x14, #2
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
adds x8, x8, x13
adcs x9, x9, x12
adc x10, x10, xzr // At this point acc2 has the value of 4 at most
mov x11, v21.d[0]
mov x12, v21.d[1]
adds x8, x8, x11
adcs x9, x9, x12
adc x10, x10, x15
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
umulh x12, x8, x16
mul x13, x9, x16
umulh x14, x9, x16
adds x12, x12, x13
mul x13, x10, x16
adc x13, x13, x14
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
umulh x8, x8, x17
adds x12, x12, x14
mul x14, x9, x17
umulh x9, x9, x17
adcs x14, x14, x8
mul x10, x10, x17
adc x10, x10, x9
adds x13, x13, x14
adc x14, x10, xzr
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
and x8, x13, #-4
extr x13, x14, x13, #2
adds x8, x8, x11
lsr x11, x14, #2
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
adds x8, x8, x13
adcs x9, x9, x12
adc x10, x10, xzr // At this point acc2 has the value of 4 at most
mov x11, v22.d[0]
mov x12, v22.d[1]
adds x8, x8, x11
adcs x9, x9, x12
adc x10, x10, x15
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
umulh x12, x8, x16
mul x13, x9, x16
umulh x14, x9, x16
adds x12, x12, x13
mul x13, x10, x16
adc x13, x13, x14
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
umulh x8, x8, x17
adds x12, x12, x14
mul x14, x9, x17
umulh x9, x9, x17
adcs x14, x14, x8
mul x10, x10, x17
adc x10, x10, x9
adds x13, x13, x14
adc x14, x10, xzr
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
and x8, x13, #-4
extr x13, x14, x13, #2
adds x8, x8, x11
lsr x11, x14, #2
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
adds x8, x8, x13
adcs x9, x9, x12
adc x10, x10, xzr // At this point acc2 has the value of 4 at most
mov x11, v23.d[0]
mov x12, v23.d[1]
adds x8, x8, x11
adcs x9, x9, x12
adc x10, x10, x15
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
umulh x12, x8, x16
mul x13, x9, x16
umulh x14, x9, x16
adds x12, x12, x13
mul x13, x10, x16
adc x13, x13, x14
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
umulh x8, x8, x17
adds x12, x12, x14
mul x14, x9, x17
umulh x9, x9, x17
adcs x14, x14, x8
mul x10, x10, x17
adc x10, x10, x9
adds x13, x13, x14
adc x14, x10, xzr
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
and x8, x13, #-4
extr x13, x14, x13, #2
adds x8, x8, x11
lsr x11, x14, #2
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
adds x8, x8, x13
adcs x9, x9, x12
adc x10, x10, xzr // At this point acc2 has the value of 4 at most

eor v20.16b, v20.16b, v0.16b
eor v21.16b, v21.16b, v5.16b
eor v22.16b, v22.16b, v10.16b
eor v23.16b, v23.16b, v15.16b

st1 {v20.16b - v23.16b}, [x0], #64

sub x2, x2, #64

mov v0.16b, v1.16b
mov v5.16b, v6.16b
mov v10.16b, v11.16b
mov v15.16b, v16.16b

.Lopen_128_store_64:

lsr x4, x2, #4
mov x3, x1
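
// Hash any remaining whole 16-byte blocks of ciphertext, then jump to
// .Lopen_tail_64_store to decrypt them and handle the partial block.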
.Lopen_128_hash_64:
cbz x4, .Lopen_tail_64_store
ldp x11, x12, [x3], 16
adds x8, x8, x11
adcs x9, x9, x12
adc x10, x10, x15
mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
umulh x12, x8, x16
mul x13, x9, x16
umulh x14, x9, x16
adds x12, x12, x13
mul x13, x10, x16
adc x13, x13, x14
mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
umulh x8, x8, x17
adds x12, x12, x14
mul x14, x9, x17
umulh x9, x9, x17
adcs x14, x14, x8
mul x10, x10, x17
adc x10, x10, x9
adds x13, x13, x14
adc x14, x10, xzr
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
and x8, x13, #-4
extr x13, x14, x13, #2
adds x8, x8, x11
lsr x11, x14, #2
adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
adds x8, x8, x13
adcs x9, x9, x12
adc x10, x10, xzr // At this point acc2 has the value of 4 at most
sub x4, x4, #1
b .Lopen_128_hash_64
.cfi_endproc
.size chacha20_poly1305_open,.-chacha20_poly1305_open
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)