grpc_3rd/boringssl-with-bazel/ios-arm/crypto/fipsmodule/aesv8-armx32.S

// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text


.code	32
#undef	__thumb2__
.align	5
@ Constants for the key-schedule code below, fetched through r3:
@   row 0: initial round constant 0x01 in every 32-bit lane; the key
@          expansion loops double it with vshl.u8 after each step
@   row 1: vtbl byte-index mask (bytes 13,14,15,12) that rotates the last
@          32-bit word of the source register by one byte and replicates
@          it into all four lanes
@   row 2: round constant 0x1b, loaded by the 128-bit path once eight
@          doublings have shifted row 0 out of its 8-bit lane
Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	@ rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.text

@ aes_hw_set_encrypt_key: expand a 128-, 192- or 256-bit AES key into an
@ encryption key schedule.
@ In:   r0 = pointer to the user key bytes
@       r1 = key length in bits (128, 192 or 256)
@       r2 = pointer to the key schedule to fill
@ Out:  r0 = 0 on success, -1 if r0 or r2 is NULL, -2 if r1 is not one of
@            the three supported lengths
@       On success the round count (10, 12 or 14) is stored at byte
@       offset 240 of the schedule.
@ Clobbers: r1, r2, r3, r12, q0-q3, q8-q10, flags.
@ The local entry Lenc_key is also called by aes_hw_set_decrypt_key, which
@ relies on r2 = schedule + 240 and r12 = round count after a successful
@ return.
.globl	_aes_hw_set_encrypt_key
.private_extern	_aes_hw_set_encrypt_key
#ifdef __thumb2__
.thumb_func	_aes_hw_set_encrypt_key
#endif
.align	5
_aes_hw_set_encrypt_key:
Lenc_key:
	@ Argument validation. r3 carries the pending return value.
	mov	r3,#-1
	cmp	r0,#0
	beq	Lenc_key_abort
	cmp	r2,#0
	beq	Lenc_key_abort
	mov	r3,#-2
	cmp	r1,#128
	blt	Lenc_key_abort
	cmp	r1,#256
	bgt	Lenc_key_abort
	tst	r1,#0x3f		@ must be a multiple of 64
	bne	Lenc_key_abort

	adr	r3,Lrcon
	cmp	r1,#192			@ flags stay live until the dispatch below

	veor	q0,q0,q0		@ q0 = 0, used as zero key and shift-in
	vld1.8	{q3},[r0]!		@ q3 = first 16 key bytes
	mov	r1,#8		@ reuse r1 as the iteration counter
	vld1.32	{q1,q2},[r3]!		@ q1 = round constant, q2 = vtbl mask

	blt	Loop128
	beq	L192
	b	L256

.align	4
	@ 128-bit key: eight iterations here plus two unrolled steps below.
	@ Each step stores the current round key and derives the next one.
Loop128:
	vtbl.8	d20,{q3},d4		@ q10 = rotated last word, splatted
	vtbl.8	d21,{q3},d5
	vext.8	q9,q0,q3,#12		@ q9 = q3 shifted up by one word
	vst1.32	{q3},[r2]!
.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
	subs	r1,r1,#1

	@ Three shift-and-xor steps give each word the xor of all words
	@ below it; the substituted word and round constant are then mixed in.
	veor	q3,q3,q9
	vext.8	q9,q0,q9,#12
	veor	q3,q3,q9
	vext.8	q9,q0,q9,#12
	veor	q10,q10,q1
	veor	q3,q3,q9
	vshl.u8	q1,q1,#1		@ double the round constant
	veor	q3,q3,q10
	bne	Loop128

	vld1.32	{q1},[r3]		@ q1 = 0x1b row of Lrcon

	vtbl.8	d20,{q3},d4
	vtbl.8	d21,{q3},d5
	vext.8	q9,q0,q3,#12
	vst1.32	{q3},[r2]!
.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0

	veor	q3,q3,q9
	vext.8	q9,q0,q9,#12
	veor	q3,q3,q9
	vext.8	q9,q0,q9,#12
	veor	q10,q10,q1
	veor	q3,q3,q9
	vshl.u8	q1,q1,#1
	veor	q3,q3,q10

	vtbl.8	d20,{q3},d4
	vtbl.8	d21,{q3},d5
	vext.8	q9,q0,q3,#12
	vst1.32	{q3},[r2]!
.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0

	veor	q3,q3,q9
	vext.8	q9,q0,q9,#12
	veor	q3,q3,q9
	vext.8	q9,q0,q9,#12
	veor	q10,q10,q1
	veor	q3,q3,q9
	veor	q3,q3,q10
	vst1.32	{q3},[r2]		@ last round key, no post-increment
	add	r2,r2,#0x50		@ r2 = schedule + 240 (160 stored + 80)

	mov	r12,#10			@ round count
	b	Ldone

.align	4
	@ 192-bit key: q3 holds the first 16 bytes, d16 the remaining 8.
L192:
	vld1.8	{d16},[r0]!
	vmov.i8	q10,#8			@ borrow q10
	vst1.32	{q3},[r2]!
	vsub.i8	q2,q2,q10	@ adjust the mask

Loop192:
	vtbl.8	d20,{q8},d4
	vtbl.8	d21,{q8},d5
	vext.8	q9,q0,q3,#12
	vst1.32	{d16},[r2]!
.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
	subs	r1,r1,#1

	veor	q3,q3,q9
	vext.8	q9,q0,q9,#12
	veor	q3,q3,q9
	vext.8	q9,q0,q9,#12
	veor	q3,q3,q9

	vdup.32	q9,d7[1]
	veor	q9,q9,q8
	veor	q10,q10,q1
	vext.8	q8,q0,q8,#12
	vshl.u8	q1,q1,#1
	veor	q8,q8,q9
	veor	q3,q3,q10
	veor	q8,q8,q10
	vst1.32	{q3},[r2]!
	bne	Loop192

	mov	r12,#12			@ round count
	add	r2,r2,#0x20		@ r2 = schedule + 240 (208 stored + 32)
	b	Ldone

.align	4
	@ 256-bit key: q3 holds the first 16 bytes, q8 the second 16.
	@ Seven iterations; the loop exits through the beq after the last
	@ round key has been stored, leaving r2 = schedule + 240.
L256:
	vld1.8	{q8},[r0]
	mov	r1,#7
	mov	r12,#14			@ round count
	vst1.32	{q3},[r2]!

Loop256:
	vtbl.8	d20,{q8},d4
	vtbl.8	d21,{q8},d5
	vext.8	q9,q0,q3,#12
	vst1.32	{q8},[r2]!
.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
	subs	r1,r1,#1

	veor	q3,q3,q9
	vext.8	q9,q0,q9,#12
	veor	q3,q3,q9
	vext.8	q9,q0,q9,#12
	veor	q10,q10,q1
	veor	q3,q3,q9
	vshl.u8	q1,q1,#1
	veor	q3,q3,q10
	vst1.32	{q3},[r2]!
	beq	Ldone

	@ Second half of the step: no rotation and no round constant, only
	@ the last word of q3 splatted and passed through aese.
	vdup.32	q10,d7[1]
	vext.8	q9,q0,q8,#12
.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0

	veor	q8,q8,q9
	vext.8	q9,q0,q9,#12
	veor	q8,q8,q9
	vext.8	q9,q0,q9,#12
	veor	q8,q8,q9

	veor	q8,q8,q10
	b	Loop256

Ldone:
	str	r12,[r2]		@ round count at schedule + 240
	mov	r3,#0

Lenc_key_abort:
	mov	r0,r3			@ return value

	bx	lr


@ aes_hw_set_decrypt_key: build a decryption key schedule.
@ In/Out: same registers and return codes as aes_hw_set_encrypt_key.
@ The encryption schedule is generated first through Lenc_key, then the
@ round keys are reversed in place. Every key except the first and the
@ last is additionally passed through aesimc.
@ Preserves r4 (saved on the stack together with lr).
.globl	_aes_hw_set_decrypt_key
.private_extern	_aes_hw_set_decrypt_key
#ifdef __thumb2__
.thumb_func	_aes_hw_set_decrypt_key
#endif
.align	5
_aes_hw_set_decrypt_key:
	stmdb	sp!,{r4,lr}
	bl	Lenc_key

	cmp	r0,#0
	bne	Ldec_key_abort		@ propagate the error code in r0

	sub	r2,r2,#240		@ restore original r2
	mov	r4,#-16			@ stride for walking r0 backwards
	add	r0,r2,r12,lsl#4	@ end of key schedule

	@ Swap the first and last round keys without transforming them.
	vld1.32	{q0},[r2]
	vld1.32	{q1},[r0]
	vst1.32	{q0},[r0],r4
	vst1.32	{q1},[r2]!

	@ Swap the remaining pairs from the outside in, applying aesimc to
	@ both keys, until the two pointers meet.
Loop_imc:
	vld1.32	{q0},[r2]
	vld1.32	{q1},[r0]
.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
	vst1.32	{q0},[r0],r4
	vst1.32	{q1},[r2]!
	cmp	r0,r2
	bhi	Loop_imc

	@ The middle round key stays where it is and is only transformed.
	vld1.32	{q0},[r2]
.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
	vst1.32	{q0},[r0]

	eor	r0,r0,r0		@ return value
Ldec_key_abort:
	ldmia	sp!,{r4,pc}

@ aes_hw_encrypt: encrypt one 16-byte block.
@ In:   r0 = pointer to the input block
@       r1 = pointer to the output block
@       r2 = pointer to the key schedule; the round count is read from
@            byte offset 240
@ Clobbers: r2, r3, q0-q2, flags.
.globl	_aes_hw_encrypt
.private_extern	_aes_hw_encrypt
#ifdef __thumb2__
.thumb_func	_aes_hw_encrypt
#endif
.align	5
_aes_hw_encrypt:
	AARCH64_VALID_CALL_TARGET
	ldr	r3,[r2,#240]		@ r3 = round count
	vld1.32	{q0},[r2]!		@ q0 = round key 0
	vld1.8	{q2},[r0]		@ q2 = block being processed
	sub	r3,r3,#2
	vld1.32	{q1},[r2]!		@ q1 = round key 1

	@ Two rounds per iteration; q0 and q1 are reloaded with the next two
	@ round keys while the current ones are consumed.
Loop_enc:
.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
	vld1.32	{q0},[r2]!
	subs	r3,r3,#2
.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
	vld1.32	{q1},[r2]!
	bgt	Loop_enc

	@ Last two rounds: the final one has no aesmc and ends with a plain
	@ xor of the last round key.
.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
	vld1.32	{q0},[r2]
.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
	veor	q2,q2,q0

	vst1.8	{q2},[r1]
	bx	lr

@ aes_hw_decrypt: decrypt one 16-byte block.
@ In:   r0 = pointer to the input block
@       r1 = pointer to the output block
@       r2 = pointer to the key schedule; the round count is read from
@            byte offset 240
@ Clobbers: r2, r3, q0-q2, flags.
@ Same structure as aes_hw_encrypt with aesd and aesimc in place of aese
@ and aesmc.
.globl	_aes_hw_decrypt
.private_extern	_aes_hw_decrypt
#ifdef __thumb2__
.thumb_func	_aes_hw_decrypt
#endif
.align	5
_aes_hw_decrypt:
	AARCH64_VALID_CALL_TARGET
	ldr	r3,[r2,#240]		@ r3 = round count
	vld1.32	{q0},[r2]!		@ q0 = round key 0
	vld1.8	{q2},[r0]		@ q2 = block being processed
	sub	r3,r3,#2
	vld1.32	{q1},[r2]!		@ q1 = round key 1

	@ Two rounds per iteration, reloading q0 and q1 as they are used.
Loop_dec:
.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
	vld1.32	{q0},[r2]!
	subs	r3,r3,#2
.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
	vld1.32	{q1},[r2]!
	bgt	Loop_dec

	@ Last two rounds: the final one has no aesimc and ends with a plain
	@ xor of the last round key.
.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
	vld1.32	{q0},[r2]
.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
	veor	q2,q2,q0

	vst1.8	{q2},[r1]
	bx	lr

@ aes_hw_cbc_encrypt: CBC-mode encryption or decryption.
@ In:   r0 = input pointer
@       r1 = output pointer
@       r2 = length in bytes; fewer than 16 returns immediately, otherwise
@            the length is rounded down to a whole number of blocks
@       r3 = key schedule pointer (round count at byte offset 240)
@       [sp]   = pointer to the 16-byte IV, updated on exit
@       [sp+4] = direction flag: zero selects decryption, non-zero
@                encryption
@ Preserves r4-r8 and d8-d15 (saved and restored on the stack).
@ Register roles after setup:
@   q6 = current chaining value, q7 = last round key,
@   q8/q9 = round keys 0 and 1, q10-q15 = the six round keys before the
@   last one, r8 = input stride (16, or 0 once the last block is loaded).
.globl	_aes_hw_cbc_encrypt
.private_extern	_aes_hw_cbc_encrypt
#ifdef __thumb2__
.thumb_func	_aes_hw_cbc_encrypt
#endif
.align	5
_aes_hw_cbc_encrypt:
	mov	ip,sp
	stmdb	sp!,{r4,r5,r6,r7,r8,lr}
	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}            @ ABI specification says so
	ldmia	ip,{r4,r5}		@ load remaining args
	subs	r2,r2,#16
	mov	r8,#16
	blo	Lcbc_abort		@ less than one block: nothing to do
	moveq	r8,#0			@ exactly one block: do not advance r0

	cmp	r5,#0			@ en- or decrypting?
					@ (flags are consumed by beq Lcbc_dec)
	ldr	r5,[r3,#240]		@ r5 = round count
	and	r2,r2,#-16		@ drop any partial trailing block
	vld1.8	{q6},[r4]		@ q6 = IV
	vld1.8	{q0},[r0],r8		@ q0 = first input block

	vld1.32	{q8,q9},[r3]		@ load key schedule...
	sub	r5,r5,#6
	add	r7,r3,r5,lsl#4	@ pointer to last 7 round keys
	sub	r5,r5,#2		@ r5 = rounds - 8 (2, 4 or 6)
	vld1.32	{q10,q11},[r7]!
	vld1.32	{q12,q13},[r7]!
	vld1.32	{q14,q15},[r7]!
	vld1.32	{q7},[r7]

	add	r7,r3,#32		@ r7 = address of round key 2
	mov	r6,r5
	beq	Lcbc_dec

	@ ---- Encryption ----
	@ q5 = round key 0 xor last round key. Xoring it into the next
	@ plaintext block lets the first aese of the next block operate on
	@ the state before the final key xor, while q6 receives the real
	@ ciphertext.
	cmp	r5,#2			@ 10 rounds: dedicated 128-bit loop
	veor	q0,q0,q6
	veor	q5,q8,q7
	beq	Lcbc_enc128

	@ 192/256-bit keys: keep pointers to the round keys that do not fit
	@ in registers. lr (r14) is used as a scratch pointer here.
	vld1.32	{q2,q3},[r7]		@ q2/q3 = round keys 2 and 3
	add	r7,r3,#16		@ round key 1
	add	r6,r3,#16*4		@ round key 4
	add	r12,r3,#16*5		@ round key 5
.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	add	r14,r3,#16*6		@ round key 6
	add	r3,r3,#16*7		@ round key 7
	b	Lenter_cbc_enc

.align	4
Loop_cbc_enc:
.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	vst1.8	{q6},[r1]!		@ store the previous ciphertext block
Lenter_cbc_enc:
.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	vld1.32	{q8},[r6]
	cmp	r5,#4			@ 12 rounds: skip the two extra rounds
.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	vld1.32	{q9},[r12]
	beq	Lcbc_enc192

.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	vld1.32	{q8},[r14]
.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	vld1.32	{q9},[r3]
	nop

Lcbc_enc192:
.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	subs	r2,r2,#16		@ flags are consumed by moveq and bhs
.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	moveq	r8,#0			@ last block: stop advancing r0
.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	vld1.8	{q8},[r0],r8		@ next plaintext block
.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	veor	q8,q8,q5
.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	vld1.32	{q9},[r7]		@ re-pre-load rndkey[1]
.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
	veor	q6,q0,q7		@ q6 = ciphertext block
	bhs	Loop_cbc_enc

	vst1.8	{q6},[r1]!
	b	Lcbc_done

.align	5
	@ 128-bit keys: every round key fits in registers
	@ (q8, q9, q2, q3, q10-q15, q7).
Lcbc_enc128:
	vld1.32	{q2,q3},[r7]
.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	b	Lenter_cbc_enc128
Loop_cbc_enc128:
.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	vst1.8	{q6},[r1]!		@ store the previous ciphertext block
Lenter_cbc_enc128:
.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	subs	r2,r2,#16		@ flags are consumed by moveq and bhs
.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	moveq	r8,#0			@ last block: stop advancing r0
.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	vld1.8	{q8},[r0],r8		@ next plaintext block
.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	veor	q8,q8,q5
.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
	veor	q6,q0,q7		@ q6 = ciphertext block
	bhs	Loop_cbc_enc128

	vst1.8	{q6},[r1]!
	b	Lcbc_done
.align	5
	@ ---- Decryption ----
	@ Three blocks are processed per iteration in q0, q1 and q10, with
	@ copies of the ciphertext kept in q2, q3 and q11 for chaining.
	@ Lcbc_dec_tail handles the final one or two blocks.
Lcbc_dec:
	vld1.8	{q10},[r0]!
	subs	r2,r2,#32		@ bias
	add	r6,r5,#2
	vorr	q3,q0,q0
	vorr	q1,q0,q0
	vorr	q11,q10,q10
	blo	Lcbc_dec_tail		@ fewer than three blocks in total

	vorr	q1,q10,q10
	vld1.8	{q10},[r0]!
	vorr	q2,q0,q0
	vorr	q3,q1,q1
	vorr	q11,q10,q10

Loop3x_cbc_dec:
.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
	vld1.32	{q8},[r7]!
	subs	r6,r6,#2
.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
	vld1.32	{q9},[r7]!
	bgt	Loop3x_cbc_dec

.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
	@ q4, q5 and q9 receive the chaining values pre-xored with the last
	@ round key, so a single veor finishes each block below.
	veor	q4,q6,q7
	subs	r2,r2,#0x30		@ flags are consumed by movlo and bhs
	veor	q5,q2,q7
	movlo	r6,r2			@ r6 is zero at this point
.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
	veor	q9,q3,q7
	add	r0,r0,r6		@ r0 is adjusted in such way that
					@ at exit from the loop q1-q10
					@ are loaded with last "words"
	vorr	q6,q11,q11		@ new chaining value = last ciphertext
	mov	r7,r3
.byte	0x68,0x03,0xb0,0xf3	@ aesd q0,q12
.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
.byte	0x68,0x43,0xf0,0xf3	@ aesd q10,q12
.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
	vld1.8	{q2},[r0]!
.byte	0x6a,0x03,0xb0,0xf3	@ aesd q0,q13
.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
.byte	0x6a,0x43,0xf0,0xf3	@ aesd q10,q13
.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
	vld1.8	{q3},[r0]!
.byte	0x6c,0x03,0xb0,0xf3	@ aesd q0,q14
.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
.byte	0x6c,0x43,0xf0,0xf3	@ aesd q10,q14
.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
	vld1.8	{q11},[r0]!
.byte	0x6e,0x03,0xb0,0xf3	@ aesd q0,q15
.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
.byte	0x6e,0x43,0xf0,0xf3	@ aesd q10,q15
	vld1.32	{q8},[r7]!	@ re-pre-load rndkey[0]
	add	r6,r5,#2
	veor	q4,q4,q0
	veor	q5,q5,q1
	veor	q10,q10,q9
	vld1.32	{q9},[r7]!	@ re-pre-load rndkey[1]
	vst1.8	{q4},[r1]!
	vorr	q0,q2,q2
	vst1.8	{q5},[r1]!
	vorr	q1,q3,q3
	vst1.8	{q10},[r1]!
	vorr	q10,q11,q11
	bhs	Loop3x_cbc_dec

	cmn	r2,#0x30		@ r2 == -48: no blocks left over
	beq	Lcbc_done
	nop

	@ One or two blocks remain, held in q1 and q10.
Lcbc_dec_tail:
.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
	vld1.32	{q8},[r7]!
	subs	r6,r6,#2
.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
	vld1.32	{q9},[r7]!
	bgt	Lcbc_dec_tail

.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
.byte	0x68,0x43,0xf0,0xf3	@ aesd q10,q12
.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
	cmn	r2,#0x20		@ r2 == -32: exactly one block left
					@ (flags are consumed by beq Lcbc_dec_one)
.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
.byte	0x6a,0x43,0xf0,0xf3	@ aesd q10,q13
.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
	veor	q5,q6,q7
.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
.byte	0x6c,0x43,0xf0,0xf3	@ aesd q10,q14
.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
	veor	q9,q3,q7
.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
.byte	0x6e,0x43,0xf0,0xf3	@ aesd q10,q15
	beq	Lcbc_dec_one
	veor	q5,q5,q1
	veor	q9,q9,q10
	vorr	q6,q11,q11
	vst1.8	{q5},[r1]!
	vst1.8	{q9},[r1]!
	b	Lcbc_done

Lcbc_dec_one:
	veor	q5,q5,q10
	vorr	q6,q11,q11
	vst1.8	{q5},[r1]!

Lcbc_done:
	vst1.8	{q6},[r4]		@ write the chaining value back to the IV
Lcbc_abort:
	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
	ldmia	sp!,{r4,r5,r6,r7,r8,pc}

@ aes_hw_ctr32_encrypt_blocks: CTR-mode encryption with a 32-bit counter
@ held in the last word of the counter block.
@ In:   r0 = input pointer
@       r1 = output pointer
@       r2 = number of 16-byte blocks
@       r3 = key schedule pointer (round count at byte offset 240)
@       [sp] = pointer to the 16-byte counter block; it is read but not
@              written back
@ Preserves r4-r10 and d8-d15 (saved and restored on the stack).
@ Register roles after setup:
@   r8 = counter in host byte order, q6 = staging copy of the counter
@   block, q7 = last round key, q8/q9 = round keys 0 and 1,
@   q12-q15 = the four round keys before the last one.
@ NOTE(review): a block count of zero is not special-cased; that case
@ takes the Lctr32_tail path, which still loads and stores blocks.
@ Presumably callers always pass at least one block - confirm.
.globl	_aes_hw_ctr32_encrypt_blocks
.private_extern	_aes_hw_ctr32_encrypt_blocks
#ifdef __thumb2__
.thumb_func	_aes_hw_ctr32_encrypt_blocks
#endif
.align	5
_aes_hw_ctr32_encrypt_blocks:
	mov	ip,sp
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,lr}
	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}            @ ABI specification says so
	ldr	r4, [ip]		@ load remaining arg
	ldr	r5,[r3,#240]		@ r5 = round count

	ldr	r8, [r4, #12]		@ r8 = counter word as stored in memory
	vld1.32	{q0},[r4]		@ q0 = full counter block

	vld1.32	{q8,q9},[r3]		@ load key schedule...
	sub	r5,r5,#4
	mov	r12,#16
	cmp	r2,#2			@ flags are consumed by movlo and bls
	add	r7,r3,r5,lsl#4	@ pointer to last 5 round keys
	sub	r5,r5,#2		@ r5 = rounds - 6 (4, 6 or 8)
	vld1.32	{q12,q13},[r7]!
	vld1.32	{q14,q15},[r7]!
	vld1.32	{q7},[r7]
	add	r7,r3,#32		@ r7 = address of round key 2
	mov	r6,r5
	movlo	r12,#0			@ fewer than two blocks: tail input stride 0

	@ ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
	@ affected by silicon errata #1742098 [0] and #1655431 [1],
	@ respectively, where the second instruction of an aese/aesmc
	@ instruction pair may execute twice if an interrupt is taken right
	@ after the first instruction consumes an input register of which a
	@ single 32-bit lane has been updated the last time it was modified.
	@
	@ This function uses a counter in one 32-bit lane. The vmov.32
	@ instructions could write to q1 and q10 directly, but that trips
	@ this bug. We write to q6 and copy to the final register as a
	@ workaround.
	@
	@ [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
	@ [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
#ifndef __ARMEB__
	rev	r8, r8
#endif
	add	r10, r8, #1
	vorr	q6,q0,q0
	rev	r10, r10
	vmov.32	d13[1],r10
	add	r8, r8, #2
	vorr	q1,q6,q6		@ q1 = counter block + 1
	bls	Lctr32_tail		@ one or two blocks: tail only
	rev	r12, r8
	vmov.32	d13[1],r12
	sub	r2,r2,#3		@ bias
	vorr	q10,q6,q6		@ q10 = counter block + 2
	b	Loop3x_ctr32

.align	4
	@ Three counter blocks (q0, q1, q10) are encrypted per iteration.
Loop3x_ctr32:
.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
.byte	0x20,0x43,0xf0,0xf3	@ aese q10,q8
.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
	vld1.32	{q8},[r7]!
	subs	r6,r6,#2
.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
.byte	0x22,0x43,0xf0,0xf3	@ aese q10,q9
.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
	vld1.32	{q9},[r7]!
	bgt	Loop3x_ctr32

	@ The remaining rounds continue in q4, q5 and q9 so that q0, q1 and
	@ q10 can be refilled with the next three counter values meanwhile.
.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
.byte	0x80,0x83,0xb0,0xf3	@ aesmc q4,q0
.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
.byte	0x82,0xa3,0xb0,0xf3	@ aesmc q5,q1
	vld1.8	{q2},[r0]!
	add	r9,r8,#1
.byte	0x20,0x43,0xf0,0xf3	@ aese q10,q8
.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
	vld1.8	{q3},[r0]!
	rev	r9,r9
.byte	0x22,0x83,0xb0,0xf3	@ aese q4,q9
.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
.byte	0x22,0xa3,0xb0,0xf3	@ aese q5,q9
.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
	vld1.8	{q11},[r0]!
	mov	r7,r3
.byte	0x22,0x43,0xf0,0xf3	@ aese q10,q9
.byte	0xa4,0x23,0xf0,0xf3	@ aesmc q9,q10
.byte	0x28,0x83,0xb0,0xf3	@ aese q4,q12
.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
.byte	0x28,0xa3,0xb0,0xf3	@ aese q5,q12
.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
	veor	q2,q2,q7		@ fold the last round key into the input
	add	r10,r8,#2
.byte	0x28,0x23,0xf0,0xf3	@ aese q9,q12
.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
	veor	q3,q3,q7
	add	r8,r8,#3
.byte	0x2a,0x83,0xb0,0xf3	@ aese q4,q13
.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
.byte	0x2a,0xa3,0xb0,0xf3	@ aese q5,q13
.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
	 @ Note the logic to update q0, q1, and q10 is written to work
	 @ around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
	 @ 32-bit mode. See the errata comment earlier in this function.
	veor	q11,q11,q7
	vmov.32	d13[1], r9
.byte	0x2a,0x23,0xf0,0xf3	@ aese q9,q13
.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
	vorr	q0,q6,q6
	rev	r10,r10
.byte	0x2c,0x83,0xb0,0xf3	@ aese q4,q14
.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
	vmov.32	d13[1], r10
	rev	r12,r8
.byte	0x2c,0xa3,0xb0,0xf3	@ aese q5,q14
.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
	vorr	q1,q6,q6
	vmov.32	d13[1], r12
.byte	0x2c,0x23,0xf0,0xf3	@ aese q9,q14
.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
	vorr	q10,q6,q6
	subs	r2,r2,#3		@ flags are consumed by bhs below
.byte	0x2e,0x83,0xb0,0xf3	@ aese q4,q15
.byte	0x2e,0xa3,0xb0,0xf3	@ aese q5,q15
.byte	0x2e,0x23,0xf0,0xf3	@ aese q9,q15

	veor	q2,q2,q4
	vld1.32	{q8},[r7]!	@ re-pre-load rndkey[0]
	vst1.8	{q2},[r1]!
	veor	q3,q3,q5
	mov	r6,r5
	vst1.8	{q3},[r1]!
	veor	q11,q11,q9
	vld1.32	{q9},[r7]!	@ re-pre-load rndkey[1]
	vst1.8	{q11},[r1]!
	bhs	Loop3x_ctr32

	adds	r2,r2,#3		@ undo the bias: r2 = blocks left (0-2)
	beq	Lctr32_done
	cmp	r2,#1
	mov	r12,#16
	moveq	r12,#0			@ one block left: input stride 0

	@ One or two blocks remain; both q0 and q1 are always encrypted and
	@ the second result is stored only when two blocks were requested.
Lctr32_tail:
.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
	vld1.32	{q8},[r7]!
	subs	r6,r6,#2
.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
	vld1.32	{q9},[r7]!
	bgt	Lctr32_tail

.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
	vld1.8	{q2},[r0],r12		@ stride 0 keeps r0 on the only block
.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
.byte	0x28,0x23,0xb0,0xf3	@ aese q1,q12
.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
	vld1.8	{q3},[r0]
.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
.byte	0x2a,0x23,0xb0,0xf3	@ aese q1,q13
.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
	veor	q2,q2,q7
.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
.byte	0x2c,0x23,0xb0,0xf3	@ aese q1,q14
.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
	veor	q3,q3,q7
.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
.byte	0x2e,0x23,0xb0,0xf3	@ aese q1,q15

	cmp	r2,#1
	veor	q2,q2,q0
	veor	q3,q3,q1
	vst1.8	{q2},[r1]!
	beq	Lctr32_done		@ single block: skip the second store
	vst1.8	{q3},[r1]

Lctr32_done:
	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,pc}

#endif
#endif  // !OPENSSL_NO_ASM
grpc 1.37.0 依赖 2 years ago			`// This file is generated from a similarly-named Perl script in the BoringSSL`
			`// source tree. Do not edit by hand.`

			`#if !defined(__has_feature)`
			`#define __has_feature(x) 0`
			`#endif`
			`#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)`
			`#define OPENSSL_NO_ASM`
			`#endif`

			`#if !defined(OPENSSL_NO_ASM)`
			`#if defined(BORINGSSL_PREFIX)`
			`#include <boringssl_prefix_symbols_asm.h>`
			`#endif`
			`#include <openssl/arm_arch.h>`

			`#if __ARM_MAX_ARCH__>=7`
			`.text`


			`.code 32`
			`#undef __thumb2__`
			`.align 5`
			`Lrcon:`
			`.long 0x01,0x01,0x01,0x01`
			`.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d @ rotate-n-splat`
			`.long 0x1b,0x1b,0x1b,0x1b`

			`.text`

			`.globl _aes_hw_set_encrypt_key`
			`.private_extern _aes_hw_set_encrypt_key`
			`#ifdef __thumb2__`
			`.thumb_func _aes_hw_set_encrypt_key`
			`#endif`
			`.align 5`
			`_aes_hw_set_encrypt_key:`
			`Lenc_key:`
			`mov r3,#-1`
			`cmp r0,#0`
			`beq Lenc_key_abort`
			`cmp r2,#0`
			`beq Lenc_key_abort`
			`mov r3,#-2`
			`cmp r1,#128`
			`blt Lenc_key_abort`
			`cmp r1,#256`
			`bgt Lenc_key_abort`
			`tst r1,#0x3f`
			`bne Lenc_key_abort`

			`adr r3,Lrcon`
			`cmp r1,#192`

			`veor q0,q0,q0`
			`vld1.8 {q3},[r0]!`
			`mov r1,#8 @ reuse r1`
			`vld1.32 {q1,q2},[r3]!`

			`blt Loop128`
			`beq L192`
			`b L256`

			`.align 4`
			`Loop128:`
			`vtbl.8 d20,{q3},d4`
			`vtbl.8 d21,{q3},d5`
			`vext.8 q9,q0,q3,#12`
			`vst1.32 {q3},[r2]!`
			`.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0`
			`subs r1,r1,#1`

			`veor q3,q3,q9`
			`vext.8 q9,q0,q9,#12`
			`veor q3,q3,q9`
			`vext.8 q9,q0,q9,#12`
			`veor q10,q10,q1`
			`veor q3,q3,q9`
			`vshl.u8 q1,q1,#1`
			`veor q3,q3,q10`
			`bne Loop128`

			`vld1.32 {q1},[r3]`

			`vtbl.8 d20,{q3},d4`
			`vtbl.8 d21,{q3},d5`
			`vext.8 q9,q0,q3,#12`
			`vst1.32 {q3},[r2]!`
			`.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0`

			`veor q3,q3,q9`
			`vext.8 q9,q0,q9,#12`
			`veor q3,q3,q9`
			`vext.8 q9,q0,q9,#12`
			`veor q10,q10,q1`
			`veor q3,q3,q9`
			`vshl.u8 q1,q1,#1`
			`veor q3,q3,q10`

			`vtbl.8 d20,{q3},d4`
			`vtbl.8 d21,{q3},d5`
			`vext.8 q9,q0,q3,#12`
			`vst1.32 {q3},[r2]!`
			`.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0`

			`veor q3,q3,q9`
			`vext.8 q9,q0,q9,#12`
			`veor q3,q3,q9`
			`vext.8 q9,q0,q9,#12`
			`veor q10,q10,q1`
			`veor q3,q3,q9`
			`veor q3,q3,q10`
			`vst1.32 {q3},[r2]`
			`add r2,r2,#0x50`

			`mov r12,#10`
			`b Ldone`

			`.align 4`
			`L192:`
			`vld1.8 {d16},[r0]!`
			`vmov.i8 q10,#8 @ borrow q10`
			`vst1.32 {q3},[r2]!`
			`vsub.i8 q2,q2,q10 @ adjust the mask`

			`Loop192:`
			`vtbl.8 d20,{q8},d4`
			`vtbl.8 d21,{q8},d5`
			`vext.8 q9,q0,q3,#12`
			`vst1.32 {d16},[r2]!`
			`.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0`
			`subs r1,r1,#1`

			`veor q3,q3,q9`
			`vext.8 q9,q0,q9,#12`
			`veor q3,q3,q9`
			`vext.8 q9,q0,q9,#12`
			`veor q3,q3,q9`

			`vdup.32 q9,d7[1]`
			`veor q9,q9,q8`
			`veor q10,q10,q1`
			`vext.8 q8,q0,q8,#12`
			`vshl.u8 q1,q1,#1`
			`veor q8,q8,q9`
			`veor q3,q3,q10`
			`veor q8,q8,q10`
			`vst1.32 {q3},[r2]!`
			`bne Loop192`

			`mov r12,#12`
			`add r2,r2,#0x20`
			`b Ldone`

			`.align 4`
			`L256:`
			`vld1.8 {q8},[r0]`
			`mov r1,#7`
			`mov r12,#14`
			`vst1.32 {q3},[r2]!`

			`Loop256:`
			`vtbl.8 d20,{q8},d4`
			`vtbl.8 d21,{q8},d5`
			`vext.8 q9,q0,q3,#12`
			`vst1.32 {q8},[r2]!`
			`.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0`
			`subs r1,r1,#1`

			`veor q3,q3,q9`
			`vext.8 q9,q0,q9,#12`
			`veor q3,q3,q9`
			`vext.8 q9,q0,q9,#12`
			`veor q10,q10,q1`
			`veor q3,q3,q9`
			`vshl.u8 q1,q1,#1`
			`veor q3,q3,q10`
			`vst1.32 {q3},[r2]!`
			`beq Ldone`

			`vdup.32 q10,d7[1]`
			`vext.8 q9,q0,q8,#12`
			`.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0`

			`veor q8,q8,q9`
			`vext.8 q9,q0,q9,#12`
			`veor q8,q8,q9`
			`vext.8 q9,q0,q9,#12`
			`veor q8,q8,q9`

			`veor q8,q8,q10`
			`b Loop256`

			`Ldone:`
			`str r12,[r2]`
			`mov r3,#0`

			`Lenc_key_abort:`
			`mov r0,r3 @ return value`

			`bx lr`


			`.globl _aes_hw_set_decrypt_key`
			`.private_extern _aes_hw_set_decrypt_key`
			`#ifdef __thumb2__`
			`.thumb_func _aes_hw_set_decrypt_key`
			`#endif`
			`.align 5`
			`_aes_hw_set_decrypt_key:`
			`stmdb sp!,{r4,lr}`
			`bl Lenc_key`

			`cmp r0,#0`
			`bne Ldec_key_abort`

			`sub r2,r2,#240 @ restore original r2`
			`mov r4,#-16`
			`add r0,r2,r12,lsl#4 @ end of key schedule`

			`vld1.32 {q0},[r2]`
			`vld1.32 {q1},[r0]`
			`vst1.32 {q0},[r0],r4`
			`vst1.32 {q1},[r2]!`

			`Loop_imc:`
			`vld1.32 {q0},[r2]`
			`vld1.32 {q1},[r0]`
			`.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0`
			`.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1`
			`vst1.32 {q0},[r0],r4`
			`vst1.32 {q1},[r2]!`
			`cmp r0,r2`
			`bhi Loop_imc`

			`vld1.32 {q0},[r2]`
			`.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0`
			`vst1.32 {q0},[r0]`

			`eor r0,r0,r0 @ return value`
			`Ldec_key_abort:`
			`ldmia sp!,{r4,pc}`

			`.globl _aes_hw_encrypt`
			`.private_extern _aes_hw_encrypt`
			`#ifdef __thumb2__`
			`.thumb_func _aes_hw_encrypt`
			`#endif`
			`.align 5`
			`_aes_hw_encrypt:`
			`AARCH64_VALID_CALL_TARGET`
			`ldr r3,[r2,#240]`
			`vld1.32 {q0},[r2]!`
			`vld1.8 {q2},[r0]`
			`sub r3,r3,#2`
			`vld1.32 {q1},[r2]!`

			`Loop_enc:`
			`.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0`
			`.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2`
			`vld1.32 {q0},[r2]!`
			`subs r3,r3,#2`
			`.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1`
			`.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2`
			`vld1.32 {q1},[r2]!`
			`bgt Loop_enc`

			`.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0`
			`.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2`
			`vld1.32 {q0},[r2]`
			`.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1`
			`veor q2,q2,q0`

			`vst1.8 {q2},[r1]`
			`bx lr`

			`.globl _aes_hw_decrypt`
			`.private_extern _aes_hw_decrypt`
			`#ifdef __thumb2__`
			`.thumb_func _aes_hw_decrypt`
			`#endif`
			`.align 5`
			`_aes_hw_decrypt:`
			`AARCH64_VALID_CALL_TARGET`
			`ldr r3,[r2,#240]`
			`vld1.32 {q0},[r2]!`
			`vld1.8 {q2},[r0]`
			`sub r3,r3,#2`
			`vld1.32 {q1},[r2]!`

			`Loop_dec:`
			`.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0`
			`.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2`
			`vld1.32 {q0},[r2]!`
			`subs r3,r3,#2`
			`.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1`
			`.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2`
			`vld1.32 {q1},[r2]!`
			`bgt Loop_dec`

			`.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0`
			`.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2`
			`vld1.32 {q0},[r2]`
			`.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1`
			`veor q2,q2,q0`

			`vst1.8 {q2},[r1]`
			`bx lr`

			`.globl _aes_hw_cbc_encrypt`
			`.private_extern _aes_hw_cbc_encrypt`
			`#ifdef __thumb2__`
			`.thumb_func _aes_hw_cbc_encrypt`
			`#endif`
			`.align 5`
			`_aes_hw_cbc_encrypt:`
			`mov ip,sp`
			`stmdb sp!,{r4,r5,r6,r7,r8,lr}`
			`vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so`
			`ldmia ip,{r4,r5} @ load remaining args`
			`subs r2,r2,#16`
			`mov r8,#16`
			`blo Lcbc_abort`
			`moveq r8,#0`

			`cmp r5,#0 @ en- or decrypting?`
			`ldr r5,[r3,#240]`
			`and r2,r2,#-16`
			`vld1.8 {q6},[r4]`
			`vld1.8 {q0},[r0],r8`

			`vld1.32 {q8,q9},[r3] @ load key schedule...`
			`sub r5,r5,#6`
			`add r7,r3,r5,lsl#4 @ pointer to last 7 round keys`
			`sub r5,r5,#2`
			`vld1.32 {q10,q11},[r7]!`
			`vld1.32 {q12,q13},[r7]!`
			`vld1.32 {q14,q15},[r7]!`
			`vld1.32 {q7},[r7]`

			`add r7,r3,#32`
			`mov r6,r5`
			`beq Lcbc_dec`

			`cmp r5,#2`
			`veor q0,q0,q6`
			`veor q5,q8,q7`
			`beq Lcbc_enc128`

			`vld1.32 {q2,q3},[r7]`
			`add r7,r3,#16`
			`add r6,r3,#16*4`
			`add r12,r3,#16*5`
			`.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`add r14,r3,#16*6`
			`add r3,r3,#16*7`
			`b Lenter_cbc_enc`

			`.align 4`
			`Loop_cbc_enc:`
			`.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`vst1.8 {q6},[r1]!`
			`Lenter_cbc_enc:`
			`.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`vld1.32 {q8},[r6]`
			`cmp r5,#4`
			`.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`vld1.32 {q9},[r12]`
			`beq Lcbc_enc192`

			`.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`vld1.32 {q8},[r14]`
			`.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`vld1.32 {q9},[r3]`
			`nop`

			`Lcbc_enc192:`
			`.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`subs r2,r2,#16`
			`.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`moveq r8,#0`
			`.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`vld1.8 {q8},[r0],r8`
			`.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`veor q8,q8,q5`
			`.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`vld1.32 {q9},[r7] @ re-pre-load rndkey[1]`
			`.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15`
			`veor q6,q0,q7`
			`bhs Loop_cbc_enc`

			`vst1.8 {q6},[r1]!`
			`b Lcbc_done`

			`.align 5`
			`Lcbc_enc128:`
			`vld1.32 {q2,q3},[r7]`
			`.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`b Lenter_cbc_enc128`
			`Loop_cbc_enc128:`
			`.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`vst1.8 {q6},[r1]!`
			`Lenter_cbc_enc128:`
			`.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`subs r2,r2,#16`
			`.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`moveq r8,#0`
			`.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`vld1.8 {q8},[r0],r8`
			`.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`veor q8,q8,q5`
			`.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15`
			`veor q6,q0,q7`
			`bhs Loop_cbc_enc128`

			`vst1.8 {q6},[r1]!`
			`b Lcbc_done`
			`.align 5`
			`Lcbc_dec:`
			`vld1.8 {q10},[r0]!`
			`subs r2,r2,#32 @ bias`
			`add r6,r5,#2`
			`vorr q3,q0,q0`
			`vorr q1,q0,q0`
			`vorr q11,q10,q10`
			`blo Lcbc_dec_tail`

			`vorr q1,q10,q10`
			`vld1.8 {q10},[r0]!`
			`vorr q2,q0,q0`
			`vorr q3,q1,q1`
			`vorr q11,q10,q10`

			`Loop3x_cbc_dec:`
			`.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8`
			`.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0`
			`.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8`
			`.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1`
			`.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8`
			`.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10`
			`vld1.32 {q8},[r7]!`
			`subs r6,r6,#2`
			`.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9`
			`.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0`
			`.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9`
			`.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1`
			`.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9`
			`.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10`
			`vld1.32 {q9},[r7]!`
			`bgt Loop3x_cbc_dec`

			`.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8`
			`.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0`
			`.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8`
			`.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1`
			`.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8`
			`.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10`
			`veor q4,q6,q7`
			`subs r2,r2,#0x30`
			`veor q5,q2,q7`
			`movlo r6,r2 @ r6, r6, is zero at this point`
			`.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9`
			`.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0`
			`.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9`
			`.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1`
			`.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9`
			`.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10`
			`veor q9,q3,q7`
			`add r0,r0,r6 @ r0 is adjusted in such way that`
			`@ at exit from the loop q1-q10`
			`@ are loaded with last "words"`
			`vorr q6,q11,q11`
			`mov r7,r3`
			`.byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12`
			`.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0`
			`.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12`
			`.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1`
			`.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12`
			`.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10`
			`vld1.8 {q2},[r0]!`
			`.byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13`
			`.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0`
			`.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13`
			`.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1`
			`.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13`
			`.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10`
			`vld1.8 {q3},[r0]!`
			`.byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14`
			`.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0`
			`.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14`
			`.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1`
			`.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14`
			`.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10`
			`vld1.8 {q11},[r0]!`
			`.byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15`
			`.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15`
			`.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15`
			`vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]`
			`add r6,r5,#2`
			`veor q4,q4,q0`
			`veor q5,q5,q1`
			`veor q10,q10,q9`
			`vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]`
			`vst1.8 {q4},[r1]!`
			`vorr q0,q2,q2`
			`vst1.8 {q5},[r1]!`
			`vorr q1,q3,q3`
			`vst1.8 {q10},[r1]!`
			`vorr q10,q11,q11`
			`bhs Loop3x_cbc_dec`

			`cmn r2,#0x30`
			`beq Lcbc_done`
			`nop`

			`Lcbc_dec_tail:`
			`.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8`
			`.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1`
			`.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8`
			`.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10`
			`vld1.32 {q8},[r7]!`
			`subs r6,r6,#2`
			`.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9`
			`.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1`
			`.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9`
			`.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10`
			`vld1.32 {q9},[r7]!`
			`bgt Lcbc_dec_tail`

			`.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8`
			`.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1`
			`.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8`
			`.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10`
			`.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9`
			`.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1`
			`.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9`
			`.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10`
			`.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12`
			`.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1`
			`.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12`
			`.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10`
			`cmn r2,#0x20`
			`.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13`
			`.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1`
			`.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13`
			`.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10`
			`veor q5,q6,q7`
			`.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14`
			`.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1`
			`.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14`
			`.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10`
			`veor q9,q3,q7`
			`.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15`
			`.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15`
			`beq Lcbc_dec_one`
			`veor q5,q5,q1`
			`veor q9,q9,q10`
			`vorr q6,q11,q11`
			`vst1.8 {q5},[r1]!`
			`vst1.8 {q9},[r1]!`
			`b Lcbc_done`

			`Lcbc_dec_one:`
			`veor q5,q5,q10`
			`vorr q6,q11,q11`
			`vst1.8 {q5},[r1]!`

			`Lcbc_done:`
			`vst1.8 {q6},[r4]`
			`Lcbc_abort:`
			`vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}`
			`ldmia sp!,{r4,r5,r6,r7,r8,pc}`

			@ _aes_hw_ctr32_encrypt_blocks
			@
			@ Counter-mode AES: encrypts successive counter blocks with the
			@ ARMv8 AES instructions (emitted as raw .byte encodings) and XORs
			@ the result into the input, 16 bytes at a time.
			@
			@ Register use, as read from the code below:
			@   r0       input pointer, advanced as blocks are loaded
			@   r1       output pointer, advanced as blocks are stored
			@   r2       number of 16-byte blocks to process
			@   r3       key schedule; the word at offset 240 is loaded into r5
			@            and used as the round count
			@   [sp]     fifth argument: pointer to the 16-byte counter block (r4)
			@   r8       running counter, taken from the last word of that block
			@   r7       cursor into the key schedule
			@   r6       per-pass round counter (reloaded from r5)
			@   q6       staging copy of the counter block (d13[1] = counter lane)
			@   q7       last round key, XORed into the input blocks
			@   q8,q9    rolling pair of round keys, reloaded through r7
			@   q12-q15  the four round keys that precede the last one
			@
			@ NOTE(review): r2 == 0 is not special-cased; the tail path would
			@ still load and store data. Presumably callers guarantee at least
			@ one block -- confirm against the C declaration.
			`.globl _aes_hw_ctr32_encrypt_blocks`
			`.private_extern _aes_hw_ctr32_encrypt_blocks`
			`#ifdef __thumb2__`
			`.thumb_func _aes_hw_ctr32_encrypt_blocks`
			`#endif`
			`.align 5`
			`_aes_hw_ctr32_encrypt_blocks:`
			@ ip keeps the incoming sp so the stacked fifth argument can still
			@ be reached after the register saves below.
			`mov ip,sp`
			`stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,lr}`
			`vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so`
			`ldr r4, [ip] @ load remaining arg`
			`ldr r5,[r3,#240]`

			@ r8 = last 32-bit word of the counter block, q0 = the whole block.
			`ldr r8, [r4, #12]`
			`vld1.32 {q0},[r4]`

			`vld1.32 {q8,q9},[r3] @ load key schedule...`
			@ r5-4 indexes the key schedule in 16-byte units to find the final
			@ five round keys; after the further -2 it is the count consumed two
			@ at a time by the round loops (copied into r6).
			`sub r5,r5,#4`
			@ r12 is the post-increment used by the tail's first input load:
			@ 16 by default, forced to 0 below when r2 < 2.
			`mov r12,#16`
			`cmp r2,#2`
			`add r7,r3,r5,lsl#4 @ pointer to last 5 round keys`
			`sub r5,r5,#2`
			`vld1.32 {q12,q13},[r7]!`
			`vld1.32 {q14,q15},[r7]!`
			`vld1.32 {q7},[r7]`
			@ r7 now points at the third round key (q8,q9 hold the first two).
			`add r7,r3,#32`
			`mov r6,r5`
			`movlo r12,#0`

			`@ ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are`
			`@ affected by silicon errata #1742098 [0] and #1655431 [1],`
			`@ respectively, where the second instruction of an aese/aesmc`
			`@ instruction pair may execute twice if an interrupt is taken right`
			`@ after the first instruction consumes an input register of which a`
			`@ single 32-bit lane has been updated the last time it was modified.`
			`@`
			`@ This function uses a counter in one 32-bit lane. The vmov.32 lines`
			`@ could write to q1 and q10 directly, but that trips these bugs.`
			`@ We write to q6 and copy to the final register as a workaround.`
			`@`
			`@ [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice`
			`@ [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice`
			`#ifndef __ARMEB__`
			`rev r8, r8`
			`#endif`
			@ Build the second counter block (counter+1) in q6 and copy it to q1.
			@ The flags tested by bls are still those of "cmp r2,#2" above.
			`add r10, r8, #1`
			`vorr q6,q0,q0`
			`rev r10, r10`
			`vmov.32 d13[1],r10`
			`add r8, r8, #2`
			`vorr q1,q6,q6`
			`bls Lctr32_tail`
			@ More than two blocks: build the third counter block (counter+2)
			@ in q10 and enter the three-blocks-per-pass loop.
			`rev r12, r8`
			`vmov.32 d13[1],r12`
			`sub r2,r2,#3 @ bias`
			`vorr q10,q6,q6`
			`b Loop3x_ctr32`

			`.align 4`
			@ Round loop: two round keys (q8, then q9) are applied to q0, q1 and
			@ q10 per iteration, with the next pair fetched through r7.
			`Loop3x_ctr32:`
			`.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8`
			`.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1`
			`.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8`
			`.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10`
			`vld1.32 {q8},[r7]!`
			`subs r6,r6,#2`
			`.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9`
			`.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1`
			`.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9`
			`.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10`
			`vld1.32 {q9},[r7]!`
			`bgt Loop3x_ctr32`

			@ Remaining rounds. The cipher state moves to q4, q5 and q9 so that
			@ q0, q1 and q10 can be refilled with the next three counter blocks
			@ (counter values r9, r10 and r8) while three input blocks are
			@ loaded into q2, q3 and q11.
			`.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8`
			`.byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0`
			`.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8`
			`.byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1`
			`vld1.8 {q2},[r0]!`
			`add r9,r8,#1`
			`.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8`
			`.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10`
			`vld1.8 {q3},[r0]!`
			`rev r9,r9`
			`.byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9`
			`.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4`
			`.byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9`
			`.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5`
			`vld1.8 {q11},[r0]!`
			`mov r7,r3`
			`.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9`
			`.byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10`
			`.byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12`
			`.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4`
			`.byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12`
			`.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5`
			`veor q2,q2,q7`
			`add r10,r8,#2`
			`.byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12`
			`.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9`
			`veor q3,q3,q7`
			`add r8,r8,#3`
			`.byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13`
			`.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4`
			`.byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13`
			`.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5`
			`@ Note the logic to update q0, q1, and q10 is written to work`
			`@ around a bug in ARM Cortex-A57 and Cortex-A72 cores running in`
			`@ 32-bit mode. See the comment above.`
			`veor q11,q11,q7`
			`vmov.32 d13[1], r9`
			`.byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13`
			`.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9`
			`vorr q0,q6,q6`
			`rev r10,r10`
			`.byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14`
			`.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4`
			`vmov.32 d13[1], r10`
			`rev r12,r8`
			`.byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14`
			`.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5`
			`vorr q1,q6,q6`
			`vmov.32 d13[1], r12`
			`.byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14`
			`.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9`
			`vorr q10,q6,q6`
			@ The flags set here are the ones consumed by "bhs" at the end of
			@ the pass; nothing in between modifies them.
			`subs r2,r2,#3`
			`.byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15`
			`.byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15`
			`.byte 0x2e,0x23,0xf0,0xf3 @ aese q9,q15`

			@ q2, q3 and q11 already hold input XOR last round key; XOR in the
			@ cipher state and store three output blocks. r6 and q8/q9 are
			@ re-primed for the next pass.
			`veor q2,q2,q4`
			`vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]`
			`vst1.8 {q2},[r1]!`
			`veor q3,q3,q5`
			`mov r6,r5`
			`vst1.8 {q3},[r1]!`
			`veor q11,q11,q9`
			`vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]`
			`vst1.8 {q11},[r1]!`
			`bhs Loop3x_ctr32`

			@ Undo the bias: r2 is the number of blocks still to do. Zero means
			@ finished; otherwise r12 becomes 0 for one block or 16 for two.
			`adds r2,r2,#3`
			`beq Lctr32_done`
			`cmp r2,#1`
			`mov r12,#16`
			`moveq r12,#0`

			@ Tail: same two-round-keys-per-iteration loop, on q0 and q1 only.
			`Lctr32_tail:`
			`.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8`
			`.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1`
			`vld1.32 {q8},[r7]!`
			`subs r6,r6,#2`
			`.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9`
			`.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1`
			`vld1.32 {q9},[r7]!`
			`bgt Lctr32_tail`

			@ Remaining rounds for the tail. The first input load advances r0
			@ by r12, so with r12 == 0 the second load re-reads the same block
			@ (presumably to avoid reading past the end of the input).
			`.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8`
			`.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1`
			`.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9`
			`.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1`
			`vld1.8 {q2},[r0],r12`
			`.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`.byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12`
			`.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1`
			`vld1.8 {q3},[r0]`
			`.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`.byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13`
			`.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1`
			`veor q2,q2,q7`
			`.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14`
			`.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0`
			`.byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14`
			`.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1`
			`veor q3,q3,q7`
			`.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15`
			`.byte 0x2e,0x23,0xb0,0xf3 @ aese q1,q15`

			@ The first output block is always stored; the second store is
			@ skipped when r2 == 1.
			`cmp r2,#1`
			`veor q2,q2,q0`
			`veor q3,q3,q1`
			`vst1.8 {q2},[r1]!`
			`beq Lctr32_done`
			`vst1.8 {q3},[r1]`

			@ Epilogue: restore d8-d15 and the saved core registers; loading pc
			@ from the saved lr slot returns to the caller.
			`Lctr32_done:`
			`vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}`
			`ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,pc}`

			`#endif`
			`#endif // !OPENSSL_NO_ASM`