@ -1045,3 +1045,405 @@ function ff_hscale16to15_X4_neon_asm, export=1
ret
ret
endfunc
endfunc
function ff_hscale16to19_4_neon_asm, export=1
        // Horizontal scaling with exactly 4 taps per output sample.
        // Input samples are unsigned 16-bit, output is 32-bit clipped to 19 bits:
        //   dst[i] = min((sum_{j=0..3} src16[filterPos[i] + j] * filter[4 * i + j]) >> shift,
        //                (1 << 19) - 1)
        // Products and sums are formed in 32-bit lanes.
        //
        // Arguments:
        //   w0   int shift
        //   x1   int32_t *dst
        //   w2   int dstW
        //   x3   const uint8_t *src        (addressed as uint16_t samples)
        //   x4   const int16_t *filter     (coefficients are sign-extended below)
        //   x5   const int32_t *filterPos
        //   w6   int filterSize            (not read here; 4 taps are always consumed)
        //
        // Register roles:
        //   v18  clip limit (1 << 19) - 1 in every lane
        //   v17  -shift in every lane, so sshl performs a right shift
        //   x8-x15 and 64 bytes of stack stage the samples of 8 outputs

        movi            v18.4s, #1
        movi            v17.4s, #1
        shl             v18.4s, v18.4s, #19
        sub             v18.4s, v18.4s, v17.4s          // v18 = (1 << 19) - 1
        dup             v17.4s, w0                      // broadcast shift
        neg             v17.4s, v17.4s                  // negative count => sshl shifts right

        cmp             w2, #16
        b.lt            2f                              // fewer than 16 outputs: scalar path only

        // Prime the pipeline: fetch filterPos[0..7] ...
        ldp             w8, w9, [x5]
        ldp             w10, w11, [x5, #8]
        ldp             w12, w13, [x5, #16]
        ldp             w14, w15, [x5, #24]
        add             x5, x5, #32

        // ... turn sample indices into byte offsets (2 bytes per sample) ...
        lsl             x8, x8, #1
        lsl             x9, x9, #1
        lsl             x10, x10, #1
        lsl             x11, x11, #1
        lsl             x12, x12, #1
        lsl             x13, x13, #1
        lsl             x14, x14, #1
        lsl             x15, x15, #1

        // ... and pull 4 consecutive samples (8 bytes) for each of the 8 outputs.
        ldr             x8, [x3, w8, uxtw]
        ldr             x9, [x3, w9, uxtw]
        ldr             x10, [x3, w10, uxtw]
        ldr             x11, [x3, w11, uxtw]
        ldr             x12, [x3, w12, uxtw]
        ldr             x13, [x3, w13, uxtw]
        ldr             x14, [x3, w14, uxtw]
        ldr             x15, [x3, w15, uxtw]

        // Spill the gathered samples to a 64-byte scratch area so that a single
        // ld4 can bring them into vector registers, de-interleaved by tap.
        sub             sp, sp, #64
        stp             x8, x9, [sp]
        stp             x10, x11, [sp, #16]
        stp             x12, x13, [sp, #32]
        stp             x14, x15, [sp, #48]

1:
        // v0..v3  = tap 0..3 samples of the 8 outputs
        // v28..v31 = tap 0..3 coefficients of the same 8 outputs
        ld4             {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
        ld4             {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64

        // Per tap: widen samples (unsigned) and coefficients (signed) to 32 bits,
        // then multiply (tap 0) or multiply-accumulate (taps 1..3).
        // v5 collects outputs 0..3, v6 collects outputs 4..7.
        // Widening is required because a uint16_t sample does not fit int16_t.
        uxtl            v26.4s, v0.4h
        sxtl            v27.4s, v28.4h
        uxtl2           v0.4s, v0.8h
        mul             v5.4s, v26.4s, v27.4s
        sxtl2           v28.4s, v28.8h
        uxtl            v26.4s, v1.4h
        mul             v6.4s, v0.4s, v28.4s

        sxtl            v27.4s, v29.4h
        uxtl2           v0.4s, v1.8h
        mla             v5.4s, v26.4s, v27.4s
        sxtl2           v28.4s, v29.8h
        uxtl            v26.4s, v2.4h
        mla             v6.4s, v0.4s, v28.4s

        sxtl            v27.4s, v30.4h
        uxtl2           v0.4s, v2.8h
        mla             v5.4s, v26.4s, v27.4s
        sxtl2           v28.4s, v30.8h
        uxtl            v26.4s, v3.4h
        mla             v6.4s, v0.4s, v28.4s

        sxtl            v27.4s, v31.4h
        uxtl2           v0.4s, v3.8h
        mla             v5.4s, v26.4s, v27.4s
        sxtl2           v28.4s, v31.8h
        sub             w2, w2, #8                      // 8 outputs done
        mla             v6.4s, v0.4s, v28.4s

        sshl            v5.4s, v5.4s, v17.4s            // >> shift
        sshl            v6.4s, v6.4s, v17.4s
        smin            v5.4s, v5.4s, v18.4s            // clip to 19 bits
        smin            v6.4s, v6.4s, v18.4s

        st1             {v5.4s, v6.4s}, [x1], #32
        cmp             w2, #16                         // flags are consumed by b.ge below

        // Gather the samples for the next group of 8 outputs. None of the
        // instructions up to the branch modify the condition flags.
        ldp             w8, w9, [x5]
        ldp             w10, w11, [x5, #8]
        ldp             w12, w13, [x5, #16]
        ldp             w14, w15, [x5, #24]
        add             x5, x5, #32
        lsl             x8, x8, #1
        lsl             x9, x9, #1
        lsl             x10, x10, #1
        lsl             x11, x11, #1
        lsl             x12, x12, #1
        lsl             x13, x13, #1
        lsl             x14, x14, #1
        lsl             x15, x15, #1
        ldr             x8, [x3, w8, uxtw]
        ldr             x9, [x3, w9, uxtw]
        ldr             x10, [x3, w10, uxtw]
        ldr             x11, [x3, w11, uxtw]
        ldr             x12, [x3, w12, uxtw]
        ldr             x13, [x3, w13, uxtw]
        ldr             x14, [x3, w14, uxtw]
        ldr             x15, [x3, w15, uxtw]
        stp             x8, x9, [sp]
        stp             x10, x11, [sp, #16]
        stp             x12, x13, [sp, #32]
        stp             x14, x15, [sp, #48]
        b.ge            1b                              // at least 16 outputs still pending

        // Final 8-wide pass: consume what was staged above, stage nothing new.
        ld4             {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
        ld4             {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64

        uxtl            v26.4s, v0.4h
        sxtl            v27.4s, v28.4h
        uxtl2           v0.4s, v0.8h
        mul             v5.4s, v26.4s, v27.4s
        sxtl2           v28.4s, v28.8h
        uxtl            v26.4s, v1.4h
        mul             v6.4s, v0.4s, v28.4s

        sxtl            v27.4s, v29.4h
        uxtl2           v0.4s, v1.8h
        mla             v5.4s, v26.4s, v27.4s
        sxtl2           v28.4s, v29.8h
        uxtl            v26.4s, v2.4h
        mla             v6.4s, v0.4s, v28.4s

        sxtl            v27.4s, v30.4h
        uxtl2           v0.4s, v2.8h
        mla             v5.4s, v26.4s, v27.4s
        sxtl2           v28.4s, v30.8h
        uxtl            v26.4s, v3.4h
        mla             v6.4s, v0.4s, v28.4s

        sxtl            v27.4s, v31.4h
        uxtl2           v0.4s, v3.8h
        mla             v5.4s, v26.4s, v27.4s
        sxtl2           v28.4s, v31.8h
        subs            w2, w2, #8
        mla             v6.4s, v0.4s, v28.4s

        sshl            v5.4s, v5.4s, v17.4s
        sshl            v6.4s, v6.4s, v17.4s
        smin            v5.4s, v5.4s, v18.4s
        smin            v6.4s, v6.4s, v18.4s

        st1             {v5.4s, v6.4s}, [x1], #32
        add             sp, sp, #64                     // release the scratch area
        cbnz            w2, 2f                          // 1..7 outputs left: finish one by one

        ret

2:
        // Scalar tail: one output per pass.
        ldr             w8, [x5], #4                    // filterPos[i]
        lsl             w8, w8, #1                      // sample index -> byte offset
        add             x9, x3, w8, uxtw                // address of the first of 4 samples
        ld1             {v0.4h}, [x9]                   // 4 samples
        ld1             {v31.4h}, [x4], #8              // 4 coefficients

        uxtl            v0.4s, v0.4h
        sxtl            v31.4s, v31.4h
        subs            w2, w2, #1
        mul             v5.4s, v0.4s, v31.4s
        addv            s0, v5.4s                       // horizontal sum of the 4 products
        sshl            v0.4s, v0.4s, v17.4s
        smin            v0.4s, v0.4s, v18.4s
        st1             {v0.s}[0], [x1], #4
        cbnz            w2, 2b                          // more outputs pending

        ret
endfunc
function ff_hscale16to19_X8_neon_asm, export=1
        // Horizontal scaling, 16-bit unsigned input to 32-bit output clipped
        // to 19 bits. Taps are consumed 8 at a time and outputs are produced
        // 4 at a time.
        // NOTE(review): the inner loop only terminates on an exact tap count
        // when filterSize is a multiple of 8 - confirm against the caller.
        //
        // Arguments:
        //   w0   int shift
        //   x1   int32_t *dst
        //   w2   int dstW
        //   x3   const uint8_t *src        (addressed as uint16_t samples)
        //   x4   const int16_t *filter     (coefficients are sign-extended below)
        //   x5   const int32_t *filterPos
        //   w6   int filterSize
        //
        // Register roles:
        //   v20  clip limit (1 << 19) - 1 in every lane
        //   v21  -shift in every lane, so sshl performs a right shift
        //   x7   size of one coefficient row in bytes (filterSize * 2)

        movi            v20.4s, #1
        movi            v21.4s, #1
        shl             v20.4s, v20.4s, #19
        sub             v20.4s, v20.4s, v21.4s          // v20 = (1 << 19) - 1
        dup             v21.4s, w0
        neg             v21.4s, v21.4s                  // negative count => right shift

        sbfiz           x7, x6, #1, #32                 // x7 = (int64_t)filterSize * 2

1:
        // Outer loop: set up 4 sample pointers and 4 coefficient-row pointers.
        ldr             w8, [x5], #4                    // filterPos[idx]
        ldr             w10, [x5], #4                   // filterPos[idx + 1]
        lsl             w8, w8, #1                      // sample index -> byte offset
        ldr             w11, [x5], #4                   // filterPos[idx + 2]
        ldr             w9, [x5], #4                    // filterPos[idx + 3]
        mov             x16, x4                         // coefficient row 0
        lsl             w11, w11, #1
        add             x12, x16, x7                    // coefficient row 1
        lsl             w9, w9, #1
        add             x13, x12, x7                    // coefficient row 2
        lsl             w10, w10, #1
        add             x4, x13, x7                     // coefficient row 3 (x4 keeps advancing)
        movi            v0.2d, #0                       // accumulator for dst[idx]
        movi            v1.2d, #0                       // accumulator for dst[idx + 1]
        movi            v2.2d, #0                       // accumulator for dst[idx + 2]
        movi            v3.2d, #0                       // accumulator for dst[idx + 3]
        add             x17, x3, w8, uxtw               // samples for output 0
        add             x8, x3, w10, uxtw               // samples for output 1
        add             x10, x3, w11, uxtw              // samples for output 2
        add             x11, x3, w9, uxtw               // samples for output 3
        mov             w15, w6                         // taps still to process

2:
        // Inner loop: 8 taps for each of the 4 outputs. Samples are
        // zero-extended and coefficients sign-extended to 32 bits before
        // the multiply-accumulate.
        ld1             {v4.8h}, [x17], #16             // output 0: 8 samples
        ld1             {v5.8h}, [x16], #16             // output 0: 8 coefficients
        ld1             {v6.8h}, [x8], #16              // output 1: 8 samples
        ld1             {v7.8h}, [x12], #16             // output 1: 8 coefficients
        uxtl            v24.4s, v4.4h
        sxtl            v25.4s, v5.4h
        uxtl2           v4.4s, v4.8h
        mla             v0.4s, v24.4s, v25.4s           // output 0, taps 0..3
        sxtl2           v5.4s, v5.8h
        uxtl            v26.4s, v6.4h
        mla             v0.4s, v4.4s, v5.4s             // output 0, taps 4..7
        sxtl            v27.4s, v7.4h
        uxtl2           v6.4s, v6.8h
        sxtl2           v7.4s, v7.8h
        ld1             {v16.8h}, [x10], #16            // output 2: 8 samples
        mla             v1.4s, v26.4s, v27.4s           // output 1, taps 0..3
        ld1             {v17.8h}, [x13], #16            // output 2: 8 coefficients
        uxtl            v22.4s, v16.4h
        sxtl            v23.4s, v17.4h
        uxtl2           v16.4s, v16.8h
        sxtl2           v17.4s, v17.8h
        mla             v2.4s, v22.4s, v23.4s           // output 2, taps 0..3
        mla             v2.4s, v16.4s, v17.4s           // output 2, taps 4..7
        ld1             {v18.8h}, [x11], #16            // output 3: 8 samples
        mla             v1.4s, v6.4s, v7.4s             // output 1, taps 4..7
        ld1             {v19.8h}, [x4], #16             // output 3: 8 coefficients
        subs            w15, w15, #8                    // flags are consumed by b.gt below
        uxtl            v28.4s, v18.4h
        sxtl            v29.4s, v19.4h
        uxtl2           v18.4s, v18.8h
        sxtl2           v19.4s, v19.8h
        mla             v3.4s, v28.4s, v29.4s           // output 3, taps 0..3
        mla             v3.4s, v18.4s, v19.4s           // output 3, taps 4..7
        b.gt            2b                              // taps remain for this group

        // Reduce each accumulator to a single lane: v0 = {sum0, sum1, sum2, sum3}.
        addp            v0.4s, v0.4s, v1.4s
        addp            v2.4s, v2.4s, v3.4s
        addp            v0.4s, v0.4s, v2.4s
        subs            w2, w2, #4                      // 4 outputs done
        sshl            v0.4s, v0.4s, v21.4s            // >> shift (plain, non-saturating shift)
        smin            v0.4s, v0.4s, v20.4s            // clip to 19 bits
        st1             {v0.4s}, [x1], #16
        b.gt            1b                              // outputs remain
        ret
endfunc
function ff_hscale16to19_X4_neon_asm, export=1
        // Horizontal scaling, 16-bit unsigned input to 32-bit output clipped
        // to 19 bits. Each group of 4 outputs is processed with an 8-tap
        // inner loop followed by an unconditional 4-tap tail.
        // NOTE(review): this layout matches filterSize = 8 * k + 4 with
        // k >= 1; presumably the caller only selects this routine for such
        // sizes - confirm.
        //
        // Arguments:
        //   w0   int shift
        //   x1   int32_t *dst               (32-bit lanes are stored)
        //   w2   int dstW
        //   x3   const uint8_t *src         (addressed as uint16_t samples)
        //   x4   const int16_t *filter
        //   x5   const int32_t *filterPos
        //   w6   int filterSize
        //
        // Register roles:
        //   v21  clip limit (1 << 19) - 1 in every lane
        //   v20  -shift in every lane, so sshl performs a right shift
        //   x7   size of one coefficient row in bytes (filterSize * 2)
        //   w0   reused as the tap counter once shift has been copied to v20

        // v8-v10 are used as temporaries, so the low halves of the
        // callee-saved registers are preserved. d11 is not modified by this
        // routine; it is stored as the second register of the pair.
        stp             d8, d9, [sp, #-0x20]!
        stp             d10, d11, [sp, #0x10]

        movi            v18.4s, #1
        movi            v17.4s, #1
        shl             v18.4s, v18.4s, #19
        sub             v21.4s, v18.4s, v17.4s          // v21 = (1 << 19) - 1
        dup             v17.4s, w0
        neg             v20.4s, v17.4s                  // negative count => right shift

        lsl             w7, w6, #1                      // coefficient row stride in bytes

1:
        // Outer loop: 4 outputs per pass.
        ldp             w8, w9, [x5]                    // filterPos[idx], filterPos[idx + 1]
        ldp             w10, w11, [x5, #8]              // filterPos[idx + 2], filterPos[idx + 3]

        movi            v16.2d, #0                      // accumulator for output idx
        movi            v17.2d, #0                      // accumulator for output idx + 1
        movi            v18.2d, #0                      // accumulator for output idx + 2
        movi            v19.2d, #0                      // accumulator for output idx + 3

        mov             x12, x4                         // coefficient row 0
        add             x13, x4, x7                     // coefficient row 1
        add             x8, x3, x8, lsl #1              // samples for output 0
        add             x14, x13, x7                    // coefficient row 2
        add             x9, x3, x9, lsl #1              // samples for output 1
        add             x15, x14, x7                    // coefficient row 3
        add             x10, x3, x10, lsl #1            // samples for output 2
        mov             w0, w6                          // taps still to process
        add             x11, x3, x11, lsl #1            // samples for output 3
        add             x5, x5, #16                     // consumed 4 filterPos entries
        mov             x16, xzr                        // byte offset into each coefficient row

2:
        // Inner loop: 8 taps for each of the 4 outputs. Samples are
        // zero-extended and coefficients sign-extended to 32 bits.
        ldr             q4, [x8], #16                   // output 0: 8 samples
        ldr             q5, [x9], #16                   // output 1: 8 samples
        uxtl            v26.4s, v4.4h
        uxtl2           v4.4s, v4.8h
        ldr             q31, [x12, x16]                 // output 0: 8 coefficients
        ldr             q6, [x10], #16                  // output 2: 8 samples
        sxtl            v22.4s, v31.4h
        sxtl2           v31.4s, v31.8h
        mla             v16.4s, v26.4s, v22.4s          // output 0, low 4 taps
        uxtl            v25.4s, v5.4h
        uxtl2           v5.4s, v5.8h
        ldr             q30, [x13, x16]                 // output 1: 8 coefficients
        ldr             q7, [x11], #16                  // output 3: 8 samples
        mla             v16.4s, v4.4s, v31.4s           // output 0, high 4 taps
        uxtl            v24.4s, v6.4h
        sxtl            v8.4s, v30.4h
        sxtl2           v30.4s, v30.8h
        mla             v17.4s, v25.4s, v8.4s           // output 1, low 4 taps
        ldr             q29, [x14, x16]                 // output 2: 8 coefficients
        uxtl2           v6.4s, v6.8h
        sxtl            v9.4s, v29.4h
        sxtl2           v29.4s, v29.8h
        mla             v17.4s, v5.4s, v30.4s           // output 1, high 4 taps
        ldr             q28, [x15, x16]                 // output 3: 8 coefficients
        mla             v18.4s, v24.4s, v9.4s           // output 2, low 4 taps
        uxtl            v23.4s, v7.4h
        sxtl            v10.4s, v28.4h
        mla             v18.4s, v6.4s, v29.4s           // output 2, high 4 taps
        uxtl2           v7.4s, v7.8h
        sxtl2           v28.4s, v28.8h
        mla             v19.4s, v23.4s, v10.4s          // output 3, low 4 taps
        sub             w0, w0, #8
        cmp             w0, #8                          // flags are consumed by b.ge below
        mla             v19.4s, v7.4s, v28.4s           // output 3, high 4 taps

        add             x16, x16, #16                   // next 8 coefficients in every row

        b.ge            2b                              // at least 8 taps remain

        // Tail: the last 4 taps of every row. The sample pointers were already
        // advanced by the post-indexed loads above; the coefficients are
        // addressed from the end of the row.
        sub             x17, x7, #8                     // byte offset of the last 4 coefficients
        ldr             d4, [x8]                        // output 0: last 4 samples
        ldr             d31, [x12, x17]                 // output 0: last 4 coefficients
        uxtl            v4.4s, v4.4h
        sxtl            v31.4s, v31.4h
        ldr             d5, [x9]                        // output 1: last 4 samples
        mla             v16.4s, v4.4s, v31.4s           // output 0, last 4 taps
        ldr             d30, [x13, x17]                 // output 1: last 4 coefficients
        uxtl            v5.4s, v5.4h
        sxtl            v30.4s, v30.4h
        ldr             d6, [x10]                       // output 2: last 4 samples
        mla             v17.4s, v5.4s, v30.4s           // output 1, last 4 taps
        ldr             d29, [x14, x17]                 // output 2: last 4 coefficients
        uxtl            v6.4s, v6.4h
        sxtl            v29.4s, v29.4h
        ldr             d7, [x11]                       // output 3: last 4 samples
        ldr             d28, [x15, x17]                 // output 3: last 4 coefficients
        mla             v18.4s, v6.4s, v29.4s           // output 2, last 4 taps
        uxtl            v7.4s, v7.4h
        sxtl            v28.4s, v28.4h
        addp            v16.4s, v16.4s, v17.4s          // partial sums of outputs 0 and 1
        mla             v19.4s, v7.4s, v28.4s           // output 3, last 4 taps
        subs            w2, w2, #4                      // flags are consumed by b.gt below
        addp            v18.4s, v18.4s, v19.4s          // partial sums of outputs 2 and 3
        addp            v16.4s, v16.4s, v18.4s          // v16 = {sum0, sum1, sum2, sum3}
        sshl            v16.4s, v16.4s, v20.4s          // >> shift
        smin            v16.4s, v16.4s, v21.4s          // clip to 19 bits
        st1             {v16.4s}, [x1], #16
        add             x4, x4, x7, lsl #2              // skip the 4 coefficient rows just used
        b.gt            1b                              // outputs remain

        ldp             d8, d9, [sp]
        ldp             d10, d11, [sp, #0x10]
        add             sp, sp, #0x20

        ret
endfunc