@ -1,5 +1,7 @@
/ *
/ *
* Copyright ( c ) 2 0 1 6 C l é m e n t B œ s c h < c l e m e n t s t u p e f l i x . c o m >
* Copyright ( c ) 2 0 1 6 C l é m e n t B œ s c h < c l e m e n t s t u p e f l i x . c o m >
* Copyright ( c ) 2 0 1 9 - 2 0 2 1 S e b a s t i a n P o p < s p o p @amazon.com>
* Copyright ( c ) 2 0 2 2 J o n a t h a n S w i n n e y < j s w i n n e y @amazon.com>
*
*
* This f i l e i s p a r t o f F F m p e g .
* This f i l e i s p a r t o f F F m p e g .
*
*
@ -20,7 +22,25 @@
# include " l i b a v u t i l / a a r c h64 / a s m . S "
# include " l i b a v u t i l / a a r c h64 / a s m . S "
function f f _ h s c a l e _ 8 _ t o _ 1 5 _ n e o n , e x p o r t =1
/ *
;-----------------------------------------------------------------------------
; horizontal line scaling
;
; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
; (SwsContext *c, int{16,32}_t *dst,
; int dstW, const uint{8,16}_t *src,
; const int16_t *filter,
; const int32_t *filterPos, int filterSize);
;
; Scale one horizontal line. Input is either 8-bit width or 16-bit width
; ($source_width can be either 8, 9, 10 or 16, difference is whether we have to
; downscale before multiplying). Filter is 14 bits. Output is either 15 bits
; (in int16_t) or 19 bits (in int32_t), as given in $intermediate_nbits. Each
; output pixel is generated from $filterSize input pixels, the position of
; the first pixel is given in filterPos[nOutputPixel].
;----------------------------------------------------------------------------- */
function f f _ h s c a l e 8 t o 1 5 _ X 8 _ n e o n , e x p o r t =1
sbfiz x7 , x6 , #1 , #32 / / f i l t e r S i z e * 2 ( * 2 b e c a u s e i n t 1 6 )
sbfiz x7 , x6 , #1 , #32 / / f i l t e r S i z e * 2 ( * 2 b e c a u s e i n t 1 6 )
1 : ldr w8 , [ x5 ] , #4 / / f i l t e r P o s [ i d x ]
1 : ldr w8 , [ x5 ] , #4 / / f i l t e r P o s [ i d x ]
ldr w0 , [ x5 ] , #4 / / f i l t e r P o s [ i d x + 1 ]
ldr w0 , [ x5 ] , #4 / / f i l t e r P o s [ i d x + 1 ]
@ -70,3 +90,153 @@ function ff_hscale_8_to_15_neon, export=1
b. g t 1 b / / l o o p u n t i l e n d o f l i n e
b. g t 1 b / / l o o p u n t i l e n d o f l i n e
ret
ret
endfunc
endfunc
function ff_hscale8to15_4_neon, export=1
// void ff_hscale8to15_4_neon(SwsContext *c, int16_t *dst, int dstW,
//                            const uint8_t *src, const int16_t *filter,
//                            const int32_t *filterPos, int filterSize);
//
// Scale one horizontal line of 8-bit samples with a fixed 4-tap filter:
//
//   dst[i] = sat_s16((sum over j = 0..3 of
//                     src[filterPos[i] + j] * filter[4 * i + j]) >> 7)
//
// Arguments (AAPCS64):
//   x0      SwsContext *c            (not used)
//   x1      int16_t *dst
//   x2      int dstW                 (w2 is the remaining-pixel counter)
//   x3      const uint8_t *src
//   x4      const int16_t *filter
//   x5      const int32_t *filterPos
//   x6      int filterSize           (not read; this variant hard-codes 4 taps)
//
// Scratch registers:
//   x8-x15  filterPos values, then the 4 src bytes gathered at each of them
//   v0      multiply-accumulate sums for pixels idx+0..3 (4S)
//   v5      multiply-accumulate sums for pixels idx+4..7 (4S)
//   v1-v4   filter taps 0..3 for 8 pixels (8H), de-interleaved by ld4
//   v6      filter taps of a single pixel in the tail loop (4H)
//   v16-v19 src bytes for taps 0..3 of 8 pixels (8B), de-interleaved by ld4
//
// Stack: 32 bytes of scratch. The 8 x 4 gathered src bytes are stored there
// so that a single ld4 can transpose them into per-tap vectors.
//
// Structure:
//   1. Gather the src bytes for the first group of 8 pixels.
//   2. Main loop: multiply-accumulate the current group while gathering the
//      src bytes of the next group (runs while at least 16 pixels remain).
//   3. Multiply-accumulate the last gathered group (no further gather).
//   4. Tail loop: one pixel per iteration, used for the dstW % 8 remainder
//      and for the whole line when dstW < 16.

        cmp                 w2, #0
        b.le                3f                          // dstW <= 0: nothing to do (the tail loop would otherwise never observe w2 == 0)

        sub                 sp, sp, #32                 // allocate 32 bytes of scratch on the stack
        cmp                 w2, #16                     // fewer than 16 pixels: handle them all in the tail loop
        b.lt                2f

        // 1. load 8 values from filterPos to be used as offsets into src
        ldp                 w8, w9, [x5]                // filterPos[idx + 0], [idx + 1]
        ldp                 w10, w11, [x5, #8]          // filterPos[idx + 2], [idx + 3]
        ldp                 w12, w13, [x5, #16]         // filterPos[idx + 4], [idx + 5]
        ldp                 w14, w15, [x5, #24]         // filterPos[idx + 6], [idx + 7]
        add                 x5, x5, #32                 // advance filterPos by 8 entries

        // gather the 4 src bytes of each pixel and stage them contiguously
        ldr                 w8, [x3, w8, UXTW]          // src[filterPos[idx + 0]][0..3]
        ldr                 w9, [x3, w9, UXTW]          // src[filterPos[idx + 1]][0..3]
        ldr                 w10, [x3, w10, UXTW]        // src[filterPos[idx + 2]][0..3]
        ldr                 w11, [x3, w11, UXTW]        // src[filterPos[idx + 3]][0..3]
        ldr                 w12, [x3, w12, UXTW]        // src[filterPos[idx + 4]][0..3]
        ldr                 w13, [x3, w13, UXTW]        // src[filterPos[idx + 5]][0..3]
        ldr                 w14, [x3, w14, UXTW]        // src[filterPos[idx + 6]][0..3]
        ldr                 w15, [x3, w15, UXTW]        // src[filterPos[idx + 7]][0..3]

        stp                 w8, w9, [sp]                // scratch[0..7]   = src bytes of pixels idx + 0, idx + 1
        stp                 w10, w11, [sp, #8]          // scratch[8..15]  = src bytes of pixels idx + 2, idx + 3
        stp                 w12, w13, [sp, #16]         // scratch[16..23] = src bytes of pixels idx + 4, idx + 5
        stp                 w14, w15, [sp, #24]         // scratch[24..31] = src bytes of pixels idx + 6, idx + 7

        // 2. main loop; invariant: w2 >= 16, so a full next group of 8 exists
1:
        ld4                 {v16.8B, v17.8B, v18.8B, v19.8B}, [sp] // transpose scratch: v(16 + j) = tap-j src byte of 8 pixels

        // load the next 8 values from filterPos (offsets for the next iteration)
        ldp                 w8, w9, [x5]                // filterPos[idx + 0], [idx + 1], next iteration
        ldp                 w10, w11, [x5, #8]          // filterPos[idx + 2], [idx + 3], next iteration
        ldp                 w12, w13, [x5, #16]         // filterPos[idx + 4], [idx + 5], next iteration
        ldp                 w14, w15, [x5, #24]         // filterPos[idx + 6], [idx + 7], next iteration

        movi                v0.2D, #0                   // clear accumulator for idx + 0..3
        movi                v5.2D, #0                   // clear accumulator for idx + 4..7

        ld4                 {v1.8H, v2.8H, v3.8H, v4.8H}, [x4], #64 // filter taps for idx + 0..7: v(1 + j) = tap j

        add                 x5, x5, #32                 // advance filterPos by 8 entries

        // SIMD work and gather loads are interleaved to keep both the
        // load/store and the vector pipelines busy
        uxtl                v16.8H, v16.8B              // zero-extend src tap 0 to 16 bit
        uxtl                v17.8H, v17.8B              // zero-extend src tap 1 to 16 bit
        ldr                 w8, [x3, w8, UXTW]          // src[filterPos[idx + 0]][0..3], next iteration
        ldr                 w9, [x3, w9, UXTW]          // src[filterPos[idx + 1]][0..3], next iteration
        uxtl                v18.8H, v18.8B              // zero-extend src tap 2 to 16 bit
        uxtl                v19.8H, v19.8B              // zero-extend src tap 3 to 16 bit
        ldr                 w10, [x3, w10, UXTW]        // src[filterPos[idx + 2]][0..3], next iteration
        ldr                 w11, [x3, w11, UXTW]        // src[filterPos[idx + 3]][0..3], next iteration

        smlal               v0.4S, v1.4H, v16.4H        // accumulate tap j = 0, idx = 0..3
        smlal               v0.4S, v2.4H, v17.4H        // accumulate tap j = 1, idx = 0..3
        ldr                 w12, [x3, w12, UXTW]        // src[filterPos[idx + 4]][0..3], next iteration
        ldr                 w13, [x3, w13, UXTW]        // src[filterPos[idx + 5]][0..3], next iteration
        smlal               v0.4S, v3.4H, v18.4H        // accumulate tap j = 2, idx = 0..3
        smlal               v0.4S, v4.4H, v19.4H        // accumulate tap j = 3, idx = 0..3
        ldr                 w14, [x3, w14, UXTW]        // src[filterPos[idx + 6]][0..3], next iteration
        ldr                 w15, [x3, w15, UXTW]        // src[filterPos[idx + 7]][0..3], next iteration

        smlal2              v5.4S, v1.8H, v16.8H        // accumulate tap j = 0, idx = 4..7
        smlal2              v5.4S, v2.8H, v17.8H        // accumulate tap j = 1, idx = 4..7
        stp                 w8, w9, [sp]                // scratch[0..7]   = next src bytes of pixels idx + 0, idx + 1
        stp                 w10, w11, [sp, #8]          // scratch[8..15]  = next src bytes of pixels idx + 2, idx + 3
        smlal2              v5.4S, v3.8H, v18.8H        // accumulate tap j = 2, idx = 4..7
        smlal2              v5.4S, v4.8H, v19.8H        // accumulate tap j = 3, idx = 4..7
        stp                 w12, w13, [sp, #16]         // scratch[16..23] = next src bytes of pixels idx + 4, idx + 5
        stp                 w14, w15, [sp, #24]         // scratch[24..31] = next src bytes of pixels idx + 6, idx + 7

        sub                 w2, w2, #8                  // dstW -= 8
        sqshrn              v0.4H, v0.4S, #7            // >> 7 and saturate 4 x 32-bit sums to 4 x 16-bit (idx + 0..3)
        sqshrn              v1.4H, v5.4S, #7            // >> 7 and saturate 4 x 32-bit sums to 4 x 16-bit (idx + 4..7)
        st1                 {v0.4H, v1.4H}, [x1], #16   // write dst[idx + 0..7]
        cmp                 w2, #16                     // stay in the main loop while at least 16 pixels are left
        b.ge                1b

        // 3. last full group: its src bytes are already in scratch, and
        //    8 <= w2 <= 15 here, so nothing further is gathered
        ld4                 {v16.8B, v17.8B, v18.8B, v19.8B}, [sp] // transpose scratch: v(16 + j) = tap-j src byte of 8 pixels
        ld4                 {v1.8H, v2.8H, v3.8H, v4.8H}, [x4], #64 // filter taps for idx + 0..7: v(1 + j) = tap j
        movi                v0.2D, #0                   // clear accumulator for idx + 0..3
        movi                v5.2D, #0                   // clear accumulator for idx + 4..7
        uxtl                v16.8H, v16.8B              // zero-extend src tap 0 to 16 bit
        uxtl                v17.8H, v17.8B              // zero-extend src tap 1 to 16 bit
        uxtl                v18.8H, v18.8B              // zero-extend src tap 2 to 16 bit
        uxtl                v19.8H, v19.8B              // zero-extend src tap 3 to 16 bit
        smlal               v0.4S, v1.4H, v16.4H        // accumulate tap j = 0, idx = 0..3
        smlal               v0.4S, v2.4H, v17.4H        // accumulate tap j = 1, idx = 0..3
        smlal               v0.4S, v3.4H, v18.4H        // accumulate tap j = 2, idx = 0..3
        smlal               v0.4S, v4.4H, v19.4H        // accumulate tap j = 3, idx = 0..3
        smlal2              v5.4S, v1.8H, v16.8H        // accumulate tap j = 0, idx = 4..7
        smlal2              v5.4S, v2.8H, v17.8H        // accumulate tap j = 1, idx = 4..7
        smlal2              v5.4S, v3.8H, v18.8H        // accumulate tap j = 2, idx = 4..7
        smlal2              v5.4S, v4.8H, v19.8H        // accumulate tap j = 3, idx = 4..7
        subs                w2, w2, #8                  // dstW -= 8
        sqshrn              v0.4H, v0.4S, #7            // >> 7 and saturate 4 x 32-bit sums to 4 x 16-bit (idx + 0..3)
        sqshrn              v1.4H, v5.4S, #7            // >> 7 and saturate 4 x 32-bit sums to 4 x 16-bit (idx + 4..7)
        st1                 {v0.4H, v1.4H}, [x1], #16   // write dst[idx + 0..7]

        cbnz                w2, 2f                      // 1..7 pixels left: finish them in the tail loop

        add                 sp, sp, #32                 // release the scratch area
        ret

        // 4. tail loop: one pixel per iteration (dstW % 8 != 0 or dstW < 16);
        //    w2 >= 1 on entry
2:
        // load src
        ldr                 w8, [x5], #4                // filterPos[i]
        add                 x9, x3, w8, UXTW            // address of src[filterPos[i]]
        ld1                 {v5.S}[0], [x9]             // src[filterPos[i] + 0..3] into the low 4 bytes of v5
        // load filter
        ld1                 {v6.4H}, [x4], #8           // filter[filterSize * i + 0..3]

        uxtl                v5.8H, v5.8B                // zero-extend src bytes to 16 bit (only the low 4 lanes are used)
        smull               v0.4S, v5.4H, v6.4H         // 4 products src[...] * filter[...]
        addv                s0, v0.4S                   // sum the 4 products
        sqshrn              h0, s0, #7                  // >> 7 and saturate the 32-bit sum to 16 bit
        st1                 {v0.H}[0], [x1], #2         // dst[i] = ...
        sub                 w2, w2, #1                  // dstW--
        cbnz                w2, 2b

        add                 sp, sp, #32                 // release the scratch area
3:
        ret
endfunc