@ -22,7 +22,7 @@
function f f _ h s c a l e _ 8 _ t o _ 1 5 _ n e o n , e x p o r t =1
function f f _ h s c a l e _ 8 _ t o _ 1 5 _ n e o n , e x p o r t =1
sbfiz x7 , x6 , #1 , #32 / / f i l t e r S i z e * 2 ( * 2 b e c a u s e i n t 1 6 )
sbfiz x7 , x6 , #1 , #32 / / f i l t e r S i z e * 2 ( * 2 b e c a u s e i n t 1 6 )
1 : ldr w1 8 , [ x5 ] , #4 / / f i l t e r P o s [ i d x ]
1 : ldr w8 , [ x5 ] , #4 / / f i l t e r P o s [ i d x ]
ldr w0 , [ x5 ] , #4 / / f i l t e r P o s [ i d x + 1 ]
ldr w0 , [ x5 ] , #4 / / f i l t e r P o s [ i d x + 1 ]
ldr w11 , [ x5 ] , #4 / / f i l t e r P o s [ i d x + 2 ]
ldr w11 , [ x5 ] , #4 / / f i l t e r P o s [ i d x + 2 ]
ldr w9 , [ x5 ] , #4 / / f i l t e r P o s [ i d x + 3 ]
ldr w9 , [ x5 ] , #4 / / f i l t e r P o s [ i d x + 3 ]
@ -34,14 +34,14 @@ function ff_hscale_8_to_15_neon, export=1
movi v1 . 2 D , #0 / / v a l s u m p a r t 2 ( f o r d s t [ 1 ] )
movi v1 . 2 D , #0 / / v a l s u m p a r t 2 ( f o r d s t [ 1 ] )
movi v2 . 2 D , #0 / / v a l s u m p a r t 3 ( f o r d s t [ 2 ] )
movi v2 . 2 D , #0 / / v a l s u m p a r t 3 ( f o r d s t [ 2 ] )
movi v3 . 2 D , #0 / / v a l s u m p a r t 4 ( f o r d s t [ 3 ] )
movi v3 . 2 D , #0 / / v a l s u m p a r t 4 ( f o r d s t [ 3 ] )
add x17 , x3 , w1 8 , U X T W / / s r c p + f i l t e r P o s [ 0 ]
add x17 , x3 , w8 , U X T W / / s r c p + f i l t e r P o s [ 0 ]
add x1 8 , x3 , w0 , U X T W / / s r c p + f i l t e r P o s [ 1 ]
add x8 , x3 , w0 , U X T W / / s r c p + f i l t e r P o s [ 1 ]
add x0 , x3 , w11 , U X T W / / s r c p + f i l t e r P o s [ 2 ]
add x0 , x3 , w11 , U X T W / / s r c p + f i l t e r P o s [ 2 ]
add x11 , x3 , w9 , U X T W / / s r c p + f i l t e r P o s [ 3 ]
add x11 , x3 , w9 , U X T W / / s r c p + f i l t e r P o s [ 3 ]
mov w15 , w6 / / f i l t e r S i z e c o u n t e r
mov w15 , w6 / / f i l t e r S i z e c o u n t e r
2 : ld1 { v4 . 8 B } , [ x17 ] , #8 / / s r c p [ f i l t e r P o s [ 0 ] + { 0 . . 7 } ]
2 : ld1 { v4 . 8 B } , [ x17 ] , #8 / / s r c p [ f i l t e r P o s [ 0 ] + { 0 . . 7 } ]
ld1 { v5 . 8 H } , [ x16 ] , #16 / / l o a d 8 x16 - b i t f i l t e r v a l u e s , p a r t 1
ld1 { v5 . 8 H } , [ x16 ] , #16 / / l o a d 8 x16 - b i t f i l t e r v a l u e s , p a r t 1
ld1 { v6 . 8 B } , [ x1 8 ] , #8 / / s r c p [ f i l t e r P o s [ 1 ] + { 0 . . 7 } ]
ld1 { v6 . 8 B } , [ x8 ] , #8 / / s r c p [ f i l t e r P o s [ 1 ] + { 0 . . 7 } ]
ld1 { v7 . 8 H } , [ x12 ] , #16 / / l o a d 8 x16 - b i t a t f i l t e r + f i l t e r S i z e
ld1 { v7 . 8 H } , [ x12 ] , #16 / / l o a d 8 x16 - b i t a t f i l t e r + f i l t e r S i z e
uxtl v4 . 8 H , v4 . 8 B / / u n p a c k p a r t 1 t o 1 6 - b i t
uxtl v4 . 8 H , v4 . 8 B / / u n p a c k p a r t 1 t o 1 6 - b i t
smlal v0 . 4 S , v4 . 4 H , v5 . 4 H / / v0 a c c u m u l a t e s s r c p [ f i l t e r P o s [ 0 ] + { 0 . . 3 } ] * f i l t e r [ { 0 . . 3 } ]
smlal v0 . 4 S , v4 . 4 H , v5 . 4 H / / v0 a c c u m u l a t e s s r c p [ f i l t e r P o s [ 0 ] + { 0 . . 3 } ] * f i l t e r [ { 0 . . 3 } ]