@ -154,6 +154,209 @@ const coeffs, align=4 // align 4 means align on 2^4 boundry
.hword 5 0 7 7 , 9 8 1 / / sp[ 0 ] = v0 . h [ 6 ]
.hword 5 0 7 7 , 9 8 1 / / sp[ 0 ] = v0 . h [ 6 ]
endconst
endconst
// ============================================================================
//
// void filter_line(
//      void *dst1,     // x0
//      void *prev1,    // x1
//      void *cur1,     // x2
//      void *next1,    // x3
//      int w,          // w4
//      int prefs,      // w5
//      int mrefs,      // w6
//      int prefs2,     // w7
//      int mrefs2,     // [sp, #0]
//      int prefs3,     // [sp, #SP_INT]
//      int mrefs3,     // [sp, #SP_INT*2]
//      int prefs4,     // [sp, #SP_INT*3]
//      int mrefs4,     // [sp, #SP_INT*4]
//      int parity,     // [sp, #SP_INT*5]
//      int clip_max)   // [sp, #SP_INT*6]
//
// Produces one output line, 16 bytes per loop iteration.  Every load and
// the final store move a whole q register, and the loop runs while the
// remaining count is > 0, so when w is not a multiple of 16 the last
// iteration touches up to 15 bytes beyond w in every buffer.
//
// clip_max is never read from the stack in this routine.
//
// Register roles once the arguments have been shuffled:
//      x0  = dst (post-incremented by the store)
//      x2  = prev              x3  = cur (doubles as prev2)
//      x4  = next              x17 = next2 (prev if parity != 0, else next)
//      w10 = pixels left
//      w9  = mrefs             w11 = prefs
//      w8  = mrefs2            w12 = prefs2
//      w7  = mrefs3            w13 = prefs3
//      w6  = mrefs4            w14 = prefs4
// All line offsets are applied as sign-extended 32-bit byte offsets (sxtw).

function ff_bwdif_filter_line_neon, export=1
        // Nothing to do for w <= 0 (signed compare)
        cmp             w4,  #0
        ble             99f

        // Move the arguments into the layout listed above (the original
        // author matched the "line3" routine's layout to ease debugging).
        // Each incoming register is copied out before it is overwritten.
        mov             w10, w4                         // w10 = loop count
        mov             w11, w5                         // w11 = prefs
        mov             w9,  w6                         // w9  = mrefs
        mov             w12, w7                         // w12 = prefs2

        mov             x4,  x3                         // x4  = next
        mov             x3,  x2                         // x3  = cur
        mov             x2,  x1                         // x2  = prev

        // Stack-passed offsets, in ascending slot order
        ldr             w8,  [sp, #0]                   // w8  = mrefs2
        ldr             w13, [sp, #SP_INT]              // w13 = prefs3
        ldr             w7,  [sp, #SP_INT*2]            // w7  = mrefs3
        ldr             w14, [sp, #SP_INT*3]            // w14 = prefs4
        ldr             w6,  [sp, #SP_INT*4]            // w6  = mrefs4

        // Filter coefficients end up in v0; x17 is handed to the macro
        // (presumably as an address scratch - macro is defined elsewhere)
        LDR_COEFFS      v0, x17

        // prev2 is simply cur (x3).
        // next2 = parity ? prev : next
        ldr             w17, [sp, #SP_INT*5]            // parity
        cmp             w17, #0
        csel            x17, x2, x4, ne

        // Every stack argument has been read by this point.
        // NOTE(review): v10-v15 are written below and AAPCS64 treats v8-v15
        // as callee-saved, so PUSH_VREGS/POP_VREGS presumably save and
        // restore them (and may move sp) - confirm against the macro.
        PUSH_VREGS

        // C reference for the loop body, one pixel per step:
        //   for (x = 0; x < w; x++) {
        //       int diff0, diff2;
        //       int d0, d2;
        //       int temporal_diff0, temporal_diff2;
        //
        //       int i1, i2;
        //       int j1, j2;
        //       int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4;

10:
        // c0 = prev2[0] + next2[0]                      -> v20 (lo), v21 (hi)
        // d0 = c0 >> 1                                  -> v10
        // temporal_diff0 = FFABS(prev2[0] - next2[0])   -> v11
        ldr             q31, [x3]                       // prev2[0]
        ldr             q21, [x17]                      // next2[0]
        uhadd           v10.16b, v31.16b, v21.16b
        uabd            v11.16b, v31.16b, v21.16b
        uaddl           v20.8h,  v21.8b,  v31.8b
        uaddl2          v21.8h,  v21.16b, v31.16b

        // Early loads for m4
        ldr             q31, [x3,  w6, sxtw]            // prev2[mrefs4]
        ldr             q23, [x17, w6, sxtw]            // next2[mrefs4]

        // i1 = coef_hf[0] * c0                          -> v2..v5
        UMULL4K         v2, v3, v4, v5, v20, v21, v0.h[2]

        // Early loads for p4
        ldr             q30, [x3,  w14, sxtw]           // prev2[prefs4]
        ldr             q25, [x17, w14, sxtw]           // next2[prefs4]

        // m4 = prev2[mrefs4] + next2[mrefs4]            -> v22, v23
        uaddl           v22.8h,  v23.8b,  v31.8b
        uaddl2          v23.8h,  v23.16b, v31.16b

        // p4 = prev2[prefs4] + next2[prefs4]            -> v24, v25
        // p4 >> 1                                       -> v12
        // NOTE(review): v12 is not passed to any macro in this routine.
        uhadd           v12.16b, v25.16b, v30.16b
        uaddl           v24.8h,  v25.8b,  v30.8b
        uaddl2          v25.8h,  v25.16b, v30.16b

        // m3 = cur[mrefs3]                              -> v20
        ldr             q20, [x3, w7, sxtw]

        // p3 = cur[prefs3]                              -> v21
        ldr             q21, [x3, w13, sxtw]

        // i1 += coef_hf[2] * (m4 + p4)
        // (the sum lands in v22/v23; the separate m4 and p4 values are
        //  not needed afterwards)
        add             v22.8h,  v22.8h,  v24.8h
        add             v23.8h,  v23.8h,  v25.8h
        UMLAL4K         v2, v3, v4, v5, v22, v23, v0.h[4]

        // Early loads for m2
        ldr             q29, [x3,  w8, sxtw]            // prev2[mrefs2]
        ldr             q23, [x17, w8, sxtw]            // next2[mrefs2]

        // i1 -= coef_lf[1] * 4 * (m3 + p3)
        uaddl           v30.8h,  v20.8b,  v21.8b
        uaddl2          v31.8h,  v20.16b, v21.16b

        UMLSL4K         v2, v3, v4, v5, v30, v31, v0.h[1]

        // Early loads for p2
        ldr             q31, [x3,  w12, sxtw]           // prev2[prefs2]
        ldr             q27, [x17, w12, sxtw]           // next2[prefs2]

        // m2 = prev2[mrefs2] + next2[mrefs2]            -> v22, v23
        // m2 >> 1                                       -> v13
        uhadd           v13.16b, v23.16b, v29.16b
        uaddl           v22.8h,  v23.8b,  v29.8b
        uaddl2          v23.8h,  v23.16b, v29.16b

        // m1 = cur[mrefs]                               -> v24
        ldr             q24, [x3, w9, sxtw]

        // p2 = prev2[prefs2] + next2[prefs2]            -> v26, v27
        // temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]) -> v14
        // d2 = p2 >> 1                                  -> v15
        // NOTE(review): v14 is not passed to any macro in this routine.
        uabd            v14.16b, v31.16b, v27.16b
        uhadd           v15.16b, v31.16b, v27.16b
        uaddl           v26.8h,  v27.8b,  v31.8b
        uaddl2          v27.8h,  v27.16b, v31.16b

        // i1 -= coef_hf[1] * (m2 + p2)
        // (sum lands in v22/v23; v26/v27 are recycled further down)
        add             v22.8h,  v22.8h,  v26.8h
        add             v23.8h,  v23.8h,  v27.8h
        UMLSL4K         v2, v3, v4, v5, v22, v23, v0.h[3]

        // p1 = cur[prefs]                               -> v22
        ldr             q22, [x3, w11, sxtw]

        // i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13
        //                                               -> v17
        // v20 (m3) and v21 (p3) are read here for the last time.
        uaddl           v18.8h,  v22.8b,  v24.8b
        uaddl2          v19.8h,  v22.16b, v24.16b
        UMULL4K         v28, v29, v30, v31, v18, v19, v0.h[6]

        uaddl           v18.8h,  v20.8b,  v21.8b
        uaddl2          v19.8h,  v20.16b, v21.16b
        UMLSL4K         v28, v29, v30, v31, v18, v19, v0.h[7]

        SQSHRUNN        v17, v28, v29, v30, v31, 13

        // i1 += coef_lf[0] * 4 * (m1 + p1)              (p1 = v22, m1 = v24)
        uaddl           v26.8h,  v24.8b,  v22.8b
        uaddl2          v27.8h,  v24.16b, v22.16b
        UMLAL4K         v2, v3, v4, v5, v26, v27, v0.h[0]

        // Neighbouring fields at the mrefs / prefs lines, used for t1 and t2
        ldr             q31, [x2, w9, sxtw]             // prev[mrefs]
        ldr             q29, [x4, w9, sxtw]             // next[mrefs]

        ldr             q30, [x2, w11, sxtw]            // prev[prefs]
        ldr             q28, [x4, w11, sxtw]            // next[prefs]

        // i1 >>= 15                                     -> v2 (v3..v5 now free)
        SQSHRUNN        v2, v2, v3, v4, v5, 15

        //       {
        //           int t1 = (FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1;
        //           int t2 = (FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1;
        uabd            v30.16b, v22.16b, v30.16b       // |p1 - prev[prefs]|
        uabd            v31.16b, v24.16b, v31.16b       // |m1 - prev[mrefs]|
        uabd            v28.16b, v22.16b, v28.16b       // |p1 - next[prefs]|
        uabd            v29.16b, v24.16b, v29.16b       // |m1 - next[mrefs]|
        uhadd           v31.16b, v31.16b, v30.16b       // t1
        uhadd           v29.16b, v29.16b, v28.16b       // t2

        //           diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2)   -> v18
        ushr            v18.16b, v11.16b, #1
        umax            v18.16b, v18.16b, v31.16b
        umax            v18.16b, v18.16b, v29.16b

        // Macro operands: diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10,
        // p1 = v22, d2 = v15; the remaining four registers are passed last
        // (presumably as scratch - macro is defined elsewhere).
        SPAT_CHECK      v18, v13, v24, v10, v22, v15, v31, v30, v29, v28

        // Macro operands: result in v2; i1 = v2, i2 = v17, m1 = v24,
        // d0 = v10, p1 = v22, temporal diff = v11 (temporal_diff0 here),
        // diff = v18 (diff0 here); the last three registers are passed last
        // (presumably as scratch - macro is defined elsewhere).
        INTERPOL        v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29

        //           dst[0] = av_clip_uint8(interpol);
        str             q2,  [x0], #16                  // store 16 bytes, dst += 16
        //       }
        //
        //       dst++;
        //       cur++;
        //       prev++;
        //       prev2++;
        //       next++;
        //   }

        // Step every source pointer on by one 16-byte vector; plain add
        // leaves the condition flags untouched.
        add             x2,  x2,  #16
        add             x3,  x3,  #16
        add             x4,  x4,  #16
        add             x17, x17, #16
        // 16 fewer pixels to go; loop while the remainder is still > 0
        subs            w10, w10, #16
        bgt             10b

        POP_VREGS
99:
        ret
endfunc
/ / = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
/ / = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
/ /
/ /
/ / void f f _ b w d i f _ f i l t e r _ e d g e _ n e o n (
/ / void f f _ b w d i f _ f i l t e r _ e d g e _ n e o n (