@ -25,8 +25,7 @@
# include " l i b a v u t i l / a r m / a s m . S "
/* useful constants for the algorithm, they are saved in __constant_ptr__ at */
/* the end of the source code.*/
/* useful constants for the algorithm */
# define W 1 2 2 7 2 5
# define W 2 2 1 4 0 7
# define W 3 1 9 2 6 6
@ -36,16 +35,6 @@
# define W 7 4 5 2 0
# define M A S K _ M S H W 0 x F F F F 0 0 0 0
/* Byte offsets of the constants in the __constant_ptr__ vector: one 4-byte */
/* .word entry per constant, in the order W1..W7 then MASK_MSHW. They are   */
/* used as the immediate of r12-relative loads, r12 holding the table base. */
# define o f f W 1 0
# define o f f W 2 4
# define o f f W 3 8
# define o f f W 4 1 2
# define o f f W 5 1 6
# define o f f W 6 2 0
# define o f f W 7 2 4
# define o f f M A S K _ M S H W 2 8
# define R O W _ S H I F T 1 1
# define R O W _ S H I F T 2 M S H W ( 1 6 - 1 1 )
# define C O L _ S H I F T 2 0
@ -63,7 +52,6 @@ function ff_simple_idct_arm, export=1
stmfd s p ! , { r4 - r11 , r14 } @ R14 is also called LR
@@ at this point, R0=block, other registers are free.
add r14 , r0 , #112 @ R14=&block[8*7], better start from the last row, and decrease the value until row=0, i.e. R12=block.
adr r12 , _ _ c o n s t a n t _ p t r _ _ @ R12=__constant_ptr__, the vector containing the constants, probably not necessary to reserve a register for it
@@ add 2 temporary variables in the stack: R0 and R14
sub s p , s p , #8 @ allow 2 local variables
str r0 , [ s p , #0 ] @ save block in sp[0]
@ -109,13 +97,13 @@ __b_evaluation:
@@ MAC16(b1, -W7, row[3]);
@@ MAC16(b2, -W1, row[3]);
@@ MAC16(b3, -W5, row[3]);
ldr r8 , [ r12 , #o f f W 1 ] @ R8=W1
ldr r8 , =W1 @ R8=W1
mov r2 , r2 , a s r #16 @ R2=ROWr16[3]
mul r0 , r8 , r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
ldr r9 , [ r12 , #o f f W 3 ] @ R9=W3
ldr r10 , [ r12 , #o f f W 5 ] @ R10=W5
ldr r9 , =W3 @ R9=W3
ldr r10 , =W5 @ R10=W5
mul r1 , r9 , r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
ldr r11 , [ r12 , #o f f W 7 ] @ R11=W7
ldr r11 , =W7 @ R11=W7
mul r5 , r10 , r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
mul r7 , r11 , r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
teq r2 , #0 @ if null avoid muls
@ -177,14 +165,14 @@ __a_evaluation:
@@ a2 = a0 - W6 * row[2];
@@ a3 = a0 - W2 * row[2];
@@ a0 = a0 + W2 * row[2];
ldr r9 , [ r12 , #o f f W 4 ] @ R9=W4
ldr r9 , =W4 @ R9=W4
mul r6 , r9 , r6 @ R6=W4*ROWr16[0]
ldr r10 , [ r12 , #o f f W 6 ] @ R10=W6
ldr r10 , =W6 @ R10=W6
ldrsh r4 , [ r14 , #4 ] @ R4=ROWr16[2] (a3 not defined yet)
add r6 , r6 , #R O W _ S H I F T E D _ 1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)
mul r11 , r10 , r4 @ R11=W6*ROWr16[2]
ldr r8 , [ r12 , #o f f W 2 ] @ R8=W2
ldr r8 , =W2 @ R8=W2
sub r3 , r6 , r11 @ R3=a0-W6*ROWr16[2] (a2)
@@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
@@ if (temp != 0) {}
@ -248,7 +236,7 @@ __end_a_evaluation:
add r9 , r2 , r1 @ R9=a1+b1
@@ put 2 16 bits half-words in a 32bits word
@@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!)
ldr r10 , [ r12 , #o f f M A S K _ M S H W ] @ R 10 =0xFFFF0000
ldr r10 , =MASK_MSHW @ R10=0xFFFF0000
and r9 , r10 , r9 , l s l #R O W _ S H I F T 2 M S H W @ R9=0xFFFF0000 & ((a1+b1)<<5)
mvn r11 , r10 @ R11= NOT R10= 0x0000FFFF
and r8 , r11 , r8 , a s r #R O W _ S H I F T @ R 8 =0x0000FFFF & ( ( a0 + b0 ) > > 1 1 )
@ -319,13 +307,13 @@ __b_evaluation2:
@@ MAC16(b1, -W7, col[8x3]);
@@ MAC16(b2, -W1, col[8x3]);
@@ MAC16(b3, -W5, col[8x3]);
ldr r8 , [ r12 , #o f f W 1 ] @ R8=W1
ldr r8 , =W1 @ R8=W1
ldrsh r7 , [ r14 , #16 ]
mul r0 , r8 , r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
ldr r9 , [ r12 , #o f f W 3 ] @ R9=W3
ldr r10 , [ r12 , #o f f W 5 ] @ R10=W5
ldr r9 , =W3 @ R9=W3
ldr r10 , =W5 @ R10=W5
mul r1 , r9 , r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
ldr r11 , [ r12 , #o f f W 7 ] @ R11=W7
ldr r11 , =W7 @ R11=W7
mul r5 , r10 , r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
ldrsh r2 , [ r14 , #48 ]
mul r7 , r11 , r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
@ -381,13 +369,13 @@ __a_evaluation2:
@@ a3 = a0 - W2 * row[2];
@@ a0 = a0 + W2 * row[2];
ldrsh r6 , [ r14 , #0 ]
ldr r9 , [ r12 , #o f f W 4 ] @ R9=W4
ldr r9 , =W4 @ R9=W4
mul r6 , r9 , r6 @ R6=W4*ROWr16[0]
ldr r10 , [ r12 , #o f f W 6 ] @ R10=W6
ldr r10 , =W6 @ R10=W6
ldrsh r4 , [ r14 , #32 ] @ R4=ROWr16[2] (a3 not defined yet)
add r6 , r6 , #C O L _ S H I F T E D _ 1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0)
mul r11 , r10 , r4 @ R11=W6*ROWr16[2]
ldr r8 , [ r12 , #o f f W 2 ] @ R8=W2
ldr r8 , =W2 @ R8=W2
add r2 , r6 , r11 @ R2=a0+W6*ROWr16[2] (a1)
sub r3 , r6 , r11 @ R3=a0-W6*ROWr16[2] (a2)
mul r11 , r8 , r4 @ R11=W2*ROWr16[2]
@ -489,15 +477,3 @@ __end_bef_a_evaluation:
sub r4 , r6 , r11 @ R4=a0-W2*ROWr16[2] (a3)
add r6 , r6 , r11 @ R6=a0+W2*ROWr16[2] (a0)
bal _ _ e n d _ a _ e v a l u a t i o n
@@ Table of the constants used by the algorithm, one .word per constant.
@@ The entry order must stay in sync with the byte-offset defines near the top
@@ of the file, since entries are addressed as base + offset.
.align
__constant_ptr__ : @@ see #defines at the beginning of the source code for values.
.word W1 @ byte offset 0
.word W2 @ byte offset 4
.word W3 @ byte offset 8
.word W4 @ byte offset 12
.word W5 @ byte offset 16
.word W6 @ byte offset 20
.word W7 @ byte offset 24
.word MASK_MSHW @ byte offset 28