/*
 * Copyright (C) 2002 Frederic 'dilb' Boulay
 *
 * Author: Frederic Boulay <dilb@handhelds.org>
 *
 * The function defined in this file is derived from the simple_idct function
 * from the libavcodec library part of the FFmpeg project.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/arm/asm.S"

/* useful constants for the algorithm */

/* Integer cosine weights used by the row and column butterflies. */
#define W1  22725
#define W2  21407
#define W3  19266
#define W4  16383
#define W5  12873
#define W6  8867
#define W7  4520

/* Mask keeping the most significant half-word of a 32-bit word. */
#define MASK_MSHW 0xFFFF0000

/* Row pass: results are scaled down by ROW_SHIFT bits. */
#define ROW_SHIFT 11
/* Left shift that puts a row result, already scaled by ROW_SHIFT, into the top half-word. */
#define ROW_SHIFT2MSHW (16-11)
/* Column pass: results are scaled down by COL_SHIFT bits. */
#define COL_SHIFT 20

/* Rounding terms added to a0 before the final shifts. */
#define ROW_SHIFTED_1 1024   /* 1<< (ROW_SHIFT-1) */
#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */
function ff_simple_idct_arm, export=1
        @@ void simple_idct_arm(int16_t *block)
        @@
        @@ In-place inverse DCT of an 8x8 block of 16-bit coefficients.
        @@   In:    R0 = block (row stride is 16 bytes, column stride is 2 bytes)
        @@   Out:   the block is overwritten with the transformed values
        @@   Saved: R4-R11 and LR are pushed on entry; the exit pops them, LR going into PC
        @@   Stack: 8 bytes of locals on top of the saved registers (only sp+0 is used)
        @@   R12 is never touched; every constant is fetched with ldr rX, =constant.
        @@
        @@ Pass 1 (__row_loop) walks the rows from row 7 down to row 0.
        @@ Pass 2 (__col_loop) walks the columns from column 7 down to column 0.
        @@ The row pass reads and writes coefficient pairs as 32-bit words and packs
        @@ them by hand, so it is little-endian only.
        @@ NOTE(review): those word accesses presumably need block to be 4-byte
        @@ aligned on cores without unaligned ldr/str support - confirm with callers.

        @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer
        @@ to block, so it is stored on the stack before being reused.
        stmfd   sp!, {r4-r11, r14}      @ R14 is also called LR
        @@ at this point, R0=block, other registers are free.
        add     r14, r0, #112           @ R14=&block[8*7], start from the last row and go down until R14=block.
        @@ reserve 8 bytes of locals; only the first word is used (saved block pointer)
        sub     sp, sp, #8              @ allow 2 local variables
        str     r0, [sp, #0]            @ save block in sp[0]
        @@ stack status
        @@ sp+4   free
        @@ sp+0   R0  (block)

        @@ at this point, R0=block, R14=&block[56], R1-R11 free

__row_loop:
        @@ read the row and check if it is null, almost null, or not. According to
        @@ strongarm specs, it is not necessary to optimize ldr accesses (i.e. split
        @@ 32 bits in two 16-bit words), at least it gives more usable registers :)
        ldr     r1, [r14, #0]           @ R1=(int32)(R14)[0]=ROWr32[0] (relative row cast to a 32b pointer)
        ldr     r2, [r14, #4]           @ R2=(int32)(R14)[1]=ROWr32[1]
        ldr     r3, [r14, #8]           @ R3=ROWr32[2]
        ldr     r4, [r14, #12]          @ R4=ROWr32[3]
        @@ if all the words are null, proceed with the next row (branch __end_row_loop),
        @@ if ROWr16[0] is the only one not null, handle this special case
        @@ (branch __almost_empty_row), else follow the complete algorithm.
        @@ at this point, R0=block, R14=&block[n], R1=ROWr32[0], R2=ROWr32[1],
        @@                R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
        orr     r5, r4, r3              @ R5=R4 | R3
        orr     r5, r5, r2              @ R5=R4 | R3 | R2
        orrs    r6, r5, r1              @ Test R5 | R1 (the aim is to check if everything is null)
        beq     __end_row_loop
        mov     r7, r1, asr #16         @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
        ldrsh   r6, [r14, #0]           @ R6=ROWr16[0]
        orrs    r5, r5, r7              @ R5=R4 | R3 | R2 | R7
        beq     __almost_empty_row

@@ __b_evaluation:
        @@ at this point, R0=block (temp), R1 (free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
        @@     R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
        @@     R14=&block[n]
        @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3

        @@ MUL16(b0, W1, row[1]);
        @@ MUL16(b1, W3, row[1]);
        @@ MUL16(b2, W5, row[1]);
        @@ MUL16(b3, W7, row[1]);
        @@ MAC16(b0, W3, row[3]);
        @@ MAC16(b1, -W7, row[3]);
        @@ MAC16(b2, -W1, row[3]);
        @@ MAC16(b3, -W5, row[3]);
        @@ (the row value is always the second multiplier operand, to have the
        @@ possibility to save 1 cycle)
        ldr     r8, =W1                 @ R8=W1
        mov     r2, r2, asr #16         @ R2=ROWr16[3]
        mul     r0, r8, r7              @ R0=W1*ROWr16[1]=b0
        ldr     r9, =W3                 @ R9=W3
        ldr     r10, =W5                @ R10=W5
        mul     r1, r9, r7              @ R1=W3*ROWr16[1]=b1
        ldr     r11, =W7                @ R11=W7
        mul     r5, r10, r7             @ R5=W5*ROWr16[1]=b2
        mul     r7, r11, r7             @ R7=W7*ROWr16[1]=b3
        teq     r2, #0                  @ if null avoid muls
        itttt   ne
        mlane   r0, r9, r2, r0          @ R0+=W3*ROWr16[3]=b0
        rsbne   r2, r2, #0              @ R2=-ROWr16[3]
        mlane   r1, r11, r2, r1         @ R1-=W7*ROWr16[3]=b1
        mlane   r5, r8, r2, r5          @ R5-=W1*ROWr16[3]=b2
        it      ne
        mlane   r7, r10, r2, r7         @ R7-=W5*ROWr16[3]=b3

        @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
        @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
        @@     R14=&block[n]
        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
        @@ if (temp != 0) {}
        orrs    r2, r3, r4              @ R2=ROWr32[2] | ROWr32[3], kept for the test in the a0-a3 part
        beq     __end_b_evaluation

        @@ at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3=ROWr32[2], R4=ROWr32[3],
        @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
        @@     R14=&block[n]
        @@ MAC16(b0, W5, row[5]);
        @@ MAC16(b2, W7, row[5]);
        @@ MAC16(b3, W3, row[5]);
        @@ MAC16(b1, -W1, row[5]);
        @@ MAC16(b0, W7, row[7]);
        @@ MAC16(b2, W3, row[7]);
        @@ MAC16(b3, -W1, row[7]);
        @@ MAC16(b1, -W5, row[7]);
        mov     r3, r3, asr #16         @ R3=ROWr16[5]
        teq     r3, #0                  @ if null avoid muls
        it      ne
        mlane   r0, r10, r3, r0         @ R0+=W5*ROWr16[5]=b0
        mov     r4, r4, asr #16         @ R4=ROWr16[7] (mov leaves the flags of the teq untouched)
        itttt   ne
        mlane   r5, r11, r3, r5         @ R5+=W7*ROWr16[5]=b2
        mlane   r7, r9, r3, r7          @ R7+=W3*ROWr16[5]=b3
        rsbne   r3, r3, #0              @ R3=-ROWr16[5]
        mlane   r1, r8, r3, r1          @ R1-=W1*ROWr16[5]=b1
        @@ R3 is free now
        teq     r4, #0                  @ if null avoid muls
        itttt   ne
        mlane   r0, r11, r4, r0         @ R0+=W7*ROWr16[7]=b0
        mlane   r5, r9, r4, r5          @ R5+=W3*ROWr16[7]=b2
        rsbne   r4, r4, #0              @ R4=-ROWr16[7]
        mlane   r7, r8, r4, r7          @ R7-=W1*ROWr16[7]=b3
        it      ne
        mlane   r1, r10, r4, r1         @ R1-=W5*ROWr16[7]=b1
        @@ R4 is free now
__end_b_evaluation:
        @@ at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
        @@     R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@     R14=&block[n]

@@ __a_evaluation:
        @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
        @@ a1 = a0 + W6 * row[2];
        @@ a2 = a0 - W6 * row[2];
        @@ a3 = a0 - W2 * row[2];
        @@ a0 = a0 + W2 * row[2];
        ldr     r9, =W4                 @ R9=W4
        mul     r6, r9, r6              @ R6=W4*ROWr16[0]
        ldr     r10, =W6                @ R10=W6
        ldrsh   r4, [r14, #4]           @ R4=ROWr16[2] (a3 not defined yet)
        add     r6, r6, #ROW_SHIFTED_1  @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)

        mul     r11, r10, r4            @ R11=W6*ROWr16[2]
        ldr     r8, =W2                 @ R8=W2
        sub     r3, r6, r11             @ R3=a0-W6*ROWr16[2] (a2)
        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
        @@ if (temp != 0) {}
        teq     r2, #0
        beq     __end_bef_a_evaluation  @ row[4..7] are all null: short path placed after the function exit

        add     r2, r6, r11             @ R2=a0+W6*ROWr16[2] (a1)
        mul     r11, r8, r4             @ R11=W2*ROWr16[2]
        sub     r4, r6, r11             @ R4=a0-W2*ROWr16[2] (a3)
        add     r6, r6, r11             @ R6=a0+W2*ROWr16[2] (a0)

        @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
        @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
        @@     R14=&block[n]

        @@ a0 += W4*row[4]
        @@ a1 -= W4*row[4]
        @@ a2 -= W4*row[4]
        @@ a3 += W4*row[4]
        ldrsh   r11, [r14, #8]          @ R11=ROWr16[4]
        teq     r11, #0                 @ if null avoid muls
        it      ne
        mulne   r11, r9, r11            @ R11=W4*ROWr16[4]
        @@ R9 is free now
        ldrsh   r9, [r14, #12]          @ R9=ROWr16[6] (the load leaves the flags of the teq untouched)
        itttt   ne
        addne   r6, r6, r11             @ R6+=W4*ROWr16[4] (a0)
        subne   r2, r2, r11             @ R2-=W4*ROWr16[4] (a1)
        subne   r3, r3, r11             @ R3-=W4*ROWr16[4] (a2)
        addne   r4, r4, r11             @ R4+=W4*ROWr16[4] (a3)
        @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
        teq     r9, #0                  @ if null avoid muls
        itttt   ne
        mulne   r11, r10, r9            @ R11=W6*ROWr16[6]
        addne   r6, r6, r11             @ R6+=W6*ROWr16[6] (a0)
        mulne   r10, r8, r9             @ R10=W2*ROWr16[6]
        @@ a0 += W6*row[6];
        @@ a3 -= W6*row[6];
        @@ a1 -= W2*row[6];
        @@ a2 += W2*row[6];
        subne   r4, r4, r11             @ R4-=W6*ROWr16[6] (a3)
        itt     ne
        subne   r2, r2, r10             @ R2-=W2*ROWr16[6] (a1)
        addne   r3, r3, r10             @ R3+=W2*ROWr16[6] (a2)

__end_a_evaluation:
        @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
        @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@     R14=&block[n]
        @@ row[0] = (a0 + b0) >> ROW_SHIFT;
        @@ row[1] = (a1 + b1) >> ROW_SHIFT;
        @@ row[2] = (a2 + b2) >> ROW_SHIFT;
        @@ row[3] = (a3 + b3) >> ROW_SHIFT;
        @@ row[4] = (a3 - b3) >> ROW_SHIFT;
        @@ row[5] = (a2 - b2) >> ROW_SHIFT;
        @@ row[6] = (a1 - b1) >> ROW_SHIFT;
        @@ row[7] = (a0 - b0) >> ROW_SHIFT;
        add     r8, r6, r0              @ R8=a0+b0
        add     r9, r2, r1              @ R9=a1+b1
        @@ put two 16-bit half-words in a 32-bit word
        @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only little-endian compliant then!!!)
        ldr     r10, =MASK_MSHW         @ R10=0xFFFF0000
        and     r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
        mvn     r11, r10                @ R11= NOT R10= 0x0000FFFF
        and     r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
        orr     r8, r8, r9
        str     r8, [r14, #0]           @ row[0], row[1]

        add     r8, r3, r5              @ R8=a2+b2
        add     r9, r4, r7              @ R9=a3+b3
        and     r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
        and     r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
        orr     r8, r8, r9
        str     r8, [r14, #4]           @ row[2], row[3]

        sub     r8, r4, r7              @ R8=a3-b3
        sub     r9, r3, r5              @ R9=a2-b2
        and     r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
        and     r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
        orr     r8, r8, r9
        str     r8, [r14, #8]           @ row[4], row[5]

        sub     r8, r2, r1              @ R8=a1-b1
        sub     r9, r6, r0              @ R9=a0-b0
        and     r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
        and     r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
        orr     r8, r8, r9
        str     r8, [r14, #12]          @ row[6], row[7]

        bal     __end_row_loop

__almost_empty_row:
        @@ the row was empty, except ROWr16[0], now, management of this special case
        @@ at this point, R0=block, R14=&block[n], R1=ROWr32[0], R2=ROWr32[1],
        @@                R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
        @@                R8=0xFFFF (temp), R9-R11 free
        mov     r8, #0x10000            @ first step of R8=0xFFFF (2 steps needed!), it saves a ldr call (because of delay run).
        sub     r8, r8, #1              @ R8=0xFFFF is now ready.
        and     r5, r8, r6, lsl #3      @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
        orr     r5, r5, r5, lsl #16     @ R5=R5 | (R5<<16), the same value in both half-words
        str     r5, [r14, #0]           @ R14[0]=ROWr32[0]=R5
        str     r5, [r14, #4]           @ R14[4]=ROWr32[1]=R5
        str     r5, [r14, #8]           @ R14[8]=ROWr32[2]=R5
        str     r5, [r14, #12]          @ R14[12]=ROWr32[3]=R5

__end_row_loop:
        @@ at this point, R0-R11 (free)
        @@     R14=&block[n]
        ldr     r0, [sp, #0]            @ R0=block
        teq     r0, r14                 @ compare current &block[8*n] to block, when block is reached, the loop is finished.
        sub     r14, r14, #16           @ previous row (sub without s keeps the flags of the teq)
        bne     __row_loop

        @@ at this point, R0=block, R1-R11 (free)
        @@     R14=&block[n]
        add     r14, r0, #14            @ R14=&block[7], start from the last column and go down until R14=block.
__col_loop:

@@ __b_evaluation2:
        @@ at this point, R0=block (temp), R1-R11 (free)
        @@     R14=&block[n]
        @@ proceed with b0-b3 first, followed by a0-a3
        @@ MUL16(b0, W1, col[8x1]);
        @@ MUL16(b1, W3, col[8x1]);
        @@ MUL16(b2, W5, col[8x1]);
        @@ MUL16(b3, W7, col[8x1]);
        @@ MAC16(b0, W3, col[8x3]);
        @@ MAC16(b1, -W7, col[8x3]);
        @@ MAC16(b2, -W1, col[8x3]);
        @@ MAC16(b3, -W5, col[8x3]);
        @@ (the column value is always the second multiplier operand, to have the
        @@ possibility to save 1 cycle)
        ldr     r8, =W1                 @ R8=W1
        ldrsh   r7, [r14, #16]          @ R7=COLr16[8x1]
        mul     r0, r8, r7              @ R0=W1*COLr16[8x1]=b0
        ldr     r9, =W3                 @ R9=W3
        ldr     r10, =W5                @ R10=W5
        mul     r1, r9, r7              @ R1=W3*COLr16[8x1]=b1
        ldr     r11, =W7                @ R11=W7
        mul     r5, r10, r7             @ R5=W5*COLr16[8x1]=b2
        ldrsh   r2, [r14, #48]          @ R2=COLr16[8x3]
        mul     r7, r11, r7             @ R7=W7*COLr16[8x1]=b3
        teq     r2, #0                  @ if 0, then avoid muls
        itttt   ne
        mlane   r0, r9, r2, r0          @ R0+=W3*COLr16[8x3]=b0
        rsbne   r2, r2, #0              @ R2=-COLr16[8x3]
        mlane   r1, r11, r2, r1         @ R1-=W7*COLr16[8x3]=b1
        mlane   r5, r8, r2, r5          @ R5-=W1*COLr16[8x3]=b2
        it      ne
        mlane   r7, r10, r2, r7         @ R7-=W5*COLr16[8x3]=b3

        @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
        @@     R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
        @@     R14=&block[n]
        @@ MAC16(b0, W5, col[5x8]);
        @@ MAC16(b2, W7, col[5x8]);
        @@ MAC16(b3, W3, col[5x8]);
        @@ MAC16(b1, -W1, col[5x8]);
        @@ MAC16(b0, W7, col[7x8]);
        @@ MAC16(b2, W3, col[7x8]);
        @@ MAC16(b3, -W1, col[7x8]);
        @@ MAC16(b1, -W5, col[7x8]);
        ldrsh   r3, [r14, #80]          @ R3=COLr16[5x8]
        teq     r3, #0                  @ if 0 then avoid muls
        itttt   ne
        mlane   r0, r10, r3, r0         @ R0+=W5*COLr16[5x8]=b0
        mlane   r5, r11, r3, r5         @ R5+=W7*COLr16[5x8]=b2
        mlane   r7, r9, r3, r7          @ R7+=W3*COLr16[5x8]=b3
        rsbne   r3, r3, #0              @ R3=-COLr16[5x8]
        ldrsh   r4, [r14, #112]         @ R4=COLr16[7x8] (the load leaves the flags of the teq untouched)
        it      ne
        mlane   r1, r8, r3, r1          @ R1-=W1*COLr16[5x8]=b1
        @@ R3 is free now
        teq     r4, #0                  @ if 0 then avoid muls
        itttt   ne
        mlane   r0, r11, r4, r0         @ R0+=W7*COLr16[7x8]=b0
        mlane   r5, r9, r4, r5          @ R5+=W3*COLr16[7x8]=b2
        rsbne   r4, r4, #0              @ R4=-COLr16[7x8]
        mlane   r7, r8, r4, r7          @ R7-=W1*COLr16[7x8]=b3
        it      ne
        mlane   r1, r10, r4, r1         @ R1-=W5*COLr16[7x8]=b1
        @@ R4 is free now
@@ __end_b_evaluation2:
        @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
        @@     R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@     R14=&block[n]

@@ __a_evaluation2:
        @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
        @@ a1 = a0 + W6 * col[8x2];
        @@ a2 = a0 - W6 * col[8x2];
        @@ a3 = a0 - W2 * col[8x2];
        @@ a0 = a0 + W2 * col[8x2];
        ldrsh   r6, [r14, #0]           @ R6=COLr16[8x0]
        ldr     r9, =W4                 @ R9=W4
        mul     r6, r9, r6              @ R6=W4*COLr16[8x0]
        ldr     r10, =W6                @ R10=W6
        ldrsh   r4, [r14, #32]          @ R4=COLr16[8x2] (a3 not defined yet)
        add     r6, r6, #COL_SHIFTED_1  @ R6=W4*COLr16[8x0] + 1<<(COL_SHIFT-1) (a0)

        mul     r11, r10, r4            @ R11=W6*COLr16[8x2]
        ldr     r8, =W2                 @ R8=W2
        add     r2, r6, r11             @ R2=a0+W6*COLr16[8x2] (a1)
        sub     r3, r6, r11             @ R3=a0-W6*COLr16[8x2] (a2)
        mul     r11, r8, r4             @ R11=W2*COLr16[8x2]
        sub     r4, r6, r11             @ R4=a0-W2*COLr16[8x2] (a3)
        add     r6, r6, r11             @ R6=a0+W2*COLr16[8x2] (a0)

        @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
        @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
        @@     R14=&block[n]
        @@ a0 += W4*col[8x4]
        @@ a1 -= W4*col[8x4]
        @@ a2 -= W4*col[8x4]
        @@ a3 += W4*col[8x4]
        ldrsh   r11, [r14, #64]         @ R11=COLr16[8x4]
        teq     r11, #0                 @ if null avoid muls
        itttt   ne
        mulne   r11, r9, r11            @ R11=W4*COLr16[8x4]
        @@ R9 is free now
        addne   r6, r6, r11             @ R6+=W4*COLr16[8x4] (a0)
        subne   r2, r2, r11             @ R2-=W4*COLr16[8x4] (a1)
        subne   r3, r3, r11             @ R3-=W4*COLr16[8x4] (a2)
        ldrsh   r9, [r14, #96]          @ R9=COLr16[8x6] (the load leaves the flags of the teq untouched)
        it      ne
        addne   r4, r4, r11             @ R4+=W4*COLr16[8x4] (a3)
        @@ W6 alone is no more useful, save W2*COLr16[8x6] in it instead
        teq     r9, #0                  @ if null avoid muls
        itttt   ne
        mulne   r11, r10, r9            @ R11=W6*COLr16[8x6]
        addne   r6, r6, r11             @ R6+=W6*COLr16[8x6] (a0)
        mulne   r10, r8, r9             @ R10=W2*COLr16[8x6]
        @@ a0 += W6*col[8x6];
        @@ a3 -= W6*col[8x6];
        @@ a1 -= W2*col[8x6];
        @@ a2 += W2*col[8x6];
        subne   r4, r4, r11             @ R4-=W6*COLr16[8x6] (a3)
        itt     ne
        subne   r2, r2, r10             @ R2-=W2*COLr16[8x6] (a1)
        addne   r3, r3, r10             @ R3+=W2*COLr16[8x6] (a2)
@@ __end_a_evaluation2:
        @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
        @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@     R14=&block[n]
        @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
        @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
        @@ col[16] = ((a2 + b2) >> COL_SHIFT);
        @@ col[24] = ((a3 + b3) >> COL_SHIFT);
        @@ col[32] = ((a3 - b3) >> COL_SHIFT);
        @@ col[40] = ((a2 - b2) >> COL_SHIFT);
        @@ col[48] = ((a1 - b1) >> COL_SHIFT);
        @@ col[56] = ((a0 - b0) >> COL_SHIFT);
        @@@@@ no optimization here @@@@@
        add     r8, r6, r0              @ R8=a0+b0
        add     r9, r2, r1              @ R9=a1+b1
        mov     r8, r8, asr #COL_SHIFT
        mov     r9, r9, asr #COL_SHIFT
        strh    r8, [r14, #0]           @ col[0]
        strh    r9, [r14, #16]          @ col[8]
        add     r8, r3, r5              @ R8=a2+b2
        add     r9, r4, r7              @ R9=a3+b3
        mov     r8, r8, asr #COL_SHIFT
        mov     r9, r9, asr #COL_SHIFT
        strh    r8, [r14, #32]          @ col[16]
        strh    r9, [r14, #48]          @ col[24]
        sub     r8, r4, r7              @ R8=a3-b3
        sub     r9, r3, r5              @ R9=a2-b2
        mov     r8, r8, asr #COL_SHIFT
        mov     r9, r9, asr #COL_SHIFT
        strh    r8, [r14, #64]          @ col[32]
        strh    r9, [r14, #80]          @ col[40]
        sub     r8, r2, r1              @ R8=a1-b1
        sub     r9, r6, r0              @ R9=a0-b0
        mov     r8, r8, asr #COL_SHIFT
        mov     r9, r9, asr #COL_SHIFT
        strh    r8, [r14, #96]          @ col[48]
        strh    r9, [r14, #112]         @ col[56]

@@ __end_col_loop:
        @@ at this point, R0-R11 (free)
        @@     R14=&block[n]
        ldr     r0, [sp, #0]            @ R0=block
        teq     r0, r14                 @ compare current &block[n] to block, when block is reached, the loop is finished.
        sub     r14, r14, #2            @ previous column (sub without s keeps the flags of the teq)
        bne     __col_loop

@@ __end_simple_idct_arm:
        @@ restore registers to previous status!
        add     sp, sp, #8              @@ the local variables!
        ldmfd   sp!, {r4-r11, r15}      @@ update PC with LR content.

@@ kind of sub-function, here not to overload the common case.
@@ Reached from the row pass when ROWr32[2] | ROWr32[3] is null: it finishes
@@ a1, a3 and a0 from row[0] and row[2] only, then rejoins the row output code.
@@ On entry R6=a0, R11=W6*ROWr16[2], R8=W2, R4=ROWr16[2], R3=a2.
__end_bef_a_evaluation:
        add     r2, r6, r11             @ R2=a0+W6*ROWr16[2] (a1)
        mul     r11, r8, r4             @ R11=W2*ROWr16[2]
        sub     r4, r6, r11             @ R4=a0-W2*ROWr16[2] (a3)
        add     r6, r6, r11             @ R6=a0+W2*ROWr16[2] (a0)
        bal     __end_a_evaluation
endfunc