|
|
|
@ -32,13 +32,14 @@ SECTION .text |
|
|
|
|
|
|
|
|
|
; %1=5-bit weights?, %2=dst, %3=src1, %4=src2, %5=stride (only passed for sse2) |
|
|
|
|
%macro RV40_WCORE 4-5 |
|
|
|
|
movh m4, [%3 + 0] |
|
|
|
|
movh m5, [%4 + 0] |
|
|
|
|
movh m4, [%3 + r6 + 0] |
|
|
|
|
movh m5, [%4 + r6 + 0] |
|
|
|
|
%if %0 == 4 |
|
|
|
|
%define OFFSET mmsize / 2 |
|
|
|
|
%define OFFSET r6 + mmsize / 2 |
|
|
|
|
%else |
|
|
|
|
; 8x8 block and sse2, stride was provided |
|
|
|
|
%define OFFSET %5 |
|
|
|
|
%define OFFSET r6 |
|
|
|
|
add r6, r5 |
|
|
|
|
%endif |
|
|
|
|
movh m6, [%3 + OFFSET] |
|
|
|
|
movh m7, [%4 + OFFSET] |
|
|
|
@ -99,10 +100,12 @@ SECTION .text |
|
|
|
|
packuswb m4, m6 |
|
|
|
|
%if %0 == 5 |
|
|
|
|
; Only called for 8x8 blocks and sse2 |
|
|
|
|
movh [%2 + 0], m4 |
|
|
|
|
movhps [%2 + %5], m4 |
|
|
|
|
sub r6, r5 |
|
|
|
|
movh [%2 + r6], m4 |
|
|
|
|
add r6, r5 |
|
|
|
|
movhps [%2 + r6], m4 |
|
|
|
|
%else |
|
|
|
|
mova [%2], m4 |
|
|
|
|
mova [%2 + r6], m4 |
|
|
|
|
%endif |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
@ -115,26 +118,19 @@ SECTION .text |
|
|
|
|
%endif |
|
|
|
|
|
|
|
|
|
; Prepare for next loop |
|
|
|
|
add r0, r5 |
|
|
|
|
add r1, r5 |
|
|
|
|
add r2, r5 |
|
|
|
|
add r6, r5 |
|
|
|
|
%else |
|
|
|
|
%ifidn %1, 8 |
|
|
|
|
RV40_WCORE %2, r0, r1, r2, r5 |
|
|
|
|
; Prepare 2 next lines |
|
|
|
|
lea r0, [r0 + 2 * r5] |
|
|
|
|
lea r1, [r1 + 2 * r5] |
|
|
|
|
lea r2, [r2 + 2 * r5] |
|
|
|
|
add r6, r5 |
|
|
|
|
%else |
|
|
|
|
RV40_WCORE %2, r0, r1, r2 |
|
|
|
|
; Prepare single next line |
|
|
|
|
add r0, r5 |
|
|
|
|
add r1, r5 |
|
|
|
|
add r2, r5 |
|
|
|
|
add r6, r5 |
|
|
|
|
%endif |
|
|
|
|
%endif |
|
|
|
|
|
|
|
|
|
dec r6 |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
; rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride) |
|
|
|
@ -145,7 +141,7 @@ SECTION .text |
|
|
|
|
; Therefore, we check here whether they are multiples of 2^9 for |
|
|
|
|
; those simplifications to occur. |
|
|
|
|
%macro RV40_WEIGHT 3 |
|
|
|
|
cglobal rv40_weight_func_%1_%2, 6, 7, %3 |
|
|
|
|
cglobal rv40_weight_func_%1_%2, 6, 7, 8 |
|
|
|
|
%if cpuflag(ssse3) |
|
|
|
|
mova m1, [shift_round] |
|
|
|
|
%else |
|
|
|
@ -153,11 +149,12 @@ cglobal rv40_weight_func_%1_%2, 6, 7, %3 |
|
|
|
|
%endif |
|
|
|
|
pxor m0, m0 |
|
|
|
|
; Set loop counter and increments |
|
|
|
|
%if mmsize == 8 |
|
|
|
|
mov r6, %2 |
|
|
|
|
%else |
|
|
|
|
mov r6, (%2 * %2) / mmsize |
|
|
|
|
%endif |
|
|
|
|
mov r6, r5 |
|
|
|
|
shl r6, %3 |
|
|
|
|
add r0, r6 |
|
|
|
|
add r1, r6 |
|
|
|
|
add r2, r6 |
|
|
|
|
neg r6 |
|
|
|
|
|
|
|
|
|
movd m2, r3 |
|
|
|
|
movd m3, r4 |
|
|
|
|