pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords:         db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
pd_16384:            times 4 dd 16384
; pshufb control that byteswaps each of the four dwords in an xmm register
pb_bswap32:          db 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12

SECTION_TEXT
INIT_XMM sse
BUTTERFLIES_FLOAT_INTERLEAVE
INIT_YMM avx
BUTTERFLIES_FLOAT_INTERLEAVE
INIT_XMM sse2
; Byteswap the dwords in [r1] into [r0] using word shuffles + byte shifts.
; Consumes 8 dwords per main-loop iteration, then up to 4 more; leaves the
; remaining (r2 & 3) dwords for the scalar tail in the caller.
; %1 = a/u: aligned/unaligned load suffix for mov%1 (stores are always aligned)
%macro BSWAP_LOOPS_SSE2 1
    mov      r3, r2            ; keep full dword count for the tail
    sar      r2, 3             ; r2 = number of 8-dword iterations
    jz       .left4_%1
.loop8_%1:
    mov%1    m0, [r1 +  0]
    mov%1    m1, [r1 + 16]
    pshuflw  m0, m0, 10110001b ; swap the two words of each dword (low qword)
    pshuflw  m1, m1, 10110001b
    pshufhw  m0, m0, 10110001b ; same for the high qword
    pshufhw  m1, m1, 10110001b
    mova     m2, m0
    mova     m3, m1
    psllw    m0, 8             ; swap the two bytes of each word:
    psllw    m1, 8             ; (w << 8) | (w >> 8)
    psrlw    m2, 8
    psrlw    m3, 8
    por      m2, m0
    por      m3, m1
    mova     [r0 +  0], m2
    mova     [r0 + 16], m3
    add      r1, 32
    add      r0, 32
    dec      r2
    jnz      .loop8_%1
.left4_%1:
    mov      r2, r3
    and      r3, 4             ; at least 4 dwords left?
    jz       .left
    mov%1    m0, [r1]
    pshuflw  m0, m0, 10110001b
    pshufhw  m0, m0, 10110001b
    mova     m2, m0
    psllw    m0, 8
    psrlw    m2, 8
    por      m2, m0
    mova     [r0], m2
    add      r1, 16
    add      r0, 16
%endmacro
; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
; SSE2 version: picks the aligned or unaligned loop on src alignment,
; then byteswaps the last (w & 3) dwords with scalar bswap.
cglobal bswap32_buf, 3,4,5
    mov      r3, r1
    and      r3, 15            ; is src 16-byte aligned?
    jz       .start_align
    BSWAP_LOOPS_SSE2 u
    jmp      .left
.start_align:
    BSWAP_LOOPS_SSE2 a
.left:
    and      r2, 3             ; 0-3 dwords remain
    jz       .end
.loop2:
    mov      r3d, [r1]
    bswap    r3d
    mov      [r0], r3d
    add      r1, 4
    add      r0, 4
    dec      r2
    jnz      .loop2
.end:                          ; colon was missing in the original
    RET
; Byteswap the dwords in [r1] into [r0] with a single pshufb per register.
; Expects m2 to hold the pb_bswap32 shuffle mask (loaded by the caller).
; Consumes 8 dwords per main-loop iteration, then up to 4 more; leaves the
; remaining (r2 & 3) dwords for the caller's tail.
; %1 = a/u: aligned/unaligned load suffix for mov%1 (stores are always aligned)
%macro BSWAP_LOOPS_SSSE3 1
    mov      r3, r2            ; keep full dword count for the tail
    sar      r2, 3             ; r2 = number of 8-dword iterations
    jz       .left4_%1
.loop8_%1:
    mov%1    m0, [r1 +  0]
    mov%1    m1, [r1 + 16]
    pshufb   m0, m2            ; byteswap all 4 dwords at once
    pshufb   m1, m2
    mova     [r0 +  0], m0
    mova     [r0 + 16], m1
    add      r0, 32
    add      r1, 32
    dec      r2
    jnz      .loop8_%1
.left4_%1:
    mov      r2, r3
    and      r3, 4             ; at least 4 dwords left?
    jz       .left2
    mov%1    m0, [r1]
    pshufb   m0, m2
    mova     [r0], m0
    add      r1, 16
    add      r0, 16
%endmacro
INIT_XMM ssse3
; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
; SSSE3 version: pshufb-based loops, then a 2-dword movq/pshufb step and a
; final scalar bswap for the last dword if needed.
cglobal bswap32_buf, 3,4,3
    mov      r3, r1
    mova     m2, [pb_bswap32]  ; byte-shuffle mask used by BSWAP_LOOPS_SSSE3
    and      r3, 15            ; is src 16-byte aligned?
    jz       .start_align
    BSWAP_LOOPS_SSSE3 u
    jmp      .left2
.start_align:
    BSWAP_LOOPS_SSSE3 a
.left2:
    mov      r3, r2
    and      r2, 2             ; 2 dwords left?
    jz       .left1
    movq     m0, [r1]
    pshufb   m0, m2
    movq     [r0], m0
    add      r1, 8
    add      r0, 8
.left1:
    and      r3, 1             ; final odd dword via scalar bswap
    jz       .end
    mov      r2d, [r1]
    bswap    r2d
    mov      [r0], r2d
.end:
    RET