@@ -133,23 +133,18 @@ SECTION .text
 ; %2 = rgb or bgr
 %macro RGB24_TO_Y_FN 2-3
 cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
-%if mmsize == 8
-    mova           m5, [%2_Ycoeff_12x4]
-    mova           m6, [%2_Ycoeff_3x56]
-%define coeff1 m5
-%define coeff2 m6
-%elif ARCH_X86_64
+%if ARCH_X86_64
     mova           m8, [%2_Ycoeff_12x4]
     mova           m9, [%2_Ycoeff_3x56]
 %define coeff1 m8
 %define coeff2 m9
-%else ; x86-32 && mmsize == 16
+%else ; x86-32
 %define coeff1 [%2_Ycoeff_12x4]
 %define coeff2 [%2_Ycoeff_3x56]
-%endif ; x86-32/64 && mmsize == 8/16
-%if (ARCH_X86_64 || mmsize == 8) && %0 == 3
+%endif ; x86-32/64
+%if ARCH_X86_64 && %0 == 3
     jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToY %+ SUFFIX).body
-%else ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
+%else ; ARCH_X86_64 && %0 == 3
 .body:
 %if cpuflag(ssse3)
     mova           m7, [shuf_rgb_12x4]
@@ -184,7 +179,6 @@ cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
     movd           m1, [srcq+2]    ; (byte) { R0, B1, G1, R1 }
     movd           m2, [srcq+6]    ; (byte) { B2, G2, R2, B3 }
     movd           m3, [srcq+8]    ; (byte) { R2, B3, G3, R3 }
-%if mmsize == 16 ; i.e. sse2
     punpckldq      m0, m2          ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
     punpckldq      m1, m3          ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
     movd           m2, [srcq+12]   ; (byte) { B4, G4, R4, B5 }
@@ -193,7 +187,6 @@ cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
     movd           m6, [srcq+20]   ; (byte) { R6, B7, G7, R7 }
     punpckldq      m2, m5          ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
     punpckldq      m3, m6          ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
-%endif ; mmsize == 16
     punpcklbw      m0, m7          ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
     punpcklbw      m1, m7          ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
     punpcklbw      m2, m7          ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
@@ -215,7 +208,7 @@ cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
     add            wq, mmsize
     jl .loop
     REP_RET
-%endif ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
+%endif ; ARCH_X86_64 && %0 == 3
 %endmacro
 
 ; %1 = nr. of XMM registers
@@ -275,12 +268,10 @@ cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
     movd           m1, [srcq+2]    ; (byte) { R0, B1, G1, R1 }
     movd           m4, [srcq+6]    ; (byte) { B2, G2, R2, B3 }
     movd           m5, [srcq+8]    ; (byte) { R2, B3, G3, R3 }
-%if mmsize == 16
     punpckldq      m0, m4          ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
     punpckldq      m1, m5          ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
     movd           m4, [srcq+12]   ; (byte) { B4, G4, R4, B5 }
     movd           m5, [srcq+14]   ; (byte) { R4, B5, G5, R5 }
-%endif ; mmsize == 16
     punpcklbw      m0, m7          ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
     punpcklbw      m1, m7          ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
 %endif ; cpuflag(ssse3)
@@ -294,12 +285,10 @@ cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
     pshufb         m5, m4, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
     pshufb         m4, shuf_rgb1   ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
 %else ; !cpuflag(ssse3)
-%if mmsize == 16
     movd           m1, [srcq+18]   ; (byte) { B6, G6, R6, B7 }
     movd           m3, [srcq+20]   ; (byte) { R6, B7, G7, R7 }
     punpckldq      m4, m1          ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
     punpckldq      m5, m3          ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
-%endif ; mmsize == 16 && !cpuflag(ssse3)
     punpcklbw      m4, m7          ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
     punpcklbw      m5, m7          ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
 %endif ; cpuflag(ssse3)
@@ -320,13 +309,8 @@ cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
     psrad          m4, 9
     packssdw       m0, m1          ; (word) { U[0-7] }
     packssdw       m2, m4          ; (word) { V[0-7] }
-%if mmsize == 8
-    mova   [dstUq+wq], m0
-    mova   [dstVq+wq], m2
-%else ; mmsize == 16
     mova   [dstUq+wq], m0
     mova   [dstVq+wq], m2
-%endif ; mmsize == 8/16
     add            wq, mmsize
     jl .loop
     REP_RET
@@ -342,11 +326,6 @@ RGB24_TO_UV_FN %2, rgb
 RGB24_TO_UV_FN %2, bgr, rgb
 %endmacro
 
-%if ARCH_X86_32
-INIT_MMX mmx
-RGB24_FUNCS 0, 0
-%endif
-
 INIT_XMM sse2
 RGB24_FUNCS 10, 12
@@ -483,13 +462,8 @@ cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
     psrad          m1, 9
     packssdw       m0, m4          ; (word) { U[0-7] }
     packssdw       m2, m1          ; (word) { V[0-7] }
-%if mmsize == 8
-    mova   [dstUq+wq], m0
-    mova   [dstVq+wq], m2
-%else ; mmsize == 16
     mova   [dstUq+wq], m0
     mova   [dstVq+wq], m2
-%endif ; mmsize == 8/16
     add            wq, mmsize
     jl .loop
     sub            wq, mmsize - 1
@@ -535,11 +509,6 @@ RGB32_TO_UV_FN %2, a, r, g, b, rgba
 RGB32_TO_UV_FN %2, a, b, g, r, rgba
 %endmacro
 
-%if ARCH_X86_32
-INIT_MMX mmx
-RGB32_FUNCS 0, 0
-%endif
-
 INIT_XMM sse2
 RGB32_FUNCS 8, 12
@@ -588,25 +557,18 @@ cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
     movsxd         wq, wd
 %endif
     add          dstq, wq
-%if mmsize == 16
     test         srcq, 15
-%endif
     lea          srcq, [srcq+wq*2]
 %ifidn %2, yuyv
     pcmpeqb        m2, m2          ; (byte) { 0xff } x 16
     psrlw          m2, 8           ; (word) { 0x00ff } x 8
 %endif ; yuyv
-%if mmsize == 16
     jnz .loop_u_start
     neg            wq
     LOOP_YUYV_TO_Y  a, %2
 .loop_u_start:
     neg            wq
     LOOP_YUYV_TO_Y  u, %2
-%else ; mmsize == 8
-    neg            wq
-    LOOP_YUYV_TO_Y  a, %2
-%endif ; mmsize == 8/16
 %endmacro
 
 ; %1 = a (aligned) or u (unaligned)
@@ -632,16 +594,9 @@ cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
     packuswb       m0, m1          ; (byte) { U0, V0, ..., U7, V7 }
     pand           m1, m0, m2      ; (word) { U0, U1, ..., U7 }
     psrlw          m0, 8           ; (word) { V0, V1, ..., V7 }
-%if mmsize == 16
     packuswb       m1, m0          ; (byte) { U0, ... U7, V1, ... V7 }
     movh   [dstUq+wq], m1
     movhps [dstVq+wq], m1
-%else ; mmsize == 8
-    packuswb       m1, m1          ; (byte) { U0, ... U3 }
-    packuswb       m0, m0          ; (byte) { V0, ... V3 }
-    movh   [dstUq+wq], m1
-    movh   [dstVq+wq], m0
-%endif ; mmsize == 8/16
     add            wq, mmsize / 2
     jl .loop_%1
     REP_RET
@@ -661,24 +616,24 @@ cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
 %endif
     add         dstUq, wq
     add         dstVq, wq
-%if mmsize == 16 && %0 == 2
+%if %0 == 2
     test         srcq, 15
 %endif
     lea          srcq, [srcq+wq*4]
     pcmpeqb        m2, m2          ; (byte) { 0xff } x 16
     psrlw          m2, 8           ; (word) { 0x00ff } x 8
     ; NOTE: if uyvy+avx, u/a are identical
-%if mmsize == 16 && %0 == 2
+%if %0 == 2
     jnz .loop_u_start
     neg            wq
     LOOP_YUYV_TO_UV a, %2
 .loop_u_start:
     neg            wq
     LOOP_YUYV_TO_UV u, %2
-%else ; mmsize == 8
+%else
     neg            wq
     LOOP_YUYV_TO_UV a, %2
-%endif ; mmsize == 8/16
+%endif
 %endmacro
 
 ; %1 = a (aligned) or u (unaligned)
@@ -716,35 +671,18 @@ cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
 %endif
     add         dstUq, wq
     add         dstVq, wq
-%if mmsize == 16
     test         srcq, 15
-%endif
     lea          srcq, [srcq+wq*2]
     pcmpeqb        m5, m5          ; (byte) { 0xff } x 16
     psrlw          m5, 8           ; (word) { 0x00ff } x 8
-%if mmsize == 16
     jnz .loop_u_start
     neg            wq
     LOOP_NVXX_TO_UV a, %2
 .loop_u_start:
     neg            wq
     LOOP_NVXX_TO_UV u, %2
-%else ; mmsize == 8
-    neg            wq
-    LOOP_NVXX_TO_UV a, %2
-%endif ; mmsize == 8/16
 %endmacro
 
-%if ARCH_X86_32
-INIT_MMX mmx
-YUYV_TO_Y_FN  0, yuyv
-YUYV_TO_Y_FN  0, uyvy
-YUYV_TO_UV_FN 0, yuyv
-YUYV_TO_UV_FN 0, uyvy
-NVXX_TO_UV_FN 0, nv12
-NVXX_TO_UV_FN 0, nv21
-%endif
-
 INIT_XMM sse2
 YUYV_TO_Y_FN  3, yuyv
 YUYV_TO_Y_FN  2, uyvy