@ -25,11 +25,18 @@
SECTION _RODATA
pw_00ff: times 4 dw 255
pb_f8: times 8 db 248
pb_e0: times 8 db 224
pb_03: times 8 db 3
pb_07: times 8 db 7
; below variables are named like mask_dwXY, which means to preserve dword No.X & No.Y
mask_dw036 : db - 1 , - 1 , 0 , 0 , 0 , 0 , - 1 , - 1 , 0 , 0 , 0 , 0 , - 1 , - 1 , 0 , 0
mask_dw147 : db 0 , 0 , - 1 , - 1 , 0 , 0 , 0 , 0 , - 1 , - 1 , 0 , 0 , 0 , 0 , - 1 , - 1
mask_dw25 : db 0 , 0 , 0 , 0 , - 1 , - 1 , 0 , 0 , 0 , 0 , - 1 , - 1 , 0 , 0 , 0 , 0
rgb24_shuf1: db 0 , 1 , 6 , 7 , 12 , 13 , 2 , 3 , 8 , 9 , 14 , 15 , 4 , 5 , 10 , 11
rgb24_shuf2: db 10 , 11 , 0 , 1 , 6 , 7 , 12 , 13 , 2 , 3 , 8 , 9 , 14 , 15 , 4 , 5
rgb24_shuf3: db 4 , 5 , 10 , 11 , 0 , 1 , 6 , 7 , 12 , 13 , 2 , 3 , 8 , 9 , 14 , 15
pw_00ff: times 8 dw 255
pb_f8: times 16 db 248
pb_e0: times 16 db 224
pb_03: times 16 db 3
pb_07: times 16 db 7
mask_1101: dw - 1 , - 1 , 0 , - 1
mask_0010: dw 0 , 0 , - 1 , 0
@ -49,7 +56,11 @@ SECTION .text
;-----------------------------------------------------------------------------
% macro MOV_H2L 1
psrlq % 1 , 32
% if mmsize == 8
psrlq % 1 , 32
% else ; mmsize == 16
psrldq % 1 , 8
% endif
% endmacro
% macro yuv2rgb_fn 3
@ -77,6 +88,7 @@ psrlq %1, 32
% define m_blue m1
% endif
% if mmsize == 8
% define time_num 1
% define reg_num 8
% define y_offset [pointer_c_ditherq + 8 * 8]
@ -87,11 +99,45 @@ psrlq %1, 32
% define y_coff [pointer_c_ditherq + 3 * 8]
% define ub_coff [pointer_c_ditherq + 5 * 8]
% define vr_coff [pointer_c_ditherq + 4 * 8]
% elif mmsize == 16
% define time_num 2
% if ARCH_X86_32
% define reg_num 8
% define my_offset [pointer_c_ditherq + 8 * 8]
% define mu_offset [pointer_c_ditherq + 9 * 8]
% define mv_offset [pointer_c_ditherq + 10 * 8]
% define mug_coff [pointer_c_ditherq + 7 * 8]
% define mvg_coff [pointer_c_ditherq + 6 * 8]
% define my_coff [pointer_c_ditherq + 3 * 8]
% define mub_coff [pointer_c_ditherq + 5 * 8]
% define mvr_coff [pointer_c_ditherq + 4 * 8]
% else ; ARCH_X86_64
% define reg_num 16
% define y_offset m8
% define u_offset m9
% define v_offset m10
% define ug_coff m11
% define vg_coff m12
% define y_coff m13
% define ub_coff m14
% define vr_coff m15
% endif ; ARCH_X86_32/64
% endif ; coeff define mmsize == 8/16
cglobal % 1 _420_ % 2 % 3 , GPR_num , GPR_num , reg_num , parameters
% if ARCH_X86_64
movsxd indexq , indexd
% if mmsize == 16
VBROADCASTSD y_offset , [ pointer_c_ditherq + 8 * 8 ]
VBROADCASTSD u_offset , [ pointer_c_ditherq + 9 * 8 ]
VBROADCASTSD v_offset , [ pointer_c_ditherq + 10 * 8 ]
VBROADCASTSD ug_coff , [ pointer_c_ditherq + 7 * 8 ]
VBROADCASTSD vg_coff , [ pointer_c_ditherq + 6 * 8 ]
VBROADCASTSD y_coff , [ pointer_c_ditherq + 3 * 8 ]
VBROADCASTSD ub_coff , [ pointer_c_ditherq + 5 * 8 ]
VBROADCASTSD vr_coff , [ pointer_c_ditherq + 4 * 8 ]
% endif
% endif
mova m_y , [ py_2indexq + 2 * indexq ]
movh m_u , [ pu_indexq + indexq ]
@ -108,10 +154,30 @@ cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
psllw m1 , 3
psllw m6 , 3
psllw m7 , 3
% if (ARCH_X86_32 && mmsize == 16)
VBROADCASTSD m2 , mu_offset
VBROADCASTSD m3 , mv_offset
VBROADCASTSD m4 , my_offset
psubsw m0 , m2 ; U = U - 128
psubsw m1 , m3 ; V = V - 128
psubw m6 , m4
psubw m7 , m4
VBROADCASTSD m2 , mug_coff
VBROADCASTSD m3 , mvg_coff
VBROADCASTSD m4 , my_coff
VBROADCASTSD m5 , mub_coff
pmulhw m2 , m0
pmulhw m3 , m1
pmulhw m6 , m4
pmulhw m7 , m4
pmulhw m0 , m5
VBROADCASTSD m4 , mvr_coff
pmulhw m1 , m4
% else ; ARCH_X86_64 || mmsize == 8
psubsw m0 , u_offset ; U = U - 128
psubsw m1 , v_offset ; V = V - 128
psubw m6 , y_offset
psubw m7 , y_offset
psubw m6 , y_offset
psubw m7 , y_offset
mova m2 , m0
mova m3 , m1
pmulhw m2 , ug_coff
@ -120,6 +186,7 @@ cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
pmulhw m7 , y_coff
pmulhw m0 , ub_coff
pmulhw m1 , vr_coff
% endif
paddsw m2 , m3
mova m3 , m7
mova m5 , m7
@ -142,6 +209,7 @@ cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
punpcklbw m6 , m_red ; B0 R1 B2 R3 B4 R5 B6 R7 B8 R9 ...
mova m5 , m3
punpckhbw m2 , m_blue ; G1 B1 G3 B3 G5 B5 G7 B7 G9 B9 ...
% if mmsize == 8
punpcklwd m3 , m6 ; R0 G0 B0 R1 R2 G2 B2 R3
punpckhwd m5 , m6 ; R4 G4 B4 R5 R6 G6 B6 R7
% if cpuflag(mmxext)
@ -177,7 +245,33 @@ cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
psrlq m5 , 32
movd [ imageq + 20 ], m2 ; -- -- G7 B7
movd [ imageq + 18 ], m5 ; R6 G6 B6 R7
% endif
% endif ; mmsize = 8
% else ; mmsize == 16
pshufb m3 , [ rgb24_shuf1 ] ; r0 g0 r6 g6 r12 g12 r2 g2 r8 g8 r14 g14 r4 g4 r10 g10
pshufb m6 , [ rgb24_shuf2 ] ; b10 r11 b0 r1 b6 r7 b12 r13 b2 r3 b8 r9 b14 r15 b4 r5
pshufb m2 , [ rgb24_shuf3 ] ; g5 b5 g11 b11 g1 b1 g7 b7 g13 b13 g3 b3 g9 b9 g15 b15
mova m7 , [ mask_dw036 ]
mova m4 , [ mask_dw147 ]
mova m5 , [ mask_dw25 ]
pand m0 , m7 , m3 ; r0 g0 --- --- --- --- r2 g2 --- --- --- --- r4 g4 --- ---
pand m1 , m4 , m6 ; --- --- b0 r1 --- --- --- --- b2 r3 --- --- --- --- b4 r5
por m0 , m1
pand m1 , m5 , m2 ; --- --- --- --- g1 b1 --- --- --- --- g3 b3 --- --- --- ---
por m0 , m1 ; r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3 r4 g4 b4 r5
pand m1 , m7 , m2 ; g5 b5 --- --- --- --- g7 b7 --- --- --- --- g9 b9 --- ---
pand m7 , m6 ; b10 r11 --- --- --- --- b12 r13 --- --- --- --- b14 r15 --- ---
pand m6 , m5 ; --- --- --- --- b6 r7 --- --- --- --- b8 r9 --- --- --- ---
por m1 , m6
pand m6 , m4 , m3 ; --- --- r6 g6 --- --- --- --- r8 g8 --- --- --- --- r10 g10
pand m2 , m4 ; --- --- g11 b11 --- --- --- --- g13 b13 --- --- --- --- g15 b15
pand m3 , m5 ; --- --- --- --- r12 g12 --- --- --- --- r14 g14 --- --- --- ---
por m2 , m7
por m1 , m6 ; g5 b5 r6 g6 b6 r7 g7 b7 r8 g8 b8 r9 g9 b9 r10 g10
por m2 , m3 ; b10 r11 g11 b11 r12 g12 b12 r13 g13 b13 r14 g14 b14 r15 g15 b15
mova [ imageq ], m0
mova [ imageq + 16 ], m1
mova [ imageq + 32 ], m2
% endif ; mmsize = 16
% else ; PACK RGB15/16/32
packuswb m0 , m1
packuswb m3 , m5
@ -196,10 +290,10 @@ cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
% endif
mova m5 , m_blue
mova m6 , m_red
punpckhbw m5 , m_green
punpckhbw m5 , m_green
punpcklbw m_blue , m_green
punpckhbw m6 , m_alpha
punpcklbw m_red , m_alpha
punpckhbw m6 , m_alpha
punpcklbw m_red , m_alpha
mova m_green , m_blue
mova m_alpha , m5
punpcklwd m_blue , m_red
@ -207,14 +301,23 @@ cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
punpcklwd m5 , m6
punpckhwd m_alpha , m6
mova [ imageq + 0 ], m_blue
mova [ imageq + 8 * time_num ], m_green
mova [ imageq + 8 * time_num ], m_green
mova [ imageq + 16 * time_num ], m5
mova [ imageq + 24 * time_num ], m_alpha
% else ; PACK RGB15/16
% define depth 2
% define blue_dither [pointer_c_ditherq + 2 * 8]
% define green_dither [pointer_c_ditherq + 1 * 8]
% define red_dither [pointer_c_ditherq + 0 * 8]
% if cpuflag(ssse3)
% define red_dither m3
% define green_dither m4
% define blue_dither m5
VBROADCASTSD red_dither , [ pointer_c_ditherq + 0 * 8 ]
VBROADCASTSD green_dither , [ pointer_c_ditherq + 1 * 8 ]
VBROADCASTSD bl ue_dither , [ pointer_c_ditherq + 2 * 8 ]
% else ; cpuflag(mmx/mmxext)
% define blue_dither [pointer_c_ditherq + 2 * 8]
% define green_dither [pointer_c_ditherq + 1 * 8]
% define red_dither [pointer_c_ditherq + 0 * 8]
% endif
% if %3 == 15
% define gmask pb_03
% define isRGB15 1
@ -268,3 +371,13 @@ yuv2rgb_fn yuv, rgb, 16
INIT_MMX mmxext
yuv2rgb_fn yuv , rgb , 24
yuv2rgb_fn yuv , bgr , 24
INIT_XMM ss se3
yuv2rgb_fn yuv , rgb , 24
yuv2rgb_fn yuv , bgr , 24
yuv2rgb_fn yuv , rgb , 32
yuv2rgb_fn yuv , bgr , 32
yuv2rgb_fn yuva , rgb , 32
yuv2rgb_fn yuva , bgr , 32
yuv2rgb_fn yuv , rgb , 15
yuv2rgb_fn yuv , rgb , 16