@@ -35,6 +35,20 @@ pb_shuffle2013: db 2, 0, 1, 3, 6, 4, 5, 7, 10, 8, 9, 11, 14, 12, 13, 15
pb_shuffle2130: db 2, 1, 3, 0, 6, 5, 7, 4, 10, 9, 11, 8, 14, 13, 15, 12
pb_shuffle1203: db 1, 2, 0, 3, 5, 6, 4, 7, 9, 10, 8, 11, 13, 14, 12, 15
%if HAVE_AVX512ICL_EXTERNAL
; shuffle vector to rearrange packuswb result to be linear
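; (packuswb on ZMM registers packs within each 128-bit lane, so the packed bytes
;  need a cross-lane permute afterwards to end up in linear order)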
shuf_packus: db  0,  1,  2,  3, 16, 17, 18, 19, 32, 33, 34, 35, 48, 49, 50, 51, \
                 4,  5,  6,  7, 20, 21, 22, 23, 36, 37, 38, 39, 52, 53, 54, 55, \
                 8,  9, 10, 11, 24, 25, 26, 27, 40, 41, 42, 43, 56, 57, 58, 59, \
                12, 13, 14, 15, 28, 29, 30, 31, 44, 45, 46, 47, 60, 61, 62, 63
; shuffle vector to combine odd elements from two vectors to extract Y
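; (vpermi2b indexes the 128-byte concatenation of its two sources: indices 0-63
;  pick from the first source, 64-127 from the second, so the odd indices below
;  gather every Y byte of two UYVY vectors)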
shuf_perm2b: db  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, \
                33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, \
                65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95, \
                97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127
%endif
SECTION .text
%macro RSHIFT_COPY 5
@@ -156,9 +170,20 @@ SHUFFLE_BYTES 1, 2, 0, 3
; int lumStride, int chromStride, int srcStride)
;-----------------------------------------------------------------------------------------------
%macro UYVY_TO_YUV422 0
%if mmsize == 64
; need two more registers to store shuffle vectors for AVX512ICL
cglobal uyvytoyuv422, 9, 14, 10, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
%else
cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
%endif
    pxor m0, m0
%if mmsize == 64
    vpternlogd m1, m1, m1, 0xff ; m1 = _mm512_set1_epi8(0xff)
    movu m8, [shuf_packus]
    movu m9, [shuf_perm2b]
%else
    pcmpeqw m1, m1
%endif
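    ; the shift below turns the all-ones m1 into 0x00ff in every word, an
    ; every-other-byte mask used to separate luma from chroma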
    psrlw m1, 8
    movsxdifnidn wq, wd
@@ -188,6 +213,63 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
    and xq, mmsize * 2 - 1
    je .loop_simd
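    ; x = leftover luma pixels that do not fill a whole 2*mmsize block; they are
    ; handled here before the main SIMD loop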
%if mmsize == 64
    shr xq, 1
    mov tmpq, -1
    shlx tmpq, tmpq, xq
    not tmpq
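    ; tmpq = (1 << xq) - 1: one bit per remaining 2-pixel UYVY group (x was halved
    ; above), sliced below into byte/word/dword masks for the chroma stores,
    ; luma stores and source loads respectively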
    kmovq k7, tmpq ; write mask for U/V
    kmovd k1, tmpd ; write mask for 1st half of Y
    kmovw k3, tmpd ; read mask for 1st vector
    shr tmpq, 16
    kmovw k4, tmpd ; read mask for 2nd vector
    shr tmpq, 16
    kmovd k2, tmpd ; write mask for 2nd half of Y
    kmovw k5, tmpd ; read mask for 3rd vector
    shr tmpd, 16
    kmovw k6, tmpd ; read mask for 4th vector
    vmovdqu32 m2{k3}{z}, [srcq + wtwoq]
    vmovdqu32 m3{k4}{z}, [srcq + wtwoq + mmsize]
    vmovdqu32 m4{k5}{z}, [srcq + wtwoq + mmsize * 2]
    vmovdqu32 m5{k6}{z}, [srcq + wtwoq + mmsize * 3]
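    ; each masked 64-byte load covers up to 16 UYVY dwords (32 Y, 16 U, 16 V samples)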
    ; extract y part 1
    mova m6, m9
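    ; (vpermi2b overwrites its index operand, so the permute runs on a copy of m9)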
    vpermi2b m6, m2, m3 ; UYVY UYVY -> YYYY using permute
    vmovdqu16 [ydstq + wq]{k1}, m6
    ; extract y part 2
    mova m7, m9
    vpermi2b m7, m4, m5 ; UYVY UYVY -> YYYY using permute
    vmovdqu16 [ydstq + wq + mmsize]{k2}, m7
    ; extract uv
    pand m2, m1 ; UxVx...
    pand m3, m1 ; UxVx...
    pand m4, m1 ; UxVx...
    pand m5, m1 ; UxVx...
    packuswb m2, m3 ; UVUV...
    packuswb m4, m5 ; UVUV...
    ; U
    pand m6, m2, m1 ; UxUx...
    pand m7, m4, m1 ; UxUx...
    packuswb m6, m7 ; UUUU
    vpermb m6, m8, m6
    vmovdqu8 [udstq + whalfq]{k7}, m6
    ; V
    psrlw m2, 8 ; VxVx...
    psrlw m4, 8 ; VxVx...
    packuswb m2, m4 ; VVVV
    vpermb m2, m8, m2
    vmovdqu8 [vdstq + whalfq]{k7}, m2
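    ; step the offsets past the tail just written: 2*x luma bytes, 4*x source bytes, x chroma bytes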
    lea wq, [wq + 2 * xq]
    lea wtwoq, [wtwoq + 4 * xq]
    add whalfq, xq
%else
.loop_scalar:
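    ; scalar fallback: one UYVY group (2 Y, 1 U, 1 V) per iteration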
    mov tmpb, [srcq + wtwoq + 0]
    mov [udstq + whalfq], tmpb
@@ -206,6 +288,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
    add whalfq, 1
    sub xq, 2
    jg .loop_scalar
%endif
    ; check if simd loop is needed
    cmp wq, 0
@@ -228,6 +311,17 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
    movu m5, [srcq + wtwoq + mmsize * 3]
%endif
%if mmsize == 64
    ; extract y part 1
    mova m6, m9
    vpermi2b m6, m2, m3 ; UYVY UYVY -> YYYY using permute
    movu [ydstq + wq], m6
    ; extract y part 2
    mova m7, m9
    vpermi2b m7, m4, m5 ; UYVY UYVY -> YYYY using permute
    movu [ydstq + wq + mmsize], m7
%else
    ; extract y part 1
    RSHIFT_COPY m6, m2, m4, 1, 0x20 ; UYVY UYVY -> YVYU YVY...
    pand m6, m1 ; YxYx YxYx...
@@ -247,6 +341,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
    packuswb m6, m7 ; YYYY YYYY...
    movu [ydstq + wq + mmsize], m6
%endif
    ; extract uv
    pand m2, m1 ; UxVx...
@@ -262,6 +357,9 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
    pand m7, m4, m1 ; UxUx...
    packuswb m6, m7 ; UUUU
%if mmsize == 64
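    ; put the per-lane packed U bytes back into linear order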
    vpermb m6, m8, m6
%endif
    movu [udstq + whalfq], m6
@@ -269,6 +367,9 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
    psrlw m2, 8 ; VxVx...
    psrlw m4, 8 ; VxVx...
    packuswb m2, m4 ; VVVV
%if mmsize == 64
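    ; likewise put the packed V bytes back into linear order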
    vpermb m2, m8, m2
%endif
    movu [vdstq + whalfq], m2
    add whalfq, mmsize
@@ -303,4 +404,8 @@ UYVY_TO_YUV422
INIT_YMM avx2
UYVY_TO_YUV422
%endif
%if HAVE_AVX512ICL_EXTERNAL
INIT_ZMM avx512icl
UYVY_TO_YUV422
%endif
%endif