|
|
|
@ -303,24 +303,30 @@ cglobal dequant_subband_32, 7, 7, 4, src, dst, stride, qf, qs, tot_v, tot_h |
|
|
|
|
|
|
|
|
|
RET |
|
|
|
|
|
|
|
|
|
%if ARCH_X86_64 == 1 |
|
|
|
|
INIT_XMM sse4 |
|
|
|
|
; NOTE(review): this span reads like a diff whose +/- markers were lost: it
; contains three cglobal declarations for the same symbol, two sets of row
; bookkeeping (r6/r7/r8 versus t1/t2), two forms of the loads and of both
; loop tails, and a "@ -329,16 +335,13 @" hunk header in the middle.
; Confirm which lines belong to the intended revision before assembling.
; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height) |
|
|
|
|
; Declaration variant A: all six arguments named; the code below that uses
; r6, r7 and r8 pairs with this one.
; (presumably the x86inc cglobal order: name, args, GPRs, vector regs -- verify)
cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src, src_stride, w, h |
|
|
|
|
; Variant A: keep copies of src, dst and w; they are reloaded at .loop_h.
mov r6, srcq |
|
|
|
|
mov r7, dstq |
|
|
|
|
mov r8, wq |
|
|
|
|
; Declaration variants B: chosen by ARCH_X86_64; both add the names t1 and t2.
%if ARCH_X86_64 |
|
|
|
|
cglobal put_signed_rect_clamped_10, 6, 8, 5, dst, dst_stride, src, src_stride, w, h, t1, t2 |
|
|
|
|
%else |
|
|
|
|
; Only five named arguments here; h is not among them and hd is aliased below.
cglobal put_signed_rect_clamped_10, 5, 7, 5, dst, dst_stride, src, src_stride, w, t1, t2 |
|
|
|
|
; hd becomes an alias for r5mp (presumably the sixth argument's memory
; slot -- confirm against x86inc.asm).
%define hd r5mp |
|
|
|
|
%endif |
|
|
|
|
; w *= 4 (presumably bytes per source sample, given the dword adds below -- confirm).
shl wd, 2 |
|
|
|
|
; Advance src by the scaled width, then negate w, so that [srcq+wq] addresses
; the start of the row and w can count upward until it stops being negative.
add srcq, wq |
|
|
|
|
neg wq |
|
|
|
|
; t2 = dst at row start, t1 = negated scaled width; both are reloaded at .loop_h.
mov t2q, dstq |
|
|
|
|
mov t1q, wq |
|
|
|
|
; m2 = 0. m3 and m4 are loaded from constants defined outside this excerpt.
; m4 is added to every dword lane in .loop_w; no use of m2 or m3 is visible
; here (presumably they are the clamp bounds used in the elided lines).
pxor m2, m2 |
|
|
|
|
mova m3, [clip_10bit] |
|
|
|
|
mova m4, [convert_to_unsigned_10bit] |
|
|
|
|
|
|
|
|
|
; Outer loop: one pass per row.
.loop_h: |
|
|
|
|
; Variant A row setup: restore src, dst and w from r6, r7 and r8.
mov srcq, r6 |
|
|
|
|
mov dstq, r7 |
|
|
|
|
mov wq, r8 |
|
|
|
|
; Variant B row setup: restore dst and the negative offset from t2 and t1.
mov dstq, t2q |
|
|
|
|
mov wq, t1q |
|
|
|
|
|
|
|
|
|
; Inner loop: each pass loads 2*mmsize bytes of source into m0 and m1.
.loop_w: |
|
|
|
|
; Variant A loads: src is itself advanced every iteration.
movu m0, [srcq+0*mmsize] |
|
|
|
|
movu m1, [srcq+1*mmsize] |
|
|
|
|
; Variant B loads: src stays fixed and the negative offset w walks the row.
movu m0, [srcq+wq+0*mmsize] |
|
|
|
|
movu m1, [srcq+wq+1*mmsize] |
|
|
|
|
|
|
|
|
|
; Add the m4 constant to each 32-bit lane.
paddd m0, m4 |
|
|
|
|
paddd m1, m4 |
|
|
|
; NOTE(review): the line below is a diff hunk header, not assembly; the lines
; between the two hunks are not shown. Only m0 is stored afterwards, so the
; elided lines presumably clamp and merge m0 and m1 -- confirm in the full file.
@ -329,16 +335,13 @@ cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src, src_stride, w |
|
|
|
|
|
|
|
|
|
; Store one vector register (mmsize bytes) to dst.
movu [dstq], m0 |
|
|
|
|
|
|
|
|
|
; Variant A source advance; pairs with the [srcq+N*mmsize] loads above.
add srcq, 2*mmsize |
|
|
|
|
add dstq, 1*mmsize |
|
|
|
|
; Variant A loop tail: w counts down by 8 per pass, looping while positive.
sub wd, 8 |
|
|
|
|
jg .loop_w |
|
|
|
|
; Variant B loop tail: the negative offset rises by 2*mmsize per pass, looping
; while it is still negative.
add wq, 2*mmsize |
|
|
|
|
jl .loop_w |
|
|
|
|
|
|
|
|
|
; Variant A row step: move the saved row pointers by their strides.
add r6, src_strideq |
|
|
|
|
add r7, dst_strideq |
|
|
|
|
; Variant B row step: srcq was never advanced inside the row, so adding the
; stride moves it one row on; t2 tracks the dst row start.
add srcq, src_strideq |
|
|
|
|
add t2q, dst_strideq |
|
|
|
|
; One row finished; repeat while the remaining height is positive.
sub hd, 1 |
|
|
|
|
jg .loop_h |
|
|
|
|
|
|
|
|
|
RET |
|
|
|
|
|
|
|
|
|
%endif |
|
|
|
|