diff --git a/libavutil/mips/generic_macros_msa.h b/libavutil/mips/generic_macros_msa.h index 48dc78e1b0..fbe7abfbb2 100644 --- a/libavutil/mips/generic_macros_msa.h +++ b/libavutil/mips/generic_macros_msa.h @@ -24,1403 +24,1391 @@ #include #include -#define LOAD_UB(psrc) \ -( { \ - v16u8 out_m; \ - out_m = *((v16u8 *) (psrc)); \ - out_m; \ -} ) - -#define LOAD_SB(psrc) \ -( { \ - v16i8 out_m; \ - out_m = *((v16i8 *) (psrc)); \ - out_m; \ -} ) +#define LD_B(RTYPE, psrc) *((RTYPE *)(psrc)) +#define LD_UB(...) LD_B(v16u8, __VA_ARGS__) +#define LD_SB(...) LD_B(v16i8, __VA_ARGS__) -#define LOAD_UH(psrc) *((const v8u16 *)(psrc)) +#define LD_H(RTYPE, psrc) *((RTYPE *)(psrc)) +#define LD_SH(...) LD_H(v8i16, __VA_ARGS__) -#define LOAD_SH(psrc) \ -( { \ - v8i16 out_m; \ - out_m = *((v8i16 *) (psrc)); \ - out_m; \ -} ) +#define LD_W(RTYPE, psrc) *((RTYPE *)(psrc)) +#define LD_SW(...) LD_W(v4i32, __VA_ARGS__) -#define LOAD_SW(psrc) *((const v4i32 *)(psrc)) +#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) -#define STORE_UB(vec, pdest) *((v16u8 *)(pdest)) = (vec) -#define STORE_SB(vec, pdest) *((v16i8 *)(pdest)) = (vec) +#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_SH(...) ST_H(v8i16, __VA_ARGS__) -#define STORE_SH(vec, pdest) \ -{ \ - *((v8i16 *) (pdest)) = (vec); \ -} - -#define STORE_SW(vec, pdest) \ -{ \ - *((v4i32 *) (pdest)) = (vec); \ -} +#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_SW(...) ST_W(v4i32, __VA_ARGS__) #if (__mips_isa_rev >= 6) - #define LOAD_WORD(psrc) \ - ( { \ - uint8_t *src_m = (uint8_t *) (psrc); \ - uint32_t val_m; \ - \ - __asm__ volatile ( \ - "lw %[val_m], %[src_m] \n\t" \ - \ - : [val_m] "=r" (val_m) \ - : [src_m] "m" (*src_m) \ - ); \ - \ - val_m; \ + #define LW(psrc) \ + ( { \ + uint8_t *psrc_m = (uint8_t *) (psrc); \ + uint32_t val_m; \ + \ + __asm__ volatile ( \ + "lw %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r" (val_m) \ + : [psrc_m] "m" (*psrc_m) \ + ); \ + \ + val_m; \ } ) #if (__mips == 64) - #define LOAD_DWORD(psrc) \ - ( { \ - uint8_t *src_m = (uint8_t *) (psrc); \ - uint64_t val_m = 0; \ - \ - __asm__ volatile ( \ - "ld %[val_m], %[src_m] \n\t" \ - \ - : [val_m] "=r" (val_m) \ - : [src_m] "m" (*src_m) \ - ); \ - \ - val_m; \ + #define LD(psrc) \ + ( { \ + uint8_t *psrc_m = (uint8_t *) (psrc); \ + uint64_t val_m = 0; \ + \ + __asm__ volatile ( \ + "ld %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r" (val_m) \ + : [psrc_m] "m" (*psrc_m) \ + ); \ + \ + val_m; \ } ) - #else - #define LOAD_DWORD(psrc) \ - ( { \ - uint8_t *src1_m = (uint8_t *) (psrc); \ - uint8_t *src2_m = ((uint8_t *) (psrc)) + 4; \ - uint32_t val0_m, val1_m; \ - uint64_t genval_m = 0; \ - \ - __asm__ volatile ( \ - "lw %[val0_m], %[src1_m] \n\t" \ - \ - : [val0_m] "=r" (val0_m) \ - : [src1_m] "m" (*src1_m) \ - ); \ - \ - __asm__ volatile ( \ - "lw %[val1_m], %[src2_m] \n\t" \ - \ - : [val1_m] "=r" (val1_m) \ - : [src2_m] "m" (*src2_m) \ - ); \ - \ - genval_m = (uint64_t) (val1_m); \ - genval_m = (uint64_t) ((genval_m << 32) & 0xFFFFFFFF00000000); \ - genval_m = (uint64_t) (genval_m | (uint64_t) val0_m); \ - \ - genval_m; \ + #else // !(__mips == 64) + #define LD(psrc) \ + ( { \ + uint8_t *psrc_m = (uint8_t *) (psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m = 0; \ + \ + val0_m = LW(psrc_m); \ + val1_m = LW(psrc_m + 4); \ + \ + val_m = (uint64_t) (val1_m); \ + val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000); \ + val_m = (uint64_t) (val_m | (uint64_t) val0_m); \ + \ + val_m; \ } ) - 
#endif - - #define STORE_WORD(pdst, val) \ - { \ - uint8_t *dst_ptr_m = (uint8_t *) (pdst); \ - uint32_t val_m = (val); \ - \ - __asm__ volatile ( \ - "sw %[val_m], %[dst_ptr_m] \n\t" \ - \ - : [dst_ptr_m] "=m" (*dst_ptr_m) \ - : [val_m] "r" (val_m) \ - ); \ + #endif // (__mips == 64) + + #define SH(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *) (pdst); \ + uint16_t val_m = (val); \ + \ + __asm__ volatile ( \ + "sh %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m" (*pdst_m) \ + : [val_m] "r" (val_m) \ + ); \ } - #define STORE_DWORD(pdst, val) \ - { \ - uint8_t *dst_ptr_m = (uint8_t *) (pdst); \ - uint64_t val_m = (val); \ - \ - __asm__ volatile ( \ - "sd %[val_m], %[dst_ptr_m] \n\t" \ - \ - : [dst_ptr_m] "=m" (*dst_ptr_m) \ - : [val_m] "r" (val_m) \ - ); \ - } - #define STORE_HWORD(pdst, val) \ - { \ - uint8_t *dst_ptr_m = (uint8_t *) (pdst); \ - uint16_t val_m = (val); \ - \ - __asm__ volatile ( \ - "sh %[val_m], %[dst_ptr_m] \n\t" \ - \ - : [dst_ptr_m] "=m" (*dst_ptr_m) \ - : [val_m] "r" (val_m) \ - ); \ + #define SW(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *) (pdst); \ + uint32_t val_m = (val); \ + \ + __asm__ volatile ( \ + "sw %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m" (*pdst_m) \ + : [val_m] "r" (val_m) \ + ); \ } -#else - #define LOAD_WORD(psrc) \ - ( { \ - uint8_t *src_m = (uint8_t *) (psrc); \ - uint32_t val_m; \ - \ - __asm__ volatile ( \ - "ulw %[val_m], %[src_m] \n\t" \ - \ - : [val_m] "=r" (val_m) \ - : [src_m] "m" (*src_m) \ - ); \ - \ - val_m; \ + #define SD(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *) (pdst); \ + uint64_t val_m = (val); \ + \ + __asm__ volatile ( \ + "sd %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m" (*pdst_m) \ + : [val_m] "r" (val_m) \ + ); \ + } +#else // !(__mips_isa_rev >= 6) + #define LW(psrc) \ + ( { \ + uint8_t *psrc_m = (uint8_t *) (psrc); \ + uint32_t val_m; \ + \ + __asm__ volatile ( \ + "ulw %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r" (val_m) \ + : [psrc_m] "m" (*psrc_m) \ + ); \ + \ + val_m; \ } ) #if (__mips == 64) - #define LOAD_DWORD(psrc) \ - ( { \ - uint8_t *src_m = (uint8_t *) (psrc); \ - uint64_t val_m = 0; \ - \ - __asm__ volatile ( \ - "uld %[val_m], %[src_m] \n\t" \ - \ - : [val_m] "=r" (val_m) \ - : [src_m] "m" (*src_m) \ - ); \ - \ - val_m; \ + #define LD(psrc) \ + ( { \ + uint8_t *psrc_m = (uint8_t *) (psrc); \ + uint64_t val_m = 0; \ + \ + __asm__ volatile ( \ + "uld %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r" (val_m) \ + : [psrc_m] "m" (*psrc_m) \ + ); \ + \ + val_m; \ } ) - #else - #define LOAD_DWORD(psrc) \ - ( { \ - uint8_t *src1_m = (uint8_t *) (psrc); \ - uint8_t *src2_m = ((uint8_t *) (psrc)) + 4; \ - uint32_t val0_m, val1_m; \ - uint64_t genval_m = 0; \ - \ - __asm__ volatile ( \ - "ulw %[val0_m], %[src1_m] \n\t" \ - \ - : [val0_m] "=r" (val0_m) \ - : [src1_m] "m" (*src1_m) \ - ); \ - \ - __asm__ volatile ( \ - "ulw %[val1_m], %[src2_m] \n\t" \ - \ - : [val1_m] "=r" (val1_m) \ - : [src2_m] "m" (*src2_m) \ - ); \ - \ - genval_m = (uint64_t) (val1_m); \ - genval_m = (uint64_t) ((genval_m << 32) & 0xFFFFFFFF00000000); \ - genval_m = (uint64_t) (genval_m | (uint64_t) val0_m); \ - \ - genval_m; \ + #else // !(__mips == 64) + #define LD(psrc) \ + ( { \ + uint8_t *psrc_m1 = (uint8_t *) (psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m = 0; \ + \ + val0_m = LW(psrc_m1); \ + val1_m = LW(psrc_m1 + 4); \ + \ + val_m = (uint64_t) (val1_m); \ + val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000); \ + val_m = (uint64_t) (val_m | (uint64_t) val0_m); \ + \ + val_m; \ } ) - #endif - - #define 
STORE_WORD(pdst, val) \ - { \ - uint8_t *dst_ptr_m = (uint8_t *) (pdst); \ - uint32_t val_m = (val); \ - \ - __asm__ volatile ( \ - "usw %[val_m], %[dst_ptr_m] \n\t" \ - \ - : [dst_ptr_m] "=m" (*dst_ptr_m) \ - : [val_m] "r" (val_m) \ - ); \ + #endif // (__mips == 64) + + #define SH(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *) (pdst); \ + uint16_t val_m = (val); \ + \ + __asm__ volatile ( \ + "ush %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m" (*pdst_m) \ + : [val_m] "r" (val_m) \ + ); \ + } + + #define SW(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *) (pdst); \ + uint32_t val_m = (val); \ + \ + __asm__ volatile ( \ + "usw %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m" (*pdst_m) \ + : [val_m] "r" (val_m) \ + ); \ } - #define STORE_DWORD(pdst, val) \ + #define SD(val, pdst) \ { \ - uint8_t *dst1_m = (uint8_t *) (pdst); \ - uint8_t *dst2_m = ((uint8_t *) (pdst)) + 4; \ + uint8_t *pdst_m1 = (uint8_t *) (pdst); \ uint32_t val0_m, val1_m; \ \ val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \ val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \ \ - __asm__ volatile ( \ - "usw %[val0_m], %[dst1_m] \n\t" \ - "usw %[val1_m], %[dst2_m] \n\t" \ - \ - : [dst1_m] "=m" (*dst1_m), [dst2_m] "=m" (*dst2_m) \ - : [val0_m] "r" (val0_m), [val1_m] "r" (val1_m) \ - ); \ - } - - #define STORE_HWORD(pdst, val) \ - { \ - uint8_t *dst_ptr_m = (uint8_t *) (pdst); \ - uint16_t val_m = (val); \ - \ - __asm__ volatile ( \ - "ush %[val_m], %[dst_ptr_m] \n\t" \ - \ - : [dst_ptr_m] "=m" (*dst_ptr_m) \ - : [val_m] "r" (val_m) \ - ); \ + SW(val0_m, pdst_m1); \ + SW(val1_m, pdst_m1 + 4); \ } - -#endif - -#define LOAD_4WORDS_WITH_STRIDE(psrc, src_stride, \ - src0, src1, src2, src3) \ -{ \ - src0 = LOAD_WORD(psrc + 0 * src_stride); \ - src1 = LOAD_WORD(psrc + 1 * src_stride); \ - src2 = LOAD_WORD(psrc + 2 * src_stride); \ - src3 = LOAD_WORD(psrc + 3 * src_stride); \ -} - -#define LOAD_2VECS_UB(psrc, stride, \ - val0, val1) \ -{ \ - val0 = LOAD_UB(psrc + 0 * stride); \ - val1 = LOAD_UB(psrc + 1 * stride); \ -} - -#define LOAD_2VECS_SB(psrc, stride, \ - val0, val1) \ -{ \ - val0 = LOAD_SB(psrc + 0 * stride); \ - val1 = LOAD_SB(psrc + 1 * stride); \ -} - -#define LOAD_3VECS_UB(psrc, stride, \ - val0, val1, val2) \ -{ \ - val0 = LOAD_UB(psrc + 0 * stride); \ - val1 = LOAD_UB(psrc + 1 * stride); \ - val2 = LOAD_UB(psrc + 2 * stride); \ -} - -#define LOAD_3VECS_SB(psrc, stride, \ - val0, val1, val2) \ -{ \ - val0 = LOAD_SB(psrc + 0 * stride); \ - val1 = LOAD_SB(psrc + 1 * stride); \ - val2 = LOAD_SB(psrc + 2 * stride); \ -} - -#define LOAD_4VECS_UB(psrc, stride, \ - val0, val1, val2, val3) \ -{ \ - val0 = LOAD_UB(psrc + 0 * stride); \ - val1 = LOAD_UB(psrc + 1 * stride); \ - val2 = LOAD_UB(psrc + 2 * stride); \ - val3 = LOAD_UB(psrc + 3 * stride); \ -} - -#define LOAD_4VECS_SB(psrc, stride, \ - val0, val1, val2, val3) \ -{ \ - val0 = LOAD_SB(psrc + 0 * stride); \ - val1 = LOAD_SB(psrc + 1 * stride); \ - val2 = LOAD_SB(psrc + 2 * stride); \ - val3 = LOAD_SB(psrc + 3 * stride); \ -} - -#define LOAD_5VECS_UB(psrc, stride, \ - out0, out1, out2, out3, out4) \ -{ \ - LOAD_4VECS_UB((psrc), (stride), \ - (out0), (out1), (out2), (out3)); \ - out4 = LOAD_UB(psrc + 4 * stride); \ -} - -#define LOAD_5VECS_SB(psrc, stride, \ - out0, out1, out2, out3, out4) \ -{ \ - LOAD_4VECS_SB((psrc), (stride), \ - (out0), (out1), (out2), (out3)); \ - out4 = LOAD_SB(psrc + 4 * stride); \ -} - -#define LOAD_6VECS_SB(psrc, stride, \ - out0, out1, out2, out3, out4, out5) \ -{ \ - LOAD_4VECS_SB((psrc), (stride), \ - (out0), (out1), (out2), 
(out3)); \ - LOAD_2VECS_SB((psrc + 4 * stride), (stride), \ - (out4), (out5)); \ -} - -#define LOAD_7VECS_UB(psrc, stride, \ - val0, val1, val2, val3, \ - val4, val5, val6) \ +#endif // (__mips_isa_rev >= 6) + +/* Description : Load 4 words with stride + Arguments : Inputs - psrc (source pointer to load from) + - stride + Outputs - out0, out1, out2, out3 + Details : Loads word in 'out0' from (psrc) + Loads word in 'out1' from (psrc + stride) + Loads word in 'out2' from (psrc + 2 * stride) + Loads word in 'out3' from (psrc + 3 * stride) +*/ +#define LW4(psrc, stride, out0, out1, out2, out3) \ +{ \ + out0 = LW((psrc)); \ + out1 = LW((psrc) + stride); \ + out2 = LW((psrc) + 2 * stride); \ + out3 = LW((psrc) + 3 * stride); \ +} + +/* Description : Store 4 words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Stores word from 'in0' to (pdst) + Stores word from 'in1' to (pdst + stride) + Stores word from 'in2' to (pdst + 2 * stride) + Stores word from 'in3' to (pdst + 3 * stride) +*/ +#define SW4(in0, in1, in2, in3, pdst, stride) \ { \ - val0 = LOAD_UB((psrc) + 0 * (stride)); \ - val1 = LOAD_UB((psrc) + 1 * (stride)); \ - val2 = LOAD_UB((psrc) + 2 * (stride)); \ - val3 = LOAD_UB((psrc) + 3 * (stride)); \ - val4 = LOAD_UB((psrc) + 4 * (stride)); \ - val5 = LOAD_UB((psrc) + 5 * (stride)); \ - val6 = LOAD_UB((psrc) + 6 * (stride)); \ -} - -#define LOAD_7VECS_SB(psrc, stride, \ - val0, val1, val2, val3, \ - val4, val5, val6) \ + SW(in0, (pdst)) \ + SW(in1, (pdst) + stride); \ + SW(in2, (pdst) + 2 * stride); \ + SW(in3, (pdst) + 3 * stride); \ +} + +/* Description : Store 4 double words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Stores double word from 'in0' to (pdst) + Stores double word from 'in1' to (pdst + stride) + Stores double word from 'in2' to (pdst + 2 * stride) + Stores double word from 'in3' to (pdst + 3 * stride) +*/ +#define SD4(in0, in1, in2, in3, pdst, stride) \ { \ - val0 = LOAD_SB((psrc) + 0 * (stride)); \ - val1 = LOAD_SB((psrc) + 1 * (stride)); \ - val2 = LOAD_SB((psrc) + 2 * (stride)); \ - val3 = LOAD_SB((psrc) + 3 * (stride)); \ - val4 = LOAD_SB((psrc) + 4 * (stride)); \ - val5 = LOAD_SB((psrc) + 5 * (stride)); \ - val6 = LOAD_SB((psrc) + 6 * (stride)); \ -} - -#define LOAD_8VECS_UB(psrc, stride, \ - out0, out1, out2, out3, \ - out4, out5, out6, out7) \ -{ \ - LOAD_4VECS_UB((psrc), (stride), \ - (out0), (out1), (out2), (out3)); \ - LOAD_4VECS_UB((psrc + 4 * stride), (stride), \ - (out4), (out5), (out6), (out7)); \ -} - -#define LOAD_8VECS_SB(psrc, stride, \ - out0, out1, out2, out3, \ - out4, out5, out6, out7) \ -{ \ - LOAD_4VECS_SB((psrc), (stride), \ - (out0), (out1), (out2), (out3)); \ - LOAD_4VECS_SB((psrc + 4 * stride), (stride), \ - (out4), (out5), (out6), (out7)); \ -} - -#define LOAD_2VECS_UH(psrc, stride, \ - val0, val1) \ -{ \ - val0 = LOAD_UH((psrc) + 0 * (stride)); \ - val1 = LOAD_UH((psrc) + 1 * (stride)); \ -} - -#define LOAD_2VECS_SH(psrc, stride, \ - val0, val1) \ -{ \ - val0 = LOAD_SH((psrc) + 0 * (stride)); \ - val1 = LOAD_SH((psrc) + 1 * (stride)); \ + SD(in0, (pdst)) \ + SD(in1, (pdst) + stride); \ + SD(in2, (pdst) + 2 * stride); \ + SD(in3, (pdst) + 3 * stride); \ +} + +/* Description : Load vectors with 16 byte elements with stride + Arguments : Inputs - psrc (source pointer to load from) + - stride + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Loads 16 byte elements in 'out0' from (psrc) + Loads 16 byte elements in 'out1' from (psrc + stride) +*/ +#define LD_B2(RTYPE, 
psrc, stride, out0, out1) \ +{ \ + out0 = LD_B(RTYPE, (psrc)); \ + out1 = LD_B(RTYPE, (psrc) + stride); \ +} +#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) + +#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \ +{ \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + out2 = LD_B(RTYPE, (psrc) + 2 * stride); \ +} +#define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__) + +#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ +{ \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \ } +#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) +#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) -#define LOAD_4VECS_UH(psrc, stride, \ - val0, val1, val2, val3) \ -{ \ - LOAD_2VECS_UH((psrc), (stride), val0, val1); \ - LOAD_2VECS_UH((psrc + 2 * stride), (stride), val2, val3); \ +#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ +{ \ + LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ } +#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) +#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__) -#define LOAD_4VECS_SH(psrc, stride, \ - val0, val1, val2, val3) \ -{ \ - LOAD_2VECS_SH((psrc), (stride), val0, val1); \ - LOAD_2VECS_SH((psrc + 2 * stride), (stride), val2, val3); \ +#define LD_B6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5) \ +{ \ + LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_B2(RTYPE, (psrc) + 4 * stride, stride, out4, out5); \ } +#define LD_SB6(...) LD_B6(v16i8, __VA_ARGS__) -#define LOAD_6VECS_SH(psrc, stride, \ - val0, val1, val2, val3, val4, val5) \ -{ \ - LOAD_2VECS_SH((psrc), (stride), val0, val1); \ - LOAD_2VECS_SH((psrc + 2 * stride), (stride), val2, val3); \ - LOAD_2VECS_SH((psrc + 4 * stride), (stride), val4, val5); \ -} - -#define LOAD_8VECS_UH(psrc, stride, \ - val0, val1, val2, val3, \ - val4, val5, val6, val7) \ -{ \ - LOAD_4VECS_UH((psrc), (stride), \ - val0, val1, val2, val3); \ - LOAD_4VECS_UH((psrc + 4 * stride), (stride), \ - val4, val5, val6, val7); \ -} - -#define LOAD_8VECS_SH(psrc, stride, \ - val0, val1, val2, val3, \ - val4, val5, val6, val7) \ -{ \ - LOAD_4VECS_SH((psrc), (stride), \ - val0, val1, val2, val3); \ - LOAD_4VECS_SH((psrc + 4 * stride), (stride), \ - val4, val5, val6, val7); \ -} - -#define LOAD_16VECS_SH(psrc, stride, \ - val0, val1, val2, val3, \ - val4, val5, val6, val7, \ - val8, val9, val10, val11, \ - val12, val13, val14, val15) \ -{ \ - LOAD_8VECS_SH((psrc), (stride), \ - val0, val1, val2, val3, \ - val4, val5, val6, val7); \ - LOAD_8VECS_SH((psrc + 8 * (stride)), (stride), \ - val8, val9, val10, val11, \ - val12, val13, val14, val15); \ -} - -#define STORE_4VECS_UB(dst_out, pitch, \ - in0, in1, in2, in3) \ -{ \ - STORE_UB((in0), (dst_out)); \ - STORE_UB((in1), ((dst_out) + (pitch))); \ - STORE_UB((in2), ((dst_out) + 2 * (pitch))); \ - STORE_UB((in3), ((dst_out) + 3 * (pitch))); \ -} - -#define STORE_4VECS_SB(dst_out, pitch, \ - in0, in1, in2, in3) \ -{ \ - STORE_SB((in0), (dst_out)); \ - STORE_SB((in1), ((dst_out) + (pitch))); \ - STORE_SB((in2), ((dst_out) + 2 * (pitch))); \ - STORE_SB((in3), ((dst_out) + 3 * (pitch))); \ -} - -#define STORE_8VECS_UB(dst_out, pitch_in, \ - in0, in1, in2, in3, \ - in4, in5, in6, in7) \ +#define LD_B7(RTYPE, psrc, stride, \ + out0, out1, out2, out3, out4, out5, out6) \ +{ \ + LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ + LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ +} +#define LD_SB7(...) 
LD_B7(v16i8, __VA_ARGS__) + +#define LD_B8(RTYPE, psrc, stride, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ +{ \ + LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ +} +#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) +#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) + +/* Description : Store vectors of 16 byte elements with stride + Arguments : Inputs - in0, in1, stride + Outputs - pdst (destination pointer to store to) + Details : Stores 16 byte elements from 'in0' to (pdst) + Stores 16 byte elements from 'in1' to (pdst + stride) +*/ +#define ST_B2(RTYPE, in0, in1, pdst, stride) \ +{ \ + ST_B(RTYPE, in0, (pdst)); \ + ST_B(RTYPE, in1, (pdst) + stride); \ +} +#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) +#define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__) + +#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ { \ - STORE_4VECS_UB(dst_out, pitch_in, \ - in0, in1, in2, in3); \ - STORE_4VECS_UB((dst_out + 4 * (pitch_in)), pitch_in, \ - in4, in5, in6, in7); \ -} - -#define STORE_2VECS_SH(ptr, stride, \ - in0, in1) \ -{ \ - STORE_SH(in0, ((ptr) + 0 * stride)); \ - STORE_SH(in1, ((ptr) + 1 * stride)); \ + ST_B2(RTYPE, in0, in1, (pdst), stride); \ + ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ +} +#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) +#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__) + +/* Description : Store vectors of 8 halfword elements with stride + Arguments : Inputs - in0, in1, stride + Outputs - pdst (destination pointer to store to) + Details : Stores 8 halfword elements from 'in0' to (pdst) + Stores 8 halfword elements from 'in1' to (pdst + stride) +*/ +#define ST_H2(RTYPE, in0, in1, pdst, stride) \ +{ \ + ST_H(RTYPE, in0, (pdst)); \ + ST_H(RTYPE, in1, (pdst) + stride); \ +} +#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__) +#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) + +#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \ +{ \ + ST_H2(RTYPE, in0, in1, (pdst), stride); \ + ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ } +#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) -#define STORE_4VECS_SH(ptr, stride, \ - in0, in1, in2, in3) \ -{ \ - STORE_SH(in0, ((ptr) + 0 * stride)); \ - STORE_SH(in1, ((ptr) + 1 * stride)); \ - STORE_SH(in2, ((ptr) + 2 * stride)); \ - STORE_SH(in3, ((ptr) + 3 * stride)); \ +#define ST_H6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride) \ +{ \ + ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ + ST_H2(RTYPE, in4, in5, (pdst) + 4 * stride, stride); \ +} +#define ST_SH6(...) ST_H6(v8i16, __VA_ARGS__) + +#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ +{ \ + ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ + ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ +} +#define ST_SH8(...) 
ST_H8(v8i16, __VA_ARGS__) + +/* Description : Store vectors of word elements with stride + Arguments : Inputs - in0, in1, stride + Outputs - pdst (destination pointer to store to) + Return Type - signed word + Details : Stores 4 word elements from 'in0' to (pdst) + Stores 4 word elements from 'in1' to (pdst + stride) +*/ +#define ST_SW2(in0, in1, pdst, stride) \ +{ \ + ST_SW(in0, (pdst)); \ + ST_SW(in1, (pdst) + stride); \ +} + +/* Description : Store as 2x4 byte block to destination memory from input vector + Arguments : Inputs - in, stidx, pdst, stride + Return Type - unsigned byte + Details : Index stidx halfword element from 'in' vector is copied and + stored on first line + Index stidx+1 halfword element from 'in' vector is copied and + stored on second line + Index stidx+2 halfword element from 'in' vector is copied and + stored on third line + Index stidx+3 halfword element from 'in' vector is copied and + stored on fourth line +*/ +#define ST2x4_UB(in, stidx, pdst, stride) \ +{ \ + uint16_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_2x4_m = (uint8_t *) (pdst); \ + \ + out0_m = __msa_copy_u_h((v8i16) in, (stidx)); \ + out1_m = __msa_copy_u_h((v8i16) in, (stidx + 1)); \ + out2_m = __msa_copy_u_h((v8i16) in, (stidx + 2)); \ + out3_m = __msa_copy_u_h((v8i16) in, (stidx + 3)); \ + \ + SH(out0_m, pblk_2x4_m); \ + SH(out1_m, pblk_2x4_m + stride); \ + SH(out2_m, pblk_2x4_m + 2 * stride); \ + SH(out3_m, pblk_2x4_m + 3 * stride); \ +} + +/* Description : Store as 4x4 byte block to destination memory from input vector + Arguments : Inputs - in0, in1, pdst, stride + Return Type - unsigned byte + Details : Idx0 word element from input vector 'in0' is copied and stored + on first line + Idx1 word element from input vector 'in0' is copied and stored + on second line + Idx2 word element from input vector 'in1' is copied and stored + on third line + Idx3 word element from input vector 'in1' is copied and stored + on fourth line +*/ +#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \ +{ \ + uint32_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_4x4_m = (uint8_t *) (pdst); \ + \ + out0_m = __msa_copy_u_w((v4i32) in0, idx0); \ + out1_m = __msa_copy_u_w((v4i32) in0, idx1); \ + out2_m = __msa_copy_u_w((v4i32) in1, idx2); \ + out3_m = __msa_copy_u_w((v4i32) in1, idx3); \ + \ + SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ } -#define STORE_6VECS_SH(ptr, stride, \ - in0, in1, in2, in3, \ - in4, in5) \ -{ \ - STORE_SH(in0, ((ptr) + 0 * stride)); \ - STORE_SH(in1, ((ptr) + 1 * stride)); \ - STORE_SH(in2, ((ptr) + 2 * stride)); \ - STORE_SH(in3, ((ptr) + 3 * stride)); \ - STORE_SH(in4, ((ptr) + 4 * stride)); \ - STORE_SH(in5, ((ptr) + 5 * stride)); \ +/* Description : Store as 8x2 byte block to destination memory from input vector + Arguments : Inputs - in, pdst, stride + Details : Index 0 double word element from input vector 'in' is copied + and stored to destination memory at (pdst) + Index 1 double word element from input vector 'in' is copied + and stored to destination memory at (pdst + stride) +*/ +#define ST8x2_UB(in, pdst, stride) \ +{ \ + uint64_t out0_m, out1_m; \ + uint8_t *pblk_8x2_m = (uint8_t *) (pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64) in, 0); \ + out1_m = __msa_copy_u_d((v2i64) in, 1); \ + \ + SD(out0_m, pblk_8x2_m); \ + SD(out1_m, pblk_8x2_m + stride); \ +} + +/* Description : Store as 8x4 byte block to destination memory from input + vectors + Arguments : Inputs - in0, in1, pdst, stride + Details : Index 0 double word element from input vector 
'in0' is copied + and stored to destination memory at (pblk_8x4_m) + Index 1 double word element from input vector 'in0' is copied + and stored to destination memory at (pblk_8x4_m + stride) + Index 0 double word element from input vector 'in1' is copied + and stored to destination memory at (pblk_8x4_m + 2 * stride) + Index 1 double word element from input vector 'in1' is copied + and stored to destination memory at (pblk_8x4_m + 3 * stride) +*/ +#define ST8x4_UB(in0, in1, pdst, stride) \ +{ \ + uint64_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_8x4_m = (uint8_t *) (pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64) in0, 0); \ + out1_m = __msa_copy_u_d((v2i64) in0, 1); \ + out2_m = __msa_copy_u_d((v2i64) in1, 0); \ + out3_m = __msa_copy_u_d((v2i64) in1, 1); \ + \ + SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ } - -#define STORE_8VECS_SH(ptr, stride, \ - in0, in1, in2, in3, \ - in4, in5, in6, in7) \ -{ \ - STORE_SH(in0, ((ptr) + 0 * stride)); \ - STORE_SH(in1, ((ptr) + 1 * stride)); \ - STORE_SH(in2, ((ptr) + 2 * stride)); \ - STORE_SH(in3, ((ptr) + 3 * stride)); \ - STORE_SH(in4, ((ptr) + 4 * stride)); \ - STORE_SH(in5, ((ptr) + 5 * stride)); \ - STORE_SH(in6, ((ptr) + 6 * stride)); \ - STORE_SH(in7, ((ptr) + 7 * stride)); \ +#define ST8x8_UB(in0, in1, in2, in3, pdst, stride) \ +{ \ + uint8_t *pblk_8x8_m = (uint8_t *) (pdst); \ + \ + ST8x4_UB(in0, in1, pblk_8x8_m, stride); \ + ST8x4_UB(in2, in3, pblk_8x8_m + 4 * stride, stride); \ +} + +/* Description : Store as 12x8 byte block to destination memory from + input vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride + Details : Index 0 double word element from input vector 'in0' is copied + and stored to destination memory at (pblk_12x8_m) followed by + index 2 word element from same input vector 'in0' at + (pblk_12x8_m + 8) + Similar to remaining lines +*/ +#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ +{ \ + uint64_t out0_m, out1_m, out2_m, out3_m; \ + uint64_t out4_m, out5_m, out6_m, out7_m; \ + uint32_t out8_m, out9_m, out10_m, out11_m; \ + uint32_t out12_m, out13_m, out14_m, out15_m; \ + uint8_t *pblk_12x8_m = (uint8_t *) (pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64) in0, 0); \ + out1_m = __msa_copy_u_d((v2i64) in1, 0); \ + out2_m = __msa_copy_u_d((v2i64) in2, 0); \ + out3_m = __msa_copy_u_d((v2i64) in3, 0); \ + out4_m = __msa_copy_u_d((v2i64) in4, 0); \ + out5_m = __msa_copy_u_d((v2i64) in5, 0); \ + out6_m = __msa_copy_u_d((v2i64) in6, 0); \ + out7_m = __msa_copy_u_d((v2i64) in7, 0); \ + \ + out8_m = __msa_copy_u_w((v4i32) in0, 2); \ + out9_m = __msa_copy_u_w((v4i32) in1, 2); \ + out10_m = __msa_copy_u_w((v4i32) in2, 2); \ + out11_m = __msa_copy_u_w((v4i32) in3, 2); \ + out12_m = __msa_copy_u_w((v4i32) in4, 2); \ + out13_m = __msa_copy_u_w((v4i32) in5, 2); \ + out14_m = __msa_copy_u_w((v4i32) in6, 2); \ + out15_m = __msa_copy_u_w((v4i32) in7, 2); \ + \ + SD(out0_m, pblk_12x8_m); \ + SW(out8_m, pblk_12x8_m + 8); \ + pblk_12x8_m += stride; \ + SD(out1_m, pblk_12x8_m); \ + SW(out9_m, pblk_12x8_m + 8); \ + pblk_12x8_m += stride; \ + SD(out2_m, pblk_12x8_m); \ + SW(out10_m, pblk_12x8_m + 8); \ + pblk_12x8_m += stride; \ + SD(out3_m, pblk_12x8_m); \ + SW(out11_m, pblk_12x8_m + 8); \ + pblk_12x8_m += stride; \ + SD(out4_m, pblk_12x8_m); \ + SW(out12_m, pblk_12x8_m + 8); \ + pblk_12x8_m += stride; \ + SD(out5_m, pblk_12x8_m); \ + SW(out13_m, pblk_12x8_m + 8); \ + pblk_12x8_m += stride; \ + SD(out6_m, pblk_12x8_m); \ + SW(out14_m, pblk_12x8_m + 8); \ + pblk_12x8_m 
+= stride; \ + SD(out7_m, pblk_12x8_m); \ + SW(out15_m, pblk_12x8_m + 8); \ +} + +/* Description : Immediate number of columns to slide with zero + Arguments : Inputs - in0, in1, slide_val + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'zero_m' vector are slide into 'in0' by + number of elements specified by 'slide_val' +*/ +#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \ +{ \ + v16i8 zero_m = { 0 }; \ + out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val); \ + out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val); \ +} +#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__) + +#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, \ + out0, out1, out2, out3, slide_val) \ +{ \ + SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ + SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \ +} +#define SLDI_B4_0_SB(...) SLDI_B4_0(v16i8, __VA_ARGS__) + +/* Description : Shuffle byte vector elements as per mask vector + Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Selective byte elements from in0 & in1 are copied to out0 as + per control vector mask0 + Selective byte elements from in2 & in3 are copied to out1 as + per control vector mask1 +*/ +#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0); \ + out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2); \ } +#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) -#define CLIP_MIN_TO_MAX_H(in, min, max) \ -( { \ - v8i16 out_m; \ - \ - out_m = __msa_max_s_h((v8i16) (min), (v8i16) (in)); \ - out_m = __msa_min_s_h((v8i16) (max), (v8i16) out_m); \ - out_m; \ +#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, \ + out0, out1, out2, out3) \ +{ \ + VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \ + VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ +} +#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__) + +/* Description : Dot product & addition of byte vector elements + Arguments : Inputs - mult0, mult1 + cnst0, cnst1 + Outputs - out0, out1 + Return Type - signed halfword + Details : Signed byte elements from mult0 are multiplied with + signed byte elements from cnst0 producing a result + twice the size of input i.e. signed halfword. + Then this multiplication results of adjacent odd-even elements + are added to the out vector + (2 signed halfword results) +*/ +#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0, \ + (v16i8) mult0, (v16i8) cnst0); \ + out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1, \ + (v16i8) mult1, (v16i8) cnst1); \ +} +#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__) + +#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, \ + cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \ +{ \ + DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ +} +#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__) + +/* Description : Dot product & addition of halfword vector elements + Arguments : Inputs - mult0, mult1 + cnst0, cnst1 + Outputs - out0, out1 + Return Type - signed word + Details : Signed halfword elements from mult0 are multiplied with + signed halfword elements from cnst0 producing a result + twice the size of input i.e. signed word. 
+ Then this multiplication results of adjacent odd-even elements + are added to the out vector + (2 signed word results) +*/ +#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0, \ + (v8i16) mult0, (v8i16) cnst0); \ + out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1, \ + (v8i16) mult1, (v8i16) cnst1); \ +} +#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__) + +/* Description : Clips all halfword elements of input vector between min & max + out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in)) + Arguments : Inputs - in (input vector) + - min (min threshold) + - max (max threshold) + Outputs - out_m (output vector with clipped elements) + Return Type - signed halfword +*/ +#define CLIP_SH(in, min, max) \ +( { \ + v8i16 out_m; \ + \ + out_m = __msa_max_s_h((v8i16) min, (v8i16) in); \ + out_m = __msa_min_s_h((v8i16) max, (v8i16) out_m); \ + out_m; \ } ) -#define CLIP_UNSIGNED_CHAR_H(in) \ +/* Description : Clips all signed halfword elements of input vector + between 0 & 255 + Arguments : Inputs - in (input vector) + Outputs - out_m (output vector with clipped elements) + Return Type - signed halfword +*/ +#define CLIP_SH_0_255(in) \ ( { \ v8i16 max_m = __msa_ldi_h(255); \ v8i16 out_m; \ \ - out_m = __msa_maxi_s_h((v8i16) (in), 0); \ + out_m = __msa_maxi_s_h((v8i16) in, 0); \ out_m = __msa_min_s_h((v8i16) max_m, (v8i16) out_m); \ out_m; \ } ) - -#define CLIP_UNSIGNED_CHAR_W(in) \ -( { \ - v4i32 max_m = __msa_ldi_w(255); \ - v4i32 out_m; \ - \ - out_m = __msa_maxi_s_w((v4i32) (in), 0); \ - out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m); \ - out_m; \ -} ) - -#define TRANSPOSE4x4_B_UB(in0, in1, in2, in3, \ - out0, out1, out2, out3) \ -{ \ - v16i8 zero_m = { 0 }; \ - v16i8 s0_m, s1_m, s2_m, s3_m; \ - \ - s0_m = (v16i8) __msa_ilvr_d((v2i64) (in1), (v2i64) (in0)); \ - s1_m = (v16i8) __msa_ilvr_d((v2i64) (in3), (v2i64) (in2)); \ - s2_m = __msa_ilvr_b(s1_m, s0_m); \ - s3_m = __msa_ilvl_b(s1_m, s0_m); \ - \ - out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m); \ - out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4); \ - out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4); \ - out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4); \ -} - -#define TRANSPOSE8x4_B_UB(in0, in1, in2, in3, \ - in4, in5, in6, in7, \ - out0, out1, out2, out3) \ -{ \ - v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - \ - tmp0_m = (v16i8) __msa_ilvev_w((v4i32) (in4), (v4i32) (in0)); \ - tmp1_m = (v16i8) __msa_ilvev_w((v4i32) (in5), (v4i32) (in1)); \ - tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m); \ - tmp0_m = (v16i8) __msa_ilvev_w((v4i32) (in6), (v4i32) (in2)); \ - tmp1_m = (v16i8) __msa_ilvev_w((v4i32) (in7), (v4i32) (in3)); \ - \ - tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m); \ - tmp0_m = (v16i8) __msa_ilvr_h((v8i16) tmp3_m, (v8i16) tmp2_m); \ - tmp1_m = (v16i8) __msa_ilvl_h((v8i16) tmp3_m, (v8i16) tmp2_m); \ - \ - out0 = (v16u8) __msa_ilvr_w((v4i32) tmp1_m, (v4i32) tmp0_m); \ - out2 = (v16u8) __msa_ilvl_w((v4i32) tmp1_m, (v4i32) tmp0_m); \ - out1 = (v16u8) __msa_ilvl_d((v2i64) out2, (v2i64) out0); \ - out3 = (v16u8) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \ -} - -#define TRANSPOSE8x4_B_UH(in0, in1, in2, in3, \ - in4, in5, in6, in7, \ - out0, out1, out2, out3) \ -{ \ - v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - \ - tmp0_m = (v16i8) __msa_ilvev_w((v4i32) (in4), (v4i32) (in0)); \ - tmp1_m = (v16i8) __msa_ilvev_w((v4i32) (in5), (v4i32) (in1)); \ - tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m); \ - tmp0_m = (v16i8) __msa_ilvev_w((v4i32) (in6), (v4i32) (in2)); \ - tmp1_m = (v16i8) 
__msa_ilvev_w((v4i32) (in7), (v4i32) (in3)); \ - \ - tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m); \ - tmp0_m = (v16i8) __msa_ilvr_h((v8i16) tmp3_m, (v8i16) tmp2_m); \ - tmp1_m = (v16i8) __msa_ilvl_h((v8i16) tmp3_m, (v8i16) tmp2_m); \ - \ - out0 = (v8u16) __msa_ilvr_w((v4i32) tmp1_m, (v4i32) tmp0_m); \ - out2 = (v8u16) __msa_ilvl_w((v4i32) tmp1_m, (v4i32) tmp0_m); \ - out1 = (v8u16) __msa_ilvl_d((v2i64) out2, (v2i64) out0); \ - out3 = (v8u16) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \ -} - -#define TRANSPOSE8x8_B_UB(in0, in1, in2, in3, \ - in4, in5, in6, in7, \ - out0, out1, out2, out3, \ - out4, out5, out6, out7) \ -{ \ - v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ - v16i8 zero_m = { 0 }; \ - \ - tmp0_m = __msa_ilvr_b((v16i8) (in2), (v16i8) (in0)); \ - tmp1_m = __msa_ilvr_b((v16i8) (in3), (v16i8) (in1)); \ - tmp2_m = __msa_ilvr_b((v16i8) (in6), (v16i8) (in4)); \ - tmp3_m = __msa_ilvr_b((v16i8) (in7), (v16i8) (in5)); \ - \ - tmp4_m = __msa_ilvr_b((v16i8) tmp1_m, (v16i8) tmp0_m); \ - tmp5_m = __msa_ilvl_b((v16i8) tmp1_m, (v16i8) tmp0_m); \ - tmp6_m = __msa_ilvr_b((v16i8) tmp3_m, (v16i8) tmp2_m); \ - tmp7_m = __msa_ilvl_b((v16i8) tmp3_m, (v16i8) tmp2_m); \ - \ - out0 = (v16u8) __msa_ilvr_w((v4i32) tmp6_m, (v4i32) tmp4_m); \ - out2 = (v16u8) __msa_ilvl_w((v4i32) tmp6_m, (v4i32) tmp4_m); \ - out4 = (v16u8) __msa_ilvr_w((v4i32) tmp7_m, (v4i32) tmp5_m); \ - out6 = (v16u8) __msa_ilvl_w((v4i32) tmp7_m, (v4i32) tmp5_m); \ - \ - out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 8); \ - out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 8); \ - out5 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out4, 8); \ - out7 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out6, 8); \ +#define CLIP_SH2_0_255(in0, in1) \ +{ \ + in0 = CLIP_SH_0_255(in0); \ + in1 = CLIP_SH_0_255(in1); \ } - -#define TRANSPOSE8x8_B_UH(in0, in1, in2, in3, \ - in4, in5, in6, in7, \ - out0, out1, out2, out3, \ - out4, out5, out6, out7) \ -{ \ - v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ - v16i8 zero_m = { 0 }; \ - \ - tmp0_m = __msa_ilvr_b((v16i8) (in2), (v16i8) (in0)); \ - tmp1_m = __msa_ilvr_b((v16i8) (in3), (v16i8) (in1)); \ - tmp2_m = __msa_ilvr_b((v16i8) (in6), (v16i8) (in4)); \ - tmp3_m = __msa_ilvr_b((v16i8) (in7), (v16i8) (in5)); \ - \ - tmp4_m = __msa_ilvr_b((v16i8) tmp1_m, (v16i8) tmp0_m); \ - tmp5_m = __msa_ilvl_b((v16i8) tmp1_m, (v16i8) tmp0_m); \ - tmp6_m = __msa_ilvr_b((v16i8) tmp3_m, (v16i8) tmp2_m); \ - tmp7_m = __msa_ilvl_b((v16i8) tmp3_m, (v16i8) tmp2_m); \ - \ - out0 = (v8u16) __msa_ilvr_w((v4i32) tmp6_m, (v4i32) tmp4_m); \ - out2 = (v8u16) __msa_ilvl_w((v4i32) tmp6_m, (v4i32) tmp4_m); \ - out4 = (v8u16) __msa_ilvr_w((v4i32) tmp7_m, (v4i32) tmp5_m); \ - out6 = (v8u16) __msa_ilvl_w((v4i32) tmp7_m, (v4i32) tmp5_m); \ - out1 = (v8u16) __msa_sldi_b(zero_m, (v16i8) out0, 8); \ - out3 = (v8u16) __msa_sldi_b(zero_m, (v16i8) out2, 8); \ - out5 = (v8u16) __msa_sldi_b(zero_m, (v16i8) out4, 8); \ - out7 = (v8u16) __msa_sldi_b(zero_m, (v16i8) out6, 8); \ -} - -#define TRANSPOSE16x8_B_UB(in0, in1, in2, in3, \ - in4, in5, in6, in7, \ - in8, in9, in10, in11, \ - in12, in13, in14, in15, \ - out0, out1, out2, out3, \ - out4, out5, out6, out7) \ -{ \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ - \ - (out7) = (v16u8) __msa_ilvev_d((v2i64) (in8), (v2i64) (in0)); \ - (out6) = (v16u8) __msa_ilvev_d((v2i64) (in9), (v2i64) (in1)); \ - (out5) = (v16u8) __msa_ilvev_d((v2i64) (in10), (v2i64) (in2)); \ - (out4) = (v16u8) __msa_ilvev_d((v2i64) (in11), 
(v2i64) (in3)); \ - (out3) = (v16u8) __msa_ilvev_d((v2i64) (in12), (v2i64) (in4)); \ - (out2) = (v16u8) __msa_ilvev_d((v2i64) (in13), (v2i64) (in5)); \ - (out1) = (v16u8) __msa_ilvev_d((v2i64) (in14), (v2i64) (in6)); \ - (out0) = (v16u8) __msa_ilvev_d((v2i64) (in15), (v2i64) (in7)); \ - \ - tmp0_m = (v16u8) __msa_ilvev_b((v16i8) (out6), (v16i8) (out7)); \ - tmp4_m = (v16u8) __msa_ilvod_b((v16i8) (out6), (v16i8) (out7)); \ - tmp1_m = (v16u8) __msa_ilvev_b((v16i8) (out4), (v16i8) (out5)); \ - tmp5_m = (v16u8) __msa_ilvod_b((v16i8) (out4), (v16i8) (out5)); \ - (out5) = (v16u8) __msa_ilvev_b((v16i8) (out2), (v16i8) (out3)); \ - tmp6_m = (v16u8) __msa_ilvod_b((v16i8) (out2), (v16i8) (out3)); \ - (out7) = (v16u8) __msa_ilvev_b((v16i8) (out0), (v16i8) (out1)); \ - tmp7_m = (v16u8) __msa_ilvod_b((v16i8) (out0), (v16i8) (out1)); \ - \ - tmp2_m = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \ - tmp3_m = (v16u8) __msa_ilvev_h((v8i16) (out7), (v8i16) (out5)); \ - (out0) = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \ - (out4) = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \ - \ - tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \ - tmp3_m = (v16u8) __msa_ilvod_h((v8i16) (out7), (v8i16) (out5)); \ - (out2) = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \ - (out6) = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \ - \ - tmp2_m = (v16u8) __msa_ilvev_h((v8i16) tmp5_m, (v8i16) tmp4_m); \ - tmp3_m = (v16u8) __msa_ilvev_h((v8i16) tmp7_m, (v8i16) tmp6_m); \ - (out1) = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \ - (out5) = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \ - \ - tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m); \ - tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m); \ - tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m); \ - tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m); \ - (out3) = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \ - (out7) = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \ -} - -#define TRANSPOSE8x8_H_SH(in0, in1, in2, in3, \ - in4, in5, in6, in7, \ - out0, out1, out2, out3, \ - out4, out5, out6, out7) \ -{ \ - v8i16 s0_m, s1_m; \ - v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ - \ - s0_m = __msa_ilvr_h((v8i16) (in6), (v8i16) (in4)); \ - s1_m = __msa_ilvr_h((v8i16) (in7), (v8i16) (in5)); \ - tmp0_m = __msa_ilvr_h((v8i16) s1_m, (v8i16) s0_m); \ - tmp1_m = __msa_ilvl_h((v8i16) s1_m, (v8i16) s0_m); \ - \ - s0_m = __msa_ilvl_h((v8i16) (in6), (v8i16) (in4)); \ - s1_m = __msa_ilvl_h((v8i16) (in7), (v8i16) (in5)); \ - tmp2_m = __msa_ilvr_h((v8i16) s1_m, (v8i16) s0_m); \ - tmp3_m = __msa_ilvl_h((v8i16) s1_m, (v8i16) s0_m); \ - \ - s0_m = __msa_ilvr_h((v8i16) (in2), (v8i16) (in0)); \ - s1_m = __msa_ilvr_h((v8i16) (in3), (v8i16) (in1)); \ - tmp4_m = __msa_ilvr_h((v8i16) s1_m, (v8i16) s0_m); \ - tmp5_m = __msa_ilvl_h((v8i16) s1_m, (v8i16) s0_m); \ - \ - s0_m = __msa_ilvl_h((v8i16) (in2), (v8i16) (in0)); \ - s1_m = __msa_ilvl_h((v8i16) (in3), (v8i16) (in1)); \ - tmp6_m = __msa_ilvr_h((v8i16) s1_m, (v8i16) s0_m); \ - tmp7_m = __msa_ilvl_h((v8i16) s1_m, (v8i16) s0_m); \ - \ - out0 = (v8i16) __msa_pckev_d((v2i64) tmp0_m, (v2i64) tmp4_m); \ - out1 = (v8i16) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m); \ - out2 = (v8i16) __msa_pckev_d((v2i64) tmp1_m, (v2i64) tmp5_m); \ - out3 = (v8i16) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m); \ - out4 = (v8i16) __msa_pckev_d((v2i64) tmp2_m, (v2i64) tmp6_m); \ - 
out5 = (v8i16) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m); \ - out6 = (v8i16) __msa_pckev_d((v2i64) tmp3_m, (v2i64) tmp7_m); \ - out7 = (v8i16) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m); \ -} - -#define TRANSPOSE4x4_W(in0, in1, in2, in3, \ - out0, out1, out2, out3) \ +#define CLIP_SH4_0_255(in0, in1, in2, in3) \ +{ \ + CLIP_SH2_0_255(in0, in1); \ + CLIP_SH2_0_255(in2, in3); \ +} + +/* Description : Horizontal subtraction of unsigned byte vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned odd byte element from 'in0' is subtracted from + even unsigned byte element from 'in0' (pairwise) and the + halfword result is stored in 'out0' +*/ +#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \ { \ - v4i32 s0_m, s1_m, s2_m, s3_m; \ - \ - s0_m = __msa_ilvr_w((v4i32) (in1), (v4i32) (in0)); \ - s1_m = __msa_ilvl_w((v4i32) (in1), (v4i32) (in0)); \ - s2_m = __msa_ilvr_w((v4i32) (in3), (v4i32) (in2)); \ - s3_m = __msa_ilvl_w((v4i32) (in3), (v4i32) (in2)); \ - \ - out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m); \ - out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m); \ - out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m); \ - out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m); \ -} - -#define ILV_B_LRLR_SB(in0, in1, in2, in3, \ - out0, out1, out2, out3) \ -{ \ - out0 = __msa_ilvl_b((v16i8) (in1), (v16i8) (in0)); \ - out1 = __msa_ilvr_b((v16i8) (in1), (v16i8) (in0)); \ - out2 = __msa_ilvl_b((v16i8) (in3), (v16i8) (in2)); \ - out3 = __msa_ilvr_b((v16i8) (in3), (v16i8) (in2)); \ + out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0); \ + out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1); \ +} +#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__) +#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__) + +/* Description : Interleave even halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even halfword elements of 'in0' and even halfword + elements of 'in1' are interleaved and copied to 'out0' + Even halfword elements of 'in2' and even halfword + elements of 'in3' are interleaved and copied to 'out1' +*/ +#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0); \ + out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2); \ +} +#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) + +/* Description : Interleave even word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even word elements of 'in0' and even word + elements of 'in1' are interleaved and copied to 'out0' + Even word elements of 'in2' and even word + elements of 'in3' are interleaved and copied to 'out1' +*/ +#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0); \ + out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2); \ +} +#define ILVEV_W2_SB(...) 
ILVEV_W2(v16i8, __VA_ARGS__) + +/* Description : Interleave even double word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double word elements of 'in0' and even double word + elements of 'in1' are interleaved and copied to 'out0' + Even double word elements of 'in2' and even double word + elements of 'in3' are interleaved and copied to 'out1' +*/ +#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_ilvev_d((v2i64) in1, (v2i64) in0); \ + out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2); \ +} +#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__) + +/* Description : Interleave left half of byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of byte elements of in0 and left half of byte + elements of in1 are interleaved and copied to out0. + Left half of byte elements of in2 and left half of byte + elements of in3 are interleaved and copied to out1. +*/ +#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \ + out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3); \ } +#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__) +#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__) -#define ILV_B_LRLR_UH(in0, in1, in2, in3, \ - out0, out1, out2, out3) \ +#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) \ { \ - out0 = (v8u16) __msa_ilvl_b((v16i8) (in1), (v16i8) (in0)); \ - out1 = (v8u16) __msa_ilvr_b((v16i8) (in1), (v16i8) (in0)); \ - out2 = (v8u16) __msa_ilvl_b((v16i8) (in3), (v16i8) (in2)); \ - out3 = (v8u16) __msa_ilvr_b((v16i8) (in3), (v16i8) (in2)); \ + ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__) +#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__) +#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__) + +/* Description : Interleave left half of halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of halfword elements of in0 and left half of halfword + elements of in1 are interleaved and copied to out0. + Left half of halfword elements of in2 and left half of halfword + elements of in3 are interleaved and copied to out1. +*/ +#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1); \ + out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3); \ } +#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__) -#define ILV_B_LRLR_SH(in0, in1, in2, in3, \ - out0, out1, out2, out3) \ +#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) \ { \ - out0 = (v8i16) __msa_ilvl_b((v16i8) (in1), (v16i8) (in0)); \ - out1 = (v8i16) __msa_ilvr_b((v16i8) (in1), (v16i8) (in0)); \ - out2 = (v8i16) __msa_ilvl_b((v16i8) (in3), (v16i8) (in2)); \ - out3 = (v8i16) __msa_ilvr_b((v16i8) (in3), (v16i8) (in2)); \ + ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define ILVL_H4_SH(...) 
ILVL_H4(v8i16, __VA_ARGS__) + +/* Description : Interleave left half of word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of word elements of in0 and left half of word + elements of in1 are interleaved and copied to out0. + Left half of word elements of in2 and left half of word + elements of in3 are interleaved and copied to out1. +*/ +#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \ + out1 = (RTYPE) __msa_ilvl_w((v4i32) in2, (v4i32) in3); \ +} +#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__) + +/* Description : Interleave right half of byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3 + Return Type - as per RTYPE + Details : Right half of byte elements of in0 and right half of byte + elements of in1 are interleaved and copied to out0. + Right half of byte elements of in2 and right half of byte + elements of in3 are interleaved and copied to out1. + Similar for other pairs +*/ +#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \ + out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3); \ } +#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) +#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) +#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__) +#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) -#define ILV_H_LRLR_SW(in0, in1, in2, in3, \ - out0, out1, out2, out3) \ +#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) \ { \ - out0 = (v4i32) __msa_ilvl_h((v8i16) (in1), (v8i16) (in0)); \ - out1 = (v4i32) __msa_ilvr_h((v8i16) (in1), (v8i16) (in0)); \ - out2 = (v4i32) __msa_ilvl_h((v8i16) (in3), (v8i16) (in2)); \ - out3 = (v4i32) __msa_ilvr_h((v8i16) (in3), (v8i16) (in2)); \ -} - -#define ILVR_B_2VECS_UB(in0_r, in1_r, in0_l, in1_l, \ - out0, out1) \ -{ \ - out0 = (v16u8) __msa_ilvr_b((v16i8) (in0_l), (v16i8) (in0_r)); \ - out1 = (v16u8) __msa_ilvr_b((v16i8) (in1_l), (v16i8) (in1_r)); \ -} - -#define ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ - out0, out1) \ + ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) +#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) +#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) + +/* Description : Interleave right half of halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3 + Return Type - signed halfword + Details : Right half of halfword elements of in0 and right half of + halfword elements of in1 are interleaved and copied to out0. + Right half of halfword elements of in2 and right half of + halfword elements of in3 are interleaved and copied to out1. + Similar for other pairs +*/ +#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ { \ - out0 = __msa_ilvr_b((v16i8) (in0_l), (v16i8) (in0_r)); \ - out1 = __msa_ilvr_b((v16i8) (in1_l), (v16i8) (in1_r)); \ + out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1); \ + out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3); \ } +#define ILVR_H2_SH(...) 
ILVR_H2(v8i16, __VA_ARGS__) -#define ILVR_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r, \ - in0_l, in1_l, in2_l, in3_l, \ - out0, out1, out2, out3) \ -{ \ - ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ - out0, out1); \ - ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ - out2, out3); \ -} - -#define ILVR_B_6VECS_SB(in0_r, in1_r, in2_r, \ - in3_r, in4_r, in5_r, \ - in0_l, in1_l, in2_l, \ - in3_l, in4_l, in5_l, \ - out0, out1, out2, \ - out3, out4, out5) \ -{ \ - ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ - out0, out1); \ - ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ - out2, out3); \ - ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \ - out4, out5); \ -} - -#define ILVR_B_8VECS_SB(in0_r, in1_r, in2_r, in3_r, \ - in4_r, in5_r, in6_r, in7_r, \ - in0_l, in1_l, in2_l, in3_l, \ - in4_l, in5_l, in6_l, in7_l, \ - out0, out1, out2, out3, \ - out4, out5, out6, out7) \ -{ \ - ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ - out0, out1); \ - ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ - out2, out3); \ - ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \ - out4, out5); \ - ILVR_B_2VECS_SB(in6_r, in7_r, in6_l, in7_l, \ - out6, out7); \ -} - -#define ILVR_B_2VECS_UH(in0_r, in1_r, in0_l, in1_l, \ - out0, out1) \ -{ \ - out0 = (v8u16) __msa_ilvr_b((v16i8) (in0_l), (v16i8) (in0_r)); \ - out1 = (v8u16) __msa_ilvr_b((v16i8) (in1_l), (v16i8) (in1_r)); \ +#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \ +{ \ + ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + out2 = (RTYPE) __msa_ilvr_h((v8i16) in4, (v8i16) in5); \ } +#define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__) -#define ILVR_B_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \ - out0, out1) \ -{ \ - out0 = (v8i16) __msa_ilvr_b((v16i8) (in0_l), (v16i8) (in0_r)); \ - out1 = (v8i16) __msa_ilvr_b((v16i8) (in1_l), (v16i8) (in1_r)); \ +#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) \ +{ \ + ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ } +#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) -#define ILVR_B_3VECS_SH(in0_r, in1_r, in2_r, in0_l, in1_l, in2_l, \ - out0, out1, out2) \ -{ \ - ILVR_B_2VECS_SH(in0_r, in1_r, in0_l, in1_l, out0, out1); \ - out2 = (v8i16) __msa_ilvr_b((v16i8) (in2_l), (v16i8) (in2_r)); \ +#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \ + out1 = (RTYPE) __msa_ilvr_w((v4i32) in2, (v4i32) in3); \ } +#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__) +#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__) -#define ILVR_B_4VECS_UH(in0_r, in1_r, in2_r, in3_r, \ - in0_l, in1_l, in2_l, in3_l, \ - out0, out1, out2, out3) \ -{ \ - ILVR_B_2VECS_UH(in0_r, in1_r, in0_l, in1_l, \ - out0, out1); \ - ILVR_B_2VECS_UH(in2_r, in3_r, in2_l, in3_l, \ - out2, out3); \ +#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) \ +{ \ + ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__) + +/* Description : Interleave right half of double word elements from vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3 + Return Type - unsigned double word + Details : Right half of double word elements of in0 and right half of + double word elements of in1 are interleaved and copied to out0. 
+ Right half of double word elements of in2 and right half of + double word elements of in3 are interleaved and copied to out1. +*/ +#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1)); \ + out1 = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3)); \ } +#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) +#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__) -#define ILVR_B_4VECS_SH(in0_r, in1_r, in2_r, in3_r, \ - in0_l, in1_l, in2_l, in3_l, \ - out0, out1, out2, out3) \ -{ \ - ILVR_B_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \ - out0, out1); \ - ILVR_B_2VECS_SH(in2_r, in3_r, in2_l, in3_l, \ - out2, out3); \ +#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \ +{ \ + ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + out2 = (RTYPE) __msa_ilvr_d((v2i64) (in4), (v2i64) (in5)); \ } +#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__) -#define ILVR_H_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \ - out0, out1) \ +#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) \ +{ \ + ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__) + +/* Description : Interleave both left and right half of input vectors + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements from 'in0' and 'in1' are + interleaved and stored to 'out0' + Left half of byte elements from 'in0' and 'in1' are + interleaved and stored to 'out1' +*/ +#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ { \ - out0 = __msa_ilvr_h((v8i16) (in0_l), (v8i16) (in0_r)); \ - out1 = __msa_ilvr_h((v8i16) (in1_l), (v8i16) (in1_r)); \ + out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \ + out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \ } +#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__) +#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__) -#define ILVR_H_4VECS_SH(in0_r, in1_r, in2_r, in3_r, \ - in0_l, in1_l, in2_l, in3_l, \ - out0, out1, out2, out3) \ -{ \ - ILVR_H_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \ - out0, out1); \ - ILVR_H_2VECS_SH(in2_r, in3_r, in2_l, in3_l, \ - out2, out3); \ -} - -#define ILVR_H_6VECS_SH(in0_r, in1_r, in2_r, \ - in3_r, in4_r, in5_r, \ - in0_l, in1_l, in2_l, \ - in3_l, in4_l, in5_l, \ - out0, out1, out2, \ - out3, out4, out5) \ -{ \ - ILVR_H_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \ - out0, out1); \ - ILVR_H_2VECS_SH(in2_r, in3_r, in2_l, in3_l, \ - out2, out3); \ - ILVR_H_2VECS_SH(in4_r, in5_r, in4_l, in5_l, \ - out4, out5); \ -} - -#define ILVR_H_8VECS_SH(in0_r, in1_r, in2_r, in3_r, \ - in4_r, in5_r, in6_r, in7_r, \ - in0_l, in1_l, in2_l, in3_l, \ - in4_l, in5_l, in6_l, in7_l, \ - out0, out1, out2, out3, \ - out4, out5, out6, out7) \ -{ \ - ILVR_H_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \ - out0, out1); \ - ILVR_H_2VECS_SH(in2_r, in3_r, in2_l, in3_l, \ - out2, out3); \ - ILVR_H_2VECS_SH(in4_r, in5_r, in4_l, in5_l, \ - out4, out5); \ - ILVR_H_2VECS_SH(in6_r, in7_r, in6_l, in7_l, \ - out6, out7); \ -} - -#define ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ - out0, out1) \ +#define ILVRL_H2(RTYPE, in0, in1, out0, out1) \ { \ - out0 = __msa_ilvl_b((v16i8) (in0_l), (v16i8) (in0_r)); \ - out1 = __msa_ilvl_b((v16i8) (in1_l), (v16i8) (in1_r)); \ + out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1); \ + out1 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1); \ } +#define ILVRL_H2_SB(...) 
ILVRL_H2(v16i8, __VA_ARGS__) +#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) +#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) -#define ILVL_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r, \ - in0_l, in1_l, in2_l, in3_l, \ - out0, out1, out2, out3) \ -{ \ - ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ - out0, out1); \ - ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ - out2, out3); \ -} - -#define ILVL_B_6VECS_SB(in0_r, in1_r, in2_r, \ - in3_r, in4_r, in5_r, \ - in0_l, in1_l, in2_l, \ - in3_l, in4_l, in5_l, \ - out0, out1, out2, \ - out3, out4, out5) \ -{ \ - ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ - out0, out1); \ - ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ - out2, out3); \ - ILVL_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \ - out4, out5); \ -} - -#define ILVL_H_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \ - out0, out1) \ +#define ILVRL_W2(RTYPE, in0, in1, out0, out1) \ { \ - out0 = __msa_ilvl_h((v8i16) (in0_l), (v8i16) (in0_r)); \ - out1 = __msa_ilvl_h((v8i16) (in1_l), (v8i16) (in1_r)); \ + out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \ + out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \ +} +#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) + +/* Description : Maximum values between signed elements of vector and + 5-bit signed immediate value are copied to the output vector + Arguments : Inputs - in0, in1, in2, in3, max_val + Outputs - in0, in1, in2, in3 (in place) + Return Type - unsigned halfword + Details : Maximum of signed halfword element values from 'in0' and + 'max_val' are written to output vector 'in0' +*/ +#define MAXI_SH2(RTYPE, in0, in1, max_val) \ +{ \ + in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, (max_val)); \ + in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, (max_val)); \ +} +#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__) + +#define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val) \ +{ \ + MAXI_SH2(RTYPE, in0, in1, max_val); \ + MAXI_SH2(RTYPE, in2, in3, max_val); \ +} +#define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__) + +/* Description : Saturate the halfword element values to the max + unsigned value of (sat_val+1 bits) + The element data width remains unchanged + Arguments : Inputs - in0, in1, in2, in3, sat_val + Outputs - in0, in1, in2, in3 (in place) + Return Type - unsigned halfword + Details : Each unsigned halfword element from 'in0' is saturated to the + value generated with (sat_val+1) bit range + Results are in placed to original vectors +*/ +#define SAT_UH2(RTYPE, in0, in1, sat_val) \ +{ \ + in0 = (RTYPE) __msa_sat_u_h((v8u16) in0, sat_val); \ + in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val); \ } +#define SAT_UH2_UH(...) 
SAT_UH2(v8u16, __VA_ARGS__) -#define ILVL_H_4VECS_SH(in0_r, in1_r, in2_r, in3_r, \ - in0_l, in1_l, in2_l, in3_l, \ - out0, out1, out2, out3) \ +#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \ { \ - ILVL_H_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \ - out0, out1); \ - ILVL_H_2VECS_SH(in2_r, in3_r, in2_l, in3_l, \ - out2, out3); \ -} - -#define ILVL_H_6VECS_SH(in0_r, in1_r, in2_r, \ - in3_r, in4_r, in5_r, \ - in0_l, in1_l, in2_l, \ - in3_l, in4_l, in5_l, \ - out0, out1, out2, \ - out3, out4, out5) \ -{ \ - ILVL_H_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \ - out0, out1); \ - ILVL_H_2VECS_SH(in2_r, in3_r, in2_l, in3_l, \ - out2, out3); \ - ILVL_H_2VECS_SH(in4_r, in5_r, in4_l, in5_l, \ - out4, out5); \ -} - -#define ILVL_H_8VECS_SH(in0_r, in1_r, in2_r, in3_r, \ - in4_r, in5_r, in6_r, in7_r, \ - in0_l, in1_l, in2_l, in3_l, \ - in4_l, in5_l, in6_l, in7_l, \ - out0, out1, out2, out3, \ - out4, out5, out6, out7) \ -{ \ - ILVL_H_2VECS_SH(in0_r, in1_r, in0_l, in1_l, \ - out0, out1); \ - ILVL_H_2VECS_SH(in2_r, in3_r, in2_l, in3_l, \ - out2, out3); \ - ILVL_H_2VECS_SH(in4_r, in5_r, in4_l, in5_l, \ - out4, out5); \ - ILVL_H_2VECS_SH(in6_r, in7_r, in6_l, in7_l, \ - out6, out7); \ -} - -#define ILVR_D_2VECS_SB(out0, in0_l, in0_r, \ - out1, in1_l, in1_r) \ -{ \ - out0 = (v16i8) __msa_ilvr_d((v2i64) (in0_l), (v2i64) (in0_r)); \ - out1 = (v16i8) __msa_ilvr_d((v2i64) (in1_l), (v2i64) (in1_r)); \ -} - -#define ILVR_D_3VECS_SB(out0, in0_l, in0_r, \ - out1, in1_l, in1_r, \ - out2, in2_l, in2_r) \ -{ \ - ILVR_D_2VECS_SB(out0, in0_l, in0_r, \ - out1, in1_l, in1_r); \ - out2 = (v16i8) __msa_ilvr_d((v2i64) (in2_l), (v2i64) (in2_r)); \ -} - -#define ILVR_D_4VECS_SB(out0, in0_l, in0_r, \ - out1, in1_l, in1_r, \ - out2, in2_l, in2_r, \ - out3, in3_l, in3_r) \ -{ \ - ILVR_D_2VECS_SB(out0, in0_l, in0_r, \ - out1, in1_l, in1_r); \ - ILVR_D_2VECS_SB(out2, in2_l, in2_r, \ - out3, in3_l, in3_r); \ -} - -#define MAXI_S_H_4VECS_UH(vec0, vec1, vec2, vec3, max_value) \ -{ \ - vec0 = (v8u16) __msa_maxi_s_h((v8i16) (vec0), (max_value)); \ - vec1 = (v8u16) __msa_maxi_s_h((v8i16) (vec1), (max_value)); \ - vec2 = (v8u16) __msa_maxi_s_h((v8i16) (vec2), (max_value)); \ - vec3 = (v8u16) __msa_maxi_s_h((v8i16) (vec3), (max_value)); \ -} - -#define SAT_U_H_4VECS_UH(vec0, vec1, vec2, vec3, sat_value) \ -{ \ - vec0 = __msa_sat_u_h((v8u16) (vec0), (sat_value)); \ - vec1 = __msa_sat_u_h((v8u16) (vec1), (sat_value)); \ - vec2 = __msa_sat_u_h((v8u16) (vec2), (sat_value)); \ - vec3 = __msa_sat_u_h((v8u16) (vec3), (sat_value)); \ -} - -#define PCKEV_B_4VECS_UB(in0_l, in1_l, in2_l, in3_l, \ - in0_r, in1_r, in2_r, in3_r, \ - out0, out1, out2, out3) \ -{ \ - out0 = (v16u8) __msa_pckev_b((v16i8) (in0_l), (v16i8) (in0_r)); \ - out1 = (v16u8) __msa_pckev_b((v16i8) (in1_l), (v16i8) (in1_r)); \ - out2 = (v16u8) __msa_pckev_b((v16i8) (in2_l), (v16i8) (in2_r)); \ - out3 = (v16u8) __msa_pckev_b((v16i8) (in3_l), (v16i8) (in3_r)); \ -} - -#define PCKEV_B_4VECS_SB(in0_l, in1_l, in2_l, in3_l, \ - in0_r, in1_r, in2_r, in3_r, \ - out0, out1, out2, out3) \ + SAT_UH2(RTYPE, in0, in1, sat_val); \ + SAT_UH2(RTYPE, in2, in3, sat_val) \ +} +#define SAT_UH4_UH(...) 
SAT_UH4(v8u16, __VA_ARGS__) + +/* Description : Indexed halfword element values are replicated to all + elements in output vector + Arguments : Inputs - in, idx0, idx1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : 'idx0' element value from 'in' vector is replicated to all + elements in 'out0' vector + Valid index range for halfword operation is 0-7 +*/ +#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0); \ + out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1); \ +} +#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__) + +#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \ + out0, out1, out2, out3) \ +{ \ + SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ + SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \ +} +#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__) + +/* Description : Indexed word element values are replicated to all + elements in output vector + Arguments : Inputs - in, stidx + Outputs - out0, out1 + Return Type - as per RTYPE + Details : 'stidx' element value from 'in' vector is replicated to all + elements in 'out0' vector + 'stidx + 1' element value from 'in' vector is replicated to all + elements in 'out1' vector + Valid index range for halfword operation is 0-3 +*/ +#define SPLATI_W2(RTYPE, in, stidx, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx); \ + out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1)); \ +} +#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__) + +#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3) \ +{ \ + SPLATI_W2(RTYPE, in, 0, out0, out1); \ + SPLATI_W2(RTYPE, in, 2, out2, out3); \ +} +#define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__) + +/* Description : Pack even byte elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even byte elements of in0 are copied to the left half of + out0 & even byte elements of in1 are copied to the right + half of out0. + Even byte elements of in2 are copied to the left half of + out1 & even byte elements of in3 are copied to the right + half of out1. +*/ +#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ { \ - out0 = __msa_pckev_b((v16i8) (in0_l), (v16i8) (in0_r)); \ - out1 = __msa_pckev_b((v16i8) (in1_l), (v16i8) (in1_r)); \ - out2 = __msa_pckev_b((v16i8) (in2_l), (v16i8) (in2_r)); \ - out3 = __msa_pckev_b((v16i8) (in3_l), (v16i8) (in3_r)); \ + out0 = (RTYPE) __msa_pckev_b((v16i8) in0, (v16i8) in1); \ + out1 = (RTYPE) __msa_pckev_b((v16i8) in2, (v16i8) in3); \ } +#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) +#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) +#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) +#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__) -#define XORI_B_2VECS_UB(val0, val1, out0, out1, xor_val) \ -{ \ - out0 = __msa_xori_b((v16u8) (val0), (xor_val)); \ - out1 = __msa_xori_b((v16u8) (val1), (xor_val)); \ +#define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \ +{ \ + PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5); \ } +#define PCKEV_B3_UB(...) 
PCKEV_B3(v16u8, __VA_ARGS__) -#define XORI_B_2VECS_SB(val0, val1, \ - out0, out1, xor_val) \ -{ \ - out0 = (v16i8) __msa_xori_b((v16u8) (val0), (xor_val)); \ - out1 = (v16i8) __msa_xori_b((v16u8) (val1), (xor_val)); \ -} - -#define XORI_B_3VECS_SB(val0, val1, val2, \ - out0, out1, out2, \ - xor_val) \ +#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) \ +{ \ + PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__) +#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__) + +/* Description : Pack even halfword elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even halfword elements of in0 are copied to the left half of + out0 & even halfword elements of in1 are copied to the right + half of out0. + Even halfword elements of in2 are copied to the left half of + out1 & even halfword elements of in3 are copied to the right + half of out1. +*/ +#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ { \ - XORI_B_2VECS_SB(val0, val1, \ - out0, out1, xor_val); \ - out2 = (v16i8) __msa_xori_b((v16u8) (val2), (xor_val)); \ -} - -#define XORI_B_4VECS_UB(val0, val1, val2, val3, \ - out0, out1, out2, out3, xor_val) \ -{ \ - XORI_B_2VECS_UB(val0, val1, out0, out1, xor_val); \ - XORI_B_2VECS_UB(val2, val3, out2, out3, xor_val); \ + out0 = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1); \ + out1 = (RTYPE) __msa_pckev_h((v8i16) in2, (v8i16) in3); \ } +#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__) -#define XORI_B_4VECS_SB(val0, val1, val2, val3, \ - out0, out1, out2, out3, \ - xor_val) \ -{ \ - XORI_B_2VECS_SB(val0, val1, \ - out0, out1, xor_val); \ - XORI_B_2VECS_SB(val2, val3, \ - out2, out3, xor_val); \ -} - -#define XORI_B_5VECS_SB(val0, val1, val2, val3, val4, \ - out0, out1, out2, out3, out4, \ - xor_val) \ -{ \ - XORI_B_3VECS_SB(val0, val1, val2, \ - out0, out1, out2, xor_val); \ - XORI_B_2VECS_SB(val3, val4, \ - out3, out4, xor_val); \ +#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) \ +{ \ + PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__) + +/* Description : Each byte element is logically xor'ed with immediate 128 + Arguments : Inputs - in0, in1 + Outputs - in0, in1 (in-place) + Return Type - as per RTYPE + Details : Each unsigned byte element from input vector 'in0' is + logically xor'ed with 128 and result is in-place stored in + 'in0' vector + Each unsigned byte element from input vector 'in1' is + logically xor'ed with 128 and result is in-place stored in + 'in1' vector + Similar for other pairs +*/ +#define XORI_B2_128(RTYPE, in0, in1) \ +{ \ + in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128); \ + in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128); \ +} +#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__) + +#define XORI_B3_128(RTYPE, in0, in1, in2) \ +{ \ + XORI_B2_128(RTYPE, in0, in1); \ + in2 = (RTYPE) __msa_xori_b((v16u8) in2, 128); \ +} +#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__) + +#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \ +{ \ + XORI_B2_128(RTYPE, in0, in1); \ + XORI_B2_128(RTYPE, in2, in3); \ +} +#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__) +#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__) +#define XORI_B4_128_SH(...) 
XORI_B4_128(v8i16, __VA_ARGS__) + +#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4) \ +{ \ + XORI_B3_128(RTYPE, in0, in1, in2); \ + XORI_B2_128(RTYPE, in3, in4); \ } +#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__) -#define XORI_B_6VECS_SB(val0, val1, val2, val3, val4, val5, \ - out0, out1, out2, out3, out4, out5, \ - xor_val) \ -{ \ - XORI_B_4VECS_SB(val0, val1, val2, val3, \ - out0, out1, out2, out3, xor_val); \ - XORI_B_2VECS_SB(val4, val5,out4, out5, xor_val); \ +#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \ +{ \ + XORI_B4_128(RTYPE, in0, in1, in2, in3); \ + XORI_B3_128(RTYPE, in4, in5, in6); \ } +#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__) -#define XORI_B_7VECS_SB(val0, val1, val2, val3, \ - val4, val5, val6, \ - out0, out1, out2, out3, \ - out4, out5, out6, \ - xor_val) \ -{ \ - XORI_B_4VECS_SB(val0, val1, val2, val3, \ - out0, out1, out2, out3, xor_val); \ - XORI_B_3VECS_SB(val4, val5, val6, \ - out4, out5, out6, xor_val); \ +#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7) \ +{ \ + XORI_B4_128(RTYPE, in0, in1, in2, in3); \ + XORI_B4_128(RTYPE, in4, in5, in6, in7); \ +} +#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__) + +/* Description : Addition of signed halfword elements and signed saturation + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'in0' are added to signed + halfword elements of 'in1'. The result is then signed saturated + between -32768 to +32767 (as per halfword data type) + Similar for other pairs +*/ +#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_adds_s_h((v8i16) in0, (v8i16) in1); \ + out1 = (RTYPE) __msa_adds_s_h((v8i16) in2, (v8i16) in3); \ } +#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__) -#define XORI_B_8VECS_SB(val0, val1, val2, val3, \ - val4, val5, val6, val7, \ - out0, out1, out2, out3, \ - out4, out5, out6, out7, xor_val) \ -{ \ - XORI_B_4VECS_SB(val0, val1, val2, val3, \ - out0, out1, out2, out3, xor_val); \ - XORI_B_4VECS_SB(val4, val5, val6, val7, \ - out4, out5, out6, out7, xor_val); \ -} -#define ADDS_S_H_4VECS_UH(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) \ -{ \ - out0 = (v8u16) __msa_adds_s_h((v8i16) (in0), (v8i16) (in1)); \ - out1 = (v8u16) __msa_adds_s_h((v8i16) (in2), (v8i16) (in3)); \ - out2 = (v8u16) __msa_adds_s_h((v8i16) (in4), (v8i16) (in5)); \ - out3 = (v8u16) __msa_adds_s_h((v8i16) (in6), (v8i16) (in7)); \ -} -#define SRA_4VECS(in0, in1, in2, in3, \ - out0, out1, out2, out3, \ - shift_right_vec) \ +#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) \ +{ \ + ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define ADDS_SH4_UH(...) 
ADDS_SH4(v8u16, __VA_ARGS__) + +/* Description : Shift left all elements of vector (generic for all data types) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in0, in1, in2, in3 (in place) + Return Type - as per input vector RTYPE + Details : Each element of vector 'in0' is left shifted by 'shift' and + result is in place written to 'in0' + Similar for other pairs +*/ +#define SLLI_4V(in0, in1, in2, in3, shift) \ +{ \ + in0 = in0 << shift; \ + in1 = in1 << shift; \ + in2 = in2 << shift; \ + in3 = in3 << shift; \ +} + +/* Description : Arithmetic shift right all elements of vector + (generic for all data types) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in0, in1, in2, in3 (in place) + Return Type - as per input vector RTYPE + Details : Each element of vector 'in0' is right shifted by 'shift' and + result is in place written to 'in0' + Here, 'shift' is GP variable passed in + Similar for other pairs +*/ +#define SRA_4V(in0, in1, in2, in3, shift) \ { \ - out0 = (in0) >> (shift_right_vec); \ - out1 = (in1) >> (shift_right_vec); \ - out2 = (in2) >> (shift_right_vec); \ - out3 = (in3) >> (shift_right_vec); \ -} - -#define SRL_H_4VECS_UH(in0, in1, in2, in3, \ - out0, out1, out2, out3, \ - shift_right_vec) \ -{ \ - out0 = (v8u16) __msa_srl_h((v8i16) (in0), (v8i16) (shift_right_vec)); \ - out1 = (v8u16) __msa_srl_h((v8i16) (in1), (v8i16) (shift_right_vec)); \ - out2 = (v8u16) __msa_srl_h((v8i16) (in2), (v8i16) (shift_right_vec)); \ - out3 = (v8u16) __msa_srl_h((v8i16) (in3), (v8i16) (shift_right_vec)); \ -} - -#define SRAR_SATURATE_SIGNED_H(input, right_shift_vec, sat_val) \ -( { \ - v8i16 out_m; \ - \ - out_m = __msa_srar_h((v8i16) (input), (v8i16) (right_shift_vec)); \ - out_m = __msa_sat_s_h(out_m, (sat_val)); \ - out_m; \ -} ) - -#define PCKEV_2B_XORI128_STORE_4_BYTES_4(in1, in2, \ - pdst, stride) \ -{ \ - uint32_t out0_m, out1_m, out2_m, out3_m; \ - v16i8 tmp0_m; \ - uint8_t *dst_m = (uint8_t *) (pdst); \ - \ - tmp0_m = __msa_pckev_b((v16i8) (in2), (v16i8) (in1)); \ - tmp0_m = (v16i8) __msa_xori_b((v16u8) tmp0_m, 128); \ - \ - out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0); \ - out1_m = __msa_copy_u_w((v4i32) tmp0_m, 1); \ - out2_m = __msa_copy_u_w((v4i32) tmp0_m, 2); \ - out3_m = __msa_copy_u_w((v4i32) tmp0_m, 3); \ - \ - STORE_WORD(dst_m, out0_m); \ - dst_m += stride; \ - STORE_WORD(dst_m, out1_m); \ - dst_m += stride; \ - STORE_WORD(dst_m, out2_m); \ - dst_m += stride; \ - STORE_WORD(dst_m, out3_m); \ -} - -#define PCKEV_B_XORI128_STORE_8_BYTES(in1, in2, pdest) \ + in0 = in0 >> shift; \ + in1 = in1 >> shift; \ + in2 = in2 >> shift; \ + in3 = in3 >> shift; \ +} + +/* Description : Shift right logical all halfword elements of vector + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in0, in1, in2, in3 (in place) + Return Type - unsigned halfword + Details : Each element of vector 'in0' is shifted right logical by + number of bits respective element holds in vector 'shift' and + result is in place written to 'in0' + Here, 'shift' is a vector passed in + Similar for other pairs +*/ +#define SRL_H4(RTYPE, in0, in1, in2, in3, shift) \ +{ \ + in0 = (RTYPE) __msa_srl_h((v8i16) in0, (v8i16) shift); \ + in1 = (RTYPE) __msa_srl_h((v8i16) in1, (v8i16) shift); \ + in2 = (RTYPE) __msa_srl_h((v8i16) in2, (v8i16) shift); \ + in3 = (RTYPE) __msa_srl_h((v8i16) in3, (v8i16) shift); \ +} +#define SRL_H4_UH(...) 
SRL_H4(v8u16, __VA_ARGS__) + +/* Description : Multiplication of pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element from 'in0' is multiplied with elements from 'in1' + and result is written to 'out0' + Similar for other pairs +*/ +#define MUL2(in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = in0 * in1; \ + out1 = in2 * in3; \ +} +#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ +{ \ + MUL2(in0, in1, in2, in3, out0, out1); \ + MUL2(in4, in5, in6, in7, out2, out3); \ +} + +/* Description : Zero extend unsigned byte elements to halfword elements + Arguments : Inputs - in (1 input unsigned byte vector) + Outputs - out0, out1 (unsigned 2 halfword vectors) + Return Type - signed halfword + Details : Zero extended right half of vector is returned in 'out0' + Zero extended left half of vector is returned in 'out1' +*/ +#define UNPCK_UB_SH(in, out0, out1) \ +{ \ + v16i8 zero_m = { 0 }; \ + \ + ILVRL_B2_SH(zero_m, in, out0, out1); \ +} + +/* Description : Transposes input 4x4 byte block + Arguments : Inputs - in0, in1, in2, in3 (input 4x4 byte block) + Outputs - out0, out1, out2, out3 (output 4x4 byte block) + Return Type - unsigned byte + Details : +*/ +#define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3) \ +{ \ + v16i8 zero_m = { 0 }; \ + v16i8 s0_m, s1_m, s2_m, s3_m; \ + \ + ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m); \ + ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m); \ + \ + out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m); \ + out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4); \ + out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4); \ + out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4); \ +} + +/* Description : Transposes input 8x4 byte block into 4x8 + Arguments : Inputs - in0, in1, in2, in3 (input 8x4 byte block) + Outputs - out0, out1, out2, out3 (output 4x8 byte block) + Return Type - unsigned byte + Details : +*/ +#define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) \ +{ \ + v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m); \ + tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m); \ + ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m); \ + \ + tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m); \ + ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m); \ + \ + ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2); \ + out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0); \ + out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \ +} + +#define TRANSPOSE8x4_UB_UB(...) 
TRANSPOSE8x4_UB(v16u8, __VA_ARGS__) + +/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - unsigned byte + Details : +*/ +#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ + in8, in9, in10, in11, in12, in13, in14, in15, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ +{ \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \ + ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \ + ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \ + ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \ + \ + tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7); \ + tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7); \ + tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5); \ + tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5); \ + out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3); \ + tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3); \ + out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1); \ + tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1); \ + \ + ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \ + out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \ + out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \ + \ + tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \ + tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5); \ + out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \ + out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \ + \ + ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \ + out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \ + out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \ + \ + tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m); \ + tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m); \ + tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m); \ + tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m); \ + out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \ + out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \ +} + +/* Description : Pack even byte elements, extract 0 & 2 index words from pair + of results and store 4 words in destination memory as per + stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride +*/ +#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \ { \ - uint64_t out_m; \ - v16i8 tmp_m; \ + uint32_t out0_m, out1_m, out2_m, out3_m; \ + v16i8 tmp0_m, tmp1_m; \ \ - tmp_m = __msa_pckev_b((v16i8) (in1), (v16i8) (in2)); \ - tmp_m = (v16i8) __msa_xori_b((v16u8) tmp_m, 128); \ - out_m = __msa_copy_u_d((v2i64) tmp_m, 0); \ - STORE_DWORD((pdest), out_m); \ -} - -#define PCKEV_B_XORI128_STORE_8_BYTES_2(in1, in2, \ - pdst, stride) \ -{ \ - uint64_t out0_m, out1_m; \ - v16i8 tmp0_m; \ - uint8_t *dst_m = (uint8_t *) (pdst); \ - \ - tmp0_m = __msa_pckev_b((v16i8) (in2), (v16i8) (in1)); \ - tmp0_m = (v16i8) __msa_xori_b((v16u8) tmp0_m, 128); \ - \ - out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0); \ - out1_m = __msa_copy_u_d((v2i64) tmp0_m, 1); \ - \ - STORE_DWORD(dst_m, out0_m); \ - dst_m += stride; \ - STORE_DWORD(dst_m, out1_m); \ -} - -#define PCKEV_B_XORI128_STORE_6_BYTES_4(in1, in2, in3, in4, \ - pdst, stride) \ -{ \ - uint32_t out0_m, 
out1_m, out2_m, out3_m; \ - uint16_t out4_m, out5_m, out6_m, out7_m; \ - v16i8 tmp0_m, tmp1_m; \ - uint8_t *dst_m = (uint8_t *) (pdst); \ - \ - tmp0_m = __msa_pckev_b((v16i8) (in2), (v16i8) (in1)); \ - tmp1_m = __msa_pckev_b((v16i8) (in4), (v16i8) (in3)); \ - \ - tmp0_m = (v16i8) __msa_xori_b((v16u8) tmp0_m, 128); \ - tmp1_m = (v16i8) __msa_xori_b((v16u8) tmp1_m, 128); \ - \ - out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0); \ - out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2); \ - out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0); \ - out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2); \ - \ - out4_m = __msa_copy_u_h((v8i16) tmp0_m, 2); \ - out5_m = __msa_copy_u_h((v8i16) tmp0_m, 6); \ - out6_m = __msa_copy_u_h((v8i16) tmp1_m, 2); \ - out7_m = __msa_copy_u_h((v8i16) tmp1_m, 6); \ - \ - STORE_WORD(dst_m, out0_m); \ - STORE_HWORD((dst_m + 4), out4_m); \ - dst_m += stride; \ - STORE_WORD(dst_m, out1_m); \ - STORE_HWORD((dst_m + 4), out5_m); \ - dst_m += stride; \ - STORE_WORD(dst_m, out2_m); \ - STORE_HWORD((dst_m + 4), out6_m); \ - dst_m += stride; \ - STORE_WORD(dst_m, out3_m); \ - STORE_HWORD((dst_m + 4), out7_m); \ -} - -#define PCKEV_B_4_XORI128_STORE_8_BYTES_4(in1, in2, in3, in4, \ - pdst, stride) \ -{ \ - uint64_t out0_m, out1_m, out2_m, out3_m; \ - v16i8 tmp0_m, tmp1_m; \ - uint8_t *dst_m = (uint8_t *) (pdst); \ - \ - tmp0_m = __msa_pckev_b((v16i8) (in2), (v16i8) (in1)); \ - tmp1_m = __msa_pckev_b((v16i8) (in4), (v16i8) (in3)); \ - \ - tmp0_m = (v16i8) __msa_xori_b((v16u8) tmp0_m, 128); \ - tmp1_m = (v16i8) __msa_xori_b((v16u8) tmp1_m, 128); \ - \ - out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0); \ - out1_m = __msa_copy_u_d((v2i64) tmp0_m, 1); \ - out2_m = __msa_copy_u_d((v2i64) tmp1_m, 0); \ - out3_m = __msa_copy_u_d((v2i64) tmp1_m, 1); \ - \ - STORE_DWORD(dst_m, out0_m); \ - dst_m += stride; \ - STORE_DWORD(dst_m, out1_m); \ - dst_m += stride; \ - STORE_DWORD(dst_m, out2_m); \ - dst_m += stride; \ - STORE_DWORD(dst_m, out3_m); \ -} -#define PCKEV_B_XORI128_STORE_VEC(in1, in2, pdest) \ -{ \ - v16i8 tmp_m; \ + PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m); \ \ - tmp_m = __msa_pckev_b((v16i8) (in1), (v16i8) (in2)); \ - tmp_m = (v16i8) __msa_xori_b((v16u8) tmp_m, 128); \ - STORE_SB(tmp_m, (pdest)); \ -} - -#define PCKEV_B_STORE_4_BYTES_4(in1, in2, in3, in4, \ - pdst, stride) \ -{ \ - uint32_t out0_m, out1_m, out2_m, out3_m; \ - v16i8 tmp0_m, tmp1_m; \ - uint8_t *dst_m = (uint8_t *) (pdst); \ - \ - tmp0_m = __msa_pckev_b((v16i8) (in2), (v16i8) (in1)); \ - tmp1_m = __msa_pckev_b((v16i8) (in4), (v16i8) (in3)); \ - \ - out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0); \ - out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2); \ - out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0); \ - out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2); \ - \ - STORE_WORD(dst_m, out0_m); \ - dst_m += stride; \ - STORE_WORD(dst_m, out1_m); \ - dst_m += stride; \ - STORE_WORD(dst_m, out2_m); \ - dst_m += stride; \ - STORE_WORD(dst_m, out3_m); \ -} - -#define PCKEV_B_STORE_8_BYTES_4(in1, in2, in3, in4, \ - pdst, stride) \ -{ \ - uint64_t out0_m, out1_m, out2_m, out3_m; \ - v16i8 tmp0_m, tmp1_m; \ - uint8_t *dst_m = (uint8_t *) (pdst); \ - \ - tmp0_m = __msa_pckev_b((v16i8) (in2), (v16i8) (in1)); \ - tmp1_m = __msa_pckev_b((v16i8) (in4), (v16i8) (in3)); \ - \ - out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0); \ - out1_m = __msa_copy_u_d((v2i64) tmp0_m, 1); \ - out2_m = __msa_copy_u_d((v2i64) tmp1_m, 0); \ - out3_m = __msa_copy_u_d((v2i64) tmp1_m, 1); \ - \ - STORE_DWORD(dst_m, out0_m); \ - dst_m += stride; \ - STORE_DWORD(dst_m, out1_m); \ - dst_m += stride; \ - 
STORE_DWORD(dst_m, out2_m); \ - dst_m += stride; \ - STORE_DWORD(dst_m, out3_m); \ -} - -#define UNPCK_SIGNED_B_TO_H(in, out1, out2) \ -{ \ - v16i8 tmp_m; \ - \ - tmp_m = __msa_clti_s_b((v16i8) (in), 0); \ - out1 = (v8i16) __msa_ilvr_b(tmp_m, (v16i8) (in)); \ - out2 = (v8i16) __msa_ilvl_b(tmp_m, (v16i8) (in)); \ -} - -#define SWAP_VECS(Vec0, Vec1) \ -{ \ - Vec0 = Vec0 ^ Vec1; \ - Vec1 = Vec0 ^ Vec1; \ - Vec0 = Vec0 ^ Vec1; \ + out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0); \ + out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2); \ + out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0); \ + out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2); \ + \ + SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \ } - #endif /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */
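
A note on how the renamed helpers are meant to compose (illustration only, not part of the patch): callers that previously paired the old per-width store macros with explicit word stores now shift their halfword results and hand them straight to PCKEV_ST4x4_UB. The helper name, the 6-bit shift amount and the v8i16 inputs in the sketch below are assumptions made for this example.

    #include "libavutil/mips/generic_macros_msa.h"

    /* Illustration only: clip four filtered halfword vectors back to pixel
     * precision and store them as a 4x4 byte block using the renamed macros.
     * The function name, the 6-bit shift and the calling convention are
     * invented for this sketch and are not part of the patch. */
    static void store_filtered_4x4(v8i16 res0, v8i16 res1,
                                   v8i16 res2, v8i16 res3,
                                   uint8_t *dst, int32_t dst_stride)
    {
        /* arithmetic shift right, applied in place to all four vectors */
        SRA_4V(res0, res1, res2, res3, 6);
        /* pack even bytes of each pair and store one word per output row */
        PCKEV_ST4x4_UB(res0, res1, res2, res3, dst, dst_stride);
    }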
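Similarly, the filter-setup pattern served by SPLATI_H4_SH and XORI_B4_128_SB: one coefficient vector is loaded once and its taps broadcast, and unsigned source rows are re-biased into the signed range expected by signed-byte dot products. Again a sketch under assumed names; the dot products that would normally follow are omitted.

    /* Illustration only: broadcast four filter taps and re-bias four source
     * rows in place.  "filt", "src0..src3" and the helper name are assumed
     * for the example. */
    static void prepare_filter_example(v8i16 filt,
                                       v16i8 *src0, v16i8 *src1,
                                       v16i8 *src2, v16i8 *src3,
                                       v8i16 *filt0, v8i16 *filt1,
                                       v8i16 *filt2, v8i16 *filt3)
    {
        /* replicate halfword elements 0..3 of 'filt' across four vectors */
        SPLATI_H4_SH(filt, 0, 1, 2, 3, *filt0, *filt1, *filt2, *filt3);
        /* xor each byte with 128: unsigned pixels -> signed range, in place */
        XORI_B4_128_SB(*src0, *src1, *src2, *src3);
    }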