@@ -291,6 +291,7 @@
LD_B2 ( RTYPE , ( psrc ) , stride , out0 , out1 ) ; \
out2 = LD_B ( RTYPE , ( psrc ) + 2 * stride ) ; \
}
# define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
# define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)
# define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
@@ -573,6 +574,18 @@
SH ( out7_m , ( pblk_6x4_m + 4 ) ) ; \
}
/* Description : Store as 8x1 byte block to destination memory from input vector
   Arguments   : Inputs - in , pdst
   Details     : The index-0 double word (64-bit) element of input vector
                 ' in ' is extracted with __msa_copy_u_d and stored to
                 destination memory at ( pdst ) via the SD store helper
                 (defined elsewhere in this file)
*/
# define ST8x1_UB(in, pdst) \
{ \
uint64_t out0_m ; \
out0_m = __msa_copy_u_d ( ( v2i64 ) in , 0 ) ; \
SD ( out0_m , pdst ) ; \
}
/* Description : Store as 8x2 byte block to destination memory from input vector
Arguments : Inputs - in , pdst , stride
Details : Index 0 double word element from input vector ' in ' is copied
@@ -716,6 +729,23 @@
}
# define SLDI_B4_0_SB(...) SLDI_B4_0(v16i8, __VA_ARGS__)
/* Description : Immediate number of columns to slide
   Arguments   : Inputs - in0_0 , in0_1 , in1_0 , in1_1 , slide_val
                 Outputs - out0 , out1
                 Return Type - as per RTYPE
   Details     : Byte elements from ' in0_0 ' vector are slid into ' in1_0 '
                 by the number of elements specified by ' slide_val ' ;
                 likewise ' in0_1 ' is slid into ' in1_1 ' for the second
                 output ( two __msa_sldi_b operations per invocation )
*/
# define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
{ \
out0 = ( RTYPE ) __msa_sldi_b ( ( v16i8 ) in0_0 , ( v16i8 ) in1_0 , slide_val ) ; \
out1 = ( RTYPE ) __msa_sldi_b ( ( v16i8 ) in0_1 , ( v16i8 ) in1_1 , slide_val ) ; \
}
/* Type-specific front-ends: unsigned byte, signed byte, signed halfword */
# define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
# define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
# define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
/* Description : Shuffle byte vector elements as per mask vector
Arguments : Inputs - in0 , in1 , in2 , in3 , mask0 , mask1
Outputs - out0 , out1
@@ -1090,6 +1120,16 @@
/* Type-specific front-ends for ILVR_B2 (defined earlier in this file) */
# define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
# define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
# define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
# define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)
/* Description : Interleave right half of byte elements from three input pairs
   Arguments   : Inputs - in0 , in1 , in2 , in3 , in4 , in5
                 Outputs - out0 , out1 , out2
                 Return Type - as per RTYPE
   Details     : Pairs ( in0 , in1 ) and ( in2 , in3 ) are handled by ILVR_B2 ;
                 the third pair ( in4 , in5 ) is interleaved directly with
                 __msa_ilvr_b into out2
*/
# define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
{ \
ILVR_B2 ( RTYPE , in0 , in1 , in2 , in3 , out0 , out1 ) ; \
out2 = ( RTYPE ) __msa_ilvr_b ( ( v16i8 ) in4 , ( v16i8 ) in5 ) ; \
}
# define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
# define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
# define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)
# define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0 , out1 , out2 , out3 ) \
@@ -1306,6 +1346,7 @@
out0 = ( RTYPE ) __msa_splati_h ( ( v8i16 ) in , idx0 ) ; \
out1 = ( RTYPE ) __msa_splati_h ( ( v8i16 ) in , idx1 ) ; \
}
# define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
# define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
# define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \
@@ -1427,7 +1468,9 @@
in0 = ( RTYPE ) __msa_xori_b ( ( v16u8 ) in0 , 128 ) ; \
in1 = ( RTYPE ) __msa_xori_b ( ( v16u8 ) in1 , 128 ) ; \
}
# define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
# define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
# define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)
# define XORI_B3_128(RTYPE, in0, in1, in2) \
{ \
@@ -1628,6 +1671,14 @@
/* Type-specific front-ends for SRARI_H2 (defined earlier in this file) */
# define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
# define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
/* Description : Shift right arithmetic rounded (immediate) on four vectors
   Arguments   : Inputs - in0 , in1 , in2 , in3 , shift
                 Outputs - in0 , in1 , in2 , in3 ( in place )
                 Return Type - as per RTYPE
   Details     : Applies SRARI_H2 to the pairs ( in0 , in1 ) and ( in2 , in3 ) ,
                 modifying all four inputs in place
*/
# define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
{ \
SRARI_H2 ( RTYPE , in0 , in1 , shift ) ; \
SRARI_H2 ( RTYPE , in2 , in3 , shift ) ; \
}
# define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
# define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
/* Description : Shift right arithmetic rounded (immediate)
Arguments : Inputs - in0 , in1 , shift
Outputs - in0 , in1 ( in place )