@@ -27,202 +27,163 @@
# define ALIGNMENT 16
# define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1)))
# define LD_B(RTYPE, psrc) *((RTYPE *)(psrc))
# define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
# define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
# define LD_H(RTYPE, psrc) *((RTYPE *)(psrc))
# define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
# define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
# define LD_W(RTYPE, psrc) *((RTYPE *)(psrc))
# define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
# define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
# define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
# define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
# define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
# define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
# define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
# define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
# define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
# define ST_UW(...) ST_W(v4u32, __VA_ARGS__)
# define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
# define LD_V(RTYPE, psrc) *((RTYPE *)(psrc))
# define LD_UB(...) LD_V(v16u8, __VA_ARGS__)
# define LD_SB(...) LD_V(v16i8, __VA_ARGS__)
# define LD_UH(...) LD_V(v8u16, __VA_ARGS__)
# define LD_SH(...) LD_V(v8i16, __VA_ARGS__)
# define LD_UW(...) LD_V(v4u32, __VA_ARGS__)
# define LD_SW(...) LD_V(v4i32, __VA_ARGS__)
# define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
# define ST_UB(...) ST_V(v16u8, __VA_ARGS__)
# define ST_SB(...) ST_V(v16i8, __VA_ARGS__)
# define ST_UH(...) ST_V(v8u16, __VA_ARGS__)
# define ST_SH(...) ST_V(v8i16, __VA_ARGS__)
# define ST_UW(...) ST_V(v4u32, __VA_ARGS__)
# define ST_SW(...) ST_V(v4i32, __VA_ARGS__)
# if (__mips_isa_rev >= 6)
# define LW(psrc) \
( { \
uint8_t * psrc_m = ( uint8_t * ) ( psrc ) ; \
uint32_t val_m ; \
\
__asm__ volatile ( \
" lw %[val_m], %[psrc_m] \n \t " \
\
: [ val_m ] " =r " ( val_m ) \
: [ psrc_m ] " m " ( * psrc_m ) \
) ; \
\
val_m ; \
# define LH(psrc) \
( { \
uint16_t val_lh_m = * ( uint16_t * ) ( psrc ) ; \
val_lh_m ; \
} )
# define LW(psrc) \
( { \
uint32_t val_lw_m = * ( uint32_t * ) ( psrc ) ; \
val_lw_m ; \
} )
# if (__mips == 64)
# define LD(psrc) \
( { \
uint8_t * psrc_m = ( uint8_t * ) ( psrc ) ; \
uint64_t val_m = 0 ; \
\
__asm__ volatile ( \
" ld %[val_m], %[psrc_m] \n \t " \
\
: [ val_m ] " =r " ( val_m ) \
: [ psrc_m ] " m " ( * psrc_m ) \
) ; \
\
val_m ; \
# define LD(psrc) \
( { \
uint64_t val_ld_m = * ( uint64_t * ) ( psrc ) ; \
val_ld_m ; \
} )
# else // !(__mips == 64)
# define LD(psrc) \
( { \
uint8_t * psrc_ld_m = ( uint8_t * ) ( psrc ) ; \
uint32_t val0_m , val1_m ; \
uint64_t val_m = 0 ; \
\
val0_m = LW ( psrc_ld_m ) ; \
val1_m = LW ( psrc_ld_m + 4 ) ; \
\
val_m = ( uint64_t ) ( val1_m ) ; \
val_m = ( uint64_t ) ( ( val_m << 32 ) & 0xFFFFFFFF00000000 ) ; \
val_m = ( uint64_t ) ( val_m | ( uint64_t ) val0_m ) ; \
\
val_m ; \
# define LD(psrc) \
( { \
uint8_t * psrc_ld_m = ( uint8_t * ) ( psrc ) ; \
uint32_t val0_ld_m , val1_ld_m ; \
uint64_t val_ld_m = 0 ; \
\
val0_ld_m = LW ( psrc_ld_m ) ; \
val1_ld_m = LW ( psrc_ld_m + 4 ) ; \
\
val_ld_m = ( uint64_t ) ( val1_ld_m ) ; \
val_ld_m = ( uint64_t ) ( ( val_ld_m << 32 ) & 0xFFFFFFFF00000000 ) ; \
val_ld_m = ( uint64_t ) ( val_ld_m | ( uint64_t ) val0_ld_m ) ; \
\
val_ld_m ; \
} )
# endif // (__mips == 64)
# define SH(val, pdst) \
{ \
uint8_t * pdst_m = ( uint8_t * ) ( pdst ) ; \
uint16_t val_m = ( val ) ; \
\
__asm__ volatile ( \
" sh %[val_m], %[pdst_m] \n \t " \
\
: [ pdst_m ] " =m " ( * pdst_m ) \
: [ val_m ] " r " ( val_m ) \
) ; \
}
# define SW(val, pdst) \
{ \
uint8_t * pdst_m = ( uint8_t * ) ( pdst ) ; \
uint32_t val_m = ( val ) ; \
\
__asm__ volatile ( \
" sw %[val_m], %[pdst_m] \n \t " \
\
: [ pdst_m ] " =m " ( * pdst_m ) \
: [ val_m ] " r " ( val_m ) \
) ; \
}
# define SH(val, pdst) *(uint16_t *)(pdst) = (val);
# define SW(val, pdst) *(uint32_t *)(pdst) = (val);
# define SD(val, pdst) *(uint64_t *)(pdst) = (val);
# define SD(val, pdst) \
{ \
uint8_t * pdst_m = ( uint8_t * ) ( pdst ) ; \
uint64_t val_m = ( val ) ; \
\
__asm__ volatile ( \
" sd %[val_m], %[pdst_m] \n \t " \
\
: [ pdst_m ] " =m " ( * pdst_m ) \
: [ val_m ] " r " ( val_m ) \
) ; \
}
# else // !(__mips_isa_rev >= 6)
# define LW(psrc) \
( { \
uint8_t * psrc_m = ( uint8_t * ) ( psrc ) ; \
uint32_t val_m ; \
\
__asm__ volatile ( \
" ulw %[val_m], %[psrc_m] \n \t " \
\
: [ val_m ] " =r " ( val_m ) \
: [ psrc_m ] " m " ( * psrc_m ) \
) ; \
\
val_m ; \
# define LH(psrc) \
( { \
uint8_t * psrc_lh_m = ( uint8_t * ) ( psrc ) ; \
uint16_t val_lh_m ; \
\
__asm__ volatile ( \
" ulh %[val_lh_m], %[psrc_lh_m] \n \t " \
\
: [ val_lh_m ] " =r " ( val_lh_m ) \
: [ psrc_lh_m ] " m " ( * psrc_lh_m ) \
) ; \
\
val_lh_m ; \
} )
# define LW(psrc) \
( { \
uint8_t * psrc_lw_m = ( uint8_t * ) ( psrc ) ; \
uint32_t val_lw_m ; \
\
__asm__ volatile ( \
" ulw %[val_lw_m], %[psrc_lw_m] \n \t " \
\
: [ val_lw_m ] " =r " ( val_lw_m ) \
: [ psrc_lw_m ] " m " ( * psrc_lw_m ) \
) ; \
\
val_lw_m ; \
} )
# if (__mips == 64)
# define LD(psrc) \
( { \
uint8_t * psrc_m = ( uint8_t * ) ( psrc ) ; \
uint64_t val_m = 0 ; \
\
__asm__ volatile ( \
" uld %[val_m], %[psrc_m] \n \t " \
\
: [ val_m ] " =r " ( val_m ) \
: [ psrc_m ] " m " ( * psrc_m ) \
) ; \
\
val_m ; \
# define LD(psrc) \
( { \
uint8_t * psrc_ld_m = ( uint8_t * ) ( psrc ) ; \
uint64_t val_ld_m = 0 ; \
\
__asm__ volatile ( \
" uld %[val_ld_m], %[psrc_ld_m] \n \t " \
\
: [ val_ld_m ] " =r " ( val_ld_m ) \
: [ psrc_ld_m ] " m " ( * psrc_ld_m ) \
) ; \
\
val_ld_m ; \
} )
# else // !(__mips == 64)
# define LD(psrc) \
( { \
uint8_t * psrc_ld_m = ( uint8_t * ) ( psrc ) ; \
uint32_t val0_m , val1_m ; \
uint64_t val_m = 0 ; \
\
val0_m = LW ( psrc_ld_m ) ; \
val1_m = LW ( psrc_ld_m + 4 ) ; \
\
val_m = ( uint64_t ) ( val1_m ) ; \
val_m = ( uint64_t ) ( ( val_m << 32 ) & 0xFFFFFFFF00000000 ) ; \
val_m = ( uint64_t ) ( val_m | ( uint64_t ) val0_m ) ; \
\
val_m ; \
# define LD(psrc) \
( { \
uint8_t * psrc_ld_m = ( uint8_t * ) ( psrc ) ; \
uint32_t val0_ld_m , val1_ld_m ; \
uint64_t val_ld_m = 0 ; \
\
val0_ld_m = LW ( psrc_ld_m ) ; \
val1_ld_m = LW ( psrc_ld_m + 4 ) ; \
\
val_ld_m = ( uint64_t ) ( val1_ld_m ) ; \
val_ld_m = ( uint64_t ) ( ( val_ld_m << 32 ) & 0xFFFFFFFF00000000 ) ; \
val_ld_m = ( uint64_t ) ( val_ld_m | ( uint64_t ) val0_ld_m ) ; \
\
val_ld_m ; \
} )
# endif // (__mips == 64)
# define SH(val, pdst) \
{ \
uint8_t * pdst_m = ( uint8_t * ) ( pdst ) ; \
uint16_t val_m = ( val ) ; \
\
__asm__ volatile ( \
" ush %[val_m], %[pdst_m] \n \t " \
\
: [ pdst_m ] " =m " ( * pdst_m ) \
: [ val_m ] " r " ( val_m ) \
) ; \
# define SH(val, pdst) \
{ \
uint8_t * pdst_sh_m = ( uint8_t * ) ( pdst ) ; \
uint16_t val_sh_m = ( val ) ; \
\
__asm__ volatile ( \
" ush %[val_sh_m], %[pdst_sh_m] \n \t " \
\
: [ pdst_sh_m ] " =m " ( * pdst_sh_m ) \
: [ val_sh_m ] " r " ( val_sh_m ) \
) ; \
}
# define SW(val, pdst) \
{ \
uint8_t * pdst_m = ( uint8_t * ) ( pdst ) ; \
uint32_t val_m = ( val ) ; \
\
__asm__ volatile ( \
" usw %[val_m], %[pdst_m] \n \t " \
\
: [ pdst_m ] " =m " ( * pdst_m ) \
: [ val_m ] " r " ( val_m ) \
) ; \
# define SW(val, pdst) \
{ \
uint8_t * pdst_sw_m = ( uint8_t * ) ( pdst ) ; \
uint32_t val_sw_m = ( val ) ; \
\
__asm__ volatile ( \
" usw %[val_sw_m], %[pdst_sw_m] \n \t " \
\
: [ pdst_sw_m ] " =m " ( * pdst_sw_m ) \
: [ val_sw_m ] " r " ( val_sw_m ) \
) ; \
}
# define SD(val, pdst) \
{ \
uint8_t * pdst_m1 = ( uint8_t * ) ( pdst ) ; \
uint32_t val0_m , val1_m ; \
\
val0_m = ( uint32_t ) ( ( val ) & 0x00000000FFFFFFFF ) ; \
val1_m = ( uint32_t ) ( ( ( val ) >> 32 ) & 0x00000000FFFFFFFF ) ; \
\
SW ( val0_m , pdst_m1 ) ; \
SW ( val1_m , pdst_m1 + 4 ) ; \
# define SD(val, pdst) \
{ \
uint8_t * pdst_sd_m = ( uint8_t * ) ( pdst ) ; \
uint32_t val0_sd_m , val1_sd_m ; \
\
val0_sd_m = ( uint32_t ) ( ( val ) & 0x00000000FFFFFFFF ) ; \
val1_sd_m = ( uint32_t ) ( ( ( val ) >> 32 ) & 0x00000000FFFFFFFF ) ; \
\
SW ( val0_sd_m , pdst_sd_m ) ; \
SW ( val1_sd_m , pdst_sd_m + 4 ) ; \
}
# endif // (__mips_isa_rev >= 6)
@@ -291,122 +252,91 @@
SD ( in3 , ( pdst ) + 3 * stride ) ; \
}
/* Description : Load vectors with 16 byte elements with stride
/* Description : Load vector elements with stride
Arguments : Inputs - psrc ( source pointer to load from )
- stride
Outputs - out0 , out1
Return Type - as per RTYPE
Details : Loads 16 byte elements in ' out0 ' from ( psrc )
Loads 16 byte elements in ' out1 ' from ( psrc + stride )
Details : Loads elements in ' out0 ' from ( psrc )
Loads elements in ' out1 ' from ( psrc + stride )
*/
# define LD_B2(RTYPE, psrc, stride, out0, out1) \
# define LD_V2(RTYPE, psrc, stride, out0, out1) \
{ \
out0 = LD_B ( RTYPE , ( psrc ) ) ; \
out1 = LD_B ( RTYPE , ( psrc ) + stride ) ; \
out0 = LD_V ( RTYPE , ( psrc ) ) ; \
out1 = LD_V ( RTYPE , ( psrc ) + stride ) ; \
}
# define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
# define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
# define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__)
# define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__)
# define LD_UH2(...) LD_V2(v8u16, __VA_ARGS__)
# define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__)
# define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__)
# define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \
# define LD_V3(RTYPE, psrc, stride, out0, out1, out2) \
{ \
LD_B2 ( RTYPE , ( psrc ) , stride , out0 , out1 ) ; \
out2 = LD_B ( RTYPE , ( psrc ) + 2 * stride ) ; \
LD_V2 ( RTYPE , ( psrc ) , stride , out0 , out1 ) ; \
out2 = LD_V ( RTYPE , ( psrc ) + 2 * stride ) ; \
}
# define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
# define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)
# define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__)
# define LD_SB3(...) LD_V3(v16i8, __VA_ARGS__)
# define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
# define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3) \
{ \
LD_B2 ( RTYPE , ( psrc ) , stride , out0 , out1 ) ; \
LD_B2 ( RTYPE , ( psrc ) + 2 * stride , stride , out2 , out3 ) ; \
LD_V2 ( RTYPE , ( psrc ) , stride , out0 , out1 ) ; \
LD_V2 ( RTYPE , ( psrc ) + 2 * stride , stride , out2 , out3 ) ; \
}
# define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
# define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
# define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__)
# define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__)
# define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__)
# define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__)
# define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
# define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
{ \
LD_B4 ( RTYPE , ( psrc ) , stride , out0 , out1 , out2 , out3 ) ; \
out4 = LD_B ( RTYPE , ( psrc ) + 4 * stride ) ; \
LD_V4 ( RTYPE , ( psrc ) , stride , out0 , out1 , out2 , out3 ) ; \
out4 = LD_V ( RTYPE , ( psrc ) + 4 * stride ) ; \
}
# define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
# define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
# define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__)
# define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__)
# define LD_B6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5) \
# define LD_V6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5) \
{ \
LD_B4 ( RTYPE , ( psrc ) , stride , out0 , out1 , out2 , out3 ) ; \
LD_B2 ( RTYPE , ( psrc ) + 4 * stride , stride , out4 , out5 ) ; \
LD_V4 ( RTYPE , ( psrc ) , stride , out0 , out1 , out2 , out3 ) ; \
LD_V2 ( RTYPE , ( psrc ) + 4 * stride , stride , out4 , out5 ) ; \
}
# define LD_UB6(...) LD_B6(v16u8, __VA_ARGS__)
# define LD_SB6(...) LD_B6(v16i8, __VA_ARGS__)
# define LD_UB6(...) LD_V6(v16u8, __VA_ARGS__)
# define LD_SB6(...) LD_V6(v16i8, __VA_ARGS__)
# define LD_UH6(...) LD_V6(v8u16, __VA_ARGS__)
# define LD_SH6(...) LD_V6(v8i16, __VA_ARGS__)
# define LD_B7(RTYPE, psrc, stride, \
# define LD_V7(RTYPE, psrc, stride, \
out0 , out1 , out2 , out3 , out4 , out5 , out6 ) \
{ \
LD_B5 ( RTYPE , ( psrc ) , stride , out0 , out1 , out2 , out3 , out4 ) ; \
LD_B2 ( RTYPE , ( psrc ) + 5 * stride , stride , out5 , out6 ) ; \
LD_V5 ( RTYPE , ( psrc ) , stride , out0 , out1 , out2 , out3 , out4 ) ; \
LD_V2 ( RTYPE , ( psrc ) + 5 * stride , stride , out5 , out6 ) ; \
}
# define LD_UB7(...) LD_B7(v16u8, __VA_ARGS__)
# define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
# define LD_UB7(...) LD_V7(v16u8, __VA_ARGS__)
# define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__)
# define LD_B8(RTYPE, psrc, stride, \
# define LD_V8(RTYPE, psrc, stride, \
out0 , out1 , out2 , out3 , out4 , out5 , out6 , out7 ) \
{ \
LD_B4 ( RTYPE , ( psrc ) , stride , out0 , out1 , out2 , out3 ) ; \
LD_B4 ( RTYPE , ( psrc ) + 4 * stride , stride , out4 , out5 , out6 , out7 ) ; \
}
# define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
# define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
/* Description : Load vectors with 8 halfword elements with stride
Arguments : Inputs - psrc ( source pointer to load from )
- stride
Outputs - out0 , out1
Details : Loads 8 halfword elements in ' out0 ' from ( psrc )
Loads 8 halfword elements in ' out1 ' from ( psrc + stride )
*/
# define LD_H2(RTYPE, psrc, stride, out0, out1) \
{ \
out0 = LD_H ( RTYPE , ( psrc ) ) ; \
out1 = LD_H ( RTYPE , ( psrc ) + ( stride ) ) ; \
}
# define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)
# define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
# define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \
{ \
LD_H2 ( RTYPE , ( psrc ) , stride , out0 , out1 ) ; \
LD_H2 ( RTYPE , ( psrc ) + 2 * stride , stride , out2 , out3 ) ; \
}
# define LD_UH4(...) LD_H4(v8u16, __VA_ARGS__)
# define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
# define LD_H6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5) \
{ \
LD_H4 ( RTYPE , ( psrc ) , stride , out0 , out1 , out2 , out3 ) ; \
LD_H2 ( RTYPE , ( psrc ) + 4 * stride , stride , out4 , out5 ) ; \
LD_V4 ( RTYPE , ( psrc ) , stride , out0 , out1 , out2 , out3 ) ; \
LD_V4 ( RTYPE , ( psrc ) + 4 * stride , stride , out4 , out5 , out6 , out7 ) ; \
}
# define LD_UH6(...) LD_H6(v8u16, __VA_ARGS__)
# define LD_SH6(...) LD_H6(v8i16, __VA_ARGS__)
# define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__)
# define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__)
# define LD_UH8(...) LD_V8(v8u16, __VA_ARGS__)
# define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__)
# define LD_H8(RTYPE, psrc, stride, \
out0 , out1 , out2 , out3 , out4 , out5 , out6 , out7 ) \
{ \
LD_H4 ( RTYPE , ( psrc ) , stride , out0 , out1 , out2 , out3 ) ; \
LD_H4 ( RTYPE , ( psrc ) + 4 * stride , stride , out4 , out5 , out6 , out7 ) ; \
}
# define LD_UH8(...) LD_H8(v8u16, __VA_ARGS__)
# define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
# define LD_H16(RTYPE, psrc, stride, \
# define LD_V16(RTYPE, psrc, stride, \
out0 , out1 , out2 , out3 , out4 , out5 , out6 , out7 , \
out8 , out9 , out10 , out11 , out12 , out13 , out14 , out15 ) \
{ \
LD_H8 ( RTYPE , ( psrc ) , stride , \
LD_V8 ( RTYPE , ( psrc ) , stride , \
out0 , out1 , out2 , out3 , out4 , out5 , out6 , out7 ) ; \
LD_H8 ( RTYPE , ( psrc ) + 8 * stride , stride , \
LD_V8 ( RTYPE , ( psrc ) + 8 * stride , stride , \
out8 , out9 , out10 , out11 , out12 , out13 , out14 , out15 ) ; \
}
# define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
# define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__)
/* Description : Load as 4x4 block of signed halfword elements from 1D source
data into 4 vectors ( Each vector with 4 signed halfwords )
@@ -421,103 +351,48 @@
out3 = ( v8i16 ) __msa_ilvl_d ( ( v2i64 ) out2 , ( v2i64 ) out2 ) ; \
}
/* Description : Load 2 vectors of signed word elements with stride
Arguments : Inputs - psrc ( source pointer to load from )
- stride
Outputs - out0 , out1
Return Type - signed word
*/
# define LD_SW2(psrc, stride, out0, out1) \
{ \
out0 = LD_SW ( ( psrc ) ) ; \
out1 = LD_SW ( ( psrc ) + stride ) ; \
}
/* Description : Store vectors of 16 byte elements with stride
Arguments : Inputs - in0 , in1 , stride
Outputs - pdst ( destination pointer to store to )
Details : Stores 16 byte elements from ' in0 ' to ( pdst )
Stores 16 byte elements from ' in1 ' to ( pdst + stride )
*/
# define ST_B2(RTYPE, in0, in1, pdst, stride) \
{ \
ST_B ( RTYPE , in0 , ( pdst ) ) ; \
ST_B ( RTYPE , in1 , ( pdst ) + stride ) ; \
}
# define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
# define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)
# define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \
{ \
ST_B2 ( RTYPE , in0 , in1 , ( pdst ) , stride ) ; \
ST_B2 ( RTYPE , in2 , in3 , ( pdst ) + 2 * stride , stride ) ; \
}
# define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
# define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)
# define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
pdst , stride ) \
{ \
ST_B4 ( RTYPE , in0 , in1 , in2 , in3 , pdst , stride ) ; \
ST_B4 ( RTYPE , in4 , in5 , in6 , in7 , ( pdst ) + 4 * stride , stride ) ; \
}
# define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
/* Description : Store vectors of 8 halfword elements with stride
/* Description : Store vectors with stride
Arguments : Inputs - in0 , in1 , stride
Outputs - pdst ( destination pointer to store to )
Details : Stores 8 halfword elements from ' in0 ' to ( pdst )
Stores 8 halfword elements from ' in1 ' to ( pdst + stride )
Details : Stores elements from ' in0 ' to ( pdst )
Stores elements from ' in1 ' to ( pdst + stride )
*/
# define ST_H2(RTYPE, in0, in1, pdst, stride) \
# define ST_V2(RTYPE, in0, in1, pdst, stride) \
{ \
ST_H ( RTYPE , in0 , ( pdst ) ) ; \
ST_H ( RTYPE , in1 , ( pdst ) + stride ) ; \
ST_V ( RTYPE , in0 , ( pdst ) ) ; \
ST_V ( RTYPE , in1 , ( pdst ) + stride ) ; \
}
# define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
# define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
# define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__)
# define ST_SB2(...) ST_V2(v16i8, __VA_ARGS__)
# define ST_UH2(...) ST_V2(v8u16, __VA_ARGS__)
# define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__)
# define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__)
# define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \
# define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride) \
{ \
ST_H2 ( RTYPE , in0 , in1 , ( pdst ) , stride ) ; \
ST_H2 ( RTYPE , in2 , in3 , ( pdst ) + 2 * stride , stride ) ; \
ST_V2 ( RTYPE , in0 , in1 , ( pdst ) , stride ) ; \
ST_V2 ( RTYPE , in2 , in3 , ( pdst ) + 2 * stride , stride ) ; \
}
# define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
# define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__)
# define ST_SB4(...) ST_V4(v16i8, __VA_ARGS__)
# define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__)
# define ST_SW4(...) ST_V4(v4i32, __VA_ARGS__)
# define ST_H6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride) \
# define ST_V6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride) \
{ \
ST_H4 ( RTYPE , in0 , in1 , in2 , in3 , ( pdst ) , stride ) ; \
ST_H2 ( RTYPE , in4 , in5 , ( pdst ) + 4 * stride , stride ) ; \
ST_V4 ( RTYPE , in0 , in1 , in2 , in3 , ( pdst ) , stride ) ; \
ST_V2 ( RTYPE , in4 , in5 , ( pdst ) + 4 * stride , stride ) ; \
}
# define ST_SH6(...) ST_H6(v8i16, __VA_ARGS__)
# define ST_SH6(...) ST_V6(v8i16, __VA_ARGS__)
# define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
# define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
{ \
ST_H4 ( RTYPE , in0 , in1 , in2 , in3 , ( pdst ) , stride ) ; \
ST_H4 ( RTYPE , in4 , in5 , in6 , in7 , ( pdst ) + 4 * stride , stride ) ; \
}
# define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
/* Description : Store vectors of word elements with stride
Arguments : Inputs - in0 , in1 , stride
Outputs - pdst ( destination pointer to store to )
Return Type - signed word
Details : Stores 4 word elements from ' in0 ' to ( pdst )
Stores 4 word elements from ' in1 ' to ( pdst + stride )
*/
# define ST_SW2(in0, in1, pdst, stride) \
{ \
ST_SW ( in0 , ( pdst ) ) ; \
ST_SW ( in1 , ( pdst ) + stride ) ; \
}
# define ST_SW8(in0, in1, in2, in3, in4, in5, in6, in7, \
pdst , stride ) \
{ \
ST_SW2 ( in0 , in1 , ( pdst ) , stride ) ; \
ST_SW2 ( in2 , in3 , ( pdst ) + 2 * stride , stride ) ; \
ST_SW2 ( in4 , in5 , ( pdst ) + 4 * stride , stride ) ; \
ST_SW2 ( in6 , in7 , ( pdst ) + 6 * stride , stride ) ; \
ST_V4 ( RTYPE , in0 , in1 , in2 , in3 , ( pdst ) , stride ) ; \
ST_V4 ( RTYPE , in4 , in5 , in6 , in7 , ( pdst ) + 4 * stride , stride ) ; \
}
# define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__)
# define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__)
# define ST_SW8(...) ST_V8(v4i32, __VA_ARGS__)
/* Description : Store as 2x4 byte block to destination memory from input vector
Arguments : Inputs - in , stidx , pdst , stride
@@ -776,7 +651,7 @@
/* Description : average with rounding (in0 + in1 + 1) / 2.
Arguments : Inputs - in0 , in1 , in2 , in3 ,
Outputs - out0 , out1
Return Type - signed byte
Return Type - as per RTYPE
Details : Each byte element from ' in0 ' vector is added with each byte
element from ' in1 ' vector . The addition of the elements plus 1
( for rounding ) is done unsigned with full precision ,
@@ -941,7 +816,7 @@
Arguments : Inputs - mult0 , mult1
cnst0 , cnst1
Outputs - out0 , out1
Return Type - unsigned halfword
Return Type - as per RTYPE
Details : Unsigned byte elements from mult0 are multiplied with
unsigned byte elements from cnst0 producing a result
twice the size of input i . e . unsigned halfword .
@@ -969,7 +844,7 @@
Arguments : Inputs - mult0 , mult1
cnst0 , cnst1
Outputs - out0 , out1
Return Type - signed halfword
Return Type - as per RTYPE
Details : Signed byte elements from mult0 are multiplied with
signed byte elements from cnst0 producing a result
twice the size of input i . e . signed halfword .
@@ -1004,7 +879,7 @@
Arguments : Inputs - mult0 , mult1
cnst0 , cnst1
Outputs - out0 , out1
Return Type - signed word
Return Type - as per RTYPE
Details : Signed halfword elements from mult0 are multiplied with
signed halfword elements from cnst0 producing a result
twice the size of input i . e . signed word .
@@ -1032,7 +907,7 @@
Arguments : Inputs - mult0 , mult1
cnst0 , cnst1
Outputs - out0 , out1
Return Type - signed halfword
Return Type - as per RTYPE
Details : Signed byte elements from mult0 are multiplied with
signed byte elements from cnst0 producing a result
twice the size of input i . e . signed halfword .
@@ -1061,7 +936,7 @@
Arguments : Inputs - mult0 , mult1
cnst0 , cnst1
Outputs - out0 , out1
Return Type - unsigned halfword
Return Type - as per RTYPE
Details : Unsigned byte elements from mult0 are multiplied with
unsigned byte elements from cnst0 producing a result
twice the size of input i . e . unsigned halfword .
@@ -1082,7 +957,7 @@
Arguments : Inputs - mult0 , mult1
cnst0 , cnst1
Outputs - out0 , out1
Return Type - signed word
Return Type - as per RTYPE
Details : Signed halfword elements from mult0 are multiplied with
signed halfword elements from cnst0 producing a result
twice the size of input i . e . signed word .
@@ -1111,7 +986,7 @@
either vector are copied to the output vector
Arguments : Inputs - in0 , in1 , min_vec
Outputs - in0 , in1 , ( in place )
Return Type - unsigned halfword
Return Type - as per RTYPE
Details : Minimum of unsigned halfword element values from ' in0 ' and
' min_value ' are written to output vector ' in0 '
*/
@@ -1202,7 +1077,7 @@
\
res0_m = __msa_hadd_s_d ( ( v4i32 ) in , ( v4i32 ) in ) ; \
res1_m = __msa_splati_d ( res0_m , 1 ) ; \
res0_m = res0_m + res1_m ; \
res0_m += res1_m ; \
sum_m = __msa_copy_s_w ( ( v4i32 ) res0_m , 0 ) ; \
sum_m ; \
} )
@@ -1223,7 +1098,7 @@
res_m = __msa_hadd_u_w ( ( v8u16 ) in , ( v8u16 ) in ) ; \
res0_m = __msa_hadd_u_d ( res_m , res_m ) ; \
res1_m = ( v2u64 ) __msa_splati_d ( ( v2i64 ) res0_m , 1 ) ; \
res0_m = res0_m + res1_m ; \
res0_m += res1_m ; \
sum_m = __msa_copy_u_w ( ( v4i32 ) res0_m , 0 ) ; \
sum_m ; \
} )
@@ -1573,7 +1448,7 @@
/* Description : Interleave right half of halfword elements from vectors
Arguments : Inputs - in0 , in1 , in2 , in3 , in4 , in5 , in6 , in7
Outputs - out0 , out1 , out2 , out3
Return Type - signed halfword
Return Type - as per RTYPE
Details : Right half of halfword elements of in0 and right half of
halfword elements of in1 are interleaved and copied to out0 .
Right half of halfword elements of in2 and right half of
@@ -1625,16 +1500,16 @@
/* Description : Interleave right half of double word elements from vectors
Arguments : Inputs - in0 , in1 , in2 , in3 , in4 , in5 , in6 , in7
Outputs - out0 , out1 , out2 , out3
Return Type - unsigned double word
Return Type - as per RTYPE
Details : Right half of double word elements of in0 and right half of
double word elements of in1 are interleaved and copied to out0 .
Right half of double word elements of in2 and right half of
double word elements of in3 are interleaved and copied to out1 .
*/
# define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
out0 = ( RTYPE ) __msa_ilvr_d ( ( v2i64 ) ( in0 ) , ( v2i64 ) ( in1 ) ) ; \
out1 = ( RTYPE ) __msa_ilvr_d ( ( v2i64 ) ( in2 ) , ( v2i64 ) ( in3 ) ) ; \
# define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
out0 = ( RTYPE ) __msa_ilvr_d ( ( v2i64 ) in0 , ( v2i64 ) in1 ) ; \
out1 = ( RTYPE ) __msa_ilvr_d ( ( v2i64 ) in2 , ( v2i64 ) in3 ) ; \
}
# define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
# define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
@@ -1643,7 +1518,7 @@
# define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
{ \
ILVR_D2 ( RTYPE , in0 , in1 , in2 , in3 , out0 , out1 ) ; \
out2 = ( RTYPE ) __msa_ilvr_d ( ( v2i64 ) ( in4 ) , ( v2i64 ) ( in5 ) ) ; \
out2 = ( RTYPE ) __msa_ilvr_d ( ( v2i64 ) in4 , ( v2i64 ) in5 ) ; \
}
# define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
@@ -1698,14 +1573,14 @@
5 - bit signed immediate value are copied to the output vector
Arguments : Inputs - in0 , in1 , in2 , in3 , max_val
Outputs - in0 , in1 , in2 , in3 ( in place )
Return Type - unsigned halfword
Return Type - as per RTYPE
Details : Maximum of signed halfword element values from ' in0 ' and
' max_val ' are written to output vector ' in0 '
*/
# define MAXI_SH2(RTYPE, in0, in1, max_val) \
{ \
in0 = ( RTYPE ) __msa_maxi_s_h ( ( v8i16 ) in0 , ( max_val ) ) ; \
in1 = ( RTYPE ) __msa_maxi_s_h ( ( v8i16 ) in1 , ( max_val ) ) ; \
# define MAXI_SH2(RTYPE, in0, in1, max_val) \
{ \
in0 = ( RTYPE ) __msa_maxi_s_h ( ( v8i16 ) in0 , max_val ) ; \
in1 = ( RTYPE ) __msa_maxi_s_h ( ( v8i16 ) in1 , max_val ) ; \
}
# define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__)
# define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)
@@ -1722,7 +1597,7 @@
The element data width remains unchanged
Arguments : Inputs - in0 , in1 , in2 , in3 , sat_val
Outputs - in0 , in1 , in2 , in3 ( in place )
Return Type - unsigned halfword
Return Type - as per RTYPE
Details : Each unsigned halfword element from ' in0 ' is saturated to the
value generated with ( sat_val + 1 ) bit range
Results are in placed to original vectors
@@ -1738,7 +1613,7 @@
# define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
{ \
SAT_UH2 ( RTYPE , in0 , in1 , sat_val ) ; \
SAT_UH2 ( RTYPE , in2 , in3 , sat_val ) \
SAT_UH2 ( RTYPE , in2 , in3 , sat_val ) ; \
}
# define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
@@ -1747,7 +1622,7 @@
The element data width remains unchanged
Arguments : Inputs - in0 , in1 , in2 , in3 , sat_val
Outputs - in0 , in1 , in2 , in3 ( in place )
Return Type - unsigned halfword
Return Type - as per RTYPE
Details : Each unsigned halfword element from ' in0 ' is saturated to the
value generated with ( sat_val + 1 ) bit range
Results are in placed to original vectors
@@ -1761,7 +1636,7 @@
# define SAT_SH3(RTYPE, in0, in1, in2, sat_val) \
{ \
SAT_SH2 ( RTYPE , in0 , in1 , sat_val ) \
SAT_SH2 ( RTYPE , in0 , in1 , sat_val ) ; \
in2 = ( RTYPE ) __msa_sat_s_h ( ( v8i16 ) in2 , sat_val ) ; \
}
# define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)
@@ -1778,7 +1653,7 @@
The element data width remains unchanged
Arguments : Inputs - in0 , in1 , in2 , in3 , sat_val
Outputs - in0 , in1 , in2 , in3 ( in place )
Return Type - unsigned word
Return Type - as per RTYPE
Details : Each unsigned word element from ' in0 ' is saturated to the
value generated with ( sat_val + 1 ) bit range
Results are in placed to original vectors
@@ -1930,7 +1805,7 @@
/* Description : Pack even double word elements of vector pairs
Arguments : Inputs - in0 , in1 , in2 , in3
Outputs - out0 , out1
Return Type - unsigned byte
Return Type - as per RTYPE
Details : Even double elements of in0 are copied to the left half of
out0 & even double elements of in1 are copied to the right
half of out0 .
@@ -2100,7 +1975,7 @@
/* Description : Shift right logical all halfword elements of vector
Arguments : Inputs - in0 , in1 , in2 , in3 , shift
Outputs - in0 , in1 , in2 , in3 ( in place )
Return Type - unsigned halfword
Return Type - as per RTYPE
Details : Each element of vector ' in0 ' is shifted right logical by
number of bits respective element holds in vector ' shift ' and
result is in place written to ' in0 '
@@ -2119,7 +1994,7 @@
/* Description : Shift right arithmetic rounded halfwords
Arguments : Inputs - in0 , in1 , shift
Outputs - in0 , in1 , ( in place )
Return Type - unsigned halfword
Return Type - as per RTYPE
Details : Each element of vector ' in0 ' is shifted right arithmetic by
number of bits respective element holds in vector ' shift ' .
The last discarded bit is added to shifted value for rounding
@@ -2445,7 +2320,7 @@
/* Description : Transposes input 8x4 byte block into 4x8
Arguments : Inputs - in0 , in1 , in2 , in3 ( input 8 x4 byte block )
Outputs - out0 , out1 , out2 , out3 ( output 4 x8 byte block )
Return Type - unsigned byte
Return Type - as per RTYPE
Details :
*/
# define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
@@ -2472,7 +2347,7 @@
( input 8 x8 byte block )
Outputs - out0 , out1 , out2 , out3 , out4 , out5 , out6 , out7
( output 8 x8 byte block )
Return Type - unsigned byte
Return Type - as per RTYPE
Details :
*/
# define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
@@ -2596,7 +2471,7 @@
/* Description : Transposes 8x8 block with half word elements in vectors
Arguments : Inputs - in0 , in1 , in2 , in3 , in4 , in5 , in6 , in7
Outputs - out0 , out1 , out2 , out3 , out4 , out5 , out6 , out7
Return Type - signed halfword
Return Type - as per RTYPE
Details :
*/
# define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
@@ -2646,8 +2521,6 @@
/* Description : Average byte elements from pair of vectors and store 8x4 byte
block in destination memory
Arguments : Inputs - in0 , in1 , in2 , in3 , in4 , in5 , in6 , in7 , pdst , stride
Outputs -
Return Type -
Details : Each byte element from input vector pair ' in0 ' and ' in1 ' are
averaged ( a + b ) / 2 and stored in ' tmp0_m '
Each byte element from input vector pair ' in2 ' and ' in3 ' are
@@ -2679,8 +2552,6 @@
/* Description : Average byte elements from pair of vectors and store 16x4 byte
block in destination memory
Arguments : Inputs - in0 , in1 , in2 , in3 , in4 , in5 , in6 , in7 , pdst , stride
Outputs -
Return Type -
Details : Each byte element from input vector pair ' in0 ' and ' in1 ' are
averaged ( a + b ) / 2 and stored in ' tmp0_m '
Each byte element from input vector pair ' in2 ' and ' in3 ' are
@@ -2707,8 +2578,6 @@
/* Description : Average rounded byte elements from pair of vectors and store
8 x4 byte block in destination memory
Arguments : Inputs - in0 , in1 , in2 , in3 , in4 , in5 , in6 , in7 , pdst , stride
Outputs -
Return Type -
Details : Each byte element from input vector pair ' in0 ' and ' in1 ' are
average rounded ( a + b + 1 ) / 2 and stored in ' tmp0_m '
Each byte element from input vector pair ' in2 ' and ' in3 ' are
@@ -2738,8 +2607,6 @@
/* Description : Average rounded byte elements from pair of vectors and store
16 x4 byte block in destination memory
Arguments : Inputs - in0 , in1 , in2 , in3 , in4 , in5 , in6 , in7 , pdst , stride
Outputs -
Return Type -
Details : Each byte element from input vector pair ' in0 ' and ' in1 ' are
average rounded ( a + b + 1 ) / 2 and stored in ' tmp0_m '
Each byte element from input vector pair ' in2 ' and ' in3 ' are
@@ -2764,8 +2631,6 @@
average rounded with destination and store 8 x4 byte block
in destination memory
Arguments : Inputs - in0 , in1 , in2 , in3 , in4 , in5 , in6 , in7 , pdst , stride
Outputs -
Return Type -
Details : Each byte element from input vector pair ' in0 ' and ' in1 ' are
average rounded ( a + b + 1 ) / 2 and stored in ' tmp0_m '
Each byte element from input vector pair ' in2 ' and ' in3 ' are
@@ -2794,8 +2659,6 @@
average rounded with destination and store 16 x4 byte block
in destination memory
Arguments : Inputs - in0 , in1 , in2 , in3 , in4 , in5 , in6 , in7 , pdst , stride
Outputs -
Return Type -
Details : Each byte element from input vector pair ' in0 ' and ' in1 ' are
average rounded ( a + b + 1 ) / 2 and stored in ' tmp0_m '
Each byte element from input vector pair ' in2 ' and ' in3 ' are
@@ -2822,8 +2685,6 @@
/* Description : Add block 4x4
Arguments : Inputs - in0 , in1 , in2 , in3 , pdst , stride
Outputs -
Return Type - unsigned bytes
Details : Least significant 4 bytes from each input vector are added to
the destination bytes , clipped between 0 - 255 and then stored .
*/