diff --git a/libavcodec/vp9dsp_template.c b/libavcodec/vp9dsp_template.c
index da3cc28e5e..9e5b25142d 100644
--- a/libavcodec/vp9dsp_template.c
+++ b/libavcodec/vp9dsp_template.c
@@ -49,14 +49,22 @@ static void vert_8x8_c(uint8_t *restrict _dst, ptrdiff_t stride,
 {
     pixel *dst = (pixel *) _dst;
     const pixel *top = (const pixel *) _top;
+#if BIT_DEPTH == 8
+    uint64_t p8 = AV_RN64A(top);
+#else
     pixel4 p4a = AV_RN4PA(top + 0);
     pixel4 p4b = AV_RN4PA(top + 4);
+#endif
     int y;
 
     stride /= sizeof(pixel);
     for (y = 0; y < 8; y++) {
+#if BIT_DEPTH == 8
+        AV_WN64A(dst, p8);
+#else
         AV_WN4PA(dst + 0, p4a);
         AV_WN4PA(dst + 4, p4b);
+#endif
         dst += stride;
     }
 }
@@ -66,18 +74,28 @@ static void vert_16x16_c(uint8_t *restrict _dst, ptrdiff_t stride,
 {
     pixel *dst = (pixel *) _dst;
     const pixel *top = (const pixel *) _top;
+#if BIT_DEPTH == 8
+    uint64_t p8a = AV_RN64A(top);
+    uint64_t p8b = AV_RN64A(top + 8);
+#else
     pixel4 p4a = AV_RN4PA(top + 0);
     pixel4 p4b = AV_RN4PA(top + 4);
     pixel4 p4c = AV_RN4PA(top + 8);
     pixel4 p4d = AV_RN4PA(top + 12);
+#endif
     int y;
 
     stride /= sizeof(pixel);
     for (y = 0; y < 16; y++) {
+#if BIT_DEPTH == 8
+        AV_WN64A(dst + 0, p8a);
+        AV_WN64A(dst + 8, p8b);
+#else
         AV_WN4PA(dst + 0, p4a);
         AV_WN4PA(dst + 4, p4b);
         AV_WN4PA(dst + 8, p4c);
         AV_WN4PA(dst + 12, p4d);
+#endif
         dst += stride;
     }
 }
@@ -87,6 +105,12 @@ static void vert_32x32_c(uint8_t *restrict _dst, ptrdiff_t stride,
 {
     pixel *dst = (pixel *) _dst;
     const pixel *top = (const pixel *) _top;
+#if BIT_DEPTH == 8
+    uint64_t p8a = AV_RN64A(top);
+    uint64_t p8b = AV_RN64A(top + 8);
+    uint64_t p8c = AV_RN64A(top + 16);
+    uint64_t p8d = AV_RN64A(top + 24);
+#else
     pixel4 p4a = AV_RN4PA(top + 0);
     pixel4 p4b = AV_RN4PA(top + 4);
     pixel4 p4c = AV_RN4PA(top + 8);
@@ -95,10 +119,17 @@ static void vert_32x32_c(uint8_t *restrict _dst, ptrdiff_t stride,
     pixel4 p4f = AV_RN4PA(top + 20);
     pixel4 p4g = AV_RN4PA(top + 24);
     pixel4 p4h = AV_RN4PA(top + 28);
+#endif
     int y;
 
     stride /= sizeof(pixel);
     for (y = 0; y < 32; y++) {
+#if BIT_DEPTH == 8
+        AV_WN64A(dst + 0, p8a);
+        AV_WN64A(dst + 8, p8b);
+        AV_WN64A(dst + 16, p8c);
+        AV_WN64A(dst + 24, p8d);
+#else
         AV_WN4PA(dst + 0, p4a);
         AV_WN4PA(dst + 4, p4b);
         AV_WN4PA(dst + 8, p4c);
@@ -107,6 +138,7 @@ static void vert_32x32_c(uint8_t *restrict _dst, ptrdiff_t stride,
         AV_WN4PA(dst + 20, p4f);
         AV_WN4PA(dst + 24, p4g);
         AV_WN4PA(dst + 28, p4h);
+#endif
         dst += stride;
     }
 }
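
For context only, not part of the patch: a minimal standalone sketch of the idea, using a hypothetical function name and plain memcpy in place of FFmpeg's AV_RN64A/AV_WN64A aligned-access macros. At BIT_DEPTH == 8 a whole 8-pixel row of the top predictor fits in one 64-bit word, so each destination row becomes a single 64-bit store instead of two 32-bit AV_WN4PA stores; the 16x16 and 32x32 cases apply the same idea with two and four 64-bit words per row.

/* Hypothetical illustration, not FFmpeg code. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void vert_8x8_8bit_sketch(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *top)
{
    uint64_t row;
    memcpy(&row, top, sizeof(row));      /* load the 8 predictor pixels once */
    for (int y = 0; y < 8; y++) {
        memcpy(dst, &row, sizeof(row));  /* one 64-bit store per output row */
        dst += stride;
    }
}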