3rdparty: update libwebp 0.6.1

pull/10274/head
Alexander Alekhin 7 years ago
parent 5c709f4aaa
commit cd4176104d
  1. 3rdparty/libwebp/CMakeLists.txt (15 lines changed)
  2. 3rdparty/libwebp/dec/Makefile.am (28 lines changed)
  3. 3rdparty/libwebp/demux/Makefile.am (14 lines changed)
  4. 3rdparty/libwebp/demux/libwebpdemux.pc.in (11 lines changed)
  5. 3rdparty/libwebp/demux/libwebpdemux.rc (41 lines changed)
  6. 3rdparty/libwebp/dsp/Makefile.am (153 lines changed)
  7. 3rdparty/libwebp/enc/Makefile.am (42 lines changed)
  8. 3rdparty/libwebp/enc/backward_references_enc.c (1800 lines changed)
  9. 3rdparty/libwebp/mux/Makefile.am (19 lines changed)
  10. 3rdparty/libwebp/mux/libwebpmux.pc.in (12 lines changed)
  11. 3rdparty/libwebp/mux/libwebpmux.rc (41 lines changed)
  12. 3rdparty/libwebp/src/dec/alpha_dec.c (14 lines changed)
  13. 3rdparty/libwebp/src/dec/alphai_dec.h (10 lines changed)
  14. 3rdparty/libwebp/src/dec/buffer_dec.c (51 lines changed)
  15. 3rdparty/libwebp/src/dec/common_dec.h (6 lines changed)
  16. 3rdparty/libwebp/src/dec/frame_dec.c (16 lines changed)
  17. 3rdparty/libwebp/src/dec/idec_dec.c (14 lines changed)
  18. 3rdparty/libwebp/src/dec/io_dec.c (24 lines changed)
  19. 3rdparty/libwebp/src/dec/quant_dec.c (2 lines changed)
  20. 3rdparty/libwebp/src/dec/tree_dec.c (18 lines changed)
  21. 3rdparty/libwebp/src/dec/vp8_dec.c (12 lines changed)
  22. 3rdparty/libwebp/src/dec/vp8_dec.h (18 lines changed)
  23. 3rdparty/libwebp/src/dec/vp8i_dec.h (21 lines changed)
  24. 3rdparty/libwebp/src/dec/vp8l_dec.c (54 lines changed)
  25. 3rdparty/libwebp/src/dec/vp8li_dec.h (14 lines changed)
  26. 3rdparty/libwebp/src/dec/webp_dec.c (14 lines changed)
  27. 3rdparty/libwebp/src/dec/webpi_dec.h (10 lines changed)
  28. 3rdparty/libwebp/src/demux/anim_decode.c (8 lines changed)
  29. 3rdparty/libwebp/src/demux/demux.c (24 lines changed)
  30. 3rdparty/libwebp/src/dsp/alpha_processing.c (114 lines changed)
  31. 3rdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c (55 lines changed)
  32. 3rdparty/libwebp/src/dsp/alpha_processing_neon.c (4 lines changed)
  33. 3rdparty/libwebp/src/dsp/alpha_processing_sse2.c (88 lines changed)
  34. 3rdparty/libwebp/src/dsp/alpha_processing_sse41.c (10 lines changed)
  35. 3rdparty/libwebp/src/dsp/argb.c (0 lines changed)
  36. 3rdparty/libwebp/src/dsp/argb_mips_dsp_r2.c (0 lines changed)
  37. 3rdparty/libwebp/src/dsp/argb_sse2.c (3 lines changed)
  38. 3rdparty/libwebp/src/dsp/common_sse2.h (0 lines changed)
  39. 3rdparty/libwebp/src/dsp/cost.c (14 lines changed)
  40. 3rdparty/libwebp/src/dsp/cost_mips32.c (14 lines changed)
  41. 3rdparty/libwebp/src/dsp/cost_mips_dsp_r2.c (8 lines changed)
  42. 3rdparty/libwebp/src/dsp/cost_sse2.c (18 lines changed)
  43. 3rdparty/libwebp/src/dsp/cpu.c (4 lines changed)
  44. 3rdparty/libwebp/src/dsp/dec.c (401 lines changed)
  45. 3rdparty/libwebp/src/dsp/dec_clip_tables.c (13 lines changed)
  46. 3rdparty/libwebp/src/dsp/dec_mips32.c (4 lines changed)
  47. 3rdparty/libwebp/src/dsp/dec_mips_dsp_r2.c (4 lines changed)
  48. 3rdparty/libwebp/src/dsp/dec_msa.c (15 lines changed)
  49. 3rdparty/libwebp/src/dsp/dec_neon.c (727 lines changed)
  50. 3rdparty/libwebp/src/dsp/dec_sse2.c (464 lines changed)
  51. 3rdparty/libwebp/src/dsp/dec_sse41.c (10 lines changed)
  52. 3rdparty/libwebp/src/dsp/dsp.h (96 lines changed)
  53. 3rdparty/libwebp/src/dsp/enc.c (292 lines changed)
  54. 3rdparty/libwebp/src/dsp/enc_avx2.c (2 lines changed)
  55. 3rdparty/libwebp/src/dsp/enc_mips32.c (77 lines changed)
  56. 3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c (99 lines changed)
  57. 3rdparty/libwebp/src/dsp/enc_msa.c (122 lines changed)
  58. 3rdparty/libwebp/src/dsp/enc_neon.c (198 lines changed)
  59. 3rdparty/libwebp/src/dsp/enc_sse2.c (477 lines changed)
  60. 3rdparty/libwebp/src/dsp/enc_sse41.c (66 lines changed)
  61. 3rdparty/libwebp/src/dsp/filters.c (139 lines changed)
  62. 3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c (97 lines changed)
  63. 3rdparty/libwebp/src/dsp/filters_msa.c (22 lines changed)
  64. 3rdparty/libwebp/src/dsp/filters_neon.c (18 lines changed)
  65. 3rdparty/libwebp/src/dsp/filters_sse2.c (115 lines changed)
  66. 3rdparty/libwebp/src/dsp/lossless.c (199 lines changed)
  67. 3rdparty/libwebp/src/dsp/lossless.h (12 lines changed)
  68. 3rdparty/libwebp/src/dsp/lossless_common.h (12 lines changed)
  69. 3rdparty/libwebp/src/dsp/lossless_enc.c (134 lines changed)
  70. 3rdparty/libwebp/src/dsp/lossless_enc_mips32.c (59 lines changed)
  71. 3rdparty/libwebp/src/dsp/lossless_enc_mips_dsp_r2.c (40 lines changed)
  72. 3rdparty/libwebp/src/dsp/lossless_enc_msa.c (17 lines changed)
  73. 3rdparty/libwebp/src/dsp/lossless_enc_neon.c (29 lines changed)
  74. 3rdparty/libwebp/src/dsp/lossless_enc_sse2.c (93 lines changed)
  75. 3rdparty/libwebp/src/dsp/lossless_enc_sse41.c (9 lines changed)
  76. 3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c (109 lines changed)
  77. 3rdparty/libwebp/src/dsp/lossless_msa.c (41 lines changed)
  78. 3rdparty/libwebp/src/dsp/lossless_neon.c (63 lines changed)
  79. 3rdparty/libwebp/src/dsp/lossless_sse2.c (271 lines changed)
  80. 3rdparty/libwebp/src/dsp/mips_macro.h (0 lines changed)
  81. 3rdparty/libwebp/src/dsp/msa_macro.h (2 lines changed)
  82. 3rdparty/libwebp/src/dsp/neon.h (11 lines changed)
  83. 3rdparty/libwebp/src/dsp/rescaler.c (43 lines changed)
  84. 3rdparty/libwebp/src/dsp/rescaler_mips32.c (24 lines changed)
  85. 3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c (14 lines changed)
  86. 3rdparty/libwebp/src/dsp/rescaler_msa.c (16 lines changed)
  87. 3rdparty/libwebp/src/dsp/rescaler_neon.c (26 lines changed)
  88. 3rdparty/libwebp/src/dsp/rescaler_sse2.c (90 lines changed)
  89. 3rdparty/libwebp/src/dsp/ssim.c (166 lines changed)
  90. 3rdparty/libwebp/src/dsp/ssim_sse2.c (165 lines changed)
  91. 3rdparty/libwebp/src/dsp/upsampling.c (161 lines changed)
  92. 3rdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c (47 lines changed)
  93. 3rdparty/libwebp/src/dsp/upsampling_msa.c (50 lines changed)
  94. 3rdparty/libwebp/src/dsp/upsampling_neon.c (52 lines changed)
  95. 3rdparty/libwebp/src/dsp/upsampling_sse2.c (116 lines changed)
  96. 3rdparty/libwebp/src/dsp/yuv.c (98 lines changed)
  97. 3rdparty/libwebp/src/dsp/yuv.h (75 lines changed)
  98. 3rdparty/libwebp/src/dsp/yuv_mips32.c (20 lines changed)
  99. 3rdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c (20 lines changed)
  100. 3rdparty/libwebp/src/dsp/yuv_neon.c (288 lines changed)
  101. Some files were not shown because too many files have changed in this diff.

@ -9,8 +9,8 @@ if(ANDROID)
ocv_include_directories(${CPUFEATURES_INCLUDE_DIRS})
endif()
file(GLOB lib_srcs dec/*.c demux/*.c dsp/*.c enc/*.c mux/*.c utils/*.c webp/*.c)
file(GLOB lib_hdrs dec/*.h demux/*.h dsp/*.h enc/*.h mux/*.h utils/*.h webp/*.h)
file(GLOB lib_srcs src/dec/*.c src/demux/*.c src/dsp/*.c src/enc/*.c src/mux/*.c src/utils/*.c src/webp/*.c)
file(GLOB lib_hdrs src/dec/*.h src/demux/*.h src/dsp/*.h src/enc/*.h src/mux/*.h src/utils/*.h src/webp/*.h)
# FIXIT
if(ANDROID AND ARMEABI_V7A AND NOT NEON)
@ -34,13 +34,10 @@ if(ANDROID)
target_link_libraries(${WEBP_LIBRARY} ${CPUFEATURES_LIBRARIES})
endif()
if(UNIX)
if(CMAKE_COMPILER_IS_GNUCXX OR CV_ICC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
endif()
endif()
ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-variable -Wunused-function -Wshadow -Wmaybe-uninitialized)
ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-variable -Wunused-function -Wshadow -Wmaybe-uninitialized
-Wmissing-prototypes # clang
-Wmissing-declarations # gcc
)
ocv_warnings_disable(CMAKE_C_FLAGS /wd4244 /wd4267) # vs2005
set_target_properties(${WEBP_LIBRARY}

@ -1,28 +0,0 @@
noinst_LTLIBRARIES = libwebpdecode.la
libwebpdecode_la_SOURCES =
libwebpdecode_la_SOURCES += alpha_dec.c
libwebpdecode_la_SOURCES += alphai_dec.h
libwebpdecode_la_SOURCES += buffer_dec.c
libwebpdecode_la_SOURCES += common_dec.h
libwebpdecode_la_SOURCES += vp8_dec.h
libwebpdecode_la_SOURCES += frame_dec.c
libwebpdecode_la_SOURCES += idec_dec.c
libwebpdecode_la_SOURCES += io_dec.c
libwebpdecode_la_SOURCES += quant_dec.c
libwebpdecode_la_SOURCES += tree_dec.c
libwebpdecode_la_SOURCES += vp8_dec.c
libwebpdecode_la_SOURCES += vp8i_dec.h
libwebpdecode_la_SOURCES += vp8l_dec.c
libwebpdecode_la_SOURCES += vp8li_dec.h
libwebpdecode_la_SOURCES += webp_dec.c
libwebpdecode_la_SOURCES += webpi_dec.h
libwebpdecodeinclude_HEADERS =
libwebpdecodeinclude_HEADERS += ../webp/decode.h
libwebpdecodeinclude_HEADERS += ../webp/types.h
noinst_HEADERS =
noinst_HEADERS += ../webp/format_constants.h
libwebpdecode_la_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
libwebpdecodeincludedir = $(includedir)/webp

@ -1,14 +0,0 @@
lib_LTLIBRARIES = libwebpdemux.la
libwebpdemux_la_SOURCES =
libwebpdemux_la_SOURCES += anim_decode.c demux.c
libwebpdemuxinclude_HEADERS =
libwebpdemuxinclude_HEADERS += ../webp/demux.h
libwebpdemuxinclude_HEADERS += ../webp/mux_types.h
libwebpdemuxinclude_HEADERS += ../webp/types.h
libwebpdemux_la_LIBADD = ../libwebp.la
libwebpdemux_la_LDFLAGS = -no-undefined -version-info 2:2:0
libwebpdemuxincludedir = $(includedir)/webp
pkgconfig_DATA = libwebpdemux.pc

@ -1,11 +0,0 @@
prefix=@prefix@
exec_prefix=@exec_prefix@
libdir=@libdir@
includedir=@includedir@
Name: libwebpdemux
Description: Library for parsing the WebP graphics format container
Version: @PACKAGE_VERSION@
Requires: libwebp >= 0.2.0
Cflags: -I${includedir}
Libs: -L${libdir} -lwebpdemux

@ -1,41 +0,0 @@
#define APSTUDIO_READONLY_SYMBOLS
#include "winres.h"
#undef APSTUDIO_READONLY_SYMBOLS
#if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_ENU)
LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US
VS_VERSION_INFO VERSIONINFO
FILEVERSION 0,3,0,2
PRODUCTVERSION 0,3,0,2
FILEFLAGSMASK 0x3fL
#ifdef _DEBUG
FILEFLAGS 0x1L
#else
FILEFLAGS 0x0L
#endif
FILEOS 0x40004L
FILETYPE 0x2L
FILESUBTYPE 0x0L
BEGIN
BLOCK "StringFileInfo"
BEGIN
BLOCK "040904b0"
BEGIN
VALUE "CompanyName", "Google, Inc."
VALUE "FileDescription", "libwebpdemux DLL"
VALUE "FileVersion", "0.3.2"
VALUE "InternalName", "libwebpdemux.dll"
VALUE "LegalCopyright", "Copyright (C) 2017"
VALUE "OriginalFilename", "libwebpdemux.dll"
VALUE "ProductName", "WebP Image Demuxer"
VALUE "ProductVersion", "0.3.2"
END
END
BLOCK "VarFileInfo"
BEGIN
VALUE "Translation", 0x409, 1200
END
END
#endif // English (United States) resources

@ -1,153 +0,0 @@
noinst_LTLIBRARIES = libwebpdsp.la libwebpdsp_avx2.la
noinst_LTLIBRARIES += libwebpdsp_sse2.la libwebpdspdecode_sse2.la
noinst_LTLIBRARIES += libwebpdsp_sse41.la libwebpdspdecode_sse41.la
noinst_LTLIBRARIES += libwebpdsp_neon.la libwebpdspdecode_neon.la
noinst_LTLIBRARIES += libwebpdsp_msa.la libwebpdspdecode_msa.la
if BUILD_LIBWEBPDECODER
noinst_LTLIBRARIES += libwebpdspdecode.la
endif
common_HEADERS = ../webp/types.h
commondir = $(includedir)/webp
COMMON_SOURCES =
COMMON_SOURCES += alpha_processing.c
COMMON_SOURCES += alpha_processing_mips_dsp_r2.c
COMMON_SOURCES += common_sse2.h
COMMON_SOURCES += cpu.c
COMMON_SOURCES += dec.c
COMMON_SOURCES += dec_clip_tables.c
COMMON_SOURCES += dec_mips32.c
COMMON_SOURCES += dec_mips_dsp_r2.c
COMMON_SOURCES += dsp.h
COMMON_SOURCES += filters.c
COMMON_SOURCES += filters_mips_dsp_r2.c
COMMON_SOURCES += lossless.c
COMMON_SOURCES += lossless.h
COMMON_SOURCES += lossless_common.h
COMMON_SOURCES += lossless_mips_dsp_r2.c
COMMON_SOURCES += mips_macro.h
COMMON_SOURCES += rescaler.c
COMMON_SOURCES += rescaler_mips32.c
COMMON_SOURCES += rescaler_mips_dsp_r2.c
COMMON_SOURCES += upsampling.c
COMMON_SOURCES += upsampling_mips_dsp_r2.c
COMMON_SOURCES += yuv.c
COMMON_SOURCES += yuv.h
COMMON_SOURCES += yuv_mips32.c
COMMON_SOURCES += yuv_mips_dsp_r2.c
ENC_SOURCES =
ENC_SOURCES += argb.c
ENC_SOURCES += argb_mips_dsp_r2.c
ENC_SOURCES += cost.c
ENC_SOURCES += cost_mips32.c
ENC_SOURCES += cost_mips_dsp_r2.c
ENC_SOURCES += enc.c
ENC_SOURCES += enc_mips32.c
ENC_SOURCES += enc_mips_dsp_r2.c
ENC_SOURCES += lossless_enc.c
ENC_SOURCES += lossless_enc_mips32.c
ENC_SOURCES += lossless_enc_mips_dsp_r2.c
libwebpdsp_avx2_la_SOURCES =
libwebpdsp_avx2_la_SOURCES += enc_avx2.c
libwebpdsp_avx2_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
libwebpdsp_avx2_la_CFLAGS = $(AM_CFLAGS) $(AVX2_FLAGS)
libwebpdspdecode_sse41_la_SOURCES =
libwebpdspdecode_sse41_la_SOURCES += alpha_processing_sse41.c
libwebpdspdecode_sse41_la_SOURCES += dec_sse41.c
libwebpdspdecode_sse41_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
libwebpdspdecode_sse41_la_CFLAGS = $(AM_CFLAGS) $(SSE41_FLAGS)
libwebpdspdecode_sse2_la_SOURCES =
libwebpdspdecode_sse2_la_SOURCES += alpha_processing_sse2.c
libwebpdspdecode_sse2_la_SOURCES += dec_sse2.c
libwebpdspdecode_sse2_la_SOURCES += filters_sse2.c
libwebpdspdecode_sse2_la_SOURCES += lossless_sse2.c
libwebpdspdecode_sse2_la_SOURCES += rescaler_sse2.c
libwebpdspdecode_sse2_la_SOURCES += upsampling_sse2.c
libwebpdspdecode_sse2_la_SOURCES += yuv_sse2.c
libwebpdspdecode_sse2_la_CPPFLAGS = $(libwebpdsp_sse2_la_CPPFLAGS)
libwebpdspdecode_sse2_la_CFLAGS = $(libwebpdsp_sse2_la_CFLAGS)
libwebpdspdecode_neon_la_SOURCES =
libwebpdspdecode_neon_la_SOURCES += alpha_processing_neon.c
libwebpdspdecode_neon_la_SOURCES += dec_neon.c
libwebpdspdecode_neon_la_SOURCES += filters_neon.c
libwebpdspdecode_neon_la_SOURCES += lossless_neon.c
libwebpdspdecode_neon_la_SOURCES += neon.h
libwebpdspdecode_neon_la_SOURCES += rescaler_neon.c
libwebpdspdecode_neon_la_SOURCES += upsampling_neon.c
libwebpdspdecode_neon_la_CPPFLAGS = $(libwebpdsp_neon_la_CPPFLAGS)
libwebpdspdecode_neon_la_CFLAGS = $(libwebpdsp_neon_la_CFLAGS)
libwebpdspdecode_msa_la_SOURCES =
libwebpdspdecode_msa_la_SOURCES += dec_msa.c
libwebpdspdecode_msa_la_SOURCES += filters_msa.c
libwebpdspdecode_msa_la_SOURCES += lossless_msa.c
libwebpdspdecode_msa_la_SOURCES += msa_macro.h
libwebpdspdecode_msa_la_SOURCES += rescaler_msa.c
libwebpdspdecode_msa_la_SOURCES += upsampling_msa.c
libwebpdspdecode_msa_la_CPPFLAGS = $(libwebpdsp_msa_la_CPPFLAGS)
libwebpdspdecode_msa_la_CFLAGS = $(libwebpdsp_msa_la_CFLAGS)
libwebpdsp_sse2_la_SOURCES =
libwebpdsp_sse2_la_SOURCES += argb_sse2.c
libwebpdsp_sse2_la_SOURCES += cost_sse2.c
libwebpdsp_sse2_la_SOURCES += enc_sse2.c
libwebpdsp_sse2_la_SOURCES += lossless_enc_sse2.c
libwebpdsp_sse2_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
libwebpdsp_sse2_la_CFLAGS = $(AM_CFLAGS) $(SSE2_FLAGS)
libwebpdsp_sse2_la_LIBADD = libwebpdspdecode_sse2.la
libwebpdsp_sse41_la_SOURCES =
libwebpdsp_sse41_la_SOURCES += enc_sse41.c
libwebpdsp_sse41_la_SOURCES += lossless_enc_sse41.c
libwebpdsp_sse41_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
libwebpdsp_sse41_la_CFLAGS = $(AM_CFLAGS) $(SSE41_FLAGS)
libwebpdsp_sse41_la_LIBADD = libwebpdspdecode_sse41.la
libwebpdsp_neon_la_SOURCES =
libwebpdsp_neon_la_SOURCES += enc_neon.c
libwebpdsp_neon_la_SOURCES += lossless_enc_neon.c
libwebpdsp_neon_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
libwebpdsp_neon_la_CFLAGS = $(AM_CFLAGS) $(NEON_FLAGS)
libwebpdsp_neon_la_LIBADD = libwebpdspdecode_neon.la
libwebpdsp_msa_la_SOURCES =
libwebpdsp_msa_la_SOURCES += enc_msa.c
libwebpdsp_msa_la_SOURCES += lossless_enc_msa.c
libwebpdsp_msa_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
libwebpdsp_msa_la_CFLAGS = $(AM_CFLAGS)
libwebpdsp_msa_la_LIBADD = libwebpdspdecode_msa.la
libwebpdsp_la_SOURCES = $(COMMON_SOURCES) $(ENC_SOURCES)
noinst_HEADERS =
noinst_HEADERS += ../dec/vp8_dec.h
noinst_HEADERS += ../webp/decode.h
libwebpdsp_la_CPPFLAGS =
libwebpdsp_la_CPPFLAGS += $(AM_CPPFLAGS)
libwebpdsp_la_CPPFLAGS += $(USE_EXPERIMENTAL_CODE) $(USE_SWAP_16BIT_CSP)
libwebpdsp_la_LDFLAGS = -lm
libwebpdsp_la_LIBADD =
libwebpdsp_la_LIBADD += libwebpdsp_avx2.la libwebpdsp_sse2.la
libwebpdsp_la_LIBADD += libwebpdsp_sse41.la
libwebpdsp_la_LIBADD += libwebpdsp_neon.la
libwebpdsp_la_LIBADD += libwebpdsp_msa.la
if BUILD_LIBWEBPDECODER
libwebpdspdecode_la_SOURCES = $(COMMON_SOURCES)
libwebpdspdecode_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
libwebpdspdecode_la_LDFLAGS = $(libwebpdsp_la_LDFLAGS)
libwebpdspdecode_la_LIBADD =
libwebpdspdecode_la_LIBADD += libwebpdspdecode_sse2.la
libwebpdspdecode_la_LIBADD += libwebpdspdecode_sse41.la
libwebpdspdecode_la_LIBADD += libwebpdspdecode_neon.la
libwebpdspdecode_la_LIBADD += libwebpdspdecode_msa.la
endif

@ -1,42 +0,0 @@
noinst_LTLIBRARIES = libwebpencode.la
libwebpencode_la_SOURCES =
libwebpencode_la_SOURCES += alpha_enc.c
libwebpencode_la_SOURCES += analysis_enc.c
libwebpencode_la_SOURCES += backward_references_enc.c
libwebpencode_la_SOURCES += backward_references_enc.h
libwebpencode_la_SOURCES += config_enc.c
libwebpencode_la_SOURCES += cost_enc.c
libwebpencode_la_SOURCES += cost_enc.h
libwebpencode_la_SOURCES += delta_palettization_enc.c
libwebpencode_la_SOURCES += delta_palettization_enc.h
libwebpencode_la_SOURCES += filter_enc.c
libwebpencode_la_SOURCES += frame_enc.c
libwebpencode_la_SOURCES += histogram_enc.c
libwebpencode_la_SOURCES += histogram_enc.h
libwebpencode_la_SOURCES += iterator_enc.c
libwebpencode_la_SOURCES += near_lossless_enc.c
libwebpencode_la_SOURCES += picture_enc.c
libwebpencode_la_SOURCES += picture_csp_enc.c
libwebpencode_la_SOURCES += picture_psnr_enc.c
libwebpencode_la_SOURCES += picture_rescale_enc.c
libwebpencode_la_SOURCES += picture_tools_enc.c
libwebpencode_la_SOURCES += predictor_enc.c
libwebpencode_la_SOURCES += quant_enc.c
libwebpencode_la_SOURCES += syntax_enc.c
libwebpencode_la_SOURCES += token_enc.c
libwebpencode_la_SOURCES += tree_enc.c
libwebpencode_la_SOURCES += vp8i_enc.h
libwebpencode_la_SOURCES += vp8l_enc.c
libwebpencode_la_SOURCES += vp8li_enc.h
libwebpencode_la_SOURCES += webp_enc.c
libwebpencodeinclude_HEADERS =
libwebpencodeinclude_HEADERS += ../webp/encode.h
libwebpencodeinclude_HEADERS += ../webp/types.h
noinst_HEADERS =
noinst_HEADERS += ../webp/format_constants.h
libwebpencode_la_LDFLAGS = -lm
libwebpencode_la_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
libwebpencodeincludedir = $(includedir)/webp

File diff suppressed because it is too large.

@ -1,19 +0,0 @@
lib_LTLIBRARIES = libwebpmux.la
libwebpmux_la_SOURCES =
libwebpmux_la_SOURCES += anim_encode.c
libwebpmux_la_SOURCES += animi.h
libwebpmux_la_SOURCES += muxedit.c
libwebpmux_la_SOURCES += muxi.h
libwebpmux_la_SOURCES += muxinternal.c
libwebpmux_la_SOURCES += muxread.c
libwebpmuxinclude_HEADERS =
libwebpmuxinclude_HEADERS += ../webp/mux.h
libwebpmuxinclude_HEADERS += ../webp/mux_types.h
libwebpmuxinclude_HEADERS += ../webp/types.h
libwebpmux_la_LIBADD = ../libwebp.la
libwebpmux_la_LDFLAGS = -no-undefined -version-info 3:0:0 -lm
libwebpmuxincludedir = $(includedir)/webp
pkgconfig_DATA = libwebpmux.pc

@ -1,12 +0,0 @@
prefix=@prefix@
exec_prefix=@exec_prefix@
libdir=@libdir@
includedir=@includedir@
Name: libwebpmux
Description: Library for manipulating the WebP graphics format container
Version: @PACKAGE_VERSION@
Requires: libwebp >= 0.2.0
Cflags: -I${includedir}
Libs: -L${libdir} -lwebpmux
Libs.private: -lm

@ -1,41 +0,0 @@
#define APSTUDIO_READONLY_SYMBOLS
#include "winres.h"
#undef APSTUDIO_READONLY_SYMBOLS
#if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_ENU)
LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US
VS_VERSION_INFO VERSIONINFO
FILEVERSION 0,4,0,0
PRODUCTVERSION 0,4,0,0
FILEFLAGSMASK 0x3fL
#ifdef _DEBUG
FILEFLAGS 0x1L
#else
FILEFLAGS 0x0L
#endif
FILEOS 0x40004L
FILETYPE 0x2L
FILESUBTYPE 0x0L
BEGIN
BLOCK "StringFileInfo"
BEGIN
BLOCK "040904b0"
BEGIN
VALUE "CompanyName", "Google, Inc."
VALUE "FileDescription", "libwebpmux DLL"
VALUE "FileVersion", "0.4.0"
VALUE "InternalName", "libwebpmux.dll"
VALUE "LegalCopyright", "Copyright (C) 2017"
VALUE "OriginalFilename", "libwebpmux.dll"
VALUE "ProductName", "WebP Image Muxer"
VALUE "ProductVersion", "0.4.0"
END
END
BLOCK "VarFileInfo"
BEGIN
VALUE "Translation", 0x409, 1200
END
END
#endif // English (United States) resources

@ -12,13 +12,13 @@
// Author: Skal (pascal.massimino@gmail.com)
#include <stdlib.h>
#include "./alphai_dec.h"
#include "./vp8i_dec.h"
#include "./vp8li_dec.h"
#include "../dsp/dsp.h"
#include "../utils/quant_levels_dec_utils.h"
#include "../utils/utils.h"
#include "../webp/format_constants.h"
#include "src/dec/alphai_dec.h"
#include "src/dec/vp8i_dec.h"
#include "src/dec/vp8li_dec.h"
#include "src/dsp/dsp.h"
#include "src/utils/quant_levels_dec_utils.h"
#include "src/utils/utils.h"
#include "src/webp/format_constants.h"
//------------------------------------------------------------------------------
// ALPHDecoder object.

@ -11,11 +11,11 @@
//
// Author: Urvang (urvang@google.com)
#ifndef WEBP_DEC_ALPHAI_H_
#define WEBP_DEC_ALPHAI_H_
#ifndef WEBP_DEC_ALPHAI_DEC_H_
#define WEBP_DEC_ALPHAI_DEC_H_
#include "./webpi_dec.h"
#include "../utils/filters_utils.h"
#include "src/dec/webpi_dec.h"
#include "src/utils/filters_utils.h"
#ifdef __cplusplus
extern "C" {
@ -51,4 +51,4 @@ void WebPDeallocateAlphaMemory(VP8Decoder* const dec);
} // extern "C"
#endif
#endif /* WEBP_DEC_ALPHAI_H_ */
#endif /* WEBP_DEC_ALPHAI_DEC_H_ */

@ -13,15 +13,15 @@
#include <stdlib.h>
#include "./vp8i_dec.h"
#include "./webpi_dec.h"
#include "../utils/utils.h"
#include "src/dec/vp8i_dec.h"
#include "src/dec/webpi_dec.h"
#include "src/utils/utils.h"
//------------------------------------------------------------------------------
// WebPDecBuffer
// Number of bytes per pixel for the different color-spaces.
static const int kModeBpp[MODE_LAST] = {
static const uint8_t kModeBpp[MODE_LAST] = {
3, 4, 3, 4, 4, 2, 2,
4, 4, 4, 2, // pre-multiplied modes
1, 1 };
@ -36,7 +36,7 @@ static int IsValidColorspace(int webp_csp_mode) {
// strictly speaking, the very last (or first, if flipped) row
// doesn't require padding.
#define MIN_BUFFER_SIZE(WIDTH, HEIGHT, STRIDE) \
(uint64_t)(STRIDE) * ((HEIGHT) - 1) + (WIDTH)
((uint64_t)(STRIDE) * ((HEIGHT) - 1) + (WIDTH))
static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
int ok = 1;
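The outer parentheses added to MIN_BUFFER_SIZE above protect the macro when it is expanded inside a larger expression; a minimal, hypothetical C illustration of the precedence hazard (not taken from the diff):
#include <stdint.h>
#include <stdio.h>
/* old form: no outer parentheses */
#define OLD_MIN_BUFFER_SIZE(W, H, S) (uint64_t)(S) * ((H) - 1) + (W)
/* new form: whole expression parenthesized */
#define NEW_MIN_BUFFER_SIZE(W, H, S) ((uint64_t)(S) * ((H) - 1) + (W))
int main(void) {
  /* hypothetical 100x10 buffer with a 128-byte stride, halved by the caller */
  printf("old: %llu\n", (unsigned long long)(OLD_MIN_BUFFER_SIZE(100, 10, 128) / 2));
  printf("new: %llu\n", (unsigned long long)(NEW_MIN_BUFFER_SIZE(100, 10, 128) / 2));
  /* old binds as 128*9 + (100/2) = 1202; new as (128*9 + 100)/2 = 626 */
  return 0;
}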
@ -98,9 +98,14 @@ static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
uint64_t uv_size = 0, a_size = 0, total_size;
// We need memory and it hasn't been allocated yet.
// => initialize output buffer, now that dimensions are known.
const int stride = w * kModeBpp[mode];
const uint64_t size = (uint64_t)stride * h;
int stride;
uint64_t size;
if ((uint64_t)w * kModeBpp[mode] >= (1ull << 32)) {
return VP8_STATUS_INVALID_PARAM;
}
stride = w * kModeBpp[mode];
size = (uint64_t)stride * h;
if (!WebPIsRGBMode(mode)) {
uv_stride = (w + 1) / 2;
uv_size = (uint64_t)uv_stride * ((h + 1) / 2);
@ -169,11 +174,11 @@ VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer) {
return VP8_STATUS_OK;
}
VP8StatusCode WebPAllocateDecBuffer(int w, int h,
VP8StatusCode WebPAllocateDecBuffer(int width, int height,
const WebPDecoderOptions* const options,
WebPDecBuffer* const out) {
WebPDecBuffer* const buffer) {
VP8StatusCode status;
if (out == NULL || w <= 0 || h <= 0) {
if (buffer == NULL || width <= 0 || height <= 0) {
return VP8_STATUS_INVALID_PARAM;
}
if (options != NULL) { // First, apply options if there is any.
@ -182,33 +187,39 @@ VP8StatusCode WebPAllocateDecBuffer(int w, int h,
const int ch = options->crop_height;
const int x = options->crop_left & ~1;
const int y = options->crop_top & ~1;
if (x < 0 || y < 0 || cw <= 0 || ch <= 0 || x + cw > w || y + ch > h) {
if (x < 0 || y < 0 || cw <= 0 || ch <= 0 ||
x + cw > width || y + ch > height) {
return VP8_STATUS_INVALID_PARAM; // out of frame boundary.
}
w = cw;
h = ch;
width = cw;
height = ch;
}
if (options->use_scaling) {
#if !defined(WEBP_REDUCE_SIZE)
int scaled_width = options->scaled_width;
int scaled_height = options->scaled_height;
if (!WebPRescalerGetScaledDimensions(
w, h, &scaled_width, &scaled_height)) {
width, height, &scaled_width, &scaled_height)) {
return VP8_STATUS_INVALID_PARAM;
}
w = scaled_width;
h = scaled_height;
width = scaled_width;
height = scaled_height;
#else
return VP8_STATUS_INVALID_PARAM; // rescaling not supported
#endif
}
}
out->width = w;
out->height = h;
buffer->width = width;
buffer->height = height;
// Then, allocate buffer for real.
status = AllocateBuffer(out);
status = AllocateBuffer(buffer);
if (status != VP8_STATUS_OK) return status;
// Use the stride trick if vertical flip is needed.
if (options != NULL && options->flip) {
status = WebPFlipBuffer(out);
status = WebPFlipBuffer(buffer);
}
return status;
}
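The new check in AllocateBuffer() above rejects widths whose byte stride would not fit in 32 bits before the value is stored in an int; a small, hypothetical illustration of the wrap-around that the guard prevents (values are made up, not from the diff):
#include <stdint.h>
#include <stdio.h>
int main(void) {
  const uint64_t w = 1100000000u;            /* hypothetical width in pixels */
  const uint64_t bpp = 4;                    /* e.g. kModeBpp[MODE_RGBA] */
  const uint64_t full = w * bpp;             /* 4400000000: needs more than 32 bits */
  const uint32_t wrapped = (uint32_t)full;   /* what a 32-bit stride would end up holding */
  printf("64-bit stride: %llu, wrapped 32-bit value: %u\n",
         (unsigned long long)full, wrapped);
  /* the diff returns VP8_STATUS_INVALID_PARAM when
     (uint64_t)w * kModeBpp[mode] >= (1ull << 32) */
  return 0;
}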

@ -11,8 +11,8 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#ifndef WEBP_DEC_COMMON_H_
#define WEBP_DEC_COMMON_H_
#ifndef WEBP_DEC_COMMON_DEC_H_
#define WEBP_DEC_COMMON_DEC_H_
// intra prediction modes
enum { B_DC_PRED = 0, // 4x4 modes
@ -51,4 +51,4 @@ enum { MB_FEATURE_TREE_PROBS = 3,
NUM_PROBAS = 11
};
#endif // WEBP_DEC_COMMON_H_
#endif // WEBP_DEC_COMMON_DEC_H_

@ -12,13 +12,13 @@
// Author: Skal (pascal.massimino@gmail.com)
#include <stdlib.h>
#include "./vp8i_dec.h"
#include "../utils/utils.h"
#include "src/dec/vp8i_dec.h"
#include "src/utils/utils.h"
//------------------------------------------------------------------------------
// Main reconstruction function.
static const int kScan[16] = {
static const uint16_t kScan[16] = {
0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
@ -320,7 +320,7 @@ static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
#define MIN_DITHER_AMP 4
#define DITHER_AMP_TAB_SIZE 12
static const int kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
static const uint8_t kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
// roughly, it's dqm->uv_mat_[1]
8, 7, 6, 4, 4, 2, 2, 2, 1, 1, 1, 1
};
@ -728,7 +728,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
}
mem = (uint8_t*)dec->mem_;
dec->intra_t_ = (uint8_t*)mem;
dec->intra_t_ = mem;
mem += intra_pred_mode_size;
dec->yuv_t_ = (VP8TopSamples*)mem;
@ -750,7 +750,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
mem = (uint8_t*)WEBP_ALIGN(mem);
assert((yuv_size & WEBP_ALIGN_CST) == 0);
dec->yuv_b_ = (uint8_t*)mem;
dec->yuv_b_ = mem;
mem += yuv_size;
dec->mb_data_ = (VP8MBData*)mem;
@ -766,7 +766,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
const int extra_rows = kFilterExtraRows[dec->filter_type_];
const int extra_y = extra_rows * dec->cache_y_stride_;
const int extra_uv = (extra_rows / 2) * dec->cache_uv_stride_;
dec->cache_y_ = ((uint8_t*)mem) + extra_y;
dec->cache_y_ = mem + extra_y;
dec->cache_u_ = dec->cache_y_
+ 16 * num_caches * dec->cache_y_stride_ + extra_uv;
dec->cache_v_ = dec->cache_u_
@ -776,7 +776,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
mem += cache_size;
// alpha plane
dec->alpha_plane_ = alpha_size ? (uint8_t*)mem : NULL;
dec->alpha_plane_ = alpha_size ? mem : NULL;
mem += alpha_size;
assert(mem <= (uint8_t*)dec->mem_ + dec->mem_size_);

@ -15,10 +15,10 @@
#include <string.h>
#include <stdlib.h>
#include "./alphai_dec.h"
#include "./webpi_dec.h"
#include "./vp8i_dec.h"
#include "../utils/utils.h"
#include "src/dec/alphai_dec.h"
#include "src/dec/webpi_dec.h"
#include "src/dec/vp8i_dec.h"
#include "src/utils/utils.h"
// In append mode, buffer allocations increase as multiples of this value.
// Needs to be a power of 2.
@ -673,12 +673,12 @@ void WebPIDelete(WebPIDecoder* idec) {
//------------------------------------------------------------------------------
// Wrapper toward WebPINewDecoder
WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE csp, uint8_t* output_buffer,
size_t output_buffer_size, int output_stride) {
const int is_external_memory = (output_buffer != NULL) ? 1 : 0;
WebPIDecoder* idec;
if (mode >= MODE_YUV) return NULL;
if (csp >= MODE_YUV) return NULL;
if (is_external_memory == 0) { // Overwrite parameters to sane values.
output_buffer_size = 0;
output_stride = 0;
@ -689,7 +689,7 @@ WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
}
idec = WebPINewDecoder(NULL);
if (idec == NULL) return NULL;
idec->output_.colorspace = mode;
idec->output_.colorspace = csp;
idec->output_.is_external_memory = is_external_memory;
idec->output_.u.RGBA.rgba = output_buffer;
idec->output_.u.RGBA.stride = output_stride;

@ -13,11 +13,11 @@
#include <assert.h>
#include <stdlib.h>
#include "../dec/vp8i_dec.h"
#include "./webpi_dec.h"
#include "../dsp/dsp.h"
#include "../dsp/yuv.h"
#include "../utils/utils.h"
#include "src/dec/vp8i_dec.h"
#include "src/dec/webpi_dec.h"
#include "src/dsp/dsp.h"
#include "src/dsp/yuv.h"
#include "src/utils/utils.h"
//------------------------------------------------------------------------------
// Main YUV<->RGB conversion functions
@ -212,7 +212,7 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p,
int num_rows;
const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
uint8_t* alpha_dst = base_rgba;
#else
uint8_t* alpha_dst = base_rgba + 1;
@ -241,6 +241,7 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p,
//------------------------------------------------------------------------------
// YUV rescaling (no final RGB conversion needed)
#if !defined(WEBP_REDUCE_SIZE)
static int Rescale(const uint8_t* src, int src_stride,
int new_lines, WebPRescaler* const wrk) {
int num_lines_out = 0;
@ -431,7 +432,7 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos,
int max_lines_out) {
const WebPRGBABuffer* const buf = &p->output->u.RGBA;
uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride;
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
uint8_t* alpha_dst = base_rgba;
#else
uint8_t* alpha_dst = base_rgba + 1;
@ -541,6 +542,8 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
return 1;
}
#endif // WEBP_REDUCE_SIZE
//------------------------------------------------------------------------------
// Default custom functions
@ -561,10 +564,14 @@ static int CustomSetup(VP8Io* io) {
WebPInitUpsamplers();
}
if (io->use_scaling) {
#if !defined(WEBP_REDUCE_SIZE)
const int ok = is_rgb ? InitRGBRescaler(io, p) : InitYUVRescaler(io, p);
if (!ok) {
return 0; // memory error
}
#else
return 0; // rescaling support not compiled
#endif
} else {
if (is_rgb) {
WebPInitSamplers();
@ -598,9 +605,6 @@ static int CustomSetup(VP8Io* io) {
}
}
if (is_rgb) {
VP8YUVInit();
}
return 1;
}

@ -11,7 +11,7 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./vp8i_dec.h"
#include "src/dec/vp8i_dec.h"
static WEBP_INLINE int clip(int v, int M) {
return v < 0 ? 0 : v > M ? M : v;

@ -11,15 +11,19 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./vp8i_dec.h"
#include "../utils/bit_reader_inl_utils.h"
#include "src/dec/vp8i_dec.h"
#include "src/utils/bit_reader_inl_utils.h"
#if !defined(USE_GENERIC_TREE)
#if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__)
// using a table is ~1-2% slower on ARM. Prefer the coded-tree approach then.
#define USE_GENERIC_TREE
#define USE_GENERIC_TREE 1 // ALTERNATE_CODE
#else
#define USE_GENERIC_TREE 0
#endif
#endif // USE_GENERIC_TREE
#ifdef USE_GENERIC_TREE
#if (USE_GENERIC_TREE == 1)
static const int8_t kYModesIntra4[18] = {
-B_DC_PRED, 1,
-B_TM_PRED, 2,
@ -317,7 +321,7 @@ static void ParseIntraMode(VP8BitReader* const br,
int x;
for (x = 0; x < 4; ++x) {
const uint8_t* const prob = kBModesProba[top[x]][ymode];
#ifdef USE_GENERIC_TREE
#if (USE_GENERIC_TREE == 1)
// Generic tree-parsing
int i = kYModesIntra4[VP8GetBit(br, prob[0])];
while (i > 0) {
@ -335,7 +339,7 @@ static void ParseIntraMode(VP8BitReader* const br,
(!VP8GetBit(br, prob[6]) ? B_LD_PRED :
(!VP8GetBit(br, prob[7]) ? B_VL_PRED :
(!VP8GetBit(br, prob[8]) ? B_HD_PRED : B_HU_PRED)));
#endif // USE_GENERIC_TREE
#endif // USE_GENERIC_TREE
top[x] = ymode;
}
memcpy(modes, top, 4 * sizeof(*top));
@ -498,7 +502,7 @@ static const uint8_t
// Paragraph 9.9
static const int kBands[16 + 1] = {
static const uint8_t kBands[16 + 1] = {
0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
0 // extra entry as sentinel
};
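Several files in this update, tree_dec.c above included, convert feature guards from #ifdef to value checks such as #if (USE_GENERIC_TREE == 1), so that defining the macro to 0 really turns the code path off; a small, hypothetical illustration of the difference (not part of the diff):
#include <stdio.h>
#define FEATURE 0   /* explicitly disabled */
int main(void) {
#ifdef FEATURE
  puts("#ifdef FEATURE: still taken, because the macro is defined (even to 0)");
#endif
#if (FEATURE == 1)
  puts("#if (FEATURE == 1): taken");
#else
  puts("#if (FEATURE == 1): skipped, the value 0 really means off");
#endif
  return 0;
}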

@ -13,12 +13,12 @@
#include <stdlib.h>
#include "./alphai_dec.h"
#include "./vp8i_dec.h"
#include "./vp8li_dec.h"
#include "./webpi_dec.h"
#include "../utils/bit_reader_inl_utils.h"
#include "../utils/utils.h"
#include "src/dec/alphai_dec.h"
#include "src/dec/vp8i_dec.h"
#include "src/dec/vp8li_dec.h"
#include "src/dec/webpi_dec.h"
#include "src/utils/bit_reader_inl_utils.h"
#include "src/utils/utils.h"
//------------------------------------------------------------------------------

@ -11,10 +11,10 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#ifndef WEBP_WEBP_DECODE_VP8_H_
#define WEBP_WEBP_DECODE_VP8_H_
#ifndef WEBP_DEC_VP8_DEC_H_
#define WEBP_DEC_VP8_DEC_H_
#include "../webp/decode.h"
#include "src/webp/decode.h"
#ifdef __cplusplus
extern "C" {
@ -33,7 +33,7 @@ extern "C" {
// /* customize io's functions (setup()/put()/teardown()) if needed. */
//
// VP8Decoder* dec = VP8New();
// bool ok = VP8Decode(dec);
// int ok = VP8Decode(dec, &io);
// if (!ok) printf("Error: %s\n", VP8StatusMessage(dec));
// VP8Delete(dec);
// return ok;
@ -157,24 +157,24 @@ void VP8Delete(VP8Decoder* const dec);
// Miscellaneous VP8/VP8L bitstream probing functions.
// Returns true if the next 3 bytes in data contain the VP8 signature.
WEBP_EXTERN(int) VP8CheckSignature(const uint8_t* const data, size_t data_size);
WEBP_EXTERN int VP8CheckSignature(const uint8_t* const data, size_t data_size);
// Validates the VP8 data-header and retrieves basic header information viz
// width and height. Returns 0 in case of formatting error. *width/*height
// can be passed NULL.
WEBP_EXTERN(int) VP8GetInfo(
WEBP_EXTERN int VP8GetInfo(
const uint8_t* data,
size_t data_size, // data available so far
size_t chunk_size, // total data size expected in the chunk
int* const width, int* const height);
// Returns true if the next byte(s) in data is a VP8L signature.
WEBP_EXTERN(int) VP8LCheckSignature(const uint8_t* const data, size_t size);
WEBP_EXTERN int VP8LCheckSignature(const uint8_t* const data, size_t size);
// Validates the VP8L data-header and retrieves basic header information viz
// width, height and alpha. Returns 0 in case of formatting error.
// width/height/has_alpha can be passed NULL.
WEBP_EXTERN(int) VP8LGetInfo(
WEBP_EXTERN int VP8LGetInfo(
const uint8_t* data, size_t data_size, // data available so far
int* const width, int* const height, int* const has_alpha);
@ -182,4 +182,4 @@ WEBP_EXTERN(int) VP8LGetInfo(
} // extern "C"
#endif
#endif /* WEBP_WEBP_DECODE_VP8_H_ */
#endif /* WEBP_DEC_VP8_DEC_H_ */

@ -11,16 +11,16 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#ifndef WEBP_DEC_VP8I_H_
#define WEBP_DEC_VP8I_H_
#ifndef WEBP_DEC_VP8I_DEC_H_
#define WEBP_DEC_VP8I_DEC_H_
#include <string.h> // for memcpy()
#include "./common_dec.h"
#include "./vp8li_dec.h"
#include "../utils/bit_reader_utils.h"
#include "../utils/random_utils.h"
#include "../utils/thread_utils.h"
#include "../dsp/dsp.h"
#include "src/dec/common_dec.h"
#include "src/dec/vp8li_dec.h"
#include "src/utils/bit_reader_utils.h"
#include "src/utils/random_utils.h"
#include "src/utils/thread_utils.h"
#include "src/dsp/dsp.h"
#ifdef __cplusplus
extern "C" {
@ -32,7 +32,7 @@ extern "C" {
// version numbers
#define DEC_MAJ_VERSION 0
#define DEC_MIN_VERSION 6
#define DEC_REV_VERSION 0
#define DEC_REV_VERSION 1
// YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
// Constraints are: We need to store one 16x16 block of luma samples (y),
@ -57,7 +57,6 @@ extern "C" {
// '|' = left sample, '-' = top sample, '+' = top-left sample
// 't' = extra top-right sample for 4x4 modes
#define YUV_SIZE (BPS * 17 + BPS * 9)
#define Y_SIZE (BPS * 17)
#define Y_OFF (BPS * 1 + 8)
#define U_OFF (Y_OFF + BPS * 16 + BPS)
#define V_OFF (U_OFF + 16)
@ -317,4 +316,4 @@ const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
} // extern "C"
#endif
#endif /* WEBP_DEC_VP8I_H_ */
#endif /* WEBP_DEC_VP8I_DEC_H_ */

@ -14,22 +14,22 @@
#include <stdlib.h>
#include "./alphai_dec.h"
#include "./vp8li_dec.h"
#include "../dsp/dsp.h"
#include "../dsp/lossless.h"
#include "../dsp/lossless_common.h"
#include "../dsp/yuv.h"
#include "../utils/endian_inl_utils.h"
#include "../utils/huffman_utils.h"
#include "../utils/utils.h"
#include "src/dec/alphai_dec.h"
#include "src/dec/vp8li_dec.h"
#include "src/dsp/dsp.h"
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
#include "src/dsp/yuv.h"
#include "src/utils/endian_inl_utils.h"
#include "src/utils/huffman_utils.h"
#include "src/utils/utils.h"
#define NUM_ARGB_CACHE_ROWS 16
static const int kCodeLengthLiterals = 16;
static const int kCodeLengthRepeatCode = 16;
static const int kCodeLengthExtraBits[3] = { 2, 3, 7 };
static const int kCodeLengthRepeatOffsets[3] = { 3, 3, 11 };
static const uint8_t kCodeLengthExtraBits[3] = { 2, 3, 7 };
static const uint8_t kCodeLengthRepeatOffsets[3] = { 3, 3, 11 };
// -----------------------------------------------------------------------------
// Five Huffman codes are used at each meta code:
@ -86,7 +86,7 @@ static const uint8_t kCodeToPlane[CODE_TO_PLANE_CODES] = {
// All values computed for 8-bit first level lookup with Mark Adler's tool:
// http://www.hdfgroup.org/ftp/lib-external/zlib/zlib-1.2.5/examples/enough.c
#define FIXED_TABLE_SIZE (630 * 3 + 410)
static const int kTableSize[12] = {
static const uint16_t kTableSize[12] = {
FIXED_TABLE_SIZE + 654,
FIXED_TABLE_SIZE + 656,
FIXED_TABLE_SIZE + 658,
@ -485,6 +485,7 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
//------------------------------------------------------------------------------
// Scaling.
#if !defined(WEBP_REDUCE_SIZE)
static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
const int num_channels = 4;
const int in_width = io->mb_w;
@ -516,10 +517,13 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
out_width, out_height, 0, num_channels, work);
return 1;
}
#endif // WEBP_REDUCE_SIZE
//------------------------------------------------------------------------------
// Export to ARGB
#if !defined(WEBP_REDUCE_SIZE)
// We have special "export" function since we need to convert from BGRA
static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace,
int rgba_stride, uint8_t* const rgba) {
@ -561,6 +565,8 @@ static int EmitRescaledRowsRGBA(const VP8LDecoder* const dec,
return num_lines_out;
}
#endif // WEBP_REDUCE_SIZE
// Emit rows without any scaling.
static int EmitRows(WEBP_CSP_MODE colorspace,
const uint8_t* row_in, int in_stride,
@ -746,9 +752,12 @@ static void ProcessRows(VP8LDecoder* const dec, int row) {
if (WebPIsRGBMode(output->colorspace)) { // convert to RGBA
const WebPRGBABuffer* const buf = &output->u.RGBA;
uint8_t* const rgba = buf->rgba + dec->last_out_row_ * buf->stride;
const int num_rows_out = io->use_scaling ?
const int num_rows_out =
#if !defined(WEBP_REDUCE_SIZE)
io->use_scaling ?
EmitRescaledRowsRGBA(dec, rows_data, in_stride, io->mb_h,
rgba, buf->stride) :
#endif // WEBP_REDUCE_SIZE
EmitRows(output->colorspace, rows_data, in_stride,
io->mb_w, io->mb_h, rgba, buf->stride);
// Update 'last_out_row_'.
@ -1012,12 +1021,13 @@ static int DecodeAlphaData(VP8LDecoder* const dec, uint8_t* const data,
ok = 0;
goto End;
}
assert(br->eos_ == VP8LIsEndOfStream(br));
br->eos_ = VP8LIsEndOfStream(br);
}
// Process the remaining rows corresponding to last row-block.
ExtractPalettedAlphaRows(dec, row > last_row ? last_row : row);
End:
br->eos_ = VP8LIsEndOfStream(br);
if (!ok || (br->eos_ && pos < end)) {
ok = 0;
dec->status_ = br->eos_ ? VP8_STATUS_SUSPENDED
@ -1090,11 +1100,12 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
VP8LFillBitWindow(br);
if (htree_group->use_packed_table) {
code = ReadPackedSymbols(htree_group, br, src);
if (VP8LIsEndOfStream(br)) break;
if (code == PACKED_NON_LITERAL_CODE) goto AdvanceByOne;
} else {
code = ReadSymbol(htree_group->htrees[GREEN], br);
}
if (br->eos_) break; // early out
if (VP8LIsEndOfStream(br)) break;
if (code < NUM_LITERAL_CODES) { // Literal
if (htree_group->is_trivial_literal) {
*src = htree_group->literal_arb | (code << 8);
@ -1104,7 +1115,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
VP8LFillBitWindow(br);
blue = ReadSymbol(htree_group->htrees[BLUE], br);
alpha = ReadSymbol(htree_group->htrees[ALPHA], br);
if (br->eos_) break;
if (VP8LIsEndOfStream(br)) break;
*src = ((uint32_t)alpha << 24) | (red << 16) | (code << 8) | blue;
}
AdvanceByOne:
@ -1132,7 +1143,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
VP8LFillBitWindow(br);
dist_code = GetCopyDistance(dist_symbol, br);
dist = PlaneCodeToDistance(width, dist_code);
if (br->eos_) break;
if (VP8LIsEndOfStream(br)) break;
if (src - data < (ptrdiff_t)dist || src_end - src < (ptrdiff_t)length) {
goto Error;
} else {
@ -1169,9 +1180,9 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
} else { // Not reached
goto Error;
}
assert(br->eos_ == VP8LIsEndOfStream(br));
}
br->eos_ = VP8LIsEndOfStream(br);
if (dec->incremental_ && br->eos_ && src < src_end) {
RestoreState(dec);
} else if (!br->eos_) {
@ -1630,12 +1641,19 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
if (!AllocateInternalBuffers32b(dec, io->width)) goto Err;
#if !defined(WEBP_REDUCE_SIZE)
if (io->use_scaling && !AllocateAndInitRescaler(dec, io)) goto Err;
if (io->use_scaling || WebPIsPremultipliedMode(dec->output_->colorspace)) {
// need the alpha-multiply functions for premultiplied output or rescaling
WebPInitAlphaProcessing();
}
#else
if (io->use_scaling) {
dec->status_ = VP8_STATUS_INVALID_PARAM;
goto Err;
}
#endif
if (!WebPIsRGBMode(dec->output_->colorspace)) {
WebPInitConvertARGBToYUV();
if (dec->output_->u.YUVA.a != NULL) WebPInitAlphaProcessing();

@ -12,14 +12,14 @@
// Author: Skal (pascal.massimino@gmail.com)
// Vikas Arora(vikaas.arora@gmail.com)
#ifndef WEBP_DEC_VP8LI_H_
#define WEBP_DEC_VP8LI_H_
#ifndef WEBP_DEC_VP8LI_DEC_H_
#define WEBP_DEC_VP8LI_DEC_H_
#include <string.h> // for memcpy()
#include "./webpi_dec.h"
#include "../utils/bit_reader_utils.h"
#include "../utils/color_cache_utils.h"
#include "../utils/huffman_utils.h"
#include "src/dec/webpi_dec.h"
#include "src/utils/bit_reader_utils.h"
#include "src/utils/color_cache_utils.h"
#include "src/utils/huffman_utils.h"
#ifdef __cplusplus
extern "C" {
@ -132,4 +132,4 @@ void VP8LDelete(VP8LDecoder* const dec);
} // extern "C"
#endif
#endif /* WEBP_DEC_VP8LI_H_ */
#endif /* WEBP_DEC_VP8LI_DEC_H_ */

@ -13,11 +13,11 @@
#include <stdlib.h>
#include "./vp8i_dec.h"
#include "./vp8li_dec.h"
#include "./webpi_dec.h"
#include "../utils/utils.h"
#include "../webp/mux_types.h" // ALPHA_FLAG
#include "src/dec/vp8i_dec.h"
#include "src/dec/vp8li_dec.h"
#include "src/dec/webpi_dec.h"
#include "src/utils/utils.h"
#include "src/webp/mux_types.h" // ALPHA_FLAG
//------------------------------------------------------------------------------
// RIFF layout is:
@ -421,7 +421,9 @@ VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) {
NULL, NULL, NULL, &has_animation,
NULL, headers);
if (status == VP8_STATUS_OK || status == VP8_STATUS_NOT_ENOUGH_DATA) {
// TODO(jzern): full support of animation frames will require API additions.
// The WebPDemux API + libwebp can be used to decode individual
// uncomposited frames or the WebPAnimDecoder can be used to fully
// reconstruct them (see webp/demux.h).
if (has_animation) {
status = VP8_STATUS_UNSUPPORTED_FEATURE;
}
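The replacement comment above points at the demux API for animation support; a minimal, hedged sketch of decoding frames with WebPAnimDecoder as declared in webp/demux.h (error handling omitted, function name DecodeAnimation is illustrative only):
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include "webp/demux.h"
/* Sketch: walk every frame of an animated WebP held in (data, size). */
static void DecodeAnimation(const uint8_t* data, size_t size) {
  WebPData webp_data = { data, size };
  WebPAnimDecoderOptions opts;
  WebPAnimDecoderOptionsInit(&opts);          /* defaults to MODE_RGBA output */
  WebPAnimDecoder* const dec = WebPAnimDecoderNew(&webp_data, &opts);
  WebPAnimInfo info;
  WebPAnimDecoderGetInfo(dec, &info);         /* canvas size, frame/loop count */
  while (WebPAnimDecoderHasMoreFrames(dec)) {
    uint8_t* frame_rgba;
    int timestamp_ms;
    WebPAnimDecoderGetNext(dec, &frame_rgba, &timestamp_ms);
    printf("frame at %d ms (%u x %u canvas)\n", timestamp_ms,
           info.canvas_width, info.canvas_height);
  }
  WebPAnimDecoderDelete(dec);
}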

@ -11,15 +11,15 @@
//
// Author: somnath@google.com (Somnath Banerjee)
#ifndef WEBP_DEC_WEBPI_H_
#define WEBP_DEC_WEBPI_H_
#ifndef WEBP_DEC_WEBPI_DEC_H_
#define WEBP_DEC_WEBPI_DEC_H_
#ifdef __cplusplus
extern "C" {
#endif
#include "../utils/rescaler_utils.h"
#include "./vp8_dec.h"
#include "src/utils/rescaler_utils.h"
#include "src/dec/vp8_dec.h"
//------------------------------------------------------------------------------
// WebPDecParams: Decoding output parameters. Transient internal object.
@ -130,4 +130,4 @@ int WebPAvoidSlowMemory(const WebPDecBuffer* const output,
} // extern "C"
#endif
#endif /* WEBP_DEC_WEBPI_H_ */
#endif /* WEBP_DEC_WEBPI_DEC_H_ */

@ -11,15 +11,15 @@
//
#ifdef HAVE_CONFIG_H
#include "../webp/config.h"
#include "src/webp/config.h"
#endif
#include <assert.h>
#include <string.h>
#include "../utils/utils.h"
#include "../webp/decode.h"
#include "../webp/demux.h"
#include "src/utils/utils.h"
#include "src/webp/decode.h"
#include "src/webp/demux.h"
#define NUM_CHANNELS 4

@ -11,21 +11,21 @@
//
#ifdef HAVE_CONFIG_H
#include "../webp/config.h"
#include "src/webp/config.h"
#endif
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include "../utils/utils.h"
#include "../webp/decode.h" // WebPGetFeatures
#include "../webp/demux.h"
#include "../webp/format_constants.h"
#include "src/utils/utils.h"
#include "src/webp/decode.h" // WebPGetFeatures
#include "src/webp/demux.h"
#include "src/webp/format_constants.h"
#define DMUX_MAJ_VERSION 0
#define DMUX_MIN_VERSION 3
#define DMUX_REV_VERSION 2
#define DMUX_REV_VERSION 3
typedef struct {
size_t start_; // start location of the data
@ -205,12 +205,14 @@ static void SetFrameInfo(size_t start_offset, size_t size,
frame->complete_ = complete;
}
// Store image bearing chunks to 'frame'.
// Store image bearing chunks to 'frame'. 'min_size' is an optional size
// requirement, it may be zero.
static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
MemBuffer* const mem, Frame* const frame) {
int alpha_chunks = 0;
int image_chunks = 0;
int done = (MemDataSize(mem) < min_size);
int done = (MemDataSize(mem) < CHUNK_HEADER_SIZE ||
MemDataSize(mem) < min_size);
ParseStatus status = PARSE_OK;
if (done) return PARSE_NEED_MORE_DATA;
@ -401,9 +403,9 @@ static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
frame = (Frame*)WebPSafeCalloc(1ULL, sizeof(*frame));
if (frame == NULL) return PARSE_ERROR;
// For the single image case we allow parsing of a partial frame, but we need
// at least CHUNK_HEADER_SIZE for parsing.
status = StoreFrame(1, CHUNK_HEADER_SIZE, &dmux->mem_, frame);
// For the single image case we allow parsing of a partial frame, so no
// minimum size is imposed here.
status = StoreFrame(1, 0, &dmux->mem_, frame);
if (status != PARSE_ERROR) {
const int has_alpha = !!(dmux->feature_flags_ & ALPHA_FLAG);
// Clear any alpha when the alpha flag is missing.

@ -12,10 +12,13 @@
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include "./dsp.h"
#include "src/dsp/dsp.h"
// Tables can be faster on some platform but incur some extra binary size (~2k).
// #define USE_TABLES_FOR_ALPHA_MULT
#if !defined(USE_TABLES_FOR_ALPHA_MULT)
#define USE_TABLES_FOR_ALPHA_MULT 0 // ALTERNATE_CODE
#endif
// -----------------------------------------------------------------------------
@ -29,7 +32,7 @@ static uint32_t Mult(uint8_t x, uint32_t mult) {
return v;
}
#ifdef USE_TABLES_FOR_ALPHA_MULT
#if (USE_TABLES_FOR_ALPHA_MULT == 1)
static const uint32_t kMultTables[2][256] = {
{ // (255u << MFIX) / alpha
@ -132,9 +135,9 @@ static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
return inverse ? (255u << MFIX) / a : a * KINV_255;
}
#endif // USE_TABLES_FOR_ALPHA_MULT
#endif // USE_TABLES_FOR_ALPHA_MULT
void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse) {
void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse) {
int x;
for (x = 0; x < width; ++x) {
const uint32_t argb = ptr[x];
@ -154,8 +157,8 @@ void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse) {
}
}
void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
int width, int inverse) {
void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
int width, int inverse) {
int x;
for (x = 0; x < width; ++x) {
const uint32_t a = alpha[x];
@ -217,8 +220,9 @@ void WebPMultRows(uint8_t* ptr, int stride,
#define PREMULTIPLY(x, m) (((x) * (m) + (1U << 23)) >> 24)
#endif
static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
int w, int h, int stride) {
#if !WEBP_NEON_OMIT_C_CODE
static void ApplyAlphaMultiply_C(uint8_t* rgba, int alpha_first,
int w, int h, int stride) {
while (h-- > 0) {
uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
@ -235,6 +239,7 @@ static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
rgba += stride;
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
#undef MULTIPLIER
#undef PREMULTIPLY
@ -254,9 +259,9 @@ static WEBP_INLINE uint8_t multiply(uint8_t x, uint32_t m) {
return (x * m) >> 16;
}
static WEBP_INLINE void ApplyAlphaMultiply4444(uint8_t* rgba4444,
int w, int h, int stride,
int rg_byte_pos /* 0 or 1 */) {
static WEBP_INLINE void ApplyAlphaMultiply4444_C(uint8_t* rgba4444,
int w, int h, int stride,
int rg_byte_pos /* 0 or 1 */) {
while (h-- > 0) {
int i;
for (i = 0; i < w; ++i) {
@ -275,15 +280,16 @@ static WEBP_INLINE void ApplyAlphaMultiply4444(uint8_t* rgba4444,
}
#undef MULTIPLIER
static void ApplyAlphaMultiply_16b(uint8_t* rgba4444,
int w, int h, int stride) {
#ifdef WEBP_SWAP_16BIT_CSP
ApplyAlphaMultiply4444(rgba4444, w, h, stride, 1);
static void ApplyAlphaMultiply_16b_C(uint8_t* rgba4444,
int w, int h, int stride) {
#if (WEBP_SWAP_16BIT_CSP == 1)
ApplyAlphaMultiply4444_C(rgba4444, w, h, stride, 1);
#else
ApplyAlphaMultiply4444(rgba4444, w, h, stride, 0);
ApplyAlphaMultiply4444_C(rgba4444, w, h, stride, 0);
#endif
}
#if !WEBP_NEON_OMIT_C_CODE
static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint8_t* dst, int dst_stride) {
@ -338,6 +344,36 @@ static void ExtractGreen_C(const uint32_t* argb, uint8_t* alpha, int size) {
int i;
for (i = 0; i < size; ++i) alpha[i] = argb[i] >> 8;
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
static int HasAlpha8b_C(const uint8_t* src, int length) {
while (length-- > 0) if (*src++ != 0xff) return 1;
return 0;
}
static int HasAlpha32b_C(const uint8_t* src, int length) {
int x;
for (x = 0; length-- > 0; x += 4) if (src[x] != 0xff) return 1;
return 0;
}
//------------------------------------------------------------------------------
// Simple channel manipulations.
static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
}
static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out) {
int i, offset = 0;
for (i = 0; i < len; ++i) {
out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
offset += step;
}
}
void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
@ -345,6 +381,11 @@ int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out);
int (*WebPHasAlpha8b)(const uint8_t* src, int length);
int (*WebPHasAlpha32b)(const uint8_t* src, int length);
//------------------------------------------------------------------------------
// Init function
@ -360,15 +401,21 @@ static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used =
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
if (alpha_processing_last_cpuinfo_used == VP8GetCPUInfo) return;
WebPMultARGBRow = WebPMultARGBRowC;
WebPMultRow = WebPMultRowC;
WebPApplyAlphaMultiply = ApplyAlphaMultiply;
WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b;
WebPMultARGBRow = WebPMultARGBRow_C;
WebPMultRow = WebPMultRow_C;
WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b_C;
WebPPackRGB = PackRGB_C;
#if !WEBP_NEON_OMIT_C_CODE
WebPApplyAlphaMultiply = ApplyAlphaMultiply_C;
WebPDispatchAlpha = DispatchAlpha_C;
WebPDispatchAlphaToGreen = DispatchAlphaToGreen_C;
WebPExtractAlpha = ExtractAlpha_C;
WebPExtractGreen = ExtractGreen_C;
#endif
WebPHasAlpha8b = HasAlpha8b_C;
WebPHasAlpha32b = HasAlpha32b_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
@ -382,16 +429,31 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
#endif
}
#endif
#if defined(WEBP_USE_NEON)
if (VP8GetCPUInfo(kNEON)) {
WebPInitAlphaProcessingNEON();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
WebPInitAlphaProcessingMIPSdspR2();
}
#endif
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPInitAlphaProcessingNEON();
}
#endif
assert(WebPMultARGBRow != NULL);
assert(WebPMultRow != NULL);
assert(WebPApplyAlphaMultiply != NULL);
assert(WebPApplyAlphaMultiply4444 != NULL);
assert(WebPDispatchAlpha != NULL);
assert(WebPDispatchAlphaToGreen != NULL);
assert(WebPExtractAlpha != NULL);
assert(WebPExtractGreen != NULL);
assert(WebPPackRGB != NULL);
assert(WebPHasAlpha8b != NULL);
assert(WebPHasAlpha32b != NULL);
alpha_processing_last_cpuinfo_used = VP8GetCPUInfo;
}
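WebPInitAlphaProcessing() above installs the C fallbacks first and then, based on CPU detection (or WEBP_NEON_OMIT_C_CODE), overwrites the hooks with the SSE2/NEON/MIPS variants; a hedged sketch of how the newly added WebPHasAlpha32b hook could be called, assuming interleaved RGBA with the alpha byte at offset 3 (helper name is hypothetical):
#include "src/dsp/dsp.h"   /* declares WebPInitAlphaProcessing and the hooks */
/* Returns 1 if any pixel in an RGBA buffer is not fully opaque. */
static int FrameHasTransparency(const uint8_t* rgba, int num_pixels) {
  WebPInitAlphaProcessing();            /* returns early if already initialized */
  return WebPHasAlpha32b(rgba + 3, num_pixels);
}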

@ -12,13 +12,13 @@
// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
// Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint8_t* dst, int dst_stride) {
static int DispatchAlpha_MIPSdspR2(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint8_t* dst, int dst_stride) {
uint32_t alpha_mask = 0xffffffff;
int i, j, temp0;
@ -79,7 +79,8 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
return (alpha_mask != 0xff);
}
static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
static void MultARGBRow_MIPSdspR2(uint32_t* const ptr, int width,
int inverse) {
int x;
const uint32_t c_00ffffff = 0x00ffffffu;
const uint32_t c_ff000000 = 0xff000000u;
@ -124,14 +125,54 @@ static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
}
}
static void PackRGB_MIPSdspR2(const uint8_t* r, const uint8_t* g,
const uint8_t* b, int len, int step,
uint32_t* out) {
int temp0, temp1, temp2, offset;
const int rest = len & 1;
const int a = 0xff;
const uint32_t* const loop_end = out + len - rest;
__asm__ volatile (
"xor %[offset], %[offset], %[offset] \n\t"
"beq %[loop_end], %[out], 0f \n\t"
"2: \n\t"
"lbux %[temp0], %[offset](%[r]) \n\t"
"lbux %[temp1], %[offset](%[g]) \n\t"
"lbux %[temp2], %[offset](%[b]) \n\t"
"ins %[temp0], %[a], 16, 16 \n\t"
"ins %[temp2], %[temp1], 16, 16 \n\t"
"addiu %[out], %[out], 4 \n\t"
"precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
"sw %[temp0], -4(%[out]) \n\t"
"addu %[offset], %[offset], %[step] \n\t"
"bne %[loop_end], %[out], 2b \n\t"
"0: \n\t"
"beq %[rest], $zero, 1f \n\t"
"lbux %[temp0], %[offset](%[r]) \n\t"
"lbux %[temp1], %[offset](%[g]) \n\t"
"lbux %[temp2], %[offset](%[b]) \n\t"
"ins %[temp0], %[a], 16, 16 \n\t"
"ins %[temp2], %[temp1], 16, 16 \n\t"
"precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
"sw %[temp0], 0(%[out]) \n\t"
"1: \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[offset]"=&r"(offset), [out]"+&r"(out)
: [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
[loop_end]"r"(loop_end), [rest]"r"(rest)
: "memory"
);
}
//------------------------------------------------------------------------------
// Entry point
extern void WebPInitAlphaProcessingMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingMIPSdspR2(void) {
WebPDispatchAlpha = DispatchAlpha;
WebPMultARGBRow = MultARGBRow;
WebPDispatchAlpha = DispatchAlpha_MIPSdspR2;
WebPMultARGBRow = MultARGBRow_MIPSdspR2;
WebPPackRGB = PackRGB_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2

@ -11,11 +11,11 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_NEON)
#include "./neon.h"
#include "src/dsp/neon.h"
//------------------------------------------------------------------------------

@ -11,16 +11,16 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
#include <emmintrin.h>
//------------------------------------------------------------------------------
static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint8_t* dst, int dst_stride) {
static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint8_t* dst, int dst_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
@ -72,9 +72,9 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
return (alpha_and != 0xff);
}
static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint32_t* dst, int dst_stride) {
static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint32_t* dst, int dst_stride) {
int i, j;
const __m128i zero = _mm_setzero_si128();
const int limit = width & ~15;
@ -98,9 +98,9 @@ static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
}
}
static int ExtractAlpha(const uint8_t* argb, int argb_stride,
int width, int height,
uint8_t* alpha, int alpha_stride) {
static int ExtractAlpha_SSE2(const uint8_t* argb, int argb_stride,
int width, int height,
uint8_t* alpha, int alpha_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
@ -210,6 +210,61 @@ static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first,
#undef MULTIPLIER
#undef PREMULTIPLY
//------------------------------------------------------------------------------
// Alpha detection
static int HasAlpha8b_SSE2(const uint8_t* src, int length) {
const __m128i all_0xff = _mm_set1_epi8(0xff);
int i = 0;
for (; i + 16 <= length; i += 16) {
const __m128i v = _mm_loadu_si128((const __m128i*)(src + i));
const __m128i bits = _mm_cmpeq_epi8(v, all_0xff);
const int mask = _mm_movemask_epi8(bits);
if (mask != 0xffff) return 1;
}
for (; i < length; ++i) if (src[i] != 0xff) return 1;
return 0;
}
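A note on the test above (and in HasAlpha32b_SSE2 below): _mm_cmpeq_epi8 produces 0xff in every byte lane that equals 0xff, and _mm_movemask_epi8 collects the top bit of each of the 16 lanes, so the mask is 0xffff exactly when all 16 alpha bytes are fully opaque; any other value means a translucent pixel was found and the function can return early.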
static int HasAlpha32b_SSE2(const uint8_t* src, int length) {
const __m128i alpha_mask = _mm_set1_epi32(0xff);
const __m128i all_0xff = _mm_set1_epi8(0xff);
int i = 0;
// We don't know if we can access the last 3 bytes after the last alpha
// value 'src[4 * length - 4]' (because we don't know if alpha is the first
// or the last byte of the quadruplet). Hence the '-3' protection below.
length = length * 4 - 3; // size in bytes
for (; i + 64 <= length; i += 64) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0));
const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
const __m128i a2 = _mm_loadu_si128((const __m128i*)(src + i + 32));
const __m128i a3 = _mm_loadu_si128((const __m128i*)(src + i + 48));
const __m128i b0 = _mm_and_si128(a0, alpha_mask);
const __m128i b1 = _mm_and_si128(a1, alpha_mask);
const __m128i b2 = _mm_and_si128(a2, alpha_mask);
const __m128i b3 = _mm_and_si128(a3, alpha_mask);
const __m128i c0 = _mm_packs_epi32(b0, b1);
const __m128i c1 = _mm_packs_epi32(b2, b3);
const __m128i d = _mm_packus_epi16(c0, c1);
const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);
const int mask = _mm_movemask_epi8(bits);
if (mask != 0xffff) return 1;
}
for (; i + 32 <= length; i += 32) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0));
const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
const __m128i b0 = _mm_and_si128(a0, alpha_mask);
const __m128i b1 = _mm_and_si128(a1, alpha_mask);
const __m128i c = _mm_packs_epi32(b0, b1);
const __m128i d = _mm_packus_epi16(c, c);
const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);
const int mask = _mm_movemask_epi8(bits);
if (mask != 0xffff) return 1;
}
for (; i <= length; i += 4) if (src[i] != 0xff) return 1;
return 0;
}
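For reference, a scalar sketch (illustrative only, not part of the diff; uint8_t from <stdint.h>, and 'src' assumed to point at the first alpha byte, as the SSE2 caller arranges) of what HasAlpha32b computes:
static int HasAlpha32b_Scalar(const uint8_t* src, int num_pixels) {
  int i;
  for (i = 0; i < num_pixels; ++i) {
    if (src[4 * i] != 0xff) return 1;   // alpha values sit 4 bytes apart
  }
  return 0;
}
The '4 * length - 3' bound in the SSE2 version exists for the reason the comment gives: the 3 color bytes after the very last alpha value may not be readable, so every vector load is kept inside the span that ends at that last alpha byte, and the scalar tail steps by 4 up to exactly that byte.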
// -----------------------------------------------------------------------------
// Apply alpha value to rows
@ -238,7 +293,7 @@ static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
}
}
width -= x;
if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
if (width > 0) WebPMultARGBRow_C(ptr + x, width, inverse);
}
static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
@ -261,7 +316,7 @@ static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
}
}
width -= x;
if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse);
if (width > 0) WebPMultRow_C(ptr + x, alpha + x, width, inverse);
}
//------------------------------------------------------------------------------
@ -273,9 +328,12 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
WebPMultARGBRow = MultARGBRow_SSE2;
WebPMultRow = MultRow_SSE2;
WebPApplyAlphaMultiply = ApplyAlphaMultiply_SSE2;
WebPDispatchAlpha = DispatchAlpha;
WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
WebPExtractAlpha = ExtractAlpha;
WebPDispatchAlpha = DispatchAlpha_SSE2;
WebPDispatchAlphaToGreen = DispatchAlphaToGreen_SSE2;
WebPExtractAlpha = ExtractAlpha_SSE2;
WebPHasAlpha8b = HasAlpha8b_SSE2;
WebPHasAlpha32b = HasAlpha32b_SSE2;
}
#else // !WEBP_USE_SSE2

@ -11,7 +11,7 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE41)
@ -19,9 +19,9 @@
//------------------------------------------------------------------------------
static int ExtractAlpha(const uint8_t* argb, int argb_stride,
int width, int height,
uint8_t* alpha, int alpha_stride) {
static int ExtractAlpha_SSE41(const uint8_t* argb, int argb_stride,
int width, int height,
uint8_t* alpha, int alpha_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
@ -82,7 +82,7 @@ static int ExtractAlpha(const uint8_t* argb, int argb_stride,
extern void WebPInitAlphaProcessingSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE41(void) {
WebPExtractAlpha = ExtractAlpha;
WebPExtractAlpha = ExtractAlpha_SSE41;
}
#else // !WEBP_USE_SSE41

@ -57,6 +57,9 @@ static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
extern void VP8EncDspARGBInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitSSE2(void) {
extern void (*VP8PackARGB)(const uint8_t*, const uint8_t*, const uint8_t*,
const uint8_t*, int, uint32_t*);
VP8PackARGB = PackARGB;
}

@ -9,8 +9,8 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "../enc/cost_enc.h"
#include "src/dsp/dsp.h"
#include "src/enc/cost_enc.h"
//------------------------------------------------------------------------------
// Boolean-cost cost table
@ -319,7 +319,7 @@ const uint8_t VP8EncBands[16 + 1] = {
//------------------------------------------------------------------------------
// Mode costs
static int GetResidualCost(int ctx0, const VP8Residual* const res) {
static int GetResidualCost_C(int ctx0, const VP8Residual* const res) {
int n = res->first;
// should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
const int p0 = res->prob[n][ctx0][0];
@ -354,8 +354,8 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
return cost;
}
static void SetResidualCoeffs(const int16_t* const coeffs,
VP8Residual* const res) {
static void SetResidualCoeffs_C(const int16_t* const coeffs,
VP8Residual* const res) {
int n;
res->last = -1;
assert(res->first == 0 || coeffs[0] == 0);
@ -384,8 +384,8 @@ static volatile VP8CPUInfo cost_last_cpuinfo_used =
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInit(void) {
if (cost_last_cpuinfo_used == VP8GetCPUInfo) return;
VP8GetResidualCost = GetResidualCost;
VP8SetResidualCoeffs = SetResidualCoeffs;
VP8GetResidualCost = GetResidualCost_C;
VP8SetResidualCoeffs = SetResidualCoeffs_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {

@ -9,13 +9,13 @@
//
// Author: Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS32)
#include "../enc/cost_enc.h"
#include "src/enc/cost_enc.h"
static int GetResidualCost(int ctx0, const VP8Residual* const res) {
static int GetResidualCost_MIPS32(int ctx0, const VP8Residual* const res) {
int temp0, temp1;
int v_reg, ctx_reg;
int n = res->first;
@ -96,8 +96,8 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
return cost;
}
static void SetResidualCoeffs(const int16_t* const coeffs,
VP8Residual* const res) {
static void SetResidualCoeffs_MIPS32(const int16_t* const coeffs,
VP8Residual* const res) {
const int16_t* p_coeffs = (int16_t*)coeffs;
int temp0, temp1, temp2, n, n1;
assert(res->first == 0 || coeffs[0] == 0);
@ -143,8 +143,8 @@ static void SetResidualCoeffs(const int16_t* const coeffs,
extern void VP8EncDspCostInitMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPS32(void) {
VP8GetResidualCost = GetResidualCost;
VP8SetResidualCoeffs = SetResidualCoeffs;
VP8GetResidualCost = GetResidualCost_MIPS32;
VP8SetResidualCoeffs = SetResidualCoeffs_MIPS32;
}
#else // !WEBP_USE_MIPS32

@ -9,13 +9,13 @@
//
// Author: Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "../enc/cost_enc.h"
#include "src/enc/cost_enc.h"
static int GetResidualCost(int ctx0, const VP8Residual* const res) {
static int GetResidualCost_MIPSdspR2(int ctx0, const VP8Residual* const res) {
int temp0, temp1;
int v_reg, ctx_reg;
int n = res->first;
@ -97,7 +97,7 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
extern void VP8EncDspCostInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPSdspR2(void) {
VP8GetResidualCost = GetResidualCost;
VP8GetResidualCost = GetResidualCost_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2

@ -11,19 +11,19 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
#include <emmintrin.h>
#include "../enc/cost_enc.h"
#include "../enc/vp8i_enc.h"
#include "../utils/utils.h"
#include "src/enc/cost_enc.h"
#include "src/enc/vp8i_enc.h"
#include "src/utils/utils.h"
//------------------------------------------------------------------------------
static void SetResidualCoeffsSSE2(const int16_t* const coeffs,
VP8Residual* const res) {
static void SetResidualCoeffs_SSE2(const int16_t* const coeffs,
VP8Residual* const res) {
const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0));
const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
// Use SSE2 to compare 16 values with a single instruction.
@ -42,7 +42,7 @@ static void SetResidualCoeffsSSE2(const int16_t* const coeffs,
res->coeffs = coeffs;
}
static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
static int GetResidualCost_SSE2(int ctx0, const VP8Residual* const res) {
uint8_t levels[16], ctxs[16];
uint16_t abs_levels[16];
int n = res->first;
@ -108,8 +108,8 @@ static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
extern void VP8EncDspCostInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitSSE2(void) {
VP8SetResidualCoeffs = SetResidualCoeffsSSE2;
VP8GetResidualCost = GetResidualCostSSE2;
VP8SetResidualCoeffs = SetResidualCoeffs_SSE2;
VP8GetResidualCost = GetResidualCost_SSE2;
}
#else // !WEBP_USE_SSE2

@ -11,7 +11,7 @@
//
// Author: Christian Duvivier (cduvivier@google.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_HAVE_NEON_RTCD)
#include <stdio.h>
@ -143,7 +143,7 @@ static int x86CPUInfo(CPUFeature feature) {
return !!(cpu_info[2] & (1 << 0));
}
if (feature == kSlowSSSE3) {
if (is_intel && (cpu_info[2] & (1 << 0))) { // SSSE3?
if (is_intel && (cpu_info[2] & (1 << 9))) { // SSSE3?
return CheckSlowModel(cpu_info[0]);
}
return 0;
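The one-line change above fixes the feature test: in the CPUID leaf-1 result, ECX bit 0 reports SSE3 while SSSE3 is reported by ECX bit 9, so the old code answered the kSlowSSSE3 query with the SSE3 bit. A small stand-alone check of the same bit (illustrative sketch, GCC/Clang only, hypothetical function name):
#include <cpuid.h>   /* GCC/Clang helper for the CPUID instruction */
#include <stdio.h>
static int HasSSSE3(void) {
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return 0;
  return (ecx >> 9) & 1;         /* ECX bit 9 = SSSE3; bit 0 would be SSE3 */
}
int main(void) {
  printf("SSSE3: %d\n", HasSSSE3());
  return 0;
}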

@ -11,9 +11,11 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "../dec/vp8i_dec.h"
#include "../utils/utils.h"
#include <assert.h>
#include "src/dsp/dsp.h"
#include "src/dec/vp8i_dec.h"
#include "src/utils/utils.h"
//------------------------------------------------------------------------------
@ -25,7 +27,7 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
// Transforms (Paragraph 14.4)
#define STORE(x, y, v) \
dst[x + y * BPS] = clip_8b(dst[x + y * BPS] + ((v) >> 3))
dst[(x) + (y) * BPS] = clip_8b(dst[(x) + (y) * BPS] + ((v) >> 3))
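The added parentheses matter whenever a macro argument is itself an expression; a hypothetical call shows the difference:
   STORE(x, j + 1, v)
   old expansion of the index:  x + j + 1 * BPS       (binds as x + j + BPS)
   new expansion of the index:  (x) + (j + 1) * BPS   (the intended pixel)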
#define STORE2(y, dc, d, c) do { \
const int DC = (dc); \
@ -38,7 +40,8 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
#define MUL1(a) ((((a) * 20091) >> 16) + (a))
#define MUL2(a) (((a) * 35468) >> 16)
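The constants 20091 and 35468 are the 16-bit fixed-point factors of the VP8 inverse transform; as the SSE2 file notes further down, K1 = sqrt(2)*cos(pi/8) ~= 85627/2^16 (stored without the integral part, 85627 - 65536 = 20091, since MUL1 adds (a) back) and K2 = sqrt(2)*sin(pi/8) ~= 35468/2^16. A throwaway check (illustrative sketch, not part of the library):
#include <math.h>
#include <stdio.h>
int main(void) {
  const double kPi = 3.14159265358979323846;
  const double k1 = sqrt(2.0) * cos(kPi / 8) * 65536.0;   /* ~85627 */
  const double k2 = sqrt(2.0) * sin(kPi / 8) * 65536.0;   /* ~35468 */
  printf("MUL1 constant: %.0f (expect ~20091)\n", k1 - 65536.0);
  printf("MUL2 constant: %.0f (expect ~35468)\n", k2);
  return 0;
}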
static void TransformOne(const int16_t* in, uint8_t* dst) {
#if !WEBP_NEON_OMIT_C_CODE
static void TransformOne_C(const int16_t* in, uint8_t* dst) {
int C[4 * 4], *tmp;
int i;
tmp = C;
@ -78,7 +81,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
}
// Simplified transform when only in[0], in[1] and in[4] are non-zero
static void TransformAC3(const int16_t* in, uint8_t* dst) {
static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
const int a = in[0] + 4;
const int c4 = MUL2(in[4]);
const int d4 = MUL1(in[4]);
@ -93,19 +96,21 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
#undef MUL2
#undef STORE2
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
TransformOne(in, dst);
static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
TransformOne_C(in, dst);
if (do_two) {
TransformOne(in + 16, dst + 4);
TransformOne_C(in + 16, dst + 4);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void TransformUV(const int16_t* in, uint8_t* dst) {
static void TransformUV_C(const int16_t* in, uint8_t* dst) {
VP8Transform(in + 0 * 16, dst, 1);
VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
}
static void TransformDC(const int16_t* in, uint8_t* dst) {
#if !WEBP_NEON_OMIT_C_CODE
static void TransformDC_C(const int16_t* in, uint8_t* dst) {
const int DC = in[0] + 4;
int i, j;
for (j = 0; j < 4; ++j) {
@ -114,8 +119,9 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
}
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void TransformDCUV(const int16_t* in, uint8_t* dst) {
static void TransformDCUV_C(const int16_t* in, uint8_t* dst) {
if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst);
if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4);
if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS);
@ -127,7 +133,8 @@ static void TransformDCUV(const int16_t* in, uint8_t* dst) {
//------------------------------------------------------------------------------
// Paragraph 14.3
static void TransformWHT(const int16_t* in, int16_t* out) {
#if !WEBP_NEON_OMIT_C_CODE
static void TransformWHT_C(const int16_t* in, int16_t* out) {
int tmp[16];
int i;
for (i = 0; i < 4; ++i) {
@ -153,6 +160,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
out += 64;
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
@ -161,6 +169,7 @@ void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
#define DST(x, y) dst[(x) + (y) * BPS]
#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
const uint8_t* top = dst - BPS;
const uint8_t* const clip0 = VP8kclip1 - top[-1];
@ -174,21 +183,21 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
dst += BPS;
}
}
static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }
static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
static void TM16(uint8_t* dst) { TrueMotion(dst, 16); }
static void TM4_C(uint8_t* dst) { TrueMotion(dst, 4); }
static void TM8uv_C(uint8_t* dst) { TrueMotion(dst, 8); }
static void TM16_C(uint8_t* dst) { TrueMotion(dst, 16); }
//------------------------------------------------------------------------------
// 16x16
static void VE16(uint8_t* dst) { // vertical
static void VE16_C(uint8_t* dst) { // vertical
int j;
for (j = 0; j < 16; ++j) {
memcpy(dst + j * BPS, dst - BPS, 16);
}
}
static void HE16(uint8_t* dst) { // horizontal
static void HE16_C(uint8_t* dst) { // horizontal
int j;
for (j = 16; j > 0; --j) {
memset(dst, dst[-1], 16);
@ -203,7 +212,7 @@ static WEBP_INLINE void Put16(int v, uint8_t* dst) {
}
}
static void DC16(uint8_t* dst) { // DC
static void DC16_C(uint8_t* dst) { // DC
int DC = 16;
int j;
for (j = 0; j < 16; ++j) {
@ -212,7 +221,7 @@ static void DC16(uint8_t* dst) { // DC
Put16(DC >> 5, dst);
}
static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
static void DC16NoTop_C(uint8_t* dst) { // DC with top samples not available
int DC = 8;
int j;
for (j = 0; j < 16; ++j) {
@ -221,7 +230,7 @@ static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
Put16(DC >> 4, dst);
}
static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available
static void DC16NoLeft_C(uint8_t* dst) { // DC with left samples not available
int DC = 8;
int i;
for (i = 0; i < 16; ++i) {
@ -230,9 +239,10 @@ static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available
Put16(DC >> 4, dst);
}
static void DC16NoTopLeft(uint8_t* dst) { // DC with no top and left samples
static void DC16NoTopLeft_C(uint8_t* dst) { // DC with no top and left samples
Put16(0x80, dst);
}
#endif // !WEBP_NEON_OMIT_C_CODE
VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
@ -242,7 +252,8 @@ VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
#define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
static void VE4(uint8_t* dst) { // vertical
#if !WEBP_NEON_OMIT_C_CODE
static void VE4_C(uint8_t* dst) { // vertical
const uint8_t* top = dst - BPS;
const uint8_t vals[4] = {
AVG3(top[-1], top[0], top[1]),
@ -255,8 +266,9 @@ static void VE4(uint8_t* dst) { // vertical
memcpy(dst + i * BPS, vals, sizeof(vals));
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void HE4(uint8_t* dst) { // horizontal
static void HE4_C(uint8_t* dst) { // horizontal
const int A = dst[-1 - BPS];
const int B = dst[-1];
const int C = dst[-1 + BPS];
@ -268,7 +280,8 @@ static void HE4(uint8_t* dst) { // horizontal
WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(D, E, E));
}
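The 0x01010101U multiplications above use a common byte-replication idiom: multiplying an 8-bit value by 0x01010101 copies it into all four bytes of a 32-bit word, which WebPUint32ToMem then writes out as one 4-pixel row. A two-line illustration (hypothetical values, <stdint.h> types assumed):
   const uint8_t px = 0xAB;
   const uint32_t row = 0x01010101U * px;   /* row == 0xABABABABu */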
static void DC4(uint8_t* dst) { // DC
#if !WEBP_NEON_OMIT_C_CODE
static void DC4_C(uint8_t* dst) { // DC
uint32_t dc = 4;
int i;
for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
@ -276,7 +289,7 @@ static void DC4(uint8_t* dst) { // DC
for (i = 0; i < 4; ++i) memset(dst + i * BPS, dc, 4);
}
static void RD4(uint8_t* dst) { // Down-right
static void RD4_C(uint8_t* dst) { // Down-right
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
const int K = dst[-1 + 2 * BPS];
@ -295,7 +308,7 @@ static void RD4(uint8_t* dst) { // Down-right
DST(3, 0) = AVG3(D, C, B);
}
static void LD4(uint8_t* dst) { // Down-Left
static void LD4_C(uint8_t* dst) { // Down-Left
const int A = dst[0 - BPS];
const int B = dst[1 - BPS];
const int C = dst[2 - BPS];
@ -312,8 +325,9 @@ static void LD4(uint8_t* dst) { // Down-Left
DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
DST(3, 3) = AVG3(G, H, H);
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void VR4(uint8_t* dst) { // Vertical-Right
static void VR4_C(uint8_t* dst) { // Vertical-Right
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
const int K = dst[-1 + 2 * BPS];
@ -335,7 +349,7 @@ static void VR4(uint8_t* dst) { // Vertical-Right
DST(3, 1) = AVG3(B, C, D);
}
static void VL4(uint8_t* dst) { // Vertical-Left
static void VL4_C(uint8_t* dst) { // Vertical-Left
const int A = dst[0 - BPS];
const int B = dst[1 - BPS];
const int C = dst[2 - BPS];
@ -357,7 +371,7 @@ static void VL4(uint8_t* dst) { // Vertical-Left
DST(3, 3) = AVG3(F, G, H);
}
static void HU4(uint8_t* dst) { // Horizontal-Up
static void HU4_C(uint8_t* dst) { // Horizontal-Up
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
const int K = dst[-1 + 2 * BPS];
@ -372,7 +386,7 @@ static void HU4(uint8_t* dst) { // Horizontal-Up
DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}
static void HD4(uint8_t* dst) { // Horizontal-Down
static void HD4_C(uint8_t* dst) { // Horizontal-Down
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
const int K = dst[-1 + 2 * BPS];
@ -404,14 +418,15 @@ VP8PredFunc VP8PredLuma4[NUM_BMODES];
//------------------------------------------------------------------------------
// Chroma
static void VE8uv(uint8_t* dst) { // vertical
#if !WEBP_NEON_OMIT_C_CODE
static void VE8uv_C(uint8_t* dst) { // vertical
int j;
for (j = 0; j < 8; ++j) {
memcpy(dst + j * BPS, dst - BPS, 8);
}
}
static void HE8uv(uint8_t* dst) { // horizontal
static void HE8uv_C(uint8_t* dst) { // horizontal
int j;
for (j = 0; j < 8; ++j) {
memset(dst, dst[-1], 8);
@ -427,7 +442,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
}
}
static void DC8uv(uint8_t* dst) { // DC
static void DC8uv_C(uint8_t* dst) { // DC
int dc0 = 8;
int i;
for (i = 0; i < 8; ++i) {
@ -436,7 +451,7 @@ static void DC8uv(uint8_t* dst) { // DC
Put8x8uv(dc0 >> 4, dst);
}
static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
static void DC8uvNoLeft_C(uint8_t* dst) { // DC with no left samples
int dc0 = 4;
int i;
for (i = 0; i < 8; ++i) {
@ -445,7 +460,7 @@ static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
Put8x8uv(dc0 >> 3, dst);
}
static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
static void DC8uvNoTop_C(uint8_t* dst) { // DC with no top samples
int dc0 = 4;
int i;
for (i = 0; i < 8; ++i) {
@ -454,17 +469,19 @@ static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
Put8x8uv(dc0 >> 3, dst);
}
static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing
static void DC8uvNoTopLeft_C(uint8_t* dst) { // DC with nothing
Put8x8uv(0x80, dst);
}
#endif // !WEBP_NEON_OMIT_C_CODE
VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES];
//------------------------------------------------------------------------------
// Edge filtering functions
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
// 4 pixels in, 2 pixels out
static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
static WEBP_INLINE void DoFilter2_C(uint8_t* p, int step) {
const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1]; // in [-893,892]
const int a1 = VP8ksclip2[(a + 4) >> 3]; // in [-16,15]
@ -474,7 +491,7 @@ static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
}
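The ranges in the two comments follow from the 8-bit pixel range: q0 - p0 lies in [-255, 255], so 3 * (q0 - p0) lies in [-765, 765]; VP8ksclip1[] clamps p1 - q1 to [-128, 127], giving a in [-893, 892]; then (a + 4) >> 3 lies in [-112, 112], which VP8ksclip2[] clamps to the [-16, 15] value used for the pixel update.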
// 4 pixels in, 4 pixels out
static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
static WEBP_INLINE void DoFilter4_C(uint8_t* p, int step) {
const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
const int a = 3 * (q0 - p0);
const int a1 = VP8ksclip2[(a + 4) >> 3];
@ -487,7 +504,7 @@ static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
}
// 6 pixels in, 6 pixels out
static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
static WEBP_INLINE void DoFilter6_C(uint8_t* p, int step) {
const int p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
const int q0 = p[0], q1 = p[step], q2 = p[2*step];
const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
@ -503,18 +520,22 @@ static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
p[ 2*step] = VP8kclip1[q2 - a3];
}
static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
static WEBP_INLINE int Hev(const uint8_t* p, int step, int thresh) {
const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
return (VP8kabs0[p1 - p0] > thresh) || (VP8kabs0[q1 - q0] > thresh);
}
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE int NeedsFilter_C(const uint8_t* p, int step, int t) {
const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
return ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) <= t);
}
#endif // !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE int needs_filter2(const uint8_t* p,
int step, int t, int it) {
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static WEBP_INLINE int NeedsFilter2_C(const uint8_t* p,
int step, int t, int it) {
const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step];
const int p0 = p[-step], q0 = p[0];
const int q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
@ -523,140 +544,159 @@ static WEBP_INLINE int needs_filter2(const uint8_t* p,
VP8kabs0[p1 - p0] <= it && VP8kabs0[q3 - q2] <= it &&
VP8kabs0[q2 - q1] <= it && VP8kabs0[q1 - q0] <= it;
}
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
//------------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)
static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
#if !WEBP_NEON_OMIT_C_CODE
static void SimpleVFilter16_C(uint8_t* p, int stride, int thresh) {
int i;
const int thresh2 = 2 * thresh + 1;
for (i = 0; i < 16; ++i) {
if (needs_filter(p + i, stride, thresh2)) {
do_filter2(p + i, stride);
if (NeedsFilter_C(p + i, stride, thresh2)) {
DoFilter2_C(p + i, stride);
}
}
}
static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
static void SimpleHFilter16_C(uint8_t* p, int stride, int thresh) {
int i;
const int thresh2 = 2 * thresh + 1;
for (i = 0; i < 16; ++i) {
if (needs_filter(p + i * stride, 1, thresh2)) {
do_filter2(p + i * stride, 1);
if (NeedsFilter_C(p + i * stride, 1, thresh2)) {
DoFilter2_C(p + i * stride, 1);
}
}
}
static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
static void SimpleVFilter16i_C(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4 * stride;
SimpleVFilter16(p, stride, thresh);
SimpleVFilter16_C(p, stride, thresh);
}
}
static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
static void SimpleHFilter16i_C(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4;
SimpleHFilter16(p, stride, thresh);
SimpleHFilter16_C(p, stride, thresh);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// Complex In-loop filtering (Paragraph 15.3)
static WEBP_INLINE void FilterLoop26(uint8_t* p,
int hstride, int vstride, int size,
int thresh, int ithresh, int hev_thresh) {
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static WEBP_INLINE void FilterLoop26_C(uint8_t* p,
int hstride, int vstride, int size,
int thresh, int ithresh,
int hev_thresh) {
const int thresh2 = 2 * thresh + 1;
while (size-- > 0) {
if (needs_filter2(p, hstride, thresh2, ithresh)) {
if (hev(p, hstride, hev_thresh)) {
do_filter2(p, hstride);
if (NeedsFilter2_C(p, hstride, thresh2, ithresh)) {
if (Hev(p, hstride, hev_thresh)) {
DoFilter2_C(p, hstride);
} else {
do_filter6(p, hstride);
DoFilter6_C(p, hstride);
}
}
p += vstride;
}
}
static WEBP_INLINE void FilterLoop24(uint8_t* p,
int hstride, int vstride, int size,
int thresh, int ithresh, int hev_thresh) {
static WEBP_INLINE void FilterLoop24_C(uint8_t* p,
int hstride, int vstride, int size,
int thresh, int ithresh,
int hev_thresh) {
const int thresh2 = 2 * thresh + 1;
while (size-- > 0) {
if (needs_filter2(p, hstride, thresh2, ithresh)) {
if (hev(p, hstride, hev_thresh)) {
do_filter2(p, hstride);
if (NeedsFilter2_C(p, hstride, thresh2, ithresh)) {
if (Hev(p, hstride, hev_thresh)) {
DoFilter2_C(p, hstride);
} else {
do_filter4(p, hstride);
DoFilter4_C(p, hstride);
}
}
p += vstride;
}
}
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
#if !WEBP_NEON_OMIT_C_CODE
// on macroblock edges
static void VFilter16(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
static void VFilter16_C(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26_C(p, stride, 1, 16, thresh, ithresh, hev_thresh);
}
static void HFilter16(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
static void HFilter16_C(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26_C(p, 1, stride, 16, thresh, ithresh, hev_thresh);
}
// on three inner edges
static void VFilter16i(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter16i_C(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4 * stride;
FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
FilterLoop24_C(p, stride, 1, 16, thresh, ithresh, hev_thresh);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void HFilter16i(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static void HFilter16i_C(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4;
FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
FilterLoop24_C(p, 1, stride, 16, thresh, ithresh, hev_thresh);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
#if !WEBP_NEON_OMIT_C_CODE
// 8-pixels wide variant, for chroma filtering
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
static void VFilter8_C(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26_C(u, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop26_C(v, stride, 1, 8, thresh, ithresh, hev_thresh);
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static void HFilter8_C(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26_C(u, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop26_C(v, 1, stride, 8, thresh, ithresh, hev_thresh);
}
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
#if !WEBP_NEON_OMIT_C_CODE
static void VFilter8i_C(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop24_C(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop24_C(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop24_C(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop24_C(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
}
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
//------------------------------------------------------------------------------
static void DitherCombine8x8(const uint8_t* dither, uint8_t* dst,
int dst_stride) {
static void DitherCombine8x8_C(const uint8_t* dither, uint8_t* dst,
int dst_stride) {
int i, j;
for (j = 0; j < 8; ++j) {
for (i = 0; i < 8; ++i) {
@ -709,54 +749,66 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
VP8InitClipTables();
VP8TransformWHT = TransformWHT;
VP8Transform = TransformTwo;
VP8TransformUV = TransformUV;
VP8TransformDC = TransformDC;
VP8TransformDCUV = TransformDCUV;
VP8TransformAC3 = TransformAC3;
VP8VFilter16 = VFilter16;
VP8HFilter16 = HFilter16;
VP8VFilter8 = VFilter8;
VP8HFilter8 = HFilter8;
VP8VFilter16i = VFilter16i;
VP8HFilter16i = HFilter16i;
VP8VFilter8i = VFilter8i;
VP8HFilter8i = HFilter8i;
VP8SimpleVFilter16 = SimpleVFilter16;
VP8SimpleHFilter16 = SimpleHFilter16;
VP8SimpleVFilter16i = SimpleVFilter16i;
VP8SimpleHFilter16i = SimpleHFilter16i;
VP8PredLuma4[0] = DC4;
VP8PredLuma4[1] = TM4;
VP8PredLuma4[2] = VE4;
VP8PredLuma4[3] = HE4;
VP8PredLuma4[4] = RD4;
VP8PredLuma4[5] = VR4;
VP8PredLuma4[6] = LD4;
VP8PredLuma4[7] = VL4;
VP8PredLuma4[8] = HD4;
VP8PredLuma4[9] = HU4;
VP8PredLuma16[0] = DC16;
VP8PredLuma16[1] = TM16;
VP8PredLuma16[2] = VE16;
VP8PredLuma16[3] = HE16;
VP8PredLuma16[4] = DC16NoTop;
VP8PredLuma16[5] = DC16NoLeft;
VP8PredLuma16[6] = DC16NoTopLeft;
VP8PredChroma8[0] = DC8uv;
VP8PredChroma8[1] = TM8uv;
VP8PredChroma8[2] = VE8uv;
VP8PredChroma8[3] = HE8uv;
VP8PredChroma8[4] = DC8uvNoTop;
VP8PredChroma8[5] = DC8uvNoLeft;
VP8PredChroma8[6] = DC8uvNoTopLeft;
VP8DitherCombine8x8 = DitherCombine8x8;
#if !WEBP_NEON_OMIT_C_CODE
VP8TransformWHT = TransformWHT_C;
VP8Transform = TransformTwo_C;
VP8TransformDC = TransformDC_C;
VP8TransformAC3 = TransformAC3_C;
#endif
VP8TransformUV = TransformUV_C;
VP8TransformDCUV = TransformDCUV_C;
#if !WEBP_NEON_OMIT_C_CODE
VP8VFilter16 = VFilter16_C;
VP8VFilter16i = VFilter16i_C;
VP8HFilter16 = HFilter16_C;
VP8VFilter8 = VFilter8_C;
VP8VFilter8i = VFilter8i_C;
VP8SimpleVFilter16 = SimpleVFilter16_C;
VP8SimpleHFilter16 = SimpleHFilter16_C;
VP8SimpleVFilter16i = SimpleVFilter16i_C;
VP8SimpleHFilter16i = SimpleHFilter16i_C;
#endif
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
VP8HFilter16i = HFilter16i_C;
VP8HFilter8 = HFilter8_C;
VP8HFilter8i = HFilter8i_C;
#endif
#if !WEBP_NEON_OMIT_C_CODE
VP8PredLuma4[0] = DC4_C;
VP8PredLuma4[1] = TM4_C;
VP8PredLuma4[2] = VE4_C;
VP8PredLuma4[4] = RD4_C;
VP8PredLuma4[6] = LD4_C;
#endif
VP8PredLuma4[3] = HE4_C;
VP8PredLuma4[5] = VR4_C;
VP8PredLuma4[7] = VL4_C;
VP8PredLuma4[8] = HD4_C;
VP8PredLuma4[9] = HU4_C;
#if !WEBP_NEON_OMIT_C_CODE
VP8PredLuma16[0] = DC16_C;
VP8PredLuma16[1] = TM16_C;
VP8PredLuma16[2] = VE16_C;
VP8PredLuma16[3] = HE16_C;
VP8PredLuma16[4] = DC16NoTop_C;
VP8PredLuma16[5] = DC16NoLeft_C;
VP8PredLuma16[6] = DC16NoTopLeft_C;
VP8PredChroma8[0] = DC8uv_C;
VP8PredChroma8[1] = TM8uv_C;
VP8PredChroma8[2] = VE8uv_C;
VP8PredChroma8[3] = HE8uv_C;
VP8PredChroma8[4] = DC8uvNoTop_C;
VP8PredChroma8[5] = DC8uvNoLeft_C;
VP8PredChroma8[6] = DC8uvNoTopLeft_C;
#endif
VP8DitherCombine8x8 = DitherCombine8x8_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
@ -770,11 +822,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
#endif
}
#endif
#if defined(WEBP_USE_NEON)
if (VP8GetCPUInfo(kNEON)) {
VP8DspInitNEON();
}
#endif
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
VP8DspInitMIPS32();
@ -791,5 +838,57 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
}
#endif
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8DspInitNEON();
}
#endif
assert(VP8TransformWHT != NULL);
assert(VP8Transform != NULL);
assert(VP8TransformDC != NULL);
assert(VP8TransformAC3 != NULL);
assert(VP8TransformUV != NULL);
assert(VP8TransformDCUV != NULL);
assert(VP8VFilter16 != NULL);
assert(VP8HFilter16 != NULL);
assert(VP8VFilter8 != NULL);
assert(VP8HFilter8 != NULL);
assert(VP8VFilter16i != NULL);
assert(VP8HFilter16i != NULL);
assert(VP8VFilter8i != NULL);
assert(VP8HFilter8i != NULL);
assert(VP8SimpleVFilter16 != NULL);
assert(VP8SimpleHFilter16 != NULL);
assert(VP8SimpleVFilter16i != NULL);
assert(VP8SimpleHFilter16i != NULL);
assert(VP8PredLuma4[0] != NULL);
assert(VP8PredLuma4[1] != NULL);
assert(VP8PredLuma4[2] != NULL);
assert(VP8PredLuma4[3] != NULL);
assert(VP8PredLuma4[4] != NULL);
assert(VP8PredLuma4[5] != NULL);
assert(VP8PredLuma4[6] != NULL);
assert(VP8PredLuma4[7] != NULL);
assert(VP8PredLuma4[8] != NULL);
assert(VP8PredLuma4[9] != NULL);
assert(VP8PredLuma16[0] != NULL);
assert(VP8PredLuma16[1] != NULL);
assert(VP8PredLuma16[2] != NULL);
assert(VP8PredLuma16[3] != NULL);
assert(VP8PredLuma16[4] != NULL);
assert(VP8PredLuma16[5] != NULL);
assert(VP8PredLuma16[6] != NULL);
assert(VP8PredChroma8[0] != NULL);
assert(VP8PredChroma8[1] != NULL);
assert(VP8PredChroma8[2] != NULL);
assert(VP8PredChroma8[3] != NULL);
assert(VP8PredChroma8[4] != NULL);
assert(VP8PredChroma8[5] != NULL);
assert(VP8PredChroma8[6] != NULL);
assert(VP8DitherCombine8x8 != NULL);
dec_last_cpuinfo_used = VP8GetCPUInfo;
}
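The block of asserts above is the safety net for libwebp's function-pointer dispatch: the plain-C implementations are only installed when they were compiled (WEBP_NEON_OMIT_C_CODE builds leave them out), the per-CPU init calls may overwrite the pointers, and every pointer must end up non-NULL. A stripped-down sketch of the pattern (hypothetical names, not libwebp API):
#include <assert.h>
#include <stddef.h>
typedef void (*RowFunc)(const unsigned char* src, unsigned char* dst, int len);
static void Row_C(const unsigned char* src, unsigned char* dst, int len) {
  int i;
  for (i = 0; i < len; ++i) dst[i] = src[i];
}
static RowFunc ProcessRow = NULL;
static void DspInit(void) {
#if !defined(OMIT_C_CODE)
  ProcessRow = Row_C;              /* portable fallback */
#endif
  /* a CPU-specific init would run here and may replace ProcessRow */
  assert(ProcessRow != NULL);      /* catch configurations that set nothing */
}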

@ -11,11 +11,14 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#define USE_STATIC_TABLES // undefine to have run-time table initialization
// define to 0 to have run-time table initialization
#if !defined(USE_STATIC_TABLES)
#define USE_STATIC_TABLES 1 // ALTERNATE_CODE
#endif
#ifdef USE_STATIC_TABLES
#if (USE_STATIC_TABLES == 1)
static const uint8_t abs0[255 + 255 + 1] = {
0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4,
@ -337,7 +340,7 @@ static uint8_t clip1[255 + 511 + 1];
// and make sure it's set to true _last_ (so as to be thread-safe)
static volatile int tables_ok = 0;
#endif
#endif // USE_STATIC_TABLES
const int8_t* const VP8ksclip1 = (const int8_t*)&sclip1[1020];
const int8_t* const VP8ksclip2 = (const int8_t*)&sclip2[112];
@ -345,7 +348,7 @@ const uint8_t* const VP8kclip1 = &clip1[255];
const uint8_t* const VP8kabs0 = &abs0[255];
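These exported pointers are deliberately offset into the middle of their tables so that negative indices are legal: VP8kclip1[v], for instance, clamps any v in its supported range to [0, 255] with a single load. A minimal sketch of the same idea (hypothetical names, <stdint.h> types assumed):
static uint8_t clip_table[255 + 511 + 1];          /* covers inputs -255 .. 511 */
static const uint8_t* const kClip1 = &clip_table[255];
static void InitClipTable(void) {
  int i;
  for (i = -255; i <= 511; ++i) {
    clip_table[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : (uint8_t)i;
  }
}
/* usage: kClip1[x] == clamp(x, 0, 255) for any x in [-255, 511] */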
WEBP_TSAN_IGNORE_FUNCTION void VP8InitClipTables(void) {
#if !defined(USE_STATIC_TABLES)
#if (USE_STATIC_TABLES == 0)
int i;
if (!tables_ok) {
for (i = -255; i <= 255; ++i) {
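(The run-time path above fills the same tables that the static-table build ships precomputed; only one of the two paths is compiled, selected by USE_STATIC_TABLES.)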

@ -12,11 +12,11 @@
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS32)
#include "./mips_macro.h"
#include "src/dsp/mips_macro.h"
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;

@ -12,11 +12,11 @@
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "./mips_macro.h"
#include "src/dsp/mips_macro.h"
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;

@ -12,11 +12,11 @@
// Author(s): Prashant Patil (prashant.patil@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MSA)
#include "./msa_macro.h"
#include "src/dsp/msa_macro.h"
//------------------------------------------------------------------------------
// Transforms
@ -222,6 +222,7 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
const v16i8 cnst4b = __msa_ldi_b(4); \
const v16i8 cnst3b = __msa_ldi_b(3); \
const v8i16 cnst9h = __msa_ldi_h(9); \
const v8i16 cnst63h = __msa_ldi_h(63); \
\
FLIP_SIGN4(p1, p0, q0, q1, p1_m, p0_m, q0_m, q1_m); \
filt = __msa_subs_s_b(p1_m, q1_m); \
@ -241,9 +242,9 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l); \
/* update q2/p2 */ \
temp0 = filt_r * cnst9h; \
temp1 = ADDVI_H(temp0, 63); \
temp1 = temp0 + cnst63h; \
temp2 = filt_l * cnst9h; \
temp3 = ADDVI_H(temp2, 63); \
temp3 = temp2 + cnst63h; \
FILT2(q2_m, p2_m, q2, p2); \
/* update q1/p1 */ \
temp1 = temp1 + temp0; \
@ -708,7 +709,7 @@ static void VE4(uint8_t* dst) { // vertical
const uint32_t val0 = LW(ptop + 0);
const uint32_t val1 = LW(ptop + 4);
uint32_t out;
v16u8 A, B, C, AC, B2, R;
v16u8 A = { 0 }, B, C, AC, B2, R;
INSERT_W2_UB(val0, val1, A);
B = SLDI_UB(A, A, 1);
@ -725,7 +726,7 @@ static void RD4(uint8_t* dst) { // Down-right
uint32_t val0 = LW(ptop + 0);
uint32_t val1 = LW(ptop + 4);
uint32_t val2, val3;
v16u8 A, B, C, AC, B2, R, A1;
v16u8 A, B, C, AC, B2, R, A1 = { 0 };
INSERT_W2_UB(val0, val1, A1);
A = SLDI_UB(A1, A1, 12);
@ -753,7 +754,7 @@ static void LD4(uint8_t* dst) { // Down-Left
uint32_t val0 = LW(ptop + 0);
uint32_t val1 = LW(ptop + 4);
uint32_t val2, val3;
v16u8 A, B, C, AC, B2, R;
v16u8 A = { 0 }, B, C, AC, B2, R;
INSERT_W2_UB(val0, val1, A);
B = SLDI_UB(A, A, 1);

File diff suppressed because it is too large

@ -12,23 +12,25 @@
// Author: somnath@google.com (Somnath Banerjee)
// cduvivier@google.com (Christian Duvivier)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
// The 3-coeff sparse transform in SSE2 is not really faster than the plain-C
// one, it seems => disable it by default. Uncomment the following to enable:
// #define USE_TRANSFORM_AC3
#if !defined(USE_TRANSFORM_AC3)
#define USE_TRANSFORM_AC3 0 // ALTERNATE_CODE
#endif
#include <emmintrin.h>
#include "./common_sse2.h"
#include "../dec/vp8i_dec.h"
#include "../utils/utils.h"
#include "src/dsp/common_sse2.h"
#include "src/dec/vp8i_dec.h"
#include "src/utils/utils.h"
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
// This implementation makes use of 16-bit fixed point versions of two
// multiply constants:
// K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@ -193,7 +195,7 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
}
}
#if defined(USE_TRANSFORM_AC3)
#if (USE_TRANSFORM_AC3 == 1)
#define MUL(a, b) (((a) * (b)) >> 16)
static void TransformAC3(const int16_t* in, uint8_t* dst) {
static const int kC1 = 20091 + (1 << 16);
@ -248,7 +250,7 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
_mm_subs_epu8((p), (q)))
// Shift each byte of "x" by 3 bits while preserving the sign bit.
static WEBP_INLINE void SignedShift8b(__m128i* const x) {
static WEBP_INLINE void SignedShift8b_SSE2(__m128i* const x) {
const __m128i zero = _mm_setzero_si128();
const __m128i lo_0 = _mm_unpacklo_epi8(zero, *x);
const __m128i hi_0 = _mm_unpackhi_epi8(zero, *x);
@ -258,8 +260,8 @@ static WEBP_INLINE void SignedShift8b(__m128i* const x) {
}
#define FLIP_SIGN_BIT2(a, b) { \
a = _mm_xor_si128(a, sign_bit); \
b = _mm_xor_si128(b, sign_bit); \
(a) = _mm_xor_si128(a, sign_bit); \
(b) = _mm_xor_si128(b, sign_bit); \
}
#define FLIP_SIGN_BIT4(a, b, c, d) { \
@ -268,11 +270,11 @@ static WEBP_INLINE void SignedShift8b(__m128i* const x) {
}
// input/output is uint8_t
static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
const __m128i* const p0,
const __m128i* const q0,
const __m128i* const q1,
int hev_thresh, __m128i* const not_hev) {
static WEBP_INLINE void GetNotHEV_SSE2(const __m128i* const p1,
const __m128i* const p0,
const __m128i* const q0,
const __m128i* const q1,
int hev_thresh, __m128i* const not_hev) {
const __m128i zero = _mm_setzero_si128();
const __m128i t_1 = MM_ABS(*p1, *p0);
const __m128i t_2 = MM_ABS(*q1, *q0);
@ -285,11 +287,11 @@ static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
}
// input pixels are int8_t
static WEBP_INLINE void GetBaseDelta(const __m128i* const p1,
const __m128i* const p0,
const __m128i* const q0,
const __m128i* const q1,
__m128i* const delta) {
static WEBP_INLINE void GetBaseDelta_SSE2(const __m128i* const p1,
const __m128i* const p0,
const __m128i* const q0,
const __m128i* const q1,
__m128i* const delta) {
// beware of addition order, for saturation!
const __m128i p1_q1 = _mm_subs_epi8(*p1, *q1); // p1 - q1
const __m128i q0_p0 = _mm_subs_epi8(*q0, *p0); // q0 - p0
@ -300,15 +302,16 @@ static WEBP_INLINE void GetBaseDelta(const __m128i* const p1,
}
// input and output are int8_t
static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0,
const __m128i* const fl) {
static WEBP_INLINE void DoSimpleFilter_SSE2(__m128i* const p0,
__m128i* const q0,
const __m128i* const fl) {
const __m128i k3 = _mm_set1_epi8(3);
const __m128i k4 = _mm_set1_epi8(4);
__m128i v3 = _mm_adds_epi8(*fl, k3);
__m128i v4 = _mm_adds_epi8(*fl, k4);
SignedShift8b(&v4); // v4 >> 3
SignedShift8b(&v3); // v3 >> 3
SignedShift8b_SSE2(&v4); // v4 >> 3
SignedShift8b_SSE2(&v3); // v3 >> 3
*q0 = _mm_subs_epi8(*q0, v4); // q0 -= v4
*p0 = _mm_adds_epi8(*p0, v3); // p0 += v3
}
@ -317,9 +320,9 @@ static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0,
// Update operations:
// q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)]
// Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip).
static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi,
const __m128i* const a0_lo,
const __m128i* const a0_hi) {
static WEBP_INLINE void Update2Pixels_SSE2(__m128i* const pi, __m128i* const qi,
const __m128i* const a0_lo,
const __m128i* const a0_hi) {
const __m128i a1_lo = _mm_srai_epi16(*a0_lo, 7);
const __m128i a1_hi = _mm_srai_epi16(*a0_hi, 7);
const __m128i delta = _mm_packs_epi16(a1_lo, a1_hi);
@ -330,11 +333,11 @@ static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi,
}
// input pixels are uint8_t
static WEBP_INLINE void NeedsFilter(const __m128i* const p1,
const __m128i* const p0,
const __m128i* const q0,
const __m128i* const q1,
int thresh, __m128i* const mask) {
static WEBP_INLINE void NeedsFilter_SSE2(const __m128i* const p1,
const __m128i* const p0,
const __m128i* const q0,
const __m128i* const q1,
int thresh, __m128i* const mask) {
const __m128i m_thresh = _mm_set1_epi8(thresh);
const __m128i t1 = MM_ABS(*p1, *q1); // abs(p1 - q1)
const __m128i kFE = _mm_set1_epi8(0xFE);
@ -353,28 +356,29 @@ static WEBP_INLINE void NeedsFilter(const __m128i* const p1,
// Edge filtering functions
// Applies filter on 2 pixels (p0 and q0)
static WEBP_INLINE void DoFilter2(__m128i* const p1, __m128i* const p0,
__m128i* const q0, __m128i* const q1,
int thresh) {
static WEBP_INLINE void DoFilter2_SSE2(__m128i* const p1, __m128i* const p0,
__m128i* const q0, __m128i* const q1,
int thresh) {
__m128i a, mask;
const __m128i sign_bit = _mm_set1_epi8(0x80);
// convert p1/q1 to int8_t (for GetBaseDelta)
// convert p1/q1 to int8_t (for GetBaseDelta_SSE2)
const __m128i p1s = _mm_xor_si128(*p1, sign_bit);
const __m128i q1s = _mm_xor_si128(*q1, sign_bit);
NeedsFilter(p1, p0, q0, q1, thresh, &mask);
NeedsFilter_SSE2(p1, p0, q0, q1, thresh, &mask);
FLIP_SIGN_BIT2(*p0, *q0);
GetBaseDelta(&p1s, p0, q0, &q1s, &a);
GetBaseDelta_SSE2(&p1s, p0, q0, &q1s, &a);
a = _mm_and_si128(a, mask); // mask filter values we don't care about
DoSimpleFilter(p0, q0, &a);
DoSimpleFilter_SSE2(p0, q0, &a);
FLIP_SIGN_BIT2(*p0, *q0);
}
// Applies filter on 4 pixels (p1, p0, q0 and q1)
static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
__m128i* const q0, __m128i* const q1,
const __m128i* const mask, int hev_thresh) {
static WEBP_INLINE void DoFilter4_SSE2(__m128i* const p1, __m128i* const p0,
__m128i* const q0, __m128i* const q1,
const __m128i* const mask,
int hev_thresh) {
const __m128i zero = _mm_setzero_si128();
const __m128i sign_bit = _mm_set1_epi8(0x80);
const __m128i k64 = _mm_set1_epi8(64);
@ -384,7 +388,7 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
__m128i t1, t2, t3;
// compute hev mask
GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);
GetNotHEV_SSE2(p1, p0, q0, q1, hev_thresh, &not_hev);
// convert to signed values
FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
@ -399,8 +403,8 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
t2 = _mm_adds_epi8(t1, k3); // 3 * (q0 - p0) + hev(p1 - q1) + 3
t3 = _mm_adds_epi8(t1, k4); // 3 * (q0 - p0) + hev(p1 - q1) + 4
SignedShift8b(&t2); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
SignedShift8b(&t3); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
SignedShift8b_SSE2(&t2); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
SignedShift8b_SSE2(&t3); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
*p0 = _mm_adds_epi8(*p0, t2); // p0 += t2
*q0 = _mm_subs_epi8(*q0, t3); // q0 -= t3
FLIP_SIGN_BIT2(*p0, *q0);
@ -417,25 +421,26 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
}
// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
__m128i* const p0, __m128i* const q0,
__m128i* const q1, __m128i* const q2,
const __m128i* const mask, int hev_thresh) {
static WEBP_INLINE void DoFilter6_SSE2(__m128i* const p2, __m128i* const p1,
__m128i* const p0, __m128i* const q0,
__m128i* const q1, __m128i* const q2,
const __m128i* const mask,
int hev_thresh) {
const __m128i zero = _mm_setzero_si128();
const __m128i sign_bit = _mm_set1_epi8(0x80);
__m128i a, not_hev;
// compute hev mask
GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);
GetNotHEV_SSE2(p1, p0, q0, q1, hev_thresh, &not_hev);
FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
FLIP_SIGN_BIT2(*p2, *q2);
GetBaseDelta(p1, p0, q0, q1, &a);
GetBaseDelta_SSE2(p1, p0, q0, q1, &a);
{ // do simple filter on pixels with hev
const __m128i m = _mm_andnot_si128(not_hev, *mask);
const __m128i f = _mm_and_si128(a, m);
DoSimpleFilter(p0, q0, &f);
DoSimpleFilter_SSE2(p0, q0, &f);
}
{ // do strong filter on pixels with not hev
@ -460,15 +465,15 @@ static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
const __m128i a0_lo = _mm_add_epi16(a1_lo, f9_lo); // Filter * 27 + 63
const __m128i a0_hi = _mm_add_epi16(a1_hi, f9_hi); // Filter * 27 + 63
Update2Pixels(p2, q2, &a2_lo, &a2_hi);
Update2Pixels(p1, q1, &a1_lo, &a1_hi);
Update2Pixels(p0, q0, &a0_lo, &a0_hi);
Update2Pixels_SSE2(p2, q2, &a2_lo, &a2_hi);
Update2Pixels_SSE2(p1, q1, &a1_lo, &a1_hi);
Update2Pixels_SSE2(p0, q0, &a0_lo, &a0_hi);
}
}
// reads 8 rows across a vertical edge.
static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride,
__m128i* const p, __m128i* const q) {
static WEBP_INLINE void Load8x4_SSE2(const uint8_t* const b, int stride,
__m128i* const p, __m128i* const q) {
// A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00
// A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10
const __m128i A0 = _mm_set_epi32(
@ -494,11 +499,11 @@ static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride,
*q = _mm_unpackhi_epi32(C0, C1);
}
static WEBP_INLINE void Load16x4(const uint8_t* const r0,
const uint8_t* const r8,
int stride,
__m128i* const p1, __m128i* const p0,
__m128i* const q0, __m128i* const q1) {
static WEBP_INLINE void Load16x4_SSE2(const uint8_t* const r0,
const uint8_t* const r8,
int stride,
__m128i* const p1, __m128i* const p0,
__m128i* const q0, __m128i* const q1) {
// Assume the pixels around the edge (|) are numbered as follows
// 00 01 | 02 03
// 10 11 | 12 13
@ -514,8 +519,8 @@ static WEBP_INLINE void Load16x4(const uint8_t* const r0,
// q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
// p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
// q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
Load8x4(r0, stride, p1, q0);
Load8x4(r8, stride, p0, q1);
Load8x4_SSE2(r0, stride, p1, q0);
Load8x4_SSE2(r8, stride, p0, q1);
{
// p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
@ -531,7 +536,8 @@ static WEBP_INLINE void Load16x4(const uint8_t* const r0,
}
}
static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
static WEBP_INLINE void Store4x4_SSE2(__m128i* const x,
uint8_t* dst, int stride) {
int i;
for (i = 0; i < 4; ++i, dst += stride) {
WebPUint32ToMem(dst, _mm_cvtsi128_si32(*x));
@ -540,12 +546,12 @@ static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
}
// Transpose back and store
static WEBP_INLINE void Store16x4(const __m128i* const p1,
const __m128i* const p0,
const __m128i* const q0,
const __m128i* const q1,
uint8_t* r0, uint8_t* r8,
int stride) {
static WEBP_INLINE void Store16x4_SSE2(const __m128i* const p1,
const __m128i* const p0,
const __m128i* const q0,
const __m128i* const q1,
uint8_t* r0, uint8_t* r8,
int stride) {
__m128i t1, p1_s, p0_s, q0_s, q1_s;
// p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
@ -572,55 +578,55 @@ static WEBP_INLINE void Store16x4(const __m128i* const p1,
p1_s = _mm_unpacklo_epi16(t1, q1_s);
q1_s = _mm_unpackhi_epi16(t1, q1_s);
Store4x4(&p0_s, r0, stride);
Store4x4_SSE2(&p0_s, r0, stride);
r0 += 4 * stride;
Store4x4(&q0_s, r0, stride);
Store4x4_SSE2(&q0_s, r0, stride);
Store4x4(&p1_s, r8, stride);
Store4x4_SSE2(&p1_s, r8, stride);
r8 += 4 * stride;
Store4x4(&q1_s, r8, stride);
Store4x4_SSE2(&q1_s, r8, stride);
}
//------------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)
static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
static void SimpleVFilter16_SSE2(uint8_t* p, int stride, int thresh) {
// Load
__m128i p1 = _mm_loadu_si128((__m128i*)&p[-2 * stride]);
__m128i p0 = _mm_loadu_si128((__m128i*)&p[-stride]);
__m128i q0 = _mm_loadu_si128((__m128i*)&p[0]);
__m128i q1 = _mm_loadu_si128((__m128i*)&p[stride]);
DoFilter2(&p1, &p0, &q0, &q1, thresh);
DoFilter2_SSE2(&p1, &p0, &q0, &q1, thresh);
// Store
_mm_storeu_si128((__m128i*)&p[-stride], p0);
_mm_storeu_si128((__m128i*)&p[0], q0);
}
static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
static void SimpleHFilter16_SSE2(uint8_t* p, int stride, int thresh) {
__m128i p1, p0, q0, q1;
p -= 2; // beginning of p1
Load16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
DoFilter2(&p1, &p0, &q0, &q1, thresh);
Store16x4(&p1, &p0, &q0, &q1, p, p + 8 * stride, stride);
Load16x4_SSE2(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
DoFilter2_SSE2(&p1, &p0, &q0, &q1, thresh);
Store16x4_SSE2(&p1, &p0, &q0, &q1, p, p + 8 * stride, stride);
}
static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
static void SimpleVFilter16i_SSE2(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4 * stride;
SimpleVFilter16(p, stride, thresh);
SimpleVFilter16_SSE2(p, stride, thresh);
}
}
static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
static void SimpleHFilter16i_SSE2(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4;
SimpleHFilter16(p, stride, thresh);
SimpleHFilter16_SSE2(p, stride, thresh);
}
}
@ -628,60 +634,60 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
// Complex In-loop filtering (Paragraph 15.3)
#define MAX_DIFF1(p3, p2, p1, p0, m) do { \
m = MM_ABS(p1, p0); \
m = _mm_max_epu8(m, MM_ABS(p3, p2)); \
m = _mm_max_epu8(m, MM_ABS(p2, p1)); \
(m) = MM_ABS(p1, p0); \
(m) = _mm_max_epu8(m, MM_ABS(p3, p2)); \
(m) = _mm_max_epu8(m, MM_ABS(p2, p1)); \
} while (0)
#define MAX_DIFF2(p3, p2, p1, p0, m) do { \
m = _mm_max_epu8(m, MM_ABS(p1, p0)); \
m = _mm_max_epu8(m, MM_ABS(p3, p2)); \
m = _mm_max_epu8(m, MM_ABS(p2, p1)); \
(m) = _mm_max_epu8(m, MM_ABS(p1, p0)); \
(m) = _mm_max_epu8(m, MM_ABS(p3, p2)); \
(m) = _mm_max_epu8(m, MM_ABS(p2, p1)); \
} while (0)
#define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) { \
e1 = _mm_loadu_si128((__m128i*)&(p)[0 * stride]); \
e2 = _mm_loadu_si128((__m128i*)&(p)[1 * stride]); \
e3 = _mm_loadu_si128((__m128i*)&(p)[2 * stride]); \
e4 = _mm_loadu_si128((__m128i*)&(p)[3 * stride]); \
(e1) = _mm_loadu_si128((__m128i*)&(p)[0 * (stride)]); \
(e2) = _mm_loadu_si128((__m128i*)&(p)[1 * (stride)]); \
(e3) = _mm_loadu_si128((__m128i*)&(p)[2 * (stride)]); \
(e4) = _mm_loadu_si128((__m128i*)&(p)[3 * (stride)]); \
}
#define LOADUV_H_EDGE(p, u, v, stride) do { \
const __m128i U = _mm_loadl_epi64((__m128i*)&(u)[(stride)]); \
const __m128i V = _mm_loadl_epi64((__m128i*)&(v)[(stride)]); \
p = _mm_unpacklo_epi64(U, V); \
(p) = _mm_unpacklo_epi64(U, V); \
} while (0)
#define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) { \
LOADUV_H_EDGE(e1, u, v, 0 * stride); \
LOADUV_H_EDGE(e2, u, v, 1 * stride); \
LOADUV_H_EDGE(e3, u, v, 2 * stride); \
LOADUV_H_EDGE(e4, u, v, 3 * stride); \
LOADUV_H_EDGE(e1, u, v, 0 * (stride)); \
LOADUV_H_EDGE(e2, u, v, 1 * (stride)); \
LOADUV_H_EDGE(e3, u, v, 2 * (stride)); \
LOADUV_H_EDGE(e4, u, v, 3 * (stride)); \
}
#define STOREUV(p, u, v, stride) { \
_mm_storel_epi64((__m128i*)&u[(stride)], p); \
p = _mm_srli_si128(p, 8); \
_mm_storel_epi64((__m128i*)&v[(stride)], p); \
_mm_storel_epi64((__m128i*)&(u)[(stride)], p); \
(p) = _mm_srli_si128(p, 8); \
_mm_storel_epi64((__m128i*)&(v)[(stride)], p); \
}
static WEBP_INLINE void ComplexMask(const __m128i* const p1,
const __m128i* const p0,
const __m128i* const q0,
const __m128i* const q1,
int thresh, int ithresh,
__m128i* const mask) {
static WEBP_INLINE void ComplexMask_SSE2(const __m128i* const p1,
const __m128i* const p0,
const __m128i* const q0,
const __m128i* const q1,
int thresh, int ithresh,
__m128i* const mask) {
const __m128i it = _mm_set1_epi8(ithresh);
const __m128i diff = _mm_subs_epu8(*mask, it);
const __m128i thresh_mask = _mm_cmpeq_epi8(diff, _mm_setzero_si128());
__m128i filter_mask;
NeedsFilter(p1, p0, q0, q1, thresh, &filter_mask);
NeedsFilter_SSE2(p1, p0, q0, q1, thresh, &filter_mask);
*mask = _mm_and_si128(thresh_mask, filter_mask);
}
// on macroblock edges
static void VFilter16(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter16_SSE2(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
__m128i t1;
__m128i mask;
__m128i p2, p1, p0, q0, q1, q2;
@ -694,8 +700,8 @@ static void VFilter16(uint8_t* p, int stride,
LOAD_H_EDGES4(p, stride, q0, q1, q2, t1);
MAX_DIFF2(t1, q2, q1, q0, mask);
ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
// Store
_mm_storeu_si128((__m128i*)&p[-3 * stride], p2);
@ -706,28 +712,28 @@ static void VFilter16(uint8_t* p, int stride,
_mm_storeu_si128((__m128i*)&p[+2 * stride], q2);
}
static void HFilter16(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
static void HFilter16_SSE2(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i p3, p2, p1, p0, q0, q1, q2, q3;
uint8_t* const b = p - 4;
Load16x4(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0); // p3, p2, p1, p0
Load16x4_SSE2(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);
MAX_DIFF1(p3, p2, p1, p0, mask);
Load16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3); // q0, q1, q2, q3
Load16x4_SSE2(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);
MAX_DIFF2(q3, q2, q1, q0, mask);
ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
Store16x4(&p3, &p2, &p1, &p0, b, b + 8 * stride, stride);
Store16x4(&q0, &q1, &q2, &q3, p, p + 8 * stride, stride);
Store16x4_SSE2(&p3, &p2, &p1, &p0, b, b + 8 * stride, stride);
Store16x4_SSE2(&q0, &q1, &q2, &q3, p, p + 8 * stride, stride);
}
// on three inner edges
static void VFilter16i(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter16i_SSE2(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
int k;
__m128i p3, p2, p1, p0; // loop invariants
@ -744,8 +750,8 @@ static void VFilter16i(uint8_t* p, int stride,
// p3 and p2 are not just temporary variables here: they will be
// re-used for next span. And q2/q3 will become p1/p0 accordingly.
ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
ComplexMask_SSE2(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
DoFilter4_SSE2(&p1, &p0, &p3, &p2, &mask, hev_thresh);
// Store
_mm_storeu_si128((__m128i*)&b[0 * stride], p1);
@ -759,12 +765,12 @@ static void VFilter16i(uint8_t* p, int stride,
}
}
static void HFilter16i(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
static void HFilter16i_SSE2(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
int k;
__m128i p3, p2, p1, p0; // loop invariants
Load16x4(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0); // prologue
Load16x4_SSE2(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0); // prologue
for (k = 3; k > 0; --k) {
__m128i mask, tmp1, tmp2;
@ -773,13 +779,13 @@ static void HFilter16i(uint8_t* p, int stride,
p += 4; // beginning of q0 (and next span)
MAX_DIFF1(p3, p2, p1, p0, mask); // compute partial mask
Load16x4(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
Load16x4_SSE2(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
MAX_DIFF2(p3, p2, tmp1, tmp2, mask);
ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
ComplexMask_SSE2(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
DoFilter4_SSE2(&p1, &p0, &p3, &p2, &mask, hev_thresh);
Store16x4(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);
Store16x4_SSE2(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);
// rotate samples
p1 = tmp1;
@ -788,8 +794,8 @@ static void HFilter16i(uint8_t* p, int stride,
}
// 8-pixels wide variant, for chroma filtering
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i t1, p2, p1, p0, q0, q1, q2;
@ -801,8 +807,8 @@ static void VFilter8(uint8_t* u, uint8_t* v, int stride,
LOADUV_H_EDGES4(u, v, stride, q0, q1, q2, t1);
MAX_DIFF2(t1, q2, q1, q0, mask);
ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
// Store
STOREUV(p2, u, v, -3 * stride);
@ -813,28 +819,28 @@ static void VFilter8(uint8_t* u, uint8_t* v, int stride,
STOREUV(q2, u, v, 2 * stride);
}
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i p3, p2, p1, p0, q0, q1, q2, q3;
uint8_t* const tu = u - 4;
uint8_t* const tv = v - 4;
Load16x4(tu, tv, stride, &p3, &p2, &p1, &p0); // p3, p2, p1, p0
Load16x4_SSE2(tu, tv, stride, &p3, &p2, &p1, &p0);
MAX_DIFF1(p3, p2, p1, p0, mask);
Load16x4(u, v, stride, &q0, &q1, &q2, &q3); // q0, q1, q2, q3
Load16x4_SSE2(u, v, stride, &q0, &q1, &q2, &q3);
MAX_DIFF2(q3, q2, q1, q0, mask);
ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
Store16x4(&p3, &p2, &p1, &p0, tu, tv, stride);
Store16x4(&q0, &q1, &q2, &q3, u, v, stride);
Store16x4_SSE2(&p3, &p2, &p1, &p0, tu, tv, stride);
Store16x4_SSE2(&q0, &q1, &q2, &q3, u, v, stride);
}
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i t1, t2, p1, p0, q0, q1;
@ -849,8 +855,8 @@ static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
LOADUV_H_EDGES4(u, v, stride, q0, q1, t1, t2);
MAX_DIFF2(t2, t1, q1, q0, mask);
ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter4_SSE2(&p1, &p0, &q0, &q1, &mask, hev_thresh);
// Store
STOREUV(p1, u, v, -2 * stride);
@ -859,24 +865,24 @@ static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
STOREUV(q1, u, v, 1 * stride);
}
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void HFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i t1, t2, p1, p0, q0, q1;
Load16x4(u, v, stride, &t2, &t1, &p1, &p0); // p3, p2, p1, p0
Load16x4_SSE2(u, v, stride, &t2, &t1, &p1, &p0); // p3, p2, p1, p0
MAX_DIFF1(t2, t1, p1, p0, mask);
u += 4; // beginning of q0
v += 4;
Load16x4(u, v, stride, &q0, &q1, &t1, &t2); // q0, q1, q2, q3
Load16x4_SSE2(u, v, stride, &q0, &q1, &t1, &t2); // q0, q1, q2, q3
MAX_DIFF2(t2, t1, q1, q0, mask);
ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter4_SSE2(&p1, &p0, &q0, &q1, &mask, hev_thresh);
u -= 2; // beginning of p1
v -= 2;
Store16x4(&p1, &p0, &q0, &q1, u, v, stride);
Store16x4_SSE2(&p1, &p0, &q0, &q1, u, v, stride);
}
//------------------------------------------------------------------------------
@ -893,7 +899,7 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
// where: AC = (a + b + 1) >> 1, BC = (b + c + 1) >> 1
// and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1
static void VE4(uint8_t* dst) { // vertical
static void VE4_SSE2(uint8_t* dst) { // vertical
const __m128i one = _mm_set1_epi8(1);
const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@ -909,7 +915,7 @@ static void VE4(uint8_t* dst) { // vertical
}
}
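The 4x4 predictors in this hunk (VE4/LD4/VR4/VL4/RD4) lean on an exact 8-bit identity for the 3-tap rounding average, built from _mm_avg_epu8 plus an lsb correction. A standalone scalar check of that identity, assuming nothing beyond the code visible above:

    #include <assert.h>
    /* avg() matches _mm_avg_epu8: (x + y + 1) >> 1. The identity verified is */
    /*   (a + 2*b + c + 2) >> 2  ==  avg(avg(a, c) - ((a ^ c) & 1), b)        */
    static int avg(int x, int y) { return (x + y + 1) >> 1; }
    int main(void) {
      int a, b, c;
      for (a = 0; a < 256; ++a)
        for (b = 0; b < 256; ++b)
          for (c = 0; c < 256; ++c)
            assert(((a + 2 * b + c + 2) >> 2) == avg(avg(a, c) - ((a ^ c) & 1), b));
      return 0;
    }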
static void LD4(uint8_t* dst) { // Down-Left
static void LD4_SSE2(uint8_t* dst) { // Down-Left
const __m128i one = _mm_set1_epi8(1);
const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@ -925,7 +931,7 @@ static void LD4(uint8_t* dst) { // Down-Left
WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
}
static void VR4(uint8_t* dst) { // Vertical-Right
static void VR4_SSE2(uint8_t* dst) { // Vertical-Right
const __m128i one = _mm_set1_epi8(1);
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
@ -950,7 +956,7 @@ static void VR4(uint8_t* dst) { // Vertical-Right
DST(0, 3) = AVG3(K, J, I);
}
static void VL4(uint8_t* dst) { // Vertical-Left
static void VL4_SSE2(uint8_t* dst) { // Vertical-Left
const __m128i one = _mm_set1_epi8(1);
const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
@ -975,7 +981,7 @@ static void VL4(uint8_t* dst) { // Vertical-Left
DST(3, 3) = (extra_out >> 8) & 0xff;
}
static void RD4(uint8_t* dst) { // Down-right
static void RD4_SSE2(uint8_t* dst) { // Down-right
const __m128i one = _mm_set1_epi8(1);
const __m128i XABCD = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
const __m128i ____XABCD = _mm_slli_si128(XABCD, 4);
@ -1004,7 +1010,7 @@ static void RD4(uint8_t* dst) { // Down-right
//------------------------------------------------------------------------------
// Luma 16x16
static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, int size) {
const uint8_t* top = dst - BPS;
const __m128i zero = _mm_setzero_si128();
int y;
@ -1041,11 +1047,11 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
}
}
static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }
static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
static void TM16(uint8_t* dst) { TrueMotion(dst, 16); }
static void TM4_SSE2(uint8_t* dst) { TrueMotion_SSE2(dst, 4); }
static void TM8uv_SSE2(uint8_t* dst) { TrueMotion_SSE2(dst, 8); }
static void TM16_SSE2(uint8_t* dst) { TrueMotion_SSE2(dst, 16); }
static void VE16(uint8_t* dst) {
static void VE16_SSE2(uint8_t* dst) {
const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
int j;
for (j = 0; j < 16; ++j) {
@ -1053,7 +1059,7 @@ static void VE16(uint8_t* dst) {
}
}
static void HE16(uint8_t* dst) { // horizontal
static void HE16_SSE2(uint8_t* dst) { // horizontal
int j;
for (j = 16; j > 0; --j) {
const __m128i values = _mm_set1_epi8(dst[-1]);
@ -1062,7 +1068,7 @@ static void HE16(uint8_t* dst) { // horizontal
}
}
static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
int j;
const __m128i values = _mm_set1_epi8(v);
for (j = 0; j < 16; ++j) {
@ -1070,7 +1076,7 @@ static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
}
}
static void DC16(uint8_t* dst) { // DC
static void DC16_SSE2(uint8_t* dst) { // DC
const __m128i zero = _mm_setzero_si128();
const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
const __m128i sad8x2 = _mm_sad_epu8(top, zero);
@ -1083,37 +1089,37 @@ static void DC16(uint8_t* dst) { // DC
}
{
const int DC = _mm_cvtsi128_si32(sum) + left + 16;
Put16(DC >> 5, dst);
Put16_SSE2(DC >> 5, dst);
}
}
static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
static void DC16NoTop_SSE2(uint8_t* dst) { // DC with top samples unavailable
int DC = 8;
int j;
for (j = 0; j < 16; ++j) {
DC += dst[-1 + j * BPS];
}
Put16(DC >> 4, dst);
Put16_SSE2(DC >> 4, dst);
}
static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available
static void DC16NoLeft_SSE2(uint8_t* dst) { // DC with left samples unavailable
const __m128i zero = _mm_setzero_si128();
const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
const __m128i sad8x2 = _mm_sad_epu8(top, zero);
// sum the two sads: sad8x2[0:1] + sad8x2[8:9]
const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
const int DC = _mm_cvtsi128_si32(sum) + 8;
Put16(DC >> 4, dst);
Put16_SSE2(DC >> 4, dst);
}
static void DC16NoTopLeft(uint8_t* dst) { // DC with no top and left samples
Put16(0x80, dst);
static void DC16NoTopLeft_SSE2(uint8_t* dst) { // DC with no top & left samples
Put16_SSE2(0x80, dst);
}
//------------------------------------------------------------------------------
// Chroma
static void VE8uv(uint8_t* dst) { // vertical
static void VE8uv_SSE2(uint8_t* dst) { // vertical
int j;
const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
for (j = 0; j < 8; ++j) {
@ -1121,17 +1127,8 @@ static void VE8uv(uint8_t* dst) { // vertical
}
}
static void HE8uv(uint8_t* dst) { // horizontal
int j;
for (j = 0; j < 8; ++j) {
const __m128i values = _mm_set1_epi8(dst[-1]);
_mm_storel_epi64((__m128i*)dst, values);
dst += BPS;
}
}
// helper for chroma-DC predictions
static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
int j;
const __m128i values = _mm_set1_epi8(v);
for (j = 0; j < 8; ++j) {
@ -1139,7 +1136,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
}
}
static void DC8uv(uint8_t* dst) { // DC
static void DC8uv_SSE2(uint8_t* dst) { // DC
const __m128i zero = _mm_setzero_si128();
const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
const __m128i sum = _mm_sad_epu8(top, zero);
@ -1150,29 +1147,29 @@ static void DC8uv(uint8_t* dst) { // DC
}
{
const int DC = _mm_cvtsi128_si32(sum) + left + 8;
Put8x8uv(DC >> 4, dst);
Put8x8uv_SSE2(DC >> 4, dst);
}
}
static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
static void DC8uvNoLeft_SSE2(uint8_t* dst) { // DC with no left samples
const __m128i zero = _mm_setzero_si128();
const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
const __m128i sum = _mm_sad_epu8(top, zero);
const int DC = _mm_cvtsi128_si32(sum) + 4;
Put8x8uv(DC >> 3, dst);
Put8x8uv_SSE2(DC >> 3, dst);
}
static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
static void DC8uvNoTop_SSE2(uint8_t* dst) { // DC with no top samples
int dc0 = 4;
int i;
for (i = 0; i < 8; ++i) {
dc0 += dst[-1 + i * BPS];
}
Put8x8uv(dc0 >> 3, dst);
Put8x8uv_SSE2(dc0 >> 3, dst);
}
static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing
Put8x8uv(0x80, dst);
static void DC8uvNoTopLeft_SSE2(uint8_t* dst) { // DC with nothing
Put8x8uv_SSE2(0x80, dst);
}
//------------------------------------------------------------------------------
@ -1181,47 +1178,46 @@ static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing
extern void VP8DspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE2(void) {
VP8Transform = Transform;
#if defined(USE_TRANSFORM_AC3)
VP8TransformAC3 = TransformAC3;
VP8Transform = Transform_SSE2;
#if (USE_TRANSFORM_AC3 == 1)
VP8TransformAC3 = TransformAC3_SSE2;
#endif
VP8VFilter16 = VFilter16;
VP8HFilter16 = HFilter16;
VP8VFilter8 = VFilter8;
VP8HFilter8 = HFilter8;
VP8VFilter16i = VFilter16i;
VP8HFilter16i = HFilter16i;
VP8VFilter8i = VFilter8i;
VP8HFilter8i = HFilter8i;
VP8SimpleVFilter16 = SimpleVFilter16;
VP8SimpleHFilter16 = SimpleHFilter16;
VP8SimpleVFilter16i = SimpleVFilter16i;
VP8SimpleHFilter16i = SimpleHFilter16i;
VP8PredLuma4[1] = TM4;
VP8PredLuma4[2] = VE4;
VP8PredLuma4[4] = RD4;
VP8PredLuma4[5] = VR4;
VP8PredLuma4[6] = LD4;
VP8PredLuma4[7] = VL4;
VP8PredLuma16[0] = DC16;
VP8PredLuma16[1] = TM16;
VP8PredLuma16[2] = VE16;
VP8PredLuma16[3] = HE16;
VP8PredLuma16[4] = DC16NoTop;
VP8PredLuma16[5] = DC16NoLeft;
VP8PredLuma16[6] = DC16NoTopLeft;
VP8PredChroma8[0] = DC8uv;
VP8PredChroma8[1] = TM8uv;
VP8PredChroma8[2] = VE8uv;
VP8PredChroma8[3] = HE8uv;
VP8PredChroma8[4] = DC8uvNoTop;
VP8PredChroma8[5] = DC8uvNoLeft;
VP8PredChroma8[6] = DC8uvNoTopLeft;
VP8VFilter16 = VFilter16_SSE2;
VP8HFilter16 = HFilter16_SSE2;
VP8VFilter8 = VFilter8_SSE2;
VP8HFilter8 = HFilter8_SSE2;
VP8VFilter16i = VFilter16i_SSE2;
VP8HFilter16i = HFilter16i_SSE2;
VP8VFilter8i = VFilter8i_SSE2;
VP8HFilter8i = HFilter8i_SSE2;
VP8SimpleVFilter16 = SimpleVFilter16_SSE2;
VP8SimpleHFilter16 = SimpleHFilter16_SSE2;
VP8SimpleVFilter16i = SimpleVFilter16i_SSE2;
VP8SimpleHFilter16i = SimpleHFilter16i_SSE2;
VP8PredLuma4[1] = TM4_SSE2;
VP8PredLuma4[2] = VE4_SSE2;
VP8PredLuma4[4] = RD4_SSE2;
VP8PredLuma4[5] = VR4_SSE2;
VP8PredLuma4[6] = LD4_SSE2;
VP8PredLuma4[7] = VL4_SSE2;
VP8PredLuma16[0] = DC16_SSE2;
VP8PredLuma16[1] = TM16_SSE2;
VP8PredLuma16[2] = VE16_SSE2;
VP8PredLuma16[3] = HE16_SSE2;
VP8PredLuma16[4] = DC16NoTop_SSE2;
VP8PredLuma16[5] = DC16NoLeft_SSE2;
VP8PredLuma16[6] = DC16NoTopLeft_SSE2;
VP8PredChroma8[0] = DC8uv_SSE2;
VP8PredChroma8[1] = TM8uv_SSE2;
VP8PredChroma8[2] = VE8uv_SSE2;
VP8PredChroma8[4] = DC8uvNoTop_SSE2;
VP8PredChroma8[5] = DC8uvNoLeft_SSE2;
VP8PredChroma8[6] = DC8uvNoTopLeft_SSE2;
}
#else // !WEBP_USE_SSE2

@ -11,15 +11,15 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE41)
#include <smmintrin.h>
#include "../dec/vp8i_dec.h"
#include "../utils/utils.h"
#include "src/dec/vp8i_dec.h"
#include "src/utils/utils.h"
static void HE16(uint8_t* dst) { // horizontal
static void HE16_SSE41(uint8_t* dst) { // horizontal
int j;
const __m128i kShuffle3 = _mm_set1_epi8(3);
for (j = 16; j > 0; --j) {
@ -36,7 +36,7 @@ static void HE16(uint8_t* dst) { // horizontal
extern void VP8DspInitSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE41(void) {
VP8PredLuma16[3] = HE16;
VP8PredLuma16[3] = HE16_SSE41;
}
#else // !WEBP_USE_SSE41

@ -15,10 +15,10 @@
#define WEBP_DSP_DSP_H_
#ifdef HAVE_CONFIG_H
#include "../webp/config.h"
#include "src/webp/config.h"
#endif
#include "../webp/types.h"
#include "src/webp/types.h"
#ifdef __cplusplus
extern "C" {
@ -38,10 +38,22 @@ extern "C" {
# define LOCAL_GCC_PREREQ(maj, min) 0
#endif
#if defined(__clang__)
# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
# define LOCAL_CLANG_PREREQ(maj, min) \
(LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
#else
# define LOCAL_CLANG_VERSION 0
# define LOCAL_CLANG_PREREQ(maj, min) 0
#endif
#ifndef __has_builtin
# define __has_builtin(x) 0
#endif
// for now, none of the optimizations below are available in emscripten
#if !defined(EMSCRIPTEN)
#if defined(_MSC_VER) && _MSC_VER > 1310 && \
(defined(_M_X64) || defined(_M_IX86))
#define WEBP_MSC_SSE2 // Visual C++ SSE2 targets
@ -68,18 +80,20 @@ extern "C" {
#define WEBP_USE_AVX2
#endif
#if defined(__ANDROID__) && defined(__ARM_ARCH_7A__)
#define WEBP_ANDROID_NEON // Android targets that might support NEON
#endif
// The intrinsics currently cause compiler errors with arm-nacl-gcc and the
// inline assembly would need to be modified for use with Native Client.
#if (defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON) || \
#if (defined(__ARM_NEON__) || \
defined(__aarch64__) || defined(WEBP_HAVE_NEON)) && \
!defined(__native_client__)
#define WEBP_USE_NEON
#endif
#if !defined(WEBP_USE_NEON) && defined(__ANDROID__) && \
defined(__ARM_ARCH_7A__) && defined(HAVE_CPU_FEATURES_H)
#define WEBP_ANDROID_NEON // Android targets that may have NEON
#define WEBP_USE_NEON
#endif
#if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)
#define WEBP_USE_NEON
#define WEBP_USE_INTRINSICS
@ -90,7 +104,7 @@ extern "C" {
#define WEBP_USE_MIPS32
#if (__mips_isa_rev >= 2)
#define WEBP_USE_MIPS32_R2
#if defined(__mips_dspr2) || (__mips_dsp_rev >= 2)
#if defined(__mips_dspr2) || (defined(__mips_dsp_rev) && __mips_dsp_rev >= 2)
#define WEBP_USE_MIPS_DSP_R2
#endif
#endif
@ -100,6 +114,24 @@ extern "C" {
#define WEBP_USE_MSA
#endif
#endif /* EMSCRIPTEN */
#ifndef WEBP_DSP_OMIT_C_CODE
#define WEBP_DSP_OMIT_C_CODE 1
#endif
#if (defined(__aarch64__) || defined(__ARM_NEON__)) && WEBP_DSP_OMIT_C_CODE
#define WEBP_NEON_OMIT_C_CODE 1
#else
#define WEBP_NEON_OMIT_C_CODE 0
#endif
#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
#define WEBP_NEON_WORK_AROUND_GCC 1
#else
#define WEBP_NEON_WORK_AROUND_GCC 0
#endif
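The new WEBP_DSP_OMIT_C_CODE / WEBP_NEON_OMIT_C_CODE switches above let NEON-capable builds drop the plain-C reference paths entirely (the `#if !WEBP_NEON_OMIT_C_CODE` sections in enc.c further down compile out), while WEBP_NEON_WORK_AROUND_GCC keeps the C quantizer around for older GCC NEON builds (see the `#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC` guard below). Under my reading of the guards, a build that wants to keep the C fallbacks on an ARM target, e.g. for cross-checking, only needs to pre-define the master switch (typically -DWEBP_DSP_OMIT_C_CODE=0 on the compiler command line):

    /* hypothetical build tweak, not part of the patch: keep the plain-C     */
    /* code paths even when NEON is enabled                                  */
    #define WEBP_DSP_OMIT_C_CODE 0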
// This macro prevents thread_sanitizer from reporting known concurrent writes.
#define WEBP_TSAN_IGNORE_FUNCTION
#if defined(__has_feature)
@ -129,6 +161,11 @@ extern "C" {
#endif
#endif
// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility)
#if !defined(WEBP_SWAP_16BIT_CSP)
#define WEBP_SWAP_16BIT_CSP 0
#endif
typedef enum {
kSSE2,
kSSE3,
@ -143,7 +180,7 @@ typedef enum {
} CPUFeature;
// returns true if the CPU supports the feature.
typedef int (*VP8CPUInfo)(CPUFeature feature);
WEBP_EXTERN(VP8CPUInfo) VP8GetCPUInfo;
WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo;
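VP8GetCPUInfo is the hook every per-architecture init routine consults before swapping in its function pointers. Schematically it is used like this (illustrative only, mirroring the VP8SSIMDspInit/VP8EncDspInit bodies elsewhere in this patch, not new code):

    if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSSE2)) {
      VP8DspInitSSE2();   /* overwrite the default C pointers with the _SSE2 ones */
    }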
//------------------------------------------------------------------------------
// Init stub generator
@ -271,6 +308,7 @@ typedef double (*VP8SSIMGetClippedFunc)(const uint8_t* src1, int stride1,
int xo, int yo, // center position
int W, int H); // plane dimension
#if !defined(WEBP_REDUCE_SIZE)
// This version is called with the guarantee that you can load 8 bytes and
// 8 rows at offset src1 and src2
typedef double (*VP8SSIMGetFunc)(const uint8_t* src1, int stride1,
@ -278,10 +316,13 @@ typedef double (*VP8SSIMGetFunc)(const uint8_t* src1, int stride1,
extern VP8SSIMGetFunc VP8SSIMGet; // unclipped / unchecked
extern VP8SSIMGetClippedFunc VP8SSIMGetClipped; // with clipping
#endif
#if !defined(WEBP_DISABLE_STATS)
typedef uint32_t (*VP8AccumulateSSEFunc)(const uint8_t* src1,
const uint8_t* src2, int len);
extern VP8AccumulateSSEFunc VP8AccumulateSSE;
#endif
// must be called before using any of the above directly
void VP8SSIMDspInit(void);
@ -462,12 +503,12 @@ extern WebPRescalerExportRowFunc WebPRescalerExportRowExpand;
extern WebPRescalerExportRowFunc WebPRescalerExportRowShrink;
// Plain-C implementation, as fall-back.
extern void WebPRescalerImportRowExpandC(struct WebPRescaler* const wrk,
const uint8_t* src);
extern void WebPRescalerImportRowShrinkC(struct WebPRescaler* const wrk,
const uint8_t* src);
extern void WebPRescalerExportRowExpandC(struct WebPRescaler* const wrk);
extern void WebPRescalerExportRowShrinkC(struct WebPRescaler* const wrk);
extern void WebPRescalerImportRowExpand_C(struct WebPRescaler* const wrk,
const uint8_t* src);
extern void WebPRescalerImportRowShrink_C(struct WebPRescaler* const wrk,
const uint8_t* src);
extern void WebPRescalerExportRowExpand_C(struct WebPRescaler* const wrk);
extern void WebPRescalerExportRowShrink_C(struct WebPRescaler* const wrk);
// Main entry calls:
extern void WebPRescalerImportRow(struct WebPRescaler* const wrk,
@ -533,24 +574,21 @@ void WebPMultRows(uint8_t* ptr, int stride,
int width, int num_rows, int inverse);
// Plain-C versions, used as fallback by some implementations.
void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
int width, int inverse);
void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse);
// To be called first before using the above.
void WebPInitAlphaProcessing(void);
// ARGB packing function: a/r/g/b input is rgba or bgra order.
extern void (*VP8PackARGB)(const uint8_t* a, const uint8_t* r,
const uint8_t* g, const uint8_t* b, int len,
uint32_t* out);
void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
int width, int inverse);
void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse);
// RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
extern void (*VP8PackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out);
extern void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out);
// This function returns true if src[i] contains a value different from 0xff.
extern int (*WebPHasAlpha8b)(const uint8_t* src, int length);
// This function returns true if src[4*i] contains a value different from 0xff.
extern int (*WebPHasAlpha32b)(const uint8_t* src, int length);
// To be called first before using the above.
void VP8EncDspARGBInit(void);
void WebPInitAlphaProcessing(void);
//------------------------------------------------------------------------------
// Filter functions

@ -14,16 +14,18 @@
#include <assert.h>
#include <stdlib.h> // for abs()
#include "./dsp.h"
#include "../enc/vp8i_enc.h"
#include "src/dsp/dsp.h"
#include "src/enc/vp8i_enc.h"
static WEBP_INLINE uint8_t clip_8b(int v) {
return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
}
#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE int clip_max(int v, int max) {
return (v > max) ? max : v;
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms:
@ -56,9 +58,10 @@ void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
histo->last_non_zero = last_non_zero;
}
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
#if !WEBP_NEON_OMIT_C_CODE
static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
for (j = start_block; j < end_block; ++j) {
@ -76,6 +79,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
}
VP8SetHistogramData(distribution, histo);
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// run-time tables (~4k)
@ -100,6 +104,8 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
#if !WEBP_NEON_OMIT_C_CODE
#define STORE(x, y, v) \
dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
@ -140,15 +146,15 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
}
}
static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two) {
static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two) {
ITransformOne(ref, in, dst);
if (do_two) {
ITransformOne(ref + 4, in + 16, dst + 4);
}
}
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
int i;
int tmp[16];
for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
@ -176,13 +182,16 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
static void FTransform2_C(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
VP8FTransform(src, ref, out);
VP8FTransform(src + 4, ref + 4, out + 16);
}
static void FTransformWHT(const int16_t* in, int16_t* out) {
#if !WEBP_NEON_OMIT_C_CODE
static void FTransformWHT_C(const int16_t* in, int16_t* out) {
// input is 12b signed
int32_t tmp[16];
int i;
@ -211,6 +220,7 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
out[12 + i] = b3 >> 1;
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
#undef MUL
#undef STORE
@ -303,8 +313,8 @@ static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
//------------------------------------------------------------------------------
// Chroma 8x8 prediction (paragraph 12.2)
static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
// U block
DCMode(C8DC8 + dst, left, top, 8, 8, 4);
VerticalPred(C8VE8 + dst, top, 8);
@ -323,8 +333,8 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
//------------------------------------------------------------------------------
// luma 16x16 prediction (paragraph 12.3)
static void Intra16Preds(uint8_t* dst,
const uint8_t* left, const uint8_t* top) {
static void Intra16Preds_C(uint8_t* dst,
const uint8_t* left, const uint8_t* top) {
DCMode(I16DC16 + dst, left, top, 16, 16, 5);
VerticalPred(I16VE16 + dst, top, 16);
HorizontalPred(I16HE16 + dst, left, 16);
@ -507,7 +517,7 @@ static void TM4(uint8_t* dst, const uint8_t* top) {
// Left samples are top[-5 .. -2], top_left is top[-1], top are
// located at top[0..3], and top right is top[4..7]
static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
DC4(I4DC4 + dst, top);
TM4(I4TM4 + dst, top);
VE4(I4VE4 + dst, top);
@ -523,6 +533,7 @@ static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
//------------------------------------------------------------------------------
// Metric
#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
int w, int h) {
int count = 0;
@ -538,20 +549,21 @@ static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
return count;
}
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
static int SSE16x16_C(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 16, 16);
}
static int SSE16x8(const uint8_t* a, const uint8_t* b) {
static int SSE16x8_C(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 16, 8);
}
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
static int SSE8x8_C(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 8, 8);
}
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
static int SSE4x4_C(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 4, 4);
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
int k, x, y;
for (k = 0; k < 4; ++k) {
uint32_t avg = 0;
@ -571,6 +583,7 @@ static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
// We try to match the spectral content (weighted) between source and
// reconstructed samples.
#if !WEBP_NEON_OMIT_C_CODE
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
@ -608,24 +621,25 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
return sum;
}
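The body of TTransform is elided by the hunk above; for context, the per-row (and then per-column) Hadamard pass it performs reduces to the following scalar butterfly, sketched here from a reading of the reference C code rather than taken from the patch itself:

    /* One 4-point Hadamard butterfly; applied to each row, then each column, */
    /* before the weighted sum of absolute coefficients is accumulated.       */
    static void Hadamard4(const int in[4], int out[4]) {
      const int a0 = in[0] + in[2];   /* even part */
      const int a1 = in[1] + in[3];
      const int a2 = in[1] - in[3];   /* odd part */
      const int a3 = in[0] - in[2];
      out[0] = a0 + a1;
      out[1] = a3 + a2;
      out[2] = a3 - a2;
      out[3] = a0 - a1;
    }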
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
const int sum1 = TTransform(a, w);
const int sum2 = TTransform(b, w);
return abs(sum2 - sum1) >> 5;
}
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4(a + x + y, b + x + y, w);
D += Disto4x4_C(a + x + y, b + x + y, w);
}
}
return D;
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// Quantization
@ -636,8 +650,8 @@ static const uint8_t kZigzag[16] = {
};
// Simple quantization
static int QuantizeBlock(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
int last = -1;
int n;
for (n = 0; n < 16; ++n) {
@ -662,13 +676,15 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
return (last >= 0);
}
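The elided body of QuantizeBlock_C visits the 16 coefficients in kZigzag order and applies a reciprocal-multiply quantizer. A per-coefficient sketch of that shape is below; the names and the QFIX/MAX_LEVEL values are paraphrased from the encoder headers as I recall them, not guaranteed to match the VP8Matrix fields exactly:

    enum { QFIX = 17, MAX_LEVEL = 2047 };   /* assumed values, quoted from memory */
    /* Returns the quantized level and writes back the dequantized value, which  */
    /* the real code stores into in[] so the caller can reconstruct the block.   */
    static int QuantizeOne(int coeff, int q, int iq, int bias, int* const dequant) {
      const int sign = (coeff < 0);
      const unsigned abs_coeff = (unsigned)(sign ? -coeff : coeff);
      int level = (int)((abs_coeff * (unsigned)iq + (unsigned)bias) >> QFIX);
      if (level > MAX_LEVEL) level = MAX_LEVEL;
      if (sign) level = -level;
      *dequant = level * q;   /* dequantized coefficient */
      return level;           /* stored to out[] in zigzag order */
    }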
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
return nz;
}
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
//------------------------------------------------------------------------------
// Block copy
@ -682,148 +698,14 @@ static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
}
}
static void Copy4x4(const uint8_t* src, uint8_t* dst) {
static void Copy4x4_C(const uint8_t* src, uint8_t* dst) {
Copy(src, dst, 4, 4);
}
static void Copy16x8(const uint8_t* src, uint8_t* dst) {
static void Copy16x8_C(const uint8_t* src, uint8_t* dst) {
Copy(src, dst, 16, 8);
}
//------------------------------------------------------------------------------
// SSIM / PSNR
// hat-shaped filter. Sum of coefficients is equal to 16.
static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
1, 2, 3, 4, 3, 2, 1
};
static const uint32_t kWeightSum = 16 * 16; // sum{kWeight}^2
static WEBP_INLINE double SSIMCalculation(
const VP8DistoStats* const stats, uint32_t N /*num samples*/) {
const uint32_t w2 = N * N;
const uint32_t C1 = 20 * w2;
const uint32_t C2 = 60 * w2;
const uint32_t C3 = 8 * 8 * w2; // 'dark' limit ~= 6
const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
if (xmxm + ymym >= C3) {
const int64_t xmym = (int64_t)stats->xm * stats->ym;
const int64_t sxy = (int64_t)stats->xym * N - xmym; // can be negative
const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
const uint64_t syy = (uint64_t)stats->yym * N - ymym;
// we descale by 8 to prevent overflow during the fnum/fden multiply.
const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
const uint64_t den_S = (sxx + syy + C2) >> 8;
const uint64_t fnum = (2 * xmym + C1) * num_S;
const uint64_t fden = (xmxm + ymym + C1) * den_S;
const double r = (double)fnum / fden;
assert(r >= 0. && r <= 1.0);
return r;
}
return 1.; // area is too dark to contribute meaningfully
}
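Stripping away the fixed-point descaling (the two `>> 8` shifts cancel between numerator and denominator), SSIMCalculation evaluates the familiar SSIM ratio on the weighted sums, with sxy clamped at zero. In the notation of the struct fields (N is the total weight, i.e. kWeightSum = (1+2+3+4+3+2+1)^2 = 256 for the unclipped path; xm = sum of w*x, xxm = sum of w*x*x, and so on), my reading of the code is:

    SSIM = ((2 * xm * ym + C1) * (2 * sxy + C2))
           / ((xm * xm + ym * ym + C1) * (sxx + syy + C2))

    where  sxy = N * xym - xm * ym,   sxx = N * xxm - xm * xm,
           syy = N * yym - ym * ym,   C1 = 20 * N^2,   C2 = 60 * N^2.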
double VP8SSIMFromStats(const VP8DistoStats* const stats) {
return SSIMCalculation(stats, kWeightSum);
}
double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
return SSIMCalculation(stats, stats->w);
}
static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
const uint8_t* src2, int stride2,
int xo, int yo, int W, int H) {
VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
: yo + VP8_SSIM_KERNEL;
const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL;
const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1
: xo + VP8_SSIM_KERNEL;
int x, y;
src1 += ymin * stride1;
src2 += ymin * stride2;
for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
for (x = xmin; x <= xmax; ++x) {
const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
* kWeight[VP8_SSIM_KERNEL + y - yo];
const uint32_t s1 = src1[x];
const uint32_t s2 = src2[x];
stats.w += w;
stats.xm += w * s1;
stats.ym += w * s2;
stats.xxm += w * s1 * s1;
stats.xym += w * s1 * s2;
stats.yym += w * s2 * s2;
}
}
return VP8SSIMFromStatsClipped(&stats);
}
static double SSIMGet_C(const uint8_t* src1, int stride1,
const uint8_t* src2, int stride2) {
VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
int x, y;
for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
const uint32_t w = kWeight[x] * kWeight[y];
const uint32_t s1 = src1[x];
const uint32_t s2 = src2[x];
stats.xm += w * s1;
stats.ym += w * s2;
stats.xxm += w * s1 * s1;
stats.xym += w * s1 * s2;
stats.yym += w * s2 * s2;
}
}
return VP8SSIMFromStats(&stats);
}
//------------------------------------------------------------------------------
static uint32_t AccumulateSSE(const uint8_t* src1,
const uint8_t* src2, int len) {
int i;
uint32_t sse2 = 0;
assert(len <= 65535); // to ensure that accumulation fits within uint32_t
for (i = 0; i < len; ++i) {
const int32_t diff = src1[i] - src2[i];
sse2 += diff * diff;
}
return sse2;
}
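The assert above bounds the accumulator: even in the worst case the 32-bit sum cannot wrap. A standalone one-line check of that arithmetic (not part of the patch):

    #include <assert.h>
    #include <stdint.h>
    int main(void) {
      /* worst case: len == 65535 and every byte pair differs by 255 */
      assert((uint64_t)65535 * 255 * 255 <= UINT32_MAX);  /* 4,261,413,375 <= 4,294,967,295 */
      return 0;
    }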
//------------------------------------------------------------------------------
VP8SSIMGetFunc VP8SSIMGet;
VP8SSIMGetClippedFunc VP8SSIMGetClipped;
VP8AccumulateSSEFunc VP8AccumulateSSE;
extern void VP8SSIMDspInitSSE2(void);
static volatile VP8CPUInfo ssim_last_cpuinfo_used =
(VP8CPUInfo)&ssim_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return;
VP8SSIMGetClipped = SSIMGetClipped_C;
VP8SSIMGet = SSIMGet_C;
VP8AccumulateSSE = AccumulateSSE;
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8SSIMDspInitSSE2();
}
#endif
}
ssim_last_cpuinfo_used = VP8GetCPUInfo;
}
//------------------------------------------------------------------------------
// Initialization
@ -868,26 +750,32 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
InitTables();
// default C implementations
VP8CollectHistogram = CollectHistogram;
VP8ITransform = ITransform;
VP8FTransform = FTransform;
VP8FTransform2 = FTransform2;
VP8FTransformWHT = FTransformWHT;
VP8EncPredLuma4 = Intra4Preds;
VP8EncPredLuma16 = Intra16Preds;
VP8EncPredChroma8 = IntraChromaPreds;
VP8SSE16x16 = SSE16x16;
VP8SSE8x8 = SSE8x8;
VP8SSE16x8 = SSE16x8;
VP8SSE4x4 = SSE4x4;
VP8TDisto4x4 = Disto4x4;
VP8TDisto16x16 = Disto16x16;
VP8Mean16x4 = Mean16x4;
VP8EncQuantizeBlock = QuantizeBlock;
VP8EncQuantize2Blocks = Quantize2Blocks;
VP8EncQuantizeBlockWHT = QuantizeBlock;
VP8Copy4x4 = Copy4x4;
VP8Copy16x8 = Copy16x8;
#if !WEBP_NEON_OMIT_C_CODE
VP8ITransform = ITransform_C;
VP8FTransform = FTransform_C;
VP8FTransformWHT = FTransformWHT_C;
VP8TDisto4x4 = Disto4x4_C;
VP8TDisto16x16 = Disto16x16_C;
VP8CollectHistogram = CollectHistogram_C;
VP8SSE16x16 = SSE16x16_C;
VP8SSE16x8 = SSE16x8_C;
VP8SSE8x8 = SSE8x8_C;
VP8SSE4x4 = SSE4x4_C;
#endif
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
VP8EncQuantizeBlock = QuantizeBlock_C;
VP8EncQuantize2Blocks = Quantize2Blocks_C;
#endif
VP8FTransform2 = FTransform2_C;
VP8EncPredLuma4 = Intra4Preds_C;
VP8EncPredLuma16 = Intra16Preds_C;
VP8EncPredChroma8 = IntraChromaPreds_C;
VP8Mean16x4 = Mean16x4_C;
VP8EncQuantizeBlockWHT = QuantizeBlock_C;
VP8Copy4x4 = Copy4x4_C;
VP8Copy16x8 = Copy16x8_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
@ -906,11 +794,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
VP8EncDspInitAVX2();
}
#endif
#if defined(WEBP_USE_NEON)
if (VP8GetCPUInfo(kNEON)) {
VP8EncDspInitNEON();
}
#endif
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
VP8EncDspInitMIPS32();
@ -927,5 +810,34 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
}
#endif
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8EncDspInitNEON();
}
#endif
assert(VP8ITransform != NULL);
assert(VP8FTransform != NULL);
assert(VP8FTransformWHT != NULL);
assert(VP8TDisto4x4 != NULL);
assert(VP8TDisto16x16 != NULL);
assert(VP8CollectHistogram != NULL);
assert(VP8SSE16x16 != NULL);
assert(VP8SSE16x8 != NULL);
assert(VP8SSE8x8 != NULL);
assert(VP8SSE4x4 != NULL);
assert(VP8EncQuantizeBlock != NULL);
assert(VP8EncQuantize2Blocks != NULL);
assert(VP8FTransform2 != NULL);
assert(VP8EncPredLuma4 != NULL);
assert(VP8EncPredLuma16 != NULL);
assert(VP8EncPredChroma8 != NULL);
assert(VP8Mean16x4 != NULL);
assert(VP8EncQuantizeBlockWHT != NULL);
assert(VP8Copy4x4 != NULL);
assert(VP8Copy16x8 != NULL);
enc_last_cpuinfo_used = VP8GetCPUInfo;
}

@ -9,7 +9,7 @@
//
// AVX2 version of speed-critical encoding functions.
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_AVX2)

@ -13,13 +13,13 @@
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
// Slobodan Prijic (slobodan.prijic@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS32)
#include "./mips_macro.h"
#include "../enc/vp8i_enc.h"
#include "../enc/cost_enc.h"
#include "src/dsp/mips_macro.h"
#include "src/enc/vp8i_enc.h"
#include "src/enc/cost_enc.h"
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
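kC1 and kC2 are the usual Q16 transform multipliers; they read as sqrt(2)*cos(pi/8) and sqrt(2)*sin(pi/8) scaled by 65536 (my interpretation, not something stated in the patch). A quick numeric check of that reading:

    #include <math.h>
    #include <stdio.h>
    int main(void) {
      const double pi = 3.14159265358979323846;
      printf("%.1f\n", sqrt(2.0) * cos(pi / 8) * 65536);  /* ~85627.0 == 20091 + (1 << 16) */
      printf("%.1f\n", sqrt(2.0) * sin(pi / 8) * 65536);  /* ~35468.1 == kC2               */
      return 0;
    }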
@ -113,8 +113,9 @@ static const int kC2 = 35468;
"sb %[" #TEMP12 "], 3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"
// Does one or two inverse transforms.
static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
uint8_t* dst) {
static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref,
const int16_t* in,
uint8_t* dst) {
int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
@ -144,11 +145,11 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
);
}
static void ITransform(const uint8_t* ref, const int16_t* in,
uint8_t* dst, int do_two) {
ITransformOne(ref, in, dst);
static void ITransform_MIPS32(const uint8_t* ref, const int16_t* in,
uint8_t* dst, int do_two) {
ITransformOne_MIPS32(ref, in, dst);
if (do_two) {
ITransformOne(ref + 4, in + 16, dst + 4);
ITransformOne_MIPS32(ref + 4, in + 16, dst + 4);
}
}
@ -187,8 +188,8 @@ static void ITransform(const uint8_t* ref, const int16_t* in,
"sh %[temp5], " #J "(%[ppin]) \n\t" \
"sh %[level], " #N "(%[pout]) \n\t"
static int QuantizeBlock(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
static int QuantizeBlock_MIPS32(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
int temp0, temp1, temp2, temp3, temp4, temp5;
int sign, coeff, level, i;
int max_level = MAX_LEVEL;
@ -238,11 +239,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
return 0;
}
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
nz = QuantizeBlock_MIPS32(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= QuantizeBlock_MIPS32(in + 1 * 16, out + 1 * 16, mtx) << 1;
return nz;
}
@ -361,8 +362,8 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
"msub %[temp6], %[temp0] \n\t" \
"msub %[temp7], %[temp1] \n\t"
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int tmp[32];
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
@ -396,13 +397,13 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
#undef VERTICAL_PASS
#undef HORIZONTAL_PASS
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4(a + x + y, b + x + y, w);
D += Disto4x4_MIPS32(a + x + y, b + x + y, w);
}
}
return D;
@ -478,7 +479,8 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
"sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \
"sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t"
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
int temp17, temp18, temp19, temp20;
@ -539,7 +541,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
GET_SSE_INNER(C, C + 1, C + 2, C + 3) \
GET_SSE_INNER(D, D + 1, D + 2, D + 3)
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
@ -573,7 +575,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
return count;
}
static int SSE16x8(const uint8_t* a, const uint8_t* b) {
static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
@ -599,7 +601,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
return count;
}
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
@ -621,7 +623,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
return count;
}
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
static int SSE4x4_MIPS32(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
@ -651,17 +653,20 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
extern void VP8EncDspInitMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) {
VP8ITransform = ITransform;
VP8FTransform = FTransform;
VP8EncQuantizeBlock = QuantizeBlock;
VP8EncQuantize2Blocks = Quantize2Blocks;
VP8TDisto4x4 = Disto4x4;
VP8TDisto16x16 = Disto16x16;
VP8ITransform = ITransform_MIPS32;
VP8FTransform = FTransform_MIPS32;
VP8EncQuantizeBlock = QuantizeBlock_MIPS32;
VP8EncQuantize2Blocks = Quantize2Blocks_MIPS32;
VP8TDisto4x4 = Disto4x4_MIPS32;
VP8TDisto16x16 = Disto16x16_MIPS32;
#if !defined(WORK_AROUND_GCC)
VP8SSE16x16 = SSE16x16;
VP8SSE8x8 = SSE8x8;
VP8SSE16x8 = SSE16x8;
VP8SSE4x4 = SSE4x4;
VP8SSE16x16 = SSE16x16_MIPS32;
VP8SSE8x8 = SSE8x8_MIPS32;
VP8SSE16x8 = SSE16x8_MIPS32;
VP8SSE4x4 = SSE4x4_MIPS32;
#endif
}

@ -12,13 +12,13 @@
// Author(s): Darko Laus (darko.laus@imgtec.com)
// Mirko Raus (mirko.raus@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "./mips_macro.h"
#include "../enc/cost_enc.h"
#include "../enc/vp8i_enc.h"
#include "src/dsp/mips_macro.h"
#include "src/enc/cost_enc.h"
#include "src/enc/vp8i_enc.h"
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
@ -141,7 +141,8 @@ static const int kC2 = 35468;
"sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \
"sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t"
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
const int c2217 = 2217;
const int c5352 = 5352;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
@ -238,16 +239,16 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
);
}
static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two) {
static void ITransform_MIPSdspR2(const uint8_t* ref, const int16_t* in,
uint8_t* dst, int do_two) {
ITransformOne(ref, in, dst);
if (do_two) {
ITransformOne(ref + 4, in + 16, dst + 4);
}
}
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;
@ -313,13 +314,14 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
return abs(temp3 - temp17) >> 5;
}
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto16x16_MIPSdspR2(const uint8_t* const a,
const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4(a + x + y, b + x + y, w);
D += Disto4x4_MIPSdspR2(a + x + y, b + x + y, w);
}
}
return D;
@ -1011,8 +1013,8 @@ static void HU4(uint8_t* dst, const uint8_t* top) {
//------------------------------------------------------------------------------
// Chroma 8x8 prediction (paragraph 12.2)
static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
// U block
DCMode8(C8DC8 + dst, left, top);
VerticalPred8(C8VE8 + dst, top);
@ -1031,8 +1033,8 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
//------------------------------------------------------------------------------
// luma 16x16 prediction (paragraph 12.3)
static void Intra16Preds(uint8_t* dst,
const uint8_t* left, const uint8_t* top) {
static void Intra16Preds_MIPSdspR2(uint8_t* dst,
const uint8_t* left, const uint8_t* top) {
DCMode16(I16DC16 + dst, left, top);
VerticalPred16(I16VE16 + dst, top);
HorizontalPred16(I16HE16 + dst, left);
@ -1041,7 +1043,7 @@ static void Intra16Preds(uint8_t* dst,
// Left samples are top[-5 .. -2], top_left is top[-1], top are
// located at top[0..3], and top right is top[4..7]
static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) {
DC4(I4DC4 + dst, top);
TM4(I4TM4 + dst, top);
VE4(I4VE4 + dst, top);
@ -1077,7 +1079,7 @@ static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
GET_SSE_INNER(C) \
GET_SSE_INNER(D)
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3;
__asm__ volatile (
@ -1107,7 +1109,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
return count;
}
static int SSE16x8(const uint8_t* a, const uint8_t* b) {
static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3;
__asm__ volatile (
@ -1129,7 +1131,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
return count;
}
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3;
__asm__ volatile (
@ -1147,7 +1149,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
return count;
}
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3;
__asm__ volatile (
@ -1270,8 +1272,8 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
"usw $0, " #J "(%[ppin]) \n\t" \
"3: \n\t"
static int QuantizeBlock(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
int temp0, temp1, temp2, temp3, temp4, temp5,temp6;
int sign, coeff, level;
int max_level = MAX_LEVEL;
@ -1311,11 +1313,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
return (ret != 0);
}
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
nz = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1;
return nz;
}
@ -1358,7 +1360,7 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
"usw %[" #TEMP4 "], " #C "(%[out]) \n\t" \
"usw %[" #TEMP6 "], " #D "(%[out]) \n\t"
static void FTransformWHT(const int16_t* in, int16_t* out) {
static void FTransformWHT_MIPSdspR2(const int16_t* in, int16_t* out) {
int temp0, temp1, temp2, temp3, temp4;
int temp5, temp6, temp7, temp8, temp9;
@ -1450,9 +1452,9 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
"addiu %[temp8], %[temp8], 1 \n\t" \
"sw %[temp8], 0(%[temp3]) \n\t"
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
static void CollectHistogram_MIPSdspR2(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
const int max_coeff = (MAX_COEFF_THRESH << 16) + MAX_COEFF_THRESH;
@ -1484,23 +1486,28 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
extern void VP8EncDspInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) {
VP8FTransform = FTransform;
VP8ITransform = ITransform;
VP8TDisto4x4 = Disto4x4;
VP8TDisto16x16 = Disto16x16;
VP8EncPredLuma16 = Intra16Preds;
VP8EncPredChroma8 = IntraChromaPreds;
VP8EncPredLuma4 = Intra4Preds;
VP8FTransform = FTransform_MIPSdspR2;
VP8FTransformWHT = FTransformWHT_MIPSdspR2;
VP8ITransform = ITransform_MIPSdspR2;
VP8TDisto4x4 = Disto4x4_MIPSdspR2;
VP8TDisto16x16 = Disto16x16_MIPSdspR2;
VP8EncPredLuma16 = Intra16Preds_MIPSdspR2;
VP8EncPredChroma8 = IntraChromaPreds_MIPSdspR2;
VP8EncPredLuma4 = Intra4Preds_MIPSdspR2;
#if !defined(WORK_AROUND_GCC)
VP8SSE16x16 = SSE16x16;
VP8SSE8x8 = SSE8x8;
VP8SSE16x8 = SSE16x8;
VP8SSE4x4 = SSE4x4;
VP8SSE16x16 = SSE16x16_MIPSdspR2;
VP8SSE8x8 = SSE8x8_MIPSdspR2;
VP8SSE16x8 = SSE16x8_MIPSdspR2;
VP8SSE4x4 = SSE4x4_MIPSdspR2;
#endif
VP8EncQuantizeBlock = QuantizeBlock;
VP8EncQuantize2Blocks = Quantize2Blocks;
VP8FTransformWHT = FTransformWHT;
VP8CollectHistogram = CollectHistogram;
VP8EncQuantizeBlock = QuantizeBlock_MIPSdspR2;
VP8EncQuantize2Blocks = Quantize2Blocks_MIPSdspR2;
VP8CollectHistogram = CollectHistogram_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2

@ -11,13 +11,13 @@
//
// Author: Prashant Patil (prashant.patil@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MSA)
#include <stdlib.h>
#include "./msa_macro.h"
#include "../enc/vp8i_enc.h"
#include "src/dsp/msa_macro.h"
#include "src/enc/vp8i_enc.h"
//------------------------------------------------------------------------------
// Transforms
@ -69,20 +69,21 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
}
static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two) {
static void ITransform_MSA(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two) {
ITransformOne(ref, in, dst);
if (do_two) {
ITransformOne(ref + 4, in + 16, dst + 4);
}
}
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
static void FTransform_MSA(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
uint64_t out0, out1, out2, out3;
uint32_t in0, in1, in2, in3;
v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
v8i16 t0, t1, t2, t3;
v16u8 srcl0, srcl1, src0, src1;
v16u8 srcl0, srcl1, src0 = { 0 }, src1 = { 0 };
const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
@ -130,7 +131,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
SD4(out0, out1, out2, out3, out, 8);
}
static void FTransformWHT(const int16_t* in, int16_t* out) {
static void FTransformWHT_MSA(const int16_t* in, int16_t* out) {
v8i16 in0 = { 0 };
v8i16 in1 = { 0 };
v8i16 tmp0, tmp1, tmp2, tmp3;
@ -167,10 +168,10 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
ST_SH2(out0, out1, out, 8);
}
static int TTransform(const uint8_t* in, const uint16_t* w) {
static int TTransform_MSA(const uint8_t* in, const uint16_t* w) {
int sum;
uint32_t in0_m, in1_m, in2_m, in3_m;
v16i8 src0;
v16i8 src0 = { 0 };
v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3;
v4i32 dst0, dst1;
const v16i8 zero = { 0 };
@ -199,20 +200,20 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
return sum;
}
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
const int sum1 = TTransform(a, w);
const int sum2 = TTransform(b, w);
static int Disto4x4_MSA(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
const int sum1 = TTransform_MSA(a, w);
const int sum2 = TTransform_MSA(b, w);
return abs(sum2 - sum1) >> 5;
}
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto16x16_MSA(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4(a + x + y, b + x + y, w);
D += Disto4x4_MSA(a + x + y, b + x + y, w);
}
}
return D;
@ -221,9 +222,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
//------------------------------------------------------------------------------
// Histogram
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
for (j = start_block; j < end_block; ++j) {
@ -259,8 +260,9 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical
const v16u8 A1 = { 0 };
const uint64_t val_m = LD(top - 1);
const v16u8 A = (v16u8)__msa_insert_d((v2i64)A, 0, val_m);
const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
const v16u8 B = SLDI_UB(A, A, 1);
const v16u8 C = SLDI_UB(A, A, 2);
const v16u8 AC = __msa_ave_u_b(A, C);
@ -292,8 +294,9 @@ static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
}
static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
const v16u8 A2 = { 0 };
const uint64_t val_m = LD(top - 5);
const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, 0, val_m);
const v16u8 A = (v16u8)__msa_insert_b((v16i8)A1, 8, top[3]);
const v16u8 B = SLDI_UB(A, A, 1);
const v16u8 C = SLDI_UB(A, A, 2);
@ -311,8 +314,9 @@ static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
}
static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
const v16u8 A1 = { 0 };
const uint64_t val_m = LD(top);
const v16u8 A = (v16u8)__msa_insert_d((v2i64)A, 0, val_m);
const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
const v16u8 B = SLDI_UB(A, A, 1);
const v16u8 C1 = SLDI_UB(A, A, 2);
const v16u8 C = (v16u8)__msa_insert_b((v16i8)C1, 6, top[7]);
@ -427,7 +431,7 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
#undef AVG3
#undef AVG2
static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) {
DC4(I4DC4 + dst, top);
TM4(I4TM4 + dst, top);
VE4(I4VE4 + dst, top);
@ -544,8 +548,8 @@ static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
STORE16x16(out, dst);
}
static void Intra16Preds(uint8_t* dst,
const uint8_t* left, const uint8_t* top) {
static void Intra16Preds_MSA(uint8_t* dst,
const uint8_t* left, const uint8_t* top) {
DCMode16x16(I16DC16 + dst, left, top);
VerticalPred16x16(I16VE16 + dst, top);
HorizontalPred16x16(I16HE16 + dst, left);
@ -645,7 +649,7 @@ static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
uint64_t out;
v16u8 src;
v16u8 src = { 0 };
if (top != NULL && left != NULL) {
const uint64_t left_m = LD(left);
const uint64_t top_m = LD(top);
@ -666,8 +670,8 @@ static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
STORE8x8(out, dst);
}
static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
// U block
DCMode8x8(C8DC8 + dst, left, top);
VerticalPred8x8(C8VE8 + dst, top);
@ -708,7 +712,7 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3); \
} while (0)
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {
uint32_t sum;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@ -735,7 +739,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
return sum;
}
static int SSE16x8(const uint8_t* a, const uint8_t* b) {
static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {
uint32_t sum;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@ -754,7 +758,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
return sum;
}
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {
uint32_t sum;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@ -774,10 +778,10 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
return sum;
}
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {
uint32_t sum = 0;
uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
v16u8 src, ref, tmp0, tmp1;
v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1;
v8i16 diff0, diff1;
v4i32 out0, out1;
@ -796,8 +800,8 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
//------------------------------------------------------------------------------
// Quantization
static int QuantizeBlock(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
int sum;
v8i16 in0, in1, sh0, sh1, out0, out1;
v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;
@ -828,7 +832,7 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
tmp1 = (tmp3 > maxlevel);
tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)maxlevel, (v16u8)tmp0);
tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)maxlevel, (v16u8)tmp1);
SUB2(0, tmp2, 0, tmp3, tmp0, tmp1);
SUB2(zero, tmp2, zero, tmp3, tmp0, tmp1);
tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)tmp0, (v16u8)sign0);
tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)tmp1, (v16u8)sign1);
LD_SW4(&mtx->zthresh_[0], 4, t0, t1, t2, t3); // zthresh
@ -849,8 +853,8 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
return (sum > 0);
}
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
@ -863,26 +867,26 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
extern void VP8EncDspInitMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) {
VP8ITransform = ITransform;
VP8FTransform = FTransform;
VP8FTransformWHT = FTransformWHT;
VP8TDisto4x4 = Disto4x4;
VP8TDisto16x16 = Disto16x16;
VP8CollectHistogram = CollectHistogram;
VP8EncPredLuma4 = Intra4Preds;
VP8EncPredLuma16 = Intra16Preds;
VP8EncPredChroma8 = IntraChromaPreds;
VP8SSE16x16 = SSE16x16;
VP8SSE16x8 = SSE16x8;
VP8SSE8x8 = SSE8x8;
VP8SSE4x4 = SSE4x4;
VP8EncQuantizeBlock = QuantizeBlock;
VP8EncQuantize2Blocks = Quantize2Blocks;
VP8EncQuantizeBlockWHT = QuantizeBlock;
VP8ITransform = ITransform_MSA;
VP8FTransform = FTransform_MSA;
VP8FTransformWHT = FTransformWHT_MSA;
VP8TDisto4x4 = Disto4x4_MSA;
VP8TDisto16x16 = Disto16x16_MSA;
VP8CollectHistogram = CollectHistogram_MSA;
VP8EncPredLuma4 = Intra4Preds_MSA;
VP8EncPredLuma16 = Intra16Preds_MSA;
VP8EncPredChroma8 = IntraChromaPreds_MSA;
VP8SSE16x16 = SSE16x16_MSA;
VP8SSE16x8 = SSE16x8_MSA;
VP8SSE8x8 = SSE8x8_MSA;
VP8SSE4x4 = SSE4x4_MSA;
VP8EncQuantizeBlock = QuantizeBlock_MSA;
VP8EncQuantize2Blocks = Quantize2Blocks_MSA;
VP8EncQuantizeBlockWHT = QuantizeBlock_MSA;
}
#else // !WEBP_USE_MSA

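Note on the SSE* hooks renamed above: SSE16x16_MSA, SSE16x8_MSA, SSE8x8_MSA and SSE4x4_MSA (like their NEON and SSE2 counterparts later in this patch) all compute a plain sum of squared differences over a block whose rows are BPS bytes apart. The scalar sketch below is for orientation only; it is not part of the patch and assumes the BPS stride constant from dsp.h.

// Illustrative sketch, not part of the diff: scalar meaning of the VP8SSE*
// metrics. BPS is the prediction-buffer stride used throughout these files.
static int SSE_WxH_C(const uint8_t* a, const uint8_t* b, int w, int h) {
  int sum = 0;
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      const int diff = (int)a[x] - b[x];
      sum += diff * diff;
    }
    a += BPS;
    b += BPS;
  }
  return sum;
}
// e.g. SSE16x16_MSA(a, b) is expected to equal SSE_WxH_C(a, b, 16, 16).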
@ -11,14 +11,14 @@
//
// adapted from libvpx (http://www.webmproject.org/code/)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_NEON)
#include <assert.h>
#include "./neon.h"
#include "../enc/vp8i_enc.h"
#include "src/dsp/neon.h"
#include "src/enc/vp8i_enc.h"
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
@ -37,15 +37,15 @@ static const int16_t kC2 = 17734; // half of kC2, actually. See comment above.
#if defined(WEBP_USE_INTRINSICS)
// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
}
// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
// to the corresponding rows of 'dst'.
static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
const int16x8_t dst01,
const int16x8_t dst23) {
static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
const int16x8_t dst01,
const int16x8_t dst23) {
// Unsigned saturate to 8b.
const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
@ -57,8 +57,10 @@ static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
}
static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
const uint8_t* const ref, uint8_t* const dst) {
static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
const int16x8_t row23,
const uint8_t* const ref,
uint8_t* const dst) {
uint32x2_t dst01 = vdup_n_u32(0);
uint32x2_t dst23 = vdup_n_u32(0);
@ -70,19 +72,20 @@ static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
{
// Convert to 16b.
const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);
// Descale with rounding.
const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
// Add the inverse transform.
SaturateAndStore4x4(dst, out01, out23);
SaturateAndStore4x4_NEON(dst, out01, out23);
}
}
static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
int16x8x2_t* const out) {
static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
const int16x8_t in1,
int16x8x2_t* const out) {
// a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1
// c0 c1 c2 c3 | d0 d1 d2 d3        a2 b2 c2 d2 | a3 b3 c3 d3
const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ...
@ -90,7 +93,7 @@ static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
*out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
}
static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
// {rows} = in0 | in4
// in8 | in12
// B1 = in4 | in12
@ -113,22 +116,22 @@ static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c
const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c
const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
Transpose8x2(E0, E1, rows);
Transpose8x2_NEON(E0, E1, rows);
}
static void ITransformOne(const uint8_t* ref,
const int16_t* in, uint8_t* dst) {
static void ITransformOne_NEON(const uint8_t* ref,
const int16_t* in, uint8_t* dst) {
int16x8x2_t rows;
INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
TransformPass(&rows);
TransformPass(&rows);
Add4x4(rows.val[0], rows.val[1], ref, dst);
TransformPass_NEON(&rows);
TransformPass_NEON(&rows);
Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
}
#else
static void ITransformOne(const uint8_t* ref,
const int16_t* in, uint8_t* dst) {
static void ITransformOne_NEON(const uint8_t* ref,
const int16_t* in, uint8_t* dst) {
const int kBPS = BPS;
const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
@ -243,16 +246,16 @@ static void ITransformOne(const uint8_t* ref,
#endif // WEBP_USE_INTRINSICS
static void ITransform(const uint8_t* ref,
const int16_t* in, uint8_t* dst, int do_two) {
ITransformOne(ref, in, dst);
static void ITransform_NEON(const uint8_t* ref,
const int16_t* in, uint8_t* dst, int do_two) {
ITransformOne_NEON(ref, in, dst);
if (do_two) {
ITransformOne(ref + 4, in + 16, dst + 4);
ITransformOne_NEON(ref + 4, in + 16, dst + 4);
}
}
// Load all 4x4 pixels into a single uint8x16_t variable.
static uint8x16_t Load4x4(const uint8_t* src) {
static uint8x16_t Load4x4_NEON(const uint8_t* src) {
uint32x4_t out = vdupq_n_u32(0);
out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
@ -265,10 +268,12 @@ static uint8x16_t Load4x4(const uint8_t* src) {
#if defined(WEBP_USE_INTRINSICS)
static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
const int16x4_t C, const int16x4_t D,
int16x8_t* const out01,
int16x8_t* const out32) {
static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
const int16x4_t B,
const int16x4_t C,
const int16x4_t D,
int16x8_t* const out01,
int16x8_t* const out32) {
const int16x4x2_t AB = vtrn_s16(A, B);
const int16x4x2_t CD = vtrn_s16(C, D);
const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
@ -283,24 +288,24 @@ static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
vreinterpret_s64_s32(tmp02.val[1])));
}
static WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a,
const uint8x8_t b) {
static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
const uint8x8_t b) {
return vreinterpretq_s16_u16(vsubl_u8(a, b));
}
static void FTransform(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
int16x8_t d0d1, d3d2; // working 4x4 int16 variables
{
const uint8x16_t S0 = Load4x4(src);
const uint8x16_t R0 = Load4x4(ref);
const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0));
const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), vget_high_u8(R0));
const uint8x16_t S0 = Load4x4_NEON(src);
const uint8x16_t R0 = Load4x4_NEON(ref);
const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
const int16x4_t D0 = vget_low_s16(D0D1);
const int16x4_t D1 = vget_high_s16(D0D1);
const int16x4_t D2 = vget_low_s16(D2D3);
const int16x4_t D3 = vget_high_s16(D2D3);
Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2);
Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
}
{ // 1rst pass
const int32x4_t kCst937 = vdupq_n_s32(937);
@ -318,7 +323,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref,
const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
}
{ // 2nd pass
// the (1<<16) addition is for the replacement: a3!=0 <-> 1-(a3==0)
@ -358,8 +363,8 @@ static const int32_t kCoeff32[] = {
51000, 51000, 51000, 51000
};
static void FTransform(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
const int kBPS = BPS;
const uint8_t* src_ptr = src;
const uint8_t* ref_ptr = ref;
@ -478,7 +483,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref,
src += stride; \
} while (0)
static void FTransformWHT(const int16_t* src, int16_t* out) {
static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
const int stride = 16;
const int16x4_t zero = vdup_n_s16(0);
int32x4x4_t tmp0;
@ -516,7 +521,7 @@ static void FTransformWHT(const int16_t* src, int16_t* out) {
tmp0.val[3] = vsubq_s32(a0, a1);
}
{
const int32x4x4_t tmp1 = Transpose4x4(tmp0);
const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
// a0 = tmp[0 + i] + tmp[ 8 + i]
// a1 = tmp[4 + i] + tmp[12 + i]
// a2 = tmp[4 + i] - tmp[12 + i]
@ -560,7 +565,7 @@ static void FTransformWHT(const int16_t* src, int16_t* out) {
// a 26ae, b 26ae
// a 37bf, b 37bf
//
static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
@ -574,7 +579,8 @@ static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
return q4_in;
}
static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) {
static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
const int16x8x4_t q4_in) {
// {a0, a1} = {in[0] + in[2], in[1] + in[3]}
// {a3, a2} = {in[0] - in[2], in[1] - in[3]}
const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
@ -593,7 +599,7 @@ static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) {
return q4_out;
}
static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) {
static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
q4_in.val[2]));
const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
@ -610,7 +616,7 @@ static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) {
return q4_out;
}
static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
const uint16x8_t q_w07 = vld1q_u16(&w[0]);
const uint16x8_t q_w8f = vld1q_u16(&w[8]);
int16x4x4_t d4_w;
@ -622,8 +628,8 @@ static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
return d4_w;
}
static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in,
const int16x4x4_t d4_w) {
static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
const int16x4x4_t d4_w) {
int32x2_t d_sum;
// sum += w[ 0] * abs(b0);
// sum += w[ 4] * abs(b1);
@ -652,8 +658,8 @@ static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in,
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
@ -679,12 +685,12 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
// Vertical pass first to avoid a transpose (vertical and horizontal passes
// are commutative because w/kWeightY is symmetric) and subsequent
// transpose.
const int16x8x4_t q4_v = DistoVerticalPass(d4_in);
const int16x4x4_t d4_w = DistoLoadW(w);
const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
const int16x4x4_t d4_w = DistoLoadW_NEON(w);
// horizontal pass
const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_v);
const int16x8x4_t q4_h = DistoHorizontalPass(q4_t);
int32x2_t d_sum = DistoSum(q4_h, d4_w);
const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);
// abs(sum2 - sum1) >> 5
d_sum = vabs_s32(d_sum);
@ -694,13 +700,13 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
}
#undef LOAD_LANE_32b
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4(a + x + y, b + x + y, w);
D += Disto4x4_NEON(a + x + y, b + x + y, w);
}
}
return D;
@ -708,15 +714,15 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
//------------------------------------------------------------------------------
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
for (j = start_block; j < end_block; ++j) {
int16_t out[16];
FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
{
int k;
const int16x8_t a0 = vld1q_s16(out + 0);
@ -740,9 +746,9 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
//------------------------------------------------------------------------------
static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
const uint8_t* const b,
uint32x4_t* const sum) {
static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
const uint8_t* const b,
uint32x4_t* const sum) {
const uint8x16_t a0 = vld1q_u8(a);
const uint8x16_t b0 = vld1q_u8(b);
const uint8x16_t abs_diff = vabdq_u8(a0, b0);
@ -757,7 +763,7 @@ static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
}
// Horizontal sum of all four uint32_t values in 'sum'.
static int SumToInt(uint32x4_t sum) {
static int SumToInt_NEON(uint32x4_t sum) {
const uint64x2_t sum2 = vpaddlq_u32(sum);
const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
return (int)sum3;
@ -767,18 +773,18 @@ static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
uint32x4_t sum = vdupq_n_u32(0);
int y;
for (y = 0; y < 16; ++y) {
AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
}
return SumToInt(sum);
return SumToInt_NEON(sum);
}
static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
uint32x4_t sum = vdupq_n_u32(0);
int y;
for (y = 0; y < 8; ++y) {
AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
}
return SumToInt(sum);
return SumToInt_NEON(sum);
}
static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
@ -791,12 +797,12 @@ static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
sum = vpadalq_u16(sum, prod);
}
return SumToInt(sum);
return SumToInt_NEON(sum);
}
static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
const uint8x16_t a0 = Load4x4(a);
const uint8x16_t b0 = Load4x4(b);
const uint8x16_t a0 = Load4x4_NEON(a);
const uint8x16_t b0 = Load4x4_NEON(b);
const uint8x16_t abs_diff = vabdq_u8(a0, b0);
const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
vget_low_u8(abs_diff));
@ -805,7 +811,7 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
/* pair-wise adds and widen */
const uint32x4_t sum1 = vpaddlq_u16(prod1);
const uint32x4_t sum2 = vpaddlq_u16(prod2);
return SumToInt(vaddq_u32(sum1, sum2));
return SumToInt_NEON(vaddq_u32(sum1, sum2));
}
//------------------------------------------------------------------------------
@ -813,8 +819,8 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
// Compilation with gcc-4.6.x is problematic for now.
#if !defined(WORK_AROUND_GCC)
static int16x8_t Quantize(int16_t* const in,
const VP8Matrix* const mtx, int offset) {
static int16x8_t Quantize_NEON(int16_t* const in,
const VP8Matrix* const mtx, int offset) {
const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
@ -847,10 +853,10 @@ static const uint8_t kShuffles[4][8] = {
{ 14, 15, 22, 23, 28, 29, 30, 31 }
};
static int QuantizeBlock(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
const int16x8_t out0 = Quantize(in, mtx, 0);
const int16x8_t out1 = Quantize(in, mtx, 8);
static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
uint8x8x4_t shuffles;
// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
// non-standard versions there.
@ -889,11 +895,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
return 0;
}
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
nz = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
return nz;
}
@ -905,14 +911,14 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
extern void VP8EncDspInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
VP8ITransform = ITransform;
VP8FTransform = FTransform;
VP8ITransform = ITransform_NEON;
VP8FTransform = FTransform_NEON;
VP8FTransformWHT = FTransformWHT;
VP8FTransformWHT = FTransformWHT_NEON;
VP8TDisto4x4 = Disto4x4;
VP8TDisto16x16 = Disto16x16;
VP8CollectHistogram = CollectHistogram;
VP8TDisto4x4 = Disto4x4_NEON;
VP8TDisto16x16 = Disto16x16_NEON;
VP8CollectHistogram = CollectHistogram_NEON;
VP8SSE16x16 = SSE16x16_NEON;
VP8SSE16x8 = SSE16x8_NEON;
@ -920,8 +926,8 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
VP8SSE4x4 = SSE4x4_NEON;
#if !defined(WORK_AROUND_GCC)
VP8EncQuantizeBlock = QuantizeBlock;
VP8EncQuantize2Blocks = Quantize2Blocks;
VP8EncQuantizeBlock = QuantizeBlock_NEON;
VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
#endif
}

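The Disto4x4_NEON / Disto16x16_NEON pair above implements the metric described in the comments: a 4x4 Hadamard-style transform of each block, a weighted sum of the absolute coefficients using the symmetric matrix w[], and finally the scaled difference noted as "abs(sum2 - sum1) >> 5". The plain-C reconstruction below is a hedged sketch of that contract (the helper name WeightedHadamard4x4 is invented here; the project's own C fallback lives in dsp/enc.c and may differ in detail).

// Hedged reconstruction, not part of the diff. abs() needs <stdlib.h>.
static int WeightedHadamard4x4(const uint8_t* in, const uint16_t* w) {
  int tmp[16];
  int sum = 0;
  int i;
  for (i = 0; i < 4; ++i, in += BPS) {    // horizontal pass over the 4 rows
    const int a0 = in[0] + in[2];
    const int a1 = in[1] + in[3];
    const int a2 = in[1] - in[3];
    const int a3 = in[0] - in[2];
    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  for (i = 0; i < 4; ++i) {               // vertical pass plus weighting
    const int a0 = tmp[0 + i] + tmp[8 + i];
    const int a1 = tmp[4 + i] + tmp[12 + i];
    const int a2 = tmp[4 + i] - tmp[12 + i];
    const int a3 = tmp[0 + i] - tmp[8 + i];
    sum += w[i +  0] * abs(a0 + a1);
    sum += w[i +  4] * abs(a3 + a2);
    sum += w[i +  8] * abs(a3 - a2);
    sum += w[i + 12] * abs(a0 - a1);
  }
  return sum;
}

static int Disto4x4_sketch(const uint8_t* const a, const uint8_t* const b,
                           const uint16_t* const w) {
  const int sum1 = WeightedHadamard4x4(a, w);
  const int sum2 = WeightedHadamard4x4(b, w);
  return abs(sum2 - sum1) >> 5;           // matches the "abs(sum2 - sum1) >> 5" comment
}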
@ -11,23 +11,23 @@
//
// Author: Christian Duvivier (cduvivier@google.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <stdlib.h> // for abs()
#include <emmintrin.h>
#include "./common_sse2.h"
#include "../enc/cost_enc.h"
#include "../enc/vp8i_enc.h"
#include "src/dsp/common_sse2.h"
#include "src/enc/cost_enc.h"
#include "src/enc/vp8i_enc.h"
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
// Does one or two inverse transforms.
static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two) {
static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two) {
// This implementation makes use of 16-bit fixed point versions of two
// multiply constants:
// K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@ -193,10 +193,10 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
}
}
static void FTransformPass1(const __m128i* const in01,
const __m128i* const in23,
__m128i* const out01,
__m128i* const out32) {
static void FTransformPass1_SSE2(const __m128i* const in01,
const __m128i* const in23,
__m128i* const out01,
__m128i* const out32) {
const __m128i k937 = _mm_set1_epi32(937);
const __m128i k1812 = _mm_set1_epi32(1812);
@ -239,8 +239,9 @@ static void FTransformPass1(const __m128i* const in01,
*out32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 3 2 3 2 3 2..
}
static void FTransformPass2(const __m128i* const v01, const __m128i* const v32,
int16_t* out) {
static void FTransformPass2_SSE2(const __m128i* const v01,
const __m128i* const v32,
int16_t* out) {
const __m128i zero = _mm_setzero_si128();
const __m128i seven = _mm_set1_epi16(7);
const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217,
@ -291,7 +292,8 @@ static void FTransformPass2(const __m128i* const v01, const __m128i* const v32,
_mm_storeu_si128((__m128i*)&out[8], d2_f3);
}
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
const __m128i zero = _mm_setzero_si128();
// Load src.
const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
@ -328,13 +330,14 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
__m128i v01, v32;
// First pass
FTransformPass1(&row01, &row23, &v01, &v32);
FTransformPass1_SSE2(&row01, &row23, &v01, &v32);
// Second pass
FTransformPass2(&v01, &v32, out);
FTransformPass2_SSE2(&v01, &v32, out);
}
static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
const __m128i zero = _mm_setzero_si128();
// Load src and convert to 16b.
@ -374,15 +377,15 @@ static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
__m128i v01h, v32h;
// First pass
FTransformPass1(&shuf01l, &shuf23l, &v01l, &v32l);
FTransformPass1(&shuf01h, &shuf23h, &v01h, &v32h);
FTransformPass1_SSE2(&shuf01l, &shuf23l, &v01l, &v32l);
FTransformPass1_SSE2(&shuf01h, &shuf23h, &v01h, &v32h);
// Second pass
FTransformPass2(&v01l, &v32l, out + 0);
FTransformPass2(&v01h, &v32h, out + 16);
FTransformPass2_SSE2(&v01l, &v32l, out + 0);
FTransformPass2_SSE2(&v01h, &v32h, out + 16);
}
static void FTransformWHTRow(const int16_t* const in, __m128i* const out) {
static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) {
const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1);
const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]);
const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]);
@ -398,14 +401,14 @@ static void FTransformWHTRow(const int16_t* const in, __m128i* const out) {
*out = _mm_madd_epi16(D, kMult);
}
static void FTransformWHT(const int16_t* in, int16_t* out) {
static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) {
// Input is 12b signed.
__m128i row0, row1, row2, row3;
// Rows are 14b signed.
FTransformWHTRow(in + 0 * 64, &row0);
FTransformWHTRow(in + 1 * 64, &row1);
FTransformWHTRow(in + 2 * 64, &row2);
FTransformWHTRow(in + 3 * 64, &row3);
FTransformWHTRow_SSE2(in + 0 * 64, &row0);
FTransformWHTRow_SSE2(in + 1 * 64, &row1);
FTransformWHTRow_SSE2(in + 2 * 64, &row2);
FTransformWHTRow_SSE2(in + 3 * 64, &row3);
{
// The a* are 15b signed.
@ -431,9 +434,9 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
// Compute susceptibility based on DCT-coeff histograms:
// the higher, the "easier" the macroblock is to compress.
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
const __m128i zero = _mm_setzero_si128();
const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
int j;
@ -442,7 +445,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
int16_t out[16];
int k;
FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
FTransform_SSE2(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
// Convert coefficients to bin (within out[]).
{
@ -476,7 +479,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
// Intra predictions
// helper for chroma-DC predictions
static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
int j;
const __m128i values = _mm_set1_epi8(v);
for (j = 0; j < 8; ++j) {
@ -484,7 +487,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
}
}
static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
int j;
const __m128i values = _mm_set1_epi8(v);
for (j = 0; j < 16; ++j) {
@ -492,20 +495,20 @@ static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
}
}
static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
static WEBP_INLINE void Fill_SSE2(uint8_t* dst, int value, int size) {
if (size == 4) {
int j;
for (j = 0; j < 4; ++j) {
memset(dst + j * BPS, value, 4);
}
} else if (size == 8) {
Put8x8uv(value, dst);
Put8x8uv_SSE2(value, dst);
} else {
Put16(value, dst);
Put16_SSE2(value, dst);
}
}
static WEBP_INLINE void VE8uv(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) {
int j;
const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
for (j = 0; j < 8; ++j) {
@ -513,7 +516,7 @@ static WEBP_INLINE void VE8uv(uint8_t* dst, const uint8_t* top) {
}
}
static WEBP_INLINE void VE16(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) {
const __m128i top_values = _mm_load_si128((const __m128i*)top);
int j;
for (j = 0; j < 16; ++j) {
@ -521,20 +524,20 @@ static WEBP_INLINE void VE16(uint8_t* dst, const uint8_t* top) {
}
}
static WEBP_INLINE void VerticalPred(uint8_t* dst,
const uint8_t* top, int size) {
static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst,
const uint8_t* top, int size) {
if (top != NULL) {
if (size == 8) {
VE8uv(dst, top);
VE8uv_SSE2(dst, top);
} else {
VE16(dst, top);
VE16_SSE2(dst, top);
}
} else {
Fill(dst, 127, size);
Fill_SSE2(dst, 127, size);
}
}
static WEBP_INLINE void HE8uv(uint8_t* dst, const uint8_t* left) {
static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) {
int j;
for (j = 0; j < 8; ++j) {
const __m128i values = _mm_set1_epi8(left[j]);
@ -543,7 +546,7 @@ static WEBP_INLINE void HE8uv(uint8_t* dst, const uint8_t* left) {
}
}
static WEBP_INLINE void HE16(uint8_t* dst, const uint8_t* left) {
static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) {
int j;
for (j = 0; j < 16; ++j) {
const __m128i values = _mm_set1_epi8(left[j]);
@ -552,21 +555,21 @@ static WEBP_INLINE void HE16(uint8_t* dst, const uint8_t* left) {
}
}
static WEBP_INLINE void HorizontalPred(uint8_t* dst,
const uint8_t* left, int size) {
static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* dst,
const uint8_t* left, int size) {
if (left != NULL) {
if (size == 8) {
HE8uv(dst, left);
HE8uv_SSE2(dst, left);
} else {
HE16(dst, left);
HE16_SSE2(dst, left);
}
} else {
Fill(dst, 129, size);
Fill_SSE2(dst, 129, size);
}
}
static WEBP_INLINE void TM(uint8_t* dst, const uint8_t* left,
const uint8_t* top, int size) {
static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left,
const uint8_t* top, int size) {
const __m128i zero = _mm_setzero_si128();
int y;
if (size == 8) {
@ -593,13 +596,13 @@ static WEBP_INLINE void TM(uint8_t* dst, const uint8_t* left,
}
}
static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
const uint8_t* top, int size) {
static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left,
const uint8_t* top, int size) {
if (left != NULL) {
if (top != NULL) {
TM(dst, left, top, size);
TM_SSE2(dst, left, top, size);
} else {
HorizontalPred(dst, left, size);
HorizontalPred_SSE2(dst, left, size);
}
} else {
// true motion without left samples (hence: with default 129 value)
@ -607,90 +610,90 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
// Note that if top samples are not available, the default value is
// then 129, and not 127 as in the VerticalPred case.
if (top != NULL) {
VerticalPred(dst, top, size);
VerticalPred_SSE2(dst, top, size);
} else {
Fill(dst, 129, size);
Fill_SSE2(dst, 129, size);
}
}
}
static WEBP_INLINE void DC8uv(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static WEBP_INLINE void DC8uv_SSE2(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
const __m128i left_values = _mm_loadl_epi64((const __m128i*)left);
const __m128i combined = _mm_unpacklo_epi64(top_values, left_values);
const int DC = VP8HorizontalAdd8b(&combined) + 8;
Put8x8uv(DC >> 4, dst);
Put8x8uv_SSE2(DC >> 4, dst);
}
static WEBP_INLINE void DC8uvNoLeft(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
const __m128i zero = _mm_setzero_si128();
const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
const __m128i sum = _mm_sad_epu8(top_values, zero);
const int DC = _mm_cvtsi128_si32(sum) + 4;
Put8x8uv(DC >> 3, dst);
Put8x8uv_SSE2(DC >> 3, dst);
}
static WEBP_INLINE void DC8uvNoTop(uint8_t* dst, const uint8_t* left) {
static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* dst, const uint8_t* left) {
// 'left' is contiguous so we can reuse the top summation.
DC8uvNoLeft(dst, left);
DC8uvNoLeft_SSE2(dst, left);
}
static WEBP_INLINE void DC8uvNoTopLeft(uint8_t* dst) {
Put8x8uv(0x80, dst);
static WEBP_INLINE void DC8uvNoTopLeft_SSE2(uint8_t* dst) {
Put8x8uv_SSE2(0x80, dst);
}
static WEBP_INLINE void DC8uvMode(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
if (top != NULL) {
if (left != NULL) { // top and left present
DC8uv(dst, left, top);
DC8uv_SSE2(dst, left, top);
} else { // top, but no left
DC8uvNoLeft(dst, top);
DC8uvNoLeft_SSE2(dst, top);
}
} else if (left != NULL) { // left but no top
DC8uvNoTop(dst, left);
DC8uvNoTop_SSE2(dst, left);
} else { // no top, no left, nothing.
DC8uvNoTopLeft(dst);
DC8uvNoTopLeft_SSE2(dst);
}
}
static WEBP_INLINE void DC16(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static WEBP_INLINE void DC16_SSE2(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
const __m128i top_row = _mm_load_si128((const __m128i*)top);
const __m128i left_row = _mm_load_si128((const __m128i*)left);
const int DC =
VP8HorizontalAdd8b(&top_row) + VP8HorizontalAdd8b(&left_row) + 16;
Put16(DC >> 5, dst);
Put16_SSE2(DC >> 5, dst);
}
static WEBP_INLINE void DC16NoLeft(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
const __m128i top_row = _mm_load_si128((const __m128i*)top);
const int DC = VP8HorizontalAdd8b(&top_row) + 8;
Put16(DC >> 4, dst);
Put16_SSE2(DC >> 4, dst);
}
static WEBP_INLINE void DC16NoTop(uint8_t* dst, const uint8_t* left) {
static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* dst, const uint8_t* left) {
// 'left' is contiguous so we can reuse the top summation.
DC16NoLeft(dst, left);
DC16NoLeft_SSE2(dst, left);
}
static WEBP_INLINE void DC16NoTopLeft(uint8_t* dst) {
Put16(0x80, dst);
static WEBP_INLINE void DC16NoTopLeft_SSE2(uint8_t* dst) {
Put16_SSE2(0x80, dst);
}
static WEBP_INLINE void DC16Mode(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
if (top != NULL) {
if (left != NULL) { // top and left present
DC16(dst, left, top);
DC16_SSE2(dst, left, top);
} else { // top, but no left
DC16NoLeft(dst, top);
DC16NoLeft_SSE2(dst, top);
}
} else if (left != NULL) { // left but no top
DC16NoTop(dst, left);
DC16NoTop_SSE2(dst, left);
} else { // no top, no left, nothing.
DC16NoTopLeft(dst);
DC16NoTopLeft_SSE2(dst);
}
}
@ -709,7 +712,8 @@ static WEBP_INLINE void DC16Mode(uint8_t* dst, const uint8_t* left,
// where: AC = (a + b + 1) >> 1, BC = (b + c + 1) >> 1
// and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1
static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical
static WEBP_INLINE void VE4_SSE2(uint8_t* dst,
const uint8_t* top) { // vertical
const __m128i one = _mm_set1_epi8(1);
const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1));
const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@ -725,7 +729,8 @@ static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical
}
}
static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
static WEBP_INLINE void HE4_SSE2(uint8_t* dst,
const uint8_t* top) { // horizontal
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
@ -737,14 +742,15 @@ static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
}
static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void DC4_SSE2(uint8_t* dst, const uint8_t* top) {
uint32_t dc = 4;
int i;
for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
Fill(dst, dc >> 3, 4);
Fill_SSE2(dst, dc >> 3, 4);
}
static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) { // Down-Left
static WEBP_INLINE void LD4_SSE2(uint8_t* dst,
const uint8_t* top) { // Down-Left
const __m128i one = _mm_set1_epi8(1);
const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@ -760,8 +766,8 @@ static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) { // Down-Left
WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
}
static WEBP_INLINE void VR4(uint8_t* dst,
const uint8_t* top) { // Vertical-Right
static WEBP_INLINE void VR4_SSE2(uint8_t* dst,
const uint8_t* top) { // Vertical-Right
const __m128i one = _mm_set1_epi8(1);
const int I = top[-2];
const int J = top[-3];
@ -786,8 +792,8 @@ static WEBP_INLINE void VR4(uint8_t* dst,
DST(0, 3) = AVG3(K, J, I);
}
static WEBP_INLINE void VL4(uint8_t* dst,
const uint8_t* top) { // Vertical-Left
static WEBP_INLINE void VL4_SSE2(uint8_t* dst,
const uint8_t* top) { // Vertical-Left
const __m128i one = _mm_set1_epi8(1);
const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
@ -812,7 +818,8 @@ static WEBP_INLINE void VL4(uint8_t* dst,
DST(3, 3) = (extra_out >> 8) & 0xff;
}
static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) { // Down-right
static WEBP_INLINE void RD4_SSE2(uint8_t* dst,
const uint8_t* top) { // Down-right
const __m128i one = _mm_set1_epi8(1);
const __m128i LKJIXABC = _mm_loadl_epi64((const __m128i*)(top - 5));
const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4);
@ -828,7 +835,7 @@ static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) { // Down-right
WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
}
static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) {
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
@ -843,7 +850,7 @@ static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}
static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) {
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
@ -866,7 +873,7 @@ static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
DST(1, 3) = AVG3(L, K, J);
}
static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) {
const __m128i zero = _mm_setzero_si128();
const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top));
const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
@ -888,55 +895,56 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
// Left samples are top[-5 .. -2], top_left is top[-1], top are
// located at top[0..3], and top right is top[4..7]
static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
DC4(I4DC4 + dst, top);
TM4(I4TM4 + dst, top);
VE4(I4VE4 + dst, top);
HE4(I4HE4 + dst, top);
RD4(I4RD4 + dst, top);
VR4(I4VR4 + dst, top);
LD4(I4LD4 + dst, top);
VL4(I4VL4 + dst, top);
HD4(I4HD4 + dst, top);
HU4(I4HU4 + dst, top);
static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) {
DC4_SSE2(I4DC4 + dst, top);
TM4_SSE2(I4TM4 + dst, top);
VE4_SSE2(I4VE4 + dst, top);
HE4_SSE2(I4HE4 + dst, top);
RD4_SSE2(I4RD4 + dst, top);
VR4_SSE2(I4VR4 + dst, top);
LD4_SSE2(I4LD4 + dst, top);
VL4_SSE2(I4VL4 + dst, top);
HD4_SSE2(I4HD4 + dst, top);
HU4_SSE2(I4HU4 + dst, top);
}
//------------------------------------------------------------------------------
// Chroma 8x8 prediction (paragraph 12.2)
static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
// U block
DC8uvMode(C8DC8 + dst, left, top);
VerticalPred(C8VE8 + dst, top, 8);
HorizontalPred(C8HE8 + dst, left, 8);
TrueMotion(C8TM8 + dst, left, top, 8);
DC8uvMode_SSE2(C8DC8 + dst, left, top);
VerticalPred_SSE2(C8VE8 + dst, top, 8);
HorizontalPred_SSE2(C8HE8 + dst, left, 8);
TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
// V block
dst += 8;
if (top != NULL) top += 8;
if (left != NULL) left += 16;
DC8uvMode(C8DC8 + dst, left, top);
VerticalPred(C8VE8 + dst, top, 8);
HorizontalPred(C8HE8 + dst, left, 8);
TrueMotion(C8TM8 + dst, left, top, 8);
DC8uvMode_SSE2(C8DC8 + dst, left, top);
VerticalPred_SSE2(C8VE8 + dst, top, 8);
HorizontalPred_SSE2(C8HE8 + dst, left, 8);
TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
}
//------------------------------------------------------------------------------
// luma 16x16 prediction (paragraph 12.3)
static void Intra16Preds(uint8_t* dst,
const uint8_t* left, const uint8_t* top) {
DC16Mode(I16DC16 + dst, left, top);
VerticalPred(I16VE16 + dst, top, 16);
HorizontalPred(I16HE16 + dst, left, 16);
TrueMotion(I16TM16 + dst, left, top, 16);
static void Intra16Preds_SSE2(uint8_t* dst,
const uint8_t* left, const uint8_t* top) {
DC16Mode_SSE2(I16DC16 + dst, left, top);
VerticalPred_SSE2(I16VE16 + dst, top, 16);
HorizontalPred_SSE2(I16HE16 + dst, left, 16);
TrueMotion_SSE2(I16TM16 + dst, left, top, 16);
}
//------------------------------------------------------------------------------
// Metric
static WEBP_INLINE void SubtractAndAccumulate(const __m128i a, const __m128i b,
__m128i* const sum) {
static WEBP_INLINE void SubtractAndAccumulate_SSE2(const __m128i a,
const __m128i b,
__m128i* const sum) {
// take abs(a-b) in 8b
const __m128i a_b = _mm_subs_epu8(a, b);
const __m128i b_a = _mm_subs_epu8(b, a);
@ -951,8 +959,8 @@ static WEBP_INLINE void SubtractAndAccumulate(const __m128i a, const __m128i b,
*sum = _mm_add_epi32(sum1, sum2);
}
static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b,
int num_pairs) {
static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b,
int num_pairs) {
__m128i sum = _mm_setzero_si128();
int32_t tmp[4];
int i;
@ -963,8 +971,8 @@ static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b,
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[BPS * 1]);
const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[BPS * 1]);
__m128i sum1, sum2;
SubtractAndAccumulate(a0, b0, &sum1);
SubtractAndAccumulate(a1, b1, &sum2);
SubtractAndAccumulate_SSE2(a0, b0, &sum1);
SubtractAndAccumulate_SSE2(a1, b1, &sum2);
sum = _mm_add_epi32(sum, _mm_add_epi32(sum1, sum2));
a += 2 * BPS;
b += 2 * BPS;
@ -973,18 +981,18 @@ static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b,
return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
return SSE_16xN(a, b, 8);
static int SSE16x16_SSE2(const uint8_t* a, const uint8_t* b) {
return SSE_16xN_SSE2(a, b, 8);
}
static int SSE16x8(const uint8_t* a, const uint8_t* b) {
return SSE_16xN(a, b, 4);
static int SSE16x8_SSE2(const uint8_t* a, const uint8_t* b) {
return SSE_16xN_SSE2(a, b, 4);
}
#define LOAD_8x16b(ptr) \
_mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero)
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) {
const __m128i zero = _mm_setzero_si128();
int num_pairs = 4;
__m128i sum = zero;
@ -1011,7 +1019,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
}
#undef LOAD_8x16b
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) {
const __m128i zero = _mm_setzero_si128();
// Load values. Note that we read 8 pixels instead of 4,
@ -1048,7 +1056,7 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
//------------------------------------------------------------------------------
static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) {
const __m128i mask = _mm_set1_epi16(0x00ff);
const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]);
const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]);
@ -1086,8 +1094,8 @@ static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int TTransform(const uint8_t* inA, const uint8_t* inB,
const uint16_t* const w) {
static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB,
const uint16_t* const w) {
int32_t sum[4];
__m128i tmp_0, tmp_1, tmp_2, tmp_3;
const __m128i zero = _mm_setzero_si128();
@ -1187,19 +1195,19 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB,
return sum[0] + sum[1] + sum[2] + sum[3];
}
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
const int diff_sum = TTransform(a, b, w);
static int Disto4x4_SSE2(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
const int diff_sum = TTransform_SSE2(a, b, w);
return abs(diff_sum) >> 5;
}
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4(a + x + y, b + x + y, w);
D += Disto4x4_SSE2(a + x + y, b + x + y, w);
}
}
return D;
@ -1209,9 +1217,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
// Quantization
//
static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
const uint16_t* const sharpen,
const VP8Matrix* const mtx) {
static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
const uint16_t* const sharpen,
const VP8Matrix* const mtx) {
const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
const __m128i zero = _mm_setzero_si128();
__m128i coeff0, coeff8;
@ -1321,22 +1329,22 @@ static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
}
static int QuantizeBlock(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx);
static int QuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
return DoQuantizeBlock_SSE2(in, out, &mtx->sharpen_[0], mtx);
}
static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
return DoQuantizeBlock(in, out, NULL, mtx);
static int QuantizeBlockWHT_SSE2(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
return DoQuantizeBlock_SSE2(in, out, NULL, mtx);
}
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
static int Quantize2Blocks_SSE2(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
const uint16_t* const sharpen = &mtx->sharpen_[0];
nz = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
nz = DoQuantizeBlock_SSE2(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
nz |= DoQuantizeBlock_SSE2(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
return nz;
}
@ -1346,139 +1354,28 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
extern void VP8EncDspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) {
VP8CollectHistogram = CollectHistogram;
VP8EncPredLuma16 = Intra16Preds;
VP8EncPredChroma8 = IntraChromaPreds;
VP8EncPredLuma4 = Intra4Preds;
VP8EncQuantizeBlock = QuantizeBlock;
VP8EncQuantize2Blocks = Quantize2Blocks;
VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
VP8ITransform = ITransform;
VP8FTransform = FTransform;
VP8FTransform2 = FTransform2;
VP8FTransformWHT = FTransformWHT;
VP8SSE16x16 = SSE16x16;
VP8SSE16x8 = SSE16x8;
VP8SSE8x8 = SSE8x8;
VP8SSE4x4 = SSE4x4;
VP8TDisto4x4 = Disto4x4;
VP8TDisto16x16 = Disto16x16;
VP8Mean16x4 = Mean16x4;
}
//------------------------------------------------------------------------------
// SSIM / PSNR entry point (TODO(skal): move to its own file later)
static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
const uint8_t* src2, int len) {
int i = 0;
uint32_t sse2 = 0;
if (len >= 16) {
const int limit = len - 32;
int32_t tmp[4];
__m128i sum1;
__m128i sum = _mm_setzero_si128();
__m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
__m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
i += 16;
while (i <= limit) {
const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
__m128i sum2;
i += 16;
SubtractAndAccumulate(a0, b0, &sum1);
sum = _mm_add_epi32(sum, sum1);
a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
i += 16;
SubtractAndAccumulate(a1, b1, &sum2);
sum = _mm_add_epi32(sum, sum2);
}
SubtractAndAccumulate(a0, b0, &sum1);
sum = _mm_add_epi32(sum, sum1);
_mm_storeu_si128((__m128i*)tmp, sum);
sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
for (; i < len; ++i) {
const int32_t diff = src1[i] - src2[i];
sse2 += diff * diff;
}
return sse2;
}
static uint32_t HorizontalAdd16b(const __m128i* const m) {
uint16_t tmp[8];
const __m128i a = _mm_srli_si128(*m, 8);
const __m128i b = _mm_add_epi16(*m, a);
_mm_storeu_si128((__m128i*)tmp, b);
return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];
}
static uint32_t HorizontalAdd32b(const __m128i* const m) {
const __m128i a = _mm_srli_si128(*m, 8);
const __m128i b = _mm_add_epi32(*m, a);
const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));
return (uint32_t)_mm_cvtsi128_si32(c);
}
static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };
#define ACCUMULATE_ROW(WEIGHT) do { \
/* compute row weight (Wx * Wy) */ \
const __m128i Wy = _mm_set1_epi16((WEIGHT)); \
const __m128i W = _mm_mullo_epi16(Wx, Wy); \
/* process 8 bytes at a time (7 bytes, actually) */ \
const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
/* convert to 16b and multiply by weight */ \
const __m128i a1 = _mm_unpacklo_epi8(a0, zero); \
const __m128i b1 = _mm_unpacklo_epi8(b0, zero); \
const __m128i wa1 = _mm_mullo_epi16(a1, W); \
const __m128i wb1 = _mm_mullo_epi16(b1, W); \
/* accumulate */ \
xm = _mm_add_epi16(xm, wa1); \
ym = _mm_add_epi16(ym, wb1); \
xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1)); \
xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1)); \
yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1)); \
src1 += stride1; \
src2 += stride2; \
} while (0)
static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
const uint8_t* src2, int stride2) {
VP8DistoStats stats;
const __m128i zero = _mm_setzero_si128();
__m128i xm = zero, ym = zero; // 16b accums
__m128i xxm = zero, yym = zero, xym = zero; // 32b accum
const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
assert(2 * VP8_SSIM_KERNEL + 1 == 7);
ACCUMULATE_ROW(1);
ACCUMULATE_ROW(2);
ACCUMULATE_ROW(3);
ACCUMULATE_ROW(4);
ACCUMULATE_ROW(3);
ACCUMULATE_ROW(2);
ACCUMULATE_ROW(1);
stats.xm = HorizontalAdd16b(&xm);
stats.ym = HorizontalAdd16b(&ym);
stats.xxm = HorizontalAdd32b(&xxm);
stats.xym = HorizontalAdd32b(&xym);
stats.yym = HorizontalAdd32b(&yym);
return VP8SSIMFromStats(&stats);
}
extern void VP8SSIMDspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
VP8AccumulateSSE = AccumulateSSE_SSE2;
VP8SSIMGet = SSIMGet_SSE2;
VP8CollectHistogram = CollectHistogram_SSE2;
VP8EncPredLuma16 = Intra16Preds_SSE2;
VP8EncPredChroma8 = IntraChromaPreds_SSE2;
VP8EncPredLuma4 = Intra4Preds_SSE2;
VP8EncQuantizeBlock = QuantizeBlock_SSE2;
VP8EncQuantize2Blocks = Quantize2Blocks_SSE2;
VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE2;
VP8ITransform = ITransform_SSE2;
VP8FTransform = FTransform_SSE2;
VP8FTransform2 = FTransform2_SSE2;
VP8FTransformWHT = FTransformWHT_SSE2;
VP8SSE16x16 = SSE16x16_SSE2;
VP8SSE16x8 = SSE16x8_SSE2;
VP8SSE8x8 = SSE8x8_SSE2;
VP8SSE4x4 = SSE4x4_SSE2;
VP8TDisto4x4 = Disto4x4_SSE2;
VP8TDisto16x16 = Disto16x16_SSE2;
VP8Mean16x4 = Mean16x4_SSE2;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2)
WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
#endif // WEBP_USE_SSE2

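DoQuantizeBlock_SSE2 / QuantizeBlock_SSE2 above vectorise the per-coefficient quantiser. As a rough scalar sketch of what happens to a single coefficient, using the VP8Matrix fields visible in these hunks (sharpen_, zthresh_, q_, iq_) plus the bias_/QFIX fixed-point pair assumed from the library's encoder headers; the exact rounding details are illustrative, not authoritative.

// Illustrative per-coefficient quantisation sketch, not the verbatim library code.
// 'n' is the coefficient index, 'j' its zig-zag position. MAX_LEVEL is the 2047
// clamp seen above as max_coeff_2047; QFIX and bias_ are assumed from vp8i_enc.h.
static int QuantizeOne_sketch(int16_t in[16], int16_t out[16],
                              int n, int j, const VP8Matrix* const mtx) {
  const int sign = (in[j] < 0);
  const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
  if (coeff > mtx->zthresh_[j]) {
    int level = (int)((coeff * mtx->iq_[j] + mtx->bias_[j]) >> QFIX);
    if (level > MAX_LEVEL) level = MAX_LEVEL;   // clamp to 2047
    if (sign) level = -level;
    in[j] = level * (int)mtx->q_[j];            // write back the dequantised value
    out[n] = level;
    return (level != 0);                        // feeds the non-zero return flag
  }
  in[j] = 0;
  out[n] = 0;
  return 0;
}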
@ -11,21 +11,21 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE41)
#include <smmintrin.h>
#include <stdlib.h> // for abs()
#include "./common_sse2.h"
#include "../enc/vp8i_enc.h"
#include "src/dsp/common_sse2.h"
#include "src/enc/vp8i_enc.h"
//------------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms.
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
static void CollectHistogram_SSE41(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
@ -70,8 +70,8 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int TTransform(const uint8_t* inA, const uint8_t* inB,
const uint16_t* const w) {
static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB,
const uint16_t* const w) {
int32_t sum[4];
__m128i tmp_0, tmp_1, tmp_2, tmp_3;
@ -168,19 +168,19 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB,
return sum[0] + sum[1] + sum[2] + sum[3];
}
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
const int diff_sum = TTransform(a, b, w);
static int Disto4x4_SSE41(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
const int diff_sum = TTransform_SSE41(a, b, w);
return abs(diff_sum) >> 5;
}
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto16x16_SSE41(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4(a + x + y, b + x + y, w);
D += Disto4x4_SSE41(a + x + y, b + x + y, w);
}
}
return D;
@ -197,9 +197,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
2 * (D) + 1, 2 * (D) + 0, 2 * (C) + 1, 2 * (C) + 0, \
2 * (B) + 1, 2 * (B) + 0, 2 * (A) + 1, 2 * (A) + 0)
static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
const uint16_t* const sharpen,
const VP8Matrix* const mtx) {
static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
const uint16_t* const sharpen,
const VP8Matrix* const mtx) {
const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
const __m128i zero = _mm_setzero_si128();
__m128i out0, out8;
@ -300,22 +300,22 @@ static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
#undef PSHUFB_CST
static int QuantizeBlock(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx);
static int QuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
return DoQuantizeBlock_SSE41(in, out, &mtx->sharpen_[0], mtx);
}
static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
return DoQuantizeBlock(in, out, NULL, mtx);
static int QuantizeBlockWHT_SSE41(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
return DoQuantizeBlock_SSE41(in, out, NULL, mtx);
}
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
static int Quantize2Blocks_SSE41(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
const uint16_t* const sharpen = &mtx->sharpen_[0];
nz = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
nz = DoQuantizeBlock_SSE41(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
nz |= DoQuantizeBlock_SSE41(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
return nz;
}
@ -324,12 +324,12 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
extern void VP8EncDspInitSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE41(void) {
VP8CollectHistogram = CollectHistogram;
VP8EncQuantizeBlock = QuantizeBlock;
VP8EncQuantize2Blocks = Quantize2Blocks;
VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
VP8TDisto4x4 = Disto4x4;
VP8TDisto16x16 = Disto16x16;
VP8CollectHistogram = CollectHistogram_SSE41;
VP8EncQuantizeBlock = QuantizeBlock_SSE41;
VP8EncQuantize2Blocks = Quantize2Blocks_SSE41;
VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE41;
VP8TDisto4x4 = Disto4x4_SSE41;
VP8TDisto16x16 = Disto16x16_SSE41;
}
#else // !WEBP_USE_SSE41

@ -11,7 +11,7 @@
//
// Author: Urvang (urvang@google.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
@ -20,16 +20,17 @@
// Helpful macro.
# define SANITY_CHECK(in, out) \
assert(in != NULL); \
assert(out != NULL); \
assert((in) != NULL); \
assert((out) != NULL); \
assert(width > 0); \
assert(height > 0); \
assert(stride >= width); \
assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
(void)height; // Silence unused warning.
static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
uint8_t* dst, int length, int inverse) {
#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE void PredictLine_C(const uint8_t* src, const uint8_t* pred,
uint8_t* dst, int length, int inverse) {
int i;
if (inverse) {
for (i = 0; i < length; ++i) dst[i] = src[i] + pred[i];
@ -41,10 +42,10 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
//------------------------------------------------------------------------------
// Horizontal filter.
static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
@ -56,7 +57,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
if (row == 0) {
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
PredictLine(in + 1, preds, out + 1, width - 1, inverse);
PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
row = 1;
preds += stride;
in += stride;
@ -66,8 +67,8 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
// Filter line-by-line.
while (row < last_row) {
// Leftmost pixel is predicted from above.
PredictLine(in, preds - stride, out, 1, inverse);
PredictLine(in + 1, preds, out + 1, width - 1, inverse);
PredictLine_C(in, preds - stride, out, 1, inverse);
PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
++row;
preds += stride;
in += stride;
@ -78,10 +79,10 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
//------------------------------------------------------------------------------
// Vertical filter.
static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
@ -94,7 +95,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
// Very first top-left pixel is copied.
out[0] = in[0];
// Rest of top scan-line is left-predicted.
PredictLine(in + 1, preds, out + 1, width - 1, inverse);
PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
row = 1;
in += stride;
out += stride;
@ -105,26 +106,28 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
// Filter line-by-line.
while (row < last_row) {
PredictLine(in, preds, out, width, inverse);
PredictLine_C(in, preds, out, width, inverse);
++row;
preds += stride;
in += stride;
out += stride;
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// Gradient filter.
static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) {
const int g = a + b - c;
return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255; // clip to 8bit
}
static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
@ -136,7 +139,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
// left prediction for top scan-line
if (row == 0) {
out[0] = in[0];
PredictLine(in + 1, preds, out + 1, width - 1, inverse);
PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
row = 1;
preds += stride;
in += stride;
@ -147,11 +150,11 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
while (row < last_row) {
int w;
// leftmost pixel: predict from above.
PredictLine(in, preds - stride, out, 1, inverse);
PredictLine_C(in, preds - stride, out, 1, inverse);
for (w = 1; w < width; ++w) {
const int pred = GradientPredictor(preds[w - 1],
preds[w - stride],
preds[w - stride - 1]);
const int pred = GradientPredictor_C(preds[w - 1],
preds[w - stride],
preds[w - stride - 1]);
out[w] = in[w] + (inverse ? pred : -pred);
}
++row;
@ -160,32 +163,34 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
out += stride;
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
#undef SANITY_CHECK
//------------------------------------------------------------------------------
static void HorizontalFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data);
#if !WEBP_NEON_OMIT_C_CODE
static void HorizontalFilter_C(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoHorizontalFilter_C(data, width, height, stride, 0, height, 0,
filtered_data);
}
static void VerticalFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data);
static void VerticalFilter_C(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoVerticalFilter_C(data, width, height, stride, 0, height, 0, filtered_data);
}
static void GradientFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data);
static void GradientFilter_C(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoGradientFilter_C(data, width, height, stride, 0, height, 0, filtered_data);
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
static void HorizontalUnfilter_C(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
uint8_t pred = (prev == NULL) ? 0 : prev[0];
int i;
for (i = 0; i < width; ++i) {
@ -194,26 +199,28 @@ static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
}
}
static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
#if !WEBP_NEON_OMIT_C_CODE
static void VerticalUnfilter_C(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
if (prev == NULL) {
HorizontalUnfilter(NULL, in, out, width);
HorizontalUnfilter_C(NULL, in, out, width);
} else {
int i;
for (i = 0; i < width; ++i) out[i] = prev[i] + in[i];
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
static void GradientUnfilter_C(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
if (prev == NULL) {
HorizontalUnfilter(NULL, in, out, width);
HorizontalUnfilter_C(NULL, in, out, width);
} else {
uint8_t top = prev[0], top_left = top, left = top;
int i;
for (i = 0; i < width; ++i) {
top = prev[i]; // need to read this first, in case prev==out
left = in[i] + GradientPredictor(left, top, top_left);
left = in[i] + GradientPredictor_C(left, top, top_left);
top_left = top;
out[i] = left;
}
@ -238,14 +245,18 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
if (filters_last_cpuinfo_used == VP8GetCPUInfo) return;
WebPUnfilters[WEBP_FILTER_NONE] = NULL;
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
#if !WEBP_NEON_OMIT_C_CODE
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_C;
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_C;
#endif
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_C;
WebPFilters[WEBP_FILTER_NONE] = NULL;
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
#if !WEBP_NEON_OMIT_C_CODE
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_C;
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_C;
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_C;
#endif
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
@ -253,11 +264,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
VP8FiltersInitSSE2();
}
#endif
#if defined(WEBP_USE_NEON)
if (VP8GetCPUInfo(kNEON)) {
VP8FiltersInitNEON();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
VP8FiltersInitMIPSdspR2();
@ -269,5 +275,20 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
}
#endif
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8FiltersInitNEON();
}
#endif
assert(WebPUnfilters[WEBP_FILTER_HORIZONTAL] != NULL);
assert(WebPUnfilters[WEBP_FILTER_VERTICAL] != NULL);
assert(WebPUnfilters[WEBP_FILTER_GRADIENT] != NULL);
assert(WebPFilters[WEBP_FILTER_HORIZONTAL] != NULL);
assert(WebPFilters[WEBP_FILTER_VERTICAL] != NULL);
assert(WebPFilters[WEBP_FILTER_GRADIENT] != NULL);
filters_last_cpuinfo_used = VP8GetCPUInfo;
}
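For reference while reading the filters.c hunks above: GradientPredictor_C implements the classic left + top - top_left predictor, clipped to the 8-bit range, and both DoGradientFilter_C and GradientUnfilter_C build on it. A minimal stand-alone sketch with made-up sample values (not taken from the patch):

    #include <stdint.h>
    #include <stdio.h>

    /* Same clipping rule as GradientPredictor_C: clamp (a + b - c) to [0, 255]. */
    static int GradientPredict(uint8_t left, uint8_t top, uint8_t top_left) {
      const int g = left + top - top_left;
      return (g < 0) ? 0 : (g > 255) ? 255 : g;
    }

    int main(void) {
      printf("%d\n", GradientPredict(200, 220, 30));  /* 390 overflows  -> 255 */
      printf("%d\n", GradientPredict(10, 20, 200));   /* -170 underflows -> 0  */
      printf("%d\n", GradientPredict(10, 20, 15));    /* 15 stays in range     */
      return 0;
    }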

@ -12,11 +12,11 @@
// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
// Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "../dsp/dsp.h"
#include "src/dsp/dsp.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
@ -101,8 +101,8 @@
); \
} while (0)
static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst,
int length) {
static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* src, uint8_t* dst,
int length) {
DO_PREDICT_LINE(src, dst, length, 0);
}
@ -192,10 +192,11 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst,
} \
} while (0)
static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
uint8_t* out) {
static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(const uint8_t* in,
int width, int height,
int stride,
int row, int num_rows,
uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
@ -207,7 +208,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
if (row == 0) {
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
PredictLine(in + 1, out + 1, width - 1);
PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
row = 1;
preds += stride;
in += stride;
@ -219,9 +220,11 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
}
#undef FILTER_LINE_BY_LINE
static void HorizontalFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoHorizontalFilter(data, width, height, stride, 0, height, filtered_data);
static void HorizontalFilter_MIPSdspR2(const uint8_t* data,
int width, int height,
int stride, uint8_t* filtered_data) {
DoHorizontalFilter_MIPSdspR2(data, width, height, stride, 0, height,
filtered_data);
}
//------------------------------------------------------------------------------
@ -237,9 +240,11 @@ static void HorizontalFilter(const uint8_t* data, int width, int height,
} \
} while (0)
static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows, uint8_t* out) {
static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(const uint8_t* in,
int width, int height,
int stride,
int row, int num_rows,
uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
@ -252,7 +257,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
// Very first top-left pixel is copied.
out[0] = in[0];
// Rest of top scan-line is left-predicted.
PredictLine(in + 1, out + 1, width - 1);
PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
@ -266,15 +271,16 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
}
#undef FILTER_LINE_BY_LINE
static void VerticalFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoVerticalFilter(data, width, height, stride, 0, height, filtered_data);
static void VerticalFilter_MIPSdspR2(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoVerticalFilter_MIPSdspR2(data, width, height, stride, 0, height,
filtered_data);
}
//------------------------------------------------------------------------------
// Gradient filter.
static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
static int GradientPredictor_MIPSdspR2(uint8_t a, uint8_t b, uint8_t c) {
int temp0;
__asm__ volatile (
"addu %[temp0], %[a], %[b] \n\t"
@ -293,9 +299,9 @@ static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
int w; \
PREDICT_LINE_ONE_PASS(in, PREDS - stride, out); \
for (w = 1; w < width; ++w) { \
const int pred = GradientPredictor(PREDS[w - 1], \
PREDS[w - stride], \
PREDS[w - stride - 1]); \
const int pred = GradientPredictor_MIPSdspR2(PREDS[w - 1], \
PREDS[w - stride], \
PREDS[w - stride - 1]); \
out[w] = in[w] OPERATION pred; \
} \
++row; \
@ -304,9 +310,9 @@ static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
} \
} while (0)
static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows, uint8_t* out) {
static void DoGradientFilter_MIPSdspR2(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
@ -318,7 +324,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
// left prediction for top scan-line
if (row == 0) {
out[0] = in[0];
PredictLine(in + 1, out + 1, width - 1);
PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
row = 1;
preds += stride;
in += stride;
@ -330,38 +336,39 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
}
#undef FILTER_LINE_BY_LINE
static void GradientFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoGradientFilter(data, width, height, stride, 0, height, filtered_data);
static void GradientFilter_MIPSdspR2(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoGradientFilter_MIPSdspR2(data, width, height, stride, 0, height,
filtered_data);
}
//------------------------------------------------------------------------------
static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
static void HorizontalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
DO_PREDICT_LINE(in + 1, out + 1, width - 1, 1);
}
static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
static void VerticalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
if (prev == NULL) {
HorizontalUnfilter(NULL, in, out, width);
HorizontalUnfilter_MIPSdspR2(NULL, in, out, width);
} else {
DO_PREDICT_LINE_VERTICAL(in, prev, out, width, 1);
}
}
static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
static void GradientUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
if (prev == NULL) {
HorizontalUnfilter(NULL, in, out, width);
HorizontalUnfilter_MIPSdspR2(NULL, in, out, width);
} else {
uint8_t top = prev[0], top_left = top, left = top;
int i;
for (i = 0; i < width; ++i) {
top = prev[i]; // need to read this first, in case prev==dst
left = in[i] + GradientPredictor(left, top, top_left);
left = in[i] + GradientPredictor_MIPSdspR2(left, top, top_left);
top_left = top;
out[i] = left;
}
@ -379,13 +386,13 @@ static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
extern void VP8FiltersInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMIPSdspR2(void) {
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_MIPSdspR2;
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_MIPSdspR2;
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_MIPSdspR2;
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MIPSdspR2;
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MIPSdspR2;
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2

@ -11,11 +11,11 @@
//
// Author: Prashant Patil (prashant.patil@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MSA)
#include "./msa_macro.h"
#include "src/dsp/msa_macro.h"
#include <assert.h>
@ -66,8 +66,8 @@ static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
//------------------------------------------------------------------------------
// Horrizontal filter
static void HorizontalFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
static void HorizontalFilter_MSA(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
const uint8_t* preds = data;
const uint8_t* in = data;
uint8_t* out = filtered_data;
@ -129,8 +129,8 @@ static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
}
static void GradientFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
static void GradientFilter_MSA(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
const uint8_t* in = data;
const uint8_t* preds = data;
uint8_t* out = filtered_data;
@ -157,8 +157,8 @@ static void GradientFilter(const uint8_t* data, int width, int height,
//------------------------------------------------------------------------------
// Vertical filter
static void VerticalFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
static void VerticalFilter_MSA(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
const uint8_t* in = data;
const uint8_t* preds = data;
uint8_t* out = filtered_data;
@ -190,9 +190,9 @@ static void VerticalFilter(const uint8_t* data, int width, int height,
extern void VP8FiltersInitMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMSA(void) {
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MSA;
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MSA;
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MSA;
}
#else // !WEBP_USE_MSA

@ -11,12 +11,12 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_NEON)
#include <assert.h>
#include "./neon.h"
#include "src/dsp/neon.h"
//------------------------------------------------------------------------------
// Helpful macros.
@ -134,7 +134,7 @@ static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* in,
}
static void VerticalFilter_NEON(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
int stride, uint8_t* filtered_data) {
DoVerticalFilter_NEON(data, width, height, stride, 0, height,
filtered_data);
}
@ -196,7 +196,7 @@ static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* in,
}
static void GradientFilter_NEON(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
int stride, uint8_t* filtered_data) {
DoGradientFilter_NEON(data, width, height, stride, 0, height,
filtered_data);
}
@ -251,9 +251,11 @@ static void VerticalUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
// GradientUnfilter_NEON is correct but slower than the C-version,
// at least on ARM64. For armv7, it's a wash.
// So best is to disable it for now, but keep the idea around...
// #define USE_GRADIENT_UNFILTER
#if !defined(USE_GRADIENT_UNFILTER)
#define USE_GRADIENT_UNFILTER 0 // ALTERNATE_CODE
#endif
#if defined(USE_GRADIENT_UNFILTER)
#if (USE_GRADIENT_UNFILTER == 1)
#define GRAD_PROCESS_LANE(L) do { \
const uint8x8_t tmp1 = ROTATE_RIGHT_N(pred, 1); /* rotate predictor in */ \
const int16x8_t tmp2 = vaddq_s16(BC, U8_TO_S16(tmp1)); \
@ -292,7 +294,7 @@ static void GradientPredictInverse_NEON(const uint8_t* const in,
#undef GRAD_PROCESS_LANE
static void GradientUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
uint8_t* out, int width) {
if (prev == NULL) {
HorizontalUnfilter_NEON(NULL, in, out, width);
} else {
@ -311,7 +313,7 @@ extern void VP8FiltersInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitNEON(void) {
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_NEON;
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_NEON;
#if defined(USE_GRADIENT_UNFILTER)
#if (USE_GRADIENT_UNFILTER == 1)
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_NEON;
#endif
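The USE_GRADIENT_UNFILTER hunks above (and the WEBP_SWAP_16BIT_CSP hunks later in lossless.c) replace presence-based #if defined(...) tests with value-based #if (X == 1) tests: the macro now always carries an explicit 0/1 value, so defining it to 0 from the build really disables the path. A small illustration with a hypothetical FEATURE_X macro, not part of libwebp:

    /* Provide a default so the macro always has a value. */
    #if !defined(FEATURE_X)
    #define FEATURE_X 0
    #endif

    int FeatureEnabled(void) {
    #if defined(FEATURE_X)
      /* presence test: taken even when FEATURE_X is defined to 0 */
    #endif
    #if (FEATURE_X == 1)
      return 1;   /* value test: taken only when explicitly set to 1 */
    #else
      return 0;
    #endif
    }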

@ -11,7 +11,7 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
@ -24,16 +24,16 @@
// Helpful macro.
# define SANITY_CHECK(in, out) \
assert(in != NULL); \
assert(out != NULL); \
assert((in) != NULL); \
assert((out) != NULL); \
assert(width > 0); \
assert(height > 0); \
assert(stride >= width); \
assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
(void)height; // Silence unused warning.
static void PredictLineTop(const uint8_t* src, const uint8_t* pred,
uint8_t* dst, int length) {
static void PredictLineTop_SSE2(const uint8_t* src, const uint8_t* pred,
uint8_t* dst, int length) {
int i;
const int max_pos = length & ~31;
assert(length >= 0);
@ -51,7 +51,7 @@ static void PredictLineTop(const uint8_t* src, const uint8_t* pred,
}
// Special case for left-based prediction (when preds==dst-1 or preds==src-1).
static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length) {
static void PredictLineLeft_SSE2(const uint8_t* src, uint8_t* dst, int length) {
int i;
const int max_pos = length & ~31;
assert(length >= 0);
@ -71,10 +71,11 @@ static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length) {
//------------------------------------------------------------------------------
// Horizontal filter.
static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
uint8_t* out) {
static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in,
int width, int height,
int stride,
int row, int num_rows,
uint8_t* out) {
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
@ -84,7 +85,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
if (row == 0) {
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
PredictLineLeft(in + 1, out + 1, width - 1);
PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
@ -94,7 +95,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
while (row < last_row) {
// Leftmost pixel is predicted from above.
out[0] = in[0] - in[-stride];
PredictLineLeft(in + 1, out + 1, width - 1);
PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
++row;
in += stride;
out += stride;
@ -104,9 +105,10 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
//------------------------------------------------------------------------------
// Vertical filter.
static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows, uint8_t* out) {
static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
uint8_t* out) {
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
@ -117,7 +119,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
// Very first top-left pixel is copied.
out[0] = in[0];
// Rest of top scan-line is left-predicted.
PredictLineLeft(in + 1, out + 1, width - 1);
PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
@ -125,7 +127,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
// Filter line-by-line.
while (row < last_row) {
PredictLineTop(in, in - stride, out, width);
PredictLineTop_SSE2(in, in - stride, out, width);
++row;
in += stride;
out += stride;
@ -135,14 +137,14 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
//------------------------------------------------------------------------------
// Gradient filter.
static WEBP_INLINE int GradientPredictorC(uint8_t a, uint8_t b, uint8_t c) {
static WEBP_INLINE int GradientPredictor_SSE2(uint8_t a, uint8_t b, uint8_t c) {
const int g = a + b - c;
return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255; // clip to 8bit
}
static void GradientPredictDirect(const uint8_t* const row,
const uint8_t* const top,
uint8_t* const out, int length) {
static void GradientPredictDirect_SSE2(const uint8_t* const row,
const uint8_t* const top,
uint8_t* const out, int length) {
const int max_pos = length & ~7;
int i;
const __m128i zero = _mm_setzero_si128();
@ -161,14 +163,14 @@ static void GradientPredictDirect(const uint8_t* const row,
_mm_storel_epi64((__m128i*)(out + i), H);
}
for (; i < length; ++i) {
out[i] = row[i] - GradientPredictorC(row[i - 1], top[i], top[i - 1]);
out[i] = row[i] - GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
}
}
static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
uint8_t* out) {
static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
uint8_t* out) {
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
@ -178,7 +180,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
// left prediction for top scan-line
if (row == 0) {
out[0] = in[0];
PredictLineLeft(in + 1, out + 1, width - 1);
PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
@ -187,7 +189,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
// Filter line-by-line.
while (row < last_row) {
out[0] = in[0] - in[-stride];
GradientPredictDirect(in + 1, in + 1 - stride, out + 1, width - 1);
GradientPredictDirect_SSE2(in + 1, in + 1 - stride, out + 1, width - 1);
++row;
in += stride;
out += stride;
@ -198,26 +200,27 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
//------------------------------------------------------------------------------
static void HorizontalFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoHorizontalFilter(data, width, height, stride, 0, height, filtered_data);
static void HorizontalFilter_SSE2(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoHorizontalFilter_SSE2(data, width, height, stride, 0, height,
filtered_data);
}
static void VerticalFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoVerticalFilter(data, width, height, stride, 0, height, filtered_data);
static void VerticalFilter_SSE2(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoVerticalFilter_SSE2(data, width, height, stride, 0, height, filtered_data);
}
static void GradientFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoGradientFilter(data, width, height, stride, 0, height, filtered_data);
static void GradientFilter_SSE2(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoGradientFilter_SSE2(data, width, height, stride, 0, height, filtered_data);
}
//------------------------------------------------------------------------------
// Inverse transforms
static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
static void HorizontalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
int i;
__m128i last;
out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
@ -238,10 +241,10 @@ static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
for (; i < width; ++i) out[i] = in[i] + out[i - 1];
}
static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
static void VerticalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
if (prev == NULL) {
HorizontalUnfilter(NULL, in, out, width);
HorizontalUnfilter_SSE2(NULL, in, out, width);
} else {
int i;
const int max_pos = width & ~31;
@ -260,9 +263,9 @@ static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
}
}
static void GradientPredictInverse(const uint8_t* const in,
const uint8_t* const top,
uint8_t* const row, int length) {
static void GradientPredictInverse_SSE2(const uint8_t* const in,
const uint8_t* const top,
uint8_t* const row, int length) {
if (length > 0) {
int i;
const int max_pos = length & ~7;
@ -293,18 +296,18 @@ static void GradientPredictInverse(const uint8_t* const in,
_mm_storel_epi64((__m128i*)&row[i], out);
}
for (; i < length; ++i) {
row[i] = in[i] + GradientPredictorC(row[i - 1], top[i], top[i - 1]);
row[i] = in[i] + GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
}
}
}
static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
static void GradientUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
if (prev == NULL) {
HorizontalUnfilter(NULL, in, out, width);
HorizontalUnfilter_SSE2(NULL, in, out, width);
} else {
out[0] = in[0] + prev[0]; // predict from above
GradientPredictInverse(in + 1, prev + 1, out + 1, width - 1);
GradientPredictInverse_SSE2(in + 1, prev + 1, out + 1, width - 1);
}
}
@ -314,13 +317,13 @@ static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
extern void VP8FiltersInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitSSE2(void) {
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_SSE2;
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_SSE2;
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_SSE2;
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_SSE2;
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_SSE2;
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_SSE2;
}
#else // !WEBP_USE_SSE2
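The SSE2 unfilters above invert the forward filters: the horizontal filter stores each pixel as the difference from its left neighbour, and the unfilter reconstructs the row as a running sum (the scalar tail of HorizontalUnfilter_SSE2, out[i] = in[i] + out[i - 1], shows the idea). A plain-C sketch of that round trip, independent of the SIMD code:

    #include <assert.h>
    #include <stdint.h>

    /* Forward: out[0] = in[0], out[i] = in[i] - in[i-1] (mod 256). */
    static void HFilterRow(const uint8_t* in, uint8_t* out, int width) {
      int i;
      out[0] = in[0];
      for (i = 1; i < width; ++i) out[i] = (uint8_t)(in[i] - in[i - 1]);
    }

    /* Inverse: prefix sum, matching HorizontalUnfilter_* with prev == NULL. */
    static void HUnfilterRow(const uint8_t* in, uint8_t* out, int width) {
      int i;
      out[0] = in[0];
      for (i = 1; i < width; ++i) out[i] = (uint8_t)(in[i] + out[i - 1]);
    }

    int main(void) {
      const uint8_t src[5] = { 10, 12, 12, 200, 199 };
      uint8_t filtered[5], restored[5];
      int i;
      HFilterRow(src, filtered, 5);
      HUnfilterRow(filtered, restored, 5);
      for (i = 0; i < 5; ++i) assert(restored[i] == src[i]);
      return 0;
    }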

@ -13,14 +13,15 @@
// Jyrki Alakuijala (jyrki@google.com)
// Urvang Joshi (urvang@google.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include "../dec/vp8li_dec.h"
#include "../utils/endian_inl_utils.h"
#include "./lossless.h"
#include "./lossless_common.h"
#include "src/dec/vp8li_dec.h"
#include "src/utils/endian_inl_utils.h"
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
#define MAX_DIFF_COST (1e30f)
@ -80,8 +81,9 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
}
// gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined.
#if defined(__arm__) && LOCAL_GCC_VERSION == 0x409
// gcc <= 4.9 on ARM generates incorrect code in Select() when Sub3() is
// inlined.
#if defined(__arm__) && LOCAL_GCC_VERSION <= 0x409
# define LOCAL_INLINE __attribute__ ((noinline))
#else
# define LOCAL_INLINE WEBP_INLINE
@ -107,69 +109,69 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
//------------------------------------------------------------------------------
// Predictors
static uint32_t Predictor0(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor0_C(uint32_t left, const uint32_t* const top) {
(void)top;
(void)left;
return ARGB_BLACK;
}
static uint32_t Predictor1(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor1_C(uint32_t left, const uint32_t* const top) {
(void)top;
return left;
}
static uint32_t Predictor2(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor2_C(uint32_t left, const uint32_t* const top) {
(void)left;
return top[0];
}
static uint32_t Predictor3(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor3_C(uint32_t left, const uint32_t* const top) {
(void)left;
return top[1];
}
static uint32_t Predictor4(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor4_C(uint32_t left, const uint32_t* const top) {
(void)left;
return top[-1];
}
static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor5_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average3(left, top[0], top[1]);
return pred;
}
static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor6_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(left, top[-1]);
return pred;
}
static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor7_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(left, top[0]);
return pred;
}
static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor8_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(top[-1], top[0]);
(void)left;
return pred;
}
static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor9_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(top[0], top[1]);
(void)left;
return pred;
}
static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor10_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
return pred;
}
static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor11_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Select(top[0], left, top[-1]);
return pred;
}
static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor12_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
return pred;
}
static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor13_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
return pred;
}
GENERATE_PREDICTOR_ADD(Predictor0, PredictorAdd0)
static void PredictorAdd1(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
GENERATE_PREDICTOR_ADD(Predictor0_C, PredictorAdd0_C)
static void PredictorAdd1_C(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
uint32_t left = out[-1];
for (i = 0; i < num_pixels; ++i) {
@ -177,29 +179,29 @@ static void PredictorAdd1(const uint32_t* in, const uint32_t* upper,
}
(void)upper;
}
GENERATE_PREDICTOR_ADD(Predictor2, PredictorAdd2)
GENERATE_PREDICTOR_ADD(Predictor3, PredictorAdd3)
GENERATE_PREDICTOR_ADD(Predictor4, PredictorAdd4)
GENERATE_PREDICTOR_ADD(Predictor5, PredictorAdd5)
GENERATE_PREDICTOR_ADD(Predictor6, PredictorAdd6)
GENERATE_PREDICTOR_ADD(Predictor7, PredictorAdd7)
GENERATE_PREDICTOR_ADD(Predictor8, PredictorAdd8)
GENERATE_PREDICTOR_ADD(Predictor9, PredictorAdd9)
GENERATE_PREDICTOR_ADD(Predictor10, PredictorAdd10)
GENERATE_PREDICTOR_ADD(Predictor11, PredictorAdd11)
GENERATE_PREDICTOR_ADD(Predictor12, PredictorAdd12)
GENERATE_PREDICTOR_ADD(Predictor13, PredictorAdd13)
GENERATE_PREDICTOR_ADD(Predictor2_C, PredictorAdd2_C)
GENERATE_PREDICTOR_ADD(Predictor3_C, PredictorAdd3_C)
GENERATE_PREDICTOR_ADD(Predictor4_C, PredictorAdd4_C)
GENERATE_PREDICTOR_ADD(Predictor5_C, PredictorAdd5_C)
GENERATE_PREDICTOR_ADD(Predictor6_C, PredictorAdd6_C)
GENERATE_PREDICTOR_ADD(Predictor7_C, PredictorAdd7_C)
GENERATE_PREDICTOR_ADD(Predictor8_C, PredictorAdd8_C)
GENERATE_PREDICTOR_ADD(Predictor9_C, PredictorAdd9_C)
GENERATE_PREDICTOR_ADD(Predictor10_C, PredictorAdd10_C)
GENERATE_PREDICTOR_ADD(Predictor11_C, PredictorAdd11_C)
GENERATE_PREDICTOR_ADD(Predictor12_C, PredictorAdd12_C)
GENERATE_PREDICTOR_ADD(Predictor13_C, PredictorAdd13_C)
//------------------------------------------------------------------------------
// Inverse prediction.
static void PredictorInverseTransform(const VP8LTransform* const transform,
int y_start, int y_end,
const uint32_t* in, uint32_t* out) {
static void PredictorInverseTransform_C(const VP8LTransform* const transform,
int y_start, int y_end,
const uint32_t* in, uint32_t* out) {
const int width = transform->xsize_;
if (y_start == 0) { // First Row follows the L (mode=1) mode.
PredictorAdd0(in, NULL, 1, out);
PredictorAdd1(in + 1, NULL, width - 1, out + 1);
PredictorAdd0_C(in, NULL, 1, out);
PredictorAdd1_C(in + 1, NULL, width - 1, out + 1);
in += width;
out += width;
++y_start;
@ -217,7 +219,7 @@ static void PredictorInverseTransform(const VP8LTransform* const transform,
const uint32_t* pred_mode_src = pred_mode_base;
int x = 1;
// First pixel follows the T (mode=2) mode.
PredictorAdd2(in, out - width, 1, out);
PredictorAdd2_C(in, out - width, 1, out);
// .. the rest:
while (x < width) {
const VP8LPredictorAddSubFunc pred_func =
@ -272,8 +274,8 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
const uint32_t argb = src[i];
const uint32_t green = argb >> 8;
const uint32_t red = argb >> 16;
int new_red = red;
int new_blue = argb;
int new_red = red & 0xff;
int new_blue = argb & 0xff;
new_red += ColorTransformDelta(m->green_to_red_, green);
new_red &= 0xff;
new_blue += ColorTransformDelta(m->green_to_blue_, green);
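The two & 0xff masks added in this hunk keep the intermediate values confined to the 8-bit channel being transformed: red (argb >> 16) still carries the alpha bits above it, argb itself is the full 32-bit word, and the old code relied on the later &= 0xff to discard the excess; for new_blue that also meant assigning a uint32_t that can exceed INT_MAX to an int, which is implementation-defined. A tiny illustration of the masked channel extraction, using a made-up ARGB value (the green mask here is added for clarity only):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      const uint32_t argb = 0xFF80C040u;                 /* A=ff R=80 G=c0 B=40 */
      const int green    = (int)((argb >>  8) & 0xff);   /* 0xc0                */
      const int new_red  = (int)((argb >> 16) & 0xff);   /* 0x80, red only      */
      const int new_blue = (int)( argb        & 0xff);   /* 0x40, blue only     */
      printf("green=%02x red=%02x blue=%02x\n", green, new_red, new_blue);
      return 0;
    }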
@ -284,9 +286,9 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
}
// Color space inverse transform.
static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
int y_start, int y_end,
const uint32_t* src, uint32_t* dst) {
static void ColorSpaceInverseTransform_C(const VP8LTransform* const transform,
int y_start, int y_end,
const uint32_t* src, uint32_t* dst) {
const int width = transform->xsize_;
const int tile_width = 1 << transform->bits_;
const int mask = tile_width - 1;
@ -362,10 +364,10 @@ STATIC_DECL void FUNC_NAME(const VP8LTransform* const transform, \
} \
}
COLOR_INDEX_INVERSE(ColorIndexInverseTransform, MapARGB, static, uint32_t, 32b,
VP8GetARGBIndex, VP8GetARGBValue)
COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, MapAlpha, , uint8_t,
8b, VP8GetAlphaIndex, VP8GetAlphaValue)
COLOR_INDEX_INVERSE(ColorIndexInverseTransform_C, MapARGB_C, static,
uint32_t, 32b, VP8GetARGBIndex, VP8GetARGBValue)
COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, MapAlpha_C, ,
uint8_t, 8b, VP8GetAlphaIndex, VP8GetAlphaValue)
#undef COLOR_INDEX_INVERSE
@ -380,7 +382,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
VP8LAddGreenToBlueAndRed(in, (row_end - row_start) * width, out);
break;
case PREDICTOR_TRANSFORM:
PredictorInverseTransform(transform, row_start, row_end, in, out);
PredictorInverseTransform_C(transform, row_start, row_end, in, out);
if (row_end != transform->ysize_) {
// The last predicted row in this iteration will be the top-pred row
// for the first row in next iteration.
@ -389,7 +391,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
}
break;
case CROSS_COLOR_TRANSFORM:
ColorSpaceInverseTransform(transform, row_start, row_end, in, out);
ColorSpaceInverseTransform_C(transform, row_start, row_end, in, out);
break;
case COLOR_INDEXING_TRANSFORM:
if (in == out && transform->bits_ > 0) {
@ -403,9 +405,9 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
VP8LSubSampleSize(transform->xsize_, transform->bits_);
uint32_t* const src = out + out_stride - in_stride;
memmove(src, out, in_stride * sizeof(*src));
ColorIndexInverseTransform(transform, row_start, row_end, src, out);
ColorIndexInverseTransform_C(transform, row_start, row_end, src, out);
} else {
ColorIndexInverseTransform(transform, row_start, row_end, in, out);
ColorIndexInverseTransform_C(transform, row_start, row_end, in, out);
}
break;
}
@ -452,7 +454,7 @@ void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
const uint32_t argb = *src++;
const uint8_t rg = ((argb >> 16) & 0xf0) | ((argb >> 12) & 0xf);
const uint8_t ba = ((argb >> 0) & 0xf0) | ((argb >> 28) & 0xf);
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
*dst++ = ba;
*dst++ = rg;
#else
@ -469,7 +471,7 @@ void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
const uint32_t argb = *src++;
const uint8_t rg = ((argb >> 16) & 0xf8) | ((argb >> 13) & 0x7);
const uint8_t gb = ((argb >> 5) & 0xe0) | ((argb >> 3) & 0x1f);
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
*dst++ = gb;
*dst++ = rg;
#else
@ -496,22 +498,7 @@ static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
const uint32_t argb = *src++;
#if !defined(WORDS_BIGENDIAN)
#if !defined(WEBP_REFERENCE_IMPLEMENTATION)
WebPUint32ToMem(dst, BSwap32(argb));
#else // WEBP_REFERENCE_IMPLEMENTATION
dst[0] = (argb >> 24) & 0xff;
dst[1] = (argb >> 16) & 0xff;
dst[2] = (argb >> 8) & 0xff;
dst[3] = (argb >> 0) & 0xff;
#endif
#else // WORDS_BIGENDIAN
dst[0] = (argb >> 0) & 0xff;
dst[1] = (argb >> 8) & 0xff;
dst[2] = (argb >> 16) & 0xff;
dst[3] = (argb >> 24) & 0xff;
#endif
dst += sizeof(argb);
}
} else {
@ -593,23 +580,23 @@ extern void VP8LDspInitMSA(void);
static volatile VP8CPUInfo lossless_last_cpuinfo_used =
(VP8CPUInfo)&lossless_last_cpuinfo_used;
#define COPY_PREDICTOR_ARRAY(IN, OUT) do { \
(OUT)[0] = IN##0; \
(OUT)[1] = IN##1; \
(OUT)[2] = IN##2; \
(OUT)[3] = IN##3; \
(OUT)[4] = IN##4; \
(OUT)[5] = IN##5; \
(OUT)[6] = IN##6; \
(OUT)[7] = IN##7; \
(OUT)[8] = IN##8; \
(OUT)[9] = IN##9; \
(OUT)[10] = IN##10; \
(OUT)[11] = IN##11; \
(OUT)[12] = IN##12; \
(OUT)[13] = IN##13; \
(OUT)[14] = IN##0; /* <- padding security sentinels*/ \
(OUT)[15] = IN##0; \
#define COPY_PREDICTOR_ARRAY(IN, OUT) do { \
(OUT)[0] = IN##0_C; \
(OUT)[1] = IN##1_C; \
(OUT)[2] = IN##2_C; \
(OUT)[3] = IN##3_C; \
(OUT)[4] = IN##4_C; \
(OUT)[5] = IN##5_C; \
(OUT)[6] = IN##6_C; \
(OUT)[7] = IN##7_C; \
(OUT)[8] = IN##8_C; \
(OUT)[9] = IN##9_C; \
(OUT)[10] = IN##10_C; \
(OUT)[11] = IN##11_C; \
(OUT)[12] = IN##12_C; \
(OUT)[13] = IN##13_C; \
(OUT)[14] = IN##0_C; /* <- padding security sentinels*/ \
(OUT)[15] = IN##0_C; \
} while (0);
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
@ -620,18 +607,21 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd)
COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd_C)
#if !WEBP_NEON_OMIT_C_CODE
VP8LAddGreenToBlueAndRed = VP8LAddGreenToBlueAndRed_C;
VP8LTransformColorInverse = VP8LTransformColorInverse_C;
VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
VP8LConvertBGRAToRGBA = VP8LConvertBGRAToRGBA_C;
VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C;
#endif
VP8LConvertBGRAToRGBA4444 = VP8LConvertBGRAToRGBA4444_C;
VP8LConvertBGRAToRGB565 = VP8LConvertBGRAToRGB565_C;
VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C;
VP8LMapColor32b = MapARGB;
VP8LMapColor8b = MapAlpha;
VP8LMapColor32b = MapARGB_C;
VP8LMapColor8b = MapAlpha_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
@ -640,11 +630,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
VP8LDspInitSSE2();
}
#endif
#if defined(WEBP_USE_NEON)
if (VP8GetCPUInfo(kNEON)) {
VP8LDspInitNEON();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
VP8LDspInitMIPSdspR2();
@ -656,6 +641,24 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
}
#endif
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8LDspInitNEON();
}
#endif
assert(VP8LAddGreenToBlueAndRed != NULL);
assert(VP8LTransformColorInverse != NULL);
assert(VP8LConvertBGRAToRGBA != NULL);
assert(VP8LConvertBGRAToRGB != NULL);
assert(VP8LConvertBGRAToBGR != NULL);
assert(VP8LConvertBGRAToRGBA4444 != NULL);
assert(VP8LConvertBGRAToRGB565 != NULL);
assert(VP8LMapColor32b != NULL);
assert(VP8LMapColor8b != NULL);
lossless_last_cpuinfo_used = VP8GetCPUInfo;
}
#undef COPY_PREDICTOR_ARRAY
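VP8FiltersInit, VP8LDspInit, and VP8LEncDspInit all adopt the same initialisation pattern in this update: plain-C entry points are installed only when WEBP_NEON_OMIT_C_CODE is 0, SIMD variants are plugged in behind VP8GetCPUInfo checks, NEON is handled last (and unconditionally when the C versions were compiled out), and a block of asserts verifies that every function pointer got resolved. A stripped-down sketch of the pattern, assuming src/dsp/dsp.h provides VP8GetCPUInfo, kNEON and WEBP_NEON_OMIT_C_CODE; MyFunc and friends are hypothetical names:

    #include <assert.h>
    #include "src/dsp/dsp.h"

    typedef void (*MyFuncPtr)(void);
    static MyFuncPtr MyFunc;

    #if !WEBP_NEON_OMIT_C_CODE
    static void MyFunc_C(void) { /* portable fallback */ }
    #endif
    #if defined(WEBP_USE_NEON)
    static void MyFunc_NEON(void) { /* NEON fast path */ }
    static void MyInitNEON(void) { MyFunc = MyFunc_NEON; }
    #endif

    void MyInit(void) {
    #if !WEBP_NEON_OMIT_C_CODE
      MyFunc = MyFunc_C;        /* C version, unless omitted at build time */
    #endif
    #if defined(WEBP_USE_NEON)
      if (WEBP_NEON_OMIT_C_CODE ||
          (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
        MyInitNEON();           /* mandatory when the C version is absent */
      }
    #endif
      assert(MyFunc != NULL);   /* every dispatch pointer must end up set */
    }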

@ -15,18 +15,18 @@
#ifndef WEBP_DSP_LOSSLESS_H_
#define WEBP_DSP_LOSSLESS_H_
#include "../webp/types.h"
#include "../webp/decode.h"
#include "src/webp/types.h"
#include "src/webp/decode.h"
#include "../enc/histogram_enc.h"
#include "../utils/utils.h"
#include "src/enc/histogram_enc.h"
#include "src/utils/utils.h"
#ifdef __cplusplus
extern "C" {
#endif
#ifdef WEBP_EXPERIMENTAL_FEATURES
#include "../enc/delta_palettization_enc.h"
#include "src/enc/delta_palettization_enc.h"
#endif // WEBP_EXPERIMENTAL_FEATURES
//------------------------------------------------------------------------------
@ -124,7 +124,7 @@ void VP8LDspInit(void);
typedef void (*VP8LProcessEncBlueAndRedFunc)(uint32_t* dst, int num_pixels);
extern VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
uint32_t* const dst, int num_pixels);
uint32_t* dst, int num_pixels);
extern VP8LTransformColorFunc VP8LTransformColor;
typedef void (*VP8LCollectColorBlueTransformsFunc)(
const uint32_t* argb, int stride,

@ -16,9 +16,9 @@
#ifndef WEBP_DSP_LOSSLESS_COMMON_H_
#define WEBP_DSP_LOSSLESS_COMMON_H_
#include "../webp/types.h"
#include "src/webp/types.h"
#include "../utils/utils.h"
#include "src/utils/utils.h"
#ifdef __cplusplus
extern "C" {
@ -93,14 +93,6 @@ static WEBP_INLINE float VP8LFastSLog2(uint32_t v) {
// -----------------------------------------------------------------------------
// PrefixEncode()
static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
const int log_floor = BitsLog2Floor(n);
if (n == (n & ~(n - 1))) { // zero or a power of two.
return log_floor;
}
return log_floor + 1;
}
// Splitting of distance and length codes into prefixes and
// extra bits. The prefixes are encoded with an entropy code
// while the extra bits are stored just as normal bits.

@ -13,15 +13,16 @@
// Jyrki Alakuijala (jyrki@google.com)
// Urvang Joshi (urvang@google.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include "../dec/vp8li_dec.h"
#include "../utils/endian_inl_utils.h"
#include "./lossless.h"
#include "./lossless_common.h"
#include "./yuv.h"
#include "src/dec/vp8li_dec.h"
#include "src/utils/endian_inl_utils.h"
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
#include "src/dsp/yuv.h"
// lookup table for small values of log2(int)
const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
@ -325,7 +326,7 @@ const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
};
static float FastSLog2Slow(uint32_t v) {
static float FastSLog2Slow_C(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
int log_cnt = 0;
@ -351,7 +352,7 @@ static float FastSLog2Slow(uint32_t v) {
}
}
static float FastLog2Slow(uint32_t v) {
static float FastLog2Slow_C(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
int log_cnt = 0;
@ -380,7 +381,7 @@ static float FastLog2Slow(uint32_t v) {
// Methods to calculate Entropy (Shannon).
// Compute the combined Shanon's entropy for distribution {X} and {X+Y}
static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
static float CombinedShannonEntropy_C(const int X[256], const int Y[256]) {
int i;
double retval = 0.;
int sumX = 0, sumXY = 0;
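The entropy helpers in this file use the "SLog2" convention of VP8LFastSLog2, i.e. SLog2(v) = v * log2(v); the un-normalised Shannon cost of a histogram X with total S is then SLog2(S) - sum_i SLog2(X[i]), and the combined variant above applies the same bookkeeping to X and to X+Y in one pass. A small self-contained check of that identity (ShannonCost is a hypothetical helper, not part of the patch):

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    static double SLog2(double v) { return (v > 0.) ? v * log2(v) : 0.; }

    /* Bits needed to code histogram X under its own empirical distribution:
     * total * H(X) = SLog2(total) - sum_i SLog2(X[i]). */
    static double ShannonCost(const uint32_t* X, int n) {
      double total = 0., slog_sum = 0.;
      int i;
      for (i = 0; i < n; ++i) {
        total    += X[i];
        slog_sum += SLog2((double)X[i]);
      }
      return SLog2(total) - slog_sum;
    }

    int main(void) {
      const uint32_t X[4] = { 8, 8, 8, 8 };       /* uniform: 2 bits/symbol */
      printf("%.1f bits\n", ShannonCost(X, 4));   /* prints 64.0 bits       */
      return 0;
    }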
@ -453,9 +454,9 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
*i_prev = i;
}
static void GetEntropyUnrefined(const uint32_t X[], int length,
VP8LBitEntropy* const bit_entropy,
VP8LStreaks* const stats) {
static void GetEntropyUnrefined_C(const uint32_t X[], int length,
VP8LBitEntropy* const bit_entropy,
VP8LStreaks* const stats) {
int i;
int i_prev = 0;
uint32_t x_prev = X[0];
@ -474,10 +475,11 @@ static void GetEntropyUnrefined(const uint32_t X[], int length,
bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
}
static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[],
int length,
VP8LBitEntropy* const bit_entropy,
VP8LStreaks* const stats) {
static void GetCombinedEntropyUnrefined_C(const uint32_t X[],
const uint32_t Y[],
int length,
VP8LBitEntropy* const bit_entropy,
VP8LStreaks* const stats) {
int i = 1;
int i_prev = 0;
uint32_t xy_prev = X[0] + Y[0];
@ -520,8 +522,8 @@ void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
const uint32_t argb = data[i];
const uint32_t green = argb >> 8;
const uint32_t red = argb >> 16;
int new_red = red;
int new_blue = argb;
int new_red = red & 0xff;
int new_blue = argb & 0xff;
new_red -= ColorTransformDelta(m->green_to_red_, green);
new_red &= 0xff;
new_blue -= ColorTransformDelta(m->green_to_blue_, green);
@ -577,8 +579,8 @@ void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
//------------------------------------------------------------------------------
static int VectorMismatch(const uint32_t* const array1,
const uint32_t* const array2, int length) {
static int VectorMismatch_C(const uint32_t* const array1,
const uint32_t* const array2, int length) {
int match_len = 0;
while (match_len < length && array1[match_len] == array2[match_len]) {
@ -610,15 +612,15 @@ void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
//------------------------------------------------------------------------------
static double ExtraCost(const uint32_t* population, int length) {
static double ExtraCost_C(const uint32_t* population, int length) {
int i;
double cost = 0.;
for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
return cost;
}
static double ExtraCostCombined(const uint32_t* X, const uint32_t* Y,
int length) {
static double ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
int length) {
int i;
double cost = 0.;
for (i = 2; i < length - 2; ++i) {
@ -630,9 +632,9 @@ static double ExtraCostCombined(const uint32_t* X, const uint32_t* Y,
//------------------------------------------------------------------------------
static void HistogramAdd(const VP8LHistogram* const a,
const VP8LHistogram* const b,
VP8LHistogram* const out) {
static void HistogramAdd_C(const VP8LHistogram* const a,
const VP8LHistogram* const b,
VP8LHistogram* const out) {
int i;
const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
assert(a->palette_code_bits_ == b->palette_code_bits_);
@ -869,26 +871,28 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
VP8LDspInit();
#if !WEBP_NEON_OMIT_C_CODE
VP8LSubtractGreenFromBlueAndRed = VP8LSubtractGreenFromBlueAndRed_C;
VP8LTransformColor = VP8LTransformColor_C;
#endif
VP8LCollectColorBlueTransforms = VP8LCollectColorBlueTransforms_C;
VP8LCollectColorRedTransforms = VP8LCollectColorRedTransforms_C;
VP8LFastLog2Slow = FastLog2Slow;
VP8LFastSLog2Slow = FastSLog2Slow;
VP8LFastLog2Slow = FastLog2Slow_C;
VP8LFastSLog2Slow = FastSLog2Slow_C;
VP8LExtraCost = ExtraCost;
VP8LExtraCostCombined = ExtraCostCombined;
VP8LCombinedShannonEntropy = CombinedShannonEntropy;
VP8LExtraCost = ExtraCost_C;
VP8LExtraCostCombined = ExtraCostCombined_C;
VP8LCombinedShannonEntropy = CombinedShannonEntropy_C;
VP8LGetEntropyUnrefined = GetEntropyUnrefined;
VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined;
VP8LGetEntropyUnrefined = GetEntropyUnrefined_C;
VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_C;
VP8LHistogramAdd = HistogramAdd;
VP8LHistogramAdd = HistogramAdd_C;
VP8LVectorMismatch = VectorMismatch;
VP8LVectorMismatch = VectorMismatch_C;
VP8LBundleColorMap = VP8LBundleColorMap_C;
VP8LPredictorsSub[0] = PredictorSub0_C;
@ -937,11 +941,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
#endif
}
#endif
#if defined(WEBP_USE_NEON)
if (VP8GetCPUInfo(kNEON)) {
VP8LEncDspInitNEON();
}
#endif
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
VP8LEncDspInitMIPS32();
@ -958,6 +957,61 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
}
#endif
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8LEncDspInitNEON();
}
#endif
assert(VP8LSubtractGreenFromBlueAndRed != NULL);
assert(VP8LTransformColor != NULL);
assert(VP8LCollectColorBlueTransforms != NULL);
assert(VP8LCollectColorRedTransforms != NULL);
assert(VP8LFastLog2Slow != NULL);
assert(VP8LFastSLog2Slow != NULL);
assert(VP8LExtraCost != NULL);
assert(VP8LExtraCostCombined != NULL);
assert(VP8LCombinedShannonEntropy != NULL);
assert(VP8LGetEntropyUnrefined != NULL);
assert(VP8LGetCombinedEntropyUnrefined != NULL);
assert(VP8LHistogramAdd != NULL);
assert(VP8LVectorMismatch != NULL);
assert(VP8LBundleColorMap != NULL);
assert(VP8LPredictorsSub[0] != NULL);
assert(VP8LPredictorsSub[1] != NULL);
assert(VP8LPredictorsSub[2] != NULL);
assert(VP8LPredictorsSub[3] != NULL);
assert(VP8LPredictorsSub[4] != NULL);
assert(VP8LPredictorsSub[5] != NULL);
assert(VP8LPredictorsSub[6] != NULL);
assert(VP8LPredictorsSub[7] != NULL);
assert(VP8LPredictorsSub[8] != NULL);
assert(VP8LPredictorsSub[9] != NULL);
assert(VP8LPredictorsSub[10] != NULL);
assert(VP8LPredictorsSub[11] != NULL);
assert(VP8LPredictorsSub[12] != NULL);
assert(VP8LPredictorsSub[13] != NULL);
assert(VP8LPredictorsSub[14] != NULL);
assert(VP8LPredictorsSub[15] != NULL);
assert(VP8LPredictorsSub_C[0] != NULL);
assert(VP8LPredictorsSub_C[1] != NULL);
assert(VP8LPredictorsSub_C[2] != NULL);
assert(VP8LPredictorsSub_C[3] != NULL);
assert(VP8LPredictorsSub_C[4] != NULL);
assert(VP8LPredictorsSub_C[5] != NULL);
assert(VP8LPredictorsSub_C[6] != NULL);
assert(VP8LPredictorsSub_C[7] != NULL);
assert(VP8LPredictorsSub_C[8] != NULL);
assert(VP8LPredictorsSub_C[9] != NULL);
assert(VP8LPredictorsSub_C[10] != NULL);
assert(VP8LPredictorsSub_C[11] != NULL);
assert(VP8LPredictorsSub_C[12] != NULL);
assert(VP8LPredictorsSub_C[13] != NULL);
assert(VP8LPredictorsSub_C[14] != NULL);
assert(VP8LPredictorsSub_C[15] != NULL);
lossless_enc_last_cpuinfo_used = VP8GetCPUInfo;
}

@ -12,9 +12,9 @@
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "./dsp.h"
#include "./lossless.h"
#include "./lossless_common.h"
#include "src/dsp/dsp.h"
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
#if defined(WEBP_USE_MIPS32)
@ -23,7 +23,7 @@
#include <stdlib.h>
#include <string.h>
static float FastSLog2Slow(uint32_t v) {
static float FastSLog2Slow_MIPS32(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
uint32_t log_cnt, y, correction;
@ -59,7 +59,7 @@ static float FastSLog2Slow(uint32_t v) {
}
}
static float FastLog2Slow(uint32_t v) {
static float FastLog2Slow_MIPS32(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
uint32_t log_cnt, y;
@ -104,7 +104,7 @@ static float FastLog2Slow(uint32_t v) {
// pop += 2;
// }
// return (double)cost;
static double ExtraCost(const uint32_t* const population, int length) {
static double ExtraCost_MIPS32(const uint32_t* const population, int length) {
int i, temp0, temp1;
const uint32_t* pop = &population[4];
const uint32_t* const LoopEnd = &population[length];
@ -149,8 +149,8 @@ static double ExtraCost(const uint32_t* const population, int length) {
// pY += 2;
// }
// return (double)cost;
static double ExtraCostCombined(const uint32_t* const X,
const uint32_t* const Y, int length) {
static double ExtraCostCombined_MIPS32(const uint32_t* const X,
const uint32_t* const Y, int length) {
int i, temp0, temp1, temp2, temp3;
const uint32_t* pX = &X[4];
const uint32_t* pY = &Y[4];
@ -241,9 +241,9 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
*i_prev = i;
}
static void GetEntropyUnrefined(const uint32_t X[], int length,
VP8LBitEntropy* const bit_entropy,
VP8LStreaks* const stats) {
static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length,
VP8LBitEntropy* const bit_entropy,
VP8LStreaks* const stats) {
int i;
int i_prev = 0;
uint32_t x_prev = X[0];
@ -262,26 +262,27 @@ static void GetEntropyUnrefined(const uint32_t X[], int length,
bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
}
static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[],
int length,
VP8LBitEntropy* const bit_entropy,
VP8LStreaks* const stats) {
static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],
const uint32_t Y[],
int length,
VP8LBitEntropy* const entropy,
VP8LStreaks* const stats) {
int i = 1;
int i_prev = 0;
uint32_t xy_prev = X[0] + Y[0];
memset(stats, 0, sizeof(*stats));
VP8LBitEntropyInit(bit_entropy);
VP8LBitEntropyInit(entropy);
for (i = 1; i < length; ++i) {
const uint32_t xy = X[i] + Y[i];
if (xy != xy_prev) {
GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, bit_entropy, stats);
GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, entropy, stats);
}
}
GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats);
GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, entropy, stats);
bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
entropy->entropy += VP8LFastSLog2(entropy->sum);
}
#define ASM_START \
@ -374,9 +375,9 @@ static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[],
} \
} while (0)
static void HistogramAdd(const VP8LHistogram* const a,
const VP8LHistogram* const b,
VP8LHistogram* const out) {
static void HistogramAdd_MIPS32(const VP8LHistogram* const a,
const VP8LHistogram* const b,
VP8LHistogram* const out) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
const int extra_cache_size = VP8LHistogramNumCodes(a->palette_code_bits_)
- (NUM_LITERAL_CODES + NUM_LENGTH_CODES);
@ -415,13 +416,13 @@ static void HistogramAdd(const VP8LHistogram* const a,
extern void VP8LEncDspInitMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {
VP8LFastSLog2Slow = FastSLog2Slow;
VP8LFastLog2Slow = FastLog2Slow;
VP8LExtraCost = ExtraCost;
VP8LExtraCostCombined = ExtraCostCombined;
VP8LGetEntropyUnrefined = GetEntropyUnrefined;
VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined;
VP8LHistogramAdd = HistogramAdd;
VP8LFastSLog2Slow = FastSLog2Slow_MIPS32;
VP8LFastLog2Slow = FastLog2Slow_MIPS32;
VP8LExtraCost = ExtraCost_MIPS32;
VP8LExtraCostCombined = ExtraCostCombined_MIPS32;
VP8LGetEntropyUnrefined = GetEntropyUnrefined_MIPS32;
VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_MIPS32;
VP8LHistogramAdd = HistogramAdd_MIPS32;
}
#else // !WEBP_USE_MIPS32

@ -12,14 +12,14 @@
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "./lossless.h"
#include "src/dsp/lossless.h"
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data,
int num_pixels) {
static void SubtractGreenFromBlueAndRed_MIPSdspR2(uint32_t* argb_data,
int num_pixels) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
uint32_t* const p_loop1_end = argb_data + (num_pixels & ~3);
uint32_t* const p_loop2_end = p_loop1_end + (num_pixels & 3);
@ -78,8 +78,8 @@ static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
return (uint32_t)((int)(color_pred) * color) >> 5;
}
static void TransformColor(const VP8LMultipliers* const m, uint32_t* data,
int num_pixels) {
static void TransformColor_MIPSdspR2(const VP8LMultipliers* const m,
uint32_t* data, int num_pixels) {
int temp0, temp1, temp2, temp3, temp4, temp5;
uint32_t argb, argb1, new_red, new_red1;
const uint32_t G_to_R = m->green_to_red_;
@ -171,10 +171,13 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
return (new_blue & 0xff);
}
static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
int histo[]) {
static void CollectColorBlueTransforms_MIPSdspR2(const uint32_t* argb,
int stride,
int tile_width,
int tile_height,
int green_to_blue,
int red_to_blue,
int histo[]) {
const int rtb = (red_to_blue << 16) | (red_to_blue & 0xffff);
const int gtb = (green_to_blue << 16) | (green_to_blue & 0xffff);
const uint32_t mask = 0xff00ffu;
@ -222,9 +225,12 @@ static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
return (new_red & 0xff);
}
static void CollectColorRedTransforms(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]) {
static void CollectColorRedTransforms_MIPSdspR2(const uint32_t* argb,
int stride,
int tile_width,
int tile_height,
int green_to_red,
int histo[]) {
const int gtr = (green_to_red << 16) | (green_to_red & 0xffff);
while (tile_height-- > 0) {
int x;
@ -262,10 +268,10 @@ static void CollectColorRedTransforms(const uint32_t* argb, int stride,
extern void VP8LEncDspInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPSdspR2(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
VP8LTransformColor = TransformColor;
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms;
VP8LCollectColorRedTransforms = CollectColorRedTransforms;
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_MIPSdspR2;
VP8LTransformColor = TransformColor_MIPSdspR2;
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_MIPSdspR2;
VP8LCollectColorRedTransforms = CollectColorRedTransforms_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2

@ -11,12 +11,12 @@
//
// Authors: Prashant Patil (Prashant.Patil@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MSA)
#include "./lossless.h"
#include "./msa_macro.h"
#include "src/dsp/lossless.h"
#include "src/dsp/msa_macro.h"
#define TRANSFORM_COLOR_8(src0, src1, dst0, dst1, c0, c1, mask0, mask1) do { \
v8i16 g0, g1, t0, t1, t2, t3; \
@ -48,8 +48,8 @@
dst = VSHF_UB(src, t0, mask1); \
} while (0)
static void TransformColor(const VP8LMultipliers* const m, uint32_t* data,
int num_pixels) {
static void TransformColor_MSA(const VP8LMultipliers* const m, uint32_t* data,
int num_pixels) {
v16u8 src0, dst0;
const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
(m->green_to_red_ << 16));
@ -94,7 +94,8 @@ static void TransformColor(const VP8LMultipliers* const m, uint32_t* data,
}
}
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
static void SubtractGreenFromBlueAndRed_MSA(uint32_t* argb_data,
int num_pixels) {
int i;
uint8_t* ptemp_data = (uint8_t*)argb_data;
v16u8 src0, dst0, tmp0;
@ -136,8 +137,8 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
extern void VP8LEncDspInitMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMSA(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
VP8LTransformColor = TransformColor;
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_MSA;
VP8LTransformColor = TransformColor_MSA;
}
#else // !WEBP_USE_MSA

@ -11,14 +11,14 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_NEON)
#include <arm_neon.h>
#include "./lossless.h"
#include "./neon.h"
#include "src/dsp/lossless.h"
#include "src/dsp/neon.h"
//------------------------------------------------------------------------------
// Subtract-Green Transform
@ -36,8 +36,8 @@ static const uint8_t kGreenShuffle[16] = {
1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
};
static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
const uint8x16_t shuffle) {
static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
const uint8x16_t shuffle) {
return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
vtbl1q_u8(argb, vget_high_u8(shuffle)));
}
@ -45,14 +45,15 @@ static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
// 255 = byte will be zeroed
static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };
static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
const uint8x8_t shuffle) {
static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
const uint8x8_t shuffle) {
return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
vtbl1_u8(vget_high_u8(argb), shuffle));
}
#endif // USE_VTBLQ
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
static void SubtractGreenFromBlueAndRed_NEON(uint32_t* argb_data,
int num_pixels) {
const uint32_t* const end = argb_data + (num_pixels & ~3);
#ifdef USE_VTBLQ
const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
@ -61,7 +62,7 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
#endif
for (; argb_data < end; argb_data += 4) {
const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
const uint8x16_t greens = DoGreenShuffle_NEON(argb, shuffle);
vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
}
// fallthrough and finish off with plain-C
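In the shuffle tables above, index 255 is out of range for vtbl, so those lanes come back as zero: each pixel (b, g, r, a) becomes (g, 0, g, 0), i.e. green replicated into the blue and red lanes, which is then subtracted from the original vector. In scalar form the whole transform is simply "subtract green from red and blue"; its inverse is AddGreenToBlueAndRed further down in this patch. A plain-C sketch (an illustration, not the library's own C fallback):
#include <stdint.h>
/* Subtract the green channel from red and blue; pixels are 0xAARRGGBB words. */
static void SubtractGreenScalar(uint32_t* argb, int num_pixels) {
  int i;
  for (i = 0; i < num_pixels; ++i) {
    const uint32_t px = argb[i];
    const uint32_t green = (px >> 8) & 0xff;
    const uint32_t new_r = (((px >> 16) & 0xff) - green) & 0xff;  /* wraps mod 256 */
    const uint32_t new_b = (((px >>  0) & 0xff) - green) & 0xff;
    argb[i] = (px & 0xff00ff00u) | (new_r << 16) | new_b;
  }
}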
@ -71,8 +72,8 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
//------------------------------------------------------------------------------
// Color Transform
static void TransformColor(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels) {
static void TransformColor_NEON(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels) {
// sign-extended multiplying constants, pre-shifted by 6.
#define CST(X) (((int16_t)(m->X << 8)) >> 6)
const int16_t rb[8] = {
@ -102,7 +103,7 @@ static void TransformColor(const VP8LMultipliers* const m,
for (i = 0; i + 4 <= num_pixels; i += 4) {
const uint8x16_t in = vld1q_u8((uint8_t*)(argb_data + i));
// 0 g 0 g
const uint8x16_t greens = DoGreenShuffle(in, shuffle);
const uint8x16_t greens = DoGreenShuffle_NEON(in, shuffle);
// x dr x db1
const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
// r 0 b 0
@ -132,8 +133,8 @@ static void TransformColor(const VP8LMultipliers* const m,
extern void VP8LEncDspInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitNEON(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
VP8LTransformColor = TransformColor;
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_NEON;
VP8LTransformColor = TransformColor_NEON;
}
#else // !WEBP_USE_NEON

@ -11,22 +11,23 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <emmintrin.h>
#include "./lossless.h"
#include "./common_sse2.h"
#include "./lossless_common.h"
#include "src/dsp/lossless.h"
#include "src/dsp/common_sse2.h"
#include "src/dsp/lossless_common.h"
// For sign-extended multiplying constants, pre-shifted by 5:
#define CST_5b(X) (((int16_t)((uint16_t)X << 8)) >> 5)
#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5)
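The rewritten macro above also adds parentheses around X so it stays well-defined for expression arguments. Numerically, CST_5b() sign-extends the low byte of X and scales it by 8: the << 8 parks the byte in the top half of an int16_t, and the arithmetic >> 5 brings it back down multiplied by 2^3. A few spot checks (illustrative values, not part of the source):
/*   CST_5b(0x01) ==    8   ->  (int16_t)0x0100 >> 5 == 256 / 32    */
/*   CST_5b(0x7f) == 1016   ->  127 * 8                             */
/*   CST_5b(0xff) ==   -8   ->  (int16_t)0xff00 == -256; -256 >> 5  */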
//------------------------------------------------------------------------------
// Subtract-Green Transform
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
static void SubtractGreenFromBlueAndRed_SSE2(uint32_t* argb_data,
int num_pixels) {
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
@ -45,8 +46,8 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
//------------------------------------------------------------------------------
// Color Transform
static void TransformColor(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels) {
static void TransformColor_SSE2(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels) {
const __m128i mults_rb = _mm_set_epi16(
CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
@ -80,10 +81,10 @@ static void TransformColor(const VP8LMultipliers* const m,
//------------------------------------------------------------------------------
#define SPAN 8
static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
int histo[]) {
static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
int histo[]) {
const __m128i mults_r = _mm_set_epi16(
CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0,
CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0);
@ -131,9 +132,9 @@ static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
}
}
static void CollectColorRedTransforms(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]) {
static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]) {
const __m128i mults_g = _mm_set_epi16(
0, CST_5b(green_to_red), 0, CST_5b(green_to_red),
0, CST_5b(green_to_red), 0, CST_5b(green_to_red));
@ -177,8 +178,8 @@ static void CollectColorRedTransforms(const uint32_t* argb, int stride,
//------------------------------------------------------------------------------
#define LINE_SIZE 16 // 8 or 16
static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out,
int size) {
static void AddVector_SSE2(const uint32_t* a, const uint32_t* b, uint32_t* out,
int size) {
int i;
assert(size % LINE_SIZE == 0);
for (i = 0; i < size; i += LINE_SIZE) {
@ -203,7 +204,7 @@ static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out,
}
}
static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) {
static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) {
int i;
assert(size % LINE_SIZE == 0);
for (i = 0; i < size; i += LINE_SIZE) {
@ -231,22 +232,22 @@ static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) {
// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
// that's ok since the histogram values are less than 1<<28 (max picture size).
static void HistogramAdd(const VP8LHistogram* const a,
const VP8LHistogram* const b,
VP8LHistogram* const out) {
static void HistogramAdd_SSE2(const VP8LHistogram* const a,
const VP8LHistogram* const b,
VP8LHistogram* const out) {
int i;
const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
assert(a->palette_code_bits_ == b->palette_code_bits_);
if (b != out) {
AddVector(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES);
AddVector(a->red_, b->red_, out->red_, NUM_LITERAL_CODES);
AddVector(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES);
AddVector(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES);
AddVector_SSE2(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES);
AddVector_SSE2(a->red_, b->red_, out->red_, NUM_LITERAL_CODES);
AddVector_SSE2(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES);
AddVector_SSE2(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES);
} else {
AddVectorEq(a->literal_, out->literal_, NUM_LITERAL_CODES);
AddVectorEq(a->red_, out->red_, NUM_LITERAL_CODES);
AddVectorEq(a->blue_, out->blue_, NUM_LITERAL_CODES);
AddVectorEq(a->alpha_, out->alpha_, NUM_LITERAL_CODES);
AddVectorEq_SSE2(a->literal_, out->literal_, NUM_LITERAL_CODES);
AddVectorEq_SSE2(a->red_, out->red_, NUM_LITERAL_CODES);
AddVectorEq_SSE2(a->blue_, out->blue_, NUM_LITERAL_CODES);
AddVectorEq_SSE2(a->alpha_, out->alpha_, NUM_LITERAL_CODES);
}
for (i = NUM_LITERAL_CODES; i < literal_size; ++i) {
out->literal_[i] = a->literal_[i] + b->literal_[i];
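The "*signed* int32" remark above is backed by the format limits: WebP dimensions are capped at 16383 x 16383, so a single histogram count stays below 2^28 and the sum of two counts below 2^29, comfortably inside a signed 32-bit lane:
max count      <  16383 * 16383  =  268402689  <  2^28
count + count  <  2 * 2^28       =  2^29       <  2^31 - 1   (no signed overflow)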
@ -261,9 +262,9 @@ static void HistogramAdd(const VP8LHistogram* const a,
// Checks whether the X or Y contribution is worth computing and adding.
// Used in loop unrolling.
#define ANALYZE_X_OR_Y(x_or_y, j) \
do { \
if (x_or_y[i + j] != 0) retval -= VP8LFastSLog2(x_or_y[i + j]); \
#define ANALYZE_X_OR_Y(x_or_y, j) \
do { \
if ((x_or_y)[i + (j)] != 0) retval -= VP8LFastSLog2((x_or_y)[i + (j)]); \
} while (0)
// Checks whether the X + Y contribution is worth computing and adding.
@ -276,7 +277,7 @@ static void HistogramAdd(const VP8LHistogram* const a,
} \
} while (0)
static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
int i;
double retval = 0.;
int sumX, sumXY;
@ -332,8 +333,8 @@ static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
//------------------------------------------------------------------------------
static int VectorMismatch(const uint32_t* const array1,
const uint32_t* const array2, int length) {
static int VectorMismatch_SSE2(const uint32_t* const array1,
const uint32_t* const array2, int length) {
int match_len;
if (length >= 12) {
@ -574,8 +575,8 @@ static void PredictorSub10_SSE2(const uint32_t* in, const uint32_t* upper,
}
// Predictor11: select.
static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B,
__m128i* const out) {
static void GetSumAbsDiff32_SSE2(const __m128i* const A, const __m128i* const B,
__m128i* const out) {
// We can unpack with any value on the upper 32 bits, provided it's the same
// on both operands (so that their sum of abs diff is zero). Here we use *A.
const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
@ -596,8 +597,8 @@ static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
__m128i pa, pb;
GetSumAbsDiff32(&T, &TL, &pa); // pa = sum |T-TL|
GetSumAbsDiff32(&L, &TL, &pb); // pb = sum |L-TL|
GetSumAbsDiff32_SSE2(&T, &TL, &pa); // pa = sum |T-TL|
GetSumAbsDiff32_SSE2(&L, &TL, &pb); // pb = sum |L-TL|
{
const __m128i mask = _mm_cmpgt_epi32(pb, pa);
const __m128i A = _mm_and_si128(mask, L);
@ -677,13 +678,13 @@ static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,
extern void VP8LEncDspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
VP8LTransformColor = TransformColor;
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms;
VP8LCollectColorRedTransforms = CollectColorRedTransforms;
VP8LHistogramAdd = HistogramAdd;
VP8LCombinedShannonEntropy = CombinedShannonEntropy;
VP8LVectorMismatch = VectorMismatch;
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE2;
VP8LTransformColor = TransformColor_SSE2;
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE2;
VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE2;
VP8LHistogramAdd = HistogramAdd_SSE2;
VP8LCombinedShannonEntropy = CombinedShannonEntropy_SSE2;
VP8LVectorMismatch = VectorMismatch_SSE2;
VP8LBundleColorMap = BundleColorMap_SSE2;
VP8LPredictorsSub[0] = PredictorSub0_SSE2;

@ -11,17 +11,18 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE41)
#include <assert.h>
#include <smmintrin.h>
#include "./lossless.h"
#include "src/dsp/lossless.h"
//------------------------------------------------------------------------------
// Subtract-Green Transform
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
int num_pixels) {
int i;
const __m128i kCstShuffle = _mm_set_epi8(-1, 13, -1, 13, -1, 9, -1, 9,
-1, 5, -1, 5, -1, 1, -1, 1);
@ -43,7 +44,7 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
extern void VP8LEncDspInitSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41;
}
#else // !WEBP_USE_SSE41

@ -12,12 +12,12 @@
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "./lossless.h"
#include "./lossless_common.h"
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
#define MAP_COLOR_FUNCS(FUNC_NAME, TYPE, GET_INDEX, GET_VALUE) \
static void FUNC_NAME(const TYPE* src, \
@ -86,8 +86,8 @@ static void FUNC_NAME(const TYPE* src, \
} \
}
MAP_COLOR_FUNCS(MapARGB, uint32_t, VP8GetARGBIndex, VP8GetARGBValue)
MAP_COLOR_FUNCS(MapAlpha, uint8_t, VP8GetAlphaIndex, VP8GetAlphaValue)
MAP_COLOR_FUNCS(MapARGB_MIPSdspR2, uint32_t, VP8GetARGBIndex, VP8GetARGBValue)
MAP_COLOR_FUNCS(MapAlpha_MIPSdspR2, uint8_t, VP8GetAlphaIndex, VP8GetAlphaValue)
#undef MAP_COLOR_FUNCS
@ -188,48 +188,52 @@ static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
return Average2(Average2(a0, a1), Average2(a2, a3));
}
static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor5_MIPSdspR2(uint32_t left, const uint32_t* const top) {
return Average3(left, top[0], top[1]);
}
static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor6_MIPSdspR2(uint32_t left, const uint32_t* const top) {
return Average2(left, top[-1]);
}
static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor7_MIPSdspR2(uint32_t left, const uint32_t* const top) {
return Average2(left, top[0]);
}
static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor8_MIPSdspR2(uint32_t left, const uint32_t* const top) {
(void)left;
return Average2(top[-1], top[0]);
}
static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor9_MIPSdspR2(uint32_t left, const uint32_t* const top) {
(void)left;
return Average2(top[0], top[1]);
}
static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor10_MIPSdspR2(uint32_t left,
const uint32_t* const top) {
return Average4(left, top[-1], top[0], top[1]);
}
static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor11_MIPSdspR2(uint32_t left,
const uint32_t* const top) {
return Select(top[0], left, top[-1]);
}
static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor12_MIPSdspR2(uint32_t left,
const uint32_t* const top) {
return ClampedAddSubtractFull(left, top[0], top[-1]);
}
static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor13_MIPSdspR2(uint32_t left,
const uint32_t* const top) {
return ClampedAddSubtractHalf(left, top[0], top[-1]);
}
// Add green to blue and red channels (i.e. perform the inverse transform of
// 'subtract green').
static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
uint32_t* dst) {
static void AddGreenToBlueAndRed_MIPSdspR2(const uint32_t* src, int num_pixels,
uint32_t* dst) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
@ -285,9 +289,9 @@ static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
);
}
static void TransformColorInverse(const VP8LMultipliers* const m,
const uint32_t* src, int num_pixels,
uint32_t* dst) {
static void TransformColorInverse_MIPSdspR2(const VP8LMultipliers* const m,
const uint32_t* src, int num_pixels,
uint32_t* dst) {
int temp0, temp1, temp2, temp3, temp4, temp5;
uint32_t argb, argb1, new_red;
const uint32_t G_to_R = m->green_to_red_;
@ -356,8 +360,8 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
if (num_pixels & 1) VP8LTransformColorInverse_C(m, src, 1, dst);
}
static void ConvertBGRAToRGB(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGB_MIPSdspR2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
int temp0, temp1, temp2, temp3;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
@ -408,8 +412,8 @@ static void ConvertBGRAToRGB(const uint32_t* src,
);
}
static void ConvertBGRAToRGBA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGBA_MIPSdspR2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
int temp0, temp1, temp2, temp3;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
@ -458,8 +462,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
);
}
static void ConvertBGRAToRGBA4444(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGBA4444_MIPSdspR2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
int temp0, temp1, temp2, temp3, temp4, temp5;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
@ -492,7 +496,7 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
"ins %[temp3], %[temp5], 16, 4 \n\t"
"addiu %[src], %[src], 16 \n\t"
"precr.qb.ph %[temp3], %[temp3], %[temp2] \n\t"
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
"usw %[temp1], 0(%[dst]) \n\t"
"usw %[temp3], 4(%[dst]) \n\t"
#else
@ -514,7 +518,7 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
"ins %[temp0], %[temp5], 16, 4 \n\t"
"addiu %[src], %[src], 4 \n\t"
"precr.qb.ph %[temp0], %[temp0], %[temp0] \n\t"
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
"ush %[temp0], 0(%[dst]) \n\t"
#else
"wsbh %[temp0], %[temp0] \n\t"
@ -532,8 +536,8 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
);
}
static void ConvertBGRAToRGB565(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGB565_MIPSdspR2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
int temp0, temp1, temp2, temp3, temp4, temp5;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
@ -570,7 +574,7 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
"ins %[temp2], %[temp3], 0, 5 \n\t"
"addiu %[src], %[src], 16 \n\t"
"append %[temp2], %[temp1], 16 \n\t"
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
"usw %[temp0], 0(%[dst]) \n\t"
"usw %[temp2], 4(%[dst]) \n\t"
#else
@ -592,7 +596,7 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
"ins %[temp4], %[temp5], 0, 11 \n\t"
"addiu %[src], %[src], 4 \n\t"
"ins %[temp4], %[temp0], 0, 5 \n\t"
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
"ush %[temp4], 0(%[dst]) \n\t"
#else
"wsbh %[temp4], %[temp4] \n\t"
@ -610,8 +614,8 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
);
}
static void ConvertBGRAToBGR(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToBGR_MIPSdspR2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
int temp0, temp1, temp2, temp3;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
@ -662,24 +666,27 @@ static void ConvertBGRAToBGR(const uint32_t* src,
extern void VP8LDspInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMIPSdspR2(void) {
VP8LMapColor32b = MapARGB;
VP8LMapColor8b = MapAlpha;
VP8LPredictors[5] = Predictor5;
VP8LPredictors[6] = Predictor6;
VP8LPredictors[7] = Predictor7;
VP8LPredictors[8] = Predictor8;
VP8LPredictors[9] = Predictor9;
VP8LPredictors[10] = Predictor10;
VP8LPredictors[11] = Predictor11;
VP8LPredictors[12] = Predictor12;
VP8LPredictors[13] = Predictor13;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
VP8LTransformColorInverse = TransformColorInverse;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
VP8LMapColor32b = MapARGB_MIPSdspR2;
VP8LMapColor8b = MapAlpha_MIPSdspR2;
VP8LPredictors[5] = Predictor5_MIPSdspR2;
VP8LPredictors[6] = Predictor6_MIPSdspR2;
VP8LPredictors[7] = Predictor7_MIPSdspR2;
VP8LPredictors[8] = Predictor8_MIPSdspR2;
VP8LPredictors[9] = Predictor9_MIPSdspR2;
VP8LPredictors[10] = Predictor10_MIPSdspR2;
VP8LPredictors[11] = Predictor11_MIPSdspR2;
VP8LPredictors[12] = Predictor12_MIPSdspR2;
VP8LPredictors[13] = Predictor13_MIPSdspR2;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_MIPSdspR2;
VP8LTransformColorInverse = TransformColorInverse_MIPSdspR2;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB_MIPSdspR2;
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_MIPSdspR2;
VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_MIPSdspR2;
VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_MIPSdspR2;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2

@ -11,12 +11,12 @@
//
// Author: Prashant Patil (prashant.patil@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MSA)
#include "./lossless.h"
#include "./msa_macro.h"
#include "src/dsp/lossless.h"
#include "src/dsp/msa_macro.h"
//------------------------------------------------------------------------------
// Colorspace conversion functions
@ -43,7 +43,7 @@
#define CONVERT8_BGRA_XXX(psrc, pdst, m0, m1) do { \
uint64_t pix_d; \
v16u8 src0, src1, src2, dst0, dst1; \
v16u8 src0, src1, src2 = { 0 }, dst0, dst1; \
LD_UB2(psrc, 16, src0, src1); \
VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1); \
ST_UB(dst0, pdst); \
@ -109,8 +109,8 @@
dst = VSHF_UB(src, t0, mask1); \
} while (0)
static void ConvertBGRAToRGBA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGBA_MSA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
int i;
const uint8_t* ptemp_src = (const uint8_t*)src;
uint8_t* ptemp_dst = (uint8_t*)dst;
@ -150,8 +150,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
}
}
static void ConvertBGRAToBGR(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToBGR_MSA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint8_t* ptemp_src = (const uint8_t*)src;
uint8_t* ptemp_dst = (uint8_t*)dst;
const v16u8 mask0 = { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,
@ -197,8 +197,8 @@ static void ConvertBGRAToBGR(const uint32_t* src,
}
}
static void ConvertBGRAToRGB(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGB_MSA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint8_t* ptemp_src = (const uint8_t*)src;
uint8_t* ptemp_dst = (uint8_t*)dst;
const v16u8 mask0 = { 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12,
@ -244,8 +244,8 @@ static void ConvertBGRAToRGB(const uint32_t* src,
}
}
static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
uint32_t* dst) {
static void AddGreenToBlueAndRed_MSA(const uint32_t* const src, int num_pixels,
uint32_t* dst) {
int i;
const uint8_t* in = (const uint8_t*)src;
uint8_t* out = (uint8_t*)dst;
@ -286,9 +286,9 @@ static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
}
}
static void TransformColorInverse(const VP8LMultipliers* const m,
const uint32_t* src, int num_pixels,
uint32_t* dst) {
static void TransformColorInverse_MSA(const VP8LMultipliers* const m,
const uint32_t* src, int num_pixels,
uint32_t* dst) {
v16u8 src0, dst0;
const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
(m->green_to_red_ << 16));
@ -341,11 +341,12 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
extern void VP8LDspInitMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMSA(void) {
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
VP8LTransformColorInverse = TransformColorInverse;
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_MSA;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR_MSA;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB_MSA;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_MSA;
VP8LTransformColorInverse = TransformColorInverse_MSA;
}
#else // !WEBP_USE_MSA

@ -11,14 +11,14 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_NEON)
#include <arm_neon.h>
#include "./lossless.h"
#include "./neon.h"
#include "src/dsp/lossless.h"
#include "src/dsp/neon.h"
//------------------------------------------------------------------------------
// Colorspace conversion functions
@ -26,8 +26,8 @@
#if !defined(WORK_AROUND_GCC)
// gcc 4.6.0 had some trouble (NDK-r9) with this code. We only use it for
// gcc-4.8.x at least.
static void ConvertBGRAToRGBA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~15);
for (; src < end; src += 16) {
uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@ -41,8 +41,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst); // left-overs
}
static void ConvertBGRAToBGR(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToBGR_NEON(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~15);
for (; src < end; src += 16) {
const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@ -53,8 +53,8 @@ static void ConvertBGRAToBGR(const uint32_t* src,
VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst); // left-overs
}
static void ConvertBGRAToRGB(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGB_NEON(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~15);
for (; src < end; src += 16) {
const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@ -71,8 +71,8 @@ static void ConvertBGRAToRGB(const uint32_t* src,
static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 };
static void ConvertBGRAToRGBA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~1);
const uint8x8_t shuffle = vld1_u8(kRGBAShuffle);
for (; src < end; src += 2) {
@ -89,8 +89,8 @@ static const uint8_t kBGRShuffle[3][8] = {
{ 21, 22, 24, 25, 26, 28, 29, 30 }
};
static void ConvertBGRAToBGR(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToBGR_NEON(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~7);
const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]);
const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]);
@ -116,8 +116,8 @@ static const uint8_t kRGBShuffle[3][8] = {
{ 21, 20, 26, 25, 24, 30, 29, 28 }
};
static void ConvertBGRAToRGB(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGB_NEON(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~7);
const uint8x8_t shuffle0 = vld1_u8(kRGBShuffle[0]);
const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]);
@ -139,7 +139,6 @@ static void ConvertBGRAToRGB(const uint32_t* src,
#endif // !WORK_AROUND_GCC
//------------------------------------------------------------------------------
// Predictor Transform
@ -506,8 +505,8 @@ static const uint8_t kGreenShuffle[16] = {
1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
};
static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
const uint8x16_t shuffle) {
static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
const uint8x16_t shuffle) {
return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
vtbl1q_u8(argb, vget_high_u8(shuffle)));
}
@ -515,15 +514,15 @@ static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
// 255 = byte will be zeroed
static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };
static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
const uint8x8_t shuffle) {
static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
const uint8x8_t shuffle) {
return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
vtbl1_u8(vget_high_u8(argb), shuffle));
}
#endif // USE_VTBLQ
static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
uint32_t* dst) {
static void AddGreenToBlueAndRed_NEON(const uint32_t* src, int num_pixels,
uint32_t* dst) {
const uint32_t* const end = src + (num_pixels & ~3);
#ifdef USE_VTBLQ
const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
@ -532,7 +531,7 @@ static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
#endif
for (; src < end; src += 4, dst += 4) {
const uint8x16_t argb = vld1q_u8((const uint8_t*)src);
const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
const uint8x16_t greens = DoGreenShuffle_NEON(argb, shuffle);
vst1q_u8((uint8_t*)dst, vaddq_u8(argb, greens));
}
// fallthrough and finish off with plain-C
@ -542,9 +541,9 @@ static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
//------------------------------------------------------------------------------
// Color Transform
static void TransformColorInverse(const VP8LMultipliers* const m,
const uint32_t* const src, int num_pixels,
uint32_t* dst) {
static void TransformColorInverse_NEON(const VP8LMultipliers* const m,
const uint32_t* const src,
int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 6.
#define CST(X) (((int16_t)(m->X << 8)) >> 6)
const int16_t rb[8] = {
@ -575,7 +574,7 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
const uint8x16_t in = vld1q_u8((const uint8_t*)(src + i));
const uint32x4_t a0g0 = vandq_u32(vreinterpretq_u32_u8(in), mask_ag);
// 0 g 0 g
const uint8x16_t greens = DoGreenShuffle(in, shuffle);
const uint8x16_t greens = DoGreenShuffle_NEON(in, shuffle);
// x dr x db1
const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
// x r' x b'
@ -627,12 +626,12 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitNEON(void) {
VP8LPredictorsAdd[12] = PredictorAdd12_NEON;
VP8LPredictorsAdd[13] = PredictorAdd13_NEON;
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_NEON;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR_NEON;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB_NEON;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
VP8LTransformColorInverse = TransformColorInverse;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_NEON;
VP8LTransformColorInverse = TransformColorInverse_NEON;
}
#else // !WEBP_USE_NEON

@ -11,21 +11,22 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
#include "./common_sse2.h"
#include "./lossless.h"
#include "./lossless_common.h"
#include "src/dsp/common_sse2.h"
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
#include <assert.h>
#include <emmintrin.h>
//------------------------------------------------------------------------------
// Predictor Transform
static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
uint32_t c2) {
static WEBP_INLINE uint32_t ClampedAddSubtractFull_SSE2(uint32_t c0,
uint32_t c1,
uint32_t c2) {
const __m128i zero = _mm_setzero_si128();
const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
@ -37,8 +38,9 @@ static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
return output;
}
static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
uint32_t c2) {
static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0,
uint32_t c1,
uint32_t c2) {
const __m128i zero = _mm_setzero_si128();
const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
@ -55,7 +57,7 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
return output;
}
static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) {
int pa_minus_pb;
const __m128i zero = _mm_setzero_si128();
const __m128i A0 = _mm_cvtsi32_si128(a);
@ -88,8 +90,9 @@ static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
*avg = _mm_sub_epi8(avg1, one);
}
static WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1,
__m128i* const avg) {
static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0,
const uint32_t a1,
__m128i* const avg) {
// (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
const __m128i ones = _mm_set1_epi8(1);
const __m128i A0 = _mm_cvtsi32_si128(a0);
@ -99,7 +102,7 @@ static WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1,
*avg = _mm_sub_epi8(avg1, one);
}
static WEBP_INLINE __m128i Average2_uint32_16(uint32_t a0, uint32_t a1) {
static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) {
const __m128i zero = _mm_setzero_si128();
const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero);
const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
@ -107,15 +110,16 @@ static WEBP_INLINE __m128i Average2_uint32_16(uint32_t a0, uint32_t a1) {
return _mm_srli_epi16(sum, 1);
}
static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
static WEBP_INLINE uint32_t Average2_SSE2(uint32_t a0, uint32_t a1) {
__m128i output;
Average2_uint32(a0, a1, &output);
Average2_uint32_SSE2(a0, a1, &output);
return _mm_cvtsi128_si32(output);
}
static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
static WEBP_INLINE uint32_t Average3_SSE2(uint32_t a0, uint32_t a1,
uint32_t a2) {
const __m128i zero = _mm_setzero_si128();
const __m128i avg1 = Average2_uint32_16(a0, a2);
const __m128i avg1 = Average2_uint32_16_SSE2(a0, a2);
const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
const __m128i sum = _mm_add_epi16(avg1, A1);
const __m128i avg2 = _mm_srli_epi16(sum, 1);
@ -124,10 +128,10 @@ static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
return output;
}
static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
uint32_t a2, uint32_t a3) {
const __m128i avg1 = Average2_uint32_16(a0, a1);
const __m128i avg2 = Average2_uint32_16(a2, a3);
static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
uint32_t a2, uint32_t a3) {
const __m128i avg1 = Average2_uint32_16_SSE2(a0, a1);
const __m128i avg2 = Average2_uint32_16_SSE2(a2, a3);
const __m128i sum = _mm_add_epi16(avg2, avg1);
const __m128i avg3 = _mm_srli_epi16(sum, 1);
const __m128i A0 = _mm_packus_epi16(avg3, avg3);
@ -136,41 +140,41 @@ static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
}
static uint32_t Predictor5_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average3(left, top[0], top[1]);
const uint32_t pred = Average3_SSE2(left, top[0], top[1]);
return pred;
}
static uint32_t Predictor6_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(left, top[-1]);
const uint32_t pred = Average2_SSE2(left, top[-1]);
return pred;
}
static uint32_t Predictor7_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(left, top[0]);
const uint32_t pred = Average2_SSE2(left, top[0]);
return pred;
}
static uint32_t Predictor8_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(top[-1], top[0]);
const uint32_t pred = Average2_SSE2(top[-1], top[0]);
(void)left;
return pred;
}
static uint32_t Predictor9_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(top[0], top[1]);
const uint32_t pred = Average2_SSE2(top[0], top[1]);
(void)left;
return pred;
}
static uint32_t Predictor10_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
const uint32_t pred = Average4_SSE2(left, top[-1], top[0], top[1]);
return pred;
}
static uint32_t Predictor11_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Select(top[0], left, top[-1]);
const uint32_t pred = Select_SSE2(top[0], left, top[-1]);
return pred;
}
static uint32_t Predictor12_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
const uint32_t pred = ClampedAddSubtractFull_SSE2(left, top[0], top[-1]);
return pred;
}
static uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
const uint32_t pred = ClampedAddSubtractHalf_SSE2(left, top[0], top[-1]);
return pred;
}
@ -272,9 +276,24 @@ GENERATE_PREDICTOR_2(9, upper[i + 1])
#undef GENERATE_PREDICTOR_2
// Predictor10: average of (average of (L,TL), average of (T, TR)).
#define DO_PRED10(OUT) do { \
__m128i avgLTL, avg; \
Average2_m128i(&L, &TL, &avgLTL); \
Average2_m128i(&avgTTR, &avgLTL, &avg); \
L = _mm_add_epi8(avg, src); \
out[i + (OUT)] = _mm_cvtsi128_si32(L); \
} while (0)
#define DO_PRED10_SHIFT do { \
/* Rotate the pre-computed values for the next iteration.*/ \
avgTTR = _mm_srli_si128(avgTTR, 4); \
TL = _mm_srli_si128(TL, 4); \
src = _mm_srli_si128(src, 4); \
} while (0)
static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i, j;
int i;
__m128i L = _mm_cvtsi32_si128(out[-1]);
for (i = 0; i + 4 <= num_pixels; i += 4) {
__m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
@ -283,79 +302,90 @@ static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
__m128i avgTTR;
Average2_m128i(&T, &TR, &avgTTR);
for (j = 0; j < 4; ++j) {
__m128i avgLTL, avg;
Average2_m128i(&L, &TL, &avgLTL);
Average2_m128i(&avgTTR, &avgLTL, &avg);
L = _mm_add_epi8(avg, src);
out[i + j] = _mm_cvtsi128_si32(L);
// Rotate the pre-computed values for the next iteration.
avgTTR = _mm_srli_si128(avgTTR, 4);
TL = _mm_srli_si128(TL, 4);
src = _mm_srli_si128(src, 4);
}
DO_PRED10(0);
DO_PRED10_SHIFT;
DO_PRED10(1);
DO_PRED10_SHIFT;
DO_PRED10(2);
DO_PRED10_SHIFT;
DO_PRED10(3);
}
if (i != num_pixels) {
VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
}
}
#undef DO_PRED10
#undef DO_PRED10_SHIFT
// Predictor11: select.
static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B,
__m128i* const out) {
// We can unpack with any value on the upper 32 bits, provided it's the same
// on both operands (to that their sum of abs diff is zero). Here we use *A.
const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
const __m128i B_lo = _mm_unpacklo_epi32(*B, *A);
const __m128i A_hi = _mm_unpackhi_epi32(*A, *A);
const __m128i B_hi = _mm_unpackhi_epi32(*B, *A);
const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo);
const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi);
*out = _mm_packs_epi32(s_lo, s_hi);
}
#define DO_PRED11(OUT) do { \
const __m128i L_lo = _mm_unpacklo_epi32(L, T); \
const __m128i TL_lo = _mm_unpacklo_epi32(TL, T); \
const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); /* pb = sum |L-TL|*/ \
const __m128i mask = _mm_cmpgt_epi32(pb, pa); \
const __m128i A = _mm_and_si128(mask, L); \
const __m128i B = _mm_andnot_si128(mask, T); \
const __m128i pred = _mm_or_si128(A, B); /* pred = (pb > pa)? L : T*/ \
L = _mm_add_epi8(src, pred); \
out[i + (OUT)] = _mm_cvtsi128_si32(L); \
} while (0)
#define DO_PRED11_SHIFT do { \
/* Shift the pre-computed value for the next iteration.*/ \
T = _mm_srli_si128(T, 4); \
TL = _mm_srli_si128(TL, 4); \
src = _mm_srli_si128(src, 4); \
pa = _mm_srli_si128(pa, 4); \
} while (0)
static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i, j;
int i;
__m128i pa;
__m128i L = _mm_cvtsi32_si128(out[-1]);
for (i = 0; i + 4 <= num_pixels; i += 4) {
__m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
__m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
__m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
__m128i pa;
GetSumAbsDiff32(&T, &TL, &pa); // pa = sum |T-TL|
for (j = 0; j < 4; ++j) {
const __m128i L_lo = _mm_unpacklo_epi32(L, L);
const __m128i TL_lo = _mm_unpacklo_epi32(TL, L);
const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); // pb = sum |L-TL|
const __m128i mask = _mm_cmpgt_epi32(pb, pa);
const __m128i A = _mm_and_si128(mask, L);
const __m128i B = _mm_andnot_si128(mask, T);
const __m128i pred = _mm_or_si128(A, B); // pred = (L > T)? L : T
L = _mm_add_epi8(src, pred);
out[i + j] = _mm_cvtsi128_si32(L);
// Shift the pre-computed value for the next iteration.
T = _mm_srli_si128(T, 4);
TL = _mm_srli_si128(TL, 4);
src = _mm_srli_si128(src, 4);
pa = _mm_srli_si128(pa, 4);
{
// We can unpack with any value on the upper 32 bits, provided it's the
// same on both operands (so that their sum of abs diff is zero). Here we
// use T.
const __m128i T_lo = _mm_unpacklo_epi32(T, T);
const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);
const __m128i T_hi = _mm_unpackhi_epi32(T, T);
const __m128i TL_hi = _mm_unpackhi_epi32(TL, T);
const __m128i s_lo = _mm_sad_epu8(T_lo, TL_lo);
const __m128i s_hi = _mm_sad_epu8(T_hi, TL_hi);
pa = _mm_packs_epi32(s_lo, s_hi); // pa = sum |T-TL|
}
DO_PRED11(0);
DO_PRED11_SHIFT;
DO_PRED11(1);
DO_PRED11_SHIFT;
DO_PRED11(2);
DO_PRED11_SHIFT;
DO_PRED11(3);
}
if (i != num_pixels) {
VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
}
}
#undef DO_PRED11
#undef DO_PRED11_SHIFT
// Predictor12: ClampedAddSubtractFull.
#define DO_PRED12(DIFF, LANE, OUT) \
do { \
const __m128i all = _mm_add_epi16(L, (DIFF)); \
const __m128i alls = _mm_packus_epi16(all, all); \
const __m128i res = _mm_add_epi8(src, alls); \
out[i + (OUT)] = _mm_cvtsi128_si32(res); \
L = _mm_unpacklo_epi8(res, zero); \
#define DO_PRED12(DIFF, LANE, OUT) do { \
const __m128i all = _mm_add_epi16(L, (DIFF)); \
const __m128i alls = _mm_packus_epi16(all, all); \
const __m128i res = _mm_add_epi8(src, alls); \
out[i + (OUT)] = _mm_cvtsi128_si32(res); \
L = _mm_unpacklo_epi8(res, zero); \
} while (0)
#define DO_PRED12_SHIFT(DIFF, LANE) do { \
/* Shift the pre-computed value for the next iteration.*/ \
if (LANE == 0) (DIFF) = _mm_srli_si128((DIFF), 8); \
if ((LANE) == 0) (DIFF) = _mm_srli_si128((DIFF), 8); \
src = _mm_srli_si128(src, 4); \
} while (0)
@ -377,8 +407,11 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
__m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
__m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
DO_PRED12(diff_lo, 0, 0);
DO_PRED12_SHIFT(diff_lo, 0);
DO_PRED12(diff_lo, 1, 1);
DO_PRED12_SHIFT(diff_lo, 1);
DO_PRED12(diff_hi, 0, 2);
DO_PRED12_SHIFT(diff_hi, 0);
DO_PRED12(diff_hi, 1, 3);
}
if (i != num_pixels) {
@ -386,6 +419,7 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
}
}
#undef DO_PRED12
#undef DO_PRED12_SHIFT
// Due to averages with integers, values cannot be accumulated in parallel for
// predictors 13.
@ -394,8 +428,8 @@ GENERATE_PREDICTOR_ADD(Predictor13_SSE2, PredictorAdd13_SSE2)
//------------------------------------------------------------------------------
// Subtract-Green Transform
static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
uint32_t* dst) {
static void AddGreenToBlueAndRed_SSE2(const uint32_t* const src, int num_pixels,
uint32_t* dst) {
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
@ -414,9 +448,9 @@ static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
//------------------------------------------------------------------------------
// Color Transform
static void TransformColorInverse(const VP8LMultipliers* const m,
const uint32_t* const src, int num_pixels,
uint32_t* dst) {
static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
const uint32_t* const src,
int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend
const __m128i mults_rb = _mm_set_epi16(
@ -454,8 +488,8 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
//------------------------------------------------------------------------------
// Color-space conversion functions
static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels,
uint8_t* dst) {
static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,
uint8_t* dst) {
const __m128i* in = (const __m128i*)src;
__m128i* out = (__m128i*)dst;
@ -490,27 +524,26 @@ static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels,
}
}
static void ConvertBGRAToRGBA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
const __m128i* in = (const __m128i*)src;
__m128i* out = (__m128i*)dst;
while (num_pixels >= 8) {
const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3
const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7
const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4...
const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6...
const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6...
const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7...
const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7
const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7
const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7
const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7
const __m128i rg0 = _mm_unpacklo_epi8(rb0, ga0); // r0g0r1g1 ... r6g6r7g7
const __m128i ba0 = _mm_unpackhi_epi8(rb0, ga0); // b0a0b1a1 ... b6a6b7a7
const __m128i rgba0 = _mm_unpacklo_epi16(rg0, ba0); // rgba0|rgba1...
const __m128i rgba4 = _mm_unpackhi_epi16(rg0, ba0); // rgba4|rgba5...
_mm_storeu_si128(out++, rgba0);
_mm_storeu_si128(out++, rgba4);
const __m128i A1 = _mm_loadu_si128(in++);
const __m128i A2 = _mm_loadu_si128(in++);
const __m128i B1 = _mm_and_si128(A1, red_blue_mask); // R 0 B 0
const __m128i B2 = _mm_and_si128(A2, red_blue_mask); // R 0 B 0
const __m128i C1 = _mm_andnot_si128(red_blue_mask, A1); // 0 G 0 A
const __m128i C2 = _mm_andnot_si128(red_blue_mask, A2); // 0 G 0 A
const __m128i D1 = _mm_shufflelo_epi16(B1, _MM_SHUFFLE(2, 3, 0, 1));
const __m128i D2 = _mm_shufflelo_epi16(B2, _MM_SHUFFLE(2, 3, 0, 1));
const __m128i E1 = _mm_shufflehi_epi16(D1, _MM_SHUFFLE(2, 3, 0, 1));
const __m128i E2 = _mm_shufflehi_epi16(D2, _MM_SHUFFLE(2, 3, 0, 1));
const __m128i F1 = _mm_or_si128(E1, C1);
const __m128i F2 = _mm_or_si128(E2, C2);
_mm_storeu_si128(out++, F1);
_mm_storeu_si128(out++, F2);
num_pixels -= 8;
}
// left-overs
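The rewritten loop above drops the old byte-transpose sequence: BGRA -> RGBA only swaps the red and blue bytes of every pixel, so the new code isolates those bytes with red_blue_mask, exchanges them inside each 32-bit lane via the paired 16-bit shuffles, and ORs the untouched green/alpha bytes back in. Per pixel (internal 0xAARRGGBB word to an 0xAABBGGRR word) the operation is, in scalar form:
#include <stdint.h>
static uint32_t BGRAToRGBA1(uint32_t px) {   /* illustrative helper, not a libwebp function */
  return (px & 0xff00ff00u)         |        /* green and alpha stay put  */
         ((px >> 16) & 0x000000ffu) |        /* red moves down to byte 0  */
         ((px & 0x000000ffu) << 16);         /* blue moves up to byte 2   */
}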
@ -519,8 +552,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
}
}
static void ConvertBGRAToRGBA4444(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
const __m128i* in = (const __m128i*)src;
@ -541,7 +574,7 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f); // g0-|g1-|...|a6-|a7-
const __m128i rgba0 = _mm_or_si128(ga2, rb1); // rg0..rg7 | ba0..ba7
const __m128i rgba1 = _mm_srli_si128(rgba0, 8); // ba0..ba7 | 0
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0); // barg0...barg7
#else
const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1); // rgba0...rgba7
@ -555,8 +588,8 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
}
}
static void ConvertBGRAToRGB565(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGB565_SSE2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const __m128i mask_0xe0 = _mm_set1_epi8(0xe0);
const __m128i mask_0xf8 = _mm_set1_epi8(0xf8);
const __m128i mask_0x07 = _mm_set1_epi8(0x07);
@ -582,7 +615,7 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
const __m128i rg1 = _mm_or_si128(rb1, g_lo2); // gr0...gr7|xx
const __m128i b1 = _mm_srli_epi16(b0, 3);
const __m128i gb1 = _mm_or_si128(b1, g_hi2); // bg0...bg7|xx
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1); // rggb0...rggb7
#else
const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1); // bgrb0...bgrb7
@ -596,8 +629,8 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
}
}
static void ConvertBGRAToBGR(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToBGR_SSE2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
const __m128i* in = (const __m128i*)src;
@ -660,14 +693,14 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) {
VP8LPredictorsAdd[12] = PredictorAdd12_SSE2;
VP8LPredictorsAdd[13] = PredictorAdd13_SSE2;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
VP8LTransformColorInverse = TransformColorInverse;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_SSE2;
VP8LTransformColorInverse = TransformColorInverse_SSE2;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE2;
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_SSE2;
VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_SSE2;
VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_SSE2;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE2;
}
#else // !WEBP_USE_SSE2

@ -22,6 +22,7 @@
#endif
#ifdef CLANG_BUILD
#define ALPHAVAL (-1)
#define ADDVI_H(a, b) __msa_addvi_h((v8i16)a, b)
#define ADDVI_W(a, b) __msa_addvi_w((v4i32)a, b)
#define SRAI_B(a, b) __msa_srai_b((v16i8)a, b)
@ -32,6 +33,7 @@
#define ANDI_B(a, b) __msa_andi_b((v16u8)a, b)
#define ORI_B(a, b) __msa_ori_b((v16u8)a, b)
#else
#define ALPHAVAL (0xff)
#define ADDVI_H(a, b) (a + b)
#define ADDVI_W(a, b) (a + b)
#define SRAI_B(a, b) (a >> b)

@ -14,11 +14,12 @@
#include <arm_neon.h>
#include "./dsp.h"
#include "src/dsp/dsp.h"
// Right now, some intrinsics functions seem slower, so we disable them
// everywhere except aarch64 where the inline assembly is incompatible.
#if defined(__aarch64__)
// everywhere except newer clang/gcc or aarch64 where the inline assembly is
// incompatible.
#if LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,9) || defined(__aarch64__)
#define WEBP_USE_INTRINSICS // use intrinsics when possible
#endif
@ -43,11 +44,11 @@
// if using intrinsics, this flag avoids some functions that make gcc-4.6.3
// crash ("internal compiler error: in immed_double_const, at emit-rtl.").
// (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
#if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
#define WORK_AROUND_GCC
#endif
static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
static WEBP_INLINE int32x4x4_t Transpose4x4_NEON(const int32x4x4_t rows) {
uint64x2x2_t row01, row23;
row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);

@ -13,8 +13,8 @@
#include <assert.h>
#include "./dsp.h"
#include "../utils/rescaler_utils.h"
#include "src/dsp/dsp.h"
#include "src/utils/rescaler_utils.h"
//------------------------------------------------------------------------------
// Implementations of critical functions ImportRow / ExportRow
@ -25,7 +25,8 @@
//------------------------------------------------------------------------------
// Row import
void WebPRescalerImportRowExpandC(WebPRescaler* const wrk, const uint8_t* src) {
void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
const uint8_t* src) {
const int x_stride = wrk->num_channels;
const int x_out_max = wrk->dst_width * wrk->num_channels;
int channel;
@ -56,7 +57,8 @@ void WebPRescalerImportRowExpandC(WebPRescaler* const wrk, const uint8_t* src) {
}
}
void WebPRescalerImportRowShrinkC(WebPRescaler* const wrk, const uint8_t* src) {
void WebPRescalerImportRowShrink_C(WebPRescaler* const wrk,
const uint8_t* src) {
const int x_stride = wrk->num_channels;
const int x_out_max = wrk->dst_width * wrk->num_channels;
int channel;
@ -92,7 +94,7 @@ void WebPRescalerImportRowShrinkC(WebPRescaler* const wrk, const uint8_t* src) {
//------------------------------------------------------------------------------
// Row export
void WebPRescalerExportRowExpandC(WebPRescaler* const wrk) {
void WebPRescalerExportRowExpand_C(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
@ -123,7 +125,7 @@ void WebPRescalerExportRowExpandC(WebPRescaler* const wrk) {
}
}
void WebPRescalerExportRowShrinkC(WebPRescaler* const wrk) {
void WebPRescalerExportRowShrink_C(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
@ -207,11 +209,14 @@ static volatile VP8CPUInfo rescaler_last_cpuinfo_used =
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
if (rescaler_last_cpuinfo_used == VP8GetCPUInfo) return;
#if !defined(WEBP_REDUCE_SIZE)
#if !WEBP_NEON_OMIT_C_CODE
WebPRescalerExportRowExpand = WebPRescalerExportRowExpand_C;
WebPRescalerExportRowShrink = WebPRescalerExportRowShrink_C;
#endif
WebPRescalerImportRowExpand = WebPRescalerImportRowExpandC;
WebPRescalerImportRowShrink = WebPRescalerImportRowShrinkC;
WebPRescalerExportRowExpand = WebPRescalerExportRowExpandC;
WebPRescalerExportRowShrink = WebPRescalerExportRowShrinkC;
WebPRescalerImportRowExpand = WebPRescalerImportRowExpand_C;
WebPRescalerImportRowShrink = WebPRescalerImportRowShrink_C;
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
@ -219,11 +224,6 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
WebPRescalerDspInitSSE2();
}
#endif
#if defined(WEBP_USE_NEON)
if (VP8GetCPUInfo(kNEON)) {
WebPRescalerDspInitNEON();
}
#endif
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
WebPRescalerDspInitMIPS32();
@ -240,5 +240,18 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
}
#endif
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPRescalerDspInitNEON();
}
#endif
assert(WebPRescalerExportRowExpand != NULL);
assert(WebPRescalerExportRowShrink != NULL);
assert(WebPRescalerImportRowExpand != NULL);
assert(WebPRescalerImportRowShrink != NULL);
#endif // WEBP_REDUCE_SIZE
rescaler_last_cpuinfo_used = VP8GetCPUInfo;
}

@ -11,17 +11,18 @@
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS32)
#if defined(WEBP_USE_MIPS32) && !defined(WEBP_REDUCE_SIZE)
#include <assert.h>
#include "../utils/rescaler_utils.h"
#include "src/utils/rescaler_utils.h"
//------------------------------------------------------------------------------
// Row import
static void ImportRowShrink(WebPRescaler* const wrk, const uint8_t* src) {
static void ImportRowShrink_MIPS32(WebPRescaler* const wrk,
const uint8_t* src) {
const int x_stride = wrk->num_channels;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const int fx_scale = wrk->fx_scale;
@ -80,7 +81,8 @@ static void ImportRowShrink(WebPRescaler* const wrk, const uint8_t* src) {
}
}
static void ImportRowExpand(WebPRescaler* const wrk, const uint8_t* src) {
static void ImportRowExpand_MIPS32(WebPRescaler* const wrk,
const uint8_t* src) {
const int x_stride = wrk->num_channels;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const int x_add = wrk->x_add;
@ -144,7 +146,7 @@ static void ImportRowExpand(WebPRescaler* const wrk, const uint8_t* src) {
//------------------------------------------------------------------------------
// Row export
static void ExportRowExpand(WebPRescaler* const wrk) {
static void ExportRowExpand_MIPS32(WebPRescaler* const wrk) {
uint8_t* dst = wrk->dst;
rescaler_t* irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
@ -207,7 +209,7 @@ static void ExportRowExpand(WebPRescaler* const wrk) {
}
}
static void ExportRowShrink(WebPRescaler* const wrk) {
static void ExportRowShrink_MIPS32(WebPRescaler* const wrk) {
const int x_out_max = wrk->dst_width * wrk->num_channels;
uint8_t* dst = wrk->dst;
rescaler_t* irow = wrk->irow;
@ -278,10 +280,10 @@ static void ExportRowShrink(WebPRescaler* const wrk) {
extern void WebPRescalerDspInitMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPS32(void) {
WebPRescalerImportRowExpand = ImportRowExpand;
WebPRescalerImportRowShrink = ImportRowShrink;
WebPRescalerExportRowExpand = ExportRowExpand;
WebPRescalerExportRowShrink = ExportRowShrink;
WebPRescalerImportRowExpand = ImportRowExpand_MIPS32;
WebPRescalerImportRowShrink = ImportRowShrink_MIPS32;
WebPRescalerExportRowExpand = ExportRowExpand_MIPS32;
WebPRescalerExportRowShrink = ExportRowShrink_MIPS32;
}
#else // !WEBP_USE_MIPS32

@ -11,12 +11,12 @@
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#if defined(WEBP_USE_MIPS_DSP_R2) && !defined(WEBP_REDUCE_SIZE)
#include <assert.h>
#include "../utils/rescaler_utils.h"
#include "src/utils/rescaler_utils.h"
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
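
MULT_FIX is the rescaler's round-to-nearest fixed-point multiply: y is a scale expressed with WEBP_RESCALER_RFIX fractional bits and ROUNDER adds half a unit before the shift. A toy worked example with 8 fractional bits (illustrative values only; the real RFIX constant lives in rescaler_utils.h):
//   with RFIX = 8: ONE = 256, ROUNDER = 128, and a scale of 1.5 is encoded
//   as 384, so MULT_FIX(100, 384) = (100 * 384 + 128) >> 8 = 38528 >> 8 = 150,
//   i.e. 100 * 1.5 rounded to the nearest integer.
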
@ -24,7 +24,7 @@
//------------------------------------------------------------------------------
// Row export
static void ExportRowShrink(WebPRescaler* const wrk) {
static void ExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
int i;
const int x_out_max = wrk->dst_width * wrk->num_channels;
uint8_t* dst = wrk->dst;
@ -162,7 +162,7 @@ static void ExportRowShrink(WebPRescaler* const wrk) {
}
}
static void ExportRowExpand(WebPRescaler* const wrk) {
static void ExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
int i;
uint8_t* dst = wrk->dst;
rescaler_t* irow = wrk->irow;
@ -303,8 +303,8 @@ static void ExportRowExpand(WebPRescaler* const wrk) {
extern void WebPRescalerDspInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPSdspR2(void) {
WebPRescalerExportRowExpand = ExportRowExpand;
WebPRescalerExportRowShrink = ExportRowShrink;
WebPRescalerExportRowExpand = ExportRowExpand_MIPSdspR2;
WebPRescalerExportRowShrink = ExportRowShrink_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2

@ -11,14 +11,14 @@
//
// Author: Prashant Patil (prashant.patil@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MSA)
#if defined(WEBP_USE_MSA) && !defined(WEBP_REDUCE_SIZE)
#include <assert.h>
#include "../utils/rescaler_utils.h"
#include "./msa_macro.h"
#include "src/utils/rescaler_utils.h"
#include "src/dsp/msa_macro.h"
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
@ -246,7 +246,7 @@ static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
}
}
static void RescalerExportRowExpand(WebPRescaler* const wrk) {
static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
uint8_t* dst = wrk->dst;
rescaler_t* irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
@ -411,7 +411,7 @@ static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
}
}
static void RescalerExportRowShrink(WebPRescaler* const wrk) {
static void RescalerExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
uint8_t* dst = wrk->dst;
rescaler_t* irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
@ -433,8 +433,8 @@ static void RescalerExportRowShrink(WebPRescaler* const wrk) {
extern void WebPRescalerDspInitMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
WebPRescalerExportRowExpand = RescalerExportRowExpand;
WebPRescalerExportRowShrink = RescalerExportRowShrink;
WebPRescalerExportRowExpand = RescalerExportRowExpand_MIPSdspR2;
WebPRescalerExportRowShrink = RescalerExportRowShrink_MIPSdspR2;
}
#else // !WEBP_USE_MSA

@ -11,14 +11,14 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_NEON)
#if defined(WEBP_USE_NEON) && !defined(WEBP_REDUCE_SIZE)
#include <arm_neon.h>
#include <assert.h>
#include "./neon.h"
#include "../utils/rescaler_utils.h"
#include "src/dsp/neon.h"
#include "src/utils/rescaler_utils.h"
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX_C(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
@ -41,9 +41,9 @@
#error "MULT_FIX/WEBP_RESCALER_RFIX need some more work"
#endif
static uint32x4_t Interpolate(const rescaler_t* const frow,
const rescaler_t* const irow,
uint32_t A, uint32_t B) {
static uint32x4_t Interpolate_NEON(const rescaler_t* const frow,
const rescaler_t* const irow,
uint32_t A, uint32_t B) {
LOAD_32x4(frow, A0);
LOAD_32x4(irow, B0);
const uint64x2_t C0 = vmull_n_u32(vget_low_u32(A0), A);
@ -56,7 +56,7 @@ static uint32x4_t Interpolate(const rescaler_t* const frow,
return E;
}
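
A scalar reading of Interpolate_NEON, assuming the trailing lines of the function (cut by this hunk) complete the same vertical blend as the C rescaler:
// Illustrative sketch, not the upstream code: per-lane equivalent of the
// widening multiply-accumulate above. B is the fractional y position and
// A = WEBP_RESCALER_ONE - B (see the caller just below).
static uint32_t InterpolateScalar(uint32_t frow_v, uint32_t irow_v,
                                  uint32_t A, uint32_t B) {
  return (uint32_t)(((uint64_t)A * frow_v + (uint64_t)B * irow_v) >>
                    WEBP_RESCALER_RFIX);
}
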
static void RescalerExportRowExpand(WebPRescaler* const wrk) {
static void RescalerExportRowExpand_NEON(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
@ -91,9 +91,9 @@ static void RescalerExportRowExpand(WebPRescaler* const wrk) {
const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
for (x_out = 0; x_out < max_span; x_out += 8) {
const uint32x4_t C0 =
Interpolate(frow + x_out + 0, irow + x_out + 0, A, B);
Interpolate_NEON(frow + x_out + 0, irow + x_out + 0, A, B);
const uint32x4_t C1 =
Interpolate(frow + x_out + 4, irow + x_out + 4, A, B);
Interpolate_NEON(frow + x_out + 4, irow + x_out + 4, A, B);
const uint32x4_t D0 = MULT_FIX(C0, fy_scale_half);
const uint32x4_t D1 = MULT_FIX(C1, fy_scale_half);
const uint16x4_t E0 = vmovn_u32(D0);
@ -112,7 +112,7 @@ static void RescalerExportRowExpand(WebPRescaler* const wrk) {
}
}
static void RescalerExportRowShrink(WebPRescaler* const wrk) {
static void RescalerExportRowShrink_NEON(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
@ -175,8 +175,8 @@ static void RescalerExportRowShrink(WebPRescaler* const wrk) {
extern void WebPRescalerDspInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitNEON(void) {
WebPRescalerExportRowExpand = RescalerExportRowExpand;
WebPRescalerExportRowShrink = RescalerExportRowShrink;
WebPRescalerExportRowExpand = RescalerExportRowExpand_NEON;
WebPRescalerExportRowShrink = RescalerExportRowShrink_NEON;
}
#else // !WEBP_USE_NEON

@ -11,14 +11,14 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
#if defined(WEBP_USE_SSE2) && !defined(WEBP_REDUCE_SIZE)
#include <emmintrin.h>
#include <assert.h>
#include "../utils/rescaler_utils.h"
#include "../utils/utils.h"
#include "src/utils/rescaler_utils.h"
#include "src/utils/utils.h"
//------------------------------------------------------------------------------
// Implementations of critical functions ImportRow / ExportRow
@ -27,7 +27,7 @@
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
// input: 8 bytes ABCDEFGH -> output: A0E0B0F0C0G0D0H0
static void LoadTwoPixels(const uint8_t* const src, __m128i* out) {
static void LoadTwoPixels_SSE2(const uint8_t* const src, __m128i* out) {
const __m128i zero = _mm_setzero_si128();
const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH
const __m128i B = _mm_unpacklo_epi8(A, zero); // A0B0C0D0E0F0G0H0
@ -36,14 +36,14 @@ static void LoadTwoPixels(const uint8_t* const src, __m128i* out) {
}
// input: 8 bytes ABCDEFGH -> output: A0B0C0D0E0F0G0H0
static void LoadHeightPixels(const uint8_t* const src, __m128i* out) {
static void LoadHeightPixels_SSE2(const uint8_t* const src, __m128i* out) {
const __m128i zero = _mm_setzero_si128();
const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH
*out = _mm_unpacklo_epi8(A, zero);
}
static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
const uint8_t* src) {
static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
const uint8_t* src) {
rescaler_t* frow = wrk->frow;
const rescaler_t* const frow_end = frow + wrk->dst_width * wrk->num_channels;
const int x_add = wrk->x_add;
@ -54,10 +54,10 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
assert(wrk->x_expand);
if (wrk->num_channels == 4) {
if (wrk->src_width < 2) {
WebPRescalerImportRowExpandC(wrk, src);
WebPRescalerImportRowExpand_C(wrk, src);
return;
}
LoadTwoPixels(src, &cur_pixels);
LoadTwoPixels_SSE2(src, &cur_pixels);
src += 4;
while (1) {
const __m128i mult = _mm_set1_epi32(((x_add - accum) << 16) | accum);
@ -67,7 +67,7 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
if (frow >= frow_end) break;
accum -= wrk->x_sub;
if (accum < 0) {
LoadTwoPixels(src, &cur_pixels);
LoadTwoPixels_SSE2(src, &cur_pixels);
src += 4;
accum += x_add;
}
@ -76,10 +76,10 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
int left;
const uint8_t* const src_limit = src + wrk->src_width - 8;
if (wrk->src_width < 8) {
WebPRescalerImportRowExpandC(wrk, src);
WebPRescalerImportRowExpand_C(wrk, src);
return;
}
LoadHeightPixels(src, &cur_pixels);
LoadHeightPixels_SSE2(src, &cur_pixels);
src += 7;
left = 7;
while (1) {
@ -94,7 +94,7 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
if (--left) {
cur_pixels = _mm_srli_si128(cur_pixels, 2);
} else if (src <= src_limit) {
LoadHeightPixels(src, &cur_pixels);
LoadHeightPixels_SSE2(src, &cur_pixels);
src += 7;
left = 7;
} else { // tail
@ -110,8 +110,8 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
assert(accum == 0);
}
static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
const uint8_t* src) {
static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
const uint8_t* src) {
const int x_sub = wrk->x_sub;
int accum = 0;
const __m128i zero = _mm_setzero_si128();
@ -123,7 +123,7 @@ static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width;
if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) {
WebPRescalerImportRowShrinkC(wrk, src);
WebPRescalerImportRowShrink_C(wrk, src);
return;
}
assert(!WebPRescalerInputDone(wrk));
@ -169,12 +169,12 @@ static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
// Row export
// load *src as epi64, multiply by mult and store result in [out0 ... out3]
static WEBP_INLINE void LoadDispatchAndMult(const rescaler_t* const src,
const __m128i* const mult,
__m128i* const out0,
__m128i* const out1,
__m128i* const out2,
__m128i* const out3) {
static WEBP_INLINE void LoadDispatchAndMult_SSE2(const rescaler_t* const src,
const __m128i* const mult,
__m128i* const out0,
__m128i* const out1,
__m128i* const out2,
__m128i* const out3) {
const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + 0));
const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + 4));
const __m128i A2 = _mm_srli_epi64(A0, 32);
@ -192,12 +192,12 @@ static WEBP_INLINE void LoadDispatchAndMult(const rescaler_t* const src,
}
}
static WEBP_INLINE void ProcessRow(const __m128i* const A0,
const __m128i* const A1,
const __m128i* const A2,
const __m128i* const A3,
const __m128i* const mult,
uint8_t* const dst) {
static WEBP_INLINE void ProcessRow_SSE2(const __m128i* const A0,
const __m128i* const A1,
const __m128i* const A2,
const __m128i* const A3,
const __m128i* const mult,
uint8_t* const dst) {
const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0);
const __m128i B0 = _mm_mul_epu32(*A0, *mult);
@ -226,7 +226,7 @@ static WEBP_INLINE void ProcessRow(const __m128i* const A0,
_mm_storel_epi64((__m128i*)dst, G);
}
static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
static void RescalerExportRowExpand_SSE2(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
@ -240,8 +240,8 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
if (wrk->y_accum == 0) {
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
__m128i A0, A1, A2, A3;
LoadDispatchAndMult(frow + x_out, NULL, &A0, &A1, &A2, &A3);
ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out);
LoadDispatchAndMult_SSE2(frow + x_out, NULL, &A0, &A1, &A2, &A3);
ProcessRow_SSE2(&A0, &A1, &A2, &A3, &mult, dst + x_out);
}
for (; x_out < x_out_max; ++x_out) {
const uint32_t J = frow[x_out];
@ -257,8 +257,8 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
__m128i A0, A1, A2, A3, B0, B1, B2, B3;
LoadDispatchAndMult(frow + x_out, &mA, &A0, &A1, &A2, &A3);
LoadDispatchAndMult(irow + x_out, &mB, &B0, &B1, &B2, &B3);
LoadDispatchAndMult_SSE2(frow + x_out, &mA, &A0, &A1, &A2, &A3);
LoadDispatchAndMult_SSE2(irow + x_out, &mB, &B0, &B1, &B2, &B3);
{
const __m128i C0 = _mm_add_epi64(A0, B0);
const __m128i C1 = _mm_add_epi64(A1, B1);
@ -272,7 +272,7 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
const __m128i E1 = _mm_srli_epi64(D1, WEBP_RESCALER_RFIX);
const __m128i E2 = _mm_srli_epi64(D2, WEBP_RESCALER_RFIX);
const __m128i E3 = _mm_srli_epi64(D3, WEBP_RESCALER_RFIX);
ProcessRow(&E0, &E1, &E2, &E3, &mult, dst + x_out);
ProcessRow_SSE2(&E0, &E1, &E2, &E3, &mult, dst + x_out);
}
}
for (; x_out < x_out_max; ++x_out) {
@ -286,7 +286,7 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
}
}
static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
static void RescalerExportRowShrink_SSE2(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
@ -303,8 +303,8 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
__m128i A0, A1, A2, A3, B0, B1, B2, B3;
LoadDispatchAndMult(irow + x_out, NULL, &A0, &A1, &A2, &A3);
LoadDispatchAndMult(frow + x_out, &mult_y, &B0, &B1, &B2, &B3);
LoadDispatchAndMult_SSE2(irow + x_out, NULL, &A0, &A1, &A2, &A3);
LoadDispatchAndMult_SSE2(frow + x_out, &mult_y, &B0, &B1, &B2, &B3);
{
const __m128i C0 = _mm_add_epi64(B0, rounder);
const __m128i C1 = _mm_add_epi64(B1, rounder);
@ -324,7 +324,7 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
const __m128i G1 = _mm_or_si128(D1, F3);
_mm_storeu_si128((__m128i*)(irow + x_out + 0), G0);
_mm_storeu_si128((__m128i*)(irow + x_out + 4), G1);
ProcessRow(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out);
ProcessRow_SSE2(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out);
}
}
for (; x_out < x_out_max; ++x_out) {
@ -340,10 +340,10 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
const __m128i zero = _mm_setzero_si128();
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
__m128i A0, A1, A2, A3;
LoadDispatchAndMult(irow + x_out, NULL, &A0, &A1, &A2, &A3);
LoadDispatchAndMult_SSE2(irow + x_out, NULL, &A0, &A1, &A2, &A3);
_mm_storeu_si128((__m128i*)(irow + x_out + 0), zero);
_mm_storeu_si128((__m128i*)(irow + x_out + 4), zero);
ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out);
ProcessRow_SSE2(&A0, &A1, &A2, &A3, &mult, dst + x_out);
}
for (; x_out < x_out_max; ++x_out) {
const int v = (int)MULT_FIX(irow[x_out], scale);
@ -362,10 +362,10 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
extern void WebPRescalerDspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitSSE2(void) {
WebPRescalerImportRowExpand = RescalerImportRowExpandSSE2;
WebPRescalerImportRowShrink = RescalerImportRowShrinkSSE2;
WebPRescalerExportRowExpand = RescalerExportRowExpandSSE2;
WebPRescalerExportRowShrink = RescalerExportRowShrinkSSE2;
WebPRescalerImportRowExpand = RescalerImportRowExpand_SSE2;
WebPRescalerImportRowShrink = RescalerImportRowShrink_SSE2;
WebPRescalerExportRowExpand = RescalerExportRowExpand_SSE2;
WebPRescalerExportRowShrink = RescalerExportRowShrink_SSE2;
}
#else // !WEBP_USE_SSE2

@ -0,0 +1,166 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// distortion calculation
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include <stdlib.h> // for abs()
#include "src/dsp/dsp.h"
#if !defined(WEBP_REDUCE_SIZE)
//------------------------------------------------------------------------------
// SSIM / PSNR
// hat-shaped filter. Sum of coefficients is equal to 16.
static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
1, 2, 3, 4, 3, 2, 1
};
static const uint32_t kWeightSum = 16 * 16; // sum{kWeight}^2
static WEBP_INLINE double SSIMCalculation(
const VP8DistoStats* const stats, uint32_t N /*num samples*/) {
const uint32_t w2 = N * N;
const uint32_t C1 = 20 * w2;
const uint32_t C2 = 60 * w2;
const uint32_t C3 = 8 * 8 * w2; // 'dark' limit ~= 6
const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
if (xmxm + ymym >= C3) {
const int64_t xmym = (int64_t)stats->xm * stats->ym;
const int64_t sxy = (int64_t)stats->xym * N - xmym; // can be negative
const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
const uint64_t syy = (uint64_t)stats->yym * N - ymym;
// we descale by 8 to prevent overflow during the fnum/fden multiply.
const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
const uint64_t den_S = (sxx + syy + C2) >> 8;
const uint64_t fnum = (2 * xmym + C1) * num_S;
const uint64_t fden = (xmxm + ymym + C1) * den_S;
const double r = (double)fnum / fden;
assert(r >= 0. && r <= 1.0);
return r;
}
return 1.; // area is too dark to contribute meaningfully
}
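
The kernel above sums to 1 + 2 + 3 + 4 + 3 + 2 + 1 = 16, so a full 7x7 window carries a total weight of 16 * 16 = 256 = kWeightSum, which is the N that VP8SSIMFromStats passes in. Written out in floating point, the function evaluates the usual SSIM ratio on N-scaled moments (an illustrative restatement, not the upstream code):
// Illustrative sketch: the same quantity as SSIMCalculation, computed with
// doubles and without the >>8 descaling that guards the 64-bit products above.
static double SSIMSketch(const VP8DistoStats* const s, double N) {
  const double C1 = 20. * N * N, C2 = 60. * N * N;
  const double sxy = (double)s->xym * N - (double)s->xm * s->ym;  // ~N^2 * cov
  const double sxx = (double)s->xxm * N - (double)s->xm * s->xm;  // ~N^2 * var(x)
  const double syy = (double)s->yym * N - (double)s->ym * s->ym;  // ~N^2 * var(y)
  const double num = (2. * s->xm * s->ym + C1) *
                     (2. * (sxy < 0. ? 0. : sxy) + C2);
  const double den = ((double)s->xm * s->xm + (double)s->ym * s->ym + C1) *
                     (sxx + syy + C2);
  // (2*mu_x*mu_y + C1)(2*cov + C2) / ((mu_x^2 + mu_y^2 + C1)(var_x + var_y + C2))
  return num / den;
}
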
double VP8SSIMFromStats(const VP8DistoStats* const stats) {
return SSIMCalculation(stats, kWeightSum);
}
double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
return SSIMCalculation(stats, stats->w);
}
static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
const uint8_t* src2, int stride2,
int xo, int yo, int W, int H) {
VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
: yo + VP8_SSIM_KERNEL;
const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL;
const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1
: xo + VP8_SSIM_KERNEL;
int x, y;
src1 += ymin * stride1;
src2 += ymin * stride2;
for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
for (x = xmin; x <= xmax; ++x) {
const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
* kWeight[VP8_SSIM_KERNEL + y - yo];
const uint32_t s1 = src1[x];
const uint32_t s2 = src2[x];
stats.w += w;
stats.xm += w * s1;
stats.ym += w * s2;
stats.xxm += w * s1 * s1;
stats.xym += w * s1 * s2;
stats.yym += w * s2 * s2;
}
}
return VP8SSIMFromStatsClipped(&stats);
}
static double SSIMGet_C(const uint8_t* src1, int stride1,
const uint8_t* src2, int stride2) {
VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
int x, y;
for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
const uint32_t w = kWeight[x] * kWeight[y];
const uint32_t s1 = src1[x];
const uint32_t s2 = src2[x];
stats.xm += w * s1;
stats.ym += w * s2;
stats.xxm += w * s1 * s1;
stats.xym += w * s1 * s2;
stats.yym += w * s2 * s2;
}
}
return VP8SSIMFromStats(&stats);
}
#endif // !defined(WEBP_REDUCE_SIZE)
//------------------------------------------------------------------------------
#if !defined(WEBP_DISABLE_STATS)
static uint32_t AccumulateSSE_C(const uint8_t* src1,
const uint8_t* src2, int len) {
int i;
uint32_t sse2 = 0;
assert(len <= 65535); // to ensure that accumulation fits within uint32_t
for (i = 0; i < len; ++i) {
const int32_t diff = src1[i] - src2[i];
sse2 += diff * diff;
}
return sse2;
}
#endif
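
The assert above documents why a 32-bit accumulator suffices; a quick worked bound (not in the upstream source):
//   worst case: 65535 * 255^2 = 65535 * 65025 = 4,261,413,375 < 2^32 - 1,
// so sse2 cannot wrap around for any len accepted by the assert.
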
//------------------------------------------------------------------------------
#if !defined(WEBP_REDUCE_SIZE)
VP8SSIMGetFunc VP8SSIMGet;
VP8SSIMGetClippedFunc VP8SSIMGetClipped;
#endif
#if !defined(WEBP_DISABLE_STATS)
VP8AccumulateSSEFunc VP8AccumulateSSE;
#endif
extern void VP8SSIMDspInitSSE2(void);
static volatile VP8CPUInfo ssim_last_cpuinfo_used =
(VP8CPUInfo)&ssim_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return;
#if !defined(WEBP_REDUCE_SIZE)
VP8SSIMGetClipped = SSIMGetClipped_C;
VP8SSIMGet = SSIMGet_C;
#endif
#if !defined(WEBP_DISABLE_STATS)
VP8AccumulateSSE = AccumulateSSE_C;
#endif
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8SSIMDspInitSSE2();
}
#endif
}
ssim_last_cpuinfo_used = VP8GetCPUInfo;
}

@ -0,0 +1,165 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 version of distortion calculation
//
// Author: Skal (pascal.massimino@gmail.com)
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <emmintrin.h>
#include "src/dsp/common_sse2.h"
#if !defined(WEBP_DISABLE_STATS)
// Helper function
static WEBP_INLINE void SubtractAndSquare_SSE2(const __m128i a, const __m128i b,
__m128i* const sum) {
// take abs(a-b) in 8b
const __m128i a_b = _mm_subs_epu8(a, b);
const __m128i b_a = _mm_subs_epu8(b, a);
const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
// zero-extend to 16b
const __m128i zero = _mm_setzero_si128();
const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero);
const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero);
// multiply with self
const __m128i sum1 = _mm_madd_epi16(C0, C0);
const __m128i sum2 = _mm_madd_epi16(C1, C1);
*sum = _mm_add_epi32(sum1, sum2);
}
//------------------------------------------------------------------------------
// SSIM / PSNR entry point
static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
const uint8_t* src2, int len) {
int i = 0;
uint32_t sse2 = 0;
if (len >= 16) {
const int limit = len - 32;
int32_t tmp[4];
__m128i sum1;
__m128i sum = _mm_setzero_si128();
__m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
__m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
i += 16;
while (i <= limit) {
const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
__m128i sum2;
i += 16;
SubtractAndSquare_SSE2(a0, b0, &sum1);
sum = _mm_add_epi32(sum, sum1);
a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
i += 16;
SubtractAndSquare_SSE2(a1, b1, &sum2);
sum = _mm_add_epi32(sum, sum2);
}
SubtractAndSquare_SSE2(a0, b0, &sum1);
sum = _mm_add_epi32(sum, sum1);
_mm_storeu_si128((__m128i*)tmp, sum);
sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
for (; i < len; ++i) {
const int32_t diff = src1[i] - src2[i];
sse2 += diff * diff;
}
return sse2;
}
#endif // !defined(WEBP_DISABLE_STATS)
#if !defined(WEBP_REDUCE_SIZE)
static uint32_t HorizontalAdd16b_SSE2(const __m128i* const m) {
uint16_t tmp[8];
const __m128i a = _mm_srli_si128(*m, 8);
const __m128i b = _mm_add_epi16(*m, a);
_mm_storeu_si128((__m128i*)tmp, b);
return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];
}
static uint32_t HorizontalAdd32b_SSE2(const __m128i* const m) {
const __m128i a = _mm_srli_si128(*m, 8);
const __m128i b = _mm_add_epi32(*m, a);
const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));
return (uint32_t)_mm_cvtsi128_si32(c);
}
static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };
#define ACCUMULATE_ROW(WEIGHT) do { \
/* compute row weight (Wx * Wy) */ \
const __m128i Wy = _mm_set1_epi16((WEIGHT)); \
const __m128i W = _mm_mullo_epi16(Wx, Wy); \
/* process 8 bytes at a time (7 bytes, actually) */ \
const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
/* convert to 16b and multiply by weight */ \
const __m128i a1 = _mm_unpacklo_epi8(a0, zero); \
const __m128i b1 = _mm_unpacklo_epi8(b0, zero); \
const __m128i wa1 = _mm_mullo_epi16(a1, W); \
const __m128i wb1 = _mm_mullo_epi16(b1, W); \
/* accumulate */ \
xm = _mm_add_epi16(xm, wa1); \
ym = _mm_add_epi16(ym, wb1); \
xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1)); \
xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1)); \
yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1)); \
src1 += stride1; \
src2 += stride2; \
} while (0)
static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
const uint8_t* src2, int stride2) {
VP8DistoStats stats;
const __m128i zero = _mm_setzero_si128();
__m128i xm = zero, ym = zero; // 16b accums
__m128i xxm = zero, yym = zero, xym = zero; // 32b accum
const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
assert(2 * VP8_SSIM_KERNEL + 1 == 7);
ACCUMULATE_ROW(1);
ACCUMULATE_ROW(2);
ACCUMULATE_ROW(3);
ACCUMULATE_ROW(4);
ACCUMULATE_ROW(3);
ACCUMULATE_ROW(2);
ACCUMULATE_ROW(1);
stats.xm = HorizontalAdd16b_SSE2(&xm);
stats.ym = HorizontalAdd16b_SSE2(&ym);
stats.xxm = HorizontalAdd32b_SSE2(&xxm);
stats.xym = HorizontalAdd32b_SSE2(&xym);
stats.yym = HorizontalAdd32b_SSE2(&yym);
return VP8SSIMFromStats(&stats);
}
#endif // !defined(WEBP_REDUCE_SIZE)
extern void VP8SSIMDspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
#if !defined(WEBP_DISABLE_STATS)
VP8AccumulateSSE = AccumulateSSE_SSE2;
#endif
#if !defined(WEBP_REDUCE_SIZE)
VP8SSIMGet = SSIMGet_SSE2;
#endif
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
#endif // WEBP_USE_SSE2

@ -11,8 +11,8 @@
//
// Author: somnath@google.com (Somnath Banerjee)
#include "./dsp.h"
#include "./yuv.h"
#include "src/dsp/dsp.h"
#include "src/dsp/yuv.h"
#include <assert.h>
@ -63,17 +63,17 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint32_t uv0 = (diag_12 + tl_uv) >> 1; \
const uint32_t uv1 = (diag_03 + t_uv) >> 1; \
FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
top_dst + (2 * x - 1) * XSTEP); \
top_dst + (2 * x - 1) * (XSTEP)); \
FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16), \
top_dst + (2 * x - 0) * XSTEP); \
top_dst + (2 * x - 0) * (XSTEP)); \
} \
if (bottom_y != NULL) { \
const uint32_t uv0 = (diag_03 + l_uv) >> 1; \
const uint32_t uv1 = (diag_12 + uv) >> 1; \
FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
bottom_dst + (2 * x - 1) * XSTEP); \
bottom_dst + (2 * x - 1) * (XSTEP)); \
FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16), \
bottom_dst + (2 * x + 0) * XSTEP); \
bottom_dst + (2 * x + 0) * (XSTEP)); \
} \
tl_uv = t_uv; \
l_uv = uv; \
@ -82,24 +82,50 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
{ \
const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
top_dst + (len - 1) * XSTEP); \
top_dst + (len - 1) * (XSTEP)); \
} \
if (bottom_y != NULL) { \
const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
bottom_dst + (len - 1) * XSTEP); \
bottom_dst + (len - 1) * (XSTEP)); \
} \
} \
}
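
The shifts above implement libwebp's "fancy" 2x chroma upsampling. Assuming diag_12 and diag_03 are defined a few lines above this hunk in the usual form, diag = (tl_uv + t_uv + l_uv + uv + 2 * (pair) + rounding) >> 3, the blend for the output pixel nearest the top-left chroma sample reduces to the classic 9:3:3:1 mix (a derivation, not upstream text):
//   uv0 = (diag_12 + tl_uv) >> 1
//       = (9*tl + 3*t + 3*l + 1*cur + rounding) / 16
// i.e. every upsampled chroma value is a bilinear average of its four nearest
// chroma samples at quarter-pel positions, with weights 9/16, 3/16, 3/16, 1/16.
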
// All variants implemented.
UPSAMPLE_FUNC(UpsampleRgbLinePair, VP8YuvToRgb, 3)
UPSAMPLE_FUNC(UpsampleBgrLinePair, VP8YuvToBgr, 3)
UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4)
UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2)
UPSAMPLE_FUNC(UpsampleRgb565LinePair, VP8YuvToRgb565, 2)
#if !WEBP_NEON_OMIT_C_CODE
UPSAMPLE_FUNC(UpsampleRgbaLinePair_C, VP8YuvToRgba, 4)
UPSAMPLE_FUNC(UpsampleBgraLinePair_C, VP8YuvToBgra, 4)
#if !defined(WEBP_REDUCE_CSP)
UPSAMPLE_FUNC(UpsampleArgbLinePair_C, VP8YuvToArgb, 4)
UPSAMPLE_FUNC(UpsampleRgbLinePair_C, VP8YuvToRgb, 3)
UPSAMPLE_FUNC(UpsampleBgrLinePair_C, VP8YuvToBgr, 3)
UPSAMPLE_FUNC(UpsampleRgba4444LinePair_C, VP8YuvToRgba4444, 2)
UPSAMPLE_FUNC(UpsampleRgb565LinePair_C, VP8YuvToRgb565, 2)
#else
static void EmptyUpsampleFunc(const uint8_t* top_y, const uint8_t* bottom_y,
const uint8_t* top_u, const uint8_t* top_v,
const uint8_t* cur_u, const uint8_t* cur_v,
uint8_t* top_dst, uint8_t* bottom_dst, int len) {
(void)top_y;
(void)bottom_y;
(void)top_u;
(void)top_v;
(void)cur_u;
(void)cur_v;
(void)top_dst;
(void)bottom_dst;
(void)len;
assert(0); // COLORSPACE SUPPORT NOT COMPILED
}
#define UpsampleArgbLinePair_C EmptyUpsampleFunc
#define UpsampleRgbLinePair_C EmptyUpsampleFunc
#define UpsampleBgrLinePair_C EmptyUpsampleFunc
#define UpsampleRgba4444LinePair_C EmptyUpsampleFunc
#define UpsampleRgb565LinePair_C EmptyUpsampleFunc
#endif // WEBP_REDUCE_CSP
#endif
#undef LOAD_UV
#undef UPSAMPLE_FUNC
@ -141,7 +167,6 @@ DUAL_SAMPLE_FUNC(DualLineSamplerARGB, VP8YuvToArgb)
WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last) {
WebPInitUpsamplers();
VP8YUVInit();
#ifdef FANCY_UPSAMPLING
return WebPUpsamplers[alpha_is_last ? MODE_BGRA : MODE_ARGB];
#else
@ -158,16 +183,33 @@ extern void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
int i; \
for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \
for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * (XSTEP)]); \
}
YUV444_FUNC(WebPYuv444ToRgbC, VP8YuvToRgb, 3)
YUV444_FUNC(WebPYuv444ToBgrC, VP8YuvToBgr, 3)
YUV444_FUNC(WebPYuv444ToRgbaC, VP8YuvToRgba, 4)
YUV444_FUNC(WebPYuv444ToBgraC, VP8YuvToBgra, 4)
YUV444_FUNC(WebPYuv444ToArgbC, VP8YuvToArgb, 4)
YUV444_FUNC(WebPYuv444ToRgba4444C, VP8YuvToRgba4444, 2)
YUV444_FUNC(WebPYuv444ToRgb565C, VP8YuvToRgb565, 2)
YUV444_FUNC(WebPYuv444ToRgba_C, VP8YuvToRgba, 4)
YUV444_FUNC(WebPYuv444ToBgra_C, VP8YuvToBgra, 4)
#if !defined(WEBP_REDUCE_CSP)
YUV444_FUNC(WebPYuv444ToRgb_C, VP8YuvToRgb, 3)
YUV444_FUNC(WebPYuv444ToBgr_C, VP8YuvToBgr, 3)
YUV444_FUNC(WebPYuv444ToArgb_C, VP8YuvToArgb, 4)
YUV444_FUNC(WebPYuv444ToRgba4444_C, VP8YuvToRgba4444, 2)
YUV444_FUNC(WebPYuv444ToRgb565_C, VP8YuvToRgb565, 2)
#else
static void EmptyYuv444Func(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
(void)y;
(void)u;
(void)v;
(void)dst;
(void)len;
}
#define WebPYuv444ToRgb_C EmptyYuv444Func
#define WebPYuv444ToBgr_C EmptyYuv444Func
#define WebPYuv444ToArgb_C EmptyYuv444Func
#define WebPYuv444ToRgba4444_C EmptyYuv444Func
#define WebPYuv444ToRgb565_C EmptyYuv444Func
#endif // WEBP_REDUCE_CSP
#undef YUV444_FUNC
@ -182,17 +224,17 @@ static volatile VP8CPUInfo upsampling_last_cpuinfo_used1 =
WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) {
if (upsampling_last_cpuinfo_used1 == VP8GetCPUInfo) return;
WebPYUV444Converters[MODE_RGB] = WebPYuv444ToRgbC;
WebPYUV444Converters[MODE_RGBA] = WebPYuv444ToRgbaC;
WebPYUV444Converters[MODE_BGR] = WebPYuv444ToBgrC;
WebPYUV444Converters[MODE_BGRA] = WebPYuv444ToBgraC;
WebPYUV444Converters[MODE_ARGB] = WebPYuv444ToArgbC;
WebPYUV444Converters[MODE_RGBA_4444] = WebPYuv444ToRgba4444C;
WebPYUV444Converters[MODE_RGB_565] = WebPYuv444ToRgb565C;
WebPYUV444Converters[MODE_rgbA] = WebPYuv444ToRgbaC;
WebPYUV444Converters[MODE_bgrA] = WebPYuv444ToBgraC;
WebPYUV444Converters[MODE_Argb] = WebPYuv444ToArgbC;
WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444C;
WebPYUV444Converters[MODE_RGBA] = WebPYuv444ToRgba_C;
WebPYUV444Converters[MODE_BGRA] = WebPYuv444ToBgra_C;
WebPYUV444Converters[MODE_RGB] = WebPYuv444ToRgb_C;
WebPYUV444Converters[MODE_BGR] = WebPYuv444ToBgr_C;
WebPYUV444Converters[MODE_ARGB] = WebPYuv444ToArgb_C;
WebPYUV444Converters[MODE_RGBA_4444] = WebPYuv444ToRgba4444_C;
WebPYUV444Converters[MODE_RGB_565] = WebPYuv444ToRgb565_C;
WebPYUV444Converters[MODE_rgbA] = WebPYuv444ToRgba_C;
WebPYUV444Converters[MODE_bgrA] = WebPYuv444ToBgra_C;
WebPYUV444Converters[MODE_Argb] = WebPYuv444ToArgb_C;
WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444_C;
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
@ -224,17 +266,19 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
if (upsampling_last_cpuinfo_used2 == VP8GetCPUInfo) return;
#ifdef FANCY_UPSAMPLING
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
#if !WEBP_NEON_OMIT_C_CODE
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_C;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_C;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_C;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_C;
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_C;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_C;
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_C;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_C;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair_C;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_C;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_C;
#endif
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
@ -243,11 +287,6 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
WebPInitUpsamplersSSE2();
}
#endif
#if defined(WEBP_USE_NEON)
if (VP8GetCPUInfo(kNEON)) {
WebPInitUpsamplersNEON();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
WebPInitUpsamplersMIPSdspR2();
@ -259,6 +298,26 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
}
#endif
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPInitUpsamplersNEON();
}
#endif
assert(WebPUpsamplers[MODE_RGBA] != NULL);
assert(WebPUpsamplers[MODE_BGRA] != NULL);
assert(WebPUpsamplers[MODE_rgbA] != NULL);
assert(WebPUpsamplers[MODE_bgrA] != NULL);
assert(WebPUpsamplers[MODE_RGB] != NULL);
assert(WebPUpsamplers[MODE_BGR] != NULL);
assert(WebPUpsamplers[MODE_ARGB] != NULL);
assert(WebPUpsamplers[MODE_RGBA_4444] != NULL);
assert(WebPUpsamplers[MODE_RGB_565] != NULL);
assert(WebPUpsamplers[MODE_Argb] != NULL);
assert(WebPUpsamplers[MODE_rgbA_4444] != NULL);
#endif // FANCY_UPSAMPLING
upsampling_last_cpuinfo_used2 = VP8GetCPUInfo;
}

@ -12,14 +12,12 @@
// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
// Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include <assert.h>
#include "./yuv.h"
#if !defined(WEBP_YUV_USE_TABLE)
#include "src/dsp/yuv.h"
#define YUV_TO_RGB(Y, U, V, R, G, B) do { \
const int t1 = MultHi(Y, 19077); \
@ -48,6 +46,7 @@
); \
} while (0)
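
The fixed-point constants in YUV_TO_RGB look like Q14 encodings of the usual BT.601 limited-range conversion; for instance the luma scale visible above (an inference, not something stated in this diff):
//   19077 / 2^14 = 1.16437..., which is 255 / 219, the Y expansion factor;
// MultHi() presumably keeps the high part of the product, and the remaining
// shifts bring the result back to an 8-bit range.
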
#if !defined(WEBP_REDUCE_CSP)
static WEBP_INLINE void YuvToRgb(int y, int u, int v, uint8_t* const rgb) {
int r, g, b;
YUV_TO_RGB(y, u, v, r, g, b);
@ -68,7 +67,7 @@ static WEBP_INLINE void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
{
const int rg = (r & 0xf8) | (g >> 5);
const int gb = ((g << 3) & 0xe0) | (b >> 3);
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
rgb[0] = gb;
rgb[1] = rg;
#else
@ -84,7 +83,7 @@ static WEBP_INLINE void YuvToRgba4444(int y, int u, int v,
{
const int rg = (r & 0xf0) | (g >> 4);
const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
argb[0] = ba;
argb[1] = rg;
#else
@ -93,11 +92,12 @@ static WEBP_INLINE void YuvToRgba4444(int y, int u, int v,
#endif
}
}
#endif // WEBP_YUV_USE_TABLE
#endif // WEBP_REDUCE_CSP
//-----------------------------------------------------------------------------
// Alpha handling variants
#if !defined(WEBP_REDUCE_CSP)
static WEBP_INLINE void YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
uint8_t* const argb) {
int r, g, b;
@ -107,6 +107,7 @@ static WEBP_INLINE void YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
argb[2] = g;
argb[3] = b;
}
#endif // WEBP_REDUCE_CSP
static WEBP_INLINE void YuvToBgra(uint8_t y, uint8_t u, uint8_t v,
uint8_t* const bgra) {
int r, g, b;
@ -200,13 +201,15 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
}
// All variants implemented.
UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
UPSAMPLE_FUNC(UpsampleRgbaLinePair, YuvToRgba, 4)
UPSAMPLE_FUNC(UpsampleBgraLinePair, YuvToBgra, 4)
#if !defined(WEBP_REDUCE_CSP)
UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
UPSAMPLE_FUNC(UpsampleArgbLinePair, YuvToArgb, 4)
UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2)
#endif // WEBP_REDUCE_CSP
#undef LOAD_UV
#undef UPSAMPLE_FUNC
@ -217,17 +220,19 @@ UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2)
extern void WebPInitUpsamplersMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMIPSdspR2(void) {
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
#if !defined(WEBP_REDUCE_CSP)
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
#endif // WEBP_REDUCE_CSP
}
#endif // FANCY_UPSAMPLING
@ -242,13 +247,15 @@ static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \
}
YUV444_FUNC(Yuv444ToRgb, YuvToRgb, 3)
YUV444_FUNC(Yuv444ToBgr, YuvToBgr, 3)
YUV444_FUNC(Yuv444ToRgba, YuvToRgba, 4)
YUV444_FUNC(Yuv444ToBgra, YuvToBgra, 4)
#if !defined(WEBP_REDUCE_CSP)
YUV444_FUNC(Yuv444ToRgb, YuvToRgb, 3)
YUV444_FUNC(Yuv444ToBgr, YuvToBgr, 3)
YUV444_FUNC(Yuv444ToArgb, YuvToArgb, 4)
YUV444_FUNC(Yuv444ToRgba4444, YuvToRgba4444, 2)
YUV444_FUNC(Yuv444ToRgb565, YuvToRgb565, 2)
#endif // WEBP_REDUCE_CSP
#undef YUV444_FUNC
@ -258,17 +265,19 @@ YUV444_FUNC(Yuv444ToRgb565, YuvToRgb565, 2)
extern void WebPInitYUV444ConvertersMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersMIPSdspR2(void) {
WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb;
WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba;
WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr;
WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra;
WebPYUV444Converters[MODE_rgbA] = Yuv444ToRgba;
WebPYUV444Converters[MODE_bgrA] = Yuv444ToBgra;
#if !defined(WEBP_REDUCE_CSP)
WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb;
WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr;
WebPYUV444Converters[MODE_ARGB] = Yuv444ToArgb;
WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444;
WebPYUV444Converters[MODE_RGB_565] = Yuv444ToRgb565;
WebPYUV444Converters[MODE_rgbA] = Yuv444ToRgba;
WebPYUV444Converters[MODE_bgrA] = Yuv444ToBgra;
WebPYUV444Converters[MODE_Argb] = Yuv444ToArgb;
WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444;
#endif // WEBP_REDUCE_CSP
}
#else // !WEBP_USE_MIPS_DSP_R2

@ -12,12 +12,12 @@
// Author: Prashant Patil (prashant.patil@imgtec.com)
#include <string.h>
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MSA)
#include "./msa_macro.h"
#include "./yuv.h"
#include "src/dsp/msa_macro.h"
#include "src/dsp/yuv.h"
#ifdef FANCY_UPSAMPLING
@ -274,7 +274,7 @@ static void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
const int b = Clip8(b1 >> 6);
const int rg = (r & 0xf8) | (g >> 5);
const int gb = ((g << 3) & 0xe0) | (b >> 3);
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
rgb[0] = gb;
rgb[1] = rg;
#else
@ -293,7 +293,7 @@ static void YuvToRgba4444(int y, int u, int v, uint8_t* const argb) {
const int b = Clip8(b1 >> 6);
const int rg = (r & 0xf0) | (g >> 4);
const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
argb[0] = ba;
argb[1] = rg;
#else
@ -374,7 +374,7 @@ static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B;
const v16u8 A = (v16u8)__msa_ldi_b(0xff);
const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
while (length >= 16) {
CALC_RGB16(y, u, v, R, G, B);
STORE16_4(R, G, B, A, dst);
@ -402,7 +402,7 @@ static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B;
const v16u8 A = (v16u8)__msa_ldi_b(0xff);
const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
while (length >= 16) {
CALC_RGB16(y, u, v, R, G, B);
STORE16_4(B, G, R, A, dst);
@ -430,7 +430,7 @@ static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B;
const v16u8 A = (v16u8)__msa_ldi_b(0xff);
const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
while (length >= 16) {
CALC_RGB16(y, u, v, R, G, B);
STORE16_4(A, R, G, B, dst);
@ -459,11 +459,11 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B, RG, BA, tmp0, tmp1;
while (length >= 16) {
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
CALC_RGBA4444(y, u, v, BA, RG, 16, dst);
#else
#else
CALC_RGBA4444(y, u, v, RG, BA, 16, dst);
#endif
#endif
y += 16;
u += 16;
v += 16;
@ -473,7 +473,7 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
if (length > 8) {
uint8_t temp[2 * 16] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
CALC_RGBA4444(temp, u, v, BA, RG, 16, temp);
#else
CALC_RGBA4444(temp, u, v, RG, BA, 16, temp);
@ -482,7 +482,7 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
} else if (length > 0) {
uint8_t temp[2 * 8] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
CALC_RGBA4444(temp, u, v, BA, RG, 8, temp);
#else
CALC_RGBA4444(temp, u, v, RG, BA, 8, temp);
@ -495,11 +495,11 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B, RG, GB, tmp0, tmp1;
while (length >= 16) {
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
CALC_RGB565(y, u, v, GB, RG, 16, dst);
#else
#else
CALC_RGB565(y, u, v, RG, GB, 16, dst);
#endif
#endif
y += 16;
u += 16;
v += 16;
@ -509,7 +509,7 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
if (length > 8) {
uint8_t temp[2 * 16] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
CALC_RGB565(temp, u, v, GB, RG, 16, temp);
#else
CALC_RGB565(temp, u, v, RG, GB, 16, temp);
@ -518,7 +518,7 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
} else if (length > 0) {
uint8_t temp[2 * 8] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
CALC_RGB565(temp, u, v, GB, RG, 8, temp);
#else
CALC_RGB565(temp, u, v, RG, GB, 8, temp);
@ -640,13 +640,15 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \
} \
}
UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
UPSAMPLE_FUNC(UpsampleRgbaLinePair, YuvToRgba, 4)
UPSAMPLE_FUNC(UpsampleBgraLinePair, YuvToBgra, 4)
#if !defined(WEBP_REDUCE_CSP)
UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
UPSAMPLE_FUNC(UpsampleArgbLinePair, YuvToArgb, 4)
UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2)
#endif // WEBP_REDUCE_CSP
//------------------------------------------------------------------------------
// Entry point
@ -656,17 +658,19 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
extern void WebPInitUpsamplersMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMSA(void) {
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
#if !defined(WEBP_REDUCE_CSP)
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
#endif // WEBP_REDUCE_CSP
}
#endif // FANCY_UPSAMPLING

@ -12,15 +12,15 @@
// Author: mans@mansr.com (Mans Rullgard)
// Based on SSE code by: somnath@google.com (Somnath Banerjee)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_NEON)
#include <assert.h>
#include <arm_neon.h>
#include <string.h>
#include "./neon.h"
#include "./yuv.h"
#include "src/dsp/neon.h"
#include "src/dsp/yuv.h"
#ifdef FANCY_UPSAMPLING
@ -58,8 +58,8 @@
} while (0)
// Turn the macro into a function for reducing code-size when non-critical
static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
uint8_t *out) {
static void Upsample16Pixels_NEON(const uint8_t *r1, const uint8_t *r2,
uint8_t *out) {
UPSAMPLE_16PIXELS(r1, r2, out);
}
@ -70,7 +70,7 @@ static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
/* replicate last byte */ \
memset(r1 + (num_pixels), r1[(num_pixels) - 1], 9 - (num_pixels)); \
memset(r2 + (num_pixels), r2[(num_pixels) - 1], 9 - (num_pixels)); \
Upsample16Pixels(r1, r2, out); \
Upsample16Pixels_NEON(r1, r2, out); \
}
//-----------------------------------------------------------------------------
@ -243,13 +243,15 @@ static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y, \
}
// NEON variants of the fancy upsampler.
NEON_UPSAMPLE_FUNC(UpsampleRgbLinePair, Rgb, 3)
NEON_UPSAMPLE_FUNC(UpsampleBgrLinePair, Bgr, 3)
NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePair, Rgba, 4)
NEON_UPSAMPLE_FUNC(UpsampleBgraLinePair, Bgra, 4)
NEON_UPSAMPLE_FUNC(UpsampleArgbLinePair, Argb, 4)
NEON_UPSAMPLE_FUNC(UpsampleRgba4444LinePair, Rgba4444, 2)
NEON_UPSAMPLE_FUNC(UpsampleRgb565LinePair, Rgb565, 2)
NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePair_NEON, Rgba, 4)
NEON_UPSAMPLE_FUNC(UpsampleBgraLinePair_NEON, Bgra, 4)
#if !defined(WEBP_REDUCE_CSP)
NEON_UPSAMPLE_FUNC(UpsampleRgbLinePair_NEON, Rgb, 3)
NEON_UPSAMPLE_FUNC(UpsampleBgrLinePair_NEON, Bgr, 3)
NEON_UPSAMPLE_FUNC(UpsampleArgbLinePair_NEON, Argb, 4)
NEON_UPSAMPLE_FUNC(UpsampleRgba4444LinePair_NEON, Rgba4444, 2)
NEON_UPSAMPLE_FUNC(UpsampleRgb565LinePair_NEON, Rgb565, 2)
#endif // WEBP_REDUCE_CSP
//------------------------------------------------------------------------------
// Entry point
@ -259,17 +261,19 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
extern void WebPInitUpsamplersNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersNEON(void) {
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_NEON;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_NEON;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_NEON;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_NEON;
#if !defined(WEBP_REDUCE_CSP)
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_NEON;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_NEON;
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_NEON;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_NEON;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair_NEON;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_NEON;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_NEON;
#endif // WEBP_REDUCE_CSP
}
#endif // FANCY_UPSAMPLING

@ -11,14 +11,14 @@
//
// Author: somnath@google.com (Somnath Banerjee)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <emmintrin.h>
#include <string.h>
#include "./yuv.h"
#include "src/dsp/yuv.h"
#ifdef FANCY_UPSAMPLING
@ -83,13 +83,13 @@
GET_M(ad, s, diag2); /* diag2 = (3a + b + c + 3d) / 8 */ \
\
/* pack the alternate pixels */ \
PACK_AND_STORE(a, b, diag1, diag2, out + 0); /* store top */ \
PACK_AND_STORE(c, d, diag2, diag1, out + 2 * 32); /* store bottom */ \
PACK_AND_STORE(a, b, diag1, diag2, (out) + 0); /* store top */ \
PACK_AND_STORE(c, d, diag2, diag1, (out) + 2 * 32); /* store bottom */ \
}
// Turn the macro into a function for reducing code-size when non-critical
static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],
uint8_t* const out) {
static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[],
uint8_t* const out) {
UPSAMPLE_32PIXELS(r1, r2, out);
}
@ -101,30 +101,30 @@ static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],
memset(r1 + (num_pixels), r1[(num_pixels) - 1], 17 - (num_pixels)); \
memset(r2 + (num_pixels), r2[(num_pixels) - 1], 17 - (num_pixels)); \
/* using the shared function instead of the macro saves ~3k code size */ \
Upsample32Pixels(r1, r2, out); \
Upsample32Pixels_SSE2(r1, r2, out); \
}
#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, \
top_dst, bottom_dst, cur_x, num_pixels) { \
int n; \
for (n = 0; n < (num_pixels); ++n) { \
FUNC(top_y[(cur_x) + n], r_u[n], r_v[n], \
top_dst + ((cur_x) + n) * XSTEP); \
FUNC((top_y)[(cur_x) + n], r_u[n], r_v[n], \
(top_dst) + ((cur_x) + n) * (XSTEP)); \
} \
if (bottom_y != NULL) { \
if ((bottom_y) != NULL) { \
for (n = 0; n < (num_pixels); ++n) { \
FUNC(bottom_y[(cur_x) + n], r_u[64 + n], r_v[64 + n], \
bottom_dst + ((cur_x) + n) * XSTEP); \
FUNC((bottom_y)[(cur_x) + n], r_u[64 + n], r_v[64 + n], \
(bottom_dst) + ((cur_x) + n) * (XSTEP)); \
} \
} \
}
#define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, \
top_dst, bottom_dst, cur_x) do { \
FUNC##32(top_y + (cur_x), r_u, r_v, top_dst + (cur_x) * XSTEP); \
if (bottom_y != NULL) { \
FUNC##32(bottom_y + (cur_x), r_u + 64, r_v + 64, \
bottom_dst + (cur_x) * XSTEP); \
FUNC##32_SSE2((top_y) + (cur_x), r_u, r_v, (top_dst) + (cur_x) * (XSTEP)); \
if ((bottom_y) != NULL) { \
FUNC##32_SSE2((bottom_y) + (cur_x), r_u + 64, r_v + 64, \
(bottom_dst) + (cur_x) * (XSTEP)); \
} \
} while (0)
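The parentheses this hunk adds around macro parameters (XSTEP, top_y, bottom_y, ...) are the usual macro-hygiene fix: an argument that is itself an expression must not rebind inside the expansion. A tiny illustration with a hypothetical macro (not from the diff):
#define OFFSET_BAD(base, step)  ((base) + 4 * step)     /* step unparenthesized */
#define OFFSET_OK(base, step)   ((base) + 4 * (step))
/* With step passed as `x + 1`:
 *   OFFSET_BAD(p, x + 1) -> (p) + 4 * x + 1      (wrong grouping)
 *   OFFSET_OK(p, x + 1)  -> (p) + 4 * (x + 1)    (intended)        */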
@ -169,13 +169,16 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
}
// SSE2 variants of the fancy upsampler.
SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePair, VP8YuvToRgb, 3)
SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePair, VP8YuvToBgr, 3)
SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
SSE2_UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4)
SSE2_UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2)
SSE2_UPSAMPLE_FUNC(UpsampleRgb565LinePair, VP8YuvToRgb565, 2)
SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePair_SSE2, VP8YuvToRgba, 4)
SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePair_SSE2, VP8YuvToBgra, 4)
#if !defined(WEBP_REDUCE_CSP)
SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePair_SSE2, VP8YuvToRgb, 3)
SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePair_SSE2, VP8YuvToBgr, 3)
SSE2_UPSAMPLE_FUNC(UpsampleArgbLinePair_SSE2, VP8YuvToArgb, 4)
SSE2_UPSAMPLE_FUNC(UpsampleRgba4444LinePair_SSE2, VP8YuvToRgba4444, 2)
SSE2_UPSAMPLE_FUNC(UpsampleRgb565LinePair_SSE2, VP8YuvToRgb565, 2)
#endif // WEBP_REDUCE_CSP
#undef GET_M
#undef PACK_AND_STORE
@ -193,17 +196,19 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
extern void WebPInitUpsamplersSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE2(void) {
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_SSE2;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_SSE2;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_SSE2;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_SSE2;
#if !defined(WEBP_REDUCE_CSP)
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_SSE2;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_SSE2;
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_SSE2;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_SSE2;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair_SSE2;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_SSE2;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_SSE2;
#endif // WEBP_REDUCE_CSP
}
#endif // FANCY_UPSAMPLING
@ -213,29 +218,46 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE2(void) {
extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
extern void WebPInitYUV444ConvertersSSE2(void);
#define YUV444_FUNC(FUNC_NAME, CALL, XSTEP) \
extern void WebP##FUNC_NAME##C(const uint8_t* y, const uint8_t* u, \
const uint8_t* v, uint8_t* dst, int len); \
#define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \
extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len); \
static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
int i; \
const int max_len = len & ~31; \
for (i = 0; i < max_len; i += 32) CALL(y + i, u + i, v + i, dst + i * XSTEP);\
for (i = 0; i < max_len; i += 32) { \
CALL(y + i, u + i, v + i, dst + i * (XSTEP)); \
} \
if (i < len) { /* C-fallback */ \
WebP##FUNC_NAME##C(y + i, u + i, v + i, dst + i * XSTEP, len - i); \
CALL_C(y + i, u + i, v + i, dst + i * (XSTEP), len - i); \
} \
}
YUV444_FUNC(Yuv444ToRgba, VP8YuvToRgba32, 4);
YUV444_FUNC(Yuv444ToBgra, VP8YuvToBgra32, 4);
YUV444_FUNC(Yuv444ToRgb, VP8YuvToRgb32, 3);
YUV444_FUNC(Yuv444ToBgr, VP8YuvToBgr32, 3);
YUV444_FUNC(Yuv444ToRgba_SSE2, VP8YuvToRgba32_SSE2, WebPYuv444ToRgba_C, 4);
YUV444_FUNC(Yuv444ToBgra_SSE2, VP8YuvToBgra32_SSE2, WebPYuv444ToBgra_C, 4);
#if !defined(WEBP_REDUCE_CSP)
YUV444_FUNC(Yuv444ToRgb_SSE2, VP8YuvToRgb32_SSE2, WebPYuv444ToRgb_C, 3);
YUV444_FUNC(Yuv444ToBgr_SSE2, VP8YuvToBgr32_SSE2, WebPYuv444ToBgr_C, 3);
YUV444_FUNC(Yuv444ToArgb_SSE2, VP8YuvToArgb32_SSE2, WebPYuv444ToArgb_C, 4)
YUV444_FUNC(Yuv444ToRgba4444_SSE2, VP8YuvToRgba444432_SSE2, \
WebPYuv444ToRgba4444_C, 2)
YUV444_FUNC(Yuv444ToRgb565_SSE2, VP8YuvToRgb56532_SSE2, WebPYuv444ToRgb565_C, 2)
#endif // WEBP_REDUCE_CSP
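YUV444_FUNC above is the usual split between a wide SSE2 main loop and a scalar tail: 32 pixels at a time through the *_SSE2 kernel, then the matching WebPYuv444To*_C routine for the 0..31 leftover pixels. A stripped-down sketch of the same structure, with hypothetical helper names:
#include <stdint.h>
/* hypothetical kernels, assumed to exist elsewhere */
void Block32_SIMD(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* dst);
void Row_C(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* dst, int len);
static void Row(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                uint8_t* dst, int len) {
  int i;
  const int max_len = len & ~31;                    /* largest multiple of 32 */
  for (i = 0; i < max_len; i += 32) {
    Block32_SIMD(y + i, u + i, v + i, dst + i * 4); /* 4 = bytes per pixel */
  }
  if (i < len) {                                    /* scalar fallback */
    Row_C(y + i, u + i, v + i, dst + i * 4, len - i);
  }
}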
WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersSSE2(void) {
WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba;
WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra;
WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb;
WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr;
WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba_SSE2;
WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra_SSE2;
WebPYUV444Converters[MODE_rgbA] = Yuv444ToRgba_SSE2;
WebPYUV444Converters[MODE_bgrA] = Yuv444ToBgra_SSE2;
#if !defined(WEBP_REDUCE_CSP)
WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb_SSE2;
WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr_SSE2;
WebPYUV444Converters[MODE_ARGB] = Yuv444ToArgb_SSE2;
WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444_SSE2;
WebPYUV444Converters[MODE_RGB_565] = Yuv444ToRgb565_SSE2;
WebPYUV444Converters[MODE_Argb] = Yuv444ToArgb_SSE2;
WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444_SSE2;
#endif // WEBP_REDUCE_CSP
}
#else

@ -11,63 +11,11 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./yuv.h"
#include "src/dsp/yuv.h"
#include <assert.h>
#include <stdlib.h>
#if defined(WEBP_YUV_USE_TABLE)
static int done = 0;
static WEBP_INLINE uint8_t clip(int v, int max_value) {
return v < 0 ? 0 : v > max_value ? max_value : v;
}
int16_t VP8kVToR[256], VP8kUToB[256];
int32_t VP8kVToG[256], VP8kUToG[256];
uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {
int i;
if (done) {
return;
}
#ifndef USE_YUVj
for (i = 0; i < 256; ++i) {
VP8kVToR[i] = (89858 * (i - 128) + YUV_HALF) >> YUV_FIX;
VP8kUToG[i] = -22014 * (i - 128) + YUV_HALF;
VP8kVToG[i] = -45773 * (i - 128);
VP8kUToB[i] = (113618 * (i - 128) + YUV_HALF) >> YUV_FIX;
}
for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) {
const int k = ((i - 16) * 76283 + YUV_HALF) >> YUV_FIX;
VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
}
#else
for (i = 0; i < 256; ++i) {
VP8kVToR[i] = (91881 * (i - 128) + YUV_HALF) >> YUV_FIX;
VP8kUToG[i] = -22554 * (i - 128) + YUV_HALF;
VP8kVToG[i] = -46802 * (i - 128);
VP8kUToB[i] = (116130 * (i - 128) + YUV_HALF) >> YUV_FIX;
}
for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) {
const int k = i;
VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
}
#endif
done = 1;
}
#else
WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {}
#endif // WEBP_YUV_USE_TABLE
//-----------------------------------------------------------------------------
// Plain-C version
@ -75,14 +23,14 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {}
static void FUNC_NAME(const uint8_t* y, \
const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
const uint8_t* const end = dst + (len & ~1) * XSTEP; \
const uint8_t* const end = dst + (len & ~1) * (XSTEP); \
while (dst != end) { \
FUNC(y[0], u[0], v[0], dst); \
FUNC(y[1], u[0], v[0], dst + XSTEP); \
FUNC(y[1], u[0], v[0], dst + (XSTEP)); \
y += 2; \
++u; \
++v; \
dst += 2 * XSTEP; \
dst += 2 * (XSTEP); \
} \
if (len & 1) { \
FUNC(y[0], u[0], v[0], dst); \
@ -168,7 +116,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) {
//-----------------------------------------------------------------------------
// ARGB -> YUV converters
static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
static void ConvertARGBToY_C(const uint32_t* argb, uint8_t* y, int width) {
int i;
for (i = 0; i < width; ++i) {
const uint32_t p = argb[i];
@ -220,14 +168,14 @@ void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
//-----------------------------------------------------------------------------
static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
static void ConvertRGB24ToY_C(const uint8_t* rgb, uint8_t* y, int width) {
int i;
for (i = 0; i < width; ++i, rgb += 3) {
y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
}
}
static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
static void ConvertBGR24ToY_C(const uint8_t* bgr, uint8_t* y, int width) {
int i;
for (i = 0; i < width; ++i, bgr += 3) {
y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
@ -246,6 +194,7 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
//-----------------------------------------------------------------------------
#if !WEBP_NEON_OMIT_C_CODE
#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic
static uint16_t clip_y(int v) {
return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
@ -283,6 +232,7 @@ static void SharpYUVFilterRow_C(const int16_t* A, const int16_t* B, int len,
out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
#undef MAX_Y
@ -308,22 +258,26 @@ static volatile VP8CPUInfo rgba_to_yuv_last_cpuinfo_used =
(VP8CPUInfo)&rgba_to_yuv_last_cpuinfo_used;
extern void WebPInitConvertARGBToYUVSSE2(void);
extern void WebPInitConvertARGBToYUVNEON(void);
extern void WebPInitSharpYUVSSE2(void);
extern void WebPInitSharpYUVNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
if (rgba_to_yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
WebPConvertARGBToY = ConvertARGBToY;
WebPConvertARGBToY = ConvertARGBToY_C;
WebPConvertARGBToUV = WebPConvertARGBToUV_C;
WebPConvertRGB24ToY = ConvertRGB24ToY;
WebPConvertBGR24ToY = ConvertBGR24ToY;
WebPConvertRGB24ToY = ConvertRGB24ToY_C;
WebPConvertBGR24ToY = ConvertBGR24ToY_C;
WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C;
#if !WEBP_NEON_OMIT_C_CODE
WebPSharpYUVUpdateY = SharpYUVUpdateY_C;
WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_C;
WebPSharpYUVFilterRow = SharpYUVFilterRow_C;
#endif
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
@ -333,5 +287,23 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
}
#endif // WEBP_USE_SSE2
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPInitConvertARGBToYUVNEON();
WebPInitSharpYUVNEON();
}
#endif // WEBP_USE_NEON
assert(WebPConvertARGBToY != NULL);
assert(WebPConvertARGBToUV != NULL);
assert(WebPConvertRGB24ToY != NULL);
assert(WebPConvertBGR24ToY != NULL);
assert(WebPConvertRGBA32ToUV != NULL);
assert(WebPSharpYUVUpdateY != NULL);
assert(WebPSharpYUVUpdateRGB != NULL);
assert(WebPSharpYUVFilterRow != NULL);
rgba_to_yuv_last_cpuinfo_used = VP8GetCPUInfo;
}
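WebPInitConvertARGBToYUV above follows the dispatch pattern used throughout dsp/: install the C fallbacks, let the SSE2/NEON initializers overwrite the slots when VP8GetCPUInfo reports the feature, and assert that every slot is populated (which matters once WEBP_NEON_OMIT_C_CODE compiles the C defaults out). A reduced sketch of the idea, with hypothetical names:
#include <assert.h>
#include <stdint.h>
typedef void (*RowFunc)(const uint8_t* src, uint8_t* dst, int len);
static RowFunc ProcessRow;                              /* dispatch slot */
static void ProcessRow_C(const uint8_t* s, uint8_t* d, int len) {
  while (len-- > 0) *d++ = *s++;                        /* portable reference */
}
static void ProcessRow_SIMD(const uint8_t* s, uint8_t* d, int len);  /* elsewhere */
static void InitProcessRow(int have_simd) {
  ProcessRow = ProcessRow_C;                            /* safe default */
  if (have_simd) ProcessRow = ProcessRow_SIMD;          /* CPU-specific override */
  assert(ProcessRow != NULL);                           /* every slot filled */
}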

@ -35,18 +35,8 @@
#ifndef WEBP_DSP_YUV_H_
#define WEBP_DSP_YUV_H_
#include "./dsp.h"
#include "../dec/vp8_dec.h"
#if defined(WEBP_EXPERIMENTAL_FEATURES)
// Do NOT activate this feature for real compression. This is only experimental!
// This flag is for comparison purpose against JPEG's "YUVj" natural colorspace.
// This colorspace is close to Rec.601's Y'CbCr model with the notable
// difference of allowing larger range for luma/chroma.
// See http://en.wikipedia.org/wiki/YCbCr#JPEG_conversion paragraph, and its
// difference with http://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
// #define USE_YUVj
#endif
#include "src/dsp/dsp.h"
#include "src/dec/vp8_dec.h"
//------------------------------------------------------------------------------
// YUV -> RGB conversion
@ -58,12 +48,8 @@ extern "C" {
enum {
YUV_FIX = 16, // fixed-point precision for RGB->YUV
YUV_HALF = 1 << (YUV_FIX - 1),
YUV_MASK = (256 << YUV_FIX) - 1,
YUV_RANGE_MIN = -227, // min value of r/g/b output
YUV_RANGE_MAX = 256 + 226, // max value of r/g/b output
YUV_FIX2 = 6, // fixed-point precision for YUV->RGB
YUV_HALF2 = 1 << YUV_FIX2 >> 1,
YUV_MASK2 = (256 << YUV_FIX2) - 1
};
@ -111,7 +97,7 @@ static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v,
const int b = VP8YUVToB(y, u); // 5 usable bits
const int rg = (r & 0xf8) | (g >> 5);
const int gb = ((g << 3) & 0xe0) | (b >> 3);
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
rgb[0] = gb;
rgb[1] = rg;
#else
@ -127,7 +113,7 @@ static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,
const int b = VP8YUVToB(y, u); // 4 usable bits
const int rg = (r & 0xf0) | (g >> 4);
const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
argb[0] = ba;
argb[1] = rg;
#else
@ -157,29 +143,26 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
rgba[3] = 0xff;
}
// Must be called before everything, to initialize the tables.
void VP8YUVInit(void);
//-----------------------------------------------------------------------------
// SSE2 extra functions (mostly for upsampling_sse2.c)
#if defined(WEBP_USE_SSE2)
// Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v,
void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst);
void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
#endif // WEBP_USE_SSE2
@ -192,8 +175,6 @@ static WEBP_INLINE int VP8ClipUV(int uv, int rounding) {
return ((uv & ~0xff) == 0) ? uv : (uv < 0) ? 0 : 255;
}
#ifndef USE_YUVj
static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
const int luma = 16839 * r + 33059 * g + 6420 * b;
return (luma + rounding + (16 << YUV_FIX)) >> YUV_FIX; // no need to clip
@ -209,28 +190,6 @@ static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
return VP8ClipUV(v, rounding);
}
#else
// This JPEG-YUV colorspace, only for comparison!
// These are also 16bit precision coefficients from Rec.601, but with full
// [0..255] output range.
static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
const int luma = 19595 * r + 38470 * g + 7471 * b;
return (luma + rounding) >> YUV_FIX; // no need to clip
}
static WEBP_INLINE int VP8RGBToU(int r, int g, int b, int rounding) {
const int u = -11058 * r - 21710 * g + 32768 * b;
return VP8ClipUV(u, rounding);
}
static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
const int v = 32768 * r - 27439 * g - 5329 * b;
return VP8ClipUV(v, rounding);
}
#endif // USE_YUVj
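The coefficients kept here (the alternative USE_YUVj set is dropped by this update) are the BT.601 limited-range transform in Q16 fixed point: with rounding = YUV_HALF they send black to 16 and white to 235. A standalone restatement of the arithmetic, mirroring VP8RGBToY with YUV_FIX = 16:
static int RGBToY_Ref(int r, int g, int b) {
  const int luma = 16839 * r + 33059 * g + 6420 * b;
  return (luma + (1 << 15) + (16 << 16)) >> 16;   /* rounding = YUV_HALF */
}
/* RGBToY_Ref(0, 0, 0)       == 16
 * RGBToY_Ref(255, 255, 255) == 235  (limited 'video' range)              */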
#ifdef __cplusplus
} // extern "C"
#endif

@ -12,11 +12,11 @@
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS32)
#include "./yuv.h"
#include "src/dsp/yuv.h"
//------------------------------------------------------------------------------
// simple point-sampling
@ -77,10 +77,10 @@ static void FUNC_NAME(const uint8_t* y, \
} \
}
ROW_FUNC(YuvToRgbRow, 3, 0, 1, 2, 0)
ROW_FUNC(YuvToRgbaRow, 4, 0, 1, 2, 3)
ROW_FUNC(YuvToBgrRow, 3, 2, 1, 0, 0)
ROW_FUNC(YuvToBgraRow, 4, 2, 1, 0, 3)
ROW_FUNC(YuvToRgbRow_MIPS32, 3, 0, 1, 2, 0)
ROW_FUNC(YuvToRgbaRow_MIPS32, 4, 0, 1, 2, 3)
ROW_FUNC(YuvToBgrRow_MIPS32, 3, 2, 1, 0, 0)
ROW_FUNC(YuvToBgraRow_MIPS32, 4, 2, 1, 0, 3)
#undef ROW_FUNC
@ -90,10 +90,10 @@ ROW_FUNC(YuvToBgraRow, 4, 2, 1, 0, 3)
extern void WebPInitSamplersMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPS32(void) {
WebPSamplers[MODE_RGB] = YuvToRgbRow;
WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
WebPSamplers[MODE_BGR] = YuvToBgrRow;
WebPSamplers[MODE_BGRA] = YuvToBgraRow;
WebPSamplers[MODE_RGB] = YuvToRgbRow_MIPS32;
WebPSamplers[MODE_RGBA] = YuvToRgbaRow_MIPS32;
WebPSamplers[MODE_BGR] = YuvToBgrRow_MIPS32;
WebPSamplers[MODE_BGRA] = YuvToBgraRow_MIPS32;
}
#else // !WEBP_USE_MIPS32

@ -12,11 +12,11 @@
// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
// Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "./yuv.h"
#include "src/dsp/yuv.h"
//------------------------------------------------------------------------------
// simple point-sampling
@ -105,10 +105,10 @@ static void FUNC_NAME(const uint8_t* y, \
} \
}
ROW_FUNC(YuvToRgbRow, 3, 0, 1, 2, 0)
ROW_FUNC(YuvToRgbaRow, 4, 0, 1, 2, 3)
ROW_FUNC(YuvToBgrRow, 3, 2, 1, 0, 0)
ROW_FUNC(YuvToBgraRow, 4, 2, 1, 0, 3)
ROW_FUNC(YuvToRgbRow_MIPSdspR2, 3, 0, 1, 2, 0)
ROW_FUNC(YuvToRgbaRow_MIPSdspR2, 4, 0, 1, 2, 3)
ROW_FUNC(YuvToBgrRow_MIPSdspR2, 3, 2, 1, 0, 0)
ROW_FUNC(YuvToBgraRow_MIPSdspR2, 4, 2, 1, 0, 3)
#undef ROW_FUNC
#undef ASM_CLOBBER_LIST
@ -121,10 +121,10 @@ ROW_FUNC(YuvToBgraRow, 4, 2, 1, 0, 3)
extern void WebPInitSamplersMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPSdspR2(void) {
WebPSamplers[MODE_RGB] = YuvToRgbRow;
WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
WebPSamplers[MODE_BGR] = YuvToBgrRow;
WebPSamplers[MODE_BGRA] = YuvToBgraRow;
WebPSamplers[MODE_RGB] = YuvToRgbRow_MIPSdspR2;
WebPSamplers[MODE_RGBA] = YuvToRgbaRow_MIPSdspR2;
WebPSamplers[MODE_BGR] = YuvToBgrRow_MIPSdspR2;
WebPSamplers[MODE_BGRA] = YuvToBgraRow_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2

@ -0,0 +1,288 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// YUV->RGB conversion functions
//
// Author: Skal (pascal.massimino@gmail.com)
#include "src/dsp/yuv.h"
#if defined(WEBP_USE_NEON)
#include <assert.h>
#include <stdlib.h>
#include "src/dsp/neon.h"
//-----------------------------------------------------------------------------
static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R,
const uint8x8_t G,
const uint8x8_t B) {
const uint16x8_t r = vmovl_u8(R);
const uint16x8_t g = vmovl_u8(G);
const uint16x8_t b = vmovl_u8(B);
const uint16x4_t r_lo = vget_low_u16(r);
const uint16x4_t r_hi = vget_high_u16(r);
const uint16x4_t g_lo = vget_low_u16(g);
const uint16x4_t g_hi = vget_high_u16(g);
const uint16x4_t b_lo = vget_low_u16(b);
const uint16x4_t b_hi = vget_high_u16(b);
const uint32x4_t tmp0_lo = vmull_n_u16( r_lo, 16839u);
const uint32x4_t tmp0_hi = vmull_n_u16( r_hi, 16839u);
const uint32x4_t tmp1_lo = vmlal_n_u16(tmp0_lo, g_lo, 33059u);
const uint32x4_t tmp1_hi = vmlal_n_u16(tmp0_hi, g_hi, 33059u);
const uint32x4_t tmp2_lo = vmlal_n_u16(tmp1_lo, b_lo, 6420u);
const uint32x4_t tmp2_hi = vmlal_n_u16(tmp1_hi, b_hi, 6420u);
const uint16x8_t Y1 = vcombine_u16(vrshrn_n_u32(tmp2_lo, 16),
vrshrn_n_u32(tmp2_hi, 16));
const uint16x8_t Y2 = vaddq_u16(Y1, vdupq_n_u16(16));
return vqmovn_u16(Y2);
}
static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) {
int i;
for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) {
const uint8x8x3_t RGB = vld3_u8(rgb);
const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[0], RGB.val[1], RGB.val[2]);
vst1_u8(y + i, Y);
}
for (; i < width; ++i, rgb += 3) { // left-over
y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
}
}
static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) {
int i;
for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) {
const uint8x8x3_t BGR = vld3_u8(bgr);
const uint8x8_t Y = ConvertRGBToY_NEON(BGR.val[2], BGR.val[1], BGR.val[0]);
vst1_u8(y + i, Y);
}
for (; i < width; ++i, bgr += 3) { // left-over
y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
}
}
static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) {
int i;
for (i = 0; i + 8 <= width; i += 8) {
const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]);
const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[2], RGB.val[1], RGB.val[0]);
vst1_u8(y + i, Y);
}
for (; i < width; ++i) { // left-over
const uint32_t p = argb[i];
y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,
YUV_HALF);
}
}
//-----------------------------------------------------------------------------
// computes: DST_s16 = [(C0 * r + C1 * g + C2 * b) >> 16] + CST
#define MULTIPLY_16b_PREAMBLE(r, g, b) \
const int16x4_t r_lo = vreinterpret_s16_u16(vget_low_u16(r)); \
const int16x4_t r_hi = vreinterpret_s16_u16(vget_high_u16(r)); \
const int16x4_t g_lo = vreinterpret_s16_u16(vget_low_u16(g)); \
const int16x4_t g_hi = vreinterpret_s16_u16(vget_high_u16(g)); \
const int16x4_t b_lo = vreinterpret_s16_u16(vget_low_u16(b)); \
const int16x4_t b_hi = vreinterpret_s16_u16(vget_high_u16(b))
#define MULTIPLY_16b(C0, C1, C2, CST, DST_s16) do { \
const int32x4_t tmp0_lo = vmull_n_s16( r_lo, C0); \
const int32x4_t tmp0_hi = vmull_n_s16( r_hi, C0); \
const int32x4_t tmp1_lo = vmlal_n_s16(tmp0_lo, g_lo, C1); \
const int32x4_t tmp1_hi = vmlal_n_s16(tmp0_hi, g_hi, C1); \
const int32x4_t tmp2_lo = vmlal_n_s16(tmp1_lo, b_lo, C2); \
const int32x4_t tmp2_hi = vmlal_n_s16(tmp1_hi, b_hi, C2); \
const int16x8_t tmp3 = vcombine_s16(vshrn_n_s32(tmp2_lo, 16), \
vshrn_n_s32(tmp2_hi, 16)); \
DST_s16 = vaddq_s16(tmp3, vdupq_n_s16(CST)); \
} while (0)
// This needs to be a macro, since (128 << SHIFT) needs to be an immediate.
#define CONVERT_RGB_TO_UV(r, g, b, SHIFT, U_DST, V_DST) do { \
MULTIPLY_16b_PREAMBLE(r, g, b); \
MULTIPLY_16b(-9719, -19081, 28800, 128 << SHIFT, U_DST); \
MULTIPLY_16b(28800, -24116, -4684, 128 << SHIFT, V_DST); \
} while (0)
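One useful property of the U/V coefficients in CONVERT_RGB_TO_UV: each set sums to zero (-9719 - 19081 + 28800 = 0 and 28800 - 24116 - 4684 = 0), so a neutral grey input contributes nothing before the 128 << SHIFT bias and lands exactly on the chroma midpoint. A scalar check of the same arithmetic at SHIFT = 0 (illustration only, not library code):
static int RGBToU_Ref(int r, int g, int b) {
  return ((-9719 * r - 19081 * g + 28800 * b) >> 16) + 128;
}
/* For any grey level x, RGBToU_Ref(x, x, x) == 128 (zero chroma). */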
static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width) {
int i;
for (i = 0; i + 8 <= width; i += 8, rgb += 4 * 8) {
const uint16x8x4_t RGB = vld4q_u16((const uint16_t*)rgb);
int16x8_t U, V;
CONVERT_RGB_TO_UV(RGB.val[0], RGB.val[1], RGB.val[2], 2, U, V);
vst1_u8(u + i, vqrshrun_n_s16(U, 2));
vst1_u8(v + i, vqrshrun_n_s16(V, 2));
}
for (; i < width; i += 1, rgb += 4) {
const int r = rgb[0], g = rgb[1], b = rgb[2];
u[i] = VP8RGBToU(r, g, b, YUV_HALF << 2);
v[i] = VP8RGBToV(r, g, b, YUV_HALF << 2);
}
}
static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v,
int src_width, int do_store) {
int i;
for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) {
const uint8x16x4_t RGB = vld4q_u8((const uint8_t*)&argb[i]);
const uint16x8_t R = vpaddlq_u8(RGB.val[2]); // pair-wise adds
const uint16x8_t G = vpaddlq_u8(RGB.val[1]);
const uint16x8_t B = vpaddlq_u8(RGB.val[0]);
int16x8_t U_tmp, V_tmp;
CONVERT_RGB_TO_UV(R, G, B, 1, U_tmp, V_tmp);
{
const uint8x8_t U = vqrshrun_n_s16(U_tmp, 1);
const uint8x8_t V = vqrshrun_n_s16(V_tmp, 1);
if (do_store) {
vst1_u8(u, U);
vst1_u8(v, V);
} else {
const uint8x8_t prev_u = vld1_u8(u);
const uint8x8_t prev_v = vld1_u8(v);
vst1_u8(u, vrhadd_u8(U, prev_u));
vst1_u8(v, vrhadd_u8(V, prev_v));
}
}
}
if (i < src_width) { // left-over
WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
}
}
//------------------------------------------------------------------------------
extern void WebPInitConvertARGBToYUVNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVNEON(void) {
WebPConvertRGB24ToY = ConvertRGB24ToY_NEON;
WebPConvertBGR24ToY = ConvertBGR24ToY_NEON;
WebPConvertARGBToY = ConvertARGBToY_NEON;
WebPConvertARGBToUV = ConvertARGBToUV_NEON;
WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_NEON;
}
//------------------------------------------------------------------------------
#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic
static uint16_t clip_y_NEON(int v) {
return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
}
static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
uint16_t* dst, int len) {
int i;
const int16x8_t zero = vdupq_n_s16(0);
const int16x8_t max = vdupq_n_s16(MAX_Y);
uint64x2_t sum = vdupq_n_u64(0);
uint64_t diff;
for (i = 0; i + 8 <= len; i += 8) {
const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
const int16x8_t D = vsubq_s16(A, B); // diff_y
const int16x8_t F = vaddq_s16(C, D); // new_y
const uint16x8_t H =
vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
const int16x8_t I = vabsq_s16(D); // abs(diff_y)
vst1q_u16(dst + i, H);
sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
}
diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
for (; i < len; ++i) {
const int diff_y = ref[i] - src[i];
const int new_y = (int)(dst[i]) + diff_y;
dst[i] = clip_y_NEON(new_y);
diff += (uint64_t)(abs(diff_y));
}
return diff;
}
static void SharpYUVUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
int16_t* dst, int len) {
int i;
for (i = 0; i + 8 <= len; i += 8) {
const int16x8_t A = vld1q_s16(ref + i);
const int16x8_t B = vld1q_s16(src + i);
const int16x8_t C = vld1q_s16(dst + i);
const int16x8_t D = vsubq_s16(A, B); // diff_uv
const int16x8_t E = vaddq_s16(C, D); // new_uv
vst1q_s16(dst + i, E);
}
for (; i < len; ++i) {
const int diff_uv = ref[i] - src[i];
dst[i] += diff_uv;
}
}
static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
const uint16_t* best_y, uint16_t* out) {
int i;
const int16x8_t max = vdupq_n_s16(MAX_Y);
const int16x8_t zero = vdupq_n_s16(0);
for (i = 0; i + 8 <= len; i += 8) {
const int16x8_t a0 = vld1q_s16(A + i + 0);
const int16x8_t a1 = vld1q_s16(A + i + 1);
const int16x8_t b0 = vld1q_s16(B + i + 0);
const int16x8_t b1 = vld1q_s16(B + i + 1);
const int16x8_t a0b1 = vaddq_s16(a0, b1);
const int16x8_t a1b0 = vaddq_s16(a1, b0);
const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0); // A0+A1+B0+B1
const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1); // 2*(A0+B1)
const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0); // 2*(A1+B0)
const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
const int16x8_t d0 = vaddq_s16(c1, a0);
const int16x8_t d1 = vaddq_s16(c0, a1);
const int16x8_t e0 = vrshrq_n_s16(d0, 1);
const int16x8_t e1 = vrshrq_n_s16(d1, 1);
const int16x8x2_t f = vzipq_s16(e0, e1);
const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
}
for (; i < len; ++i) {
const int a0b1 = A[i + 0] + B[i + 1];
const int a1b0 = A[i + 1] + B[i + 0];
const int a0a1b0b1 = a0b1 + a1b0 + 8;
const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
out[2 * i + 0] = clip_y_NEON(best_y[2 * i + 0] + v0);
out[2 * i + 1] = clip_y_NEON(best_y[2 * i + 1] + v1);
}
}
#undef MAX_Y
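The scalar tail above expands to the classic 9-3-3-1 bilinear weights: with a0b1 = A0 + B1, a1b0 = A1 + B0 and a0a1b0b1 = a0b1 + a1b0 + 8,
  v0 = (8*A0 + 2*a1b0 + a0a1b0b1) >> 4  ==  (9*A0 + 3*A1 + 3*B0 + B1 + 8) >> 4
and v1 mirrors it with A1 as the dominant tap. A direct C restatement of the weight form, producing the same value as v0 in the loop:
static int FilterTap_Ref(int A0, int A1, int B0, int B1) {
  return (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4;   /* rounded (9,3,3,1)/16 */
}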
//------------------------------------------------------------------------------
extern void WebPInitSharpYUVNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVNEON(void) {
WebPSharpYUVUpdateY = SharpYUVUpdateY_NEON;
WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_NEON;
WebPSharpYUVFilterRow = SharpYUVFilterRow_NEON;
}
#else // !WEBP_USE_NEON
WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVNEON)
WEBP_DSP_INIT_STUB(WebPInitSharpYUVNEON)
#endif // WEBP_USE_NEON
