Merge remote-tracking branch 'upstream/3.4' into merge-3.4

OpenCV FFmpeg wrapper download links are preserved from ffmpeg/master branch
pull/11756/head
Alexander Alekhin 7 years ago
commit 0d6518aaa0
Files changed (number of changed lines in parentheses):
  1. .gitattributes (2)
  2. 3rdparty/ffmpeg/ffmpeg-download.ps1.in (63)
  3. 3rdparty/ffmpeg/ffmpeg.cmake (5)
  4. 3rdparty/libwebp/src/dec/frame_dec.c (12)
  5. 3rdparty/libwebp/src/dec/vp8_dec.c (2)
  6. 3rdparty/libwebp/src/dec/vp8i_dec.h (6)
  7. 3rdparty/libwebp/src/dec/vp8l_dec.c (10)
  8. 3rdparty/libwebp/src/demux/demux.c (6)
  9. 3rdparty/libwebp/src/dsp/alpha_processing.c (29)
  10. 3rdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c (46)
  11. 3rdparty/libwebp/src/dsp/argb.c (68)
  12. 3rdparty/libwebp/src/dsp/argb_mips_dsp_r2.c (110)
  13. 3rdparty/libwebp/src/dsp/argb_sse2.c (70)
  14. 3rdparty/libwebp/src/dsp/common_sse2.h (14)
  15. 3rdparty/libwebp/src/dsp/common_sse41.h (132)
  16. 3rdparty/libwebp/src/dsp/cost.c (9)
  17. 3rdparty/libwebp/src/dsp/dec.c (9)
  18. 3rdparty/libwebp/src/dsp/dsp.h (52)
  19. 3rdparty/libwebp/src/dsp/enc.c (9)
  20. 3rdparty/libwebp/src/dsp/filters.c (9)
  21. 3rdparty/libwebp/src/dsp/lossless.c (9)
  22. 3rdparty/libwebp/src/dsp/lossless.h (4)
  23. 3rdparty/libwebp/src/dsp/lossless_enc.c (9)
  24. 3rdparty/libwebp/src/dsp/lossless_enc_sse2.c (27)
  25. 3rdparty/libwebp/src/dsp/lossless_enc_sse41.c (94)
  26. 3rdparty/libwebp/src/dsp/lossless_sse2.c (19)
  27. 3rdparty/libwebp/src/dsp/rescaler.c (7)
  28. 3rdparty/libwebp/src/dsp/rescaler_sse2.c (20)
  29. 3rdparty/libwebp/src/dsp/ssim.c (9)
  30. 3rdparty/libwebp/src/dsp/upsampling.c (30)
  31. 3rdparty/libwebp/src/dsp/upsampling_msa.c (6)
  32. 3rdparty/libwebp/src/dsp/upsampling_sse2.c (32)
  33. 3rdparty/libwebp/src/dsp/upsampling_sse41.c (239)
  34. 3rdparty/libwebp/src/dsp/yuv.c (29)
  35. 3rdparty/libwebp/src/dsp/yuv.h (13)
  36. 3rdparty/libwebp/src/dsp/yuv_sse2.c (4)
  37. 3rdparty/libwebp/src/dsp/yuv_sse41.c (613)
  38. 3rdparty/libwebp/src/enc/alpha_enc.c (5)
  39. 3rdparty/libwebp/src/enc/analysis_enc.c (6)
  40. 3rdparty/libwebp/src/enc/delta_palettization_enc.c (455)
  41. 3rdparty/libwebp/src/enc/delta_palettization_enc.h (25)
  42. 3rdparty/libwebp/src/enc/frame_enc.c (26)
  43. 3rdparty/libwebp/src/enc/histogram_enc.c (9)
  44. 3rdparty/libwebp/src/enc/histogram_enc.h (5)
  45. 3rdparty/libwebp/src/enc/iterator_enc.c (8)
  46. 3rdparty/libwebp/src/enc/near_lossless_enc.c (2)
  47. 3rdparty/libwebp/src/enc/picture_csp_enc.c (148)
  48. 3rdparty/libwebp/src/enc/picture_psnr_enc.c (15)
  49. 3rdparty/libwebp/src/enc/quant_enc.c (87)
  50. 3rdparty/libwebp/src/enc/vp8i_enc.h (16)
  51. 3rdparty/libwebp/src/enc/vp8l_enc.c (79)
  52. 3rdparty/libwebp/src/enc/webp_enc.c (9)
  53. 3rdparty/libwebp/src/mux/muxi.h (6)
  54. 3rdparty/libwebp/src/utils/endian_inl_utils.h (7)
  55. apps/CMakeLists.txt (25)
  56. apps/interactive-calibration/main.cpp (2)
  57. apps/version/CMakeLists.txt (33)
  58. apps/version/opencv_version.cpp (50)
  59. cmake/OpenCVDetectPython.cmake (21)
  60. doc/py_tutorials/py_calib3d/py_calibration/py_calibration.markdown (94)
  61. doc/py_tutorials/py_objdetect/py_face_detection/py_face_detection.markdown (6)
  62. doc/py_tutorials/py_photo/py_hdr/images/ldr_debevec.jpg (0)
  63. doc/py_tutorials/py_photo/py_hdr/py_hdr.markdown (34)
  64. doc/tutorials/imgproc/shapedescriptors/bounding_rects_circles/bounding_rects_circles.markdown (164)
  65. doc/tutorials/imgproc/shapedescriptors/bounding_rotated_ellipses/bounding_rotated_ellipses.markdown (14)
  66. doc/tutorials/imgproc/shapedescriptors/find_contours/find_contours.markdown (14)
  67. doc/tutorials/imgproc/shapedescriptors/hull/hull.markdown (15)
  68. doc/tutorials/imgproc/shapedescriptors/moments/moments.markdown (14)
  69. doc/tutorials/imgproc/shapedescriptors/point_polygon_test/point_polygon_test.markdown (14)
  70. doc/tutorials/imgproc/table_of_content_imgproc.markdown (12)
  71. doc/tutorials/introduction/java_eclipse/java_eclipse.markdown (13)
  72. doc/tutorials/objdetect/cascade_classifier/cascade_classifier.markdown (24)
  73. doc/tutorials/objdetect/table_of_content_objdetect.markdown (2)
  74. doc/tutorials/photo/hdr_imaging/hdr_imaging.markdown (190)
  75. doc/tutorials/photo/table_of_content_photo.markdown (2)
  76. modules/core/misc/java/test/CoreTest.java (10)
  77. modules/core/src/copy.cpp (7)
  78. modules/core/src/logger.cpp (8)
  79. modules/core/src/system.cpp (1)
  80. modules/core/test/test_mat.cpp (26)
  81. modules/dnn/include/opencv2/dnn/all_layers.hpp (8)
  82. modules/dnn/include/opencv2/dnn/dnn.hpp (19)
  83. modules/dnn/perf/perf_net.cpp (76)
  84. modules/dnn/src/darknet/darknet_io.cpp (3)
  85. modules/dnn/src/dnn.cpp (62)
  86. modules/dnn/src/init.cpp (2)
  87. modules/dnn/src/layers/batch_norm_layer.cpp (42)
  88. modules/dnn/src/layers/blank_layer.cpp (2)
  89. modules/dnn/src/layers/concat_layer.cpp (2)
  90. modules/dnn/src/layers/convolution_layer.cpp (45)
  91. modules/dnn/src/layers/crop_and_resize_layer.cpp (2)
  92. modules/dnn/src/layers/detection_output_layer.cpp (2)
  93. modules/dnn/src/layers/elementwise_layers.cpp (61)
  94. modules/dnn/src/layers/eltwise_layer.cpp (2)
  95. modules/dnn/src/layers/flatten_layer.cpp (2)
  96. modules/dnn/src/layers/fully_connected_layer.cpp (2)
  97. modules/dnn/src/layers/layers_common.simd.hpp (10)
  98. modules/dnn/src/layers/lrn_layer.cpp (2)
  99. modules/dnn/src/layers/max_unpooling_layer.cpp (2)
  100. modules/dnn/src/layers/normalize_bbox_layer.cpp (2)
Some files were not shown because too many files have changed in this diff.

.gitattributes

@ -81,6 +81,8 @@ org.eclipse.jdt.core.prefs -text whitespace=cr-at-eol merge=union
*.cmd text eol=crlf
*.cmd.tmpl text eol=crlf
*.dsp text eol=crlf -whitespace
*.ps1 text eol=crlf
*.ps1.in text eol=crlf
*.sln text eol=crlf -whitespace
*.vcproj text eol=crlf -whitespace merge=union
*.vcxproj text eol=crlf -whitespace merge=union

3rdparty/ffmpeg/ffmpeg-download.ps1.in (new file)
@ -0,0 +1,63 @@
$url = "https://raw.githubusercontent.com/opencv/opencv_3rdparty/@FFMPEG_BINARIES_COMMIT@/ffmpeg/opencv_ffmpeg_64.dll"
$expected_md5 = "@FFMPEG_FILE_HASH_BIN64@"
$output = "$PSScriptRoot\@OPENCV_BIN_INSTALL_PATH@\opencv_ffmpeg@OPENCV_DLLVERSION@_64.dll"
Write-Output ("=" * 120)
try {
Get-content -Path "$PSScriptRoot\etc\licenses\ffmpeg-readme.txt" -ErrorAction 'Stop'
} catch {
Write-Output "Refer to OpenCV FFmpeg wrapper readme notes about library usage / licensing details."
}
Write-Output ("=" * 120)
Write-Output ""
if(![System.IO.File]::Exists($output)) {
try {
Write-Output ("Downloading: " + $output)
Import-Module BitsTransfer
$start_time = Get-Date
Start-BitsTransfer -Source $url -Destination $output -ErrorAction 'Stop'
Write-Output "Downloaded in $((Get-Date).Subtract($start_time).Seconds) seconds"
} catch {
$_ # Dump error
try {
Write-Output ("Downloading (second attempt): " + $output)
$start_time = Get-Date
Invoke-WebRequest -Uri $url -OutFile $output
Write-Output "Downloaded in $((Get-Date).Subtract($start_time).Seconds) seconds"
} catch {
Write-Output ("Can't download file: " + $output)
Write-Output ("URL: " + $url)
Write-Output "You need to download this file manually. Stop"
Pause
Exit
}
}
} else {
Write-Output ("File exists: " + $output)
Write-Output ("Downloading is skipped. Remove this file and re-run this script to force downloading.")
}
if(![System.IO.File]::Exists($output)) {
Write-Output ("Destination file not found: " + $output)
Write-Output "Stop"
Pause
Exit
}
try {
$hash = Get-FileHash $output -Algorithm MD5 -ErrorAction 'Stop'
if($hash.Hash -eq $expected_md5) {
Write-Output "MD5 check passed"
} else {
Write-Output ("MD5 : " + $hash.Hash.toLower())
Write-Output ("Expected: " + $expected_md5)
Write-Output "MD5 hash mismatch"
}
} catch {
$_ # Dump error
Write-Output "Can't check MD5 hash (requires PowerShell 4+)"
}
Pause
Write-Output "Exit"

3rdparty/ffmpeg/ffmpeg.cmake
@ -35,3 +35,8 @@ function(download_win_ffmpeg script_var)
set(${script_var} "${FFMPEG_DOWNLOAD_DIR}/ffmpeg_version.cmake" PARENT_SCOPE)
endif()
endfunction()
if(OPENCV_INSTALL_FFMPEG_DOWNLOAD_SCRIPT)
configure_file("${CMAKE_CURRENT_LIST_DIR}/ffmpeg-download.ps1.in" "${CMAKE_BINARY_DIR}/win-install/ffmpeg-download.ps1" @ONLY)
install(FILES "${CMAKE_BINARY_DIR}/win-install/ffmpeg-download.ps1" DESTINATION "." COMPONENT libs)
endif()

3rdparty/libwebp/src/dec/frame_dec.c
@ -400,7 +400,9 @@ static void DitherRow(VP8Decoder* const dec) {
#define MACROBLOCK_VPOS(mb_y) ((mb_y) * 16) // vertical position of a MB
// Finalize and transmit a complete row. Return false in case of user-abort.
static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
static int FinishRow(void* arg1, void* arg2) {
VP8Decoder* const dec = (VP8Decoder*)arg1;
VP8Io* const io = (VP8Io*)arg2;
int ok = 1;
const VP8ThreadContext* const ctx = &dec->thread_ctx_;
const int cache_id = ctx->id_;
@ -448,10 +450,9 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
if (y_end > io->crop_bottom) {
y_end = io->crop_bottom; // make sure we don't overflow on last row.
}
// If dec->alpha_data_ is not NULL, we have some alpha plane present.
io->a = NULL;
if (dec->alpha_data_ != NULL && y_start < y_end) {
// TODO(skal): testing presence of alpha with dec->alpha_data_ is not a
// good idea.
io->a = VP8DecompressAlphaRows(dec, io, y_start, y_end - y_start);
if (io->a == NULL) {
return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
@ -558,7 +559,6 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
if (io->bypass_filtering) {
dec->filter_type_ = 0;
}
// TODO(skal): filter type / strength / sharpness forcing
// Define the area where we can skip in-loop filtering, in case of cropping.
//
@ -569,8 +569,6 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
// Means: there's a dependency chain that goes all the way up to the
// top-left corner of the picture (MB #0). We must filter all the previous
// macroblocks.
// TODO(skal): add an 'approximate_decoding' option, that won't produce
// a 1:1 bit-exactness for complex filtering?
{
const int extra_pixels = kFilterExtraRows[dec->filter_type_];
if (dec->filter_type_ == 2) {
@ -651,7 +649,7 @@ static int InitThreadContext(VP8Decoder* const dec) {
}
worker->data1 = dec;
worker->data2 = (void*)&dec->thread_ctx_.io_;
worker->hook = (WebPWorkerHook)FinishRow;
worker->hook = FinishRow;
dec->num_caches_ =
(dec->filter_type_ > 0) ? MT_CACHE_LINES : MT_CACHE_LINES - 1;
} else {

3rdparty/libwebp/src/dec/vp8_dec.c
@ -491,7 +491,7 @@ static int GetCoeffsAlt(VP8BitReader* const br,
return 16;
}
WEBP_TSAN_IGNORE_FUNCTION static void InitGetCoeffs(void) {
static WEBP_TSAN_IGNORE_FUNCTION void InitGetCoeffs(void) {
if (GetCoeffs == NULL) {
if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSlowSSSE3)) {
GetCoeffs = GetCoeffsAlt;

3rdparty/libwebp/src/dec/vp8i_dec.h
@ -30,9 +30,9 @@ extern "C" {
// Various defines and enums
// version numbers
#define DEC_MAJ_VERSION 0
#define DEC_MIN_VERSION 6
#define DEC_REV_VERSION 1
#define DEC_MAJ_VERSION 1
#define DEC_MIN_VERSION 0
#define DEC_REV_VERSION 0
// YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
// Constraints are: We need to store one 16x16 block of luma samples (y),

3rdparty/libwebp/src/dec/vp8l_dec.c
@ -1643,17 +1643,17 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
#if !defined(WEBP_REDUCE_SIZE)
if (io->use_scaling && !AllocateAndInitRescaler(dec, io)) goto Err;
if (io->use_scaling || WebPIsPremultipliedMode(dec->output_->colorspace)) {
// need the alpha-multiply functions for premultiplied output or rescaling
WebPInitAlphaProcessing();
}
#else
if (io->use_scaling) {
dec->status_ = VP8_STATUS_INVALID_PARAM;
goto Err;
}
#endif
if (io->use_scaling || WebPIsPremultipliedMode(dec->output_->colorspace)) {
// need the alpha-multiply functions for premultiplied output or rescaling
WebPInitAlphaProcessing();
}
if (!WebPIsRGBMode(dec->output_->colorspace)) {
WebPInitConvertARGBToYUV();
if (dec->output_->u.YUVA.a != NULL) WebPInitAlphaProcessing();

3rdparty/libwebp/src/demux/demux.c
@ -23,9 +23,9 @@
#include "src/webp/demux.h"
#include "src/webp/format_constants.h"
#define DMUX_MAJ_VERSION 0
#define DMUX_MIN_VERSION 3
#define DMUX_REV_VERSION 3
#define DMUX_MAJ_VERSION 1
#define DMUX_MIN_VERSION 0
#define DMUX_REV_VERSION 0
typedef struct {
size_t start_; // start location of the data

3rdparty/libwebp/src/dsp/alpha_processing.c
@ -366,6 +366,16 @@ static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
}
#ifdef WORDS_BIGENDIAN
static void PackARGB_C(const uint8_t* a, const uint8_t* r, const uint8_t* g,
const uint8_t* b, int len, uint32_t* out) {
int i;
for (i = 0; i < len; ++i) {
out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
}
}
#endif
static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out) {
int i, offset = 0;
@ -381,6 +391,10 @@ int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
#ifdef WORDS_BIGENDIAN
void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r, const uint8_t* g,
const uint8_t* b, int, uint32_t*);
#endif
void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out);
@ -395,16 +409,14 @@ extern void WebPInitAlphaProcessingSSE2(void);
extern void WebPInitAlphaProcessingSSE41(void);
extern void WebPInitAlphaProcessingNEON(void);
static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used =
(VP8CPUInfo)&alpha_processing_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
if (alpha_processing_last_cpuinfo_used == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
WebPMultARGBRow = WebPMultARGBRow_C;
WebPMultRow = WebPMultRow_C;
WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b_C;
#ifdef WORDS_BIGENDIAN
WebPPackARGB = PackARGB_C;
#endif
WebPPackRGB = PackRGB_C;
#if !WEBP_NEON_OMIT_C_CODE
WebPApplyAlphaMultiply = ApplyAlphaMultiply_C;
@ -451,9 +463,10 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
assert(WebPDispatchAlphaToGreen != NULL);
assert(WebPExtractAlpha != NULL);
assert(WebPExtractGreen != NULL);
#ifdef WORDS_BIGENDIAN
assert(WebPPackARGB != NULL);
#endif
assert(WebPPackRGB != NULL);
assert(WebPHasAlpha8b != NULL);
assert(WebPHasAlpha32b != NULL);
alpha_processing_last_cpuinfo_used = VP8GetCPUInfo;
}

3rdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c
@ -125,6 +125,49 @@ static void MultARGBRow_MIPSdspR2(uint32_t* const ptr, int width,
}
}
#ifdef WORDS_BIGENDIAN
static void PackARGB_MIPSdspR2(const uint8_t* a, const uint8_t* r,
const uint8_t* g, const uint8_t* b, int len,
uint32_t* out) {
int temp0, temp1, temp2, temp3, offset;
const int rest = len & 1;
const uint32_t* const loop_end = out + len - rest;
const int step = 4;
__asm__ volatile (
"xor %[offset], %[offset], %[offset] \n\t"
"beq %[loop_end], %[out], 0f \n\t"
"2: \n\t"
"lbux %[temp0], %[offset](%[a]) \n\t"
"lbux %[temp1], %[offset](%[r]) \n\t"
"lbux %[temp2], %[offset](%[g]) \n\t"
"lbux %[temp3], %[offset](%[b]) \n\t"
"ins %[temp1], %[temp0], 16, 16 \n\t"
"ins %[temp3], %[temp2], 16, 16 \n\t"
"addiu %[out], %[out], 4 \n\t"
"precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t"
"sw %[temp0], -4(%[out]) \n\t"
"addu %[offset], %[offset], %[step] \n\t"
"bne %[loop_end], %[out], 2b \n\t"
"0: \n\t"
"beq %[rest], $zero, 1f \n\t"
"lbux %[temp0], %[offset](%[a]) \n\t"
"lbux %[temp1], %[offset](%[r]) \n\t"
"lbux %[temp2], %[offset](%[g]) \n\t"
"lbux %[temp3], %[offset](%[b]) \n\t"
"ins %[temp1], %[temp0], 16, 16 \n\t"
"ins %[temp3], %[temp2], 16, 16 \n\t"
"precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t"
"sw %[temp0], 0(%[out]) \n\t"
"1: \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [offset]"=&r"(offset), [out]"+&r"(out)
: [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
[loop_end]"r"(loop_end), [rest]"r"(rest)
: "memory"
);
}
#endif // WORDS_BIGENDIAN
static void PackRGB_MIPSdspR2(const uint8_t* r, const uint8_t* g,
const uint8_t* b, int len, int step,
uint32_t* out) {
@ -172,6 +215,9 @@ extern void WebPInitAlphaProcessingMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingMIPSdspR2(void) {
WebPDispatchAlpha = DispatchAlpha_MIPSdspR2;
WebPMultARGBRow = MultARGBRow_MIPSdspR2;
#ifdef WORDS_BIGENDIAN
WebPPackARGB = PackARGB_MIPSdspR2;
#endif
WebPPackRGB = PackRGB_MIPSdspR2;
}

3rdparty/libwebp/src/dsp/argb.c (deleted)
@ -1,68 +0,0 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARGB making functions.
//
// Author: Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
}
static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
const uint8_t* b, int len, uint32_t* out) {
int i;
for (i = 0; i < len; ++i) {
out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
}
}
static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out) {
int i, offset = 0;
for (i = 0; i < len; ++i) {
out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
offset += step;
}
}
void (*VP8PackARGB)(const uint8_t*, const uint8_t*, const uint8_t*,
const uint8_t*, int, uint32_t*);
void (*VP8PackRGB)(const uint8_t*, const uint8_t*, const uint8_t*,
int, int, uint32_t*);
extern void VP8EncDspARGBInitMIPSdspR2(void);
extern void VP8EncDspARGBInitSSE2(void);
static volatile VP8CPUInfo argb_last_cpuinfo_used =
(VP8CPUInfo)&argb_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInit(void) {
if (argb_last_cpuinfo_used == VP8GetCPUInfo) return;
VP8PackARGB = PackARGB;
VP8PackRGB = PackRGB;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8EncDspARGBInitSSE2();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
VP8EncDspARGBInitMIPSdspR2();
}
#endif
}
argb_last_cpuinfo_used = VP8GetCPUInfo;
}

3rdparty/libwebp/src/dsp/argb_mips_dsp_r2.c (deleted)
@ -1,110 +0,0 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARGB making functions (mips version).
//
// Author: Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
const uint8_t* b, int len, uint32_t* out) {
int temp0, temp1, temp2, temp3, offset;
const int rest = len & 1;
const uint32_t* const loop_end = out + len - rest;
const int step = 4;
__asm__ volatile (
"xor %[offset], %[offset], %[offset] \n\t"
"beq %[loop_end], %[out], 0f \n\t"
"2: \n\t"
"lbux %[temp0], %[offset](%[a]) \n\t"
"lbux %[temp1], %[offset](%[r]) \n\t"
"lbux %[temp2], %[offset](%[g]) \n\t"
"lbux %[temp3], %[offset](%[b]) \n\t"
"ins %[temp1], %[temp0], 16, 16 \n\t"
"ins %[temp3], %[temp2], 16, 16 \n\t"
"addiu %[out], %[out], 4 \n\t"
"precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t"
"sw %[temp0], -4(%[out]) \n\t"
"addu %[offset], %[offset], %[step] \n\t"
"bne %[loop_end], %[out], 2b \n\t"
"0: \n\t"
"beq %[rest], $zero, 1f \n\t"
"lbux %[temp0], %[offset](%[a]) \n\t"
"lbux %[temp1], %[offset](%[r]) \n\t"
"lbux %[temp2], %[offset](%[g]) \n\t"
"lbux %[temp3], %[offset](%[b]) \n\t"
"ins %[temp1], %[temp0], 16, 16 \n\t"
"ins %[temp3], %[temp2], 16, 16 \n\t"
"precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t"
"sw %[temp0], 0(%[out]) \n\t"
"1: \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [offset]"=&r"(offset), [out]"+&r"(out)
: [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
[loop_end]"r"(loop_end), [rest]"r"(rest)
: "memory"
);
}
static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out) {
int temp0, temp1, temp2, offset;
const int rest = len & 1;
const int a = 0xff;
const uint32_t* const loop_end = out + len - rest;
__asm__ volatile (
"xor %[offset], %[offset], %[offset] \n\t"
"beq %[loop_end], %[out], 0f \n\t"
"2: \n\t"
"lbux %[temp0], %[offset](%[r]) \n\t"
"lbux %[temp1], %[offset](%[g]) \n\t"
"lbux %[temp2], %[offset](%[b]) \n\t"
"ins %[temp0], %[a], 16, 16 \n\t"
"ins %[temp2], %[temp1], 16, 16 \n\t"
"addiu %[out], %[out], 4 \n\t"
"precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
"sw %[temp0], -4(%[out]) \n\t"
"addu %[offset], %[offset], %[step] \n\t"
"bne %[loop_end], %[out], 2b \n\t"
"0: \n\t"
"beq %[rest], $zero, 1f \n\t"
"lbux %[temp0], %[offset](%[r]) \n\t"
"lbux %[temp1], %[offset](%[g]) \n\t"
"lbux %[temp2], %[offset](%[b]) \n\t"
"ins %[temp0], %[a], 16, 16 \n\t"
"ins %[temp2], %[temp1], 16, 16 \n\t"
"precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
"sw %[temp0], 0(%[out]) \n\t"
"1: \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[offset]"=&r"(offset), [out]"+&r"(out)
: [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
[loop_end]"r"(loop_end), [rest]"r"(rest)
: "memory"
);
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspARGBInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitMIPSdspR2(void) {
VP8PackARGB = PackARGB;
VP8PackRGB = PackRGB;
}
#else // !WEBP_USE_MIPS_DSP_R2
WEBP_DSP_INIT_STUB(VP8EncDspARGBInitMIPSdspR2)
#endif // WEBP_USE_MIPS_DSP_R2

3rdparty/libwebp/src/dsp/argb_sse2.c (deleted)
@ -1,70 +0,0 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARGB making functions (SSE2 version).
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <emmintrin.h>
#include <string.h>
static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
}
static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
const uint8_t* b, int len, uint32_t* out) {
if (g == r + 1) { // RGBA input order. Need to swap R and B.
int i = 0;
const int len_max = len & ~3; // max length processed in main loop
const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
assert(b == r + 2);
assert(a == r + 3);
for (; i < len_max; i += 4) {
const __m128i A = _mm_loadu_si128((const __m128i*)(r + 4 * i));
const __m128i B = _mm_and_si128(A, red_blue_mask); // R 0 B 0
const __m128i C = _mm_andnot_si128(red_blue_mask, A); // 0 G 0 A
const __m128i D = _mm_shufflelo_epi16(B, _MM_SHUFFLE(2, 3, 0, 1));
const __m128i E = _mm_shufflehi_epi16(D, _MM_SHUFFLE(2, 3, 0, 1));
const __m128i F = _mm_or_si128(E, C);
_mm_storeu_si128((__m128i*)(out + i), F);
}
for (; i < len; ++i) {
out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
}
} else {
assert(g == b + 1);
assert(r == b + 2);
assert(a == b + 3);
memcpy(out, b, len * 4);
}
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspARGBInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitSSE2(void) {
extern void (*VP8PackARGB)(const uint8_t*, const uint8_t*, const uint8_t*,
const uint8_t*, int, uint32_t*);
VP8PackARGB = PackARGB;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(VP8EncDspARGBInitSSE2)
#endif // WEBP_USE_SSE2

3rdparty/libwebp/src/dsp/common_sse2.h
@ -128,9 +128,9 @@ static WEBP_INLINE void VP8Transpose_2_4x4_16b(
// Pack the planar buffers
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
static WEBP_INLINE void VP8PlanarTo24b(__m128i* const in0, __m128i* const in1,
__m128i* const in2, __m128i* const in3,
__m128i* const in4, __m128i* const in5) {
static WEBP_INLINE void VP8PlanarTo24b_SSE2(
__m128i* const in0, __m128i* const in1, __m128i* const in2,
__m128i* const in3, __m128i* const in4, __m128i* const in5) {
// The input is 6 registers of sixteen 8b but for the sake of explanation,
// let's take 6 registers of four 8b values.
// To pack, we will keep taking one every two 8b integer and move it
@ -159,10 +159,10 @@ static WEBP_INLINE void VP8PlanarTo24b(__m128i* const in0, __m128i* const in1,
// Convert four packed four-channel buffers like argbargbargbargb... into the
// split channels aaaaa ... rrrr ... gggg .... bbbbb ......
static WEBP_INLINE void VP8L32bToPlanar(__m128i* const in0,
__m128i* const in1,
__m128i* const in2,
__m128i* const in3) {
static WEBP_INLINE void VP8L32bToPlanar_SSE2(__m128i* const in0,
__m128i* const in1,
__m128i* const in2,
__m128i* const in3) {
// Column-wise transpose.
const __m128i A0 = _mm_unpacklo_epi8(*in0, *in1);
const __m128i A1 = _mm_unpackhi_epi8(*in0, *in1);

3rdparty/libwebp/src/dsp/common_sse41.h (new file)
@ -0,0 +1,132 @@
// Copyright 2016 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE4 code common to several files.
//
// Author: Vincent Rabaud (vrabaud@google.com)
#ifndef WEBP_DSP_COMMON_SSE41_H_
#define WEBP_DSP_COMMON_SSE41_H_
#ifdef __cplusplus
extern "C" {
#endif
#if defined(WEBP_USE_SSE41)
#include <smmintrin.h>
//------------------------------------------------------------------------------
// Channel mixing.
// Shuffles the input buffer as A0 0 0 A1 0 0 A2 ...
#define WEBP_SSE41_SHUFF(OUT, IN0, IN1) \
OUT##0 = _mm_shuffle_epi8(*IN0, shuff0); \
OUT##1 = _mm_shuffle_epi8(*IN0, shuff1); \
OUT##2 = _mm_shuffle_epi8(*IN0, shuff2); \
OUT##3 = _mm_shuffle_epi8(*IN1, shuff0); \
OUT##4 = _mm_shuffle_epi8(*IN1, shuff1); \
OUT##5 = _mm_shuffle_epi8(*IN1, shuff2);
// Pack the planar buffers
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
static WEBP_INLINE void VP8PlanarTo24b_SSE41(
__m128i* const in0, __m128i* const in1, __m128i* const in2,
__m128i* const in3, __m128i* const in4, __m128i* const in5) {
__m128i R0, R1, R2, R3, R4, R5;
__m128i G0, G1, G2, G3, G4, G5;
__m128i B0, B1, B2, B3, B4, B5;
// Process R.
{
const __m128i shuff0 = _mm_set_epi8(
5, -1, -1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0);
const __m128i shuff1 = _mm_set_epi8(
-1, 10, -1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1);
const __m128i shuff2 = _mm_set_epi8(
-1, -1, 15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1);
WEBP_SSE41_SHUFF(R, in0, in1)
}
// Process G.
{
// Same as before, just shifted to the left by one and including the right
// padding.
const __m128i shuff0 = _mm_set_epi8(
-1, -1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0, -1);
const __m128i shuff1 = _mm_set_epi8(
10, -1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1, 5);
const __m128i shuff2 = _mm_set_epi8(
-1, 15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1, -1);
WEBP_SSE41_SHUFF(G, in2, in3)
}
// Process B.
{
const __m128i shuff0 = _mm_set_epi8(
-1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0, -1, -1);
const __m128i shuff1 = _mm_set_epi8(
-1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1, 5, -1);
const __m128i shuff2 = _mm_set_epi8(
15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1, -1, 10);
WEBP_SSE41_SHUFF(B, in4, in5)
}
// OR the different channels.
{
const __m128i RG0 = _mm_or_si128(R0, G0);
const __m128i RG1 = _mm_or_si128(R1, G1);
const __m128i RG2 = _mm_or_si128(R2, G2);
const __m128i RG3 = _mm_or_si128(R3, G3);
const __m128i RG4 = _mm_or_si128(R4, G4);
const __m128i RG5 = _mm_or_si128(R5, G5);
*in0 = _mm_or_si128(RG0, B0);
*in1 = _mm_or_si128(RG1, B1);
*in2 = _mm_or_si128(RG2, B2);
*in3 = _mm_or_si128(RG3, B3);
*in4 = _mm_or_si128(RG4, B4);
*in5 = _mm_or_si128(RG5, B5);
}
}
#undef WEBP_SSE41_SHUFF
// Convert four packed four-channel buffers like argbargbargbargb... into the
// split channels aaaaa ... rrrr ... gggg .... bbbbb ......
static WEBP_INLINE void VP8L32bToPlanar_SSE41(__m128i* const in0,
__m128i* const in1,
__m128i* const in2,
__m128i* const in3) {
// aaaarrrrggggbbbb
const __m128i shuff0 =
_mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
const __m128i A0 = _mm_shuffle_epi8(*in0, shuff0);
const __m128i A1 = _mm_shuffle_epi8(*in1, shuff0);
const __m128i A2 = _mm_shuffle_epi8(*in2, shuff0);
const __m128i A3 = _mm_shuffle_epi8(*in3, shuff0);
// A0A1R0R1
// G0G1B0B1
// A2A3R2R3
// G0G1B0B1
const __m128i B0 = _mm_unpacklo_epi32(A0, A1);
const __m128i B1 = _mm_unpackhi_epi32(A0, A1);
const __m128i B2 = _mm_unpacklo_epi32(A2, A3);
const __m128i B3 = _mm_unpackhi_epi32(A2, A3);
*in3 = _mm_unpacklo_epi64(B0, B2);
*in2 = _mm_unpackhi_epi64(B0, B2);
*in1 = _mm_unpacklo_epi64(B1, B3);
*in0 = _mm_unpackhi_epi64(B1, B3);
}
#endif // WEBP_USE_SSE41
#ifdef __cplusplus
} // extern "C"
#endif
#endif // WEBP_DSP_COMMON_SSE41_H_

3rdparty/libwebp/src/dsp/cost.c
@ -378,12 +378,7 @@ extern void VP8EncDspCostInitMIPS32(void);
extern void VP8EncDspCostInitMIPSdspR2(void);
extern void VP8EncDspCostInitSSE2(void);
static volatile VP8CPUInfo cost_last_cpuinfo_used =
(VP8CPUInfo)&cost_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInit(void) {
if (cost_last_cpuinfo_used == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(VP8EncDspCostInit) {
VP8GetResidualCost = GetResidualCost_C;
VP8SetResidualCoeffs = SetResidualCoeffs_C;
@ -405,8 +400,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInit(void) {
}
#endif
}
cost_last_cpuinfo_used = VP8GetCPUInfo;
}
//------------------------------------------------------------------------------

3rdparty/libwebp/src/dsp/dec.c
@ -741,12 +741,7 @@ extern void VP8DspInitMIPS32(void);
extern void VP8DspInitMIPSdspR2(void);
extern void VP8DspInitMSA(void);
static volatile VP8CPUInfo dec_last_cpuinfo_used =
(VP8CPUInfo)&dec_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
if (dec_last_cpuinfo_used == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(VP8DspInit) {
VP8InitClipTables();
#if !WEBP_NEON_OMIT_C_CODE
@ -889,6 +884,4 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
assert(VP8PredChroma8[5] != NULL);
assert(VP8PredChroma8[6] != NULL);
assert(VP8DitherCombine8x8 != NULL);
dec_last_cpuinfo_used = VP8GetCPUInfo;
}

3rdparty/libwebp/src/dsp/dsp.h
@ -141,6 +141,42 @@ extern "C" {
#endif
#endif
#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
#include <pthread.h> // NOLINT
#define WEBP_DSP_INIT(func) do { \
static volatile VP8CPUInfo func ## _last_cpuinfo_used = \
(VP8CPUInfo)&func ## _last_cpuinfo_used; \
static pthread_mutex_t func ## _lock = PTHREAD_MUTEX_INITIALIZER; \
if (pthread_mutex_lock(&func ## _lock)) break; \
if (func ## _last_cpuinfo_used != VP8GetCPUInfo) func(); \
func ## _last_cpuinfo_used = VP8GetCPUInfo; \
(void)pthread_mutex_unlock(&func ## _lock); \
} while (0)
#else // !(defined(WEBP_USE_THREAD) && !defined(_WIN32))
#define WEBP_DSP_INIT(func) do { \
static volatile VP8CPUInfo func ## _last_cpuinfo_used = \
(VP8CPUInfo)&func ## _last_cpuinfo_used; \
if (func ## _last_cpuinfo_used == VP8GetCPUInfo) break; \
func(); \
func ## _last_cpuinfo_used = VP8GetCPUInfo; \
} while (0)
#endif // defined(WEBP_USE_THREAD) && !defined(_WIN32)
// Defines an Init + helper function that control multiple initialization of
// function pointers / tables.
/* Usage:
WEBP_DSP_INIT_FUNC(InitFunc) {
...function body
}
*/
#define WEBP_DSP_INIT_FUNC(name) \
static WEBP_TSAN_IGNORE_FUNCTION void name ## _body(void); \
WEBP_TSAN_IGNORE_FUNCTION void name(void) { \
WEBP_DSP_INIT(name ## _body); \
} \
static WEBP_TSAN_IGNORE_FUNCTION void name ## _body(void)
#define WEBP_UBSAN_IGNORE_UNDEF
#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
#if defined(__clang__) && defined(__has_attribute)
@ -166,6 +202,13 @@ extern "C" {
#define WEBP_SWAP_16BIT_CSP 0
#endif
// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
#if !defined(WORDS_BIGENDIAN) && \
(defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
(defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
#define WORDS_BIGENDIAN
#endif
typedef enum {
kSSE2,
kSSE3,
@ -189,7 +232,7 @@ WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo;
// avoiding a compiler warning.
#define WEBP_DSP_INIT_STUB(func) \
extern void func(void); \
WEBP_TSAN_IGNORE_FUNCTION void func(void) {}
void func(void) {}
//------------------------------------------------------------------------------
// Encoding
@ -578,6 +621,13 @@ void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
int width, int inverse);
void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse);
#ifdef WORDS_BIGENDIAN
// ARGB packing function: a/r/g/b input is rgba or bgra order.
extern void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r,
const uint8_t* g, const uint8_t* b, int len,
uint32_t* out);
#endif
// RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
extern void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out);
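
The dsp.h hunks above add WEBP_DSP_INIT and WEBP_DSP_INIT_FUNC, which the later hunks use to replace each module's hand-rolled "<name>_last_cpuinfo_used" guard. A small self-contained sketch of the same run-once idea in plain C (not libwebp code; Init, InitBody and GetCPUInfo are placeholder names), compiled with -pthread:

#include <pthread.h>
#include <stdio.h>

typedef int (*CPUInfoFn)(void);
static int FakeCPUInfo(void) { return 1; }   /* stand-in for VP8GetCPUInfo */
static CPUInfoFn GetCPUInfo = FakeCPUInfo;

static void InitBody(void) { puts("function pointers initialized"); }

/* Same idea as WEBP_DSP_INIT(InitBody) on pthread builds: remember which
 * CPU-info hook the tables were built for and rerun the body only when it
 * changes; the mutex serializes a racy first call.  libwebp seeds the guard
 * with its own address instead of NULL so even a NULL hook differs on the
 * first call. */
static void Init(void) {
  static CPUInfoFn last_cpuinfo_used = NULL;
  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  if (pthread_mutex_lock(&lock)) return;
  if (last_cpuinfo_used != GetCPUInfo) InitBody();
  last_cpuinfo_used = GetCPUInfo;
  (void)pthread_mutex_unlock(&lock);
}

int main(void) {
  Init();   /* prints once */
  Init();   /* guard already matches GetCPUInfo, body is skipped */
  return 0;
}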

3rdparty/libwebp/src/dsp/enc.c
@ -740,12 +740,7 @@ extern void VP8EncDspInitMIPS32(void);
extern void VP8EncDspInitMIPSdspR2(void);
extern void VP8EncDspInitMSA(void);
static volatile VP8CPUInfo enc_last_cpuinfo_used =
(VP8CPUInfo)&enc_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
if (enc_last_cpuinfo_used == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
VP8DspInit(); // common inverse transforms
InitTables();
@ -838,6 +833,4 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
assert(VP8EncQuantizeBlockWHT != NULL);
assert(VP8Copy4x4 != NULL);
assert(VP8Copy16x8 != NULL);
enc_last_cpuinfo_used = VP8GetCPUInfo;
}

3rdparty/libwebp/src/dsp/filters.c
@ -238,12 +238,7 @@ extern void VP8FiltersInitMSA(void);
extern void VP8FiltersInitNEON(void);
extern void VP8FiltersInitSSE2(void);
static volatile VP8CPUInfo filters_last_cpuinfo_used =
(VP8CPUInfo)&filters_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
if (filters_last_cpuinfo_used == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
WebPUnfilters[WEBP_FILTER_NONE] = NULL;
#if !WEBP_NEON_OMIT_C_CODE
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_C;
@ -289,6 +284,4 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
assert(WebPFilters[WEBP_FILTER_HORIZONTAL] != NULL);
assert(WebPFilters[WEBP_FILTER_VERTICAL] != NULL);
assert(WebPFilters[WEBP_FILTER_GRADIENT] != NULL);
filters_last_cpuinfo_used = VP8GetCPUInfo;
}

3rdparty/libwebp/src/dsp/lossless.c
@ -577,9 +577,6 @@ extern void VP8LDspInitNEON(void);
extern void VP8LDspInitMIPSdspR2(void);
extern void VP8LDspInitMSA(void);
static volatile VP8CPUInfo lossless_last_cpuinfo_used =
(VP8CPUInfo)&lossless_last_cpuinfo_used;
#define COPY_PREDICTOR_ARRAY(IN, OUT) do { \
(OUT)[0] = IN##0_C; \
(OUT)[1] = IN##1_C; \
@ -599,9 +596,7 @@ static volatile VP8CPUInfo lossless_last_cpuinfo_used =
(OUT)[15] = IN##0_C; \
} while (0);
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
if (lossless_last_cpuinfo_used == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(VP8LDspInit) {
COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors)
COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors_C)
COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd)
@ -658,8 +653,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
assert(VP8LConvertBGRAToRGB565 != NULL);
assert(VP8LMapColor32b != NULL);
assert(VP8LMapColor8b != NULL);
lossless_last_cpuinfo_used = VP8GetCPUInfo;
}
#undef COPY_PREDICTOR_ARRAY

3rdparty/libwebp/src/dsp/lossless.h
@ -25,10 +25,6 @@
extern "C" {
#endif
#ifdef WEBP_EXPERIMENTAL_FEATURES
#include "src/enc/delta_palettization_enc.h"
#endif // WEBP_EXPERIMENTAL_FEATURES
//------------------------------------------------------------------------------
// Decoding

3rdparty/libwebp/src/dsp/lossless_enc.c
@ -863,12 +863,7 @@ extern void VP8LEncDspInitMIPS32(void);
extern void VP8LEncDspInitMIPSdspR2(void);
extern void VP8LEncDspInitMSA(void);
static volatile VP8CPUInfo lossless_enc_last_cpuinfo_used =
(VP8CPUInfo)&lossless_enc_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
if (lossless_enc_last_cpuinfo_used == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(VP8LEncDspInit) {
VP8LDspInit();
#if !WEBP_NEON_OMIT_C_CODE
@ -1011,8 +1006,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
assert(VP8LPredictorsSub_C[13] != NULL);
assert(VP8LPredictorsSub_C[14] != NULL);
assert(VP8LPredictorsSub_C[15] != NULL);
lossless_enc_last_cpuinfo_used = VP8GetCPUInfo;
}
//------------------------------------------------------------------------------

3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
@ -46,16 +46,14 @@ static void SubtractGreenFromBlueAndRed_SSE2(uint32_t* argb_data,
//------------------------------------------------------------------------------
// Color Transform
#define MK_CST_16(HI, LO) \
_mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
static void TransformColor_SSE2(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels) {
const __m128i mults_rb = _mm_set_epi16(
CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_));
const __m128i mults_b2 = _mm_set_epi16(
CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0,
CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0);
const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red_),
CST_5b(m->green_to_blue_));
const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0);
const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks
const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff); // red-blue masks
int i;
@ -85,12 +83,8 @@ static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
int histo[]) {
const __m128i mults_r = _mm_set_epi16(
CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0,
CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0);
const __m128i mults_g = _mm_set_epi16(
0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue),
0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue));
const __m128i mults_r = MK_CST_16(CST_5b(red_to_blue), 0);
const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_blue));
const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
const __m128i mask_b = _mm_set1_epi32(0x0000ff); // blue mask
int y;
@ -135,9 +129,7 @@ static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]) {
const __m128i mults_g = _mm_set_epi16(
0, CST_5b(green_to_red), 0, CST_5b(green_to_red),
0, CST_5b(green_to_red), 0, CST_5b(green_to_red));
const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_red));
const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
const __m128i mask = _mm_set1_epi32(0xff);
@ -174,6 +166,7 @@ static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
}
}
#undef SPAN
#undef MK_CST_16
//------------------------------------------------------------------------------
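
The MK_CST_16 shorthand introduced above builds a single 32-bit constant and broadcasts it, instead of spelling out all eight 16-bit lanes. A minimal standalone check of why the two forms load the same register (plain C with SSE2 intrinsics; the sample constants 3 and -5 are arbitrary, not values from the diff):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MK_CST_16(HI, LO) \
  _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))

int main(void) {
  const short hi = 3, lo = -5;  /* arbitrary stand-ins for the CST_5b(...) values */
  /* old form: list the eight 16-bit lanes explicitly, hi/lo interleaved */
  const __m128i a = _mm_set_epi16(hi, lo, hi, lo, hi, lo, hi, lo);
  /* new form: pack hi into the upper and lo into the lower half of one
   * 32-bit word, then broadcast it to all four words */
  const __m128i b = MK_CST_16(hi, lo);
  printf("%s\n", memcmp(&a, &b, sizeof(a)) == 0 ? "identical" : "different");
  return 0;
}

On 32-bit x86 this needs -msse2; on x86-64, SSE2 is always available.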

3rdparty/libwebp/src/dsp/lossless_enc_sse41.c
@ -18,6 +18,9 @@
#include <smmintrin.h>
#include "src/dsp/lossless.h"
// For sign-extended multiplying constants, pre-shifted by 5:
#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5)
//------------------------------------------------------------------------------
// Subtract-Green Transform
@ -38,6 +41,95 @@ static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
}
}
//------------------------------------------------------------------------------
// Color Transform
#define SPAN 8
static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
int histo[]) {
const __m128i mults_r = _mm_set1_epi16(CST_5b(red_to_blue));
const __m128i mults_g = _mm_set1_epi16(CST_5b(green_to_blue));
const __m128i mask_g = _mm_set1_epi16(0xff00); // green mask
const __m128i mask_gb = _mm_set1_epi32(0xffff); // green/blue mask
const __m128i mask_b = _mm_set1_epi16(0x00ff); // blue mask
const __m128i shuffler_lo = _mm_setr_epi8(-1, 2, -1, 6, -1, 10, -1, 14, -1,
-1, -1, -1, -1, -1, -1, -1);
const __m128i shuffler_hi = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1,
2, -1, 6, -1, 10, -1, 14);
int y;
for (y = 0; y < tile_height; ++y) {
const uint32_t* const src = argb + y * stride;
int i, x;
for (x = 0; x + SPAN <= tile_width; x += SPAN) {
uint16_t values[SPAN];
const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
const __m128i r0 = _mm_shuffle_epi8(in0, shuffler_lo);
const __m128i r1 = _mm_shuffle_epi8(in1, shuffler_hi);
const __m128i r = _mm_or_si128(r0, r1); // r 0
const __m128i gb0 = _mm_and_si128(in0, mask_gb);
const __m128i gb1 = _mm_and_si128(in1, mask_gb);
const __m128i gb = _mm_packus_epi32(gb0, gb1); // g b
const __m128i g = _mm_and_si128(gb, mask_g); // g 0
const __m128i A = _mm_mulhi_epi16(r, mults_r); // x dbr
const __m128i B = _mm_mulhi_epi16(g, mults_g); // x dbg
const __m128i C = _mm_sub_epi8(gb, B); // x b'
const __m128i D = _mm_sub_epi8(C, A); // x b''
const __m128i E = _mm_and_si128(D, mask_b); // 0 b''
_mm_storeu_si128((__m128i*)values, E);
for (i = 0; i < SPAN; ++i) ++histo[values[i]];
}
}
{
const int left_over = tile_width & (SPAN - 1);
if (left_over > 0) {
VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
left_over, tile_height,
green_to_blue, red_to_blue, histo);
}
}
}
static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]) {
const __m128i mults_g = _mm_set1_epi16(CST_5b(green_to_red));
const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
const __m128i mask = _mm_set1_epi16(0xff);
int y;
for (y = 0; y < tile_height; ++y) {
const uint32_t* const src = argb + y * stride;
int i, x;
for (x = 0; x + SPAN <= tile_width; x += SPAN) {
uint16_t values[SPAN];
const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
const __m128i g0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0
const __m128i g1 = _mm_and_si128(in1, mask_g);
const __m128i g = _mm_packus_epi32(g0, g1); // g 0
const __m128i A0 = _mm_srli_epi32(in0, 16); // 0 0 | x r
const __m128i A1 = _mm_srli_epi32(in1, 16);
const __m128i A = _mm_packus_epi32(A0, A1); // x r
const __m128i B = _mm_mulhi_epi16(g, mults_g); // x dr
const __m128i C = _mm_sub_epi8(A, B); // x r'
const __m128i D = _mm_and_si128(C, mask); // 0 r'
_mm_storeu_si128((__m128i*)values, D);
for (i = 0; i < SPAN; ++i) ++histo[values[i]];
}
}
{
const int left_over = tile_width & (SPAN - 1);
if (left_over > 0) {
VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
left_over, tile_height, green_to_red,
histo);
}
}
}
//------------------------------------------------------------------------------
// Entry point
@ -45,6 +137,8 @@ extern void VP8LEncDspInitSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41;
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE41;
VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE41;
}
#else // !WEBP_USE_SSE41

3rdparty/libwebp/src/dsp/lossless_sse2.c
@ -453,14 +453,11 @@ static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend
const __m128i mults_rb = _mm_set_epi16(
CST(green_to_red_), CST(green_to_blue_),
CST(green_to_red_), CST(green_to_blue_),
CST(green_to_red_), CST(green_to_blue_),
CST(green_to_red_), CST(green_to_blue_));
const __m128i mults_b2 = _mm_set_epi16(
CST(red_to_blue_), 0, CST(red_to_blue_), 0,
CST(red_to_blue_), 0, CST(red_to_blue_), 0);
#define MK_CST_16(HI, LO) \
_mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
const __m128i mults_rb = MK_CST_16(CST(green_to_red_), CST(green_to_blue_));
const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0);
#undef MK_CST_16
#undef CST
const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks
int i;
@ -503,11 +500,11 @@ static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,
__m128i in5 = _mm_loadu_si128(in + 5);
__m128i in6 = _mm_loadu_si128(in + 6);
__m128i in7 = _mm_loadu_si128(in + 7);
VP8L32bToPlanar(&in0, &in1, &in2, &in3);
VP8L32bToPlanar(&in4, &in5, &in6, &in7);
VP8L32bToPlanar_SSE2(&in0, &in1, &in2, &in3);
VP8L32bToPlanar_SSE2(&in4, &in5, &in6, &in7);
// At this points, in1/in5 contains red only, in2/in6 green only ...
// Pack the colors in 24b RGB.
VP8PlanarTo24b(&in1, &in5, &in2, &in6, &in3, &in7);
VP8PlanarTo24b_SSE2(&in1, &in5, &in2, &in6, &in3, &in7);
_mm_storeu_si128(out + 0, in1);
_mm_storeu_si128(out + 1, in5);
_mm_storeu_si128(out + 2, in2);

3rdparty/libwebp/src/dsp/rescaler.c
@ -204,11 +204,7 @@ extern void WebPRescalerDspInitMIPSdspR2(void);
extern void WebPRescalerDspInitMSA(void);
extern void WebPRescalerDspInitNEON(void);
static volatile VP8CPUInfo rescaler_last_cpuinfo_used =
(VP8CPUInfo)&rescaler_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
if (rescaler_last_cpuinfo_used == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
#if !defined(WEBP_REDUCE_SIZE)
#if !WEBP_NEON_OMIT_C_CODE
WebPRescalerExportRowExpand = WebPRescalerExportRowExpand_C;
@ -253,5 +249,4 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
assert(WebPRescalerImportRowExpand != NULL);
assert(WebPRescalerImportRowShrink != NULL);
#endif // WEBP_REDUCE_SIZE
rescaler_last_cpuinfo_used = VP8GetCPUInfo;
}

3rdparty/libwebp/src/dsp/rescaler_sse2.c
@ -36,7 +36,7 @@ static void LoadTwoPixels_SSE2(const uint8_t* const src, __m128i* out) {
}
// input: 8 bytes ABCDEFGH -> output: A0B0C0D0E0F0G0H0
static void LoadHeightPixels_SSE2(const uint8_t* const src, __m128i* out) {
static void LoadEightPixels_SSE2(const uint8_t* const src, __m128i* out) {
const __m128i zero = _mm_setzero_si128();
const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH
*out = _mm_unpacklo_epi8(A, zero);
@ -50,13 +50,15 @@ static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
int accum = x_add;
__m128i cur_pixels;
// SSE2 implementation only works with 16b signed arithmetic at max.
if (wrk->src_width < 8 || accum >= (1 << 15)) {
WebPRescalerImportRowExpand_C(wrk, src);
return;
}
assert(!WebPRescalerInputDone(wrk));
assert(wrk->x_expand);
if (wrk->num_channels == 4) {
if (wrk->src_width < 2) {
WebPRescalerImportRowExpand_C(wrk, src);
return;
}
LoadTwoPixels_SSE2(src, &cur_pixels);
src += 4;
while (1) {
@ -75,11 +77,7 @@ static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
} else {
int left;
const uint8_t* const src_limit = src + wrk->src_width - 8;
if (wrk->src_width < 8) {
WebPRescalerImportRowExpand_C(wrk, src);
return;
}
LoadHeightPixels_SSE2(src, &cur_pixels);
LoadEightPixels_SSE2(src, &cur_pixels);
src += 7;
left = 7;
while (1) {
@ -94,7 +92,7 @@ static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
if (--left) {
cur_pixels = _mm_srli_si128(cur_pixels, 2);
} else if (src <= src_limit) {
LoadHeightPixels_SSE2(src, &cur_pixels);
LoadEightPixels_SSE2(src, &cur_pixels);
src += 7;
left = 7;
} else { // tail

3rdparty/libwebp/src/dsp/ssim.c
@ -139,12 +139,7 @@ VP8AccumulateSSEFunc VP8AccumulateSSE;
extern void VP8SSIMDspInitSSE2(void);
static volatile VP8CPUInfo ssim_last_cpuinfo_used =
(VP8CPUInfo)&ssim_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(VP8SSIMDspInit) {
#if !defined(WEBP_REDUCE_SIZE)
VP8SSIMGetClipped = SSIMGetClipped_C;
VP8SSIMGet = SSIMGet_C;
@ -161,6 +156,4 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
}
#endif
}
ssim_last_cpuinfo_used = VP8GetCPUInfo;
}

3rdparty/libwebp/src/dsp/upsampling.c
@ -217,13 +217,9 @@ WebPYUV444Converter WebPYUV444Converters[MODE_LAST];
extern void WebPInitYUV444ConvertersMIPSdspR2(void);
extern void WebPInitYUV444ConvertersSSE2(void);
extern void WebPInitYUV444ConvertersSSE41(void);
static volatile VP8CPUInfo upsampling_last_cpuinfo_used1 =
(VP8CPUInfo)&upsampling_last_cpuinfo_used1;
WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) {
if (upsampling_last_cpuinfo_used1 == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(WebPInitYUV444Converters) {
WebPYUV444Converters[MODE_RGBA] = WebPYuv444ToRgba_C;
WebPYUV444Converters[MODE_BGRA] = WebPYuv444ToBgra_C;
WebPYUV444Converters[MODE_RGB] = WebPYuv444ToRgb_C;
@ -242,29 +238,29 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) {
WebPInitYUV444ConvertersSSE2();
}
#endif
#if defined(WEBP_USE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitYUV444ConvertersSSE41();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
WebPInitYUV444ConvertersMIPSdspR2();
}
#endif
}
upsampling_last_cpuinfo_used1 = VP8GetCPUInfo;
}
//------------------------------------------------------------------------------
// Main calls
extern void WebPInitUpsamplersSSE2(void);
extern void WebPInitUpsamplersSSE41(void);
extern void WebPInitUpsamplersNEON(void);
extern void WebPInitUpsamplersMIPSdspR2(void);
extern void WebPInitUpsamplersMSA(void);
static volatile VP8CPUInfo upsampling_last_cpuinfo_used2 =
(VP8CPUInfo)&upsampling_last_cpuinfo_used2;
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
if (upsampling_last_cpuinfo_used2 == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
#ifdef FANCY_UPSAMPLING
#if !WEBP_NEON_OMIT_C_CODE
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_C;
@ -287,6 +283,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
WebPInitUpsamplersSSE2();
}
#endif
#if defined(WEBP_USE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitUpsamplersSSE41();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
WebPInitUpsamplersMIPSdspR2();
@ -310,6 +311,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
assert(WebPUpsamplers[MODE_BGRA] != NULL);
assert(WebPUpsamplers[MODE_rgbA] != NULL);
assert(WebPUpsamplers[MODE_bgrA] != NULL);
#if !defined(WEBP_REDUCE_CSP) || !WEBP_NEON_OMIT_C_CODE
assert(WebPUpsamplers[MODE_RGB] != NULL);
assert(WebPUpsamplers[MODE_BGR] != NULL);
assert(WebPUpsamplers[MODE_ARGB] != NULL);
@ -317,9 +319,9 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
assert(WebPUpsamplers[MODE_RGB_565] != NULL);
assert(WebPUpsamplers[MODE_Argb] != NULL);
assert(WebPUpsamplers[MODE_rgbA_4444] != NULL);
#endif
#endif // FANCY_UPSAMPLING
upsampling_last_cpuinfo_used2 = VP8GetCPUInfo;
}
//------------------------------------------------------------------------------

3rdparty/libwebp/src/dsp/upsampling_msa.c
@ -264,6 +264,7 @@ static void YuvToBgr(int y, int u, int v, uint8_t* const bgr) {
bgr[2] = Clip8(r1 >> 6);
}
#if !defined(WEBP_REDUCE_CSP)
static void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
const int y1 = MultHi(y, 19077);
const int r1 = y1 + MultHi(v, 26149) - 14234;
@ -306,6 +307,7 @@ static void YuvToArgb(uint8_t y, uint8_t u, uint8_t v, uint8_t* const argb) {
argb[0] = 0xff;
YuvToRgb(y, u, v, argb + 1);
}
#endif // WEBP_REDUCE_CSP
static void YuvToBgra(uint8_t y, uint8_t u, uint8_t v, uint8_t* const bgra) {
YuvToBgr(y, u, v, bgra);
@ -317,6 +319,7 @@ static void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, uint8_t* const rgba) {
rgba[3] = 0xff;
}
#if !defined(WEBP_REDUCE_CSP)
static void YuvToRgbLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B;
@ -370,6 +373,7 @@ static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
memcpy(dst, temp, length * 3 * sizeof(*dst));
}
}
#endif // WEBP_REDUCE_CSP
static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
@ -427,6 +431,7 @@ static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
}
}
#if !defined(WEBP_REDUCE_CSP)
static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B;
@ -526,6 +531,7 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
memcpy(dst, temp, length * 2 * sizeof(*dst));
}
}
#endif // WEBP_REDUCE_CSP
#define UPSAMPLE_32PIXELS(a, b, c, d) do { \
v16u8 s = __msa_aver_u_b(a, d); \

3rdparty/libwebp/src/dsp/upsampling_sse2.c
@ -104,21 +104,6 @@ static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[],
Upsample32Pixels_SSE2(r1, r2, out); \
}
#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, \
top_dst, bottom_dst, cur_x, num_pixels) { \
int n; \
for (n = 0; n < (num_pixels); ++n) { \
FUNC((top_y)[(cur_x) + n], r_u[n], r_v[n], \
(top_dst) + ((cur_x) + n) * (XSTEP)); \
} \
if ((bottom_y) != NULL) { \
for (n = 0; n < (num_pixels); ++n) { \
FUNC((bottom_y)[(cur_x) + n], r_u[64 + n], r_v[64 + n], \
(bottom_dst) + ((cur_x) + n) * (XSTEP)); \
} \
} \
}
#define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, \
top_dst, bottom_dst, cur_x) do { \
FUNC##32_SSE2((top_y) + (cur_x), r_u, r_v, (top_dst) + (cur_x) * (XSTEP)); \
@ -135,7 +120,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
int uv_pos, pos; \
/* 16byte-aligned array to cache reconstructed u and v */ \
uint8_t uv_buf[4 * 32 + 15]; \
uint8_t uv_buf[14 * 32 + 15] = { 0 }; \
uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
uint8_t* const r_v = r_u + 32; \
\
@ -160,11 +145,22 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
} \
if (len > 1) { \
const int left_over = ((len + 1) >> 1) - (pos >> 1); \
uint8_t* const tmp_top_dst = r_u + 4 * 32; \
uint8_t* const tmp_bottom_dst = tmp_top_dst + 4 * 32; \
uint8_t* const tmp_top = tmp_bottom_dst + 4 * 32; \
uint8_t* const tmp_bottom = (bottom_y == NULL) ? NULL : tmp_top + 32; \
assert(left_over > 0); \
UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u); \
UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v); \
CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, \
pos, len - pos); \
memcpy(tmp_top, top_y + pos, len - pos); \
if (bottom_y != NULL) memcpy(tmp_bottom, bottom_y + pos, len - pos); \
CONVERT2RGB_32(FUNC, XSTEP, tmp_top, tmp_bottom, tmp_top_dst, \
tmp_bottom_dst, 0); \
memcpy(top_dst + pos * (XSTEP), tmp_top_dst, (len - pos) * (XSTEP)); \
if (bottom_y != NULL) { \
memcpy(bottom_dst + pos * (XSTEP), tmp_bottom_dst, \
(len - pos) * (XSTEP)); \
} \
} \
}

3rdparty/libwebp/src/dsp/upsampling_sse41.c (new file)
@ -0,0 +1,239 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE41 version of YUV to RGB upsampling functions.
//
// Author: somnath@google.com (Somnath Banerjee)
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE41)
#include <assert.h>
#include <smmintrin.h>
#include <string.h>
#include "src/dsp/yuv.h"
#ifdef FANCY_UPSAMPLING
#if !defined(WEBP_REDUCE_CSP)
// We compute (9*a + 3*b + 3*c + d + 8) / 16 as follows
// u = (9*a + 3*b + 3*c + d + 8) / 16
// = (a + (a + 3*b + 3*c + d) / 8 + 1) / 2
// = (a + m + 1) / 2
// where m = (a + 3*b + 3*c + d) / 8
// = ((a + b + c + d) / 2 + b + c) / 4
//
// Let's say k = (a + b + c + d) / 4.
// We can compute k as
// k = (s + t + 1) / 2 - ((a^d) | (b^c) | (s^t)) & 1
// where s = (a + d + 1) / 2 and t = (b + c + 1) / 2
//
// Then m can be written as
// m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1
// Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1
#define GET_M(ij, in, out) do { \
const __m128i tmp0 = _mm_avg_epu8(k, (in)); /* (k + in + 1) / 2 */ \
const __m128i tmp1 = _mm_and_si128((ij), st); /* (ij) & (s^t) */ \
const __m128i tmp2 = _mm_xor_si128(k, (in)); /* (k^in) */ \
const __m128i tmp3 = _mm_or_si128(tmp1, tmp2); /* ((ij) & (s^t)) | (k^in) */\
const __m128i tmp4 = _mm_and_si128(tmp3, one); /* & 1 -> lsb_correction */ \
(out) = _mm_sub_epi8(tmp0, tmp4); /* (k + in + 1) / 2 - lsb_correction */ \
} while (0)
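// Scalar sketch of the scheme above (illustration only, not used by the
// SIMD code): given four neighboring chroma samples a, b, c, d in [0, 255],
// it reproduces the two diagonal interpolations with the same avg() and
// lsb-correction rounding as GET_M / UPSAMPLE_32PIXELS.
static WEBP_INLINE void UpsamplePairScalarSketch(int a, int b, int c, int d,
                                                 int* const out1,
                                                 int* const out2) {
  const int s = (a + d + 1) >> 1, t = (b + c + 1) >> 1;    // pair averages
  const int k = ((s + t + 1) >> 1) -
                (((a ^ d) | (b ^ c) | (s ^ t)) & 1);       // (a+b+c+d) / 4
  const int m1 = ((k + t + 1) >> 1) -
                 ((((b ^ c) & (s ^ t)) | (k ^ t)) & 1);    // (a+3b+3c+d) / 8
  const int m2 = ((k + s + 1) >> 1) -
                 ((((a ^ d) & (s ^ t)) | (k ^ s)) & 1);    // (3a+b+c+3d) / 8
  *out1 = (a + m1 + 1) >> 1;   // (9a + 3b + 3c + d + 8) / 16
  *out2 = (b + m2 + 1) >> 1;   // (3a + 9b + c + 3d + 8) / 16
}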
// pack and store two alternating pixel rows
#define PACK_AND_STORE(a, b, da, db, out) do { \
const __m128i t_a = _mm_avg_epu8(a, da); /* (9a + 3b + 3c + d + 8) / 16 */ \
const __m128i t_b = _mm_avg_epu8(b, db); /* (3a + 9b + c + 3d + 8) / 16 */ \
const __m128i t_1 = _mm_unpacklo_epi8(t_a, t_b); \
const __m128i t_2 = _mm_unpackhi_epi8(t_a, t_b); \
_mm_store_si128(((__m128i*)(out)) + 0, t_1); \
_mm_store_si128(((__m128i*)(out)) + 1, t_2); \
} while (0)
// Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
#define UPSAMPLE_32PIXELS(r1, r2, out) { \
const __m128i one = _mm_set1_epi8(1); \
const __m128i a = _mm_loadu_si128((const __m128i*)&(r1)[0]); \
const __m128i b = _mm_loadu_si128((const __m128i*)&(r1)[1]); \
const __m128i c = _mm_loadu_si128((const __m128i*)&(r2)[0]); \
const __m128i d = _mm_loadu_si128((const __m128i*)&(r2)[1]); \
\
const __m128i s = _mm_avg_epu8(a, d); /* s = (a + d + 1) / 2 */ \
const __m128i t = _mm_avg_epu8(b, c); /* t = (b + c + 1) / 2 */ \
const __m128i st = _mm_xor_si128(s, t); /* st = s^t */ \
\
const __m128i ad = _mm_xor_si128(a, d); /* ad = a^d */ \
const __m128i bc = _mm_xor_si128(b, c); /* bc = b^c */ \
\
const __m128i t1 = _mm_or_si128(ad, bc); /* (a^d) | (b^c) */ \
const __m128i t2 = _mm_or_si128(t1, st); /* (a^d) | (b^c) | (s^t) */ \
const __m128i t3 = _mm_and_si128(t2, one); /* (a^d) | (b^c) | (s^t) & 1 */ \
const __m128i t4 = _mm_avg_epu8(s, t); \
const __m128i k = _mm_sub_epi8(t4, t3); /* k = (a + b + c + d) / 4 */ \
__m128i diag1, diag2; \
\
GET_M(bc, t, diag1); /* diag1 = (a + 3b + 3c + d) / 8 */ \
GET_M(ad, s, diag2); /* diag2 = (3a + b + c + 3d) / 8 */ \
\
/* pack the alternate pixels */ \
PACK_AND_STORE(a, b, diag1, diag2, (out) + 0); /* store top */ \
PACK_AND_STORE(c, d, diag2, diag1, (out) + 2 * 32); /* store bottom */ \
}
// Turn the macro into a function to reduce code-size when non-critical
static void Upsample32Pixels_SSE41(const uint8_t r1[], const uint8_t r2[],
uint8_t* const out) {
UPSAMPLE_32PIXELS(r1, r2, out);
}
#define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) { \
uint8_t r1[17], r2[17]; \
memcpy(r1, (tb), (num_pixels)); \
memcpy(r2, (bb), (num_pixels)); \
/* replicate last byte */ \
memset(r1 + (num_pixels), r1[(num_pixels) - 1], 17 - (num_pixels)); \
memset(r2 + (num_pixels), r2[(num_pixels) - 1], 17 - (num_pixels)); \
/* using the shared function instead of the macro saves ~3k code size */ \
Upsample32Pixels_SSE41(r1, r2, out); \
}
#define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, \
top_dst, bottom_dst, cur_x) do { \
FUNC##32_SSE41((top_y) + (cur_x), r_u, r_v, (top_dst) + (cur_x) * (XSTEP)); \
if ((bottom_y) != NULL) { \
FUNC##32_SSE41((bottom_y) + (cur_x), r_u + 64, r_v + 64, \
(bottom_dst) + (cur_x) * (XSTEP)); \
} \
} while (0)
#define SSE4_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
int uv_pos, pos; \
/* 16byte-aligned array to cache reconstructed u and v */ \
uint8_t uv_buf[14 * 32 + 15] = { 0 }; \
uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
uint8_t* const r_v = r_u + 32; \
\
assert(top_y != NULL); \
{ /* Treat the first pixel in regular way */ \
const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \
const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \
const int u0_t = (top_u[0] + u_diag) >> 1; \
const int v0_t = (top_v[0] + v_diag) >> 1; \
FUNC(top_y[0], u0_t, v0_t, top_dst); \
if (bottom_y != NULL) { \
const int u0_b = (cur_u[0] + u_diag) >> 1; \
const int v0_b = (cur_v[0] + v_diag) >> 1; \
FUNC(bottom_y[0], u0_b, v0_b, bottom_dst); \
} \
} \
/* For UPSAMPLE_32PIXELS, 17 u/v values must be readable for each block */ \
for (pos = 1, uv_pos = 0; pos + 32 + 1 <= len; pos += 32, uv_pos += 16) { \
UPSAMPLE_32PIXELS(top_u + uv_pos, cur_u + uv_pos, r_u); \
UPSAMPLE_32PIXELS(top_v + uv_pos, cur_v + uv_pos, r_v); \
CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, pos); \
} \
if (len > 1) { \
const int left_over = ((len + 1) >> 1) - (pos >> 1); \
uint8_t* const tmp_top_dst = r_u + 4 * 32; \
uint8_t* const tmp_bottom_dst = tmp_top_dst + 4 * 32; \
uint8_t* const tmp_top = tmp_bottom_dst + 4 * 32; \
uint8_t* const tmp_bottom = (bottom_y == NULL) ? NULL : tmp_top + 32; \
assert(left_over > 0); \
UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u); \
UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v); \
memcpy(tmp_top, top_y + pos, len - pos); \
if (bottom_y != NULL) memcpy(tmp_bottom, bottom_y + pos, len - pos); \
CONVERT2RGB_32(FUNC, XSTEP, tmp_top, tmp_bottom, tmp_top_dst, \
tmp_bottom_dst, 0); \
memcpy(top_dst + pos * (XSTEP), tmp_top_dst, (len - pos) * (XSTEP)); \
if (bottom_y != NULL) { \
memcpy(bottom_dst + pos * (XSTEP), tmp_bottom_dst, \
(len - pos) * (XSTEP)); \
} \
} \
}
// SSE4 variants of the fancy upsampler.
SSE4_UPSAMPLE_FUNC(UpsampleRgbLinePair_SSE41, VP8YuvToRgb, 3)
SSE4_UPSAMPLE_FUNC(UpsampleBgrLinePair_SSE41, VP8YuvToBgr, 3)
#undef GET_M
#undef PACK_AND_STORE
#undef UPSAMPLE_32PIXELS
#undef UPSAMPLE_LAST_BLOCK
#undef CONVERT2RGB
#undef CONVERT2RGB_32
#undef SSE4_UPSAMPLE_FUNC
#endif // WEBP_REDUCE_CSP
//------------------------------------------------------------------------------
// Entry point
extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
extern void WebPInitUpsamplersSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE41(void) {
#if !defined(WEBP_REDUCE_CSP)
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_SSE41;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_SSE41;
#endif // WEBP_REDUCE_CSP
}
#endif // FANCY_UPSAMPLING
//------------------------------------------------------------------------------
extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
extern void WebPInitYUV444ConvertersSSE41(void);
#define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \
extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len); \
static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
int i; \
const int max_len = len & ~31; \
for (i = 0; i < max_len; i += 32) { \
CALL(y + i, u + i, v + i, dst + i * (XSTEP)); \
} \
if (i < len) { /* C-fallback */ \
CALL_C(y + i, u + i, v + i, dst + i * (XSTEP), len - i); \
} \
}
#if !defined(WEBP_REDUCE_CSP)
YUV444_FUNC(Yuv444ToRgb_SSE41, VP8YuvToRgb32_SSE41, WebPYuv444ToRgb_C, 3);
YUV444_FUNC(Yuv444ToBgr_SSE41, VP8YuvToBgr32_SSE41, WebPYuv444ToBgr_C, 3);
#endif // WEBP_REDUCE_CSP
WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersSSE41(void) {
#if !defined(WEBP_REDUCE_CSP)
WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb_SSE41;
WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr_SSE41;
#endif // WEBP_REDUCE_CSP
}
#else
WEBP_DSP_INIT_STUB(WebPInitYUV444ConvertersSSE41)
#endif // WEBP_USE_SSE41
#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_SSE41))
WEBP_DSP_INIT_STUB(WebPInitUpsamplersSSE41)
#endif

@ -71,15 +71,11 @@ void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
WebPSamplerRowFunc WebPSamplers[MODE_LAST];
extern void WebPInitSamplersSSE2(void);
extern void WebPInitSamplersSSE41(void);
extern void WebPInitSamplersMIPS32(void);
extern void WebPInitSamplersMIPSdspR2(void);
static volatile VP8CPUInfo yuv_last_cpuinfo_used =
(VP8CPUInfo)&yuv_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) {
if (yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(WebPInitSamplers) {
WebPSamplers[MODE_RGB] = YuvToRgbRow;
WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
WebPSamplers[MODE_BGR] = YuvToBgrRow;
@ -99,6 +95,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) {
WebPInitSamplersSSE2();
}
#endif // WEBP_USE_SSE2
#if defined(WEBP_USE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitSamplersSSE41();
}
#endif // WEBP_USE_SSE41
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
WebPInitSamplersMIPS32();
@ -110,7 +111,6 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) {
}
#endif // WEBP_USE_MIPS_DSP_R2
}
yuv_last_cpuinfo_used = VP8GetCPUInfo;
}
//-----------------------------------------------------------------------------
@ -254,17 +254,13 @@ void (*WebPSharpYUVUpdateRGB)(const int16_t* ref, const int16_t* src,
void (*WebPSharpYUVFilterRow)(const int16_t* A, const int16_t* B, int len,
const uint16_t* best_y, uint16_t* out);
static volatile VP8CPUInfo rgba_to_yuv_last_cpuinfo_used =
(VP8CPUInfo)&rgba_to_yuv_last_cpuinfo_used;
extern void WebPInitConvertARGBToYUVSSE2(void);
extern void WebPInitConvertARGBToYUVSSE41(void);
extern void WebPInitConvertARGBToYUVNEON(void);
extern void WebPInitSharpYUVSSE2(void);
extern void WebPInitSharpYUVNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
if (rgba_to_yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
WebPConvertARGBToY = ConvertARGBToY_C;
WebPConvertARGBToUV = WebPConvertARGBToUV_C;
@ -286,6 +282,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
WebPInitSharpYUVSSE2();
}
#endif // WEBP_USE_SSE2
#if defined(WEBP_USE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitConvertARGBToYUVSSE41();
}
#endif // WEBP_USE_SSE41
}
#if defined(WEBP_USE_NEON)
@ -304,6 +305,4 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
assert(WebPSharpYUVUpdateY != NULL);
assert(WebPSharpYUVUpdateRGB != NULL);
assert(WebPSharpYUVFilterRow != NULL);
rgba_to_yuv_last_cpuinfo_used = VP8GetCPUInfo;
}

@ -166,6 +166,19 @@ void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
#endif // WEBP_USE_SSE2
//-----------------------------------------------------------------------------
// SSE41 extra functions (mostly for upsampling_sse41.c)
#if defined(WEBP_USE_SSE41)
// Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
#endif // WEBP_USE_SSE41
//------------------------------------------------------------------------------
// RGB -> YUV conversion

@ -180,7 +180,7 @@ static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
// Repeat the same permutations twice more:
// r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
// r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
VP8PlanarTo24b(in0, in1, in2, in3, in4, in5);
VP8PlanarTo24b_SSE2(in0, in1, in2, in3, in4, in5);
_mm_storeu_si128((__m128i*)(rgb + 0), *in0);
_mm_storeu_si128((__m128i*)(rgb + 16), *in1);
@ -492,7 +492,7 @@ static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb,
__m128i a1 = LOAD_16(argb + 4);
__m128i a2 = LOAD_16(argb + 8);
__m128i a3 = LOAD_16(argb + 12);
VP8L32bToPlanar(&a0, &a1, &a2, &a3);
VP8L32bToPlanar_SSE2(&a0, &a1, &a2, &a3);
rgb[0] = _mm_unpacklo_epi8(a1, zero);
rgb[1] = _mm_unpackhi_epi8(a1, zero);
rgb[2] = _mm_unpacklo_epi8(a2, zero);

@ -0,0 +1,613 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// YUV->RGB conversion functions
//
// Author: Skal (pascal.massimino@gmail.com)
#include "src/dsp/yuv.h"
#if defined(WEBP_USE_SSE41)
#include "src/dsp/common_sse41.h"
#include <stdlib.h>
#include <smmintrin.h>
//-----------------------------------------------------------------------------
// Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
// These constants are 14b fixed-point version of ITU-R BT.601 constants.
// R = (19077 * y + 26149 * v - 14234) >> 6
// G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6
// B = (19077 * y + 33050 * u - 17685) >> 6
static void ConvertYUV444ToRGB_SSE41(const __m128i* const Y0,
const __m128i* const U0,
const __m128i* const V0,
__m128i* const R,
__m128i* const G,
__m128i* const B) {
const __m128i k19077 = _mm_set1_epi16(19077);
const __m128i k26149 = _mm_set1_epi16(26149);
const __m128i k14234 = _mm_set1_epi16(14234);
// 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
const __m128i k33050 = _mm_set1_epi16((short)33050);
const __m128i k17685 = _mm_set1_epi16(17685);
const __m128i k6419 = _mm_set1_epi16(6419);
const __m128i k13320 = _mm_set1_epi16(13320);
const __m128i k8708 = _mm_set1_epi16(8708);
const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077);
const __m128i R0 = _mm_mulhi_epu16(*V0, k26149);
const __m128i R1 = _mm_sub_epi16(Y1, k14234);
const __m128i R2 = _mm_add_epi16(R1, R0);
const __m128i G0 = _mm_mulhi_epu16(*U0, k6419);
const __m128i G1 = _mm_mulhi_epu16(*V0, k13320);
const __m128i G2 = _mm_add_epi16(Y1, k8708);
const __m128i G3 = _mm_add_epi16(G0, G1);
const __m128i G4 = _mm_sub_epi16(G2, G3);
// be careful with the saturated *unsigned* arithmetic here!
const __m128i B0 = _mm_mulhi_epu16(*U0, k33050);
const __m128i B1 = _mm_adds_epu16(B0, Y1);
const __m128i B2 = _mm_subs_epu16(B1, k17685);
// use logical shift for B2, which can be larger than 32767
*R = _mm_srai_epi16(R2, 6); // range: [-14234, 30815]
*G = _mm_srai_epi16(G4, 6); // range: [-10953, 27710]
*B = _mm_srli_epi16(B2, 6); // range: [0, 34238]
}
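// Scalar sketch of the same fixed-point math (illustration only): the
// constants are roughly round(c * (1 << 14)) for the BT.601 coefficients
// c = 1.164, 1.596, 0.392, 0.813 and 2.017, and the inputs are pre-shifted
// left by 8, so _mm_mulhi_epu16(x << 8, k) equals (x * k) >> 8.
static WEBP_INLINE void YuvToRgbScalarSketch(int y, int u, int v,
                                             int* const r, int* const g,
                                             int* const b) {
  const int y1 = (y * 19077) >> 8;                          // ~1.164 * y
  *r = (y1 + ((v * 26149) >> 8) - 14234) >> 6;
  *g = (y1 - ((u * 6419) >> 8) - ((v * 13320) >> 8) + 8708) >> 6;
  *b = (y1 + ((u * 33050) >> 8) - 17685) >> 6;
  // The vector code additionally clamps the results to [0, 255] through
  // saturated arithmetic and _mm_packus_epi16; this sketch leaves them
  // unclamped.
}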
// Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
static WEBP_INLINE __m128i Load_HI_16_SSE41(const uint8_t* src) {
const __m128i zero = _mm_setzero_si128();
return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
}
// Load and replicate the U/V samples
static WEBP_INLINE __m128i Load_UV_HI_8_SSE41(const uint8_t* src) {
const __m128i zero = _mm_setzero_si128();
const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src);
const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
return _mm_unpacklo_epi16(tmp1, tmp1); // replicate samples
}
// Convert 32 samples of YUV444 to R/G/B
static void YUV444ToRGB_SSE41(const uint8_t* const y,
const uint8_t* const u,
const uint8_t* const v,
__m128i* const R, __m128i* const G,
__m128i* const B) {
const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_HI_16_SSE41(u),
V0 = Load_HI_16_SSE41(v);
ConvertYUV444ToRGB_SSE41(&Y0, &U0, &V0, R, G, B);
}
// Convert 32 samples of YUV420 to R/G/B
static void YUV420ToRGB_SSE41(const uint8_t* const y,
const uint8_t* const u,
const uint8_t* const v,
__m128i* const R, __m128i* const G,
__m128i* const B) {
const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_UV_HI_8_SSE41(u),
V0 = Load_UV_HI_8_SSE41(v);
ConvertYUV444ToRGB_SSE41(&Y0, &U0, &V0, R, G, B);
}
// Pack the planar buffers
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
static WEBP_INLINE void PlanarTo24b_SSE41(
__m128i* const in0, __m128i* const in1, __m128i* const in2,
__m128i* const in3, __m128i* const in4, __m128i* const in5,
uint8_t* const rgb) {
// The input is 6 registers of sixteen 8b values, but for the sake of explanation,
// let's take 6 registers of four 8b values.
// To pack, we will keep taking one every two 8b integer and move it
// around as follows:
// Input:
// r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7
// Split the 6 registers in two sets of 3 registers: the first set as the even
// 8b bytes, the second the odd ones:
// r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7
// Repeat the same permutations twice more:
// r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
// r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
VP8PlanarTo24b_SSE41(in0, in1, in2, in3, in4, in5);
_mm_storeu_si128((__m128i*)(rgb + 0), *in0);
_mm_storeu_si128((__m128i*)(rgb + 16), *in1);
_mm_storeu_si128((__m128i*)(rgb + 32), *in2);
_mm_storeu_si128((__m128i*)(rgb + 48), *in3);
_mm_storeu_si128((__m128i*)(rgb + 64), *in4);
_mm_storeu_si128((__m128i*)(rgb + 80), *in5);
}
void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
YUV444ToRGB_SSE41(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV444ToRGB_SSE41(y + 8, u + 8, v + 8, &R1, &G1, &B1);
YUV444ToRGB_SSE41(y + 16, u + 16, v + 16, &R2, &G2, &B2);
YUV444ToRGB_SSE41(y + 24, u + 24, v + 24, &R3, &G3, &B3);
// Cast to 8b and store as RRRRGGGGBBBB.
rgb0 = _mm_packus_epi16(R0, R1);
rgb1 = _mm_packus_epi16(R2, R3);
rgb2 = _mm_packus_epi16(G0, G1);
rgb3 = _mm_packus_epi16(G2, G3);
rgb4 = _mm_packus_epi16(B0, B1);
rgb5 = _mm_packus_epi16(B2, B3);
// Pack as RGBRGBRGBRGB.
PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
}
void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
YUV444ToRGB_SSE41(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV444ToRGB_SSE41(y + 8, u + 8, v + 8, &R1, &G1, &B1);
YUV444ToRGB_SSE41(y + 16, u + 16, v + 16, &R2, &G2, &B2);
YUV444ToRGB_SSE41(y + 24, u + 24, v + 24, &R3, &G3, &B3);
// Cast to 8b and store as BBBBGGGGRRRR.
bgr0 = _mm_packus_epi16(B0, B1);
bgr1 = _mm_packus_epi16(B2, B3);
bgr2 = _mm_packus_epi16(G0, G1);
bgr3 = _mm_packus_epi16(G2, G3);
bgr4 = _mm_packus_epi16(R0, R1);
bgr5 = _mm_packus_epi16(R2, R3);
// Pack as BGRBGRBGRBGR.
PlanarTo24b_SSE41(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
}
//-----------------------------------------------------------------------------
// Arbitrary-length row conversion functions
static void YuvToRgbRow_SSE41(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
int n;
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
YUV420ToRGB_SSE41(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV420ToRGB_SSE41(y + 8, u + 4, v + 4, &R1, &G1, &B1);
YUV420ToRGB_SSE41(y + 16, u + 8, v + 8, &R2, &G2, &B2);
YUV420ToRGB_SSE41(y + 24, u + 12, v + 12, &R3, &G3, &B3);
// Cast to 8b and store as RRRRGGGGBBBB.
rgb0 = _mm_packus_epi16(R0, R1);
rgb1 = _mm_packus_epi16(R2, R3);
rgb2 = _mm_packus_epi16(G0, G1);
rgb3 = _mm_packus_epi16(G2, G3);
rgb4 = _mm_packus_epi16(B0, B1);
rgb5 = _mm_packus_epi16(B2, B3);
// Pack as RGBRGBRGBRGB.
PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
y += 32;
u += 16;
v += 16;
}
for (; n < len; ++n) { // Finish off
VP8YuvToRgb(y[0], u[0], v[0], dst);
dst += 3;
y += 1;
u += (n & 1);
v += (n & 1);
}
}
static void YuvToBgrRow_SSE41(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
int n;
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
YUV420ToRGB_SSE41(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV420ToRGB_SSE41(y + 8, u + 4, v + 4, &R1, &G1, &B1);
YUV420ToRGB_SSE41(y + 16, u + 8, v + 8, &R2, &G2, &B2);
YUV420ToRGB_SSE41(y + 24, u + 12, v + 12, &R3, &G3, &B3);
// Cast to 8b and store as BBBBGGGGRRRR.
bgr0 = _mm_packus_epi16(B0, B1);
bgr1 = _mm_packus_epi16(B2, B3);
bgr2 = _mm_packus_epi16(G0, G1);
bgr3 = _mm_packus_epi16(G2, G3);
bgr4 = _mm_packus_epi16(R0, R1);
bgr5 = _mm_packus_epi16(R2, R3);
// Pack as BGRBGRBGRBGR.
PlanarTo24b_SSE41(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
y += 32;
u += 16;
v += 16;
}
for (; n < len; ++n) { // Finish off
VP8YuvToBgr(y[0], u[0], v[0], dst);
dst += 3;
y += 1;
u += (n & 1);
v += (n & 1);
}
}
//------------------------------------------------------------------------------
// Entry point
extern void WebPInitSamplersSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE41(void) {
WebPSamplers[MODE_RGB] = YuvToRgbRow_SSE41;
WebPSamplers[MODE_BGR] = YuvToBgrRow_SSE41;
}
//------------------------------------------------------------------------------
// RGB24/32 -> YUV converters
// Load eight 16b-words from *src.
#define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src))
// Store eight 16b-words into *dst.
#define STORE_16(V, dst) _mm_storeu_si128((__m128i*)(dst), (V))
#define WEBP_SSE41_SHUFF(OUT) do { \
const __m128i tmp0 = _mm_shuffle_epi8(A0, shuff0); \
const __m128i tmp1 = _mm_shuffle_epi8(A1, shuff1); \
const __m128i tmp2 = _mm_shuffle_epi8(A2, shuff2); \
const __m128i tmp3 = _mm_shuffle_epi8(A3, shuff0); \
const __m128i tmp4 = _mm_shuffle_epi8(A4, shuff1); \
const __m128i tmp5 = _mm_shuffle_epi8(A5, shuff2); \
\
/* OR everything to get one channel */ \
const __m128i tmp6 = _mm_or_si128(tmp0, tmp1); \
const __m128i tmp7 = _mm_or_si128(tmp3, tmp4); \
out[OUT + 0] = _mm_or_si128(tmp6, tmp2); \
out[OUT + 1] = _mm_or_si128(tmp7, tmp5); \
} while (0);
// Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// Similar to PlanarTo24bHelper(), but in reverse order.
static WEBP_INLINE void RGB24PackedToPlanar_SSE41(
const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
const __m128i A0 = _mm_loadu_si128((const __m128i*)(rgb + 0));
const __m128i A1 = _mm_loadu_si128((const __m128i*)(rgb + 16));
const __m128i A2 = _mm_loadu_si128((const __m128i*)(rgb + 32));
const __m128i A3 = _mm_loadu_si128((const __m128i*)(rgb + 48));
const __m128i A4 = _mm_loadu_si128((const __m128i*)(rgb + 64));
const __m128i A5 = _mm_loadu_si128((const __m128i*)(rgb + 80));
// Compute RR.
{
const __m128i shuff0 = _mm_set_epi8(
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0);
const __m128i shuff1 = _mm_set_epi8(
-1, -1, -1, -1, -1, 14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1);
const __m128i shuff2 = _mm_set_epi8(
13, 10, 7, 4, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
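// shuff0 keeps bytes 0,3,...,15 of A0 (r0..r5), shuff1 bytes 2,5,...,14 of
// A1 (r6..r10) and shuff2 bytes 1,4,...,13 of A2 (r11..r15); OR-ing the
// three shuffled registers yields 16 consecutive R values (A3..A5 feed the
// second output register with r16..r31 the same way).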
WEBP_SSE41_SHUFF(0)
}
// Compute GG.
{
const __m128i shuff0 = _mm_set_epi8(
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1);
const __m128i shuff1 = _mm_set_epi8(
-1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1);
const __m128i shuff2 = _mm_set_epi8(
14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
WEBP_SSE41_SHUFF(2)
}
// Compute BB.
{
const __m128i shuff0 = _mm_set_epi8(
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 14, 11, 8, 5, 2);
const __m128i shuff1 = _mm_set_epi8(
-1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1, -1, -1, -1, -1, -1);
const __m128i shuff2 = _mm_set_epi8(
15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
WEBP_SSE41_SHUFF(4)
}
}
#undef WEBP_SSE41_SHUFF
// Convert 8 packed ARGB to r[], g[], b[]
static WEBP_INLINE void RGB32PackedToPlanar_SSE41(
const uint32_t* const argb, __m128i* const rgb /*in[6]*/) {
const __m128i zero = _mm_setzero_si128();
__m128i a0 = LOAD_16(argb + 0);
__m128i a1 = LOAD_16(argb + 4);
__m128i a2 = LOAD_16(argb + 8);
__m128i a3 = LOAD_16(argb + 12);
VP8L32bToPlanar_SSE41(&a0, &a1, &a2, &a3);
rgb[0] = _mm_unpacklo_epi8(a1, zero);
rgb[1] = _mm_unpackhi_epi8(a1, zero);
rgb[2] = _mm_unpacklo_epi8(a2, zero);
rgb[3] = _mm_unpackhi_epi8(a2, zero);
rgb[4] = _mm_unpacklo_epi8(a3, zero);
rgb[5] = _mm_unpackhi_epi8(a3, zero);
}
// This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX
// It's a macro and not a function because we need to pass immediate values
// (e.g. DESCALE_FIX) to srai_epi32.
#define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB, \
ROUNDER, DESCALE_FIX, OUT) do { \
const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG); \
const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG); \
const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB); \
const __m128i V1_hi = _mm_madd_epi16(GB_HI, MULT_GB); \
const __m128i V2_lo = _mm_add_epi32(V0_lo, V1_lo); \
const __m128i V2_hi = _mm_add_epi32(V0_hi, V1_hi); \
const __m128i V3_lo = _mm_add_epi32(V2_lo, ROUNDER); \
const __m128i V3_hi = _mm_add_epi32(V2_hi, ROUNDER); \
const __m128i V5_lo = _mm_srai_epi32(V3_lo, DESCALE_FIX); \
const __m128i V5_hi = _mm_srai_epi32(V3_hi, DESCALE_FIX); \
(OUT) = _mm_packs_epi32(V5_lo, V5_hi); \
} while (0)
#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
static WEBP_INLINE void ConvertRGBToY_SSE41(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
__m128i* const Y) {
const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
const __m128i kGB_y = MK_CST_16(16384, 6420);
const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);
const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
}
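// In scalar terms: Y = (16839 * R + 33059 * G + 6420 * B + (16 << YUV_FIX) +
// YUV_HALF) >> YUV_FIX, i.e. the usual BT.601 luma 0.257 R + 0.504 G +
// 0.098 B + 16. The G coefficient is split as (33059 - 16384) + 16384 across
// the two madd constants so that each 16-bit multiplicand stays within the
// signed range expected by _mm_madd_epi16.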
static WEBP_INLINE void ConvertRGBToUV_SSE41(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
__m128i* const U,
__m128i* const V) {
const __m128i kRG_u = MK_CST_16(-9719, -19081);
const __m128i kGB_u = MK_CST_16(0, 28800);
const __m128i kRG_v = MK_CST_16(28800, 0);
const __m128i kGB_v = MK_CST_16(-24116, -4684);
const __m128i kHALF_UV = _mm_set1_epi32(((128 << YUV_FIX) + YUV_HALF) << 2);
const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u,
kHALF_UV, YUV_FIX + 2, *U);
TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_v, kGB_v,
kHALF_UV, YUV_FIX + 2, *V);
}
#undef MK_CST_16
#undef TRANSFORM
static void ConvertRGB24ToY_SSE41(const uint8_t* rgb, uint8_t* y, int width) {
const int max_width = width & ~31;
int i;
for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
__m128i rgb_plane[6];
int j;
RGB24PackedToPlanar_SSE41(rgb, rgb_plane);
for (j = 0; j < 2; ++j, i += 16) {
const __m128i zero = _mm_setzero_si128();
__m128i r, g, b, Y0, Y1;
// Convert to 16-bit Y.
r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero);
g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero);
ConvertRGBToY_SSE41(&r, &g, &b, &Y0);
// Convert to 16-bit Y.
r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero);
g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero);
ConvertRGBToY_SSE41(&r, &g, &b, &Y1);
// Cast to 8-bit and store.
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
}
}
for (; i < width; ++i, rgb += 3) { // left-over
y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
}
}
static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) {
const int max_width = width & ~31;
int i;
for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
__m128i bgr_plane[6];
int j;
RGB24PackedToPlanar_SSE41(bgr, bgr_plane);
for (j = 0; j < 2; ++j, i += 16) {
const __m128i zero = _mm_setzero_si128();
__m128i r, g, b, Y0, Y1;
// Convert to 16-bit Y.
b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero);
g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero);
r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero);
ConvertRGBToY_SSE41(&r, &g, &b, &Y0);
// Convert to 16-bit Y.
b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero);
g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero);
r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero);
ConvertRGBToY_SSE41(&r, &g, &b, &Y1);
// Cast to 8-bit and store.
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
}
}
for (; i < width; ++i, bgr += 3) { // left-over
y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
}
}
static void ConvertARGBToY_SSE41(const uint32_t* argb, uint8_t* y, int width) {
const int max_width = width & ~15;
int i;
for (i = 0; i < max_width; i += 16) {
__m128i Y0, Y1, rgb[6];
RGB32PackedToPlanar_SSE41(&argb[i], rgb);
ConvertRGBToY_SSE41(&rgb[0], &rgb[2], &rgb[4], &Y0);
ConvertRGBToY_SSE41(&rgb[1], &rgb[3], &rgb[5], &Y1);
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
}
for (; i < width; ++i) { // left-over
const uint32_t p = argb[i];
y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,
YUV_HALF);
}
}
// Horizontal add (doubled) of two 16b values, result is 16b.
// in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
static void HorizontalAddPack_SSE41(const __m128i* const A,
const __m128i* const B,
__m128i* const out) {
const __m128i k2 = _mm_set1_epi16(2);
const __m128i C = _mm_madd_epi16(*A, k2);
const __m128i D = _mm_madd_epi16(*B, k2);
*out = _mm_packs_epi32(C, D);
}
static void ConvertARGBToUV_SSE41(const uint32_t* argb,
uint8_t* u, uint8_t* v,
int src_width, int do_store) {
const int max_width = src_width & ~31;
int i;
for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
__m128i rgb[6], U0, V0, U1, V1;
RGB32PackedToPlanar_SSE41(&argb[i], rgb);
HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]);
HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]);
HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]);
ConvertRGBToUV_SSE41(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
RGB32PackedToPlanar_SSE41(&argb[i + 16], rgb);
HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]);
HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]);
HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]);
ConvertRGBToUV_SSE41(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
U0 = _mm_packus_epi16(U0, U1);
V0 = _mm_packus_epi16(V0, V1);
if (!do_store) {
const __m128i prev_u = LOAD_16(u);
const __m128i prev_v = LOAD_16(v);
U0 = _mm_avg_epu8(U0, prev_u);
V0 = _mm_avg_epu8(V0, prev_v);
}
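// When do_store is 0 the result is averaged with the chroma written on the
// previous row's pass, i.e. the vertical half of the 4:2:0 downsampling.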
STORE_16(U0, u);
STORE_16(V0, v);
}
if (i < src_width) { // left-over
WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
}
}
// Convert 16 packed ARGB 16b-values to r[], g[], b[]
static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41(
const uint16_t* const rgbx,
__m128i* const r, __m128i* const g, __m128i* const b) {
const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x
const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x
const __m128i in2 = LOAD_16(rgbx + 16); // r4 | ...
const __m128i in3 = LOAD_16(rgbx + 24); // r6 | ...
// aarrggbb as 16-bit.
const __m128i shuff0 =
_mm_set_epi8(-1, -1, -1, -1, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
const __m128i shuff1 =
_mm_set_epi8(13, 12, 5, 4, -1, -1, -1, -1, 11, 10, 3, 2, 9, 8, 1, 0);
const __m128i A0 = _mm_shuffle_epi8(in0, shuff0);
const __m128i A1 = _mm_shuffle_epi8(in1, shuff1);
const __m128i A2 = _mm_shuffle_epi8(in2, shuff0);
const __m128i A3 = _mm_shuffle_epi8(in3, shuff1);
// R0R1G0G1
// B0B1****
// R2R3G2G3
// B2B3****
// (OR is used to free port 5 for the unpack)
const __m128i B0 = _mm_unpacklo_epi32(A0, A1);
const __m128i B1 = _mm_or_si128(A0, A1);
const __m128i B2 = _mm_unpacklo_epi32(A2, A3);
const __m128i B3 = _mm_or_si128(A2, A3);
// Gather the channels.
*r = _mm_unpacklo_epi64(B0, B2);
*g = _mm_unpackhi_epi64(B0, B2);
*b = _mm_unpackhi_epi64(B1, B3);
}
static void ConvertRGBA32ToUV_SSE41(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width) {
const int max_width = width & ~15;
const uint16_t* const last_rgb = rgb + 4 * max_width;
while (rgb < last_rgb) {
__m128i r, g, b, U0, V0, U1, V1;
RGBA32PackedToPlanar_16b_SSE41(rgb + 0, &r, &g, &b);
ConvertRGBToUV_SSE41(&r, &g, &b, &U0, &V0);
RGBA32PackedToPlanar_16b_SSE41(rgb + 32, &r, &g, &b);
ConvertRGBToUV_SSE41(&r, &g, &b, &U1, &V1);
STORE_16(_mm_packus_epi16(U0, U1), u);
STORE_16(_mm_packus_epi16(V0, V1), v);
u += 16;
v += 16;
rgb += 2 * 32;
}
if (max_width < width) { // left-over
WebPConvertRGBA32ToUV_C(rgb, u, v, width - max_width);
}
}
//------------------------------------------------------------------------------
extern void WebPInitConvertARGBToYUVSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE41(void) {
WebPConvertARGBToY = ConvertARGBToY_SSE41;
WebPConvertARGBToUV = ConvertARGBToUV_SSE41;
WebPConvertRGB24ToY = ConvertRGB24ToY_SSE41;
WebPConvertBGR24ToY = ConvertBGR24ToY_SSE41;
WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE41;
}
//------------------------------------------------------------------------------
#else // !WEBP_USE_SSE41
WEBP_DSP_INIT_STUB(WebPInitSamplersSSE41)
WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE41)
#endif // WEBP_USE_SSE41

@ -361,7 +361,8 @@ static int EncodeAlpha(VP8Encoder* const enc,
//------------------------------------------------------------------------------
// Main calls
static int CompressAlphaJob(VP8Encoder* const enc, void* dummy) {
static int CompressAlphaJob(void* arg1, void* dummy) {
VP8Encoder* const enc = (VP8Encoder*)arg1;
const WebPConfig* config = enc->config_;
uint8_t* alpha_data = NULL;
size_t alpha_size = 0;
@ -394,7 +395,7 @@ void VP8EncInitAlpha(VP8Encoder* const enc) {
WebPGetWorkerInterface()->Init(worker);
worker->data1 = enc;
worker->data2 = NULL;
worker->hook = (WebPWorkerHook)CompressAlphaJob;
worker->hook = CompressAlphaJob;
}
}

@ -434,7 +434,9 @@ typedef struct {
} SegmentJob;
// main work call
static int DoSegmentsJob(SegmentJob* const job, VP8EncIterator* const it) {
static int DoSegmentsJob(void* arg1, void* arg2) {
SegmentJob* const job = (SegmentJob*)arg1;
VP8EncIterator* const it = (VP8EncIterator*)arg2;
int ok = 1;
if (!VP8IteratorIsDone(it)) {
uint8_t tmp[32 + WEBP_ALIGN_CST];
@ -462,7 +464,7 @@ static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job,
WebPGetWorkerInterface()->Init(&job->worker);
job->worker.data1 = job;
job->worker.data2 = &job->it;
job->worker.hook = (WebPWorkerHook)DoSegmentsJob;
job->worker.hook = DoSegmentsJob;
VP8IteratorInit(enc, &job->it);
VP8IteratorSetRow(&job->it, start_row);
VP8IteratorSetCountDown(&job->it, (end_row - start_row) * enc->mb_w_);

@ -1,455 +0,0 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Author: Mislav Bradac (mislavm@google.com)
//
#include "src/enc/delta_palettization_enc.h"
#ifdef WEBP_EXPERIMENTAL_FEATURES
#include "src/webp/types.h"
#include "src/dsp/lossless.h"
#define MK_COL(r, g, b) (((r) << 16) + ((g) << 8) + (b))
// Format allows palette up to 256 entries, but more palette entries produce
// bigger entropy. In the future it will probably be useful to add more entries
// that are far from the origin of the palette or choose remaining entries
// dynamically.
#define DELTA_PALETTE_SIZE 226
// Palette used for delta_palettization. Entries are roughly sorted by distance
// of their signed equivalents from the origin.
static const uint32_t kDeltaPalette[DELTA_PALETTE_SIZE] = {
MK_COL(0u, 0u, 0u),
MK_COL(255u, 255u, 255u),
MK_COL(1u, 1u, 1u),
MK_COL(254u, 254u, 254u),
MK_COL(2u, 2u, 2u),
MK_COL(4u, 4u, 4u),
MK_COL(252u, 252u, 252u),
MK_COL(250u, 0u, 0u),
MK_COL(0u, 250u, 0u),
MK_COL(0u, 0u, 250u),
MK_COL(6u, 0u, 0u),
MK_COL(0u, 6u, 0u),
MK_COL(0u, 0u, 6u),
MK_COL(0u, 0u, 248u),
MK_COL(0u, 0u, 8u),
MK_COL(0u, 248u, 0u),
MK_COL(0u, 248u, 248u),
MK_COL(0u, 248u, 8u),
MK_COL(0u, 8u, 0u),
MK_COL(0u, 8u, 248u),
MK_COL(0u, 8u, 8u),
MK_COL(8u, 8u, 8u),
MK_COL(248u, 0u, 0u),
MK_COL(248u, 0u, 248u),
MK_COL(248u, 0u, 8u),
MK_COL(248u, 248u, 0u),
MK_COL(248u, 8u, 0u),
MK_COL(8u, 0u, 0u),
MK_COL(8u, 0u, 248u),
MK_COL(8u, 0u, 8u),
MK_COL(8u, 248u, 0u),
MK_COL(8u, 8u, 0u),
MK_COL(23u, 23u, 23u),
MK_COL(13u, 13u, 13u),
MK_COL(232u, 232u, 232u),
MK_COL(244u, 244u, 244u),
MK_COL(245u, 245u, 250u),
MK_COL(50u, 50u, 50u),
MK_COL(204u, 204u, 204u),
MK_COL(236u, 236u, 236u),
MK_COL(16u, 16u, 16u),
MK_COL(240u, 16u, 16u),
MK_COL(16u, 240u, 16u),
MK_COL(240u, 240u, 16u),
MK_COL(16u, 16u, 240u),
MK_COL(240u, 16u, 240u),
MK_COL(16u, 240u, 240u),
MK_COL(240u, 240u, 240u),
MK_COL(0u, 0u, 232u),
MK_COL(0u, 232u, 0u),
MK_COL(232u, 0u, 0u),
MK_COL(0u, 0u, 24u),
MK_COL(0u, 24u, 0u),
MK_COL(24u, 0u, 0u),
MK_COL(32u, 32u, 32u),
MK_COL(224u, 32u, 32u),
MK_COL(32u, 224u, 32u),
MK_COL(224u, 224u, 32u),
MK_COL(32u, 32u, 224u),
MK_COL(224u, 32u, 224u),
MK_COL(32u, 224u, 224u),
MK_COL(224u, 224u, 224u),
MK_COL(0u, 0u, 176u),
MK_COL(0u, 0u, 80u),
MK_COL(0u, 176u, 0u),
MK_COL(0u, 176u, 176u),
MK_COL(0u, 176u, 80u),
MK_COL(0u, 80u, 0u),
MK_COL(0u, 80u, 176u),
MK_COL(0u, 80u, 80u),
MK_COL(176u, 0u, 0u),
MK_COL(176u, 0u, 176u),
MK_COL(176u, 0u, 80u),
MK_COL(176u, 176u, 0u),
MK_COL(176u, 80u, 0u),
MK_COL(80u, 0u, 0u),
MK_COL(80u, 0u, 176u),
MK_COL(80u, 0u, 80u),
MK_COL(80u, 176u, 0u),
MK_COL(80u, 80u, 0u),
MK_COL(0u, 0u, 152u),
MK_COL(0u, 0u, 104u),
MK_COL(0u, 152u, 0u),
MK_COL(0u, 152u, 152u),
MK_COL(0u, 152u, 104u),
MK_COL(0u, 104u, 0u),
MK_COL(0u, 104u, 152u),
MK_COL(0u, 104u, 104u),
MK_COL(152u, 0u, 0u),
MK_COL(152u, 0u, 152u),
MK_COL(152u, 0u, 104u),
MK_COL(152u, 152u, 0u),
MK_COL(152u, 104u, 0u),
MK_COL(104u, 0u, 0u),
MK_COL(104u, 0u, 152u),
MK_COL(104u, 0u, 104u),
MK_COL(104u, 152u, 0u),
MK_COL(104u, 104u, 0u),
MK_COL(216u, 216u, 216u),
MK_COL(216u, 216u, 40u),
MK_COL(216u, 216u, 176u),
MK_COL(216u, 216u, 80u),
MK_COL(216u, 40u, 216u),
MK_COL(216u, 40u, 40u),
MK_COL(216u, 40u, 176u),
MK_COL(216u, 40u, 80u),
MK_COL(216u, 176u, 216u),
MK_COL(216u, 176u, 40u),
MK_COL(216u, 176u, 176u),
MK_COL(216u, 176u, 80u),
MK_COL(216u, 80u, 216u),
MK_COL(216u, 80u, 40u),
MK_COL(216u, 80u, 176u),
MK_COL(216u, 80u, 80u),
MK_COL(40u, 216u, 216u),
MK_COL(40u, 216u, 40u),
MK_COL(40u, 216u, 176u),
MK_COL(40u, 216u, 80u),
MK_COL(40u, 40u, 216u),
MK_COL(40u, 40u, 40u),
MK_COL(40u, 40u, 176u),
MK_COL(40u, 40u, 80u),
MK_COL(40u, 176u, 216u),
MK_COL(40u, 176u, 40u),
MK_COL(40u, 176u, 176u),
MK_COL(40u, 176u, 80u),
MK_COL(40u, 80u, 216u),
MK_COL(40u, 80u, 40u),
MK_COL(40u, 80u, 176u),
MK_COL(40u, 80u, 80u),
MK_COL(80u, 216u, 216u),
MK_COL(80u, 216u, 40u),
MK_COL(80u, 216u, 176u),
MK_COL(80u, 216u, 80u),
MK_COL(80u, 40u, 216u),
MK_COL(80u, 40u, 40u),
MK_COL(80u, 40u, 176u),
MK_COL(80u, 40u, 80u),
MK_COL(80u, 176u, 216u),
MK_COL(80u, 176u, 40u),
MK_COL(80u, 176u, 176u),
MK_COL(80u, 176u, 80u),
MK_COL(80u, 80u, 216u),
MK_COL(80u, 80u, 40u),
MK_COL(80u, 80u, 176u),
MK_COL(80u, 80u, 80u),
MK_COL(0u, 0u, 192u),
MK_COL(0u, 0u, 64u),
MK_COL(0u, 0u, 128u),
MK_COL(0u, 192u, 0u),
MK_COL(0u, 192u, 192u),
MK_COL(0u, 192u, 64u),
MK_COL(0u, 192u, 128u),
MK_COL(0u, 64u, 0u),
MK_COL(0u, 64u, 192u),
MK_COL(0u, 64u, 64u),
MK_COL(0u, 64u, 128u),
MK_COL(0u, 128u, 0u),
MK_COL(0u, 128u, 192u),
MK_COL(0u, 128u, 64u),
MK_COL(0u, 128u, 128u),
MK_COL(176u, 216u, 216u),
MK_COL(176u, 216u, 40u),
MK_COL(176u, 216u, 176u),
MK_COL(176u, 216u, 80u),
MK_COL(176u, 40u, 216u),
MK_COL(176u, 40u, 40u),
MK_COL(176u, 40u, 176u),
MK_COL(176u, 40u, 80u),
MK_COL(176u, 176u, 216u),
MK_COL(176u, 176u, 40u),
MK_COL(176u, 176u, 176u),
MK_COL(176u, 176u, 80u),
MK_COL(176u, 80u, 216u),
MK_COL(176u, 80u, 40u),
MK_COL(176u, 80u, 176u),
MK_COL(176u, 80u, 80u),
MK_COL(192u, 0u, 0u),
MK_COL(192u, 0u, 192u),
MK_COL(192u, 0u, 64u),
MK_COL(192u, 0u, 128u),
MK_COL(192u, 192u, 0u),
MK_COL(192u, 192u, 192u),
MK_COL(192u, 192u, 64u),
MK_COL(192u, 192u, 128u),
MK_COL(192u, 64u, 0u),
MK_COL(192u, 64u, 192u),
MK_COL(192u, 64u, 64u),
MK_COL(192u, 64u, 128u),
MK_COL(192u, 128u, 0u),
MK_COL(192u, 128u, 192u),
MK_COL(192u, 128u, 64u),
MK_COL(192u, 128u, 128u),
MK_COL(64u, 0u, 0u),
MK_COL(64u, 0u, 192u),
MK_COL(64u, 0u, 64u),
MK_COL(64u, 0u, 128u),
MK_COL(64u, 192u, 0u),
MK_COL(64u, 192u, 192u),
MK_COL(64u, 192u, 64u),
MK_COL(64u, 192u, 128u),
MK_COL(64u, 64u, 0u),
MK_COL(64u, 64u, 192u),
MK_COL(64u, 64u, 64u),
MK_COL(64u, 64u, 128u),
MK_COL(64u, 128u, 0u),
MK_COL(64u, 128u, 192u),
MK_COL(64u, 128u, 64u),
MK_COL(64u, 128u, 128u),
MK_COL(128u, 0u, 0u),
MK_COL(128u, 0u, 192u),
MK_COL(128u, 0u, 64u),
MK_COL(128u, 0u, 128u),
MK_COL(128u, 192u, 0u),
MK_COL(128u, 192u, 192u),
MK_COL(128u, 192u, 64u),
MK_COL(128u, 192u, 128u),
MK_COL(128u, 64u, 0u),
MK_COL(128u, 64u, 192u),
MK_COL(128u, 64u, 64u),
MK_COL(128u, 64u, 128u),
MK_COL(128u, 128u, 0u),
MK_COL(128u, 128u, 192u),
MK_COL(128u, 128u, 64u),
MK_COL(128u, 128u, 128u),
};
#undef MK_COL
//------------------------------------------------------------------------------
// TODO(skal): move the functions to dsp/lossless.c when the correct
// granularity is found. For now, we'll just copy-paste some useful bits
// here instead.
// In-place sum of each component with mod 256.
static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) {
const uint32_t alpha_and_green = (*a & 0xff00ff00u) + (b & 0xff00ff00u);
const uint32_t red_and_blue = (*a & 0x00ff00ffu) + (b & 0x00ff00ffu);
*a = (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
}
static WEBP_INLINE uint32_t Clip255(uint32_t a) {
if (a < 256) {
return a;
}
// return 0, when a is a negative integer.
// return 255, when a is positive.
return ~a >> 24;
}
// Delta palettization functions.
static WEBP_INLINE int Square(int x) {
return x * x;
}
static WEBP_INLINE uint32_t Intensity(uint32_t a) {
return
30 * ((a >> 16) & 0xff) +
59 * ((a >> 8) & 0xff) +
11 * ((a >> 0) & 0xff);
}
static uint32_t CalcDist(uint32_t predicted_value, uint32_t actual_value,
uint32_t palette_entry) {
int i;
uint32_t distance = 0;
AddPixelsEq(&predicted_value, palette_entry);
for (i = 0; i < 32; i += 8) {
const int32_t av = (actual_value >> i) & 0xff;
const int32_t pv = (predicted_value >> i) & 0xff;
distance += Square(pv - av);
}
// We sum square of intensity difference with factor 10, but because Intensity
// returns 100 times real intensity we need to multiply differences of colors
// by 1000.
distance *= 1000u;
distance += Square(Intensity(predicted_value)
- Intensity(actual_value));
return distance;
}
static uint32_t Predict(int x, int y, uint32_t* image) {
const uint32_t t = (y == 0) ? ARGB_BLACK : image[x];
const uint32_t l = (x == 0) ? ARGB_BLACK : image[x - 1];
const uint32_t p =
(((((t >> 24) & 0xff) + ((l >> 24) & 0xff)) / 2) << 24) +
(((((t >> 16) & 0xff) + ((l >> 16) & 0xff)) / 2) << 16) +
(((((t >> 8) & 0xff) + ((l >> 8) & 0xff)) / 2) << 8) +
(((((t >> 0) & 0xff) + ((l >> 0) & 0xff)) / 2) << 0);
if (x == 0 && y == 0) return ARGB_BLACK;
if (x == 0) return t;
if (y == 0) return l;
return p;
}
static WEBP_INLINE int AddSubtractComponentFullWithCoefficient(
int a, int b, int c) {
return Clip255(a + ((b - c) >> 2));
}
static WEBP_INLINE uint32_t ClampedAddSubtractFullWithCoefficient(
uint32_t c0, uint32_t c1, uint32_t c2) {
const int a = AddSubtractComponentFullWithCoefficient(
c0 >> 24, c1 >> 24, c2 >> 24);
const int r = AddSubtractComponentFullWithCoefficient((c0 >> 16) & 0xff,
(c1 >> 16) & 0xff,
(c2 >> 16) & 0xff);
const int g = AddSubtractComponentFullWithCoefficient((c0 >> 8) & 0xff,
(c1 >> 8) & 0xff,
(c2 >> 8) & 0xff);
const int b = AddSubtractComponentFullWithCoefficient(
c0 & 0xff, c1 & 0xff, c2 & 0xff);
return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
}
//------------------------------------------------------------------------------
// Find palette entry with minimum error from difference of actual pixel value
// and predicted pixel value. Propagate error of pixel to its top and left pixel
// in src array. Write predicted_value + palette_entry to new_image. Return
// index of best palette entry.
static int FindBestPaletteEntry(uint32_t src, uint32_t predicted_value,
const uint32_t palette[], int palette_size) {
int i;
int idx = 0;
uint32_t best_distance = CalcDist(predicted_value, src, palette[0]);
for (i = 1; i < palette_size; ++i) {
const uint32_t distance = CalcDist(predicted_value, src, palette[i]);
if (distance < best_distance) {
best_distance = distance;
idx = i;
}
}
return idx;
}
static void ApplyBestPaletteEntry(int x, int y,
uint32_t new_value, uint32_t palette_value,
uint32_t* src, int src_stride,
uint32_t* new_image) {
AddPixelsEq(&new_value, palette_value);
if (x > 0) {
src[x - 1] = ClampedAddSubtractFullWithCoefficient(src[x - 1],
new_value, src[x]);
}
if (y > 0) {
src[x - src_stride] =
ClampedAddSubtractFullWithCoefficient(src[x - src_stride],
new_value, src[x]);
}
new_image[x] = new_value;
}
//------------------------------------------------------------------------------
// Main entry point
static WebPEncodingError ApplyDeltaPalette(uint32_t* src, uint32_t* dst,
uint32_t src_stride,
uint32_t dst_stride,
const uint32_t* palette,
int palette_size,
int width, int height,
int num_passes) {
int x, y;
WebPEncodingError err = VP8_ENC_OK;
uint32_t* new_image = (uint32_t*)WebPSafeMalloc(width, sizeof(*new_image));
uint8_t* const tmp_row = (uint8_t*)WebPSafeMalloc(width, sizeof(*tmp_row));
if (new_image == NULL || tmp_row == NULL) {
err = VP8_ENC_ERROR_OUT_OF_MEMORY;
goto Error;
}
while (num_passes--) {
uint32_t* cur_src = src;
uint32_t* cur_dst = dst;
for (y = 0; y < height; ++y) {
for (x = 0; x < width; ++x) {
const uint32_t predicted_value = Predict(x, y, new_image);
tmp_row[x] = FindBestPaletteEntry(cur_src[x], predicted_value,
palette, palette_size);
ApplyBestPaletteEntry(x, y, predicted_value, palette[tmp_row[x]],
cur_src, src_stride, new_image);
}
for (x = 0; x < width; ++x) {
cur_dst[x] = palette[tmp_row[x]];
}
cur_src += src_stride;
cur_dst += dst_stride;
}
}
Error:
WebPSafeFree(new_image);
WebPSafeFree(tmp_row);
return err;
}
// replaces enc->argb_ by a palettizable approximation of it,
// and generates optimal enc->palette_[]
WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc) {
const WebPPicture* const pic = enc->pic_;
uint32_t* src = pic->argb;
uint32_t* dst = enc->argb_;
const int width = pic->width;
const int height = pic->height;
WebPEncodingError err = VP8_ENC_OK;
memcpy(enc->palette_, kDeltaPalette, sizeof(kDeltaPalette));
enc->palette_[DELTA_PALETTE_SIZE - 1] = src[0] - 0xff000000u;
enc->palette_size_ = DELTA_PALETTE_SIZE;
err = ApplyDeltaPalette(src, dst, pic->argb_stride, enc->current_width_,
enc->palette_, enc->palette_size_,
width, height, 2);
if (err != VP8_ENC_OK) goto Error;
Error:
return err;
}
#else // !WEBP_EXPERIMENTAL_FEATURES
WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc) {
(void)enc;
return VP8_ENC_ERROR_INVALID_CONFIGURATION;
}
#endif // WEBP_EXPERIMENTAL_FEATURES

@ -1,25 +0,0 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Author: Mislav Bradac (mislavm@google.com)
//
#ifndef WEBP_ENC_DELTA_PALETTIZATION_ENC_H_
#define WEBP_ENC_DELTA_PALETTIZATION_ENC_H_
#include "src/webp/encode.h"
#include "src/enc/vp8li_enc.h"
// Replaces enc->argb_[] input by a palettizable approximation of it,
// and generates optimal enc->palette_[].
// This function can revert enc->use_palette_ / enc->use_predict_ flag
// if delta-palettization is not producing expected saving.
WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc);
#endif // WEBP_ENC_DELTA_PALETTIZATION_ENC_H_

@ -198,7 +198,7 @@ static void SetSegmentProbas(VP8Encoder* const enc) {
for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
const VP8MBInfo* const mb = &enc->mb_info_[n];
p[mb->segment_]++;
++p[mb->segment_];
}
#if !defined(WEBP_DISABLE_STATS)
if (enc->pic_->stats != NULL) {
@ -520,6 +520,14 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
#endif
}
static void ResetSideInfo(const VP8EncIterator* const it) {
VP8Encoder* const enc = it->enc_;
WebPPicture* const pic = enc->pic_;
if (pic->stats != NULL) {
memset(enc->block_count_, 0, sizeof(enc->block_count_));
}
ResetSSE(enc);
}
#else // defined(WEBP_DISABLE_STATS)
static void ResetSSE(VP8Encoder* const enc) {
(void)enc;
@ -528,10 +536,16 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
VP8Encoder* const enc = it->enc_;
WebPPicture* const pic = enc->pic_;
if (pic->extra_info != NULL) {
memset(pic->extra_info, 0,
enc->mb_w_ * enc->mb_h_ * sizeof(*pic->extra_info));
if (it->x_ == 0 && it->y_ == 0) { // only do it once, at start
memset(pic->extra_info, 0,
enc->mb_w_ * enc->mb_h_ * sizeof(*pic->extra_info));
}
}
}
static void ResetSideInfo(const VP8EncIterator* const it) {
(void)it;
}
#endif // !defined(WEBP_DISABLE_STATS)
static double GetPSNR(uint64_t mse, uint64_t size) {
@ -570,7 +584,7 @@ static uint64_t OneStatPass(VP8Encoder* const enc, VP8RDLevel rd_opt,
VP8IteratorImport(&it, NULL);
if (VP8Decimate(&it, &info, rd_opt)) {
// Just record the number of skips and act like skip_proba is not used.
enc->proba_.nb_skip_++;
++enc->proba_.nb_skip_;
}
RecordResiduals(&it, &info);
size += info.R + info.H;
@ -841,6 +855,9 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) {
++num_pass_left;
enc->max_i4_header_bits_ >>= 1; // strengthen header bit limitation...
if (is_last_pass) {
ResetSideInfo(&it);
}
continue; // ...and start over
}
if (is_last_pass) {
@ -871,4 +888,3 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
#endif // DISABLE_TOKEN_BUFFER
//------------------------------------------------------------------------------

@ -200,14 +200,9 @@ static WEBP_INLINE double BitsEntropyRefine(const VP8LBitEntropy* entropy) {
}
}
double VP8LBitsEntropy(const uint32_t* const array, int n,
uint32_t* const trivial_symbol) {
double VP8LBitsEntropy(const uint32_t* const array, int n) {
VP8LBitEntropy entropy;
VP8LBitsEntropyUnrefined(array, n, &entropy);
if (trivial_symbol != NULL) {
*trivial_symbol =
(entropy.nonzeros == 1) ? entropy.nonzero_code : VP8L_NON_TRIVIAL_SYM;
}
return BitsEntropyRefine(&entropy);
}
@ -1031,7 +1026,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
}
}
// TODO(vikasa): Optimize HistogramRemap for low-effort compression mode also.
// TODO(vrabaud): Optimize HistogramRemap for low-effort compression mode.
// Find the optimal map from original histograms to the final ones.
HistogramRemap(orig_histo, image_histo, histogram_symbols);

@ -109,10 +109,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
uint16_t* const histogram_symbols);
// Returns the entropy for the symbols in the input array.
// Also sets trivial_symbol to the code value, if the array has only one code
// value. Otherwise, set it to VP8L_NON_TRIVIAL_SYM.
double VP8LBitsEntropy(const uint32_t* const array, int n,
uint32_t* const trivial_symbol);
double VP8LBitsEntropy(const uint32_t* const array, int n);
// Estimate how many bits the combined entropy of literals and distance
// approximately maps to.

@ -26,6 +26,9 @@ static void InitLeft(VP8EncIterator* const it) {
memset(it->u_left_, 129, 8);
memset(it->v_left_, 129, 8);
it->left_nz_[8] = 0;
if (it->top_derr_ != NULL) {
memset(&it->left_derr_, 0, sizeof(it->left_derr_));
}
}
static void InitTop(VP8EncIterator* const it) {
@ -33,6 +36,9 @@ static void InitTop(VP8EncIterator* const it) {
const size_t top_size = enc->mb_w_ * 16;
memset(enc->y_top_, 127, 2 * top_size);
memset(enc->nz_, 0, enc->mb_w_ * sizeof(*enc->nz_));
if (enc->top_derr_ != NULL) {
memset(enc->top_derr_, 0, enc->mb_w_ * sizeof(*enc->top_derr_));
}
}
void VP8IteratorSetRow(VP8EncIterator* const it, int y) {
@ -76,6 +82,7 @@ void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
it->y_left_ = (uint8_t*)WEBP_ALIGN(it->yuv_left_mem_ + 1);
it->u_left_ = it->y_left_ + 16 + 16;
it->v_left_ = it->u_left_ + 16;
it->top_derr_ = enc->top_derr_;
VP8IteratorReset(it);
}
@ -450,4 +457,3 @@ int VP8IteratorRotateI4(VP8EncIterator* const it,
}
//------------------------------------------------------------------------------

@ -146,6 +146,6 @@ int VP8ApplyNearLossless(const WebPPicture* const picture, int quality,
// Define a stub to suppress compiler warnings.
extern void VP8LNearLosslessStub(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LNearLosslessStub(void) {}
void VP8LNearLosslessStub(void) {}
#endif // (WEBP_NEAR_LOSSLESS == 1)

@ -28,11 +28,11 @@
// If defined, use table to compute x / alpha.
#define USE_INVERSE_ALPHA_TABLE
static const union {
uint32_t argb;
uint8_t bytes[4];
} test_endian = { 0xff000000u };
#define ALPHA_IS_LAST (test_endian.bytes[3] == 0xff)
#ifdef WORDS_BIGENDIAN
#define ALPHA_OFFSET 0 // uint32_t 0xff000000 is 0xff,00,00,00 in memory
#else
#define ALPHA_OFFSET 3 // uint32_t 0xff000000 is 0x00,00,00,ff in memory
#endif
//------------------------------------------------------------------------------
// Detection of non-trivial transparency
@ -61,7 +61,7 @@ int WebPPictureHasTransparency(const WebPPicture* picture) {
return CheckNonOpaque(picture->a, picture->width, picture->height,
1, picture->a_stride);
} else {
const int alpha_offset = ALPHA_IS_LAST ? 3 : 0;
const int alpha_offset = ALPHA_OFFSET;
return CheckNonOpaque((const uint8_t*)picture->argb + alpha_offset,
picture->width, picture->height,
4, picture->argb_stride * sizeof(*picture->argb));
@ -126,7 +126,7 @@ static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
#else
static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTables(void) {}
static void InitGammaTables(void) {}
static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) { return v; }
static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
return (int)(base_value << shift);
@ -170,29 +170,33 @@ typedef uint16_t fixed_y_t; // unsigned type with extra SFIX precision for W
#if defined(USE_GAMMA_COMPRESSION)
// float variant of gamma-correction
// We use tables of different size and precision for the Rec709 / BT2020
// transfer function.
#define kGammaF (1./0.45)
static float kGammaToLinearTabF[MAX_Y_T + 1]; // size scales with Y_FIX
static float kLinearToGammaTabF[kGammaTabSize + 2];
static volatile int kGammaTablesFOk = 0;
static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {
if (!kGammaTablesFOk) {
static uint32_t kLinearToGammaTabS[kGammaTabSize + 2];
#define GAMMA_TO_LINEAR_BITS 14
static uint32_t kGammaToLinearTabS[MAX_Y_T + 1]; // size scales with Y_FIX
static volatile int kGammaTablesSOk = 0;
static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesS(void) {
assert(2 * GAMMA_TO_LINEAR_BITS < 32); // we use uint32_t intermediate values
if (!kGammaTablesSOk) {
int v;
const double norm = 1. / MAX_Y_T;
const double scale = 1. / kGammaTabSize;
const double a = 0.09929682680944;
const double thresh = 0.018053968510807;
const double final_scale = 1 << GAMMA_TO_LINEAR_BITS;
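// These are the extended-precision Rec.709 / BT.2020 constants: the encoded
// value is 4.5 * L below the threshold and (1 + a) * pow(L, 0.45) - a above
// it. The first loop below tabulates the inverse (gamma -> linear) mapping
// and the second one the forward mapping, both as fixed-point integers.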
for (v = 0; v <= MAX_Y_T; ++v) {
const double g = norm * v;
double value;
if (g <= thresh * 4.5) {
kGammaToLinearTabF[v] = (float)(g / 4.5);
value = g / 4.5;
} else {
const double a_rec = 1. / (1. + a);
kGammaToLinearTabF[v] = (float)pow(a_rec * (g + a), kGammaF);
value = pow(a_rec * (g + a), kGammaF);
}
kGammaToLinearTabS[v] = (uint32_t)(value * final_scale + .5);
}
for (v = 0; v <= kGammaTabSize; ++v) {
const double g = scale * v;
@ -202,37 +206,44 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {
} else {
value = (1. + a) * pow(g, 1. / kGammaF) - a;
}
kLinearToGammaTabF[v] = (float)(MAX_Y_T * value);
// we already incorporate the 1/2 rounding constant here
kLinearToGammaTabS[v] =
(uint32_t)(MAX_Y_T * value) + (1 << GAMMA_TO_LINEAR_BITS >> 1);
}
// to prevent small rounding errors from causing a read overflow:
kLinearToGammaTabF[kGammaTabSize + 1] = kLinearToGammaTabF[kGammaTabSize];
kGammaTablesFOk = 1;
kLinearToGammaTabS[kGammaTabSize + 1] = kLinearToGammaTabS[kGammaTabSize];
kGammaTablesSOk = 1;
}
}
static WEBP_INLINE float GammaToLinearF(int v) {
return kGammaToLinearTabF[v];
// return value has a fixed-point precision of GAMMA_TO_LINEAR_BITS
static WEBP_INLINE uint32_t GammaToLinearS(int v) {
return kGammaToLinearTabS[v];
}
static WEBP_INLINE int LinearToGammaF(float value) {
const float v = value * kGammaTabSize;
const int tab_pos = (int)v;
const float x = v - (float)tab_pos; // fractional part
const float v0 = kLinearToGammaTabF[tab_pos + 0];
const float v1 = kLinearToGammaTabF[tab_pos + 1];
const float y = v1 * x + v0 * (1.f - x); // interpolate
return (int)(y + .5);
static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) {
// 'value' is in GAMMA_TO_LINEAR_BITS fractional precision
const uint32_t v = value * kGammaTabSize;
const uint32_t tab_pos = v >> GAMMA_TO_LINEAR_BITS;
// fractional part, in GAMMA_TO_LINEAR_BITS fixed-point precision
const uint32_t x = v - (tab_pos << GAMMA_TO_LINEAR_BITS); // fractional part
// v0 / v1 are in GAMMA_TO_LINEAR_BITS fixed-point precision (range [0..1])
const uint32_t v0 = kLinearToGammaTabS[tab_pos + 0];
const uint32_t v1 = kLinearToGammaTabS[tab_pos + 1];
// Final interpolation. Note that rounding is already included.
const uint32_t v2 = (v1 - v0) * x; // note: v1 >= v0.
const uint32_t result = v0 + (v2 >> GAMMA_TO_LINEAR_BITS);
return result;
}
#else
static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {}
static WEBP_INLINE float GammaToLinearF(int v) {
const float norm = 1.f / MAX_Y_T;
return norm * v;
static void InitGammaTablesS(void) {}
static WEBP_INLINE uint32_t GammaToLinearS(int v) {
return (v << GAMMA_TO_LINEAR_BITS) / MAX_Y_T;
}
static WEBP_INLINE int LinearToGammaF(float value) {
return (int)(MAX_Y_T * value + .5);
static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) {
return (MAX_Y_T * value) >> GAMMA_TO_LINEAR_BITS;
}
#endif // USE_GAMMA_COMPRESSION
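A rough Python sketch of the fixed-point interpolation performed by LinearToGammaS above (the table size and contents are placeholders, not taken from the patch):
@code{.py}
# Sketch only: mirrors the fixed-point interpolation of LinearToGammaS.
GAMMA_TO_LINEAR_BITS = 14
GAMMA_TAB_SIZE = 32              # placeholder; the real kGammaTabSize may differ

def linear_to_gamma_fixed(value, tab):
    # 'value' carries GAMMA_TO_LINEAR_BITS fractional bits; 'tab' has
    # GAMMA_TAB_SIZE + 2 entries with the 1/2 rounding constant baked in.
    v = value * GAMMA_TAB_SIZE
    tab_pos = v >> GAMMA_TO_LINEAR_BITS             # integer table index
    x = v - (tab_pos << GAMMA_TO_LINEAR_BITS)       # fractional part
    v0, v1 = tab[tab_pos], tab[tab_pos + 1]         # note: v1 >= v0
    return v0 + (((v1 - v0) * x) >> GAMMA_TO_LINEAR_BITS)
@endcode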
@ -254,26 +265,22 @@ static int RGBToGray(int r, int g, int b) {
return (luma >> YUV_FIX);
}
static float RGBToGrayF(float r, float g, float b) {
return (float)(0.2126 * r + 0.7152 * g + 0.0722 * b);
}
static int ScaleDown(int a, int b, int c, int d) {
const float A = GammaToLinearF(a);
const float B = GammaToLinearF(b);
const float C = GammaToLinearF(c);
const float D = GammaToLinearF(d);
return LinearToGammaF(0.25f * (A + B + C + D));
static uint32_t ScaleDown(int a, int b, int c, int d) {
const uint32_t A = GammaToLinearS(a);
const uint32_t B = GammaToLinearS(b);
const uint32_t C = GammaToLinearS(c);
const uint32_t D = GammaToLinearS(d);
return LinearToGammaS((A + B + C + D + 2) >> 2);
}
static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) {
int i;
for (i = 0; i < w; ++i) {
const float R = GammaToLinearF(src[0 * w + i]);
const float G = GammaToLinearF(src[1 * w + i]);
const float B = GammaToLinearF(src[2 * w + i]);
const float Y = RGBToGrayF(R, G, B);
dst[i] = (fixed_y_t)LinearToGammaF(Y);
const uint32_t R = GammaToLinearS(src[0 * w + i]);
const uint32_t G = GammaToLinearS(src[1 * w + i]);
const uint32_t B = GammaToLinearS(src[2 * w + i]);
const uint32_t Y = RGBToGray(R, G, B);
dst[i] = (fixed_y_t)LinearToGammaS(Y);
}
}
@ -863,7 +870,7 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr,
}
if (use_iterative_conversion) {
InitGammaTablesF();
InitGammaTablesS();
if (!PreprocessARGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, picture)) {
return 0;
}
@ -990,10 +997,10 @@ static int PictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace,
return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
} else {
const uint8_t* const argb = (const uint8_t*)picture->argb;
const uint8_t* const r = ALPHA_IS_LAST ? argb + 2 : argb + 1;
const uint8_t* const g = ALPHA_IS_LAST ? argb + 1 : argb + 2;
const uint8_t* const b = ALPHA_IS_LAST ? argb + 0 : argb + 3;
const uint8_t* const a = ALPHA_IS_LAST ? argb + 3 : argb + 0;
const uint8_t* const a = argb + (0 ^ ALPHA_OFFSET);
const uint8_t* const r = argb + (1 ^ ALPHA_OFFSET);
const uint8_t* const g = argb + (2 ^ ALPHA_OFFSET);
const uint8_t* const b = argb + (3 ^ ALPHA_OFFSET);
picture->colorspace = WEBP_YUV420;
return ImportYUVAFromRGBA(r, g, b, a, 4, 4 * picture->argb_stride,
@ -1044,7 +1051,8 @@ int WebPPictureYUVAToARGB(WebPPicture* picture) {
const int argb_stride = 4 * picture->argb_stride;
uint8_t* dst = (uint8_t*)picture->argb;
const uint8_t *cur_u = picture->u, *cur_v = picture->v, *cur_y = picture->y;
WebPUpsampleLinePairFunc upsample = WebPGetLinePairConverter(ALPHA_IS_LAST);
WebPUpsampleLinePairFunc upsample =
WebPGetLinePairConverter(ALPHA_OFFSET > 0);
// First row, with replicated top samples.
upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, width);
@ -1087,6 +1095,7 @@ static int Import(WebPPicture* const picture,
const uint8_t* rgb, int rgb_stride,
int step, int swap_rb, int import_alpha) {
int y;
// swap_rb -> b,g,r,a , !swap_rb -> r,g,b,a
const uint8_t* r_ptr = rgb + (swap_rb ? 2 : 0);
const uint8_t* g_ptr = rgb + 1;
const uint8_t* b_ptr = rgb + (swap_rb ? 0 : 2);
@ -1104,19 +1113,32 @@ static int Import(WebPPicture* const picture,
WebPInitAlphaProcessing();
if (import_alpha) {
// dst[] byte order is {a,r,g,b} for big-endian, {b,g,r,a} for little endian
uint32_t* dst = picture->argb;
const int do_copy =
(!swap_rb && !ALPHA_IS_LAST) || (swap_rb && ALPHA_IS_LAST);
const int do_copy = (ALPHA_OFFSET == 3) && swap_rb;
assert(step == 4);
for (y = 0; y < height; ++y) {
if (do_copy) {
if (do_copy) {
for (y = 0; y < height; ++y) {
memcpy(dst, rgb, width * 4);
} else {
rgb += rgb_stride;
dst += picture->argb_stride;
}
} else {
for (y = 0; y < height; ++y) {
#ifdef WORDS_BIGENDIAN
// BGRA or RGBA input order.
const uint8_t* a_ptr = rgb + 3;
WebPPackARGB(a_ptr, r_ptr, g_ptr, b_ptr, width, dst);
r_ptr += rgb_stride;
g_ptr += rgb_stride;
b_ptr += rgb_stride;
#else
// RGBA input order. Need to swap R and B.
VP8LConvertBGRAToRGBA((const uint32_t*)rgb, width, (uint8_t*)dst);
#endif
rgb += rgb_stride;
dst += picture->argb_stride;
}
rgb += rgb_stride;
dst += picture->argb_stride;
}
} else {
uint32_t* dst = picture->argb;

@ -18,6 +18,7 @@
#include <math.h>
#include <stdlib.h>
#include "src/dsp/dsp.h"
#include "src/enc/vp8i_enc.h"
#include "src/utils/utils.h"
@ -169,6 +170,12 @@ int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
return 1;
}
#ifdef WORDS_BIGENDIAN
#define BLUE_OFFSET 3 // uint32_t 0x000000ff is 0x00,00,00,ff in memory
#else
#define BLUE_OFFSET 0 // uint32_t 0x000000ff is 0xff,00,00,00 in memory
#endif
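The BLUE_OFFSET comment can be double-checked with a tiny stand-alone snippet (illustration only, not part of the patch):
@code{.py}
import struct
# Memory layout of a uint32_t holding 0x000000ff:
print(struct.pack('<I', 0x000000ff))   # little-endian: b'\xff\x00\x00\x00' (blue byte at offset 0)
print(struct.pack('>I', 0x000000ff))   # big-endian:    b'\x00\x00\x00\xff' (blue byte at offset 3)
@endcode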
int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
int type, float results[5]) {
int w, h, c;
@ -195,8 +202,10 @@ int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
float distortion;
const size_t stride0 = 4 * (size_t)p0.argb_stride;
const size_t stride1 = 4 * (size_t)p1.argb_stride;
if (!WebPPlaneDistortion((const uint8_t*)p0.argb + c, stride0,
(const uint8_t*)p1.argb + c, stride1,
// results are reported as BGRA
const int offset = c ^ BLUE_OFFSET;
if (!WebPPlaneDistortion((const uint8_t*)p0.argb + offset, stride0,
(const uint8_t*)p1.argb + offset, stride1,
w, h, 4, type, &distortion, results + c)) {
goto Error;
}
@ -214,6 +223,8 @@ int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
return ok;
}
#undef BLUE_OFFSET
#else // defined(WEBP_DISABLE_STATS)
int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
const uint8_t* ref, size_t ref_stride,

@ -826,6 +826,85 @@ static int ReconstructIntra4(VP8EncIterator* const it,
return nz;
}
//------------------------------------------------------------------------------
// DC-error diffusion
// Diffusion weights. We under-correct a bit (15/16th of the error is actually
// diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
#define C1 7 // fraction of error sent to the 4x4 block below
#define C2 8 // fraction of error sent to the 4x4 block on the right
#define DSHIFT 4
#define DSCALE 1 // storage descaling, needed to make the error fit int8_t
// Quantize as usual, but also compute and return the quantization error.
// Error is already divided by DSHIFT.
static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
int V = *v;
const int sign = (V < 0);
if (sign) V = -V;
if (V > (int)mtx->zthresh_[0]) {
const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
const int err = (V - qV);
*v = sign ? -qV : qV;
return (sign ? -err : err) >> DSCALE;
}
*v = 0;
return (sign ? -V : V) >> DSCALE;
}
static void CorrectDCValues(const VP8EncIterator* const it,
const VP8Matrix* const mtx,
int16_t tmp[][16], VP8ModeScore* const rd) {
// | top[0] | top[1]
// --------+--------+---------
// left[0] | tmp[0] tmp[1] <-> err0 err1
// left[1] | tmp[2] tmp[3] err2 err3
//
// Final errors {err1,err2,err3} are preserved and later restored
// as top[]/left[] on the next block.
int ch;
for (ch = 0; ch <= 1; ++ch) {
const int8_t* const top = it->top_derr_[it->x_][ch];
const int8_t* const left = it->left_derr_[ch];
int16_t (* const c)[16] = &tmp[ch * 4];
int err0, err1, err2, err3;
c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
err0 = QuantizeSingle(&c[0][0], mtx);
c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
err1 = QuantizeSingle(&c[1][0], mtx);
c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
err2 = QuantizeSingle(&c[2][0], mtx);
c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
err3 = QuantizeSingle(&c[3][0], mtx);
// error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
// err >> DSCALE will fit in an int8_t type if DSCALE>=1.
assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
rd->derr[ch][0] = (int8_t)err1;
rd->derr[ch][1] = (int8_t)err2;
rd->derr[ch][2] = (int8_t)err3;
}
}
static void StoreDiffusionErrors(VP8EncIterator* const it,
const VP8ModeScore* const rd) {
int ch;
for (ch = 0; ch <= 1; ++ch) {
int8_t* const top = it->top_derr_[it->x_][ch];
int8_t* const left = it->left_derr_[ch];
left[0] = rd->derr[ch][0]; // restore err1
left[1] = 3 * rd->derr[ch][2] >> 2; // ... 3/4th of err3
top[0] = rd->derr[ch][1]; // ... err2
top[1] = rd->derr[ch][2] - left[1]; // ... 1/4th of err3.
}
}
#undef C1
#undef C2
#undef DSHIFT
#undef DSCALE
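The per-channel arithmetic of CorrectDCValues can be sketched as follows (a simplified Python sketch, not taken from the patch; `quantize` stands in for QuantizeSingle and returns the quantized DC plus its scaled-down error):
@code{.py}
C1, C2 = 7, 8          # fractions (out of 16) of error sent down / right
DSHIFT, DSCALE = 4, 1

def diffuse_dc(dc, top, left, quantize):
    # dc: DC coefficients of the four 4x4 blocks, in raster order.
    dc[0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE)
    dc[0], err0 = quantize(dc[0])
    dc[1] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE)
    dc[1], err1 = quantize(dc[1])
    dc[2] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE)
    dc[2], err2 = quantize(dc[2])
    dc[3] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE)
    dc[3], err3 = quantize(dc[3])
    return err1, err2, err3   # stored and re-used as top/left on the next block
@endcode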
//------------------------------------------------------------------------------
static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
uint8_t* const yuv_out, int mode) {
const VP8Encoder* const enc = it->enc_;
@ -839,6 +918,8 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
for (n = 0; n < 8; n += 2) {
VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
}
if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);
if (DO_TRELLIS_UV && it->do_trellis_) {
int ch, x, y;
for (ch = 0, n = 0; ch <= 2; ch += 2) {
@ -1101,6 +1182,9 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
CopyScore(&rd_best, &rd_uv);
rd->mode_uv = mode;
memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
if (it->top_derr_ != NULL) {
memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
}
SwapPtr(&dst, &tmp_dst);
}
}
@ -1109,6 +1193,9 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
if (dst != dst0) { // copy 16x8 block if needed
VP8Copy16x8(dst, dst0);
}
if (it->top_derr_ != NULL) { // store diffusion errors for next block
StoreDiffusionErrors(it, rd);
}
}
//------------------------------------------------------------------------------

@ -30,9 +30,9 @@ extern "C" {
// Various defines and enums
// version numbers
#define ENC_MAJ_VERSION 0
#define ENC_MIN_VERSION 6
#define ENC_REV_VERSION 1
#define ENC_MAJ_VERSION 1
#define ENC_MIN_VERSION 0
#define ENC_REV_VERSION 0
enum { MAX_LF_LEVELS = 64, // Maximum loop filter level
MAX_VARIABLE_LEVEL = 67, // last (inclusive) level with variable cost
@ -120,6 +120,9 @@ static WEBP_INLINE int QUANTDIV(uint32_t n, uint32_t iQ, uint32_t B) {
// Uncomment the following to remove token-buffer code:
// #define DISABLE_TOKEN_BUFFER
// quality below which error-diffusion is enabled
#define ERROR_DIFFUSION_QUALITY 98
//------------------------------------------------------------------------------
// Headers
@ -201,6 +204,8 @@ typedef struct {
score_t i4_penalty_; // penalty for using Intra4
} VP8SegmentInfo;
typedef int8_t DError[2 /* u/v */][2 /* top or left */];
// Handy transient struct to accumulate score and info during RD-optimization
// and mode evaluation.
typedef struct {
@ -213,6 +218,7 @@ typedef struct {
uint8_t modes_i4[16]; // mode numbers for intra4 predictions
int mode_uv; // mode number of chroma prediction
uint32_t nz; // non-zero blocks
int8_t derr[2][3]; // DC diffusion errors for U/V for blocks #1/2/3
} VP8ModeScore;
// Iterator structure to iterate through macroblocks, pointing to the
@ -242,6 +248,9 @@ typedef struct {
int count_down0_; // starting counter value (for progress)
int percent0_; // saved initial progress percent
DError left_derr_; // left error diffusion (u/v)
DError *top_derr_; // top diffusion error - NULL if disabled
uint8_t* y_left_; // left luma samples (addressable from index -1 to 15).
uint8_t* u_left_; // left u samples (addressable from index -1 to 7)
uint8_t* v_left_; // left v samples (addressable from index -1 to 7)
@ -401,6 +410,7 @@ struct VP8Encoder {
uint8_t* uv_top_; // top u/v samples.
// U and V are packed into 16 bytes (8 U + 8 V)
LFStats* lf_stats_; // autofilter stats (if NULL, autofilter is off)
DError* top_derr_; // diffusion error (NULL if disabled)
};
//------------------------------------------------------------------------------

@ -26,8 +26,6 @@
#include "src/utils/utils.h"
#include "src/webp/format_constants.h"
#include "src/enc/delta_palettization_enc.h"
// Maximum number of histogram images (sub-blocks).
#define MAX_HUFF_IMAGE_SIZE 2600
@ -259,7 +257,7 @@ static int AnalyzeEntropy(const uint32_t* argb,
++histo[kHistoAlphaPred * 256];
for (j = 0; j < kHistoTotal; ++j) {
entropy_comp[j] = VP8LBitsEntropy(&histo[j * 256], 256, NULL);
entropy_comp[j] = VP8LBitsEntropy(&histo[j * 256], 256);
}
entropy[kDirect] = entropy_comp[kHistoAlpha] +
entropy_comp[kHistoRed] +
@ -384,8 +382,7 @@ static int EncoderAnalyze(VP8LEncoder* const enc,
AnalyzeAndCreatePalette(pic, low_effort,
enc->palette_, &enc->palette_size_);
// TODO(jyrki): replace the decision to be based on an actual estimate
// of entropy, or even spatial variance of entropy.
// Empirical bit sizes.
enc->histo_bits_ = GetHistoBits(method, use_palette,
pic->width, pic->height);
enc->transform_bits_ = GetTransformBits(method, enc->histo_bits_);
@ -756,7 +753,6 @@ static WebPEncodingError StoreImageToBitMask(
// Don't write the distance with the extra bits code since
// the distance can be up to 18 bits of extra bits, and the prefix
// 15 bits, totaling to 33, and our PutBits only supports up to 32 bits.
// TODO(jyrki): optimize this further.
VP8LPrefixEncode(distance, &code, &n_bits, &bits);
WriteHuffmanCode(bw, codes + 4, code);
VP8LPutBits(bw, bits, n_bits);
@ -1464,49 +1460,6 @@ static WebPEncodingError EncodePalette(VP8LBitWriter* const bw, int low_effort,
20 /* quality */, low_effort);
}
#ifdef WEBP_EXPERIMENTAL_FEATURES
static WebPEncodingError EncodeDeltaPalettePredictorImage(
VP8LBitWriter* const bw, VP8LEncoder* const enc, int quality,
int low_effort) {
const WebPPicture* const pic = enc->pic_;
const int width = pic->width;
const int height = pic->height;
const int pred_bits = 5;
const int transform_width = VP8LSubSampleSize(width, pred_bits);
const int transform_height = VP8LSubSampleSize(height, pred_bits);
const int pred = 7; // default is Predictor7 (Top/Left Average)
const int tiles_per_row = VP8LSubSampleSize(width, pred_bits);
const int tiles_per_col = VP8LSubSampleSize(height, pred_bits);
uint32_t* predictors;
int tile_x, tile_y;
WebPEncodingError err = VP8_ENC_OK;
predictors = (uint32_t*)WebPSafeMalloc(tiles_per_col * tiles_per_row,
sizeof(*predictors));
if (predictors == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
predictors[tile_y * tiles_per_row + tile_x] = 0xff000000u | (pred << 8);
}
}
VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
VP8LPutBits(bw, PREDICTOR_TRANSFORM, 2);
VP8LPutBits(bw, pred_bits - 2, 3);
err = EncodeImageNoHuffman(
bw, predictors, &enc->hash_chain_,
(VP8LBackwardRefs*)&enc->refs_[0], // cast const away
(VP8LBackwardRefs*)&enc->refs_[1],
transform_width, transform_height, quality, low_effort);
WebPSafeFree(predictors);
return err;
}
#endif // WEBP_EXPERIMENTAL_FEATURES
// -----------------------------------------------------------------------------
// VP8LEncoder
@ -1568,7 +1521,7 @@ static int EncodeStreamHook(void* input, void* data2) {
WebPEncodingError err = VP8_ENC_OK;
const int quality = (int)config->quality;
const int low_effort = (config->method == 0);
#if (WEBP_NEAR_LOSSLESS == 1) || defined(WEBP_EXPERIMENTAL_FEATURES)
#if (WEBP_NEAR_LOSSLESS == 1)
const int width = picture->width;
#endif
const int height = picture->height;
@ -1627,29 +1580,6 @@ static int EncodeStreamHook(void* input, void* data2) {
enc->argb_content_ = kEncoderNone;
#endif
#ifdef WEBP_EXPERIMENTAL_FEATURES
if (config->use_delta_palette) {
enc->use_predict_ = 1;
enc->use_cross_color_ = 0;
enc->use_subtract_green_ = 0;
enc->use_palette_ = 1;
if (enc->argb_content_ != kEncoderNearLossless &&
enc->argb_content_ != kEncoderPalette) {
err = MakeInputImageCopy(enc);
if (err != VP8_ENC_OK) goto Error;
}
err = WebPSearchOptimalDeltaPalette(enc);
if (err != VP8_ENC_OK) goto Error;
if (enc->use_palette_) {
err = AllocateTransformBuffer(enc, width, height);
if (err != VP8_ENC_OK) goto Error;
err = EncodeDeltaPalettePredictorImage(bw, enc, quality, low_effort);
if (err != VP8_ENC_OK) goto Error;
use_delta_palette = 1;
}
}
#endif // WEBP_EXPERIMENTAL_FEATURES
// Encode palette
if (enc->use_palette_) {
err = EncodePalette(bw, low_effort, enc);
@ -1822,7 +1752,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
worker_interface->Init(worker);
worker->data1 = param;
worker->data2 = NULL;
worker->hook = (WebPWorkerHook)EncodeStreamHook;
worker->hook = EncodeStreamHook;
}
}
@ -1944,7 +1874,6 @@ int VP8LEncodeImage(const WebPConfig* const config,
err = VP8LEncodeStream(config, picture, &bw, 1 /*use_cache*/);
if (err != VP8_ENC_OK) goto Error;
// TODO(skal): have a fine-grained progress report in VP8LEncodeStream().
if (!WebPReportProgress(picture, 90, &percent)) goto UserAbort;
// Finish the RIFF chunk.

@ -159,12 +159,16 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
+ WEBP_ALIGN_CST; // align all
const size_t lf_stats_size =
config->autofilter ? sizeof(*enc->lf_stats_) + WEBP_ALIGN_CST : 0;
const size_t top_derr_size =
(config->quality <= ERROR_DIFFUSION_QUALITY || config->pass > 1) ?
mb_w * sizeof(*enc->top_derr_) : 0;
uint8_t* mem;
const uint64_t size = (uint64_t)sizeof(*enc) // main struct
+ WEBP_ALIGN_CST // cache alignment
+ info_size // modes info
+ preds_size // prediction modes
+ samples_size // top/left samples
+ top_derr_size // top diffusion error
+ nz_size // coeff context bits
+ lf_stats_size; // autofilter stats
@ -175,11 +179,12 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
" info: %ld\n"
" preds: %ld\n"
" top samples: %ld\n"
" top diffusion: %ld\n"
" non-zero: %ld\n"
" lf-stats: %ld\n"
" total: %ld\n",
sizeof(*enc) + WEBP_ALIGN_CST, info_size,
preds_size, samples_size, nz_size, lf_stats_size, size);
preds_size, samples_size, top_derr_size, nz_size, lf_stats_size, size);
printf("Transient object sizes:\n"
" VP8EncIterator: %ld\n"
" VP8ModeScore: %ld\n"
@ -219,6 +224,8 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
enc->y_top_ = mem;
enc->uv_top_ = enc->y_top_ + top_stride;
mem += 2 * top_stride;
enc->top_derr_ = top_derr_size ? (DError*)mem : NULL;
mem += top_derr_size;
assert(mem <= (uint8_t*)enc + size);
enc->config_ = config;

@ -26,9 +26,9 @@ extern "C" {
//------------------------------------------------------------------------------
// Defines and constants.
#define MUX_MAJ_VERSION 0
#define MUX_MIN_VERSION 4
#define MUX_REV_VERSION 1
#define MUX_MAJ_VERSION 1
#define MUX_MIN_VERSION 0
#define MUX_REV_VERSION 0
// Chunk object.
typedef struct WebPChunk WebPChunk;

@ -19,13 +19,6 @@
#include "src/dsp/dsp.h"
#include "src/webp/types.h"
// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
#if !defined(WORDS_BIGENDIAN) && \
(defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
(defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
#define WORDS_BIGENDIAN
#endif
#if defined(WORDS_BIGENDIAN)
#define HToLE32 BSwap32
#define HToLE16 BSwap16

@ -3,9 +3,22 @@ add_definitions(-D__OPENCV_APPS=1)
link_libraries(${OPENCV_LINKER_LIBS})
add_subdirectory(traincascade)
add_subdirectory(createsamples)
add_subdirectory(annotation)
add_subdirectory(visualisation)
add_subdirectory(interactive-calibration)
add_subdirectory(version)
macro(ocv_add_app directory)
if(DEFINED BUILD_APPS_LIST)
list(FIND BUILD_APPS_LIST ${directory} _index)
if (${_index} GREATER -1)
add_subdirectory(${directory})
else()
message(STATUS "Skip OpenCV app: ${directory}")
endif()
else()
add_subdirectory(${directory})
endif()
endmacro()
ocv_add_app(traincascade)
ocv_add_app(createsamples)
ocv_add_app(annotation)
ocv_add_app(visualisation)
ocv_add_app(interactive-calibration)
ocv_add_app(version)

@ -217,7 +217,7 @@ int main(int argc, char** argv)
(*it)->resetState();
}
}
catch (std::runtime_error exp) {
catch (const std::runtime_error& exp) {
std::cout << exp.what() << std::endl;
}

@ -1,19 +1,13 @@
SET(OPENCV_APPLICATION_DEPS opencv_core opencv_highgui opencv_imgproc opencv_imgcodecs opencv_videoio)
set(OPENCV_APPLICATION_DEPS opencv_core)
ocv_check_dependencies(${OPENCV_APPLICATION_DEPS})
if(NOT OCV_DEPENDENCIES_FOUND)
return()
endif()
project(opencv_version)
set(the_target opencv_version)
ocv_target_include_directories(${the_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${OpenCV_SOURCE_DIR}/include/opencv")
ocv_target_include_modules_recurse(${the_target} ${OPENCV_APPLICATION_DEPS})
file(GLOB SRCS *.cpp)
ocv_add_executable(${the_target} ${SRCS})
ocv_add_executable(${the_target} opencv_version.cpp)
ocv_target_link_libraries(${the_target} ${OPENCV_APPLICATION_DEPS})
set_target_properties(${the_target} PROPERTIES
@ -30,3 +24,26 @@ if(INSTALL_CREATE_DISTRIB)
else()
install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT libs)
endif()
if(WIN32)
project(opencv_version_win32)
set(the_target opencv_version_win32)
ocv_target_include_modules_recurse(${the_target} ${OPENCV_APPLICATION_DEPS})
ocv_add_executable(${the_target} opencv_version.cpp)
ocv_target_link_libraries(${the_target} ${OPENCV_APPLICATION_DEPS})
target_compile_definitions(${the_target} PRIVATE "OPENCV_WIN32_API=1")
set_target_properties(${the_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
OUTPUT_NAME "opencv_version_win32")
set_target_properties(${the_target} PROPERTIES FOLDER "applications")
if(INSTALL_CREATE_DISTRIB)
if(BUILD_SHARED_LIBS)
install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT libs)
endif()
else()
install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT libs)
endif()
endif()

@ -9,6 +9,31 @@
#include <opencv2/core/opencl/opencl_info.hpp>
#ifdef OPENCV_WIN32_API
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif
static void dumpHWFeatures(bool showAll = false)
{
std::cout << "OpenCV's HW features list:" << std::endl;
int count = 0;
for (int i = 0; i < CV_HARDWARE_MAX_FEATURE; i++)
{
cv::String name = cv::getHardwareFeatureName(i);
if (name.empty())
continue;
bool enabled = cv::checkHardwareSupport(i);
if (enabled)
count++;
if (enabled || showAll)
{
printf(" ID=%3d (%s) -> %s\n", i, name.c_str(), enabled ? "ON" : "N/A");
}
}
std::cout << "Total available: " << count << std::endl;
}
int main(int argc, const char** argv)
{
CV_TRACE_FUNCTION();
@ -16,6 +41,7 @@ int main(int argc, const char** argv)
CV_TRACE_ARG_VALUE(argv0, "argv0", argv[0]);
CV_TRACE_ARG_VALUE(argv1, "argv1", argv[1]);
#ifndef OPENCV_WIN32_API
cv::CommandLineParser parser(argc, argv,
"{ help h usage ? | | show this help message }"
"{ verbose v | | show build configuration log }"
@ -45,24 +71,14 @@ int main(int argc, const char** argv)
if (parser.has("hw"))
{
bool showAll = parser.get<bool>("hw");
std::cout << "OpenCV's HW features list:" << std::endl;
int count = 0;
for (int i = 0; i < CV_HARDWARE_MAX_FEATURE; i++)
{
cv::String name = cv::getHardwareFeatureName(i);
if (name.empty())
continue;
bool enabled = cv::checkHardwareSupport(i);
if (enabled)
count++;
if (enabled || showAll)
{
printf(" ID=%3d (%s) -> %s\n", i, name.c_str(), enabled ? "ON" : "N/A");
}
}
std::cout << "Total available: " << count << std::endl;
dumpHWFeatures(parser.get<bool>("hw"));
}
#else
std::cout << cv::getBuildInformation().c_str() << std::endl;
cv::dumpOpenCLInformation();
dumpHWFeatures();
MessageBoxA(NULL, "Check console window output", "OpenCV(" CV_VERSION ")", MB_ICONINFORMATION | MB_OK);
#endif
return 0;
}

@ -27,6 +27,12 @@ function(find_python preferred_version min_version library_env include_dir_env
debug_library include_path include_dir include_dir2 packages_path
numpy_include_dirs numpy_version)
if(NOT ${found})
if(" ${executable}" STREQUAL " PYTHON_EXECUTABLE")
set(__update_python_vars 0)
else()
set(__update_python_vars 1)
endif()
ocv_check_environment_variables(${executable})
if(${executable})
set(PYTHON_EXECUTABLE "${${executable}}")
@ -47,7 +53,7 @@ if(NOT ${found})
endforeach()
endif()
string(REGEX MATCH "^[0-9]+" _preferred_version_major ${preferred_version})
string(REGEX MATCH "^[0-9]+" _preferred_version_major "${preferred_version}")
find_host_package(PythonInterp "${preferred_version}")
if(NOT PYTHONINTERP_FOUND)
@ -56,7 +62,7 @@ if(NOT ${found})
if(PYTHONINTERP_FOUND)
# Check if python major version is correct
if(${_preferred_version_major} EQUAL ${PYTHON_VERSION_MAJOR})
if("${_preferred_version_major}" STREQUAL "" OR "${_preferred_version_major}" STREQUAL "${PYTHON_VERSION_MAJOR}")
# Copy outputs
set(_found ${PYTHONINTERP_FOUND})
set(_executable ${PYTHON_EXECUTABLE})
@ -65,7 +71,9 @@ if(NOT ${found})
set(_version_minor ${PYTHON_VERSION_MINOR})
set(_version_patch ${PYTHON_VERSION_PATCH})
endif()
endif()
if(__update_python_vars)
# Clear find_host_package side effects
unset(PYTHONINTERP_FOUND)
unset(PYTHON_EXECUTABLE CACHE)
@ -109,7 +117,8 @@ if(NOT ${found})
set(_library_release ${PYTHON_LIBRARY_RELEASE})
set(_include_dir ${PYTHON_INCLUDE_DIR})
set(_include_dir2 ${PYTHON_INCLUDE_DIR2})
endif()
if(__update_python_vars)
# Clear find_package side effects
unset(PYTHONLIBS_FOUND)
unset(PYTHON_LIBRARIES)
@ -160,7 +169,7 @@ if(NOT ${found})
unset(_path)
endif()
set(_numpy_include_dirs ${${numpy_include_dirs}})
set(_numpy_include_dirs "${${numpy_include_dirs}}")
if(NOT _numpy_include_dirs)
if(CMAKE_CROSSCOMPILING)
@ -222,6 +231,10 @@ if(NOT ${found})
endif()
endfunction(find_python)
if(OPENCV_PYTHON_SKIP_DETECTION)
return()
endif()
find_python(2.7 "${MIN_VER_PYTHON2}" PYTHON2_LIBRARY PYTHON2_INCLUDE_DIR
PYTHON2INTERP_FOUND PYTHON2_EXECUTABLE PYTHON2_VERSION_STRING
PYTHON2_VERSION_MAJOR PYTHON2_VERSION_MINOR PYTHON2LIBS_FOUND

@ -4,32 +4,34 @@ Camera Calibration {#tutorial_py_calibration}
Goal
----
In this section,
- We will learn about distortions in camera, intrinsic and extrinsic parameters of camera etc.
- We will learn to find these parameters, undistort images etc.
In this section, we will learn about
* types of distortion caused by cameras
* how to find the intrinsic and extrinsic properties of a camera
* how to undistort images based on these properties
Basics
------
Today's cheap pinhole cameras introduces a lot of distortion to images. Two major distortions are
Some pinhole cameras introduce significant distortion to images. Two major kinds of distortion are
radial distortion and tangential distortion.
Due to radial distortion, straight lines will appear curved. Its effect is more as we move away from
the center of image. For example, one image is shown below, where two edges of a chess board are
marked with red lines. But you can see that border is not a straight line and doesn't match with the
Radial distortion causes straight lines to appear curved. Radial distortion becomes larger the farther points are from
the center of the image. For example, one image is shown below in which two edges of a chess board are
marked with red lines. But, you can see that the border of the chess board is not a straight line and doesn't match with the
red line. All the expected straight lines are bulged out. Visit [Distortion
(optics)](http://en.wikipedia.org/wiki/Distortion_%28optics%29) for more details.
![image](images/calib_radial.jpg)
This distortion is represented as follows:
Radial distortion can be represented as follows:
\f[x_{distorted} = x( 1 + k_1 r^2 + k_2 r^4 + k_3 r^6) \\
y_{distorted} = y( 1 + k_1 r^2 + k_2 r^4 + k_3 r^6)\f]
Similarly, another distortion is the tangential distortion which occurs because image taking lense
is not aligned perfectly parallel to the imaging plane. So some areas in image may look nearer than
expected. It is represented as below:
Similarly, tangential distortion occurs because the image-taking lens
is not aligned perfectly parallel to the imaging plane. So, some areas in the image may look nearer than
expected. The amount of tangential distortion can be represented as below:
\f[x_{distorted} = x + [ 2p_1xy + p_2(r^2+2x^2)] \\
y_{distorted} = y + [ p_1(r^2+ 2y^2)+ 2p_2xy]\f]
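As a quick numeric illustration of the two formulas above (the coefficient values below are arbitrary and used only for demonstration):
@code{.py}
# Apply the radial and tangential distortion model to one normalized point.
k1, k2, p1, p2, k3 = -0.28, 0.07, 0.001, -0.0005, 0.0   # made-up coefficients
x, y = 0.3, 0.2                                          # undistorted point
r2 = x * x + y * y
radial = 1 + k1 * r2 + k2 * r2**2 + k3 * r2**3
x_dist = x * radial + (2 * p1 * x * y + p2 * (r2 + 2 * x * x))
y_dist = y * radial + (p1 * (r2 + 2 * y * y) + 2 * p2 * x * y)
print(x_dist, y_dist)
@endcode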
@ -38,10 +40,9 @@ In short, we need to find five parameters, known as distortion coefficients give
\f[Distortion \; coefficients=(k_1 \hspace{10pt} k_2 \hspace{10pt} p_1 \hspace{10pt} p_2 \hspace{10pt} k_3)\f]
In addition to this, we need to find a few more information, like intrinsic and extrinsic parameters
of a camera. Intrinsic parameters are specific to a camera. It includes information like focal
length (\f$f_x,f_y\f$), optical centers (\f$c_x, c_y\f$) etc. It is also called camera matrix. It depends on
the camera only, so once calculated, it can be stored for future purposes. It is expressed as a 3x3
In addition to this, we need some other information, like the intrinsic and extrinsic parameters
of the camera. Intrinsic parameters are specific to a camera. They include information like focal
length (\f$f_x,f_y\f$) and optical centers (\f$c_x, c_y\f$). The focal length and optical centers can be used to create a camera matrix, which can be used to remove distortion due to the lenses of a specific camera. The camera matrix is unique to a specific camera, so once calculated, it can be reused on other images taken by the same camera. It is expressed as a 3x3
matrix:
\f[camera \; matrix = \left [ \begin{matrix} f_x & 0 & c_x \\ 0 & f_y & c_y \\ 0 & 0 & 1 \end{matrix} \right ]\f]
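For example, with made-up focal lengths and an arbitrary optical center (in pixels), such a camera matrix could look like this:
@code{.py}
import numpy as np
fx, fy, cx, cy = 800.0, 800.0, 320.0, 240.0   # illustrative values only
camera_matrix = np.array([[fx, 0.0, cx],
                          [0.0, fy, cy],
                          [0.0, 0.0, 1.0]], dtype=np.float32)
@endcode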
@ -49,20 +50,16 @@ matrix:
Extrinsic parameters correspond to rotation and translation vectors which translate the coordinates
of a 3D point to a coordinate system.
For stereo applications, these distortions need to be corrected first. To find all these parameters,
what we have to do is to provide some sample images of a well defined pattern (eg, chess board). We
find some specific points in it ( square corners in chess board). We know its coordinates in real
world space and we know its coordinates in image. With these data, some mathematical problem is
solved in background to get the distortion coefficients. That is the summary of the whole story. For
better results, we need atleast 10 test patterns.
For stereo applications, these distortions need to be corrected first. To find these parameters,
we must provide some sample images of a well defined pattern (e.g. a chess board). We
find some specific points of which we already know the relative positions (e.g. square corners in the chess board). We know the coordinates of these points in real world space and we know the coordinates in the image, so we can solve for the distortion coefficients. For better results, we need at least 10 test patterns.
Code
----
As mentioned above, we need atleast 10 test patterns for camera calibration. OpenCV comes with some
images of chess board (see samples/cpp/left01.jpg -- left14.jpg), so we will utilize it. For sake of
understanding, consider just one image of a chess board. Important input datas needed for camera
calibration is a set of 3D real world points and its corresponding 2D image points. 2D image points
As mentioned above, we need at least 10 test patterns for camera calibration. OpenCV comes with some
images of a chess board (see samples/data/left01.jpg -- left14.jpg), so we will utilize these. Consider an image of a chess board. The important input data needed for calibration of the camera
is the set of 3D real world points and the corresponding 2D coordinates of these points in the image. 2D image points
are easy to find in the image. (These image points are locations where two black
squares touch each other on the chess board.)
@ -72,7 +69,7 @@ values. But for simplicity, we can say chess board was kept stationary at XY pla
and camera was moved accordingly. This consideration helps us to find only X,Y values. Now for X,Y
values, we can simply pass the points as (0,0), (1,0), (2,0), ... which denotes the location of
points. In this case, the results we get will be in the scale of size of chess board square. But if
we know the square size, (say 30 mm), and we can pass the values as (0,0),(30,0),(60,0),..., we get
we know the square size, (say 30 mm), we can pass the values as (0,0), (30,0), (60,0), ... . Thus, we get
the results in mm. (In this case, we don't know square size since we didn't take those images, so we
pass in terms of square size).
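For instance, a 7x6 grid of such object points (in units of one square, with Z fixed at 0) can be generated as shown below; multiply by the square size (e.g. 30 mm) to work in millimetres:
@code{.py}
import numpy as np
# (X, Y, 0) coordinates for a 7x6 corner grid, one unit per square.
objp = np.zeros((7 * 6, 3), np.float32)
objp[:, :2] = np.mgrid[0:7, 0:6].T.reshape(-1, 2)
@endcode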
@ -80,23 +77,22 @@ pass in terms of square size).
### Setup
So to find pattern in chess board, we use the function, **cv.findChessboardCorners()**. We also
need to pass what kind of pattern we are looking, like 8x8 grid, 5x5 grid etc. In this example, we
So to find the pattern in the chess board, we can use the function **cv.findChessboardCorners()**. We also
need to pass what kind of pattern we are looking for, like an 8x8 grid, 5x5 grid etc. In this example, we
use a 7x6 grid. (Normally a chess board has 8x8 squares and 7x7 internal corners.) It returns the
corner points and retval which will be True if pattern is obtained. These corners will be placed in
an order (from left-to-right, top-to-bottom)
@sa This function may not be able to find the required pattern in all the images. So one good option
@sa This function may not be able to find the required pattern in all the images. So, one good option
is to write the code such that it starts the camera and checks each frame for the required pattern. Once
pattern is obtained, find the corners and store it in a list. Also provides some interval before
the pattern is obtained, find the corners and store it in a list. Also, provide some interval before
reading the next frame so that we can adjust our chess board in a different direction. Continue this
process until required number of good patterns are obtained. Even in the example provided here, we
are not sure out of 14 images given, how many are good. So we read all the images and take the good
process until the required number of good patterns are obtained. Even in the example provided here, we
are not sure how many images out of the 14 given are good. Thus, we must read all the images and take only the good
ones.
@sa Instead of chess board, we can use some circular grid, but then use the function
**cv.findCirclesGrid()** to find the pattern. It is said that less number of images are enough when
using circular grid.
@sa Instead of chess board, we can alternatively use a circular grid. In this case, we must use the function
**cv.findCirclesGrid()** to find the pattern. Fewer images are sufficient to perform camera calibration using a circular grid.
Once we find the corners, we can increase their accuracy using **cv.cornerSubPix()**. We can also
draw the pattern using **cv.drawChessboardCorners()**. All these steps are included in the code below:
@ -146,22 +142,23 @@ One image with pattern drawn on it is shown below:
### Calibration
So now we have our object points and image points we are ready to go for calibration. For that we
use the function, **cv.calibrateCamera()**. It returns the camera matrix, distortion coefficients,
Now that we have our object points and image points, we are ready to go for calibration. We can
use the function, **cv.calibrateCamera()** which returns the camera matrix, distortion coefficients,
rotation and translation vectors etc.
@code{.py}
ret, mtx, dist, rvecs, tvecs = cv.calibrateCamera(objpoints, imgpoints, gray.shape[::-1], None, None)
@endcode
### Undistortion
We have got what we were trying. Now we can take an image and undistort it. OpenCV comes with two
methods, we will see both. But before that, we can refine the camera matrix based on a free scaling
Now, we can take an image and undistort it. OpenCV comes with two
methods for doing this. However, before doing that, we can refine the camera matrix based on a free scaling
parameter using **cv.getOptimalNewCameraMatrix()**. If the scaling parameter alpha=0, it returns
undistorted image with minimum unwanted pixels. So it may even remove some pixels at image corners.
If alpha=1, all pixels are retained with some extra black images. It also returns an image ROI which
If alpha=1, all pixels are retained, along with some extra black pixels. This function also returns an image ROI which
can be used to crop the result.
So we take a new image (left12.jpg in this case. That is the first image in this chapter)
So, we take a new image (left12.jpg in this case. That is the first image in this chapter)
@code{.py}
img = cv.imread('left12.jpg')
h, w = img.shape[:2]
@ -169,7 +166,7 @@ newcameramtx, roi = cv.getOptimalNewCameraMatrix(mtx, dist, (w,h), 1, (w,h))
@endcode
#### 1. Using **cv.undistort()**
This is the shortest path. Just call the function and use ROI obtained above to crop the result.
This is the easiest way. Just call the function and use the ROI obtained above to crop the result.
@code{.py}
# undistort
dst = cv.undistort(img, mtx, dist, None, newcameramtx)
@ -181,7 +178,7 @@ cv.imwrite('calibresult.png', dst)
@endcode
#### 2. Using **remapping**
This is curved path. First find a mapping function from distorted image to undistorted image. Then
This way is a little bit more difficult. First, find a mapping function from the distorted image to the undistorted image. Then
use the remap function.
@code{.py}
# undistort
@ -193,23 +190,22 @@ x, y, w, h = roi
dst = dst[y:y+h, x:x+w]
cv.imwrite('calibresult.png', dst)
@endcode
Both the methods give the same result. See the result below:
Still, both the methods give the same result. See the result below:
![image](images/calib_result.jpg)
You can see in the result that all the edges are straight.
Now you can store the camera matrix and distortion coefficients using write functions in Numpy
Now you can store the camera matrix and distortion coefficients using write functions in NumPy
(np.savez, np.savetxt etc) for future uses.
Re-projection Error
-------------------
Re-projection error gives a good estimation of just how exact is the found parameters. This should
be as close to zero as possible. Given the intrinsic, distortion, rotation and translation matrices,
we first transform the object point to image point using **cv.projectPoints()**. Then we calculate
Re-projection error gives a good estimation of just how exact the found parameters are. The closer the re-projection error is to zero, the more accurate the parameters we found are. Given the intrinsic, distortion, rotation and translation matrices,
we must first transform the object point to image point using **cv.projectPoints()**. Then, we can calculate
the absolute norm between what we got with our transformation and the corner finding algorithm. To
find the average error we calculate the arithmetical mean of the errors calculate for all the
find the average error, we calculate the arithmetical mean of the errors calculated for all the
calibration images.
@code{.py}
mean_error = 0

@ -126,9 +126,9 @@ Result looks like below:
Additional Resources
--------------------
-# Video Lecture on [Face Detection and Tracking](http://www.youtube.com/watch?v=WfdYYNamHZ8)
2. An interesting interview regarding Face Detection by [Adam
Harvey](http://www.makematics.com/research/viola-jones/)
-# Video Lecture on [Face Detection and Tracking](https://www.youtube.com/watch?v=WfdYYNamHZ8)
-# An interesting interview regarding Face Detection by [Adam
Harvey](https://web.archive.org/web/20171204220159/http://www.makematics.com/research/viola-jones/)
Exercises
---------

@ -27,7 +27,7 @@ merged, it has to be converted back to 8-bit to view it on usual displays. This
tonemapping. Additional complexities arise when objects of the scene or camera move between shots,
since images with different exposures should be registered and aligned.
In this tutorial we show 2 algorithms (Debvec, Robertson) to generate and display HDR image from an
In this tutorial we show 2 algorithms (Debevec, Robertson) to generate and display an HDR image from an
exposure sequence, and demonstrate an alternative approach called exposure fusion (Mertens), that
produces a low dynamic range image and does not need the exposure times data.
Furthermore, we estimate the camera response function (CRF) which is of great value for many computer
@ -65,14 +65,14 @@ exposure_times = np.array([15.0, 2.5, 0.25, 0.0333], dtype=np.float32)
### 2. Merge exposures into HDR image
In this stage we merge the exposure sequence into one HDR image, showing 2 possibilities
which we have in OpenCV. The first method is Debvec and the second one is Robertson.
which we have in OpenCV. The first method is Debevec and the second one is Robertson.
Notice that the HDR image is of type float32, and not uint8, as it contains the
full dynamic range of all exposure images.
@code{.py}
# Merge exposures to HDR image
merge_debvec = cv.createMergeDebevec()
hdr_debvec = merge_debvec.process(img_list, times=exposure_times.copy())
merge_debevec = cv.createMergeDebevec()
hdr_debevec = merge_debevec.process(img_list, times=exposure_times.copy())
merge_robertson = cv.createMergeRobertson()
hdr_robertson = merge_robertson.process(img_list, times=exposure_times.copy())
@endcode
@ -86,7 +86,7 @@ we will later have to clip the data in order to avoid overflow.
@code{.py}
# Tonemap HDR image
tonemap1 = cv.createTonemapDurand(gamma=2.2)
res_debvec = tonemap1.process(hdr_debvec.copy())
res_debevec = tonemap1.process(hdr_debevec.copy())
tonemap2 = cv.createTonemapDurand(gamma=1.3)
res_robertson = tonemap2.process(hdr_robertson.copy())
@endcode
@ -111,11 +111,11 @@ integers in the range of [0..255].
@code{.py}
# Convert datatype to 8-bit and save
res_debvec_8bit = np.clip(res_debvec*255, 0, 255).astype('uint8')
res_debevec_8bit = np.clip(res_debevec*255, 0, 255).astype('uint8')
res_robertson_8bit = np.clip(res_robertson*255, 0, 255).astype('uint8')
res_mertens_8bit = np.clip(res_mertens*255, 0, 255).astype('uint8')
cv.imwrite("ldr_debvec.jpg", res_debvec_8bit)
cv.imwrite("ldr_debevec.jpg", res_debevec_8bit)
cv.imwrite("ldr_robertson.jpg", res_robertson_8bit)
cv.imwrite("fusion_mertens.jpg", res_mertens_8bit)
@endcode
You can see the different results but consider that each algorithm has additional
extra parameters that you should fit to get your desired outcome. Best practice is
to try the different methods and see which one performs best for your scene.
### Debvec:
### Debevec:
![image](images/ldr_debvec.jpg)
![image](images/ldr_debevec.jpg)
### Robertson:
@ -150,9 +150,9 @@ function and use it for the HDR merge.
@code{.py}
# Estimate camera response function (CRF)
cal_debvec = cv.createCalibrateDebevec()
crf_debvec = cal_debvec.process(img_list, times=exposure_times)
hdr_debvec = merge_debvec.process(img_list, times=exposure_times.copy(), response=crf_debvec.copy())
cal_debevec = cv.createCalibrateDebevec()
crf_debevec = cal_debevec.process(img_list, times=exposure_times)
hdr_debevec = merge_debevec.process(img_list, times=exposure_times.copy(), response=crf_debevec.copy())
cal_robertson = cv.createCalibrateRobertson()
crf_robertson = cal_robertson.process(img_list, times=exposure_times)
hdr_robertson = merge_robertson.process(img_list, times=exposure_times.copy(), response=crf_robertson.copy())
@ -166,12 +166,12 @@ For this sequence we got the following estimation:
Additional Resources
--------------------
1. Paul E Debevec and Jitendra Malik. Recovering high dynamic range radiance maps from photographs. In ACM SIGGRAPH 2008 classes, page 31. ACM, 2008.
2. Mark A Robertson, Sean Borman, and Robert L Stevenson. Dynamic range improvement through multiple exposures. In Image Processing, 1999. ICIP 99. Proceedings. 1999 International Conference on, volume 3, pages 159–163. IEEE, 1999.
3. Tom Mertens, Jan Kautz, and Frank Van Reeth. Exposure fusion. In Computer Graphics and Applications, 2007. PG'07. 15th Pacific Conference on, pages 382–390. IEEE, 2007.
1. Paul E Debevec and Jitendra Malik. Recovering high dynamic range radiance maps from photographs. In ACM SIGGRAPH 2008 classes, page 31. ACM, 2008. @cite DM97
2. Mark A Robertson, Sean Borman, and Robert L Stevenson. Dynamic range improvement through multiple exposures. In Image Processing, 1999. ICIP 99. Proceedings. 1999 International Conference on, volume 3, pages 159–163. IEEE, 1999. @cite RB99
3. Tom Mertens, Jan Kautz, and Frank Van Reeth. Exposure fusion. In Computer Graphics and Applications, 2007. PG'07. 15th Pacific Conference on, pages 382–390. IEEE, 2007. @cite MK07
4. Images from [Wikipedia-HDR](https://en.wikipedia.org/wiki/High-dynamic-range_imaging)
Exercises
---------
1. Try all tonemap algorithms: [Drago](http://docs.opencv.org/master/da/d53/classcv_1_1TonemapDrago.html), [Durand](http://docs.opencv.org/master/da/d3d/classcv_1_1TonemapDurand.html), [Mantiuk](http://docs.opencv.org/master/de/d76/classcv_1_1TonemapMantiuk.html) and [Reinhard](http://docs.opencv.org/master/d0/dec/classcv_1_1TonemapReinhard.html).
2. Try changing the parameters in the HDR calibration and tonemap methods.
1. Try all tonemap algorithms: cv::TonemapDrago, cv::TonemapDurand, cv::TonemapMantiuk and cv::TonemapReinhard
2. Try changing the parameters in the HDR calibration and tonemap methods.

@ -15,55 +15,167 @@ Theory
Code
----
@add_toggle_cpp
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp)
@include samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp
@end_toggle
@add_toggle_java
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ShapeDescriptors/bounding_rects_circles/GeneralContoursDemo1.java)
@include samples/java/tutorial_code/ShapeDescriptors/bounding_rects_circles/GeneralContoursDemo1.java
@end_toggle
@add_toggle_python
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py)
@include samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py
@end_toggle
Explanation
-----------
The main function is rather simple. As the comments indicate, we do the following:
-# Open the image, convert it into grayscale and blur it to get rid of the noise.
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp setup
-# Create a window with header "Source" and display the source file in it.
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp createWindow
-# Create a trackbar on the source_window and assign a callback function to it
- Open the image, convert it into grayscale and blur it to get rid of the noise.
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp setup
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ShapeDescriptors/bounding_rects_circles/GeneralContoursDemo1.java setup
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py setup
@end_toggle
- Create a window with header "Source" and display the source file in it.
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp createWindow
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ShapeDescriptors/bounding_rects_circles/GeneralContoursDemo1.java createWindow
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py createWindow
@end_toggle
- Create a trackbar on the `source_window` and assign a callback function to it.
In general, callback functions are used to react to some kind of signal; in our
case it is the trackbar's state change.
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp taskbar
-# Explicit one-time call of `thresh_callback` is necessary to display
Explicit one-time call of `thresh_callback` is necessary to display
the "Contours" window simultaniously with the "Source" window.
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp callback00
-# Wait for user to close the windows.
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp waitForIt
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp trackbar
@end_toggle
The callback function `thresh_callback` does all the interesting job.
@add_toggle_java
@snippet samples/java/tutorial_code/ShapeDescriptors/bounding_rects_circles/GeneralContoursDemo1.java trackbar
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py trackbar
@end_toggle
-# Writes to `threshold_output` the threshold of the grayscale picture (you can check out about thresholding @ref tutorial_threshold "here").
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp threshold
-# Finds contours and saves them to the vectors `contour` and `hierarchy`.
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp findContours
-# For every found contour we now apply approximation to polygons
with accuracy +-3 and stating that the curve must me closed.
The callback function does all the interesting work.
After that we find a bounding rect for every polygon and save it to `boundRect`.
- Use @ref cv::Canny to detect edges in the images.
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp Canny
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ShapeDescriptors/bounding_rects_circles/GeneralContoursDemo1.java Canny
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py Canny
@end_toggle
- Finds contours and saves them to the vectors `contour` and `hierarchy`.
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp findContours
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ShapeDescriptors/bounding_rects_circles/GeneralContoursDemo1.java findContours
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py findContours
@end_toggle
- For every found contour we now apply approximation to polygons
with accuracy +-3 and stating that the curve must be closed.
After that we find a bounding rect for every polygon and save it to `boundRect`.
At last we find a minimum enclosing circle for every polygon and
save it to `center` and `radius` vectors.
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp allthework
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp allthework
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ShapeDescriptors/bounding_rects_circles/GeneralContoursDemo1.java allthework
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py allthework
@end_toggle
We found everything we need; all we have to do now is draw.
-# Create new Mat of unsigned 8-bit chars, filled with zeros.
- Create new Mat of unsigned 8-bit chars, filled with zeros.
It will contain all the drawings we are going to make (rects and circles).
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp zeroMat
-# For every contour: pick a random color, draw the contour, the bounding rectangle and
the minimal enclosing circle with it,
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp forContour
-# Display the results: create a new window "Contours" and show everything we added to drawings on it.
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp showDrawings
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp zeroMat
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ShapeDescriptors/bounding_rects_circles/GeneralContoursDemo1.java zeroMat
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py zeroMat
@end_toggle
- For every contour: pick a random color, draw the contour, the bounding rectangle and
the minimal enclosing circle with it.
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp forContour
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ShapeDescriptors/bounding_rects_circles/GeneralContoursDemo1.java forContour
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py forContour
@end_toggle
- Display the results: create a new window "Contours" and show everything we added to drawings on it.
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo1.cpp showDrawings
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/ShapeDescriptors/bounding_rects_circles/GeneralContoursDemo1.java showDrawings
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/ShapeDescriptors/bounding_rects_circles/generalContours_demo1.py showDrawings
@end_toggle
Result
------

@ -15,9 +15,23 @@ Theory
Code
----
@add_toggle_cpp
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo2.cpp)
@include samples/cpp/tutorial_code/ShapeDescriptors/generalContours_demo2.cpp
@end_toggle
@add_toggle_java
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ShapeDescriptors/bounding_rotated_ellipses/GeneralContoursDemo2.java)
@include samples/java/tutorial_code/ShapeDescriptors/bounding_rotated_ellipses/GeneralContoursDemo2.java
@end_toggle
@add_toggle_python
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/ShapeDescriptors/bounding_rotated_ellipses/generalContours_demo2.py)
@include samples/python/tutorial_code/ShapeDescriptors/bounding_rotated_ellipses/generalContours_demo2.py
@end_toggle
Explanation
-----------

@ -15,9 +15,23 @@ Theory
Code
----
@add_toggle_cpp
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ShapeDescriptors/findContours_demo.cpp)
@include samples/cpp/tutorial_code/ShapeDescriptors/findContours_demo.cpp
@end_toggle
@add_toggle_java
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ShapeDescriptors/find_contours/FindContoursDemo.java)
@include samples/java/tutorial_code/ShapeDescriptors/find_contours/FindContoursDemo.java
@end_toggle
@add_toggle_python
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/ShapeDescriptors/find_contours/findContours_demo.py)
@include samples/python/tutorial_code/ShapeDescriptors/find_contours/findContours_demo.py
@end_toggle
Explanation
-----------

@ -14,10 +14,23 @@ Theory
Code
----
@add_toggle_cpp
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ShapeDescriptors/hull_demo.cpp)
@include samples/cpp/tutorial_code/ShapeDescriptors/hull_demo.cpp
@end_toggle
@add_toggle_java
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ShapeDescriptors/hull/HullDemo.java)
@include samples/java/tutorial_code/ShapeDescriptors/hull/HullDemo.java
@end_toggle
@add_toggle_python
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/ShapeDescriptors/hull/hull_demo.py)
@include samples/python/tutorial_code/ShapeDescriptors/hull/hull_demo.py
@end_toggle
Explanation
-----------

@ -16,9 +16,23 @@ Theory
Code
----
@add_toggle_cpp
This tutorial code is shown in the lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ShapeDescriptors/moments_demo.cpp)
@include samples/cpp/tutorial_code/ShapeDescriptors/moments_demo.cpp
@end_toggle
@add_toggle_java
The tutorial code is shown below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ShapeDescriptors/moments/MomentsDemo.java)
@include samples/java/tutorial_code/ShapeDescriptors/moments/MomentsDemo.java
@end_toggle
@add_toggle_python
The tutorial code is shown below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/ShapeDescriptors/moments/moments_demo.py)
@include samples/python/tutorial_code/ShapeDescriptors/moments/moments_demo.py
@end_toggle
Explanation
-----------

@ -14,9 +14,23 @@ Theory
Code
----
@add_toggle_cpp
The tutorial code is shown below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ShapeDescriptors/pointPolygonTest_demo.cpp)
@include samples/cpp/tutorial_code/ShapeDescriptors/pointPolygonTest_demo.cpp
@end_toggle
@add_toggle_java
The tutorial code is shown below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ShapeDescriptors/point_polygon_test/PointPolygonTestDemo.java)
@include samples/java/tutorial_code/ShapeDescriptors/point_polygon_test/PointPolygonTestDemo.java
@end_toggle
@add_toggle_python
The tutorial code is shown below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/ShapeDescriptors/point_polygon_test/pointPolygonTest_demo.py)
@include samples/python/tutorial_code/ShapeDescriptors/point_polygon_test/pointPolygonTest_demo.py
@end_toggle
Explanation
-----------

@ -225,6 +225,8 @@ In this section you will learn about the image processing (manipulation) functio
- @subpage tutorial_find_contours
*Languages:* C++, Java, Python
*Compatibility:* \> OpenCV 2.0
*Author:* Ana Huamán
@ -233,6 +235,8 @@ In this section you will learn about the image processing (manipulation) functio
- @subpage tutorial_hull
*Languages:* C++, Java, Python
*Compatibility:* \> OpenCV 2.0
*Author:* Ana Huamán
@ -241,6 +245,8 @@ In this section you will learn about the image processing (manipulation) functio
- @subpage tutorial_bounding_rects_circles
*Languages:* C++, Java, Python
*Compatibility:* \> OpenCV 2.0
*Author:* Ana Huamán
@ -249,6 +255,8 @@ In this section you will learn about the image processing (manipulation) functio
- @subpage tutorial_bounding_rotated_ellipses
*Languages:* C++, Java, Python
*Compatibility:* \> OpenCV 2.0
*Author:* Ana Huamán
@ -257,6 +265,8 @@ In this section you will learn about the image processing (manipulation) functio
- @subpage tutorial_moments
*Languages:* C++, Java, Python
*Compatibility:* \> OpenCV 2.0
*Author:* Ana Huamán
@ -265,6 +275,8 @@ In this section you will learn about the image processing (manipulation) functio
- @subpage tutorial_point_polygon_test
*Languages:* C++, Java, Python
*Compatibility:* \> OpenCV 2.0
*Author:* Ana Huamán

@ -86,3 +86,16 @@ When you run the code you should see 3x3 identity matrix as output.
That is it: whenever you start a new project, just add the OpenCV user library that you have defined
to your project and you are good to go. Enjoy your powerful, less painful development environment :)
Running Java code with OpenCV and MKL dependency
------------------------------------------------
You may get the following error (e.g. on Ubuntu) when running Java code that calls OpenCV functions
which use Intel MKL, if OpenCV was built with the MKL library:
> Intel MKL FATAL ERROR: Cannot load libmkl_avx2.so or libmkl_def.so.
One way to solve this on Linux is to preload the Intel MKL libraries (either run the command in a terminal or add it to your `.bashrc` file).
Your command should look similar to the following (prepend `$LD_PRELOAD:` if you have already set the `LD_PRELOAD` variable):
> export LD_PRELOAD=/opt/intel/mkl/lib/intel64/libmkl_core.so:/opt/intel/mkl/lib/intel64/libmkl_sequential.so
Then run the Eclipse IDE from a terminal that has this environment variable set (check with `echo $LD_PRELOAD`) and the error should disappear.

@ -17,9 +17,23 @@ Theory
Code
----
@add_toggle_cpp
The tutorial code is shown below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/objectDetection/objectDetection.cpp)
@include samples/cpp/tutorial_code/objectDetection/objectDetection.cpp
@end_toggle
@add_toggle_java
The tutorial code is shown below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/objectDetection/cascade_classifier/ObjectDetectionDemo.java)
@include samples/java/tutorial_code/objectDetection/cascade_classifier/ObjectDetectionDemo.java
@end_toggle
@add_toggle_python
The tutorial code is shown below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/objectDetection/cascade_classifier/objectDetection.py)
@include samples/python/tutorial_code/objectDetection/cascade_classifier/objectDetection.py
@end_toggle
Explanation
-----------
@ -40,3 +54,13 @@ Result
detection. For the eyes we keep using the file used in the tutorial.
![](images/Cascade_Classifier_Tutorial_Result_LBP.jpg)
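As a rough illustration of how the two cascades fit together (a hedged sketch, not the tutorial sample: the cascade file names are assumed to come from the `opencv/data` folder and the input image path is made up), the detection loop in Python might look like this:
@code{.py}
import cv2 as cv

# Assumed paths: cascade files from the opencv/data folder, any test photo as input.
face_cascade = cv.CascadeClassifier('lbpcascade_frontalface.xml')
eyes_cascade = cv.CascadeClassifier('haarcascade_eye_tree_eyeglasses.xml')

img = cv.imread('people.jpg')
gray = cv.equalizeHist(cv.cvtColor(img, cv.COLOR_BGR2GRAY))

# Detect faces on the whole frame, then search for eyes inside each face region.
for (x, y, w, h) in face_cascade.detectMultiScale(gray):
    cv.ellipse(img, (x + w // 2, y + h // 2), (w // 2, h // 2), 0, 0, 360, (255, 0, 255), 4)
    face_roi = gray[y:y + h, x:x + w]
    for (ex, ey, ew, eh) in eyes_cascade.detectMultiScale(face_roi):
        cv.circle(img, (x + ex + ew // 2, y + ey + eh // 2), (ew + eh) // 4, (255, 0, 0), 4)

cv.imshow('Capture - Face detection', img)
cv.waitKey(0)
@endcode
Swapping `lbpcascade_frontalface.xml` for a Haar face model is the only change needed to compare the two detectors.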
Additional Resources
--------------------
-# Paul Viola and Michael J. Jones. Robust real-time face detection. International Journal of Computer Vision, 57(2):137–154, 2004. @cite Viola04
-# Rainer Lienhart and Jochen Maydt. An extended set of haar-like features for rapid object detection. In Image Processing. 2002. Proceedings. 2002 International Conference on, volume 1, pages I–900. IEEE, 2002. @cite Lienhart02
-# Video Lecture on [Face Detection and Tracking](https://www.youtube.com/watch?v=WfdYYNamHZ8)
-# An interesting interview regarding Face Detection by [Adam
Harvey](https://web.archive.org/web/20171204220159/http://www.makematics.com/research/viola-jones/)
-# [OpenCV Face Detection: Visualized](https://vimeo.com/12774628) on Vimeo by Adam Harvey

@ -5,6 +5,8 @@ Ever wondered how your digital camera detects peoples and faces? Look here to fi
- @subpage tutorial_cascade_classifier
*Languages:* C++, Java, Python
*Compatibility:* \> OpenCV 2.0
*Author:* Ana Huamán

@ -31,21 +31,51 @@ Exposure sequence
Source Code
-----------
@include cpp/tutorial_code/photo/hdr_imaging/hdr_imaging.cpp
@add_toggle_cpp
The tutorial code is shown below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/photo/hdr_imaging/hdr_imaging.cpp)
@include samples/cpp/tutorial_code/photo/hdr_imaging/hdr_imaging.cpp
@end_toggle
@add_toggle_java
The tutorial code is shown below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/photo/hdr_imaging/HDRImagingDemo.java)
@include samples/java/tutorial_code/photo/hdr_imaging/HDRImagingDemo.java
@end_toggle
@add_toggle_python
The tutorial code is shown below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/photo/hdr_imaging/hdr_imaging.py)
@include samples/python/tutorial_code/photo/hdr_imaging/hdr_imaging.py
@end_toggle
Sample images
-------------
The data directory that contains the images, exposure times and the `list.txt` file can be downloaded from
[here](https://github.com/opencv/opencv_extra/tree/master/testdata/cv/hdr/exposures).
Explanation
-----------
-# **Load images and exposure times**
@code{.cpp}
vector<Mat> images;
vector<float> times;
loadExposureSeq(argv[1], images, times);
@endcode
First we load the input images and exposure times from a user-defined folder. The folder should
contain the images and a *list.txt* file that lists the file names and inverse exposure times.
- **Load images and exposure times**
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/photo/hdr_imaging/hdr_imaging.cpp Load images and exposure times
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/photo/hdr_imaging/HDRImagingDemo.java Load images and exposure times
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/photo/hdr_imaging/hdr_imaging.py Load images and exposure times
@end_toggle
First we load the input images and exposure times from a user-defined folder. The folder should
contain the images and a *list.txt* file that lists the file names and inverse exposure times.
For our image sequence the list is as follows:
@code{.none}
memorial00.png 0.03125
memorial01.png 0.0625
@ -53,53 +83,96 @@ Explanation
memorial15.png 1024
@endcode
-# **Estimate camera response**
@code{.cpp}
Mat response;
Ptr<CalibrateDebevec> calibrate = createCalibrateDebevec();
calibrate->process(images, response, times);
@endcode
Many HDR construction algorithms require knowledge of the camera response function (CRF).
We use one of the calibration algorithms to estimate the inverse CRF for all 256 pixel values.
-# **Make HDR image**
@code{.cpp}
Mat hdr;
Ptr<MergeDebevec> merge_debevec = createMergeDebevec();
merge_debevec->process(images, hdr, times, response);
@endcode
- **Estimate camera response**
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/photo/hdr_imaging/hdr_imaging.cpp Estimate camera response
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/photo/hdr_imaging/HDRImagingDemo.java Estimate camera response
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/photo/hdr_imaging/hdr_imaging.py Estimate camera response
@end_toggle
Many HDR construction algorithms require knowledge of the camera response function (CRF).
We use one of the calibration algorithms to estimate the inverse CRF for all 256 pixel values.
- **Make HDR image**
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/photo/hdr_imaging/hdr_imaging.cpp Make HDR image
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/photo/hdr_imaging/HDRImagingDemo.java Make HDR image
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/photo/hdr_imaging/hdr_imaging.py Make HDR image
@end_toggle
We use Debevec's weighting scheme to construct the HDR image, using the response calculated in the
previous step.
-# **Tonemap HDR image**
@code{.cpp}
Mat ldr;
Ptr<TonemapDurand> tonemap = createTonemapDurand(2.2f);
tonemap->process(hdr, ldr);
@endcode
Since we want to see our results on a common LDR display, we have to map the HDR image to an 8-bit
range while preserving most details. That is the main goal of tonemapping methods. We use a tonemapper
with bilateral filtering and set 2.2 as the gamma correction value.
-# **Perform exposure fusion**
@code{.cpp}
Mat fusion;
Ptr<MergeMertens> merge_mertens = createMergeMertens();
merge_mertens->process(images, fusion);
@endcode
There is an alternative way to merge our exposures when we don't need an HDR image. This
process is called exposure fusion and produces an LDR image that doesn't require gamma correction. It
also doesn't use the exposure values of the photographs.
-# **Write results**
@code{.cpp}
imwrite("fusion.png", fusion * 255);
imwrite("ldr.png", ldr * 255);
imwrite("hdr.hdr", hdr);
@endcode
Now it's time to look at the results. Note that an HDR image can't be stored in one of the common image
formats, so we save it as a Radiance image (.hdr). Also, all HDR imaging functions return results in the
[0, 1] range, so we should multiply the result by 255.
- **Tonemap HDR image**
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/photo/hdr_imaging/hdr_imaging.cpp Tonemap HDR image
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/photo/hdr_imaging/HDRImagingDemo.java Tonemap HDR image
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/photo/hdr_imaging/hdr_imaging.py Tonemap HDR image
@end_toggle
Since we want to see our results on a common LDR display, we have to map the HDR image to an 8-bit
range while preserving most details. That is the main goal of tonemapping methods. We use a tonemapper
with bilateral filtering and set 2.2 as the gamma correction value.
- **Perform exposure fusion**
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/photo/hdr_imaging/hdr_imaging.cpp Perform exposure fusion
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/photo/hdr_imaging/HDRImagingDemo.java Perform exposure fusion
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/photo/hdr_imaging/hdr_imaging.py Perform exposure fusion
@end_toggle
There is an alternative way to merge our exposures when we don't need an HDR image. This
process is called exposure fusion and produces an LDR image that doesn't require gamma correction. It
also doesn't use the exposure values of the photographs.
- **Write results**
@add_toggle_cpp
@snippet samples/cpp/tutorial_code/photo/hdr_imaging/hdr_imaging.cpp Write results
@end_toggle
@add_toggle_java
@snippet samples/java/tutorial_code/photo/hdr_imaging/HDRImagingDemo.java Write results
@end_toggle
@add_toggle_python
@snippet samples/python/tutorial_code/photo/hdr_imaging/hdr_imaging.py Write results
@end_toggle
Now it's time to look at the results. Note that an HDR image can't be stored in one of the common image
formats, so we save it as a Radiance image (.hdr). Also, all HDR imaging functions return results in the
[0, 1] range, so we should multiply the result by 255.
You can try other tonemap algorithms: cv::TonemapDrago, cv::TonemapDurand, cv::TonemapMantiuk and cv::TonemapReinhard.
You can also adjust the parameters in the HDR calibration and tonemap methods for your own photos; a minimal end-to-end sketch of the steps above is shown below.
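For orientation, here is a rough end-to-end Python sketch of the steps above. It is not the tutorial sample itself: the `exposures/` directory name and the `load_exposure_seq` helper are assumptions, and cv::TonemapDurand may live in opencv_contrib (xphoto) in newer builds.
@code{.py}
import os
import cv2 as cv
import numpy as np

def load_exposure_seq(path):
    # list.txt stores "<file name> <inverse exposure time>" per line.
    images, times = [], []
    with open(os.path.join(path, 'list.txt')) as f:
        for line in f:
            name, inv_time = line.split()
            images.append(cv.imread(os.path.join(path, name)))
            times.append(1.0 / float(inv_time))
    return images, np.asarray(times, dtype=np.float32)

images, times = load_exposure_seq('exposures')

# Estimate the inverse camera response function (CRF).
calibrate = cv.createCalibrateDebevec()
response = calibrate.process(images, times)

# Merge the exposures into one HDR image using the estimated response.
merge_debevec = cv.createMergeDebevec()
hdr = merge_debevec.process(images, times, response)

# Tonemap the HDR image down to a displayable 8-bit range.
tonemap = cv.createTonemapDurand(2.2)
ldr = tonemap.process(hdr)

# Exposure fusion: an LDR result that needs neither the CRF nor the exposure times.
merge_mertens = cv.createMergeMertens()
fusion = merge_mertens.process(images)

# Results are in the [0, 1] range, so scale to [0, 255] before saving.
cv.imwrite('fusion.png', np.clip(fusion * 255, 0, 255).astype('uint8'))
cv.imwrite('ldr.png', np.clip(ldr * 255, 0, 255).astype('uint8'))
cv.imwrite('hdr.hdr', hdr)
@endcode
Replacing `createTonemapDurand` with, for example, `createTonemapReinhard` is enough to compare tonemapping operators on the same HDR image.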
Results
-------
@ -111,3 +184,12 @@ Results
### Exposure fusion
![](images/fusion.png)
Additional Resources
--------------------
1. Paul E Debevec and Jitendra Malik. Recovering high dynamic range radiance maps from photographs. In ACM SIGGRAPH 2008 classes, page 31. ACM, 2008. @cite DM97
2. Mark A Robertson, Sean Borman, and Robert L Stevenson. Dynamic range improvement through multiple exposures. In Image Processing, 1999. ICIP 99. Proceedings. 1999 International Conference on, volume 3, pages 159–163. IEEE, 1999. @cite RB99
3. Tom Mertens, Jan Kautz, and Frank Van Reeth. Exposure fusion. In Computer Graphics and Applications, 2007. PG'07. 15th Pacific Conference on, pages 382–390. IEEE, 2007. @cite MK07
4. [Wikipedia-HDR](https://en.wikipedia.org/wiki/High-dynamic-range_imaging)
5. [Recovering High Dynamic Range Radiance Maps from Photographs (webpage)](http://www.pauldebevec.com/Research/HDR/)

@ -5,6 +5,8 @@ Use OpenCV for advanced photo processing.
- @subpage tutorial_hdr_imaging
*Languages:* C++, Java, Python
*Compatibility:* \> OpenCV 3.0
*Author:* Fedor Morozov

@ -947,12 +947,16 @@ public class CoreTest extends OpenCVTestCase {
}
public void testMahalanobis() {
Mat src = new Mat(matSize, matSize, CvType.CV_32F);
Core.randu(src, -128, 128);
Mat covar = new Mat(matSize, matSize, CvType.CV_32F);
Mat mean = new Mat(1, matSize, CvType.CV_32F);
Core.calcCovarMatrix(grayRnd_32f, covar, mean, Core.COVAR_ROWS | Core.COVAR_NORMAL, CvType.CV_32F);
Core.calcCovarMatrix(src, covar, mean, Core.COVAR_ROWS | Core.COVAR_NORMAL, CvType.CV_32F);
covar = covar.inv();
Mat line1 = grayRnd_32f.row(0);
Mat line2 = grayRnd_32f.row(1);
Mat line1 = src.row(0);
Mat line2 = src.row(1);
double d = Core.Mahalanobis(line1, line1, covar);

@ -463,9 +463,14 @@ static bool ipp_Mat_setTo_Mat(Mat &dst, Mat &_val, Mat &mask)
return false;
if (dst.depth() == CV_32F)
{
for (int i = 0; i < (int)(_val.total()); i++)
if (_val.at<double>(i) < iwTypeGetMin(ipp32f) || _val.at<double>(i) > iwTypeGetMax(ipp32f))
{
float v = (float)(_val.at<double>(i)); // cast to float
if (cvIsNaN(v) || cvIsInf(v)) // accept finite numbers only
return false;
}
}
if(dst.dims <= 2)
{

@ -21,7 +21,13 @@ namespace logging {
static LogLevel parseLogLevelConfiguration()
{
static cv::String param_log_level = utils::getConfigurationParameterString("OPENCV_LOG_LEVEL", "INFO");
static cv::String param_log_level = utils::getConfigurationParameterString("OPENCV_LOG_LEVEL",
#if defined NDEBUG
"WARNING"
#else
"INFO"
#endif
);
if (param_log_level == "DISABLED" || param_log_level == "disabled" ||
param_log_level == "0" || param_log_level == "OFF" || param_log_level == "off")
return LOG_LEVEL_SILENT;

@ -736,7 +736,6 @@ int64 getCPUTickCount(void)
int64 getCPUTickCount(void)
{
int64 result = 0;
unsigned upper, lower, tmp;
__asm__ volatile(
"0: \n"

@ -1608,6 +1608,32 @@ TEST(Mat, regression_7873_mat_vector_initialize)
ASSERT_EQ(2, sub_mat.size[2]);
}
TEST(Mat, regression_10507_mat_setTo)
{
Size sz(6, 4);
Mat test_mask(sz, CV_8UC1, cv::Scalar::all(255));
test_mask.at<uchar>(1,0) = 0;
test_mask.at<uchar>(0,1) = 0;
for (int cn = 1; cn <= 4; cn++)
{
cv::Mat A(sz, CV_MAKE_TYPE(CV_32F, cn), cv::Scalar::all(5));
A.setTo(cv::Scalar::all(std::numeric_limits<float>::quiet_NaN()), test_mask);
int nans = 0;
for (int y = 0; y < A.rows; y++)
{
for (int x = 0; x < A.cols; x++)
{
for (int c = 0; c < cn; c++)
{
float v = A.ptr<float>(y, x)[c];
nans += (v == v) ? 0 : 1;
}
}
}
EXPECT_EQ(nans, cn * (sz.area() - 2)) << "A=" << A << std::endl << "mask=" << test_mask << std::endl;
}
}
TEST(Core_Mat_array, outputArray_create_getMat)
{
cv::Mat_<uchar> src_base(5, 1);

@ -565,14 +565,14 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
};
/**
* @brief Resize input 4-dimensional blob by nearest neighbor strategy.
* @brief Resize input 4-dimensional blob by nearest neighbor or bilinear strategy.
*
* Layer is used to support TensorFlow's resize_nearest_neighbor op.
* Layer is used to support TensorFlow's resize_nearest_neighbor and resize_bilinear ops.
*/
class CV_EXPORTS ResizeNearestNeighborLayer : public Layer
class CV_EXPORTS ResizeLayer : public Layer
{
public:
static Ptr<ResizeNearestNeighborLayer> create(const LayerParams& params);
static Ptr<ResizeLayer> create(const LayerParams& params);
};
class CV_EXPORTS ProposalLayer : public Layer

@ -66,16 +66,22 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
/**
* @brief Enum of computation backends supported by layers.
* @see Net::setPreferableBackend
*/
enum Backend
{
//! DNN_BACKEND_DEFAULT equals DNN_BACKEND_INFERENCE_ENGINE if
//! OpenCV is built with Intel's Inference Engine library, or
//! DNN_BACKEND_OPENCV otherwise.
DNN_BACKEND_DEFAULT,
DNN_BACKEND_HALIDE,
DNN_BACKEND_INFERENCE_ENGINE
DNN_BACKEND_INFERENCE_ENGINE,
DNN_BACKEND_OPENCV
};
/**
* @brief Enum of target devices for computations.
* @see Net::setPreferableTarget
*/
enum Target
{
@ -460,6 +466,9 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
* @brief Ask the network to use a specific computation backend where it is supported.
* @param[in] backendId backend identifier.
* @see Backend
*
* If OpenCV is compiled with Intel's Inference Engine library, DNN_BACKEND_DEFAULT
* means DNN_BACKEND_INFERENCE_ENGINE. Otherwise it equals DNN_BACKEND_OPENCV.
*/
CV_WRAP void setPreferableBackend(int backendId);
@ -467,6 +476,14 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
* @brief Ask the network to make computations on a specific target device.
* @param[in] targetId target identifier.
* @see Target
*
* List of supported backend / target combinations:
* | | DNN_BACKEND_OPENCV | DNN_BACKEND_INFERENCE_ENGINE | DNN_BACKEND_HALIDE |
* |------------------------|--------------------|------------------------------|--------------------|
* | DNN_TARGET_CPU | + | + | + |
* | DNN_TARGET_OPENCL | + | + | + |
* | DNN_TARGET_OPENCL_FP16 | + | + | |
* | DNN_TARGET_MYRIAD | | + | |
*/
CV_WRAP void setPreferableTarget(int targetId);
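For illustration only (this sketch is not part of the patch, and the model and image file names below are placeholders), selecting one of the supported combinations from the table above looks like this in the Python bindings:
@code{.py}
import cv2 as cv

# Placeholder model files; any reader from cv.dnn works the same way.
net = cv.dnn.readNetFromCaffe('deploy.prototxt', 'weights.caffemodel')

# One of the supported pairs from the table: the built-in OpenCV backend on an OpenCL device.
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
net.setPreferableTarget(cv.dnn.DNN_TARGET_OPENCL)

blob = cv.dnn.blobFromImage(cv.imread('input.jpg'), size=(224, 224))
net.setInput(blob)
out = net.forward()
@endcode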

@ -10,9 +10,11 @@
#include "opencv2/dnn/shape_utils.hpp"
#include "../test/test_common.hpp"
namespace opencv_test {
CV_ENUM(DNNBackend, DNN_BACKEND_DEFAULT, DNN_BACKEND_HALIDE, DNN_BACKEND_INFERENCE_ENGINE)
CV_ENUM(DNNBackend, DNN_BACKEND_DEFAULT, DNN_BACKEND_HALIDE, DNN_BACKEND_INFERENCE_ENGINE, DNN_BACKEND_OPENCV)
CV_ENUM(DNNTarget, DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16, DNN_TARGET_MYRIAD)
class DNNTestNetwork : public ::perf::TestBaseWithParam< tuple<DNNBackend, DNNTarget> >
@ -29,32 +31,10 @@ public:
target = (dnn::Target)(int)get<1>(GetParam());
}
static bool checkMyriadTarget()
{
#ifndef HAVE_INF_ENGINE
return false;
#endif
cv::dnn::Net net;
cv::dnn::LayerParams lp;
net.addLayerToPrev("testLayer", "Identity", lp);
net.setPreferableBackend(cv::dnn::DNN_BACKEND_INFERENCE_ENGINE);
net.setPreferableTarget(cv::dnn::DNN_TARGET_MYRIAD);
net.setInput(cv::Mat::zeros(1, 1, CV_32FC1));
try
{
net.forward();
}
catch(...)
{
return false;
}
return true;
}
void processNet(std::string weights, std::string proto, std::string halide_scheduler,
const Mat& input, const std::string& outputLayer = "")
{
if (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL)
if (backend == DNN_BACKEND_OPENCV && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16))
{
#if defined(HAVE_OPENCL)
if (!cv::ocl::useOpenCL())
@ -149,7 +129,7 @@ PERF_TEST_P_(DNNTestNetwork, Inception_5h)
PERF_TEST_P_(DNNTestNetwork, ENet)
{
if ((backend == DNN_BACKEND_INFERENCE_ENGINE) ||
(backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16))
(backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
throw SkipTestException("");
processNet("dnn/Enet-model-best.net", "", "enet.yml",
Mat(cv::Size(512, 256), CV_32FC3));
@ -164,7 +144,8 @@ PERF_TEST_P_(DNNTestNetwork, SSD)
PERF_TEST_P_(DNNTestNetwork, OpenFace)
{
if (backend == DNN_BACKEND_HALIDE ||
backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU)
(backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16) ||
(backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD))
throw SkipTestException("");
processNet("dnn/openface_nn4.small2.v1.t7", "", "",
Mat(cv::Size(96, 96), CV_32FC3));
@ -178,13 +159,19 @@ PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_Caffe)
Mat(cv::Size(300, 300), CV_32FC3));
}
// TODO: update MobileNet model.
PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_TensorFlow)
PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow)
{
if (backend == DNN_BACKEND_HALIDE ||
backend == DNN_BACKEND_INFERENCE_ENGINE)
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
processNet("dnn/ssd_mobilenet_v1_coco.pb", "ssd_mobilenet_v1_coco.pbtxt", "",
processNet("dnn/ssd_mobilenet_v1_coco_2017_11_17.pb", "ssd_mobilenet_v1_coco_2017_11_17.pbtxt", "",
Mat(cv::Size(300, 300), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
processNet("dnn/ssd_mobilenet_v2_coco_2018_03_29.pb", "ssd_mobilenet_v2_coco_2018_03_29.pbtxt", "",
Mat(cv::Size(300, 300), CV_32FC3));
}
@ -237,9 +224,7 @@ PERF_TEST_P_(DNNTestNetwork, opencv_face_detector)
PERF_TEST_P_(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
{
if (backend == DNN_BACKEND_HALIDE ||
(backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL) ||
(backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16))
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "ssd_inception_v2_coco_2017_11_17.pbtxt", "",
Mat(cv::Size(300, 300), CV_32FC3));
@ -256,6 +241,23 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv3)
processNet("dnn/yolov3.cfg", "dnn/yolov3.weights", "", inp / 255);
}
PERF_TEST_P_(DNNTestNetwork, EAST_text_detection)
{
if (backend == DNN_BACKEND_HALIDE ||
backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
throw SkipTestException("");
processNet("dnn/frozen_east_text_detection.pb", "", "", Mat(cv::Size(320, 320), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, FastNeuralStyle_eccv16)
{
if (backend == DNN_BACKEND_HALIDE ||
(backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16) ||
(backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD))
throw SkipTestException("");
processNet("dnn/fast_neural_style_eccv16_starry_night.t7", "", "", Mat(cv::Size(320, 240), CV_32FC3));
}
const tuple<DNNBackend, DNNTarget> testCases[] = {
#ifdef HAVE_HALIDE
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_HALIDE, DNN_TARGET_CPU),
@ -267,9 +269,9 @@ const tuple<DNNBackend, DNNTarget> testCases[] = {
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD),
#endif
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_CPU),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL_FP16)
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_OPENCV, DNN_TARGET_CPU),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL_FP16)
};
INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, testing::ValuesIn(testCases));

@ -395,9 +395,10 @@ namespace cv {
{
cv::dnn::LayerParams param;
param.name = "Upsample-name";
param.type = "ResizeNearestNeighbor";
param.type = "Resize";
param.set<int>("zoom_factor", scaleFactor);
param.set<String>("interpolation", "nearest");
darknet::LayerParameter lp;
std::string layer_name = cv::format("upsample_%d", layer_id);

@ -225,7 +225,7 @@ void imagesFromBlob(const cv::Mat& blob_, OutputArrayOfArrays images_)
class OpenCLBackendWrapper : public BackendWrapper
{
public:
OpenCLBackendWrapper(Mat& m) : BackendWrapper(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL)
OpenCLBackendWrapper(Mat& m) : BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL)
{
m.copyTo(umat);
host = &m;
@ -233,7 +233,7 @@ public:
}
OpenCLBackendWrapper(const Ptr<BackendWrapper>& baseBuffer, Mat& m)
: BackendWrapper(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL)
: BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL)
{
Ptr<OpenCLBackendWrapper> base = baseBuffer.dynamicCast<OpenCLBackendWrapper>();
CV_Assert(!base.empty());
@ -654,7 +654,7 @@ private:
static Ptr<BackendWrapper> wrapMat(int backendId, int targetId, cv::Mat& m)
{
if (backendId == DNN_BACKEND_DEFAULT)
if (backendId == DNN_BACKEND_OPENCV)
{
if (targetId == DNN_TARGET_CPU)
return Ptr<BackendWrapper>();
@ -727,7 +727,7 @@ struct Net::Impl
Ptr<BackendWrapper> wrap(Mat& host)
{
if (preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_CPU)
if (preferableBackend == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_CPU)
return Ptr<BackendWrapper>();
MatShape shape(host.dims);
@ -738,7 +738,7 @@ struct Net::Impl
if (backendWrappers.find(data) != backendWrappers.end())
{
Ptr<BackendWrapper> baseBuffer = backendWrappers[data];
if (preferableBackend == DNN_BACKEND_DEFAULT)
if (preferableBackend == DNN_BACKEND_OPENCV)
{
CV_Assert(IS_DNN_OPENCL_TARGET(preferableTarget));
return OpenCLBackendWrapper::create(baseBuffer, host);
@ -850,9 +850,27 @@ struct Net::Impl
{
CV_TRACE_FUNCTION();
if (preferableBackend == DNN_BACKEND_DEFAULT)
#ifdef HAVE_INF_ENGINE
preferableBackend = DNN_BACKEND_INFERENCE_ENGINE;
#else
preferableBackend = DNN_BACKEND_OPENCV;
#endif
CV_Assert(preferableBackend != DNN_BACKEND_OPENCV ||
preferableTarget == DNN_TARGET_CPU ||
preferableTarget == DNN_TARGET_OPENCL ||
preferableTarget == DNN_TARGET_OPENCL_FP16);
CV_Assert(preferableBackend != DNN_BACKEND_HALIDE ||
preferableTarget == DNN_TARGET_CPU ||
preferableTarget == DNN_TARGET_OPENCL);
CV_Assert(preferableBackend != DNN_BACKEND_INFERENCE_ENGINE ||
preferableTarget == DNN_TARGET_CPU ||
preferableTarget == DNN_TARGET_OPENCL ||
preferableTarget == DNN_TARGET_OPENCL_FP16 ||
preferableTarget == DNN_TARGET_MYRIAD);
if (!netWasAllocated || this->blobsToKeep != blobsToKeep_)
{
if (preferableBackend == DNN_BACKEND_DEFAULT && IS_DNN_OPENCL_TARGET(preferableTarget))
if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
#ifndef HAVE_OPENCL
{
CV_LOG_WARNING(NULL, "DNN: OpenCL target is not available in this OpenCV build, switching to CPU.");
@ -1036,7 +1054,7 @@ struct Net::Impl
void initBackend()
{
CV_TRACE_FUNCTION();
if (preferableBackend == DNN_BACKEND_DEFAULT)
if (preferableBackend == DNN_BACKEND_OPENCV)
CV_Assert(preferableTarget == DNN_TARGET_CPU || IS_DNN_OPENCL_TARGET(preferableTarget));
else if (preferableBackend == DNN_BACKEND_HALIDE)
initHalideBackend();
@ -1375,7 +1393,7 @@ struct Net::Impl
std::vector<LayerPin> pinsForInternalBlobs;
blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs,
preferableBackend == DNN_BACKEND_INFERENCE_ENGINE,
preferableBackend == DNN_BACKEND_DEFAULT &&
preferableBackend == DNN_BACKEND_OPENCV &&
preferableTarget == DNN_TARGET_OPENCL_FP16);
ld.outputBlobsWrappers.resize(ld.outputBlobs.size());
for (int i = 0; i < ld.outputBlobs.size(); ++i)
@ -1418,7 +1436,7 @@ struct Net::Impl
void fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
{
if( !fusion || preferableBackend != DNN_BACKEND_DEFAULT &&
if( !fusion || preferableBackend != DNN_BACKEND_OPENCV &&
preferableBackend != DNN_BACKEND_INFERENCE_ENGINE)
return;
@ -1446,7 +1464,7 @@ struct Net::Impl
// some other layers.
// TODO: OpenCL target support more fusion styles.
if ( preferableBackend == DNN_BACKEND_DEFAULT && IS_DNN_OPENCL_TARGET(preferableTarget) &&
if ( preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget) &&
(!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" &&
ld.layerInstance->type != "MVN")) )
continue;
@ -1481,7 +1499,7 @@ struct Net::Impl
break;
}
if (preferableBackend != DNN_BACKEND_DEFAULT)
if (preferableBackend != DNN_BACKEND_OPENCV)
continue; // Go to the next layer.
// For now, OpenCL target support fusion with activation of ReLU/ChannelsPReLU/Power/Tanh
@ -1624,7 +1642,7 @@ struct Net::Impl
}
}
if (preferableBackend != DNN_BACKEND_DEFAULT)
if (preferableBackend != DNN_BACKEND_OPENCV)
continue; // Go to the next layer.
// the optimization #2. if there is no layer that takes max pooling layer's computed
@ -1735,7 +1753,7 @@ struct Net::Impl
{
CV_Assert(layers[0].outputBlobs[i].total());
if (layers[0].outputBlobs[i].depth() == CV_32F &&
preferableBackend == DNN_BACKEND_DEFAULT &&
preferableBackend == DNN_BACKEND_OPENCV &&
preferableTarget == DNN_TARGET_OPENCL_FP16)
{
Mat mat = layers[0].outputBlobs[i].clone();
@ -1781,12 +1799,12 @@ struct Net::Impl
TickMeter tm;
tm.start();
if (preferableBackend == DNN_BACKEND_DEFAULT ||
if (preferableBackend == DNN_BACKEND_OPENCV ||
!layer->supportBackend(preferableBackend))
{
if( !ld.skip )
{
if (preferableBackend == DNN_BACKEND_DEFAULT && IS_DNN_OPENCL_TARGET(preferableTarget))
if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
{
std::vector<UMat> umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
layer->forward(OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers),
@ -2132,7 +2150,7 @@ void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName)
{
std::vector<UMat> & outputvec = *(std::vector<UMat> *)outputBlobs.getObj();
if (impl->preferableBackend == DNN_BACKEND_DEFAULT &&
if (impl->preferableBackend == DNN_BACKEND_OPENCV &&
IS_DNN_OPENCL_TARGET(impl->preferableTarget))
{
if (impl->preferableTarget == DNN_TARGET_OPENCL)
@ -2234,7 +2252,13 @@ void Net::setPreferableTarget(int targetId)
if (IS_DNN_OPENCL_TARGET(targetId))
{
#ifndef HAVE_OPENCL
impl->preferableTarget = DNN_TARGET_CPU;
#ifdef HAVE_INF_ENGINE
if (impl->preferableBackend == DNN_BACKEND_OPENCV)
#else
if (impl->preferableBackend == DNN_BACKEND_DEFAULT ||
impl->preferableBackend == DNN_BACKEND_OPENCV)
#endif // HAVE_INF_ENGINE
impl->preferableTarget = DNN_TARGET_CPU;
#else
bool fp16 = ocl::Device::getDefault().isExtensionSupported("cl_khr_fp16");
if (!fp16 && targetId == DNN_TARGET_OPENCL_FP16)
@ -2270,7 +2294,7 @@ void Net::setInput(InputArray blob, const String& name)
ld.outputBlobsWrappers.resize(ld.outputBlobs.size());
MatShape prevShape = shape(ld.outputBlobs[pin.oid]);
Mat blob_;
if (impl->preferableBackend == DNN_BACKEND_DEFAULT &&
if (impl->preferableBackend == DNN_BACKEND_OPENCV &&
impl->preferableTarget == DNN_TARGET_OPENCL_FP16)
{
Mat blob_mat = blob.getMat();
@ -2664,7 +2688,7 @@ int Layer::outputNameToIndex(const String&)
bool Layer::supportBackend(int backendId)
{
return backendId == DNN_BACKEND_DEFAULT;
return backendId == DNN_BACKEND_OPENCV;
}
Ptr<BackendNode> Layer::initHalide(const std::vector<Ptr<BackendWrapper> > &)

@ -83,7 +83,7 @@ void initializeLayerFactory()
CV_DNN_REGISTER_LAYER_CLASS(Concat, ConcatLayer);
CV_DNN_REGISTER_LAYER_CLASS(Reshape, ReshapeLayer);
CV_DNN_REGISTER_LAYER_CLASS(Flatten, FlattenLayer);
CV_DNN_REGISTER_LAYER_CLASS(ResizeNearestNeighbor, ResizeNearestNeighborLayer);
CV_DNN_REGISTER_LAYER_CLASS(Resize, ResizeLayer);
CV_DNN_REGISTER_LAYER_CLASS(CropAndResize, CropAndResizeLayer);
CV_DNN_REGISTER_LAYER_CLASS(Convolution, ConvolutionLayer);

@ -96,6 +96,46 @@ public:
shift = bias_;
}
virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
{
Mat w, b;
top->getScaleShift(w, b);
if (w.empty() && b.empty())
return false;
const int numChannels = weights_.total();
const int numFusedWeights = w.total();
const int numFusedBias = b.total();
if ((numFusedWeights != numChannels && numFusedWeights != 1 && !w.empty()) ||
(numFusedBias != numChannels && numFusedBias != 1 && !b.empty()))
return false;
if (!w.empty())
{
w = w.reshape(1, 1);
if (numFusedWeights == 1)
{
multiply(weights_, w.at<float>(0), weights_);
multiply(bias_, w.at<float>(0), bias_);
}
else
{
multiply(weights_, w, weights_);
multiply(bias_, w, bias_);
}
}
if (!b.empty())
{
b = b.reshape(1, 1);
if (numFusedBias == 1)
add(bias_, b.at<float>(0), bias_);
else
add(bias_, b.reshape(1, 1), bias_);
}
return true;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
@ -109,7 +149,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_HALIDE && haveHalide() ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine();
}

@ -56,7 +56,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine();
}

@ -103,7 +103,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1 && !padding || // By channels
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && !padding;
}

@ -81,9 +81,10 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
backendId == DNN_BACKEND_HALIDE && haveHalide() ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine();
if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
return preferableTarget != DNN_TARGET_MYRIAD || type != "Deconvolution" || adjustPad == Size();
else
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE;
}
void finalize(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs) CV_OVERRIDE
@ -737,8 +738,9 @@ public:
if( relu )
{
r0 = relu[i];
r1 = relu[i+1];
r0 = relu[i]; r1 = relu[i+1];
if( i+1 >= outCn )
r1 = r0;
}
int j = 0;
@ -1568,6 +1570,39 @@ public:
return Ptr<BackendNode>();
}
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> > &) CV_OVERRIDE
{
#ifdef HAVE_INF_ENGINE
const int outGroupCn = blobs[0].size[1]; // Weights are in IOHW layout
const int group = numOutput / outGroupCn;
InferenceEngine::LayerParams lp;
lp.name = name;
lp.type = "Deconvolution";
lp.precision = InferenceEngine::Precision::FP32;
std::shared_ptr<InferenceEngine::DeconvolutionLayer> ieLayer(new InferenceEngine::DeconvolutionLayer(lp));
ieLayer->_kernel_x = kernel.width;
ieLayer->_kernel_y = kernel.height;
ieLayer->_stride_x = stride.width;
ieLayer->_stride_y = stride.height;
ieLayer->_out_depth = numOutput;
ieLayer->_padding_x = pad.width;
ieLayer->_padding_y = pad.height;
ieLayer->_dilation_x = dilation.width;
ieLayer->_dilation_y = dilation.height;
ieLayer->_group = group;
ieLayer->_weights = wrapToInfEngineBlob(blobs[0], InferenceEngine::Layout::OIHW);
if (hasBias())
{
ieLayer->_biases = wrapToInfEngineBlob(blobs[1], {(size_t)numOutput}, InferenceEngine::Layout::C);
}
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
#endif // HAVE_INF_ENGINE
return Ptr<BackendNode>();
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{

@ -68,7 +68,7 @@ public:
{
float input_y = top * (inpHeight - 1) + y * heightScale;
int y0 = static_cast<int>(input_y);
const float* inpData_row0 = (float*)inp.data + y0 * inpWidth;
const float* inpData_row0 = inp.ptr<float>(0, 0, y0);
const float* inpData_row1 = (y0 + 1 < inpHeight) ? (inpData_row0 + inpWidth) : inpData_row0;
for (int x = 0; x < outWidth; ++x)
{

@ -195,7 +195,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && !_locPredTransposed;
}

@ -115,9 +115,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
backendId == DNN_BACKEND_HALIDE && haveHalide() ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine();
return func.supportBackend(backendId, this->preferableTarget);
}
virtual Ptr<BackendNode> tryAttach(const Ptr<BackendNode>& node) CV_OVERRIDE
@ -238,6 +236,12 @@ struct ReLUFunctor
explicit ReLUFunctor(float slope_=1.f) : slope(slope_) {}
bool supportBackend(int backendId, int)
{
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE ||
backendId == DNN_BACKEND_INFERENCE_ENGINE;
}
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
float s = slope;
@ -353,6 +357,12 @@ struct ReLU6Functor
CV_Assert(minValue <= maxValue);
}
bool supportBackend(int backendId, int)
{
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE ||
backendId == DNN_BACKEND_INFERENCE_ENGINE;
}
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
@ -445,6 +455,12 @@ struct TanHFunctor
{
typedef TanHLayer Layer;
bool supportBackend(int backendId, int)
{
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE ||
backendId == DNN_BACKEND_INFERENCE_ENGINE;
}
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
@ -496,8 +512,9 @@ struct TanHFunctor
#ifdef HAVE_INF_ENGINE
InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp)
{
CV_Error(Error::StsNotImplemented, "TanH");
return InferenceEngine::CNNLayerPtr();
lp.type = "TanH";
std::shared_ptr<InferenceEngine::CNNLayer> ieLayer(new InferenceEngine::CNNLayer(lp));
return ieLayer;
}
#endif // HAVE_INF_ENGINE
@ -508,6 +525,12 @@ struct SigmoidFunctor
{
typedef SigmoidLayer Layer;
bool supportBackend(int backendId, int)
{
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE ||
backendId == DNN_BACKEND_INFERENCE_ENGINE;
}
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
@ -574,6 +597,11 @@ struct ELUFunctor
explicit ELUFunctor() {}
bool supportBackend(int backendId, int)
{
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE;
}
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
@ -637,6 +665,11 @@ struct AbsValFunctor
{
typedef AbsLayer Layer;
bool supportBackend(int backendId, int)
{
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE;
}
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
@ -700,6 +733,11 @@ struct BNLLFunctor
{
typedef BNLLLayer Layer;
bool supportBackend(int backendId, int)
{
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE;
}
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
@ -750,6 +788,14 @@ struct PowerFunctor
explicit PowerFunctor(float power_ = 1.f, float scale_ = 1.f, float shift_ = 0.f)
: power(power_), scale(scale_), shift(shift_) {}
bool supportBackend(int backendId, int targetId)
{
if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
return (targetId != DNN_TARGET_OPENCL && targetId != DNN_TARGET_OPENCL_FP16) || power == 1.0;
else
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE;
}
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
float a = scale, b = shift, p = power;
@ -852,6 +898,11 @@ struct ChannelsPReLUFunctor
scale_umat = scale.getUMat(ACCESS_READ);
}
bool supportBackend(int backendId, int)
{
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE;
}
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
CV_Assert(scale.isContinuous() && scale.type() == CV_32F);

@ -96,7 +96,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_HALIDE && haveHalide() ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine();
}

@ -64,7 +64,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine();
}

@ -128,7 +128,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1 ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && axis == 1;
}

@ -101,9 +101,13 @@ void fastConv( const float* weights, size_t wstep, const float* bias,
if( relu )
{
r0 = relu[i];
r1 = relu[i+1];
r2 = relu[i+2];
r0 = relu[i]; r1 = relu[i+1]; r2 = relu[i+2];
if( i+2 >= outCn )
{
r2 = r1;
if( i+1 >= outCn )
r2 = r1 = r0;
}
vr0 = _mm_set1_ps(r0);
vr1 = _mm_set1_ps(r1);
vr2 = _mm_set1_ps(r2);

@ -90,7 +90,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_HALIDE && haveHalide() ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine();
}

@ -34,7 +34,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_HALIDE && haveHalide() &&
!poolPad.width && !poolPad.height;
}

@ -63,7 +63,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() &&
pnorm == 2 && !blobs.empty();
}

Some files were not shown because too many files have changed in this diff.
