mirror of https://github.com/opencv/opencv.git
Merge pull request #24274 from vrabaud:webp_1.3.2
Merge pull request #24274 from vrabaud:webp_1.3.2
Bump libwebp to 1.3.2 #24274
This is version [c1ffd9a](c1ffd9ac75
)
It is 1.3.2 with a few patches that were made right after to help compilation.
No need for patches on the OpenCV side!
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
pull/24279/head
parent
5e9191558d
commit
687fc11626
115 changed files with 5354 additions and 2784 deletions
@ -1,22 +0,0 @@ |
||||
diff --git a/3rdparty/libwebp/src/dsp/msa_macro.h b/3rdparty/libwebp/src/dsp/msa_macro.h
|
||||
index de026a1d9e..a16c0bb300 100644
|
||||
--- a/3rdparty/libwebp/src/dsp/msa_macro.h
|
||||
+++ b/3rdparty/libwebp/src/dsp/msa_macro.h
|
||||
@@ -73,7 +73,7 @@
|
||||
static inline TYPE FUNC_NAME(const void* const psrc) { \
|
||||
const uint8_t* const psrc_m = (const uint8_t*)psrc; \
|
||||
TYPE val_m; \
|
||||
- asm volatile ( \
|
||||
+ __asm__ volatile ( \
|
||||
"" #INSTR " %[val_m], %[psrc_m] \n\t" \
|
||||
: [val_m] "=r" (val_m) \
|
||||
: [psrc_m] "m" (*psrc_m)); \
|
||||
@@ -86,7 +86,7 @@
|
||||
static inline void FUNC_NAME(TYPE val, void* const pdst) { \
|
||||
uint8_t* const pdst_m = (uint8_t*)pdst; \
|
||||
TYPE val_m = val; \
|
||||
- asm volatile ( \
|
||||
+ __asm__ volatile ( \
|
||||
" " #INSTR " %[val_m], %[pdst_m] \n\t" \
|
||||
: [pdst_m] "=m" (*pdst_m) \
|
||||
: [val_m] "r" (val_m)); \
|
@ -0,0 +1,565 @@ |
||||
// Copyright 2022 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style license
|
||||
// that can be found in the COPYING file in the root of the source
|
||||
// tree. An additional intellectual property rights grant can be found
|
||||
// in the file PATENTS. All contributing project authors may
|
||||
// be found in the AUTHORS file in the root of the source tree.
|
||||
// -----------------------------------------------------------------------------
|
||||
//
|
||||
// Sharp RGB to YUV conversion.
|
||||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "sharpyuv/sharpyuv.h" |
||||
|
||||
#include <assert.h> |
||||
#include <limits.h> |
||||
#include <stddef.h> |
||||
#include <stdlib.h> |
||||
#include <string.h> |
||||
|
||||
#include "src/webp/types.h" |
||||
#include "sharpyuv/sharpyuv_cpu.h" |
||||
#include "sharpyuv/sharpyuv_dsp.h" |
||||
#include "sharpyuv/sharpyuv_gamma.h" |
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
int SharpYuvGetVersion(void) { |
||||
return SHARPYUV_VERSION; |
||||
} |
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Sharp RGB->YUV conversion
|
||||
|
||||
static const int kNumIterations = 4; |
||||
|
||||
#define YUV_FIX 16 // fixed-point precision for RGB->YUV
|
||||
static const int kYuvHalf = 1 << (YUV_FIX - 1); |
||||
|
||||
// Max bit depth so that intermediate calculations fit in 16 bits.
|
||||
static const int kMaxBitDepth = 14; |
||||
|
||||
// Returns the precision shift to use based on the input rgb_bit_depth.
|
||||
static int GetPrecisionShift(int rgb_bit_depth) { |
||||
// Try to add 2 bits of precision if it fits in kMaxBitDepth. Otherwise remove
|
||||
// bits if needed.
|
||||
return ((rgb_bit_depth + 2) <= kMaxBitDepth) ? 2 |
||||
: (kMaxBitDepth - rgb_bit_depth); |
||||
} |
||||
|
||||
typedef int16_t fixed_t; // signed type with extra precision for UV
|
||||
typedef uint16_t fixed_y_t; // unsigned type with extra precision for W
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static uint8_t clip_8b(fixed_t v) { |
||||
return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u; |
||||
} |
||||
|
||||
static uint16_t clip(fixed_t v, int max) { |
||||
return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v; |
||||
} |
||||
|
||||
static fixed_y_t clip_bit_depth(int y, int bit_depth) { |
||||
const int max = (1 << bit_depth) - 1; |
||||
return (!(y & ~max)) ? (fixed_y_t)y : (y < 0) ? 0 : max; |
||||
} |
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static int RGBToGray(int64_t r, int64_t g, int64_t b) { |
||||
const int64_t luma = 13933 * r + 46871 * g + 4732 * b + kYuvHalf; |
||||
return (int)(luma >> YUV_FIX); |
||||
} |
||||
|
||||
static uint32_t ScaleDown(uint16_t a, uint16_t b, uint16_t c, uint16_t d, |
||||
int rgb_bit_depth, |
||||
SharpYuvTransferFunctionType transfer_type) { |
||||
const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth); |
||||
const uint32_t A = SharpYuvGammaToLinear(a, bit_depth, transfer_type); |
||||
const uint32_t B = SharpYuvGammaToLinear(b, bit_depth, transfer_type); |
||||
const uint32_t C = SharpYuvGammaToLinear(c, bit_depth, transfer_type); |
||||
const uint32_t D = SharpYuvGammaToLinear(d, bit_depth, transfer_type); |
||||
return SharpYuvLinearToGamma((A + B + C + D + 2) >> 2, bit_depth, |
||||
transfer_type); |
||||
} |
||||
|
||||
static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w, |
||||
int rgb_bit_depth, |
||||
SharpYuvTransferFunctionType transfer_type) { |
||||
const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth); |
||||
int i; |
||||
for (i = 0; i < w; ++i) { |
||||
const uint32_t R = |
||||
SharpYuvGammaToLinear(src[0 * w + i], bit_depth, transfer_type); |
||||
const uint32_t G = |
||||
SharpYuvGammaToLinear(src[1 * w + i], bit_depth, transfer_type); |
||||
const uint32_t B = |
||||
SharpYuvGammaToLinear(src[2 * w + i], bit_depth, transfer_type); |
||||
const uint32_t Y = RGBToGray(R, G, B); |
||||
dst[i] = (fixed_y_t)SharpYuvLinearToGamma(Y, bit_depth, transfer_type); |
||||
} |
||||
} |
||||
|
||||
static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2, |
||||
fixed_t* dst, int uv_w, int rgb_bit_depth, |
||||
SharpYuvTransferFunctionType transfer_type) { |
||||
int i; |
||||
for (i = 0; i < uv_w; ++i) { |
||||
const int r = |
||||
ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1], src2[0 * uv_w + 0], |
||||
src2[0 * uv_w + 1], rgb_bit_depth, transfer_type); |
||||
const int g = |
||||
ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1], src2[2 * uv_w + 0], |
||||
src2[2 * uv_w + 1], rgb_bit_depth, transfer_type); |
||||
const int b = |
||||
ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1], src2[4 * uv_w + 0], |
||||
src2[4 * uv_w + 1], rgb_bit_depth, transfer_type); |
||||
const int W = RGBToGray(r, g, b); |
||||
dst[0 * uv_w] = (fixed_t)(r - W); |
||||
dst[1 * uv_w] = (fixed_t)(g - W); |
||||
dst[2 * uv_w] = (fixed_t)(b - W); |
||||
dst += 1; |
||||
src1 += 2; |
||||
src2 += 2; |
||||
} |
||||
} |
||||
|
||||
static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) { |
||||
int i; |
||||
assert(w > 0); |
||||
for (i = 0; i < w; ++i) { |
||||
y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]); |
||||
} |
||||
} |
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0, int bit_depth) { |
||||
const int v0 = (A * 3 + B + 2) >> 2; |
||||
return clip_bit_depth(v0 + W0, bit_depth); |
||||
} |
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static WEBP_INLINE int Shift(int v, int shift) { |
||||
return (shift >= 0) ? (v << shift) : (v >> -shift); |
||||
} |
||||
|
||||
static void ImportOneRow(const uint8_t* const r_ptr, |
||||
const uint8_t* const g_ptr, |
||||
const uint8_t* const b_ptr, |
||||
int rgb_step, |
||||
int rgb_bit_depth, |
||||
int pic_width, |
||||
fixed_y_t* const dst) { |
||||
// Convert the rgb_step from a number of bytes to a number of uint8_t or
|
||||
// uint16_t values depending the bit depth.
|
||||
const int step = (rgb_bit_depth > 8) ? rgb_step / 2 : rgb_step; |
||||
int i; |
||||
const int w = (pic_width + 1) & ~1; |
||||
for (i = 0; i < pic_width; ++i) { |
||||
const int off = i * step; |
||||
const int shift = GetPrecisionShift(rgb_bit_depth); |
||||
if (rgb_bit_depth == 8) { |
||||
dst[i + 0 * w] = Shift(r_ptr[off], shift); |
||||
dst[i + 1 * w] = Shift(g_ptr[off], shift); |
||||
dst[i + 2 * w] = Shift(b_ptr[off], shift); |
||||
} else { |
||||
dst[i + 0 * w] = Shift(((uint16_t*)r_ptr)[off], shift); |
||||
dst[i + 1 * w] = Shift(((uint16_t*)g_ptr)[off], shift); |
||||
dst[i + 2 * w] = Shift(((uint16_t*)b_ptr)[off], shift); |
||||
} |
||||
} |
||||
if (pic_width & 1) { // replicate rightmost pixel
|
||||
dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1]; |
||||
dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1]; |
||||
dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1]; |
||||
} |
||||
} |
||||
|
||||
static void InterpolateTwoRows(const fixed_y_t* const best_y, |
||||
const fixed_t* prev_uv, |
||||
const fixed_t* cur_uv, |
||||
const fixed_t* next_uv, |
||||
int w, |
||||
fixed_y_t* out1, |
||||
fixed_y_t* out2, |
||||
int rgb_bit_depth) { |
||||
const int uv_w = w >> 1; |
||||
const int len = (w - 1) >> 1; // length to filter
|
||||
int k = 3; |
||||
const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth); |
||||
while (k-- > 0) { // process each R/G/B segments in turn
|
||||
// special boundary case for i==0
|
||||
out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0], bit_depth); |
||||
out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w], bit_depth); |
||||
|
||||
SharpYuvFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1, |
||||
bit_depth); |
||||
SharpYuvFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1, |
||||
bit_depth); |
||||
|
||||
// special boundary case for i == w - 1 when w is even
|
||||
if (!(w & 1)) { |
||||
out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1], |
||||
best_y[w - 1 + 0], bit_depth); |
||||
out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1], |
||||
best_y[w - 1 + w], bit_depth); |
||||
} |
||||
out1 += w; |
||||
out2 += w; |
||||
prev_uv += uv_w; |
||||
cur_uv += uv_w; |
||||
next_uv += uv_w; |
||||
} |
||||
} |
||||
|
||||
static WEBP_INLINE int RGBToYUVComponent(int r, int g, int b, |
||||
const int coeffs[4], int sfix) { |
||||
const int srounder = 1 << (YUV_FIX + sfix - 1); |
||||
const int luma = coeffs[0] * r + coeffs[1] * g + coeffs[2] * b + |
||||
coeffs[3] + srounder; |
||||
return (luma >> (YUV_FIX + sfix)); |
||||
} |
||||
|
||||
static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv, |
||||
uint8_t* y_ptr, int y_stride, uint8_t* u_ptr, |
||||
int u_stride, uint8_t* v_ptr, int v_stride, |
||||
int rgb_bit_depth, |
||||
int yuv_bit_depth, int width, int height, |
||||
const SharpYuvConversionMatrix* yuv_matrix) { |
||||
int i, j; |
||||
const fixed_t* const best_uv_base = best_uv; |
||||
const int w = (width + 1) & ~1; |
||||
const int h = (height + 1) & ~1; |
||||
const int uv_w = w >> 1; |
||||
const int uv_h = h >> 1; |
||||
const int sfix = GetPrecisionShift(rgb_bit_depth); |
||||
const int yuv_max = (1 << yuv_bit_depth) - 1; |
||||
|
||||
for (best_uv = best_uv_base, j = 0; j < height; ++j) { |
||||
for (i = 0; i < width; ++i) { |
||||
const int off = (i >> 1); |
||||
const int W = best_y[i]; |
||||
const int r = best_uv[off + 0 * uv_w] + W; |
||||
const int g = best_uv[off + 1 * uv_w] + W; |
||||
const int b = best_uv[off + 2 * uv_w] + W; |
||||
const int y = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_y, sfix); |
||||
if (yuv_bit_depth <= 8) { |
||||
y_ptr[i] = clip_8b(y); |
||||
} else { |
||||
((uint16_t*)y_ptr)[i] = clip(y, yuv_max); |
||||
} |
||||
} |
||||
best_y += w; |
||||
best_uv += (j & 1) * 3 * uv_w; |
||||
y_ptr += y_stride; |
||||
} |
||||
for (best_uv = best_uv_base, j = 0; j < uv_h; ++j) { |
||||
for (i = 0; i < uv_w; ++i) { |
||||
const int off = i; |
||||
// Note r, g and b values here are off by W, but a constant offset on all
|
||||
// 3 components doesn't change the value of u and v with a YCbCr matrix.
|
||||
const int r = best_uv[off + 0 * uv_w]; |
||||
const int g = best_uv[off + 1 * uv_w]; |
||||
const int b = best_uv[off + 2 * uv_w]; |
||||
const int u = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_u, sfix); |
||||
const int v = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_v, sfix); |
||||
if (yuv_bit_depth <= 8) { |
||||
u_ptr[i] = clip_8b(u); |
||||
v_ptr[i] = clip_8b(v); |
||||
} else { |
||||
((uint16_t*)u_ptr)[i] = clip(u, yuv_max); |
||||
((uint16_t*)v_ptr)[i] = clip(v, yuv_max); |
||||
} |
||||
} |
||||
best_uv += 3 * uv_w; |
||||
u_ptr += u_stride; |
||||
v_ptr += v_stride; |
||||
} |
||||
return 1; |
||||
} |
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Main function
|
||||
|
||||
static void* SafeMalloc(uint64_t nmemb, size_t size) { |
||||
const uint64_t total_size = nmemb * (uint64_t)size; |
||||
if (total_size != (size_t)total_size) return NULL; |
||||
return malloc((size_t)total_size); |
||||
} |
||||
|
||||
#define SAFE_ALLOC(W, H, T) ((T*)SafeMalloc((W) * (H), sizeof(T))) |
||||
|
||||
static int DoSharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr, |
||||
const uint8_t* b_ptr, int rgb_step, int rgb_stride, |
||||
int rgb_bit_depth, uint8_t* y_ptr, int y_stride, |
||||
uint8_t* u_ptr, int u_stride, uint8_t* v_ptr, |
||||
int v_stride, int yuv_bit_depth, int width, |
||||
int height, |
||||
const SharpYuvConversionMatrix* yuv_matrix, |
||||
SharpYuvTransferFunctionType transfer_type) { |
||||
// we expand the right/bottom border if needed
|
||||
const int w = (width + 1) & ~1; |
||||
const int h = (height + 1) & ~1; |
||||
const int uv_w = w >> 1; |
||||
const int uv_h = h >> 1; |
||||
uint64_t prev_diff_y_sum = ~0; |
||||
int j, iter; |
||||
|
||||
// TODO(skal): allocate one big memory chunk. But for now, it's easier
|
||||
// for valgrind debugging to have several chunks.
|
||||
fixed_y_t* const tmp_buffer = SAFE_ALLOC(w * 3, 2, fixed_y_t); // scratch
|
||||
fixed_y_t* const best_y_base = SAFE_ALLOC(w, h, fixed_y_t); |
||||
fixed_y_t* const target_y_base = SAFE_ALLOC(w, h, fixed_y_t); |
||||
fixed_y_t* const best_rgb_y = SAFE_ALLOC(w, 2, fixed_y_t); |
||||
fixed_t* const best_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t); |
||||
fixed_t* const target_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t); |
||||
fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t); |
||||
fixed_y_t* best_y = best_y_base; |
||||
fixed_y_t* target_y = target_y_base; |
||||
fixed_t* best_uv = best_uv_base; |
||||
fixed_t* target_uv = target_uv_base; |
||||
const uint64_t diff_y_threshold = (uint64_t)(3.0 * w * h); |
||||
int ok; |
||||
assert(w > 0); |
||||
assert(h > 0); |
||||
|
||||
if (best_y_base == NULL || best_uv_base == NULL || |
||||
target_y_base == NULL || target_uv_base == NULL || |
||||
best_rgb_y == NULL || best_rgb_uv == NULL || |
||||
tmp_buffer == NULL) { |
||||
ok = 0; |
||||
goto End; |
||||
} |
||||
|
||||
// Import RGB samples to W/RGB representation.
|
||||
for (j = 0; j < height; j += 2) { |
||||
const int is_last_row = (j == height - 1); |
||||
fixed_y_t* const src1 = tmp_buffer + 0 * w; |
||||
fixed_y_t* const src2 = tmp_buffer + 3 * w; |
||||
|
||||
// prepare two rows of input
|
||||
ImportOneRow(r_ptr, g_ptr, b_ptr, rgb_step, rgb_bit_depth, width, |
||||
src1); |
||||
if (!is_last_row) { |
||||
ImportOneRow(r_ptr + rgb_stride, g_ptr + rgb_stride, b_ptr + rgb_stride, |
||||
rgb_step, rgb_bit_depth, width, src2); |
||||
} else { |
||||
memcpy(src2, src1, 3 * w * sizeof(*src2)); |
||||
} |
||||
StoreGray(src1, best_y + 0, w); |
||||
StoreGray(src2, best_y + w, w); |
||||
|
||||
UpdateW(src1, target_y, w, rgb_bit_depth, transfer_type); |
||||
UpdateW(src2, target_y + w, w, rgb_bit_depth, transfer_type); |
||||
UpdateChroma(src1, src2, target_uv, uv_w, rgb_bit_depth, transfer_type); |
||||
memcpy(best_uv, target_uv, 3 * uv_w * sizeof(*best_uv)); |
||||
best_y += 2 * w; |
||||
best_uv += 3 * uv_w; |
||||
target_y += 2 * w; |
||||
target_uv += 3 * uv_w; |
||||
r_ptr += 2 * rgb_stride; |
||||
g_ptr += 2 * rgb_stride; |
||||
b_ptr += 2 * rgb_stride; |
||||
} |
||||
|
||||
// Iterate and resolve clipping conflicts.
|
||||
for (iter = 0; iter < kNumIterations; ++iter) { |
||||
const fixed_t* cur_uv = best_uv_base; |
||||
const fixed_t* prev_uv = best_uv_base; |
||||
uint64_t diff_y_sum = 0; |
||||
|
||||
best_y = best_y_base; |
||||
best_uv = best_uv_base; |
||||
target_y = target_y_base; |
||||
target_uv = target_uv_base; |
||||
for (j = 0; j < h; j += 2) { |
||||
fixed_y_t* const src1 = tmp_buffer + 0 * w; |
||||
fixed_y_t* const src2 = tmp_buffer + 3 * w; |
||||
{ |
||||
const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0); |
||||
InterpolateTwoRows(best_y, prev_uv, cur_uv, next_uv, w, |
||||
src1, src2, rgb_bit_depth); |
||||
prev_uv = cur_uv; |
||||
cur_uv = next_uv; |
||||
} |
||||
|
||||
UpdateW(src1, best_rgb_y + 0 * w, w, rgb_bit_depth, transfer_type); |
||||
UpdateW(src2, best_rgb_y + 1 * w, w, rgb_bit_depth, transfer_type); |
||||
UpdateChroma(src1, src2, best_rgb_uv, uv_w, rgb_bit_depth, transfer_type); |
||||
|
||||
// update two rows of Y and one row of RGB
|
||||
diff_y_sum += |
||||
SharpYuvUpdateY(target_y, best_rgb_y, best_y, 2 * w, |
||||
rgb_bit_depth + GetPrecisionShift(rgb_bit_depth)); |
||||
SharpYuvUpdateRGB(target_uv, best_rgb_uv, best_uv, 3 * uv_w); |
||||
|
||||
best_y += 2 * w; |
||||
best_uv += 3 * uv_w; |
||||
target_y += 2 * w; |
||||
target_uv += 3 * uv_w; |
||||
} |
||||
// test exit condition
|
||||
if (iter > 0) { |
||||
if (diff_y_sum < diff_y_threshold) break; |
||||
if (diff_y_sum > prev_diff_y_sum) break; |
||||
} |
||||
prev_diff_y_sum = diff_y_sum; |
||||
} |
||||
|
||||
// final reconstruction
|
||||
ok = ConvertWRGBToYUV(best_y_base, best_uv_base, y_ptr, y_stride, u_ptr, |
||||
u_stride, v_ptr, v_stride, rgb_bit_depth, yuv_bit_depth, |
||||
width, height, yuv_matrix); |
||||
|
||||
End: |
||||
free(best_y_base); |
||||
free(best_uv_base); |
||||
free(target_y_base); |
||||
free(target_uv_base); |
||||
free(best_rgb_y); |
||||
free(best_rgb_uv); |
||||
free(tmp_buffer); |
||||
return ok; |
||||
} |
||||
#undef SAFE_ALLOC |
||||
|
||||
#if defined(WEBP_USE_THREAD) && !defined(_WIN32) |
||||
#include <pthread.h> // NOLINT |
||||
|
||||
#define LOCK_ACCESS \ |
||||
static pthread_mutex_t sharpyuv_lock = PTHREAD_MUTEX_INITIALIZER; \
|
||||
if (pthread_mutex_lock(&sharpyuv_lock)) return |
||||
#define UNLOCK_ACCESS_AND_RETURN \ |
||||
do { \
|
||||
(void)pthread_mutex_unlock(&sharpyuv_lock); \
|
||||
return; \
|
||||
} while (0) |
||||
#else // !(defined(WEBP_USE_THREAD) && !defined(_WIN32))
|
||||
#define LOCK_ACCESS do {} while (0) |
||||
#define UNLOCK_ACCESS_AND_RETURN return |
||||
#endif // defined(WEBP_USE_THREAD) && !defined(_WIN32)
|
||||
|
||||
// Hidden exported init function.
|
||||
// By default SharpYuvConvert calls it with SharpYuvGetCPUInfo. If needed,
|
||||
// users can declare it as extern and call it with an alternate VP8CPUInfo
|
||||
// function.
|
||||
extern VP8CPUInfo SharpYuvGetCPUInfo; |
||||
SHARPYUV_EXTERN void SharpYuvInit(VP8CPUInfo cpu_info_func); |
||||
void SharpYuvInit(VP8CPUInfo cpu_info_func) { |
||||
static volatile VP8CPUInfo sharpyuv_last_cpuinfo_used = |
||||
(VP8CPUInfo)&sharpyuv_last_cpuinfo_used; |
||||
LOCK_ACCESS; |
||||
// Only update SharpYuvGetCPUInfo when called from external code to avoid a
|
||||
// race on reading the value in SharpYuvConvert().
|
||||
if (cpu_info_func != (VP8CPUInfo)&SharpYuvGetCPUInfo) { |
||||
SharpYuvGetCPUInfo = cpu_info_func; |
||||
} |
||||
if (sharpyuv_last_cpuinfo_used == SharpYuvGetCPUInfo) { |
||||
UNLOCK_ACCESS_AND_RETURN; |
||||
} |
||||
|
||||
SharpYuvInitDsp(); |
||||
SharpYuvInitGammaTables(); |
||||
|
||||
sharpyuv_last_cpuinfo_used = SharpYuvGetCPUInfo; |
||||
UNLOCK_ACCESS_AND_RETURN; |
||||
} |
||||
|
||||
int SharpYuvConvert(const void* r_ptr, const void* g_ptr, const void* b_ptr, |
||||
int rgb_step, int rgb_stride, int rgb_bit_depth, |
||||
void* y_ptr, int y_stride, void* u_ptr, int u_stride, |
||||
void* v_ptr, int v_stride, int yuv_bit_depth, int width, |
||||
int height, const SharpYuvConversionMatrix* yuv_matrix) { |
||||
SharpYuvOptions options; |
||||
options.yuv_matrix = yuv_matrix; |
||||
options.transfer_type = kSharpYuvTransferFunctionSrgb; |
||||
return SharpYuvConvertWithOptions( |
||||
r_ptr, g_ptr, b_ptr, rgb_step, rgb_stride, rgb_bit_depth, y_ptr, y_stride, |
||||
u_ptr, u_stride, v_ptr, v_stride, yuv_bit_depth, width, height, &options); |
||||
} |
||||
|
||||
int SharpYuvOptionsInitInternal(const SharpYuvConversionMatrix* yuv_matrix, |
||||
SharpYuvOptions* options, int version) { |
||||
const int major = (version >> 24); |
||||
const int minor = (version >> 16) & 0xff; |
||||
if (options == NULL || yuv_matrix == NULL || |
||||
(major == SHARPYUV_VERSION_MAJOR && major == 0 && |
||||
minor != SHARPYUV_VERSION_MINOR) || |
||||
(major != SHARPYUV_VERSION_MAJOR)) { |
||||
return 0; |
||||
} |
||||
options->yuv_matrix = yuv_matrix; |
||||
options->transfer_type = kSharpYuvTransferFunctionSrgb; |
||||
return 1; |
||||
} |
||||
|
||||
int SharpYuvConvertWithOptions(const void* r_ptr, const void* g_ptr, |
||||
const void* b_ptr, int rgb_step, int rgb_stride, |
||||
int rgb_bit_depth, void* y_ptr, int y_stride, |
||||
void* u_ptr, int u_stride, void* v_ptr, |
||||
int v_stride, int yuv_bit_depth, int width, |
||||
int height, const SharpYuvOptions* options) { |
||||
const SharpYuvConversionMatrix* yuv_matrix = options->yuv_matrix; |
||||
SharpYuvTransferFunctionType transfer_type = options->transfer_type; |
||||
SharpYuvConversionMatrix scaled_matrix; |
||||
const int rgb_max = (1 << rgb_bit_depth) - 1; |
||||
const int rgb_round = 1 << (rgb_bit_depth - 1); |
||||
const int yuv_max = (1 << yuv_bit_depth) - 1; |
||||
const int sfix = GetPrecisionShift(rgb_bit_depth); |
||||
|
||||
if (width < 1 || height < 1 || width == INT_MAX || height == INT_MAX || |
||||
r_ptr == NULL || g_ptr == NULL || b_ptr == NULL || y_ptr == NULL || |
||||
u_ptr == NULL || v_ptr == NULL) { |
||||
return 0; |
||||
} |
||||
if (rgb_bit_depth != 8 && rgb_bit_depth != 10 && rgb_bit_depth != 12 && |
||||
rgb_bit_depth != 16) { |
||||
return 0; |
||||
} |
||||
if (yuv_bit_depth != 8 && yuv_bit_depth != 10 && yuv_bit_depth != 12) { |
||||
return 0; |
||||
} |
||||
if (rgb_bit_depth > 8 && (rgb_step % 2 != 0 || rgb_stride %2 != 0)) { |
||||
// Step/stride should be even for uint16_t buffers.
|
||||
return 0; |
||||
} |
||||
if (yuv_bit_depth > 8 && |
||||
(y_stride % 2 != 0 || u_stride % 2 != 0 || v_stride % 2 != 0)) { |
||||
// Stride should be even for uint16_t buffers.
|
||||
return 0; |
||||
} |
||||
// The address of the function pointer is used to avoid a read race.
|
||||
SharpYuvInit((VP8CPUInfo)&SharpYuvGetCPUInfo); |
||||
|
||||
// Add scaling factor to go from rgb_bit_depth to yuv_bit_depth, to the
|
||||
// rgb->yuv conversion matrix.
|
||||
if (rgb_bit_depth == yuv_bit_depth) { |
||||
memcpy(&scaled_matrix, yuv_matrix, sizeof(scaled_matrix)); |
||||
} else { |
||||
int i; |
||||
for (i = 0; i < 3; ++i) { |
||||
scaled_matrix.rgb_to_y[i] = |
||||
(yuv_matrix->rgb_to_y[i] * yuv_max + rgb_round) / rgb_max; |
||||
scaled_matrix.rgb_to_u[i] = |
||||
(yuv_matrix->rgb_to_u[i] * yuv_max + rgb_round) / rgb_max; |
||||
scaled_matrix.rgb_to_v[i] = |
||||
(yuv_matrix->rgb_to_v[i] * yuv_max + rgb_round) / rgb_max; |
||||
} |
||||
} |
||||
// Also incorporate precision change scaling.
|
||||
scaled_matrix.rgb_to_y[3] = Shift(yuv_matrix->rgb_to_y[3], sfix); |
||||
scaled_matrix.rgb_to_u[3] = Shift(yuv_matrix->rgb_to_u[3], sfix); |
||||
scaled_matrix.rgb_to_v[3] = Shift(yuv_matrix->rgb_to_v[3], sfix); |
||||
|
||||
return DoSharpArgbToYuv(r_ptr, g_ptr, b_ptr, rgb_step, rgb_stride, |
||||
rgb_bit_depth, y_ptr, y_stride, u_ptr, u_stride, |
||||
v_ptr, v_stride, yuv_bit_depth, width, height, |
||||
&scaled_matrix, transfer_type); |
||||
} |
||||
|
||||
//------------------------------------------------------------------------------
|
@ -0,0 +1,174 @@ |
||||
// Copyright 2022 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style license
|
||||
// that can be found in the COPYING file in the root of the source
|
||||
// tree. An additional intellectual property rights grant can be found
|
||||
// in the file PATENTS. All contributing project authors may
|
||||
// be found in the AUTHORS file in the root of the source tree.
|
||||
// -----------------------------------------------------------------------------
|
||||
//
|
||||
// Sharp RGB to YUV conversion.
|
||||
|
||||
#ifndef WEBP_SHARPYUV_SHARPYUV_H_ |
||||
#define WEBP_SHARPYUV_SHARPYUV_H_ |
||||
|
||||
#ifdef __cplusplus |
||||
extern "C" { |
||||
#endif |
||||
|
||||
#ifndef SHARPYUV_EXTERN |
||||
#ifdef WEBP_EXTERN |
||||
#define SHARPYUV_EXTERN WEBP_EXTERN |
||||
#else |
||||
// This explicitly marks library functions and allows for changing the
|
||||
// signature for e.g., Windows DLL builds.
|
||||
#if defined(__GNUC__) && __GNUC__ >= 4 |
||||
#define SHARPYUV_EXTERN extern __attribute__((visibility("default"))) |
||||
#else |
||||
#if defined(_MSC_VER) && defined(WEBP_DLL) |
||||
#define SHARPYUV_EXTERN __declspec(dllexport) |
||||
#else |
||||
#define SHARPYUV_EXTERN extern |
||||
#endif /* _MSC_VER && WEBP_DLL */ |
||||
#endif /* __GNUC__ >= 4 */ |
||||
#endif /* WEBP_EXTERN */ |
||||
#endif /* SHARPYUV_EXTERN */ |
||||
|
||||
#ifndef SHARPYUV_INLINE |
||||
#ifdef WEBP_INLINE |
||||
#define SHARPYUV_INLINE WEBP_INLINE |
||||
#else |
||||
#ifndef _MSC_VER |
||||
#if defined(__cplusplus) || !defined(__STRICT_ANSI__) || \ |
||||
(defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) |
||||
#define SHARPYUV_INLINE inline |
||||
#else |
||||
#define SHARPYUV_INLINE |
||||
#endif |
||||
#else |
||||
#define SHARPYUV_INLINE __forceinline |
||||
#endif /* _MSC_VER */ |
||||
#endif /* WEBP_INLINE */ |
||||
#endif /* SHARPYUV_INLINE */ |
||||
|
||||
// SharpYUV API version following the convention from semver.org
|
||||
#define SHARPYUV_VERSION_MAJOR 0 |
||||
#define SHARPYUV_VERSION_MINOR 4 |
||||
#define SHARPYUV_VERSION_PATCH 0 |
||||
// Version as a uint32_t. The major number is the high 8 bits.
|
||||
// The minor number is the middle 8 bits. The patch number is the low 16 bits.
|
||||
#define SHARPYUV_MAKE_VERSION(MAJOR, MINOR, PATCH) \ |
||||
(((MAJOR) << 24) | ((MINOR) << 16) | (PATCH)) |
||||
#define SHARPYUV_VERSION \ |
||||
SHARPYUV_MAKE_VERSION(SHARPYUV_VERSION_MAJOR, SHARPYUV_VERSION_MINOR, \
|
||||
SHARPYUV_VERSION_PATCH) |
||||
|
||||
// Returns the library's version number, packed in hexadecimal. See
|
||||
// SHARPYUV_VERSION.
|
||||
SHARPYUV_EXTERN int SharpYuvGetVersion(void); |
||||
|
||||
// RGB to YUV conversion matrix, in 16 bit fixed point.
|
||||
// y = rgb_to_y[0] * r + rgb_to_y[1] * g + rgb_to_y[2] * b + rgb_to_y[3]
|
||||
// u = rgb_to_u[0] * r + rgb_to_u[1] * g + rgb_to_u[2] * b + rgb_to_u[3]
|
||||
// v = rgb_to_v[0] * r + rgb_to_v[1] * g + rgb_to_v[2] * b + rgb_to_v[3]
|
||||
// Then y, u and v values are divided by 1<<16 and rounded.
|
||||
typedef struct { |
||||
int rgb_to_y[4]; |
||||
int rgb_to_u[4]; |
||||
int rgb_to_v[4]; |
||||
} SharpYuvConversionMatrix; |
||||
|
||||
typedef struct SharpYuvOptions SharpYuvOptions; |
||||
|
||||
// Enums for transfer functions, as defined in H.273,
|
||||
// https://www.itu.int/rec/T-REC-H.273-202107-I/en
|
||||
typedef enum SharpYuvTransferFunctionType { |
||||
// 0 is reserved
|
||||
kSharpYuvTransferFunctionBt709 = 1, |
||||
// 2 is unspecified
|
||||
// 3 is reserved
|
||||
kSharpYuvTransferFunctionBt470M = 4, |
||||
kSharpYuvTransferFunctionBt470Bg = 5, |
||||
kSharpYuvTransferFunctionBt601 = 6, |
||||
kSharpYuvTransferFunctionSmpte240 = 7, |
||||
kSharpYuvTransferFunctionLinear = 8, |
||||
kSharpYuvTransferFunctionLog100 = 9, |
||||
kSharpYuvTransferFunctionLog100_Sqrt10 = 10, |
||||
kSharpYuvTransferFunctionIec61966 = 11, |
||||
kSharpYuvTransferFunctionBt1361 = 12, |
||||
kSharpYuvTransferFunctionSrgb = 13, |
||||
kSharpYuvTransferFunctionBt2020_10Bit = 14, |
||||
kSharpYuvTransferFunctionBt2020_12Bit = 15, |
||||
kSharpYuvTransferFunctionSmpte2084 = 16, // PQ
|
||||
kSharpYuvTransferFunctionSmpte428 = 17, |
||||
kSharpYuvTransferFunctionHlg = 18, |
||||
kSharpYuvTransferFunctionNum |
||||
} SharpYuvTransferFunctionType; |
||||
|
||||
// Converts RGB to YUV420 using a downsampling algorithm that minimizes
|
||||
// artefacts caused by chroma subsampling.
|
||||
// This is slower than standard downsampling (averaging of 4 UV values).
|
||||
// Assumes that the image will be upsampled using a bilinear filter. If nearest
|
||||
// neighbor is used instead, the upsampled image might look worse than with
|
||||
// standard downsampling.
|
||||
// r_ptr, g_ptr, b_ptr: pointers to the source r, g and b channels. Should point
|
||||
// to uint8_t buffers if rgb_bit_depth is 8, or uint16_t buffers otherwise.
|
||||
// rgb_step: distance in bytes between two horizontally adjacent pixels on the
|
||||
// r, g and b channels. If rgb_bit_depth is > 8, it should be a
|
||||
// multiple of 2.
|
||||
// rgb_stride: distance in bytes between two vertically adjacent pixels on the
|
||||
// r, g, and b channels. If rgb_bit_depth is > 8, it should be a
|
||||
// multiple of 2.
|
||||
// rgb_bit_depth: number of bits for each r/g/b value. One of: 8, 10, 12, 16.
|
||||
// Note: 16 bit input is truncated to 14 bits before conversion to yuv.
|
||||
// yuv_bit_depth: number of bits for each y/u/v value. One of: 8, 10, 12.
|
||||
// y_ptr, u_ptr, v_ptr: pointers to the destination y, u and v channels. Should
|
||||
// point to uint8_t buffers if yuv_bit_depth is 8, or uint16_t buffers
|
||||
// otherwise.
|
||||
// y_stride, u_stride, v_stride: distance in bytes between two vertically
|
||||
// adjacent pixels on the y, u and v channels. If yuv_bit_depth > 8, they
|
||||
// should be multiples of 2.
|
||||
// width, height: width and height of the image in pixels
|
||||
// This function calls SharpYuvConvertWithOptions with a default transfer
|
||||
// function of kSharpYuvTransferFunctionSrgb.
|
||||
SHARPYUV_EXTERN int SharpYuvConvert(const void* r_ptr, const void* g_ptr, |
||||
const void* b_ptr, int rgb_step, |
||||
int rgb_stride, int rgb_bit_depth, |
||||
void* y_ptr, int y_stride, void* u_ptr, |
||||
int u_stride, void* v_ptr, int v_stride, |
||||
int yuv_bit_depth, int width, int height, |
||||
const SharpYuvConversionMatrix* yuv_matrix); |
||||
|
||||
struct SharpYuvOptions { |
||||
// This matrix cannot be NULL and can be initialized by
|
||||
// SharpYuvComputeConversionMatrix.
|
||||
const SharpYuvConversionMatrix* yuv_matrix; |
||||
SharpYuvTransferFunctionType transfer_type; |
||||
}; |
||||
|
||||
// Internal, version-checked, entry point
|
||||
SHARPYUV_EXTERN int SharpYuvOptionsInitInternal(const SharpYuvConversionMatrix*, |
||||
SharpYuvOptions*, int); |
||||
|
||||
// Should always be called, to initialize a fresh SharpYuvOptions
|
||||
// structure before modification. SharpYuvOptionsInit() must have succeeded
|
||||
// before using the 'options' object.
|
||||
static SHARPYUV_INLINE int SharpYuvOptionsInit( |
||||
const SharpYuvConversionMatrix* yuv_matrix, SharpYuvOptions* options) { |
||||
return SharpYuvOptionsInitInternal(yuv_matrix, options, SHARPYUV_VERSION); |
||||
} |
||||
|
||||
SHARPYUV_EXTERN int SharpYuvConvertWithOptions( |
||||
const void* r_ptr, const void* g_ptr, const void* b_ptr, int rgb_step, |
||||
int rgb_stride, int rgb_bit_depth, void* y_ptr, int y_stride, void* u_ptr, |
||||
int u_stride, void* v_ptr, int v_stride, int yuv_bit_depth, int width, |
||||
int height, const SharpYuvOptions* options); |
||||
|
||||
// TODO(b/194336375): Add YUV444 to YUV420 conversion. Maybe also add 422
|
||||
// support (it's rarely used in practice, especially for images).
|
||||
|
||||
#ifdef __cplusplus |
||||
} // extern "C"
|
||||
#endif |
||||
|
||||
#endif // WEBP_SHARPYUV_SHARPYUV_H_
|
@ -0,0 +1,14 @@ |
||||
// Copyright 2022 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style license
|
||||
// that can be found in the COPYING file in the root of the source
|
||||
// tree. An additional intellectual property rights grant can be found
|
||||
// in the file PATENTS. All contributing project authors may
|
||||
// be found in the AUTHORS file in the root of the source tree.
|
||||
// -----------------------------------------------------------------------------
|
||||
//
|
||||
#include "sharpyuv/sharpyuv_cpu.h" |
||||
|
||||
// Include src/dsp/cpu.c to create SharpYuvGetCPUInfo from VP8GetCPUInfo. The
|
||||
// function pointer is renamed in sharpyuv_cpu.h.
|
||||
#include "src/dsp/cpu.c" |
@ -0,0 +1,22 @@ |
||||
// Copyright 2022 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style license
|
||||
// that can be found in the COPYING file in the root of the source
|
||||
// tree. An additional intellectual property rights grant can be found
|
||||
// in the file PATENTS. All contributing project authors may
|
||||
// be found in the AUTHORS file in the root of the source tree.
|
||||
// -----------------------------------------------------------------------------
|
||||
//
|
||||
#ifndef WEBP_SHARPYUV_SHARPYUV_CPU_H_ |
||||
#define WEBP_SHARPYUV_SHARPYUV_CPU_H_ |
||||
|
||||
#include "sharpyuv/sharpyuv.h" |
||||
|
||||
// Avoid exporting SharpYuvGetCPUInfo in shared object / DLL builds.
|
||||
// SharpYuvInit() replaces the use of the function pointer.
|
||||
#undef WEBP_EXTERN |
||||
#define WEBP_EXTERN extern |
||||
#define VP8GetCPUInfo SharpYuvGetCPUInfo |
||||
#include "src/dsp/cpu.h" |
||||
|
||||
#endif // WEBP_SHARPYUV_SHARPYUV_CPU_H_
|
@ -0,0 +1,110 @@ |
||||
// Copyright 2022 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style license
|
||||
// that can be found in the COPYING file in the root of the source
|
||||
// tree. An additional intellectual property rights grant can be found
|
||||
// in the file PATENTS. All contributing project authors may
|
||||
// be found in the AUTHORS file in the root of the source tree.
|
||||
// -----------------------------------------------------------------------------
|
||||
//
|
||||
// Colorspace utilities.
|
||||
|
||||
#include "sharpyuv/sharpyuv_csp.h" |
||||
|
||||
#include <assert.h> |
||||
#include <math.h> |
||||
#include <stddef.h> |
||||
|
||||
static int ToFixed16(float f) { return (int)floor(f * (1 << 16) + 0.5f); } |
||||
|
||||
void SharpYuvComputeConversionMatrix(const SharpYuvColorSpace* yuv_color_space, |
||||
SharpYuvConversionMatrix* matrix) { |
||||
const float kr = yuv_color_space->kr; |
||||
const float kb = yuv_color_space->kb; |
||||
const float kg = 1.0f - kr - kb; |
||||
const float cr = 0.5f / (1.0f - kb); |
||||
const float cb = 0.5f / (1.0f - kr); |
||||
|
||||
const int shift = yuv_color_space->bit_depth - 8; |
||||
|
||||
const float denom = (float)((1 << yuv_color_space->bit_depth) - 1); |
||||
float scale_y = 1.0f; |
||||
float add_y = 0.0f; |
||||
float scale_u = cr; |
||||
float scale_v = cb; |
||||
float add_uv = (float)(128 << shift); |
||||
assert(yuv_color_space->bit_depth >= 8); |
||||
|
||||
if (yuv_color_space->range == kSharpYuvRangeLimited) { |
||||
scale_y *= (219 << shift) / denom; |
||||
scale_u *= (224 << shift) / denom; |
||||
scale_v *= (224 << shift) / denom; |
||||
add_y = (float)(16 << shift); |
||||
} |
||||
|
||||
matrix->rgb_to_y[0] = ToFixed16(kr * scale_y); |
||||
matrix->rgb_to_y[1] = ToFixed16(kg * scale_y); |
||||
matrix->rgb_to_y[2] = ToFixed16(kb * scale_y); |
||||
matrix->rgb_to_y[3] = ToFixed16(add_y); |
||||
|
||||
matrix->rgb_to_u[0] = ToFixed16(-kr * scale_u); |
||||
matrix->rgb_to_u[1] = ToFixed16(-kg * scale_u); |
||||
matrix->rgb_to_u[2] = ToFixed16((1 - kb) * scale_u); |
||||
matrix->rgb_to_u[3] = ToFixed16(add_uv); |
||||
|
||||
matrix->rgb_to_v[0] = ToFixed16((1 - kr) * scale_v); |
||||
matrix->rgb_to_v[1] = ToFixed16(-kg * scale_v); |
||||
matrix->rgb_to_v[2] = ToFixed16(-kb * scale_v); |
||||
matrix->rgb_to_v[3] = ToFixed16(add_uv); |
||||
} |
||||
|
||||
// Matrices are in YUV_FIX fixed point precision.
|
||||
// WebP's matrix, similar but not identical to kRec601LimitedMatrix.
|
||||
static const SharpYuvConversionMatrix kWebpMatrix = { |
||||
{16839, 33059, 6420, 16 << 16}, |
||||
{-9719, -19081, 28800, 128 << 16}, |
||||
{28800, -24116, -4684, 128 << 16}, |
||||
}; |
||||
// Kr=0.2990f Kb=0.1140f bits=8 range=kSharpYuvRangeLimited
|
||||
static const SharpYuvConversionMatrix kRec601LimitedMatrix = { |
||||
{16829, 33039, 6416, 16 << 16}, |
||||
{-9714, -19071, 28784, 128 << 16}, |
||||
{28784, -24103, -4681, 128 << 16}, |
||||
}; |
||||
// Kr=0.2990f Kb=0.1140f bits=8 range=kSharpYuvRangeFull
|
||||
static const SharpYuvConversionMatrix kRec601FullMatrix = { |
||||
{19595, 38470, 7471, 0}, |
||||
{-11058, -21710, 32768, 128 << 16}, |
||||
{32768, -27439, -5329, 128 << 16}, |
||||
}; |
||||
// Kr=0.2126f Kb=0.0722f bits=8 range=kSharpYuvRangeLimited
|
||||
static const SharpYuvConversionMatrix kRec709LimitedMatrix = { |
||||
{11966, 40254, 4064, 16 << 16}, |
||||
{-6596, -22189, 28784, 128 << 16}, |
||||
{28784, -26145, -2639, 128 << 16}, |
||||
}; |
||||
// Kr=0.2126f Kb=0.0722f bits=8 range=kSharpYuvRangeFull
|
||||
static const SharpYuvConversionMatrix kRec709FullMatrix = { |
||||
{13933, 46871, 4732, 0}, |
||||
{-7509, -25259, 32768, 128 << 16}, |
||||
{32768, -29763, -3005, 128 << 16}, |
||||
}; |
||||
|
||||
const SharpYuvConversionMatrix* SharpYuvGetConversionMatrix( |
||||
SharpYuvMatrixType matrix_type) { |
||||
switch (matrix_type) { |
||||
case kSharpYuvMatrixWebp: |
||||
return &kWebpMatrix; |
||||
case kSharpYuvMatrixRec601Limited: |
||||
return &kRec601LimitedMatrix; |
||||
case kSharpYuvMatrixRec601Full: |
||||
return &kRec601FullMatrix; |
||||
case kSharpYuvMatrixRec709Limited: |
||||
return &kRec709LimitedMatrix; |
||||
case kSharpYuvMatrixRec709Full: |
||||
return &kRec709FullMatrix; |
||||
case kSharpYuvMatrixNum: |
||||
return NULL; |
||||
} |
||||
return NULL; |
||||
} |
@ -0,0 +1,60 @@ |
||||
// Copyright 2022 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style license
|
||||
// that can be found in the COPYING file in the root of the source
|
||||
// tree. An additional intellectual property rights grant can be found
|
||||
// in the file PATENTS. All contributing project authors may
|
||||
// be found in the AUTHORS file in the root of the source tree.
|
||||
// -----------------------------------------------------------------------------
|
||||
//
|
||||
// Colorspace utilities.
|
||||
|
||||
#ifndef WEBP_SHARPYUV_SHARPYUV_CSP_H_ |
||||
#define WEBP_SHARPYUV_SHARPYUV_CSP_H_ |
||||
|
||||
#include "sharpyuv/sharpyuv.h" |
||||
|
||||
#ifdef __cplusplus |
||||
extern "C" { |
||||
#endif |
||||
|
||||
// Range of YUV values.
|
||||
typedef enum { |
||||
kSharpYuvRangeFull, // YUV values between [0;255] (for 8 bit)
|
||||
kSharpYuvRangeLimited // Y in [16;235], YUV in [16;240] (for 8 bit)
|
||||
} SharpYuvRange; |
||||
|
||||
// Constants that define a YUV color space.
|
||||
typedef struct { |
||||
// Kr and Kb are defined such that:
|
||||
// Y = Kr * r + Kg * g + Kb * b where Kg = 1 - Kr - Kb.
|
||||
float kr; |
||||
float kb; |
||||
int bit_depth; // 8, 10 or 12
|
||||
SharpYuvRange range; |
||||
} SharpYuvColorSpace; |
||||
|
||||
// Fills in 'matrix' for the given YUVColorSpace.
|
||||
SHARPYUV_EXTERN void SharpYuvComputeConversionMatrix( |
||||
const SharpYuvColorSpace* yuv_color_space, |
||||
SharpYuvConversionMatrix* matrix); |
||||
|
||||
// Enums for precomputed conversion matrices.
|
||||
typedef enum { |
||||
kSharpYuvMatrixWebp = 0, |
||||
kSharpYuvMatrixRec601Limited, |
||||
kSharpYuvMatrixRec601Full, |
||||
kSharpYuvMatrixRec709Limited, |
||||
kSharpYuvMatrixRec709Full, |
||||
kSharpYuvMatrixNum |
||||
} SharpYuvMatrixType; |
||||
|
||||
// Returns a pointer to a matrix for one of the predefined colorspaces.
|
||||
SHARPYUV_EXTERN const SharpYuvConversionMatrix* SharpYuvGetConversionMatrix( |
||||
SharpYuvMatrixType matrix_type); |
||||
|
||||
#ifdef __cplusplus |
||||
} // extern "C"
|
||||
#endif |
||||
|
||||
#endif // WEBP_SHARPYUV_SHARPYUV_CSP_H_
|
@ -0,0 +1,104 @@ |
||||
// Copyright 2022 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style license
|
||||
// that can be found in the COPYING file in the root of the source
|
||||
// tree. An additional intellectual property rights grant can be found
|
||||
// in the file PATENTS. All contributing project authors may
|
||||
// be found in the AUTHORS file in the root of the source tree.
|
||||
// -----------------------------------------------------------------------------
|
||||
//
|
||||
// Speed-critical functions for Sharp YUV.
|
||||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "sharpyuv/sharpyuv_dsp.h" |
||||
|
||||
#include <assert.h> |
||||
#include <stdlib.h> |
||||
|
||||
#include "sharpyuv/sharpyuv_cpu.h" |
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE |
||||
static uint16_t clip(int v, int max) { |
||||
return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v; |
||||
} |
||||
|
||||
static uint64_t SharpYuvUpdateY_C(const uint16_t* ref, const uint16_t* src, |
||||
uint16_t* dst, int len, int bit_depth) { |
||||
uint64_t diff = 0; |
||||
int i; |
||||
const int max_y = (1 << bit_depth) - 1; |
||||
for (i = 0; i < len; ++i) { |
||||
const int diff_y = ref[i] - src[i]; |
||||
const int new_y = (int)dst[i] + diff_y; |
||||
dst[i] = clip(new_y, max_y); |
||||
diff += (uint64_t)abs(diff_y); |
||||
} |
||||
return diff; |
||||
} |
||||
|
||||
static void SharpYuvUpdateRGB_C(const int16_t* ref, const int16_t* src, |
||||
int16_t* dst, int len) { |
||||
int i; |
||||
for (i = 0; i < len; ++i) { |
||||
const int diff_uv = ref[i] - src[i]; |
||||
dst[i] += diff_uv; |
||||
} |
||||
} |
||||
|
||||
static void SharpYuvFilterRow_C(const int16_t* A, const int16_t* B, int len, |
||||
const uint16_t* best_y, uint16_t* out, |
||||
int bit_depth) { |
||||
int i; |
||||
const int max_y = (1 << bit_depth) - 1; |
||||
for (i = 0; i < len; ++i, ++A, ++B) { |
||||
const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4; |
||||
const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4; |
||||
out[2 * i + 0] = clip(best_y[2 * i + 0] + v0, max_y); |
||||
out[2 * i + 1] = clip(best_y[2 * i + 1] + v1, max_y); |
||||
} |
||||
} |
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
uint64_t (*SharpYuvUpdateY)(const uint16_t* src, const uint16_t* ref, |
||||
uint16_t* dst, int len, int bit_depth); |
||||
void (*SharpYuvUpdateRGB)(const int16_t* src, const int16_t* ref, int16_t* dst, |
||||
int len); |
||||
void (*SharpYuvFilterRow)(const int16_t* A, const int16_t* B, int len, |
||||
const uint16_t* best_y, uint16_t* out, |
||||
int bit_depth); |
||||
|
||||
extern VP8CPUInfo SharpYuvGetCPUInfo; |
||||
extern void InitSharpYuvSSE2(void); |
||||
extern void InitSharpYuvNEON(void); |
||||
|
||||
void SharpYuvInitDsp(void) { |
||||
#if !WEBP_NEON_OMIT_C_CODE |
||||
SharpYuvUpdateY = SharpYuvUpdateY_C; |
||||
SharpYuvUpdateRGB = SharpYuvUpdateRGB_C; |
||||
SharpYuvFilterRow = SharpYuvFilterRow_C; |
||||
#endif |
||||
|
||||
if (SharpYuvGetCPUInfo != NULL) { |
||||
#if defined(WEBP_HAVE_SSE2) |
||||
if (SharpYuvGetCPUInfo(kSSE2)) { |
||||
InitSharpYuvSSE2(); |
||||
} |
||||
#endif // WEBP_HAVE_SSE2
|
||||
} |
||||
|
||||
#if defined(WEBP_HAVE_NEON) |
||||
if (WEBP_NEON_OMIT_C_CODE || |
||||
(SharpYuvGetCPUInfo != NULL && SharpYuvGetCPUInfo(kNEON))) { |
||||
InitSharpYuvNEON(); |
||||
} |
||||
#endif // WEBP_HAVE_NEON
|
||||
|
||||
assert(SharpYuvUpdateY != NULL); |
||||
assert(SharpYuvUpdateRGB != NULL); |
||||
assert(SharpYuvFilterRow != NULL); |
||||
} |
@ -0,0 +1,28 @@ |
||||
// Copyright 2022 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style license
|
||||
// that can be found in the COPYING file in the root of the source
|
||||
// tree. An additional intellectual property rights grant can be found
|
||||
// in the file PATENTS. All contributing project authors may
|
||||
// be found in the AUTHORS file in the root of the source tree.
|
||||
// -----------------------------------------------------------------------------
|
||||
//
|
||||
// Speed-critical functions for Sharp YUV.
|
||||
|
||||
#ifndef WEBP_SHARPYUV_SHARPYUV_DSP_H_ |
||||
#define WEBP_SHARPYUV_SHARPYUV_DSP_H_ |
||||
|
||||
#include "sharpyuv/sharpyuv_cpu.h" |
||||
#include "src/webp/types.h" |
||||
|
||||
extern uint64_t (*SharpYuvUpdateY)(const uint16_t* src, const uint16_t* ref, |
||||
uint16_t* dst, int len, int bit_depth); |
||||
extern void (*SharpYuvUpdateRGB)(const int16_t* src, const int16_t* ref, |
||||
int16_t* dst, int len); |
||||
extern void (*SharpYuvFilterRow)(const int16_t* A, const int16_t* B, int len, |
||||
const uint16_t* best_y, uint16_t* out, |
||||
int bit_depth); |
||||
|
||||
void SharpYuvInitDsp(void); |
||||
|
||||
#endif // WEBP_SHARPYUV_SHARPYUV_DSP_H_
|
@ -0,0 +1,419 @@ |
||||
// Copyright 2022 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style license
|
||||
// that can be found in the COPYING file in the root of the source
|
||||
// tree. An additional intellectual property rights grant can be found
|
||||
// in the file PATENTS. All contributing project authors may
|
||||
// be found in the AUTHORS file in the root of the source tree.
|
||||
// -----------------------------------------------------------------------------
|
||||
//
|
||||
// Gamma correction utilities.
|
||||
|
||||
#include "sharpyuv/sharpyuv_gamma.h" |
||||
|
||||
#include <assert.h> |
||||
#include <float.h> |
||||
#include <math.h> |
||||
|
||||
#include "src/webp/types.h" |
||||
|
||||
// Gamma correction compensates loss of resolution during chroma subsampling.
|
||||
// Size of pre-computed table for converting from gamma to linear.
|
||||
#define GAMMA_TO_LINEAR_TAB_BITS 10 |
||||
#define GAMMA_TO_LINEAR_TAB_SIZE (1 << GAMMA_TO_LINEAR_TAB_BITS) |
||||
static uint32_t kGammaToLinearTabS[GAMMA_TO_LINEAR_TAB_SIZE + 2]; |
||||
#define LINEAR_TO_GAMMA_TAB_BITS 9 |
||||
#define LINEAR_TO_GAMMA_TAB_SIZE (1 << LINEAR_TO_GAMMA_TAB_BITS) |
||||
static uint32_t kLinearToGammaTabS[LINEAR_TO_GAMMA_TAB_SIZE + 2]; |
||||
|
||||
static const double kGammaF = 1. / 0.45; |
||||
#define GAMMA_TO_LINEAR_BITS 16 |
||||
|
||||
static volatile int kGammaTablesSOk = 0; |
||||
void SharpYuvInitGammaTables(void) { |
||||
assert(GAMMA_TO_LINEAR_BITS <= 16); |
||||
if (!kGammaTablesSOk) { |
||||
int v; |
||||
const double a = 0.09929682680944; |
||||
const double thresh = 0.018053968510807; |
||||
const double final_scale = 1 << GAMMA_TO_LINEAR_BITS; |
||||
// Precompute gamma to linear table.
|
||||
{ |
||||
const double norm = 1. / GAMMA_TO_LINEAR_TAB_SIZE; |
||||
const double a_rec = 1. / (1. + a); |
||||
for (v = 0; v <= GAMMA_TO_LINEAR_TAB_SIZE; ++v) { |
||||
const double g = norm * v; |
||||
double value; |
||||
if (g <= thresh * 4.5) { |
||||
value = g / 4.5; |
||||
} else { |
||||
value = pow(a_rec * (g + a), kGammaF); |
||||
} |
||||
kGammaToLinearTabS[v] = (uint32_t)(value * final_scale + .5); |
||||
} |
||||
// to prevent small rounding errors to cause read-overflow:
|
||||
kGammaToLinearTabS[GAMMA_TO_LINEAR_TAB_SIZE + 1] = |
||||
kGammaToLinearTabS[GAMMA_TO_LINEAR_TAB_SIZE]; |
||||
} |
||||
// Precompute linear to gamma table.
|
||||
{ |
||||
const double scale = 1. / LINEAR_TO_GAMMA_TAB_SIZE; |
||||
for (v = 0; v <= LINEAR_TO_GAMMA_TAB_SIZE; ++v) { |
||||
const double g = scale * v; |
||||
double value; |
||||
if (g <= thresh) { |
||||
value = 4.5 * g; |
||||
} else { |
||||
value = (1. + a) * pow(g, 1. / kGammaF) - a; |
||||
} |
||||
kLinearToGammaTabS[v] = |
||||
(uint32_t)(final_scale * value + 0.5); |
||||
} |
||||
// to prevent small rounding errors to cause read-overflow:
|
||||
kLinearToGammaTabS[LINEAR_TO_GAMMA_TAB_SIZE + 1] = |
||||
kLinearToGammaTabS[LINEAR_TO_GAMMA_TAB_SIZE]; |
||||
} |
||||
kGammaTablesSOk = 1; |
||||
} |
||||
} |
||||
|
||||
static WEBP_INLINE int Shift(int v, int shift) { |
||||
return (shift >= 0) ? (v << shift) : (v >> -shift); |
||||
} |
||||
|
||||
static WEBP_INLINE uint32_t FixedPointInterpolation(int v, uint32_t* tab, |
||||
int tab_pos_shift_right, |
||||
int tab_value_shift) { |
||||
const uint32_t tab_pos = Shift(v, -tab_pos_shift_right); |
||||
// fractional part, in 'tab_pos_shift' fixed-point precision
|
||||
const uint32_t x = v - (tab_pos << tab_pos_shift_right); // fractional part
|
||||
// v0 / v1 are in kGammaToLinearBits fixed-point precision (range [0..1])
|
||||
const uint32_t v0 = Shift(tab[tab_pos + 0], tab_value_shift); |
||||
const uint32_t v1 = Shift(tab[tab_pos + 1], tab_value_shift); |
||||
// Final interpolation.
|
||||
const uint32_t v2 = (v1 - v0) * x; // note: v1 >= v0.
|
||||
const int half = |
||||
(tab_pos_shift_right > 0) ? 1 << (tab_pos_shift_right - 1) : 0; |
||||
const uint32_t result = v0 + ((v2 + half) >> tab_pos_shift_right); |
||||
return result; |
||||
} |
||||
|
||||
static uint32_t ToLinearSrgb(uint16_t v, int bit_depth) { |
||||
const int shift = GAMMA_TO_LINEAR_TAB_BITS - bit_depth; |
||||
if (shift > 0) { |
||||
return kGammaToLinearTabS[v << shift]; |
||||
} |
||||
return FixedPointInterpolation(v, kGammaToLinearTabS, -shift, 0); |
||||
} |
||||
|
||||
static uint16_t FromLinearSrgb(uint32_t value, int bit_depth) { |
||||
return FixedPointInterpolation( |
||||
value, kLinearToGammaTabS, |
||||
(GAMMA_TO_LINEAR_BITS - LINEAR_TO_GAMMA_TAB_BITS), |
||||
bit_depth - GAMMA_TO_LINEAR_BITS); |
||||
} |
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define CLAMP(x, low, high) \ |
||||
(((x) < (low)) ? (low) : (((high) < (x)) ? (high) : (x))) |
||||
#define MIN(a, b) (((a) < (b)) ? (a) : (b)) |
||||
#define MAX(a, b) (((a) > (b)) ? (a) : (b)) |
||||
|
||||
static WEBP_INLINE float Roundf(float x) { |
||||
if (x < 0) |
||||
return (float)ceil((double)(x - 0.5f)); |
||||
else |
||||
return (float)floor((double)(x + 0.5f)); |
||||
} |
||||
|
||||
static WEBP_INLINE float Powf(float base, float exp) { |
||||
return (float)pow((double)base, (double)exp); |
||||
} |
||||
|
||||
static WEBP_INLINE float Log10f(float x) { return (float)log10((double)x); } |
||||
|
||||
static float ToLinear709(float gamma) { |
||||
if (gamma < 0.f) { |
||||
return 0.f; |
||||
} else if (gamma < 4.5f * 0.018053968510807f) { |
||||
return gamma / 4.5f; |
||||
} else if (gamma < 1.f) { |
||||
return Powf((gamma + 0.09929682680944f) / 1.09929682680944f, 1.f / 0.45f); |
||||
} |
||||
return 1.f; |
||||
} |
||||
|
||||
static float FromLinear709(float linear) { |
||||
if (linear < 0.f) { |
||||
return 0.f; |
||||
} else if (linear < 0.018053968510807f) { |
||||
return linear * 4.5f; |
||||
} else if (linear < 1.f) { |
||||
return 1.09929682680944f * Powf(linear, 0.45f) - 0.09929682680944f; |
||||
} |
||||
return 1.f; |
||||
} |
||||
|
||||
static float ToLinear470M(float gamma) { |
||||
return Powf(CLAMP(gamma, 0.f, 1.f), 1.f / 2.2f); |
||||
} |
||||
|
||||
static float FromLinear470M(float linear) { |
||||
return Powf(CLAMP(linear, 0.f, 1.f), 2.2f); |
||||
} |
||||
|
||||
static float ToLinear470Bg(float gamma) { |
||||
return Powf(CLAMP(gamma, 0.f, 1.f), 1.f / 2.8f); |
||||
} |
||||
|
||||
static float FromLinear470Bg(float linear) { |
||||
return Powf(CLAMP(linear, 0.f, 1.f), 2.8f); |
||||
} |
||||
|
||||
static float ToLinearSmpte240(float gamma) { |
||||
if (gamma < 0.f) { |
||||
return 0.f; |
||||
} else if (gamma < 4.f * 0.022821585529445f) { |
||||
return gamma / 4.f; |
||||
} else if (gamma < 1.f) { |
||||
return Powf((gamma + 0.111572195921731f) / 1.111572195921731f, 1.f / 0.45f); |
||||
} |
||||
return 1.f; |
||||
} |
||||
|
||||
static float FromLinearSmpte240(float linear) { |
||||
if (linear < 0.f) { |
||||
return 0.f; |
||||
} else if (linear < 0.022821585529445f) { |
||||
return linear * 4.f; |
||||
} else if (linear < 1.f) { |
||||
return 1.111572195921731f * Powf(linear, 0.45f) - 0.111572195921731f; |
||||
} |
||||
return 1.f; |
||||
} |
||||
|
||||
static float ToLinearLog100(float gamma) { |
||||
return (gamma < 0.01f) ? 0.0f : 1.0f + Log10f(MIN(gamma, 1.f)) / 2.0f; |
||||
} |
||||
|
||||
static float FromLinearLog100(float linear) { |
||||
// The function is non-bijective so choose the middle of [0, 0.01].
|
||||
const float mid_interval = 0.01f / 2.f; |
||||
return (linear <= 0.0f) ? mid_interval |
||||
                         : Powf(10.0f, 2.f * (MIN(linear, 1.f) - 1.0f));
}

static float ToLinearLog100Sqrt10(float gamma) {
  return (gamma < 0.00316227766f) ? 0.0f
                                  : 1.0f + Log10f(MIN(gamma, 1.f)) / 2.5f;
}

static float FromLinearLog100Sqrt10(float linear) {
  // The function is non-bijective so choose the middle of [0, 0.00316227766f[.
  const float mid_interval = 0.00316227766f / 2.f;
  return (linear < 0.0f) ? mid_interval
                         : Powf(10.0f, 2.5f * (MIN(linear, 1.f) - 1.0f));
}

static float ToLinearIec61966(float gamma) {
  if (gamma <= -4.5f * 0.018053968510807f) {
    return Powf((-gamma + 0.09929682680944f) / -1.09929682680944f, 1.f / 0.45f);
  } else if (gamma < 4.5f * 0.018053968510807f) {
    return gamma / 4.5f;
  }
  return Powf((gamma + 0.09929682680944f) / 1.09929682680944f, 1.f / 0.45f);
}

static float FromLinearIec61966(float linear) {
  if (linear <= -0.018053968510807f) {
    return -1.09929682680944f * Powf(-linear, 0.45f) + 0.09929682680944f;
  } else if (linear < 0.018053968510807f) {
    return linear * 4.5f;
  }
  return 1.09929682680944f * Powf(linear, 0.45f) - 0.09929682680944f;
}

static float ToLinearBt1361(float gamma) {
  if (gamma < -0.25f) {
    return -0.25f;
  } else if (gamma < 0.f) {
    return Powf((gamma - 0.02482420670236f) / -0.27482420670236f, 1.f / 0.45f) /
           -4.f;
  } else if (gamma < 4.5f * 0.018053968510807f) {
    return gamma / 4.5f;
  } else if (gamma < 1.f) {
    return Powf((gamma + 0.09929682680944f) / 1.09929682680944f, 1.f / 0.45f);
  }
  return 1.f;
}

static float FromLinearBt1361(float linear) {
  if (linear < -0.25f) {
    return -0.25f;
  } else if (linear < 0.f) {
    return -0.27482420670236f * Powf(-4.f * linear, 0.45f) + 0.02482420670236f;
  } else if (linear < 0.018053968510807f) {
    return linear * 4.5f;
  } else if (linear < 1.f) {
    return 1.09929682680944f * Powf(linear, 0.45f) - 0.09929682680944f;
  }
  return 1.f;
}

static float ToLinearPq(float gamma) {
  if (gamma > 0.f) {
    const float pow_gamma = Powf(gamma, 32.f / 2523.f);
    const float num = MAX(pow_gamma - 107.f / 128.f, 0.0f);
    const float den = MAX(2413.f / 128.f - 2392.f / 128.f * pow_gamma, FLT_MIN);
    return Powf(num / den, 4096.f / 653.f);
  }
  return 0.f;
}

static float FromLinearPq(float linear) {
  if (linear > 0.f) {
    const float pow_linear = Powf(linear, 653.f / 4096.f);
    const float num = 107.f / 128.f + 2413.f / 128.f * pow_linear;
    const float den = 1.0f + 2392.f / 128.f * pow_linear;
    return Powf(num / den, 2523.f / 32.f);
  }
  return 0.f;
}

static float ToLinearSmpte428(float gamma) {
  return Powf(0.91655527974030934f * MAX(gamma, 0.f), 1.f / 2.6f);
}

static float FromLinearSmpte428(float linear) {
  return Powf(MAX(linear, 0.f), 2.6f) / 0.91655527974030934f;
}

// Conversion in BT.2100 requires RGB info. Simplify to gamma correction here.
static float ToLinearHlg(float gamma) {
  if (gamma < 0.f) {
    return 0.f;
  } else if (gamma <= 0.5f) {
    return Powf((gamma * gamma) * (1.f / 3.f), 1.2f);
  }
  return Powf((expf((gamma - 0.55991073f) / 0.17883277f) + 0.28466892f) / 12.0f,
              1.2f);
}

static float FromLinearHlg(float linear) {
  linear = Powf(linear, 1.f / 1.2f);
  if (linear < 0.f) {
    return 0.f;
  } else if (linear <= (1.f / 12.f)) {
    return sqrtf(3.f * linear);
  }
  return 0.17883277f * logf(12.f * linear - 0.28466892f) + 0.55991073f;
}

uint32_t SharpYuvGammaToLinear(uint16_t v, int bit_depth,
                               SharpYuvTransferFunctionType transfer_type) {
  float v_float, linear;
  if (transfer_type == kSharpYuvTransferFunctionSrgb) {
    return ToLinearSrgb(v, bit_depth);
  }
  v_float = (float)v / ((1 << bit_depth) - 1);
  switch (transfer_type) {
    case kSharpYuvTransferFunctionBt709:
    case kSharpYuvTransferFunctionBt601:
    case kSharpYuvTransferFunctionBt2020_10Bit:
    case kSharpYuvTransferFunctionBt2020_12Bit:
      linear = ToLinear709(v_float);
      break;
    case kSharpYuvTransferFunctionBt470M:
      linear = ToLinear470M(v_float);
      break;
    case kSharpYuvTransferFunctionBt470Bg:
      linear = ToLinear470Bg(v_float);
      break;
    case kSharpYuvTransferFunctionSmpte240:
      linear = ToLinearSmpte240(v_float);
      break;
    case kSharpYuvTransferFunctionLinear:
      return v;
    case kSharpYuvTransferFunctionLog100:
      linear = ToLinearLog100(v_float);
      break;
    case kSharpYuvTransferFunctionLog100_Sqrt10:
      linear = ToLinearLog100Sqrt10(v_float);
      break;
    case kSharpYuvTransferFunctionIec61966:
      linear = ToLinearIec61966(v_float);
      break;
    case kSharpYuvTransferFunctionBt1361:
      linear = ToLinearBt1361(v_float);
      break;
    case kSharpYuvTransferFunctionSmpte2084:
      linear = ToLinearPq(v_float);
      break;
    case kSharpYuvTransferFunctionSmpte428:
      linear = ToLinearSmpte428(v_float);
      break;
    case kSharpYuvTransferFunctionHlg:
      linear = ToLinearHlg(v_float);
      break;
    default:
      assert(0);
      linear = 0;
      break;
  }
  return (uint32_t)Roundf(linear * ((1 << 16) - 1));
}

uint16_t SharpYuvLinearToGamma(uint32_t v, int bit_depth,
                               SharpYuvTransferFunctionType transfer_type) {
  float v_float, linear;
  if (transfer_type == kSharpYuvTransferFunctionSrgb) {
    return FromLinearSrgb(v, bit_depth);
  }
  v_float = (float)v / ((1 << 16) - 1);
  switch (transfer_type) {
    case kSharpYuvTransferFunctionBt709:
    case kSharpYuvTransferFunctionBt601:
    case kSharpYuvTransferFunctionBt2020_10Bit:
    case kSharpYuvTransferFunctionBt2020_12Bit:
      linear = FromLinear709(v_float);
      break;
    case kSharpYuvTransferFunctionBt470M:
      linear = FromLinear470M(v_float);
      break;
    case kSharpYuvTransferFunctionBt470Bg:
      linear = FromLinear470Bg(v_float);
      break;
    case kSharpYuvTransferFunctionSmpte240:
      linear = FromLinearSmpte240(v_float);
      break;
    case kSharpYuvTransferFunctionLinear:
      return v;
    case kSharpYuvTransferFunctionLog100:
      linear = FromLinearLog100(v_float);
      break;
    case kSharpYuvTransferFunctionLog100_Sqrt10:
      linear = FromLinearLog100Sqrt10(v_float);
      break;
    case kSharpYuvTransferFunctionIec61966:
      linear = FromLinearIec61966(v_float);
      break;
    case kSharpYuvTransferFunctionBt1361:
      linear = FromLinearBt1361(v_float);
      break;
    case kSharpYuvTransferFunctionSmpte2084:
      linear = FromLinearPq(v_float);
      break;
    case kSharpYuvTransferFunctionSmpte428:
      linear = FromLinearSmpte428(v_float);
      break;
    case kSharpYuvTransferFunctionHlg:
      linear = FromLinearHlg(v_float);
      break;
    default:
      assert(0);
      linear = 0;
      break;
  }
  return (uint16_t)Roundf(linear * ((1 << bit_depth) - 1));
}
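As a reading aid (not part of the patch): the fractional constants in ToLinearPq() / FromLinearPq() above are the SMPTE ST 2084 (PQ) constants written as exact ratios, with 653/4096 serving as the code's rational approximation of m1 = 2610/16384. Assuming the standard normalized EOTF, ToLinearPq() computes

$$ L = \left(\frac{\max(E^{1/m_2} - c_1,\,0)}{c_2 - c_3\,E^{1/m_2}}\right)^{1/m_1}, \qquad m_2 = \tfrac{2523}{32},\; c_1 = \tfrac{107}{128},\; c_2 = \tfrac{2413}{128},\; c_3 = \tfrac{2392}{128}, $$

and FromLinearPq() is the matching inverse mapping back from linear light.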
@ -0,0 +1,38 @@ |
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Gamma correction utilities.

#ifndef WEBP_SHARPYUV_SHARPYUV_GAMMA_H_
#define WEBP_SHARPYUV_SHARPYUV_GAMMA_H_

#include "sharpyuv/sharpyuv.h"
#include "src/webp/types.h"

#ifdef __cplusplus
extern "C" {
#endif

// Initializes precomputed tables. Must be called once before calling
// SharpYuvGammaToLinear or SharpYuvLinearToGamma.
void SharpYuvInitGammaTables(void);

// Converts a 'bit_depth'-bit gamma color value to a 16-bit linear value.
uint32_t SharpYuvGammaToLinear(uint16_t v, int bit_depth,
                               SharpYuvTransferFunctionType transfer_type);

// Converts a 16-bit linear color value to a 'bit_depth'-bit gamma value.
uint16_t SharpYuvLinearToGamma(uint32_t value, int bit_depth,
                               SharpYuvTransferFunctionType transfer_type);

#ifdef __cplusplus
}    // extern "C"
#endif

#endif  // WEBP_SHARPYUV_SHARPYUV_GAMMA_H_
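A minimal usage sketch of the two declarations above (illustrative only; the 10-bit PQ sample and the helper name are not from the patch):

// Round-trips one 10-bit PQ-coded sample through the 16-bit linear scale
// used internally. SharpYuvInitGammaTables() is assumed to have been called
// once beforehand, as required by the header comment.
static uint16_t RoundTrip10BitPq(uint16_t coded) {
  const int kBitDepth = 10;
  const uint32_t linear = SharpYuvGammaToLinear(
      coded, kBitDepth, kSharpYuvTransferFunctionSmpte2084);
  return SharpYuvLinearToGamma(linear, kBitDepth,
                               kSharpYuvTransferFunctionSmpte2084);
}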
@ -0,0 +1,181 @@ |
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Speed-critical functions for Sharp YUV.
//
// Author: Skal (pascal.massimino@gmail.com)

#include "sharpyuv/sharpyuv_dsp.h"

#if defined(WEBP_USE_NEON)
#include <assert.h>
#include <stdlib.h>
#include <arm_neon.h>

static uint16_t clip_NEON(int v, int max) {
  return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v;
}

static uint64_t SharpYuvUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
                                     uint16_t* dst, int len, int bit_depth) {
  const int max_y = (1 << bit_depth) - 1;
  int i;
  const int16x8_t zero = vdupq_n_s16(0);
  const int16x8_t max = vdupq_n_s16(max_y);
  uint64x2_t sum = vdupq_n_u64(0);
  uint64_t diff;

  for (i = 0; i + 8 <= len; i += 8) {
    const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
    const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
    const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
    const int16x8_t D = vsubq_s16(A, B);   // diff_y
    const int16x8_t F = vaddq_s16(C, D);   // new_y
    const uint16x8_t H =
        vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
    const int16x8_t I = vabsq_s16(D);      // abs(diff_y)
    vst1q_u16(dst + i, H);
    sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
  }
  diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
  for (; i < len; ++i) {
    const int diff_y = ref[i] - src[i];
    const int new_y = (int)(dst[i]) + diff_y;
    dst[i] = clip_NEON(new_y, max_y);
    diff += (uint64_t)(abs(diff_y));
  }
  return diff;
}

static void SharpYuvUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
                                   int16_t* dst, int len) {
  int i;
  for (i = 0; i + 8 <= len; i += 8) {
    const int16x8_t A = vld1q_s16(ref + i);
    const int16x8_t B = vld1q_s16(src + i);
    const int16x8_t C = vld1q_s16(dst + i);
    const int16x8_t D = vsubq_s16(A, B);   // diff_uv
    const int16x8_t E = vaddq_s16(C, D);   // new_uv
    vst1q_s16(dst + i, E);
  }
  for (; i < len; ++i) {
    const int diff_uv = ref[i] - src[i];
    dst[i] += diff_uv;
  }
}

static void SharpYuvFilterRow16_NEON(const int16_t* A, const int16_t* B,
                                     int len, const uint16_t* best_y,
                                     uint16_t* out, int bit_depth) {
  const int max_y = (1 << bit_depth) - 1;
  int i;
  const int16x8_t max = vdupq_n_s16(max_y);
  const int16x8_t zero = vdupq_n_s16(0);
  for (i = 0; i + 8 <= len; i += 8) {
    const int16x8_t a0 = vld1q_s16(A + i + 0);
    const int16x8_t a1 = vld1q_s16(A + i + 1);
    const int16x8_t b0 = vld1q_s16(B + i + 0);
    const int16x8_t b1 = vld1q_s16(B + i + 1);
    const int16x8_t a0b1 = vaddq_s16(a0, b1);
    const int16x8_t a1b0 = vaddq_s16(a1, b0);
    const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0);  // A0+A1+B0+B1
    const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1);    // 2*(A0+B1)
    const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0);    // 2*(A1+B0)
    const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
    const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
    const int16x8_t e0 = vrhaddq_s16(c1, a0);
    const int16x8_t e1 = vrhaddq_s16(c0, a1);
    const int16x8x2_t f = vzipq_s16(e0, e1);
    const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
    const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
    const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
    const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
    const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
    const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
    vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
    vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
  }
  for (; i < len; ++i) {
    const int a0b1 = A[i + 0] + B[i + 1];
    const int a1b0 = A[i + 1] + B[i + 0];
    const int a0a1b0b1 = a0b1 + a1b0 + 8;
    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
    out[2 * i + 0] = clip_NEON(best_y[2 * i + 0] + v0, max_y);
    out[2 * i + 1] = clip_NEON(best_y[2 * i + 1] + v1, max_y);
  }
}

static void SharpYuvFilterRow32_NEON(const int16_t* A, const int16_t* B,
                                     int len, const uint16_t* best_y,
                                     uint16_t* out, int bit_depth) {
  const int max_y = (1 << bit_depth) - 1;
  int i;
  const uint16x8_t max = vdupq_n_u16(max_y);
  for (i = 0; i + 4 <= len; i += 4) {
    const int16x4_t a0 = vld1_s16(A + i + 0);
    const int16x4_t a1 = vld1_s16(A + i + 1);
    const int16x4_t b0 = vld1_s16(B + i + 0);
    const int16x4_t b1 = vld1_s16(B + i + 1);
    const int32x4_t a0b1 = vaddl_s16(a0, b1);
    const int32x4_t a1b0 = vaddl_s16(a1, b0);
    const int32x4_t a0a1b0b1 = vaddq_s32(a0b1, a1b0);  // A0+A1+B0+B1
    const int32x4_t a0b1_2 = vaddq_s32(a0b1, a0b1);    // 2*(A0+B1)
    const int32x4_t a1b0_2 = vaddq_s32(a1b0, a1b0);    // 2*(A1+B0)
    const int32x4_t c0 = vshrq_n_s32(vaddq_s32(a0b1_2, a0a1b0b1), 3);
    const int32x4_t c1 = vshrq_n_s32(vaddq_s32(a1b0_2, a0a1b0b1), 3);
    const int32x4_t e0 = vrhaddq_s32(c1, vmovl_s16(a0));
    const int32x4_t e1 = vrhaddq_s32(c0, vmovl_s16(a1));
    const int32x4x2_t f = vzipq_s32(e0, e1);

    const int16x8_t g = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i));
    const int32x4_t h0 = vaddw_s16(f.val[0], vget_low_s16(g));
    const int32x4_t h1 = vaddw_s16(f.val[1], vget_high_s16(g));
    const uint16x8_t i_16 = vcombine_u16(vqmovun_s32(h0), vqmovun_s32(h1));
    const uint16x8_t i_clamped = vminq_u16(i_16, max);
    vst1q_u16(out + 2 * i + 0, i_clamped);
  }
  for (; i < len; ++i) {
    const int a0b1 = A[i + 0] + B[i + 1];
    const int a1b0 = A[i + 1] + B[i + 0];
    const int a0a1b0b1 = a0b1 + a1b0 + 8;
    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
    out[2 * i + 0] = clip_NEON(best_y[2 * i + 0] + v0, max_y);
    out[2 * i + 1] = clip_NEON(best_y[2 * i + 1] + v1, max_y);
  }
}

static void SharpYuvFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
                                   const uint16_t* best_y, uint16_t* out,
                                   int bit_depth) {
  if (bit_depth <= 10) {
    SharpYuvFilterRow16_NEON(A, B, len, best_y, out, bit_depth);
  } else {
    SharpYuvFilterRow32_NEON(A, B, len, best_y, out, bit_depth);
  }
}

//------------------------------------------------------------------------------

extern void InitSharpYuvNEON(void);

WEBP_TSAN_IGNORE_FUNCTION void InitSharpYuvNEON(void) {
  SharpYuvUpdateY = SharpYuvUpdateY_NEON;
  SharpYuvUpdateRGB = SharpYuvUpdateRGB_NEON;
  SharpYuvFilterRow = SharpYuvFilterRow_NEON;
}

#else  // !WEBP_USE_NEON

extern void InitSharpYuvNEON(void);

void InitSharpYuvNEON(void) {}

#endif  // WEBP_USE_NEON
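For reference, the per-sample update that the NEON path above and the SSE2 path below both vectorize is exactly what their leftover loops already spell out; a plain-C sketch (not part of the patch; <stdlib.h> is assumed for abs()):

// Applies the (ref - src) correction to dst, clamps to the given bit depth,
// and returns the accumulated absolute correction, the same quantity the
// SIMD variants return to their caller.
static uint64_t SharpYuvUpdateY_C_sketch(const uint16_t* ref,
                                         const uint16_t* src, uint16_t* dst,
                                         int len, int bit_depth) {
  const int max_y = (1 << bit_depth) - 1;
  uint64_t diff = 0;
  int i;
  for (i = 0; i < len; ++i) {
    const int diff_y = ref[i] - src[i];
    const int new_y = (int)dst[i] + diff_y;
    dst[i] = (new_y < 0) ? 0 : (new_y > max_y) ? (uint16_t)max_y
                                               : (uint16_t)new_y;
    diff += (uint64_t)abs(diff_y);
  }
  return diff;
}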
@ -0,0 +1,201 @@ |
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Speed-critical functions for Sharp YUV.
//
// Author: Skal (pascal.massimino@gmail.com)

#include "sharpyuv/sharpyuv_dsp.h"

#if defined(WEBP_USE_SSE2)
#include <stdlib.h>
#include <emmintrin.h>

static uint16_t clip_SSE2(int v, int max) {
  return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v;
}

static uint64_t SharpYuvUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
                                     uint16_t* dst, int len, int bit_depth) {
  const int max_y = (1 << bit_depth) - 1;
  uint64_t diff = 0;
  uint32_t tmp[4];
  int i;
  const __m128i zero = _mm_setzero_si128();
  const __m128i max = _mm_set1_epi16(max_y);
  const __m128i one = _mm_set1_epi16(1);
  __m128i sum = zero;

  for (i = 0; i + 8 <= len; i += 8) {
    const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
    const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
    const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
    const __m128i D = _mm_sub_epi16(A, B);       // diff_y
    const __m128i E = _mm_cmpgt_epi16(zero, D);  // sign (-1 or 0)
    const __m128i F = _mm_add_epi16(C, D);       // new_y
    const __m128i G = _mm_or_si128(E, one);      // -1 or 1
    const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
    const __m128i I = _mm_madd_epi16(D, G);      // sum(abs(...))
    _mm_storeu_si128((__m128i*)(dst + i), H);
    sum = _mm_add_epi32(sum, I);
  }
  _mm_storeu_si128((__m128i*)tmp, sum);
  diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];
  for (; i < len; ++i) {
    const int diff_y = ref[i] - src[i];
    const int new_y = (int)dst[i] + diff_y;
    dst[i] = clip_SSE2(new_y, max_y);
    diff += (uint64_t)abs(diff_y);
  }
  return diff;
}

static void SharpYuvUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
                                   int16_t* dst, int len) {
  int i = 0;
  for (i = 0; i + 8 <= len; i += 8) {
    const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
    const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
    const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
    const __m128i D = _mm_sub_epi16(A, B);   // diff_uv
    const __m128i E = _mm_add_epi16(C, D);   // new_uv
    _mm_storeu_si128((__m128i*)(dst + i), E);
  }
  for (; i < len; ++i) {
    const int diff_uv = ref[i] - src[i];
    dst[i] += diff_uv;
  }
}

static void SharpYuvFilterRow16_SSE2(const int16_t* A, const int16_t* B,
                                     int len, const uint16_t* best_y,
                                     uint16_t* out, int bit_depth) {
  const int max_y = (1 << bit_depth) - 1;
  int i;
  const __m128i kCst8 = _mm_set1_epi16(8);
  const __m128i max = _mm_set1_epi16(max_y);
  const __m128i zero = _mm_setzero_si128();
  for (i = 0; i + 8 <= len; i += 8) {
    const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));
    const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));
    const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));
    const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));
    const __m128i a0b1 = _mm_add_epi16(a0, b1);
    const __m128i a1b0 = _mm_add_epi16(a1, b0);
    const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0);  // A0+A1+B0+B1
    const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
    const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1);    // 2*(A0+B1)
    const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0);    // 2*(A1+B0)
    const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
    const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
    const __m128i d0 = _mm_add_epi16(c1, a0);
    const __m128i d1 = _mm_add_epi16(c0, a1);
    const __m128i e0 = _mm_srai_epi16(d0, 1);
    const __m128i e1 = _mm_srai_epi16(d1, 1);
    const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
    const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
    const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
    const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));
    const __m128i h0 = _mm_add_epi16(g0, f0);
    const __m128i h1 = _mm_add_epi16(g1, f1);
    const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
    const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
    _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);
    _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);
  }
  for (; i < len; ++i) {
    // (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
    // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
    // We reuse the common sub-expressions.
    const int a0b1 = A[i + 0] + B[i + 1];
    const int a1b0 = A[i + 1] + B[i + 0];
    const int a0a1b0b1 = a0b1 + a1b0 + 8;
    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
    out[2 * i + 0] = clip_SSE2(best_y[2 * i + 0] + v0, max_y);
    out[2 * i + 1] = clip_SSE2(best_y[2 * i + 1] + v1, max_y);
  }
}

static WEBP_INLINE __m128i s16_to_s32(__m128i in) {
  return _mm_srai_epi32(_mm_unpacklo_epi16(in, in), 16);
}

static void SharpYuvFilterRow32_SSE2(const int16_t* A, const int16_t* B,
                                     int len, const uint16_t* best_y,
                                     uint16_t* out, int bit_depth) {
  const int max_y = (1 << bit_depth) - 1;
  int i;
  const __m128i kCst8 = _mm_set1_epi32(8);
  const __m128i max = _mm_set1_epi16(max_y);
  const __m128i zero = _mm_setzero_si128();
  for (i = 0; i + 4 <= len; i += 4) {
    const __m128i a0 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(A + i + 0)));
    const __m128i a1 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(A + i + 1)));
    const __m128i b0 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(B + i + 0)));
    const __m128i b1 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(B + i + 1)));
    const __m128i a0b1 = _mm_add_epi32(a0, b1);
    const __m128i a1b0 = _mm_add_epi32(a1, b0);
    const __m128i a0a1b0b1 = _mm_add_epi32(a0b1, a1b0);  // A0+A1+B0+B1
    const __m128i a0a1b0b1_8 = _mm_add_epi32(a0a1b0b1, kCst8);
    const __m128i a0b1_2 = _mm_add_epi32(a0b1, a0b1);    // 2*(A0+B1)
    const __m128i a1b0_2 = _mm_add_epi32(a1b0, a1b0);    // 2*(A1+B0)
    const __m128i c0 = _mm_srai_epi32(_mm_add_epi32(a0b1_2, a0a1b0b1_8), 3);
    const __m128i c1 = _mm_srai_epi32(_mm_add_epi32(a1b0_2, a0a1b0b1_8), 3);
    const __m128i d0 = _mm_add_epi32(c1, a0);
    const __m128i d1 = _mm_add_epi32(c0, a1);
    const __m128i e0 = _mm_srai_epi32(d0, 1);
    const __m128i e1 = _mm_srai_epi32(d1, 1);
    const __m128i f0 = _mm_unpacklo_epi32(e0, e1);
    const __m128i f1 = _mm_unpackhi_epi32(e0, e1);
    const __m128i g = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
    const __m128i h_16 = _mm_add_epi16(g, _mm_packs_epi32(f0, f1));
    const __m128i final = _mm_max_epi16(_mm_min_epi16(h_16, max), zero);
    _mm_storeu_si128((__m128i*)(out + 2 * i + 0), final);
  }
  for (; i < len; ++i) {
    // (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
    // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
    // We reuse the common sub-expressions.
    const int a0b1 = A[i + 0] + B[i + 1];
    const int a1b0 = A[i + 1] + B[i + 0];
    const int a0a1b0b1 = a0b1 + a1b0 + 8;
    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
    out[2 * i + 0] = clip_SSE2(best_y[2 * i + 0] + v0, max_y);
    out[2 * i + 1] = clip_SSE2(best_y[2 * i + 1] + v1, max_y);
  }
}

static void SharpYuvFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
                                   const uint16_t* best_y, uint16_t* out,
                                   int bit_depth) {
  if (bit_depth <= 10) {
    SharpYuvFilterRow16_SSE2(A, B, len, best_y, out, bit_depth);
  } else {
    SharpYuvFilterRow32_SSE2(A, B, len, best_y, out, bit_depth);
  }
}

//------------------------------------------------------------------------------

extern void InitSharpYuvSSE2(void);

WEBP_TSAN_IGNORE_FUNCTION void InitSharpYuvSSE2(void) {
  SharpYuvUpdateY = SharpYuvUpdateY_SSE2;
  SharpYuvUpdateRGB = SharpYuvUpdateRGB_SSE2;
  SharpYuvFilterRow = SharpYuvFilterRow_SSE2;
}
#else  // !WEBP_USE_SSE2

extern void InitSharpYuvSSE2(void);

void InitSharpYuvSSE2(void) {}

#endif  // WEBP_USE_SSE2
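A note on the scalar fall-back comment shared by the NEON and SSE2 filter rows: the identity it relies on is just a regrouping of the 9:3:3:1 interpolation weights,

9*A0 + 3*A1 + 3*B0 + B1 + 8 = 8*A0 + 2*(A1 + B0) + (A0 + A1 + B0 + B1 + 8),

which is why the vector code only needs the shared sums a0b1, a1b0 and a0a1b0b1 before its final shift-by-3 and shift-by-1 (the two shifts reproduce the overall >> 4 up to intermediate rounding).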
@ -0,0 +1,266 @@ |
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// CPU detection functions and macros.
//
// Author: Skal (pascal.massimino@gmail.com)

#ifndef WEBP_DSP_CPU_H_
#define WEBP_DSP_CPU_H_

#include <stddef.h>

#ifdef HAVE_CONFIG_H
#include "src/webp/config.h"
#endif

#include "src/webp/types.h"

#if defined(__GNUC__)
#define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
#define LOCAL_GCC_PREREQ(maj, min) (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
#else
#define LOCAL_GCC_VERSION 0
#define LOCAL_GCC_PREREQ(maj, min) 0
#endif

#if defined(__clang__)
#define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
#define LOCAL_CLANG_PREREQ(maj, min) \
  (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
#else
#define LOCAL_CLANG_VERSION 0
#define LOCAL_CLANG_PREREQ(maj, min) 0
#endif

#ifndef __has_builtin
#define __has_builtin(x) 0
#endif

//------------------------------------------------------------------------------
// x86 defines.

#if !defined(HAVE_CONFIG_H)
#if defined(_MSC_VER) && _MSC_VER > 1310 && \
    (defined(_M_X64) || defined(_M_IX86))
#define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
#endif

#if defined(_MSC_VER) && _MSC_VER >= 1500 && \
    (defined(_M_X64) || defined(_M_IX86))
#define WEBP_MSC_SSE41  // Visual C++ SSE4.1 targets
#endif
#endif

// WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
// files without intrinsics, allowing the corresponding Init() to be called.
// Files containing intrinsics will need to be built targeting the instruction
// set so should succeed on one of the earlier tests.
#if (defined(__SSE2__) || defined(WEBP_MSC_SSE2)) && \
    (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE2))
#define WEBP_USE_SSE2
#endif

#if defined(WEBP_USE_SSE2) && !defined(WEBP_HAVE_SSE2)
#define WEBP_HAVE_SSE2
#endif

#if (defined(__SSE4_1__) || defined(WEBP_MSC_SSE41)) && \
    (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE41))
#define WEBP_USE_SSE41
#endif

#if defined(WEBP_USE_SSE41) && !defined(WEBP_HAVE_SSE41)
#define WEBP_HAVE_SSE41
#endif

#undef WEBP_MSC_SSE41
#undef WEBP_MSC_SSE2

//------------------------------------------------------------------------------
// Arm defines.

// The intrinsics currently cause compiler errors with arm-nacl-gcc and the
// inline assembly would need to be modified for use with Native Client.
#if ((defined(__ARM_NEON__) || defined(__aarch64__)) && \
     (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_NEON))) && \
    !defined(__native_client__)
#define WEBP_USE_NEON
#endif

#if !defined(WEBP_USE_NEON) && defined(__ANDROID__) && \
    defined(__ARM_ARCH_7A__) && defined(HAVE_CPU_FEATURES_H)
#define WEBP_ANDROID_NEON  // Android targets that may have NEON
#define WEBP_USE_NEON
#endif

// Note: ARM64 is supported in Visual Studio 2017, but requires the direct
// inclusion of arm64_neon.h; Visual Studio 2019 includes this file in
// arm_neon.h. Compile errors were seen with Visual Studio 2019 16.4 with
// vtbl4_u8(); a fix was made in 16.6.
#if defined(_MSC_VER) && \
    ((_MSC_VER >= 1700 && defined(_M_ARM)) || \
     (_MSC_VER >= 1926 && (defined(_M_ARM64) || defined(_M_ARM64EC))))
#define WEBP_USE_NEON
#define WEBP_USE_INTRINSICS
#endif

#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
#define WEBP_AARCH64 1
#else
#define WEBP_AARCH64 0
#endif

#if defined(WEBP_USE_NEON) && !defined(WEBP_HAVE_NEON)
#define WEBP_HAVE_NEON
#endif

//------------------------------------------------------------------------------
// MIPS defines.

#if defined(__mips__) && !defined(__mips64) && defined(__mips_isa_rev) && \
    (__mips_isa_rev >= 1) && (__mips_isa_rev < 6)
#define WEBP_USE_MIPS32
#if (__mips_isa_rev >= 2)
#define WEBP_USE_MIPS32_R2
#if defined(__mips_dspr2) || (defined(__mips_dsp_rev) && __mips_dsp_rev >= 2)
#define WEBP_USE_MIPS_DSP_R2
#endif
#endif
#endif

#if defined(__mips_msa) && defined(__mips_isa_rev) && (__mips_isa_rev >= 5)
#define WEBP_USE_MSA
#endif

//------------------------------------------------------------------------------

#ifndef WEBP_DSP_OMIT_C_CODE
#define WEBP_DSP_OMIT_C_CODE 1
#endif

#if defined(WEBP_USE_NEON) && WEBP_DSP_OMIT_C_CODE
#define WEBP_NEON_OMIT_C_CODE 1
#else
#define WEBP_NEON_OMIT_C_CODE 0
#endif

#if !(LOCAL_CLANG_PREREQ(3, 8) || LOCAL_GCC_PREREQ(4, 8) || WEBP_AARCH64)
#define WEBP_NEON_WORK_AROUND_GCC 1
#else
#define WEBP_NEON_WORK_AROUND_GCC 0
#endif

//------------------------------------------------------------------------------

// This macro prevents thread_sanitizer from reporting known concurrent writes.
#define WEBP_TSAN_IGNORE_FUNCTION
#if defined(__has_feature)
#if __has_feature(thread_sanitizer)
#undef WEBP_TSAN_IGNORE_FUNCTION
#define WEBP_TSAN_IGNORE_FUNCTION __attribute__((no_sanitize_thread))
#endif
#endif

#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#define WEBP_MSAN
#endif
#endif

#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
#include <pthread.h>  // NOLINT

#define WEBP_DSP_INIT(func)                                         \
  do {                                                              \
    static volatile VP8CPUInfo func##_last_cpuinfo_used =           \
        (VP8CPUInfo)&func##_last_cpuinfo_used;                      \
    static pthread_mutex_t func##_lock = PTHREAD_MUTEX_INITIALIZER; \
    if (pthread_mutex_lock(&func##_lock)) break;                    \
    if (func##_last_cpuinfo_used != VP8GetCPUInfo) func();          \
    func##_last_cpuinfo_used = VP8GetCPUInfo;                       \
    (void)pthread_mutex_unlock(&func##_lock);                       \
  } while (0)
#else  // !(defined(WEBP_USE_THREAD) && !defined(_WIN32))
#define WEBP_DSP_INIT(func)                                         \
  do {                                                              \
    static volatile VP8CPUInfo func##_last_cpuinfo_used =           \
        (VP8CPUInfo)&func##_last_cpuinfo_used;                      \
    if (func##_last_cpuinfo_used == VP8GetCPUInfo) break;           \
    func();                                                         \
    func##_last_cpuinfo_used = VP8GetCPUInfo;                       \
  } while (0)
#endif  // defined(WEBP_USE_THREAD) && !defined(_WIN32)

// Defines an Init + helper function that control multiple initialization of
// function pointers / tables.
/* Usage:
   WEBP_DSP_INIT_FUNC(InitFunc) {
     ...function body
   }
*/
#define WEBP_DSP_INIT_FUNC(name)                                             \
  static WEBP_TSAN_IGNORE_FUNCTION void name##_body(void);                   \
  WEBP_TSAN_IGNORE_FUNCTION void name(void) { WEBP_DSP_INIT(name##_body); }  \
  static WEBP_TSAN_IGNORE_FUNCTION void name##_body(void)

#define WEBP_UBSAN_IGNORE_UNDEF
#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
#if defined(__clang__) && defined(__has_attribute)
#if __has_attribute(no_sanitize)
// This macro prevents the undefined behavior sanitizer from reporting
// failures. This is only meant to silence unaligned loads on platforms that
// are known to support them.
#undef WEBP_UBSAN_IGNORE_UNDEF
#define WEBP_UBSAN_IGNORE_UNDEF __attribute__((no_sanitize("undefined")))

// This macro prevents the undefined behavior sanitizer from reporting
// failures related to unsigned integer overflows. This is only meant to
// silence cases where this well defined behavior is expected.
#undef WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW \
  __attribute__((no_sanitize("unsigned-integer-overflow")))
#endif
#endif

// If 'ptr' is NULL, returns NULL. Otherwise returns 'ptr + off'.
// Prevents undefined behavior sanitizer nullptr-with-nonzero-offset warning.
#if !defined(WEBP_OFFSET_PTR)
#define WEBP_OFFSET_PTR(ptr, off) (((ptr) == NULL) ? NULL : ((ptr) + (off)))
#endif

// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility)
#if !defined(WEBP_SWAP_16BIT_CSP)
#define WEBP_SWAP_16BIT_CSP 0
#endif

// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
#if !defined(WORDS_BIGENDIAN) && \
    (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
     (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
#define WORDS_BIGENDIAN
#endif

typedef enum {
  kSSE2,
  kSSE3,
  kSlowSSSE3,  // special feature for slow SSSE3 architectures
  kSSE4_1,
  kAVX,
  kAVX2,
  kNEON,
  kMIPS32,
  kMIPSdspR2,
  kMSA
} CPUFeature;

// returns true if the CPU supports the feature.
typedef int (*VP8CPUInfo)(CPUFeature feature);

#endif  // WEBP_DSP_CPU_H_
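To make the one-time-init machinery above concrete, a sketch of how a dsp module typically uses it (Foo, Foo_C, Foo_SSE2 and VP8FooInit are hypothetical names; VP8GetCPUInfo is the library-wide VP8CPUInfo function pointer declared elsewhere):

// Hypothetical module init: picks an implementation once, guarded by
// WEBP_DSP_INIT so that concurrent callers only run the body a single time.
WEBP_DSP_INIT_FUNC(VP8FooInit) {
  Foo = Foo_C;  // scalar default
  if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSSE2)) {
    Foo = Foo_SSE2;  // SIMD override when the CPU supports it
  }
}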
@ -0,0 +1,133 @@ |
// Copyright 2021 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE41 variant of methods for lossless decoder

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_SSE41)

#include "src/dsp/common_sse41.h"
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"

//------------------------------------------------------------------------------
// Color-space conversion functions

static void TransformColorInverse_SSE41(const VP8LMultipliers* const m,
                                        const uint32_t* const src,
                                        int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X) (((int16_t)(m->X << 8)) >> 5)  // sign-extend
  const __m128i mults_rb =
      _mm_set1_epi32((int)((uint32_t)CST(green_to_red_) << 16 |
                           (CST(green_to_blue_) & 0xffff)));
  const __m128i mults_b2 = _mm_set1_epi32(CST(red_to_blue_));
#undef CST
  const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00);
  const __m128i perm1 = _mm_setr_epi8(-1, 1, -1, 1, -1, 5, -1, 5,
                                      -1, 9, -1, 9, -1, 13, -1, 13);
  const __m128i perm2 = _mm_setr_epi8(-1, 2, -1, -1, -1, 6, -1, -1,
                                      -1, 10, -1, -1, -1, 14, -1, -1);
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i A = _mm_loadu_si128((const __m128i*)(src + i));
    const __m128i B = _mm_shuffle_epi8(A, perm1);  // argb -> g0g0
    const __m128i C = _mm_mulhi_epi16(B, mults_rb);
    const __m128i D = _mm_add_epi8(A, C);
    const __m128i E = _mm_shuffle_epi8(D, perm2);
    const __m128i F = _mm_mulhi_epi16(E, mults_b2);
    const __m128i G = _mm_add_epi8(D, F);
    const __m128i out = _mm_blendv_epi8(G, A, mask_ag);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // Fall-back to C-version for left-overs.
  if (i != num_pixels) {
    VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
  }
}

//------------------------------------------------------------------------------

#define ARGB_TO_RGB_SSE41 do {                        \
  while (num_pixels >= 16) {                          \
    const __m128i in0 = _mm_loadu_si128(in + 0);      \
    const __m128i in1 = _mm_loadu_si128(in + 1);      \
    const __m128i in2 = _mm_loadu_si128(in + 2);      \
    const __m128i in3 = _mm_loadu_si128(in + 3);      \
    const __m128i a0 = _mm_shuffle_epi8(in0, perm0);  \
    const __m128i a1 = _mm_shuffle_epi8(in1, perm1);  \
    const __m128i a2 = _mm_shuffle_epi8(in2, perm2);  \
    const __m128i a3 = _mm_shuffle_epi8(in3, perm3);  \
    const __m128i b0 = _mm_blend_epi16(a0, a1, 0xc0); \
    const __m128i b1 = _mm_blend_epi16(a1, a2, 0xf0); \
    const __m128i b2 = _mm_blend_epi16(a2, a3, 0xfc); \
    _mm_storeu_si128(out + 0, b0);                    \
    _mm_storeu_si128(out + 1, b1);                    \
    _mm_storeu_si128(out + 2, b2);                    \
    in += 4;                                          \
    out += 3;                                         \
    num_pixels -= 16;                                 \
  }                                                   \
} while (0)

static void ConvertBGRAToRGB_SSE41(const uint32_t* src, int num_pixels,
                                   uint8_t* dst) {
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  const __m128i perm0 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9,
                                      8, 14, 13, 12, -1, -1, -1, -1);
  const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39);
  const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e);
  const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93);

  ARGB_TO_RGB_SSE41;

  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

static void ConvertBGRAToBGR_SSE41(const uint32_t* src,
                                   int num_pixels, uint8_t* dst) {
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  const __m128i perm0 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10,
                                      12, 13, 14, -1, -1, -1, -1);
  const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39);
  const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e);
  const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93);

  ARGB_TO_RGB_SSE41;

  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

#undef ARGB_TO_RGB_SSE41

//------------------------------------------------------------------------------
// Entry point

extern void VP8LDspInitSSE41(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE41(void) {
  VP8LTransformColorInverse = TransformColorInverse_SSE41;
  VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE41;
  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE41;
}

#else  // !WEBP_USE_SSE41

WEBP_DSP_INIT_STUB(VP8LDspInitSSE41)

#endif  // WEBP_USE_SSE41
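For comparison with the vector code above, a simplified per-pixel sketch of the inverse color transform (the real scalar version is VP8LTransformColorInverse_C, which is not part of this diff; the delta rule below follows the WebP lossless format's ColorTransformDelta, and the 0xAARRGGBB packing is assumed):

// Each delta is a signed 8-bit product shifted right by 5, added modulo 256;
// alpha and green pass through unchanged, matching the mask_ag blend above.
static uint8_t ColorDelta_sketch(uint8_t mult, uint8_t channel) {
  return (uint8_t)(((int8_t)mult * (int8_t)channel) >> 5);
}

static uint32_t TransformColorInverseOnePixel_sketch(
    const VP8LMultipliers* const m, uint32_t argb) {
  const uint8_t green = (argb >> 8) & 0xff;
  uint8_t red = (argb >> 16) & 0xff;
  uint8_t blue = (argb >> 0) & 0xff;
  red += ColorDelta_sketch(m->green_to_red_, green);
  blue += ColorDelta_sketch(m->green_to_blue_, green);
  blue += ColorDelta_sketch(m->red_to_blue_, red);
  return (argb & 0xff00ff00u) | ((uint32_t)red << 16) | blue;
}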
File diff suppressed because it is too large
Some files were not shown because too many files have changed in this diff