mirror of https://github.com/FFmpeg/FFmpeg.git
Look for MMX_DISABLE to find the disabled functions. Authors of this code are Marco Gerards <marco@gnu.org> and David Conrad <lessen42@gmail.com>, with changes from Jordi Ortiz <nenjordi@gmail.com>. Signed-off-by: Michael Niedermayer <michaelni@gmx.at> pull/2/head
parent
b54c0a552d
commit
5d50fcc549
19 changed files with 4165 additions and 37 deletions
@ -0,0 +1,115 @@ |
||||
/*
|
||||
* Copyright (C) 2007 Marco Gerards <marco@gnu.org> |
||||
* Copyright (C) 2009 David Conrad |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
/**
|
||||
* @file libavcodec/dirac_arith.c |
||||
* Arithmetic decoder for Dirac |
||||
* @author Marco Gerards <marco@gnu.org> |
||||
*/ |
||||
|
||||
#include "dirac_arith.h" |
||||
|
||||
const uint16_t ff_dirac_prob[256] = { |
||||
0, 2, 5, 8, 11, 15, 20, 24, |
||||
29, 35, 41, 47, 53, 60, 67, 74, |
||||
82, 89, 97, 106, 114, 123, 132, 141, |
||||
150, 160, 170, 180, 190, 201, 211, 222, |
||||
233, 244, 256, 267, 279, 291, 303, 315, |
||||
327, 340, 353, 366, 379, 392, 405, 419, |
||||
433, 447, 461, 475, 489, 504, 518, 533, |
||||
548, 563, 578, 593, 609, 624, 640, 656, |
||||
672, 688, 705, 721, 738, 754, 771, 788, |
||||
805, 822, 840, 857, 875, 892, 910, 928, |
||||
946, 964, 983, 1001, 1020, 1038, 1057, 1076, |
||||
1095, 1114, 1133, 1153, 1172, 1192, 1211, 1231, |
||||
1251, 1271, 1291, 1311, 1332, 1352, 1373, 1393, |
||||
1414, 1435, 1456, 1477, 1498, 1520, 1541, 1562, |
||||
1584, 1606, 1628, 1649, 1671, 1694, 1716, 1738, |
||||
1760, 1783, 1806, 1828, 1851, 1874, 1897, 1920, |
||||
1935, 1942, 1949, 1955, 1961, 1968, 1974, 1980, |
||||
1985, 1991, 1996, 2001, 2006, 2011, 2016, 2021, |
||||
2025, 2029, 2033, 2037, 2040, 2044, 2047, 2050, |
||||
2053, 2056, 2058, 2061, 2063, 2065, 2066, 2068, |
||||
2069, 2070, 2071, 2072, 2072, 2072, 2072, 2072, |
||||
2072, 2071, 2070, 2069, 2068, 2066, 2065, 2063, |
||||
2060, 2058, 2055, 2052, 2049, 2045, 2042, 2038, |
||||
2033, 2029, 2024, 2019, 2013, 2008, 2002, 1996, |
||||
1989, 1982, 1975, 1968, 1960, 1952, 1943, 1934, |
||||
1925, 1916, 1906, 1896, 1885, 1874, 1863, 1851, |
||||
1839, 1827, 1814, 1800, 1786, 1772, 1757, 1742, |
||||
1727, 1710, 1694, 1676, 1659, 1640, 1622, 1602, |
||||
1582, 1561, 1540, 1518, 1495, 1471, 1447, 1422, |
||||
1396, 1369, 1341, 1312, 1282, 1251, 1219, 1186, |
||||
1151, 1114, 1077, 1037, 995, 952, 906, 857, |
||||
805, 750, 690, 625, 553, 471, 376, 255 |
||||
}; |
||||
|
||||
const uint8_t ff_dirac_next_ctx[DIRAC_CTX_COUNT] = { |
||||
[CTX_ZPZN_F1] = CTX_ZP_F2, |
||||
[CTX_ZPNN_F1] = CTX_ZP_F2, |
||||
[CTX_ZP_F2] = CTX_ZP_F3, |
||||
[CTX_ZP_F3] = CTX_ZP_F4, |
||||
[CTX_ZP_F4] = CTX_ZP_F5, |
||||
[CTX_ZP_F5] = CTX_ZP_F6, |
||||
[CTX_ZP_F6] = CTX_ZP_F6, |
||||
[CTX_NPZN_F1] = CTX_NP_F2, |
||||
[CTX_NPNN_F1] = CTX_NP_F2, |
||||
[CTX_NP_F2] = CTX_NP_F3, |
||||
[CTX_NP_F3] = CTX_NP_F4, |
||||
[CTX_NP_F4] = CTX_NP_F5, |
||||
[CTX_NP_F5] = CTX_NP_F6, |
||||
[CTX_NP_F6] = CTX_NP_F6, |
||||
[CTX_DELTA_Q_F] = CTX_DELTA_Q_F, |
||||
}; |
||||
|
||||
int16_t ff_dirac_prob_branchless[256][2]; |
||||
|
||||
/**
 * Attach an arithmetic decoder to the next chunk of a bit reader.
 *
 * The reader is aligned with align_get_bits(), at most length bytes (capped
 * to get_bits_left(gb)/8) are claimed for the decoder and skipped in gb, and
 * every field of c is reset to its start-of-chunk value.
 *
 * @param c      decoder state to initialise
 * @param gb     bit reader supplying the coded bytes
 * @param length requested size of the coded chunk in bytes
 */
void ff_dirac_init_arith_decoder(DiracArith *c, GetBitContext *gb, int length)
{
    int n;

    align_get_bits(gb);

    /* never claim more bytes than the reader still has */
    length = FFMIN(length, get_bits_left(gb)/8);

    c->bytestream     = gb->buffer + get_bits_count(gb)/8;
    c->bytestream_end = c->bytestream + length;
    skip_bits_long(gb, length*8);

    /* prime the code word with four bytes; missing bytes read as 0xff */
    c->low = 0;
    for (n = 4; n > 0; n--) {
        unsigned byte = 0xff;

        if (c->bytestream < c->bytestream_end)
            byte = *c->bytestream++;
        c->low = (c->low << 8) | byte;
    }

    c->range   = 0xffff;
    c->counter = -16;

    /* every context starts at 0x8000 */
    for (n = 0; n < DIRAC_CTX_COUNT; n++)
        c->contexts[n] = 0x8000;

    /* per-symbol context adjustment: column 0 is added after a 0 bit,
     * column 1 (negated) after a 1 bit */
    for (n = 0; n < 256; n++) {
        ff_dirac_prob_branchless[n][1] = -ff_dirac_prob[n];
        ff_dirac_prob_branchless[n][0] =  ff_dirac_prob[255-n];
    }
}
@ -0,0 +1,190 @@ |
||||
/*
|
||||
* Copyright (C) 2007 Marco Gerards <marco@gnu.org> |
||||
* Copyright (C) 2009 David Conrad |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
/**
|
||||
* @file libavcodec/dirac_arith.h |
||||
* Arithmetic decoder for Dirac |
||||
* @author Marco Gerards <marco@gnu.org> |
||||
*/ |
||||
|
||||
#ifndef AVCODEC_DIRAC_ARITH_H |
||||
#define AVCODEC_DIRAC_ARITH_H |
||||
|
||||
#include "bytestream.h" |
||||
#include "get_bits.h" |
||||
|
||||
enum dirac_arith_contexts { |
||||
CTX_ZPZN_F1, |
||||
CTX_ZPNN_F1, |
||||
CTX_NPZN_F1, |
||||
CTX_NPNN_F1, |
||||
CTX_ZP_F2, |
||||
CTX_ZP_F3, |
||||
CTX_ZP_F4, |
||||
CTX_ZP_F5, |
||||
CTX_ZP_F6, |
||||
CTX_NP_F2, |
||||
CTX_NP_F3, |
||||
CTX_NP_F4, |
||||
CTX_NP_F5, |
||||
CTX_NP_F6, |
||||
CTX_COEFF_DATA, |
||||
CTX_SIGN_NEG, |
||||
CTX_SIGN_ZERO, |
||||
CTX_SIGN_POS, |
||||
CTX_ZERO_BLOCK, |
||||
CTX_DELTA_Q_F, |
||||
CTX_DELTA_Q_DATA, |
||||
CTX_DELTA_Q_SIGN, |
||||
|
||||
DIRAC_CTX_COUNT |
||||
}; |
||||
|
||||
// Dirac resets the arith decoder between decoding various types of data,
|
||||
// so many contexts are never used simultaneously. Thus, we can reduce
|
||||
// the number of contexts needed by reusing them.
|
||||
#define CTX_SB_F1 CTX_ZP_F5 |
||||
#define CTX_SB_DATA 0 |
||||
#define CTX_PMODE_REF1 0 |
||||
#define CTX_PMODE_REF2 1 |
||||
#define CTX_GLOBAL_BLOCK 2 |
||||
#define CTX_MV_F1 CTX_ZP_F2 |
||||
#define CTX_MV_DATA 0 |
||||
#define CTX_DC_F1 CTX_ZP_F5 |
||||
#define CTX_DC_DATA 0 |
||||
|
||||
typedef struct { |
||||
unsigned low; |
||||
uint16_t range; |
||||
int16_t counter; |
||||
|
||||
const uint8_t *bytestream; |
||||
const uint8_t *bytestream_end; |
||||
|
||||
uint16_t contexts[DIRAC_CTX_COUNT]; |
||||
} DiracArith; |
||||
|
||||
extern const uint8_t ff_dirac_next_ctx[DIRAC_CTX_COUNT]; |
||||
extern const uint16_t ff_dirac_prob[256]; |
||||
extern int16_t ff_dirac_prob_branchless[256][2]; |
||||
|
||||
/**
 * Renormalise the decoder: shift low and range left by the same number of
 * bits and advance counter by that many bits.
 *
 * The portable path doubles range until it exceeds 0x4000; the
 * HAVE_FAST_CLZ path computes a shift count directly from av_log2_16bit().
 */
static inline void renorm(DiracArith *c)
{
#if HAVE_FAST_CLZ
    const int shift = 14 - av_log2_16bit(c->range-1) + ((c->range-1)>>15);

    c->counter += shift;
    c->range  <<= shift;
    c->low    <<= shift;
#else
    for (; c->range <= 0x4000; c->counter++) {
        c->range <<= 1;
        c->low   <<= 1;
    }
#endif
}
||||
|
||||
/**
 * Feed 16 more bits into low once counter has become non-negative.
 *
 * Does nothing while counter is negative. Otherwise a big-endian 16-bit
 * word is read from the bytestream, added to low at bit position counter,
 * and counter drops by 16.
 */
static inline void refill(DiracArith *c)
{
    int word;

    if (c->counter < 0)
        return;

    word = bytestream_get_be16(&c->bytestream);

    // the spec defines overread bits to be 1, and streams rely on this
    if (c->bytestream > c->bytestream_end) {
        /* one byte past the end: low byte forced to ones;
         * two bytes past the end: the whole word forced to ones */
        word |= (c->bytestream > c->bytestream_end+1) ? 0xffff : 0xff;

        /* clamp the read pointer back to the end of the buffer */
        c->bytestream = c->bytestream_end;
    }

    c->low     += word << c->counter;
    c->counter -= 16;
}
||||
|
||||
/**
 * Decode one binary symbol with the adaptive probability of context ctx.
 *
 * range is scaled by the context value (a 16-bit probability of a zero) to
 * get the split point; the upper 16 bits of low are compared against it.
 * Below the split the symbol is 0 and range becomes the split; otherwise the
 * symbol is 1 and the split is removed from both low and range. The context
 * is then adjusted through ff_dirac_prob_branchless and the decoder is
 * renormalised and refilled.
 *
 * @param c   decoder state
 * @param ctx index into c->contexts
 * @return the decoded bit, 0 or 1
 */
static inline int dirac_get_arith_bit(DiracArith *c, int ctx)
{
    const int prob_zero = c->contexts[ctx];
    unsigned  low       = c->low;
    int       range     = c->range;
    int range_times_prob, bit;

    range_times_prob = (c->range * prob_zero) >> 16;

#if HAVE_FAST_CMOV
    /* assume a 1 bit first; the cmovb pair restores low and replaces range
     * when the comparison shows the symbol was actually 0 */
    low   -= range_times_prob << 16;
    range -= range_times_prob;
    bit = 0;
    __asm__(
        "cmpl %5, %4 \n\t"
        "setae %b0 \n\t"
        "cmovb %3, %2 \n\t"
        "cmovb %5, %1 \n\t"
        : "+q"(bit), "+r"(range), "+r"(low)
        : "r"(c->low), "r"(c->low>>16),
          "r"(range_times_prob)
    );
#else
    bit = (low >> 16) >= range_times_prob;
    if (!bit) {
        range  = range_times_prob;
    } else {
        range -= range_times_prob;
        low   -= range_times_prob << 16;
    }
#endif

    /* adapt the context using its top 8 bits as the table row */
    c->contexts[ctx] += ff_dirac_prob_branchless[prob_zero>>8][bit];
    c->low            = low;
    c->range          = range;

    renorm(c);
    refill(c);
    return bit;
}
||||
|
||||
/**
 * Decode an unsigned integer coded as interleaved follow/data bits.
 *
 * While the follow bit (context follow_ctx, advanced through
 * ff_dirac_next_ctx after each round) is 0, one data bit (context data_ctx)
 * is shifted into the accumulator, which starts at 1. The result is the
 * accumulator minus 1.
 *
 * @param c          decoder state
 * @param follow_ctx context of the first follow bit
 * @param data_ctx   context of the data bits
 * @return the decoded value, or -1 when the code is too long to fit in an
 *         int (only possible with damaged input)
 */
static inline int dirac_get_arith_uint(DiracArith *c, int follow_ctx, int data_ctx)
{
    int ret = 1;
    while (!dirac_get_arith_bit(c, follow_ctx)) {
        /* one more shift would overflow a signed int, which is undefined
         * behaviour; bail out instead of looping on a corrupt stream */
        if (ret >= 0x40000000)
            return -1;
        ret <<= 1;
        ret += dirac_get_arith_bit(c, data_ctx);
        follow_ctx = ff_dirac_next_ctx[follow_ctx];
    }
    return ret-1;
}
||||
|
||||
/**
 * Decode a signed integer: a magnitude from dirac_get_arith_uint() followed,
 * only when the magnitude is non-zero, by one sign bit read with context
 * data_ctx+1 (a set bit negates the value).
 *
 * @param c          decoder state
 * @param follow_ctx context of the first follow bit of the magnitude
 * @param data_ctx   context of the magnitude data bits
 * @return the decoded signed value
 */
static inline int dirac_get_arith_int(DiracArith *c, int follow_ctx, int data_ctx)
{
    const int magnitude = dirac_get_arith_uint(c, follow_ctx, data_ctx);

    if (!magnitude)
        return 0;

    return dirac_get_arith_bit(c, data_ctx+1) ? -magnitude : magnitude;
}
||||
|
||||
void ff_dirac_init_arith_decoder(DiracArith *c, GetBitContext *gb, int length); |
||||
|
||||
#endif /* AVCODEC_DIRAC_ARITH_H */ |
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,201 @@ |
||||
/*
|
||||
* Copyright (C) 2009 David Conrad |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#include "dsputil.h" |
||||
#include "diracdsp.h" |
||||
//MMX_DISABLE #include "libavcodec/x86/diracdsp_mmx.h"
|
||||
|
||||
/* 8-tap symmetric filter around the half position between src[0] and
 * src[stride]: taps (-1, 3, -7, 21, 21, -7, 3, -1), rounded and divided
 * by 32. The result is not clipped. */
#define FILTER(src, stride)                                     \
    ((21*((src)[ 0*stride] + (src)[1*stride])                   \
      -7*((src)[-1*stride] + (src)[2*stride])                   \
      +3*((src)[-2*stride] + (src)[3*stride])                   \
      -1*((src)[-3*stride] + (src)[4*stride]) + 16) >> 5)

/**
 * Build the three interpolated planes for one source plane.
 *
 * For every row: dstv receives the vertically filtered samples for columns
 * -3 .. width+4, dstc receives the horizontal filter applied to that dstv
 * row, and dsth receives the horizontal filter applied to src. All four
 * pointers advance by stride after each row.
 */
static void dirac_hpel_filter(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
                              int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        /* vertical pass, widened so the pass below finds all of its taps */
        col = -3;
        while (col < width+5) {
            dstv[col] = av_clip_uint8(FILTER(src+col, stride));
            col++;
        }

        /* horizontal filter over the vertical result */
        col = 0;
        while (col < width) {
            dstc[col] = av_clip_uint8(FILTER(dstv+col, 1));
            col++;
        }

        /* horizontal filter over the source */
        col = 0;
        while (col < width) {
            dsth[col] = av_clip_uint8(FILTER(src+col, 1));
            col++;
        }

        src  += stride;
        dsth += stride;
        dstv += stride;
        dstc += stride;
    }
}
||||
|
||||
/* Store operators used by PIXOP_BILINEAR: plain store, or rounded average
 * with the value already in dst. */
#define OP_PUT(dst, val) (dst) = (val)
#define OP_AVG(dst, val) (dst) = (((dst) + (val) + 1)>>1)

/*
 * Generates ff_<PFX>_dirac_pixels<WIDTH>_bilinear_c().
 * Each output pixel is the sum of the four planes src[0..3] weighted by the
 * four bytes at src[4], plus 8, shifted right by 4, then stored through OP.
 * h rows of WIDTH pixels are produced; dst and the four planes all advance
 * by stride per row.
 */
#define PIXOP_BILINEAR(PFX, OP, WIDTH)                                  \
static void ff_ ## PFX ## _dirac_pixels ## WIDTH ## _bilinear_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{                                                                       \
    const uint8_t *plane0  = src[0];                                    \
    const uint8_t *plane1  = src[1];                                    \
    const uint8_t *plane2  = src[2];                                    \
    const uint8_t *plane3  = src[3];                                    \
    const uint8_t *weights = src[4];                                    \
    int col;                                                            \
                                                                        \
    for (; h--; dst    += stride, plane0 += stride, plane1 += stride,   \
                plane2 += stride, plane3 += stride) {                   \
        for (col = 0; col < WIDTH; col++) {                             \
            int acc = 8;                                                \
            acc += plane0[col]*weights[0];                              \
            acc += plane1[col]*weights[1];                              \
            acc += plane2[col]*weights[2];                              \
            acc += plane3[col]*weights[3];                              \
            OP(dst[col], acc >> 4);                                     \
        }                                                               \
    }                                                                   \
}

PIXOP_BILINEAR(put, OP_PUT, 8)
PIXOP_BILINEAR(put, OP_PUT, 16)
PIXOP_BILINEAR(put, OP_PUT, 32)
PIXOP_BILINEAR(avg, OP_AVG, 8)
PIXOP_BILINEAR(avg, OP_AVG, 16)
PIXOP_BILINEAR(avg, OP_AVG, 32)
||||
|
||||
/* In-place weighting of one pixel of block: multiply by weight, add the
 * rounding term 1<<(log2_denom-1), shift right by log2_denom, clip to 8 bits.
 * NOTE(review): log2_denom == 0 would make the rounding term a shift by -1;
 * confirm callers never pass 0. */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + (1<<(log2_denom-1))) >> log2_denom)
/* Same for two sources: dst is replaced by the weighted sum of src and dst. */
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + (1<<(log2_denom-1))) >> log2_denom)

/*
 * Generates weight_dirac_pixels<W>_c() and biweight_dirac_pixels<W>_c(),
 * which weight h rows of W pixels in place.
 *
 * The inner loops handle two pixels per iteration and therefore step by 2.
 * Stepping by 1 (as the code did before) weights every pixel but the first
 * twice, because the operation is in place, and also touches column W,
 * one past the end of the row.
 */
#define DIRAC_WEIGHT(W)                                                 \
static void weight_dirac_pixels ## W ## _c(uint8_t *block, int stride, int log2_denom, \
                                           int weight, int h) {         \
    int x;                                                              \
    while (h--) {                                                       \
        for (x = 0; x < W; x += 2) {                                    \
            op_scale1(x);                                               \
            op_scale1(x+1);                                             \
        }                                                               \
        block += stride;                                                \
    }                                                                   \
}                                                                       \
static void biweight_dirac_pixels ## W ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, \
                                             int weightd, int weights, int h) { \
    int x;                                                              \
    while (h--) {                                                       \
        for (x = 0; x < W; x += 2) {                                    \
            op_scale2(x);                                               \
            op_scale2(x+1);                                             \
        }                                                               \
        dst += stride;                                                  \
        src += stride;                                                  \
    }                                                                   \
}

DIRAC_WEIGHT(8)
DIRAC_WEIGHT(16)
DIRAC_WEIGHT(32)
||||
|
||||
/*
 * Generates add_obmc<xblen>_c(): accumulate src * obmc_weight into the
 * 16-bit dst for yblen rows of xblen pixels. dst and src advance by stride
 * per row (in their own element sizes); the weight table advances by a
 * fixed 32 entries per row.
 */
#define ADD_OBMC(xblen)                                                 \
static void add_obmc ## xblen ## _c(uint16_t *dst, const uint8_t *src, int stride, \
                                    const uint8_t *obmc_weight, int yblen) \
{                                                                       \
    int col;                                                            \
    for (; yblen--; dst += stride, src += stride, obmc_weight += 32)    \
        for (col = 0; col < xblen; col++)                               \
            dst[col] += src[col] * obmc_weight[col];                    \
}

ADD_OBMC(8)
ADD_OBMC(16)
ADD_OBMC(32)
||||
|
||||
/**
 * Convert a rectangle of signed 16-bit samples to unsigned 8-bit pixels by
 * adding 128 and clipping.
 *
 * Columns are written in groups of four, so a width that is not a multiple
 * of 4 is effectively rounded up to the next multiple.
 */
static void put_signed_rect_clamped_c(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height)
{
    int row, col, k;

    for (row = height; row > 0; row--) {
        for (col = 0; col < width; col += 4)
            for (k = col; k < col + 4; k++)
                dst[k] = av_clip_uint8(src[k] + 128);

        dst += dst_stride;
        src += src_stride;
    }
}
||||
|
||||
/**
 * Combine a 16-bit accumulator plane with a 16-bit signed plane into 8-bit
 * pixels: dst = clip(((src + 32) >> 6) + idwt).
 *
 * dst and src share stride, idwt advances by idwt_stride. Columns are
 * written in pairs, so an odd width is effectively rounded up by one.
 */
static void add_rect_clamped_c(uint8_t *dst, const uint16_t *src, int stride,
                               const int16_t *idwt, int idwt_stride,
                               int width, int height)
{
    int row, col, k;

    for (row = height; row > 0; row--) {
        for (col = 0; col < width; col += 2)
            for (k = col; k < col + 2; k++)
                dst[k] = av_clip_uint8(((src[k]+32)>>6) + idwt[k]);

        dst  += stride;
        src  += stride;
        idwt += idwt_stride;
    }
}
||||
|
||||
#define PIXFUNC(PFX, WIDTH) \ |
||||
c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _c; \
|
||||
c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _l2_c; \
|
||||
c->PFX ## _dirac_pixels_tab[WIDTH>>4][2] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _l4_c; \
|
||||
c->PFX ## _dirac_pixels_tab[WIDTH>>4][3] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _bilinear_c |
||||
|
||||
void ff_diracdsp_init(DiracDSPContext *c) |
||||
{ |
||||
c->dirac_hpel_filter = dirac_hpel_filter; |
||||
c->add_rect_clamped = add_rect_clamped_c; |
||||
c->put_signed_rect_clamped = put_signed_rect_clamped_c; |
||||
|
||||
c->add_dirac_obmc[0] = add_obmc8_c; |
||||
c->add_dirac_obmc[1] = add_obmc16_c; |
||||
c->add_dirac_obmc[2] = add_obmc32_c; |
||||
|
||||
c->weight_dirac_pixels_tab[0] = weight_dirac_pixels8_c; |
||||
c->weight_dirac_pixels_tab[1] = weight_dirac_pixels16_c; |
||||
c->weight_dirac_pixels_tab[2] = weight_dirac_pixels32_c; |
||||
c->biweight_dirac_pixels_tab[0] = biweight_dirac_pixels8_c; |
||||
c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c; |
||||
c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c; |
||||
|
||||
PIXFUNC(put, 8); |
||||
PIXFUNC(put, 16); |
||||
PIXFUNC(put, 32); |
||||
PIXFUNC(avg, 8); |
||||
PIXFUNC(avg, 16); |
||||
PIXFUNC(avg, 32); |
||||
|
||||
//MMX_DISABLE if (HAVE_MMX) ff_diracdsp_init_mmx(c);
|
||||
} |
@ -0,0 +1,65 @@ |
||||
/*
|
||||
* Copyright (C) 2010 David Conrad |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#ifndef AVCODEC_DIRACDSP_H |
||||
#define AVCODEC_DIRACDSP_H |
||||
|
||||
typedef void (*dirac_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int h); |
||||
typedef void (*dirac_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int h); |
||||
|
||||
typedef struct { |
||||
void (*dirac_hpel_filter)(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height); |
||||
/**
|
||||
* dirac_pixels_tab[width][subpel] |
||||
* width is 2 for 32, 1 for 16, 0 for 8 |
||||
* subpel is 0 for fpel and hpel (only need to copy from the first plane in src) |
||||
* 1 if an average of the first 2 planes is needed (TODO: worth it?) |
||||
* 2 for general qpel (avg of 4) |
||||
* 3 for general epel (biweight of 4 using the weights in src[4]) |
||||
* src[0-3] is each of the hpel planes |
||||
* src[4] is the 1/8 pel weights if needed |
||||
*/ |
||||
void (*put_dirac_pixels_tab[3][4])(uint8_t *dst, const uint8_t *src[5], int stride, int h); |
||||
void (*avg_dirac_pixels_tab[3][4])(uint8_t *dst, const uint8_t *src[5], int stride, int h); |
||||
|
||||
void (*put_signed_rect_clamped)(uint8_t *dst/*align 16*/, int dst_stride, const int16_t *src/*align 16*/, int src_stride, int width, int height/*mod 2*/); |
||||
void (*put_rect_clamped)(uint8_t *dst/*align 16*/, int dst_stride, const int16_t *src/*align 16*/, int src_stride, int width, int height/*mod 2*/); |
||||
void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int idwt_stride, int width, int height/*mod 2*/); |
||||
void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); |
||||
|
||||
dirac_weight_func weight_dirac_pixels_tab[3]; |
||||
dirac_biweight_func biweight_dirac_pixels_tab[3]; |
||||
} DiracDSPContext; |
||||
|
||||
#define DECL_DIRAC_PIXOP(PFX, EXT) \ |
||||
void ff_ ## PFX ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h); \
|
||||
void ff_ ## PFX ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h); \
|
||||
void ff_ ## PFX ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h) |
||||
|
||||
DECL_DIRAC_PIXOP(put, c); |
||||
DECL_DIRAC_PIXOP(avg, c); |
||||
DECL_DIRAC_PIXOP(put, l2_c); |
||||
DECL_DIRAC_PIXOP(avg, l2_c); |
||||
DECL_DIRAC_PIXOP(put, l4_c); |
||||
DECL_DIRAC_PIXOP(avg, l4_c); |
||||
|
||||
void ff_diracdsp_init(DiracDSPContext *c); |
||||
|
||||
#endif |
@ -0,0 +1,95 @@ |
||||
/*
|
||||
* Copyright (C) 2010 David Conrad |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#include "dsputil_mmx.h" |
||||
#include "diracdsp_mmx.h" |
||||
|
||||
void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); |
||||
void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); |
||||
void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); |
||||
void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); |
||||
|
||||
/*
 * Declares the external row filters ff_dirac_hpel_filter_{v,h}_<EXT> and
 * builds dirac_hpel_filter_<EXT>() on top of them.
 *
 * Per row: the vertical filter writes dstv starting MMSIZE bytes before the
 * row and covering width+MMSIZE+5 columns, then the horizontal filter runs
 * over src (into dsth) and over the fresh dstv row (into dstc).
 */
#define HPEL_FILTER(MMSIZE, EXT)                                                             \
    void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, uint8_t *, int, int);                     \
    void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, uint8_t *, int);                          \
                                                                                             \
    static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,       \
                                          uint8_t *src, int stride, int width, int height)   \
    {                                                                                        \
        for (; height--; dsth += stride, dstv += stride, dstc += stride, src += stride) {    \
            ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE, stride, width+MMSIZE+5); \
            ff_dirac_hpel_filter_h_ ## EXT(dsth, src, width);                                \
            ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width);                               \
        }                                                                                    \
    }

#if !ARCH_X86_64
HPEL_FILTER(8, mmx)
#endif
HPEL_FILTER(16, sse2)
||||
|
||||
#define PIXFUNC(PFX, IDX, EXT) \ |
||||
c->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT; \
|
||||
c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT; \
|
||||
c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT |
||||
|
||||
void ff_diracdsp_init_mmx(DiracDSPContext* c) |
||||
{ |
||||
int mm_flags = av_get_cpu_flags();; |
||||
|
||||
#if HAVE_YASM |
||||
c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx; |
||||
#if !ARCH_X86_64 |
||||
c->add_dirac_obmc[1] = ff_add_dirac_obmc16_mmx; |
||||
c->add_dirac_obmc[2] = ff_add_dirac_obmc32_mmx; |
||||
c->dirac_hpel_filter = dirac_hpel_filter_mmx; |
||||
c->add_rect_clamped = ff_add_rect_clamped_mmx; |
||||
c->put_signed_rect_clamped = ff_put_signed_rect_clamped_mmx; |
||||
#endif |
||||
#endif |
||||
|
||||
PIXFUNC(put, 0, mmx); |
||||
PIXFUNC(avg, 0, mmx); |
||||
|
||||
if (mm_flags & AV_CPU_FLAG_MMX2) { |
||||
PIXFUNC(avg, 0, mmx2); |
||||
} |
||||
|
||||
if (mm_flags & AV_CPU_FLAG_SSE2) { |
||||
#if HAVE_YASM |
||||
c->dirac_hpel_filter = dirac_hpel_filter_sse2; |
||||
c->add_rect_clamped = ff_add_rect_clamped_sse2; |
||||
c->put_signed_rect_clamped = ff_put_signed_rect_clamped_sse2; |
||||
|
||||
c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2; |
||||
c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2; |
||||
#endif |
||||
c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2; |
||||
c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2; |
||||
c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2; |
||||
c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2; |
||||
} |
||||
} |
@ -0,0 +1,47 @@ |
||||
/*
|
||||
* Copyright (c) 2010 David Conrad |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#ifndef AVCODEC_X86_DIRACDSP_H |
||||
#define AVCODEC_X86_DIRACDSP_H |
||||
|
||||
#include "libavcodec/diracdsp.h" |
||||
|
||||
void ff_diracdsp_init_mmx(DiracDSPContext* c); |
||||
|
||||
DECL_DIRAC_PIXOP(put, mmx); |
||||
DECL_DIRAC_PIXOP(avg, mmx); |
||||
DECL_DIRAC_PIXOP(avg, mmx2); |
||||
|
||||
void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h); |
||||
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h); |
||||
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h); |
||||
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h); |
||||
|
||||
void ff_add_rect_clamped_mmx(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int); |
||||
void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int); |
||||
|
||||
void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); |
||||
void ff_add_dirac_obmc16_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); |
||||
void ff_add_dirac_obmc32_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); |
||||
|
||||
void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); |
||||
void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); |
||||
|
||||
#endif |
@ -0,0 +1,260 @@ |
||||
;****************************************************************************** |
||||
;* Copyright (c) 2010 David Conrad |
||||
;* |
||||
;* This file is part of FFmpeg. |
||||
;* |
||||
;* FFmpeg is free software; you can redistribute it and/or |
||||
;* modify it under the terms of the GNU Lesser General Public |
||||
;* License as published by the Free Software Foundation; either |
||||
;* version 2.1 of the License, or (at your option) any later version. |
||||
;* |
||||
;* FFmpeg is distributed in the hope that it will be useful, |
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
;* Lesser General Public License for more details. |
||||
;* |
||||
;* You should have received a copy of the GNU Lesser General Public |
||||
;* License along with FFmpeg; if not, write to the Free Software |
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
;****************************************************************************** |
||||
|
||||
%include "x86inc.asm" |
||||
|
||||
SECTION_RODATA |
||||
pw_3: times 8 dw 3 |
||||
pw_7: times 8 dw 7 |
||||
pw_16: times 8 dw 16 |
||||
pw_32: times 8 dw 32 |
||||
pb_128: times 16 db 128 |
||||
|
||||
section .text |
||||
|
||||
; UNPACK_ADD lo, hi, srcA, srcB, ldA, ldB
;   Loads one vector from srcA and one from srcB (ldA / ldB are the mov
;   suffixes, e.g. a -> mova, u -> movu), interleaves their bytes with m7
;   to form words and returns the word sums:
;       lo = low-half(A)  + low-half(B)
;       hi = high-half(A) + high-half(B)
;   m7 must hold zero for the interleave to be a zero-extension; both users
;   in this file clear it with pxor before their loop.
;   Clobbers m4 and m5.
%macro UNPACK_ADD 6
    mov%5       %1, %3
    mova        m4, %1          ; second copy of A for the high half
    mov%6       m5, %4
    mova        %2, m5          ; second copy of B for the high half
    punpcklbw   %1, m7          ; A, low bytes -> words
    punpckhbw   m4, m7          ; A, high bytes -> words
    punpcklbw   m5, m7          ; B, low bytes -> words
    punpckhbw   %2, m7          ; B, high bytes -> words
    paddw       %1, m5          ; lo result
    paddw       %2, m4          ; hi result
%endmacro
||||
|
||||
; HPEL_FILTER ext
;   Emits the two row filters used by the C wrapper dirac_hpel_filter_<ext>.
;   Both evaluate, per output byte,
;       (21*(s[0]+s[1]) - 7*(s[-1]+s[2]) + 3*(s[-2]+s[3]) - (s[-3]+s[4]) + 16) >> 5
;   saturated to 8 bits by packuswb. The factor 21 is built as 7 followed
;   by 3, so the +3 term is added between the two multiplies.
;   m7 is kept at zero for UNPACK_ADD; m4/m5 are scratch of that macro.
%macro HPEL_FILTER 1
; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
;   Taps are taken along the stride (vertical) direction. One vector of
;   mmsize bytes is produced per iteration with aligned loads and stores;
;   the loop runs while the remaining width is positive.
cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
    pxor        m7, m7
    lea         stridex3q, [3*strideq]
    mov         src0q, srcq
    sub         src0q, stridex3q        ; src0 = src - 3*stride
.loop:
    ; 7*(s[0] + s[1])
    UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
    pmullw      m0, [pw_7]
    pmullw      m1, [pw_7]

    ; (above + s[-2] + s[3]) * 3  ->  21*(s[0]+s[1]) + 3*(s[-2]+s[3])
    UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
    paddw       m0, m2
    paddw       m1, m3
    pmullw      m0, [pw_3]
    pmullw      m1, [pw_3]

    ; - 7*(s[-1] + s[2])
    UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
    pmullw      m2, [pw_7]
    pmullw      m3, [pw_7]
    psubw       m0, m2
    psubw       m1, m3

    ; - (s[-3] + s[4])
    UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
    psubw       m0, m2
    psubw       m1, m3

    ; round, scale down by 32, saturate to bytes
    paddw       m0, [pw_16]
    paddw       m1, [pw_16]
    psraw       m0, 5
    psraw       m1, 5
    packuswb    m0, m1
    mova        [dstq], m0

    add         src0q, mmsize
    add         srcq, mmsize
    add         dstq, mmsize
    sub         widthd, mmsize          ; must stay last: jg tests its flags
    jg          .loop
    RET

; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
;   Taps are taken along the row. width is turned into the offset of the
;   last vector, (width-1) & ~(mmsize-1), and the row is processed from
;   that offset down to 0. Stores to dst are aligned.
cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
    pxor        m7, m7
    dec         widthd
    and         widthd, ~(mmsize-1)
.loop:
    ; 7*(s[0] + s[1])
    UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], a,u
    pmullw      m0, [pw_7]
    pmullw      m1, [pw_7]

    ; (above + s[-2] + s[3]) * 3  ->  21*(s[0]+s[1]) + 3*(s[-2]+s[3])
    UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
    paddw       m0, m2
    paddw       m1, m3
    pmullw      m0, [pw_3]
    pmullw      m1, [pw_3]

    ; - 7*(s[-1] + s[2])
    UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
    pmullw      m2, [pw_7]
    pmullw      m3, [pw_7]
    psubw       m0, m2
    psubw       m1, m3

    ; - (s[-3] + s[4])
    UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
    psubw       m0, m2
    psubw       m1, m3

    ; round, scale down by 32, saturate to bytes
    paddw       m0, [pw_16]
    paddw       m1, [pw_16]
    psraw       m0, 5
    psraw       m1, 5
    packuswb    m0, m1
    mova        [dstq + widthq], m0
    sub         widthd, mmsize
    jge         .loop                   ; offset 0 is still a valid vector
    RET
%endmacro
||||
|
||||
; PUT_RECT ext
;   Emits put_signed_rect_clamped_<ext>.
;
; void put_signed_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
;   Converts signed 16-bit samples to unsigned bytes: each word is packed to
;   a signed byte with saturation (packsswb) and 128 is then added (paddb
;   with pb_128).
;   - width is rounded up to a multiple of mmsize and each row is processed
;     from its right end towards offset 0;
;   - two rows are handled per outer iteration (src2/dst2 point one row
;     below src/dst), so height is consumed in steps of 2;
;   - src_stride is scaled by 2 when forming addresses, dst_stride is not;
;   - src and dst are accessed with aligned moves.
;   The rounded width is kept in wspill and the row counter in hd: scratch
;   registers r10d/r11d on x86-64, the stack slots of arguments 4 and 5
;   otherwise.
%macro PUT_RECT 1
cglobal put_signed_rect_clamped_%1, 5,7,3, dst, dst_stride, src, src_stride, w, dst2, src2
    mova    m0, [pb_128]
    add     wd, (mmsize-1)
    and     wd, ~(mmsize-1)

%ifdef ARCH_X86_64
    mov   r10d, r5m
    mov   r11d, wd
    %define wspill r11d
    %define hd r10d
%else
    mov    r4m, wd
    %define wspill r4m
    %define hd r5mp
%endif

.loopy:
    lea     src2q, [srcq+src_strideq*2]
    lea     dst2q, [dstq+dst_strideq]
.loopx:
    sub      wd, mmsize                 ; flags are consumed by jg below;
                                        ; the SIMD ops in between leave them alone
    mova     m1, [srcq +2*wq]
    mova     m2, [src2q+2*wq]
    packsswb m1, [srcq +2*wq+mmsize]
    packsswb m2, [src2q+2*wq+mmsize]
    paddb    m1, m0
    paddb    m2, m0
    mova    [dstq +wq], m1
    mova    [dst2q+wq], m2
    jg      .loopx

    lea   srcq, [srcq+src_strideq*4]    ; advance two source rows
    lea   dstq, [dstq+dst_strideq*2]    ; advance two destination rows
    sub     hd, 2
    mov     wd, wspill                  ; mov keeps the flags of the sub
    jg      .loopy
    RET
%endm
||||
|
||||
; ADD_RECT ext
;   Emits add_rect_clamped_<ext>.
;
; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
;   Per pixel: dst = saturate_u8(((src + 32) >> 6) + idwt), with the shift
;   done by psraw on 16-bit lanes.
;   - width is rounded up to a multiple of mmsize; each row is processed
;     from its right end towards offset 0, mmsize pixels per pass;
;   - src is read unaligned, idwt and dst with aligned accesses;
;   - stride is scaled by 2 for src and unscaled for dst; idwt_stride is
;     scaled by 2.
;   The single .loop label serves both the column loop and the row loop:
;   a row has no set-up of its own apart from reloading w from wspill.
%macro ADD_RECT 1
cglobal add_rect_clamped_%1, 7,7,3, dst, src, stride, idwt, idwt_stride, w, h
    mova    m0, [pw_32]
    add     wd, (mmsize-1)
    and     wd, ~(mmsize-1)

%ifdef ARCH_X86_64
    mov   r11d, wd
    %define wspill r11d
%else
    mov    r5m, wd
    %define wspill r5m
%endif

.loop:
    sub     wd, mmsize                  ; flags are consumed by the first jg
    movu    m1, [srcq +2*wq] ; FIXME: ensure alignment
    movu    m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
    paddw   m1, m0
    paddw   m2, m0
    psraw   m1, 6
    psraw   m2, 6
    paddw   m1, [idwtq+2*wq]
    paddw   m2, [idwtq+2*wq+mmsize]
    packuswb m1, m2
    mova    [dstq +wq], m1
    jg      .loop

    ; next row
    lea   srcq, [srcq + 2*strideq]
    add   dstq, strideq
    lea  idwtq, [idwtq+ 2*idwt_strideq]
    sub     hd, 1
    mov     wd, wspill                  ; mov keeps the flags of the sub
    jg      .loop
    RET
%endm
||||
|
||||
; ADD_OBMC width, ext
;   Emits add_dirac_obmc<width>_<ext>.
;
; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
;   For yblen rows: dst[x] += src[x] * obmc_weight[x] for x in [0, width),
;   on 16-bit lanes. The row body is unrolled width/mmsize times.
;   - src and obmc_weight are read with aligned loads, dst is read and
;     written unaligned;
;   - per row src advances by stride, dst by 2*stride and the weight
;     pointer by a fixed 32 bytes.
;   m4 is kept at zero for the byte -> word unpacking.
;   NOTE(review): cglobal asks for 6 arguments/registers while only 5
;   arguments are named - confirm whether 5,5,5 was intended.
%macro ADD_OBMC 2
cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
    pxor        m4, m4
.loop:
%assign i 0
%rep %1 / mmsize
    ; widen mmsize source pixels and their weights to words
    mova        m0, [srcq+i]
    mova        m2, [obmcq+i]
    mova        m1, m0
    mova        m3, m2
    punpcklbw   m0, m4
    punpckhbw   m1, m4
    punpcklbw   m2, m4
    punpckhbw   m3, m4
    ; pixel * weight
    pmullw      m0, m2
    pmullw      m1, m3
    ; accumulate into dst
    movu        m2, [dstq+2*i]
    movu        m3, [dstq+2*i+mmsize]
    paddw       m0, m2
    paddw       m1, m3
    movu        [dstq+2*i], m0
    movu        [dstq+2*i+mmsize], m1
%assign i i+mmsize
%endrep
    lea         srcq, [srcq+strideq]
    lea         dstq, [dstq+2*strideq]
    add         obmcq, 32
    sub         yblend, 1
    jg          .loop
    RET
%endm
||||
|
||||
INIT_MMX |
||||
%ifndef ARCH_X86_64 |
||||
PUT_RECT mmx |
||||
ADD_RECT mmx |
||||
|
||||
HPEL_FILTER mmx |
||||
ADD_OBMC 32, mmx |
||||
ADD_OBMC 16, mmx |
||||
%endif |
||||
ADD_OBMC 8, mmx |
||||
|
||||
INIT_XMM |
||||
PUT_RECT sse2 |
||||
ADD_RECT sse2 |
||||
|
||||
HPEL_FILTER sse2 |
||||
ADD_OBMC 32, sse2 |
||||
ADD_OBMC 16, sse2 |
@ -0,0 +1,195 @@ |
||||
/*
|
||||
* MMX optimized discrete wavelet transform |
||||
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
||||
* Copyright (c) 2010 David Conrad |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#include "libavutil/x86_cpu.h" |
||||
#include "dsputil_mmx.h" |
||||
#include "dwt.h" |
||||
|
||||
/*
 * COMPOSE_VERTICAL(ext, align)
 *
 * Declares the assembly vertical-compose kernels carrying the suffix `ext`
 * and defines a static C wrapper around each of them.
 *
 * Every wrapper splits the row at width_align = width rounded down to a
 * multiple of `align` (a power of two, the number of coefficients the kernel
 * handles per iteration). The trailing coefficients [width_align, width) are
 * computed in C with the COMPOSE_* macros, and the aligned head
 * [0, width_align) is handed to the assembly kernel.
 *
 * The assembly kernels subtract one vector's worth of elements from the
 * width before their first load, so they must not be entered with a zero
 * width; the wrappers therefore skip the call when nothing remains for
 * them to do (width < align).
 */
#define COMPOSE_VERTICAL(ext, align) \
void ff_vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \
void ff_vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \
void ff_vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \
void ff_vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \
void ff_vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width); \
\
static void vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \
{ \
    int i, width_align = width&~(align-1); \
\
    for(i=width_align; i<width; i++) \
        b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \
\
    if (width_align > 0) \
        ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \
} \
\
static void vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \
{ \
    int i, width_align = width&~(align-1); \
\
    for(i=width_align; i<width; i++) \
        b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \
\
    if (width_align > 0) \
        ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \
} \
\
static void vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \
                                           IDWTELEM *b3, IDWTELEM *b4, int width) \
{ \
    int i, width_align = width&~(align-1); \
\
    for(i=width_align; i<width; i++) \
        b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
\
    if (width_align > 0) \
        ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \
} \
\
static void vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \
                                          IDWTELEM *b3, IDWTELEM *b4, int width) \
{ \
    int i, width_align = width&~(align-1); \
\
    for(i=width_align; i<width; i++) \
        b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
\
    if (width_align > 0) \
        ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \
} \
\
static void vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width) \
{ \
    int i, width_align = width&~(align-1); \
\
    for(i=width_align; i<width; i++) { \
        b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \
        b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \
    } \
\
    if (width_align > 0) \
        ff_vertical_compose_haar##ext(b0, b1, width_align); \
}
||||
|
||||
#if HAVE_YASM |
||||
#if !ARCH_X86_64 |
||||
COMPOSE_VERTICAL(_mmx, 4) |
||||
#endif |
||||
COMPOSE_VERTICAL(_sse2, 8) |
||||
#endif |
||||
|
||||
|
||||
void ff_horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w); |
||||
|
||||
void ff_horizontal_compose_haar0i_mmx(IDWTELEM *b, IDWTELEM *tmp, int w); |
||||
void ff_horizontal_compose_haar1i_mmx(IDWTELEM *b, IDWTELEM *tmp, int w); |
||||
void ff_horizontal_compose_haar0i_sse2(IDWTELEM *b, IDWTELEM *tmp, int w); |
||||
void ff_horizontal_compose_haar1i_sse2(IDWTELEM *b, IDWTELEM *tmp, int w); |
||||
|
||||
/**
 * Scalar tail of the DD 9/7 horizontal compose.
 *
 * For every x in [x, w2) writes the interleaved output pair
 * b[2*x], b[2*x+1]: the even sample is tmp[x] rounded and halved, the odd
 * sample is the COMPOSE_DD97iH0 result built from tmp[x-1..x+2] and the
 * high-band coefficient b[x+w2], likewise rounded and halved.
 *
 * @param b    coefficient row, updated in place
 * @param tmp  low-pass results; indices x-1 .. w2+1 are read
 * @param w2   half of the row width
 * @param x    first index to process
 */
void ff_horizontal_compose_dd97i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x)
{
    IDWTELEM *high = b + w2;

    while (x < w2) {
        b[2*x]     = (tmp[x] + 1) >> 1;
        b[2*x + 1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], high[x], tmp[x+1], tmp[x+2]) + 1) >> 1;
        x++;
    }
}
||||
|
||||
/**
 * Scalar tail of the Haar horizontal compose without the final shift.
 *
 * For every x in [x, w2): b[2*x] = tmp[x] and
 * b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]).
 *
 * @param b    coefficient row, updated in place
 * @param tmp  low-pass results
 * @param w2   half of the row width
 * @param x    first index to process
 */
void ff_horizontal_compose_haar0i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x)
{
    IDWTELEM *high = b + w2;

    while (x < w2) {
        b[2*x]     = tmp[x];
        b[2*x + 1] = COMPOSE_HAARiH0(high[x], tmp[x]);
        x++;
    }
}
||||
|
||||
/**
 * Scalar tail of the Haar horizontal compose with the final rounding shift.
 *
 * Same as the haar0i tail, except that both output samples are rounded and
 * halved: (v + 1) >> 1.
 *
 * @param b    coefficient row, updated in place
 * @param tmp  low-pass results
 * @param w2   half of the row width
 * @param x    first index to process
 */
void ff_horizontal_compose_haar1i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x)
{
    IDWTELEM *high = b + w2;

    while (x < w2) {
        b[2*x]     = (tmp[x] + 1) >> 1;
        b[2*x + 1] = (COMPOSE_HAARiH0(high[x], tmp[x]) + 1) >> 1;
        x++;
    }
}
||||
|
||||
/**
 * Install the x86 SIMD compose routines for the given wavelet type in d.
 *
 * The function is a no-op unless HAVE_YASM is set. Routines are installed
 * in order of increasing capability, so a later tier overrides the pointers
 * set by an earlier one:
 *   - MMX (32-bit builds only): vertical compose for all handled types and
 *     horizontal compose for the two Haar variants;
 *   - SSE2: the same set of pointers;
 *   - SSSE3: horizontal compose for DWT_DIRAC_DD9_7.
 * The function returns as soon as a required CPU flag is missing. Wavelet
 * types not listed in a switch leave the context untouched for that tier.
 *
 * @param d     DWT context whose function pointers are filled in
 * @param type  wavelet type being decoded
 */
void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type)
{
#if HAVE_YASM
    int mm_flags = av_get_cpu_flags();

#if !ARCH_X86_64
    if (!(mm_flags & AV_CPU_FLAG_MMX))
        return;

    switch (type) {
    case DWT_DIRAC_DD9_7:
        d->vertical_compose_l0 = vertical_compose53iL0_mmx;
        d->vertical_compose_h0 = vertical_compose_dd97iH0_mmx;
        break;
    case DWT_DIRAC_LEGALL5_3:
        d->vertical_compose_l0 = vertical_compose53iL0_mmx;
        d->vertical_compose_h0 = vertical_compose_dirac53iH0_mmx;
        break;
    case DWT_DIRAC_DD13_7:
        d->vertical_compose_l0 = vertical_compose_dd137iL0_mmx;
        d->vertical_compose_h0 = vertical_compose_dd97iH0_mmx;
        break;
    case DWT_DIRAC_HAAR0:
        d->vertical_compose   = vertical_compose_haar_mmx;
        d->horizontal_compose = ff_horizontal_compose_haar0i_mmx;
        break;
    case DWT_DIRAC_HAAR1:
        d->vertical_compose   = vertical_compose_haar_mmx;
        d->horizontal_compose = ff_horizontal_compose_haar1i_mmx;
        break;
    default:
        /* no MMX routine for this wavelet type */
        break;
    }
#endif

    if (!(mm_flags & AV_CPU_FLAG_SSE2))
        return;

    switch (type) {
    case DWT_DIRAC_DD9_7:
        d->vertical_compose_l0 = vertical_compose53iL0_sse2;
        d->vertical_compose_h0 = vertical_compose_dd97iH0_sse2;
        break;
    case DWT_DIRAC_LEGALL5_3:
        d->vertical_compose_l0 = vertical_compose53iL0_sse2;
        d->vertical_compose_h0 = vertical_compose_dirac53iH0_sse2;
        break;
    case DWT_DIRAC_DD13_7:
        d->vertical_compose_l0 = vertical_compose_dd137iL0_sse2;
        d->vertical_compose_h0 = vertical_compose_dd97iH0_sse2;
        break;
    case DWT_DIRAC_HAAR0:
        d->vertical_compose   = vertical_compose_haar_sse2;
        d->horizontal_compose = ff_horizontal_compose_haar0i_sse2;
        break;
    case DWT_DIRAC_HAAR1:
        d->vertical_compose   = vertical_compose_haar_sse2;
        d->horizontal_compose = ff_horizontal_compose_haar1i_sse2;
        break;
    default:
        /* no SSE2 routine for this wavelet type */
        break;
    }

    if (!(mm_flags & AV_CPU_FLAG_SSSE3))
        return;

    switch (type) {
    case DWT_DIRAC_DD9_7:
        d->horizontal_compose = ff_horizontal_compose_dd97i_ssse3;
        break;
    default:
        /* no SSSE3 routine for this wavelet type */
        break;
    }
#endif // HAVE_YASM
}
@ -0,0 +1,30 @@ |
||||
/*
|
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#ifndef AVCODEC_X86_DWT_H
#define AVCODEC_X86_DWT_H

#include "libavcodec/dwt.h"

/* Scalar tails of the horizontal compose routines: each processes the
 * indices [x, w2) of the row. */
void ff_horizontal_compose_dd97i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
void ff_horizontal_compose_haar0i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
void ff_horizontal_compose_haar1i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);

/* Fills the compose function pointers of d with x86 SIMD routines. */
void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type);

#endif /* AVCODEC_X86_DWT_H */
@ -0,0 +1,312 @@ |
||||
;****************************************************************************** |
||||
;* MMX optimized discrete wavelet transform |
||||
;* Copyright (c) 2010 David Conrad |
||||
;* |
||||
;* This file is part of FFmpeg. |
||||
;* |
||||
;* FFmpeg is free software; you can redistribute it and/or |
||||
;* modify it under the terms of the GNU Lesser General Public |
||||
;* License as published by the Free Software Foundation; either |
||||
;* version 2.1 of the License, or (at your option) any later version. |
||||
;* |
||||
;* FFmpeg is distributed in the hope that it will be useful, |
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
;* Lesser General Public License for more details. |
||||
;* |
||||
;* You should have received a copy of the GNU Lesser General Public |
||||
;* License along with FFmpeg; if not, write to the Free Software |
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
;****************************************************************************** |
||||
|
||||
%include "x86inc.asm" |
||||
|
||||
cextern horizontal_compose_dd97i_end_c |
||||
cextern horizontal_compose_haar0i_end_c |
||||
cextern horizontal_compose_haar1i_end_c |
||||
|
||||
SECTION_RODATA
; Rounding constants, replicated across eight 16-bit lanes.
pw_1:       times 8 dw 1
pw_2:       times 8 dw 2
pw_8:       times 8 dw 8
pw_16:      times 8 dw 16
; Alternating (9, -1) word pairs: the pmaddwd multiplier used by the
; dd97 / dd137 filters on interleaved (inner, outer) tap sums.
pw_1991:    times 4 dw 9, -1
||||
|
||||
section .text |
||||
|
||||
; COMPOSE_53iL0 dst, a, b, round
;   dst -= (a + b + round) >> 2      on signed 16-bit lanes
;   %1 = dst (register, updated)
;   %2 = a   (register, clobbered)
;   %3 = b   (register or memory)
;   %4 = rounding term; callers pass a register holding pw_2
%macro COMPOSE_53iL0 4
    paddw   %2, %4
    paddw   %2, %3
    psraw   %2, 2
    psubw   %1, %2
%endmacro
||||
|
||||
; COMPOSE_DD97iH0 c, in1, in2 [, c_src]
;   m1 = c + ((-m0 + 9*m1 + 9*in1 - in2 + 8) >> 4)
;   %1 = c, added last; when the optional %4 is given, %1 is a register that
;        is first loaded (unaligned) from %4
;   %2 = in1, added to m1 (weight 9)
;   %3 = in2, added to m0 (weight -1)
; Register contract:
;   in:       m0, m1 = outer / inner taps, m3 = pw_8, m4 = pw_1991
;   out:      m1
;   clobbers: m0, m2 (and %1 when %4 is supplied)
%macro COMPOSE_DD97iH0 3-4
    paddw       m1, %2              ; inner = m1 + in1
    paddw       m0, %3              ; outer = m0 + in2
    psubw       m0, m3              ; outer - 8, so that -1 * it contributes +8
    mova        m2, m1
    punpcklwd   m1, m0              ; (inner, outer-8) pairs, low half
    punpckhwd   m2, m0              ; (inner, outer-8) pairs, high half
    pmaddwd     m1, m4              ; 9*inner - (outer-8), 32-bit
    pmaddwd     m2, m4
%if %0 > 3
    movu        %1, %4              ; safe only now: %1 may alias m0
%endif
    psrad       m1, 4
    psrad       m2, 4
    packssdw    m1, m2
    paddw       m1, %1
%endmacro
||||
|
||||
; COMPOSE_VERTICAL suffix
;
; Emits the five vertical compose kernels with the given symbol suffix for
; the currently selected register size. All kernels share the same shape:
;   - width counts 16-bit coefficients; each iteration handles mmsize/2 of them;
;   - the loop runs from the end of the row towards index 0: width is
;     decremented first and then used as the index, so the kernels must be
;     called with a positive multiple of mmsize/2;
;   - all row accesses use aligned loads/stores (mova);
;   - the SIMD instructions between the `sub` and the `jg` do not modify the
;     flags, so `jg` tests the result of the `sub`.
; The register counts passed to cglobal match the registers actually used.
%macro COMPOSE_VERTICAL 1
; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                                  int width)
; b1[i] -= (b0[i] + b2[i] + 2) >> 2
cglobal vertical_compose53iL0_%1, 4,4,3, b0, b1, b2, width
    mova    m2, [pw_2]
.loop:
    sub     widthd, mmsize/2
    mova    m1, [b0q+2*widthq]
    mova    m0, [b1q+2*widthq]
    COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
    mova    [b1q+2*widthq], m0
    jg      .loop
    REP_RET

; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                                  int width)
; b1[i] += (b0[i] + b2[i] + 1) >> 1
cglobal vertical_compose_dirac53iH0_%1, 4,4,2, b0, b1, b2, width
    mova    m1, [pw_1]
.loop:
    sub     widthd, mmsize/2
    mova    m0, [b0q+2*widthq]
    paddw   m0, [b2q+2*widthq]
    paddw   m0, m1
    psraw   m0, 1
    paddw   m0, [b1q+2*widthq]
    mova    [b1q+2*widthq], m0
    jg      .loop
    REP_RET

; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                               IDWTELEM *b3, IDWTELEM *b4, int width)
; b2[i] += (-b0[i] + 9*b1[i] + 9*b3[i] - b4[i] + 8) >> 4
cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
    mova    m3, [pw_8]
    mova    m4, [pw_1991]
.loop:
    sub     widthd, mmsize/2
    mova    m0, [b0q+2*widthq]
    mova    m1, [b1q+2*widthq]
    COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
    mova    [b2q+2*widthq], m1
    jg      .loop
    REP_RET

; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                                IDWTELEM *b3, IDWTELEM *b4, int width)
; b2[i] -= (-b0[i] + 9*b1[i] + 9*b3[i] - b4[i] + 16) >> 5
cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
    mova    m3, [pw_16]
    mova    m4, [pw_1991]
.loop:
    sub     widthd, mmsize/2
    mova    m0, [b0q+2*widthq]
    mova    m1, [b1q+2*widthq]
    mova    m5, [b2q+2*widthq]
    paddw   m0, [b4q+2*widthq]      ; outer taps: b0 + b4
    paddw   m1, [b3q+2*widthq]      ; inner taps: b1 + b3
    psubw   m0, m3                  ; outer - 16, negated by the -1 weight below
    mova    m2, m1
    punpcklwd m1, m0
    punpckhwd m2, m0
    pmaddwd m1, m4                  ; 9*inner - (outer - 16), 32-bit
    pmaddwd m2, m4
    psrad   m1, 5
    psrad   m2, 5
    packssdw m1, m2
    psubw   m5, m1
    mova    [b2q+2*widthq], m5
    jg      .loop
    REP_RET

; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
; b0[i] -= (b1[i] + 1) >> 1;  b1[i] += b0[i]   (using the updated b0)
cglobal vertical_compose_haar_%1, 3,3,4, b0, b1, width
    mova    m3, [pw_1]
.loop:
    sub     widthd, mmsize/2
    mova    m1, [b1q+2*widthq]
    mova    m0, [b0q+2*widthq]
    mova    m2, m1                  ; keep the original b1 for the second update
    paddw   m1, m3
    psraw   m1, 1
    psubw   m0, m1
    mova    [b0q+2*widthq], m0
    paddw   m2, m0
    mova    [b1q+2*widthq], m2
    jg      .loop
    REP_RET
%endmacro
||||
|
||||
; EDGE_EXTENSION left, right, scratch
;   Replicates the first element of tmp into the `left` slots just before
;   tmp[0], and the last element tmp[w2-1] into the `right` slots starting at
;   tmp[w2]. Slots are 2 bytes apart.
;   %1 = number of slots to fill on the left
;   %2 = number of slots to fill on the right
;   %3 = scratch register used for the copies (clobbered)
;   Uses tmpq and w2q of the enclosing function.
%macro EDGE_EXTENSION 3
    mov     %3, [tmpq]
%assign %%off 2
%rep %1
    mov     [tmpq-%%off], %3
    %assign %%off %%off+2
%endrep
    mov     %3, [tmpq+2*w2q-2]
%assign %%off 0
%rep %2
    mov     [tmpq+2*w2q+%%off], %3
    %assign %%off %%off+2
%endrep
%endmacro
||||
|
||||
; END_HORIZONTAL c_tail
;   Common epilogue of the horizontal compose kernels. %1 is the C function
;   that finishes the columns the SIMD loop left over; it takes
;   (b, tmp, w2, x), which is why w is halved first.
;
;   x86-32: the four arguments are pushed and the C function is called.
;   x86-64: the intent is a tail call to the C function; x86-32 does not do
;   that because there isn't enough stack space for the additional argument x.
;   NOTE(review): on x86-64 a RET is emitted *before* the jmp (it replaced a
;   former CLEANUP call). If RET expands to an actual return, the jmp is never
;   reached and the leftover columns are not processed here - confirm against
;   x86inc and the C callers.
%macro END_HORIZONTAL 1
    shr     wd, 1
%ifndef ARCH_X86_64
    push    xd
    push    wd
    push    tmpd
    push    bd
    call    %1
    add     esp, 16
    RET
%else
    RET ;This RET was a CLEANUP call
    jmp     %1
%endif
%endmacro
||||
|
||||
; HAAR_HORIZONTAL suffix, shift
;   %1 = instruction-set suffix of the emitted symbol
;   %2 = 0 or 1; with 1 both outputs are rounded and halved, (v + 1) >> 1
;
; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
;
; Registers: x = coefficient index, w2 = width / 2,
;            b_w2 = b + width bytes = &b[w2] (start of the high band).
; The high band is always read with unaligned loads: &b[w2] is only
; vector-aligned when w2 happens to be a multiple of mmsize/2, which nothing
; here guarantees. b and tmp themselves are accessed with aligned loads and
; stores.
cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
    mov     w2d, wd
    xor     xd, xd
    shr     w2d, 1
    lea     b_w2q, [bq+wq]
    mova    m3, [pw_1]
    ; low-pass: tmp[x] = b[x] - ((b[w2+x] + 1) >> 1)
.lowpass_loop:
    movu    m1, [b_w2q + 2*xq]
    mova    m0, [bq    + 2*xq]
    paddw   m1, m3
    psraw   m1, 1
    psubw   m0, m1
    mova    [tmpq + 2*xq], m0
    add     xd, mmsize/2
    cmp     xd, w2d
    jl      .lowpass_loop

    ; the SIMD high-pass loop only covers whole vectors; the remainder is
    ; left to the C tail invoked by END_HORIZONTAL
    xor     xd, xd
    and     w2d, ~(mmsize/2 - 1)
    cmp     w2d, mmsize/2
    jl      .end

    ; high-pass: hi = b[w2+x] + tmp[x]; then interleave (tmp[x], hi) into b
.highpass_loop:
    movu    m1, [b_w2q + 2*xq]      ; unaligned, same address as in the low-pass loop
    mova    m0, [tmpq  + 2*xq]
    paddw   m1, m0

    ; shift and interleave
%if %2 == 1
    paddw   m0, m3
    paddw   m1, m3
    psraw   m0, 1
    psraw   m1, 1
%endif
    mova    m2, m0
    punpcklwd m0, m1
    punpckhwd m2, m1
    mova    [bq+4*xq], m0
    mova    [bq+4*xq+mmsize], m2

    add     xd, mmsize/2
    cmp     xd, w2d
    jl      .highpass_loop
.end:
    END_HORIZONTAL horizontal_compose_haar%2i_end_c
%endmacro
||||
|
||||
|
||||
INIT_XMM
; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
;
; Registers: x = coefficient index, w2 = width / 2,
;            b_w2 = b + width bytes = &b[w2] (start of the high band).
; Pass 1 (low-pass) writes tmp[x] = b[x] - ((b[w2+x-1] + b[w2+x] + 2) >> 2),
; with b[w2-1] replaced by b[w2] at the left edge.
; Pass 2 (high-pass) combines tmp[x-1 .. x+2] with b[w2+x] through
; COMPOSE_DD97iH0, rounds and halves both results and stores them interleaved
; in b. Only whole vectors are handled; the rest is left to the C tail.
cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
    xor     xd, xd
    mov     w2d, wd
    shr     w2d, 1
    lea     b_w2q, [bq+wq]
    mova    m7, [pw_2]
    ; m4 = "previous" high-band vector; seed its top word with b[w2] so the
    ; first palignr sees the first element replicated to the left
    movu    m4, [bq+wq]
    pslldq  m4, 14
.lowpass_loop:
    movu    m1, [b_w2q + 2*xq]
    mova    m0, [bq    + 2*xq]
    mova    m2, m1
    palignr m1, m4, 14              ; m1 = high band shifted by one element
    mova    m4, m2                  ; current vector becomes the previous one
    COMPOSE_53iL0 m0, m1, m2, m7
    mova    [tmpq + 2*xq], m0
    add     xd, mmsize/2
    cmp     xd, w2d
    jl      .lowpass_loop

    ; one replicated element before tmp[0], two after tmp[w2-1]
    EDGE_EXTENSION 1, 2, xw
    ; leave the last up to 7 (sse) or 3 (mmx) values for C
    xor     xd, xd
    and     w2d, ~(mmsize/2 - 1)
    cmp     w2d, mmsize/2
    jl      .end

    mova    m5, [pw_1]
    mova    m3, [pw_8]
    mova    m4, [pw_1991]
    mova    m7, [tmpq-mmsize]       ; previous tmp vector (top word = tmp[-1])
    mova    m0, [tmpq]              ; current tmp vector
.highpass_loop:
    mova    m6, m0                  ; m6 = tmp[x]
    palignr m0, m7, 14              ; m0 = tmp[x-1]
    mova    m7, [tmpq + 2*xq + 16]  ; next tmp vector
    mova    m2, m7
    mova    m1, m7
    palignr m2, m6, 4               ; m2 = tmp[x+2]
    palignr m1, m6, 2               ; m1 = tmp[x+1]
    COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
    ; rotate the window: next -> current, current -> previous
    mova    m0, m7
    mova    m7, m6

    ; shift and interleave
    paddw   m6, m5
    psraw   m6, 1
    paddw   m1, m5
    psraw   m1, 1
    mova    m2, m6
    punpcklwd m6, m1
    punpckhwd m2, m1
    mova    [bq+4*xq], m6
    mova    [bq+4*xq+mmsize], m2

    add     xd, mmsize/2
    cmp     xd, w2d
    jl      .highpass_loop
.end:
    END_HORIZONTAL horizontal_compose_dd97i_end_c
||||
|
||||
|
||||
; MMX kernels are emitted only when ARCH_X86_64 is not defined.
%ifndef ARCH_X86_64
INIT_MMX
    COMPOSE_VERTICAL mmx
    HAAR_HORIZONTAL  mmx, 0
    HAAR_HORIZONTAL  mmx, 1
%endif

; SSE2 kernels are emitted unconditionally.
INIT_XMM
    COMPOSE_VERTICAL sse2
    HAAR_HORIZONTAL  sse2, 0
    HAAR_HORIZONTAL  sse2, 1
Loading…
Reference in new issue