mirror of https://github.com/opencv/opencv.git
Open Source Computer Vision Library
https://opencv.org/
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
3439 lines
102 KiB
3439 lines
102 KiB
/////////////////////////////////////////////////////////////////////////// |
|
// |
|
// Copyright (c) 2009-2014 DreamWorks Animation LLC. |
|
// |
|
// All rights reserved. |
|
// |
|
// Redistribution and use in source and binary forms, with or without |
|
// modification, are permitted provided that the following conditions are |
|
// met: |
|
// * Redistributions of source code must retain the above copyright |
|
// notice, this list of conditions and the following disclaimer. |
|
// * Redistributions in binary form must reproduce the above |
|
// copyright notice, this list of conditions and the following disclaimer |
|
// in the documentation and/or other materials provided with the |
|
// distribution. |
|
// * Neither the name of DreamWorks Animation nor the names of |
|
// its contributors may be used to endorse or promote products derived |
|
// from this software without specific prior written permission. |
|
// |
|
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
|
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
|
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
|
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
|
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
|
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
|
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
|
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
|
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
|
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
// |
|
/////////////////////////////////////////////////////////////////////////// |
|
|
|
//--------------------------------------------------- |
|
// |
|
// class DwaCompressor -- Store lossy RGB data by quantizing |
|
// DCT components. |
|
// |
|
// First, we try and figure out what compression strategy to take |
|
// based in channel name. For RGB channels, we want a lossy method |
|
// described below. But, if we have alpha, we should do something |
|
// different (and probably using RLE). If we have depth, or velocity, |
|
// or something else, just fall back to ZIP. The rules for deciding |
|
// which strategy to use are setup in initializeDefaultChannelRules(). |
|
// When writing a file, the relevant rules needed to decode are written |
|
// into the start of the data block, making a self-contained file. |
|
// If initializeDefaultChannelRules() doesn't quite suit your naming
|
// conventions, you can adjust the rules without breaking decoder |
|
// compatibility.
|
// |
|
// If we're going to lossy compress R, G, or B channels, it's easier |
|
// to toss bits in a more perceptual uniform space. One could argue |
|
// at length as to what constitutes perceptually uniform, especially
|
// when storing either scene/input/focal plane referred and output referred |
|
// data. |
|
// |
|
// We'll compromise. For values <= 1, we use a traditional power function |
|
// (without any of that straight-line business at the bottom). For values > 1, |
|
// we want something more like a log function, since power functions blow |
|
// up. At 1, we want a smooth blend between the functions. So, we use a |
|
// piecewise function that does just that - see dwaLookups.cpp for |
|
// a little more detail. |
|
// |
|
// Also, if we find that we have R, G, and B channels from the same layer, |
|
// we can get a bit more compression efficiency by transforming to a Y'CbCr |
|
// space. We use the 709 transform, but with Cb,Cr = 0 for an input of |
|
// (0, 0, 0), instead of the traditional Cb,Cr = .5. Shifting the zero point |
|
// makes no sense with large range data. Transforms are done to/from
// the perceptual space data, not the linear-light space data (R'G'B' ->
// Y'CbCr, not RGB -> YCbCr).
|
// |
|
// Next, we forward DCT the data. This is done with a floating |
|
// point DCT, as we don't really have control over the src range. The |
|
// resulting values are dropped to half-float precision. |
|
// |
|
// Now, we need to quantize. Quantization departs from the usual way |
|
// of dividing and rounding. Instead, we start with some floating |
|
// point "base-error" value. From this, we can derive quantization |
|
// error for each DCT component. Take the standard JPEG quantization |
|
// tables and normalize them by the smallest value. Then, multiply |
|
// the normalized quant tables by our base-error value. This gives |
|
// a range of errors for each DCT component. |
|
// |
|
// For each DCT component, we want to find a quantized value that |
|
// is within +- the per-component error. Pick the quantized value |
|
// that has the fewest bits set in its' binary representation. |
|
// Brute-forcing the search would make for extremely inefficient
// compression. Fortunately, we can precompute a table to assist
|
// with this search. |
|
// |
|
// For each 16-bit float value, there are at most 15 other values with |
|
// fewer bits set. We can precompute these values in a compact form, since |
|
// many source values have far fewer than 15 possible quantized values.
|
// Now, instead of searching the entire range +- the component error, |
|
// we can just search at most 15 quantization candidates. The search can |
|
// be accelerated a bit more by sorting the candidates by the |
|
// number of bits set, in increasing order. Then, the search can stop |
|
// once a candidate is found w/i the per-component quantization |
|
// error range. |
|
// |
|
// The quantization strategy has the side-benefit that there is no |
|
// de-quantization step upon decode, so we don't bother recording |
|
// the quantization table. |
|
// |
|
// Ok. So we now have quantized values. Time for entropy coding. We |
|
// can use either static Huffman or zlib/DEFLATE. The static Huffman |
|
// is more efficient at compacting data, but can have a greater |
|
// overhead, especially for smaller tile/strip sizes. |
|
// |
|
// There is some additional fun, like ZIP compressing the DC components |
|
// instead of Huffman/zlib, which helps make things slightly smaller. |
|
// |
|
// Compression level is controlled by setting an int/float/double attribute |
|
// on the header named "dwaCompressionLevel". This is a thinly veiled name for |
|
// the "base-error" value mentioned above. The "base-error" is just |
|
// dwaCompressionLevel / 100000. The default value of 45.0 is generally |
|
// pretty good at generating "visually lossless" values at reasonable |
|
// data rates. Setting dwaCompressionLevel to 0 should result in no additional |
|
// quantization at the quantization stage (though there may be |
|
// quantization in practice at the CSC/DCT steps). But if you really |
|
// want lossless compression, there are plenty of other choices
|
// of compressors ;) |
|
// |
|
// When dealing with FLOAT source buffers, we first quantize the source |
|
// to HALF and continue down as we would for HALF source. |
|
// |
|
//--------------------------------------------------- |
|
|
|
|
|
#include "ImfDwaCompressor.h" |
|
#include "ImfDwaCompressorSimd.h" |
|
|
|
#include "ImfChannelList.h" |
|
#include "ImfStandardAttributes.h" |
|
#include "ImfHeader.h" |
|
#include "ImfHuf.h" |
|
#include "ImfInt64.h" |
|
#include "ImfIntAttribute.h" |
|
#include "ImfIO.h" |
|
#include "ImfMisc.h" |
|
#include "ImfNamespace.h" |
|
#include "ImfRle.h" |
|
#include "ImfSimd.h" |
|
#include "ImfSystemSpecific.h" |
|
#include "ImfXdr.h" |
|
#include "ImfZip.h" |
|
|
|
#include "ImathFun.h" |
|
#include "ImathBox.h" |
|
#include "ImathVec.h" |
|
#include "half.h" |
|
#include "halfLimits.h" |
|
|
|
#include "dwaLookups.h" |
|
|
|
#include <vector> |
|
#include <string> |
|
#include <cctype> |
|
#include <cassert> |
|
#include <algorithm> |
|
|
|
// Windows specific addition to prevent the indirect import of the redefined min/max macros |
|
#if defined _WIN32 || defined _WIN64 |
|
#ifdef NOMINMAX |
|
#undef NOMINMAX |
|
#endif |
|
#define NOMINMAX |
|
#endif |
|
#include <zlib.h> |
|
|
|
|
|
OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_ENTER |
|
|
|
|
|
namespace {

//
// Function pointer to dispatch to an appropriate
// convertFloatToHalf64_* impl, based on runtime cpu checking.
// Should be initialized in DwaCompressor::initializeFuncs()
// (the scalar versions below are the safe defaults).
//

void (*convertFloatToHalf64)(unsigned short*, float*) =
    convertFloatToHalf64_scalar;

//
// Function pointer for dispatching a fromHalfZigZag_ impl
// (un-zig-zags an 8x8 block while converting half -> float)
//

void (*fromHalfZigZag)(unsigned short*, float*) =
    fromHalfZigZag_scalar;

//
// Dispatch the inverse DCT on an 8x8 block, where the last
// n rows can be all zeros. The n=0 case converts the full block.
//
void (*dctInverse8x8_0)(float*) = dctInverse8x8_scalar<0>;
void (*dctInverse8x8_1)(float*) = dctInverse8x8_scalar<1>;
void (*dctInverse8x8_2)(float*) = dctInverse8x8_scalar<2>;
void (*dctInverse8x8_3)(float*) = dctInverse8x8_scalar<3>;
void (*dctInverse8x8_4)(float*) = dctInverse8x8_scalar<4>;
void (*dctInverse8x8_5)(float*) = dctInverse8x8_scalar<5>;
void (*dctInverse8x8_6)(float*) = dctInverse8x8_scalar<6>;
void (*dctInverse8x8_7)(float*) = dctInverse8x8_scalar<7>;

} // namespace
|
|
|
|
|
//
// Per-channel bookkeeping used while compressing / uncompressing:
// which scheme the channel rules selected, the channel's geometry,
// and scratch buffers for planar (de-interleaved) data.
//
struct DwaCompressor::ChannelData
{
    std::string name;              // full channel name from the header
    CompressorScheme compression;  // scheme chosen by the channel rules
    int xSampling;                 // x/y subsampling factors
    int ySampling;
    PixelType type;                // pixel type of the source data
    bool pLinear;                  // channel's pLinear flag -- presumably the
                                   // perceptual-linearity hint; confirm at caller

    int width;                     // dimensions of this channel's data
    int height;

    //
    // Incoming and outgoing data is scanline interleaved, and it's much
    // easier to operate on contiguous data.  Assuming the planar unc
    // buffer is to hold RLE data, we need to rearrange to make bytes
    // adjacent.
    //

    char *planarUncBuffer;         // planar staging buffer (non-owning here)
    char *planarUncBufferEnd;      // write cursor / end of valid data

    char *planarUncRle[4];         // per-byte-plane pointers for RLE staging
    char *planarUncRleEnd[4];      // matching write cursors

    PixelType planarUncType;       // type of the data held in planarUncBuffer
    int planarUncSize;             // bytes per element of planarUncType
};
|
|
|
|
|
//
// A triple of channel indices that will be color-space converted
// together -- presumably R, G, B in that order; the sets are built
// elsewhere in this file.
//
struct DwaCompressor::CscChannelSet
{
    int idx[3];
};
|
|
|
|
|
struct DwaCompressor::Classifier |
|
{ |
|
Classifier (std::string suffix, |
|
CompressorScheme scheme, |
|
PixelType type, |
|
int cscIdx, |
|
bool caseInsensitive): |
|
_suffix(suffix), |
|
_scheme(scheme), |
|
_type(type), |
|
_cscIdx(cscIdx), |
|
_caseInsensitive(caseInsensitive) |
|
{ |
|
if (caseInsensitive) |
|
std::transform(_suffix.begin(), _suffix.end(), _suffix.begin(), tolower); |
|
} |
|
|
|
Classifier (const char *&ptr, int size) |
|
{ |
|
if (size <= 0) |
|
throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data" |
|
" (truncated rule)."); |
|
|
|
{ |
|
char suffix[Name::SIZE]; |
|
memset (suffix, 0, Name::SIZE); |
|
Xdr::read<CharPtrIO> (ptr, std::min(size, Name::SIZE-1), suffix); |
|
_suffix = std::string(suffix); |
|
} |
|
|
|
if (size < _suffix.length() + 1 + 2*Xdr::size<char>()) |
|
throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data" |
|
" (truncated rule)."); |
|
|
|
char value; |
|
Xdr::read<CharPtrIO> (ptr, value); |
|
|
|
_cscIdx = (int)(value >> 4) - 1; |
|
if (_cscIdx < -1 || _cscIdx >= 3) |
|
throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data" |
|
" (corrupt cscIdx rule)."); |
|
|
|
_scheme = (CompressorScheme)((value >> 2) & 3); |
|
if (_scheme < 0 || _scheme >= NUM_COMPRESSOR_SCHEMES) |
|
throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data" |
|
" (corrupt scheme rule)."); |
|
|
|
_caseInsensitive = (value & 1 ? true : false); |
|
|
|
Xdr::read<CharPtrIO> (ptr, value); |
|
if (value < 0 || value >= NUM_PIXELTYPES) |
|
throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data" |
|
" (corrupt rule)."); |
|
_type = (PixelType)value; |
|
} |
|
|
|
bool match (const std::string &suffix, const PixelType type) const |
|
{ |
|
if (_type != type) return false; |
|
|
|
if (_caseInsensitive) |
|
{ |
|
std::string tmp(suffix); |
|
std::transform(tmp.begin(), tmp.end(), tmp.begin(), tolower); |
|
return tmp == _suffix; |
|
} |
|
|
|
return suffix == _suffix; |
|
} |
|
|
|
size_t size () const |
|
{ |
|
// string length + \0 |
|
size_t sizeBytes = _suffix.length() + 1; |
|
|
|
// 1 byte for scheme / cscIdx / caseInsensitive, and 1 byte for type |
|
sizeBytes += 2 * Xdr::size<char>(); |
|
|
|
return sizeBytes; |
|
} |
|
|
|
void write (char *&ptr) const |
|
{ |
|
Xdr::write<CharPtrIO> (ptr, _suffix.c_str()); |
|
|
|
// Encode _cscIdx (-1-3) in the upper 4 bits, |
|
// _scheme (0-2) in the next 2 bits |
|
// _caseInsen in the bottom bit |
|
unsigned char value = 0; |
|
value |= ((unsigned char)(_cscIdx+1) & 15) << 4; |
|
value |= ((unsigned char)_scheme & 3) << 2; |
|
value |= (unsigned char)_caseInsensitive & 1; |
|
|
|
Xdr::write<CharPtrIO> (ptr, value); |
|
Xdr::write<CharPtrIO> (ptr, (unsigned char)_type); |
|
} |
|
|
|
std::string _suffix; |
|
CompressorScheme _scheme; |
|
PixelType _type; |
|
int _cscIdx; |
|
bool _caseInsensitive; |
|
}; |
|
|
|
|
|
// |
|
// Base class for the LOSSY_DCT decoder classes |
|
// |
|
|
|
//
// Base class for the LOSSY_DCT decoders: owns the shared state and
// the decode loop (execute()); subclasses only select which row
// pointers / pixel types feed the 1- or 3-channel paths.
//
class DwaCompressor::LossyDctDecoderBase
{
  public:

    //
    // packedAc / packedDc point at the entropy-decoded AC and DC
    // streams; toLinear is a half->half LUT applied on output
    // (may be NULL for identity).
    //

    LossyDctDecoderBase
        (char *packedAc,
         char *packedDc,
         const unsigned short *toLinear,
         int width,
         int height);

    virtual ~LossyDctDecoderBase ();

    // Run the full decode; results are written through _rowPtrs.
    void execute();

    //
    // These return number of items, not bytes. Each item
    // is an unsigned short
    //

    int numAcValuesEncoded() const { return _packedAcCount; }
    int numDcValuesEncoded() const { return _packedDcCount; }

  protected:

    //
    // Un-RLE the packed AC components into
    // a half buffer. The half block should
    // be the full 8x8 block (in zig-zag order
    // still), not the first AC component.
    //
    // currAcComp is advanced as bytes are decoded.
    //
    // This returns the index of the last non-zero
    // value in the buffer - with the index into zig zag
    // order data. If we return 0, we have DC only data.
    //

    int unRleAc (unsigned short *&currAcComp,
                 unsigned short *halfZigBlock);

    //
    // if NATIVE and XDR are really the same values, we can
    // skip some processing and speed things along
    //

    bool _isNativeXdr;

    //
    // Counts of how many items have been packed into the
    // AC and DC buffers
    //

    int _packedAcCount;
    int _packedDcCount;

    //
    // AC and DC buffers to pack
    //

    char *_packedAc;
    char *_packedDc;

    //
    // half -> half LUT to transform from nonlinear to linear
    //

    const unsigned short *_toLinear;

    //
    // image dimensions
    //

    int _width;
    int _height;

    //
    // Pointers to the start of each scanlines, to be filled on decode
    // Generally, these will be filled by the subclasses.
    //

    std::vector< std::vector<char *> > _rowPtrs;

    //
    // The type of each data that _rowPtrs[i] is referring. Layout
    // is in the same order as _rowPtrs[].
    //

    std::vector<PixelType> _type;

    // Per-channel 8x8 float scratch blocks for the inverse DCT
    std::vector<SimdAlignedBuffer64f> _dctData;
};
|
|
|
|
|
// |
|
// Used to decode a single channel of LOSSY_DCT data. |
|
// |
|
|
|
//
// Decoder for a single channel of LOSSY_DCT data.  All of the real
// work happens in LossyDctDecoderBase::execute(); this subclass just
// registers one set of row pointers and one pixel type.
//
class DwaCompressor::LossyDctDecoder: public LossyDctDecoderBase
{
  public:

    //
    // toLinear is a half-float LUT used to map the encoded values
    // back to linear light; pass NULL to skip that step.
    //

    LossyDctDecoder
        (std::vector<char *> &rowPtrs,
         char *packedAc,
         char *packedDc,
         const unsigned short *toLinear,
         int width,
         int height,
         PixelType type)
    :
        LossyDctDecoderBase(packedAc, packedDc, toLinear, width, height)
    {
        // Register the single channel with the base class.
        _type.push_back(type);
        _rowPtrs.push_back(rowPtrs);
    }

    virtual ~LossyDctDecoder () {}
};
|
|
|
|
|
// |
|
// Used to decode 3 channels of LOSSY_DCT data that |
|
// are grouped together and color space converted. |
|
// |
|
|
|
//
// Decoder for three channels of LOSSY_DCT data that were grouped
// together and color-space converted on encode.  The base class
// handles the inverse CSC once three channels are registered.
//
class DwaCompressor::LossyDctDecoderCsc: public LossyDctDecoderBase
{
  public:

    //
    // toLinear is a half-float LUT used to map the encoded values
    // back to linear light; pass NULL to skip that step.
    //

    LossyDctDecoderCsc
        (std::vector<char *> &rowPtrsR,
         std::vector<char *> &rowPtrsG,
         std::vector<char *> &rowPtrsB,
         char *packedAc,
         char *packedDc,
         const unsigned short *toLinear,
         int width,
         int height,
         PixelType typeR,
         PixelType typeG,
         PixelType typeB)
    :
        LossyDctDecoderBase(packedAc, packedDc, toLinear, width, height)
    {
        // Register the three channels in R, G, B order; the types
        // must line up index-for-index with the row pointers.
        _type.push_back(typeR);
        _type.push_back(typeG);
        _type.push_back(typeB);

        _rowPtrs.push_back(rowPtrsR);
        _rowPtrs.push_back(rowPtrsG);
        _rowPtrs.push_back(rowPtrsB);
    }

    virtual ~LossyDctDecoderCsc () {}
};
|
|
|
|
|
// |
|
// Base class for encoding using the lossy DCT scheme |
|
// |
|
|
|
//
// Base class for encoding using the lossy DCT scheme: forward DCT,
// quantization against the normalized JPEG tables, zig-zag ordering
// and RLE of the AC components.
//
class DwaCompressor::LossyDctEncoderBase
{
  public:

    //
    // quantBaseError is the "base-error" derived from
    // dwaCompressionLevel; packedAc / packedDc receive the encoded
    // AC and DC streams; toNonlinear is a half->half LUT applied on
    // input (may be NULL).
    //

    LossyDctEncoderBase
        (float quantBaseError,
         char *packedAc,
         char *packedDc,
         const unsigned short *toNonlinear,
         int width,
         int height);

    virtual ~LossyDctEncoderBase ();

    // Run the full encode over the registered channels.
    void execute ();

    //
    // These return number of items, not bytes. Each item
    // is an unsigned short
    //

    int numAcValuesEncoded () const {return _numAcComp;}
    int numDcValuesEncoded () const {return _numDcComp;}

  protected:

    // Reorder an 8x8 block from row-major into zig-zag order
    void toZigZag (half *dst, half *src);

    // Population count of a 16-bit value (used to rank quantization
    // candidates by number of set bits)
    int countSetBits (unsigned short src);

    // Find the representable half within +- errorTolerance of src
    // that has the fewest bits set
    half quantize (half src, float errorTolerance);

    // RLE the AC components of a block into the AC stream
    void rleAc (half *block, unsigned short *&acPtr);

    float _quantBaseError;

    int _width,
        _height;
    const unsigned short *_toNonlinear;

    // Running totals of AC / DC items emitted so far
    int _numAcComp,
        _numDcComp;

    std::vector< std::vector<const char *> > _rowPtrs;
    std::vector<PixelType> _type;
    std::vector<SimdAlignedBuffer64f> _dctData;

    //
    // Pointers to the buffers where AC and DC
    // DCT components should be packed for
    // lossless compression downstream
    //

    char *_packedAc;
    char *_packedDc;

    //
    // Our "quantization tables" - the example JPEG tables,
    // normalized so that the smallest value in each is 1.0.
    // This gives us a relationship between error in DCT
    // components
    //

    float _quantTableY[64];
    float _quantTableCbCr[64];
};
|
|
|
|
|
|
|
// |
|
// Single channel lossy DCT encoder |
|
// |
|
|
|
//
// Single-channel lossy DCT encoder.  The heavy lifting lives in
// LossyDctEncoderBase::execute(); this subclass only registers one
// set of row pointers and one pixel type.
//
class DwaCompressor::LossyDctEncoder: public LossyDctEncoderBase
{
  public:

    LossyDctEncoder
        (float quantBaseError,
         std::vector<const char *> &rowPtrs,
         char *packedAc,
         char *packedDc,
         const unsigned short *toNonlinear,
         int width,
         int height,
         PixelType type)
    :
        LossyDctEncoderBase
            (quantBaseError, packedAc, packedDc, toNonlinear, width, height)
    {
        // Register the single channel with the base class.
        _type.push_back(type);
        _rowPtrs.push_back(rowPtrs);
    }

    virtual ~LossyDctEncoder () {}
};
|
|
|
|
|
// |
|
// RGB channel lossy DCT encoder |
|
// |
|
|
|
//
// Lossy DCT encoder for an R, G, B channel triple that is color
// space converted before the DCT.  The base class applies the CSC
// once three channels are registered.
//
class DwaCompressor::LossyDctEncoderCsc: public LossyDctEncoderBase
{
  public:

    LossyDctEncoderCsc
        (float quantBaseError,
         std::vector<const char *> &rowPtrsR,
         std::vector<const char *> &rowPtrsG,
         std::vector<const char *> &rowPtrsB,
         char *packedAc,
         char *packedDc,
         const unsigned short *toNonlinear,
         int width,
         int height,
         PixelType typeR,
         PixelType typeG,
         PixelType typeB)
    :
        LossyDctEncoderBase
            (quantBaseError, packedAc, packedDc, toNonlinear, width, height)
    {
        // Register the three channels in R, G, B order; the types
        // must line up index-for-index with the row pointers.
        _rowPtrs.push_back(rowPtrsR);
        _rowPtrs.push_back(rowPtrsG);
        _rowPtrs.push_back(rowPtrsB);

        _type.push_back(typeR);
        _type.push_back(typeG);
        _type.push_back(typeB);
    }

    virtual ~LossyDctEncoderCsc () {}
};
|
|
|
|
|
// ============================================================== |
|
// |
|
// LossyDctDecoderBase |
|
// |
|
// -------------------------------------------------------------- |
|
|
|
//
// Construct a decoder over the packed AC / DC streams for a
// width x height region.  toLinear may be NULL, in which case an
// identity (no-op) LUT is substituted.
//

DwaCompressor::LossyDctDecoderBase::LossyDctDecoderBase
    (char *packedAc,
     char *packedDc,
     const unsigned short *toLinear,
     int width,
     int height)
:
    _isNativeXdr(false),
    _packedAcCount(0),
    _packedDcCount(0),
    _packedAc(packedAc),
    _packedDc(packedDc),
    _toLinear(toLinear),
    _width(width),
    _height(height)
{
    if (_toLinear == 0)
        _toLinear = get_dwaCompressorNoOp();

    //
    // NOTE(review): the packed DWA streams apparently match the
    // native byte order on little-endian machines, so the byte-swap
    // path in execute() can be skipped there -- confirm against the
    // Xdr conventions used on the encode side.
    //

    _isNativeXdr = GLOBAL_SYSTEM_LITTLE_ENDIAN;
}
|
|
|
|
|
// Nothing to release: all pointer members are non-owning.
DwaCompressor::LossyDctDecoderBase::~LossyDctDecoderBase () {}
|
|
|
|
|
//
// Decode the packed AC and DC streams back into the scanlines
// referenced by _rowPtrs: for every 8x8 block, un-RLE the AC
// components, inverse-DCT, undo the 709 color-space conversion
// (3-channel case only), push the results through the _toLinear
// LUT, and finally re-expand HALF data to FLOAT where needed.
//

void
DwaCompressor::LossyDctDecoderBase::execute ()
{
    int numComp = _rowPtrs.size();
    int lastNonZero = 0;
    int numBlocksX = (int) ceil ((float)_width / 8.0f);
    int numBlocksY = (int) ceil ((float)_height / 8.0f);
    int leftoverX = _width - (numBlocksX-1) * 8;   // width of rightmost (possibly partial) block column
    int leftoverY = _height - (numBlocksY-1) * 8;  // height of bottom (possibly partial) block row

    int numFullBlocksX = (int)floor ((float)_width / 8.0f);

    // Scratch used when byte-swapping XDR data in place

    unsigned short tmpShortNative = 0;
    unsigned short tmpShortXdr = 0;
    const char *tmpConstCharPtr = 0;

    unsigned short *currAcComp = (unsigned short *)_packedAc;
    std::vector<unsigned short *> currDcComp (_rowPtrs.size());
    std::vector<SimdAlignedBuffer64us> halfZigBlock (_rowPtrs.size());

    if (_type.size() != _rowPtrs.size())
        throw IEX_NAMESPACE::BaseExc ("Row pointers and types mismatch in count");

    if ((_rowPtrs.size() != 3) && (_rowPtrs.size() != 1))
        throw IEX_NAMESPACE::NoImplExc ("Only 1 and 3 channel encoding is supported");

    _dctData.resize(numComp);

    //
    // Allocate a temp aligned buffer to hold a rows worth of full
    // 8x8 half-float blocks
    //
    // NOTE(review): rowBlockHandle is not released if an exception is
    // thrown before the delete[] at the end of this function.
    //

    unsigned char *rowBlockHandle = new unsigned char
        [numComp * numBlocksX * 64 * sizeof(unsigned short) + _SSE_ALIGNMENT];

    unsigned short *rowBlock[3];

    rowBlock[0] = (unsigned short*)rowBlockHandle;

    //
    // Advance rowBlock[0] to the first _SSE_ALIGNMENT-aligned address
    // inside the over-allocated handle
    //

    for (int i = 0; i < _SSE_ALIGNMENT; ++i)
    {
        if (((size_t)(rowBlockHandle + i) & _SSE_ALIGNMENT_MASK) == 0)
            rowBlock[0] = (unsigned short *)(rowBlockHandle + i);
    }

    for (int comp = 1; comp < numComp; ++comp)
        rowBlock[comp] = rowBlock[comp - 1] + numBlocksX * 64;

    //
    // Pack DC components together by common plane, so we can get
    // a little more out of differencing them. We'll always have
    // one component per block, so we can compute offsets.
    //

    currDcComp[0] = (unsigned short *)_packedDc;

    for (unsigned int comp = 1; comp < numComp; ++comp)
        currDcComp[comp] = currDcComp[comp - 1] + numBlocksX * numBlocksY;

    for (int blocky = 0; blocky < numBlocksY; ++blocky)
    {
        int maxY = 8;

        if (blocky == numBlocksY-1)
            maxY = leftoverY;

        int maxX = 8;

        for (int blockx = 0; blockx < numBlocksX; ++blockx)
        {
            if (blockx == numBlocksX-1)
                maxX = leftoverX;

            //
            // If we can detect that the block is constant values
            // (all components only have DC values, and all AC is 0),
            // we can do everything only on 1 value, instead of all
            // 64.
            //
            // This won't really help for regular images, but it is
            // meant more for layers with large swaths of black
            //

            bool blockIsConstant = true;

            for (unsigned int comp = 0; comp < numComp; ++comp)
            {

                //
                // DC component is stored separately; seed the zeroed
                // zig-zag block with it
                //

#ifdef IMF_HAVE_SSE2
                {
                    __m128i *dst = (__m128i*)halfZigBlock[comp]._buffer;

                    dst[7] = _mm_setzero_si128();
                    dst[6] = _mm_setzero_si128();
                    dst[5] = _mm_setzero_si128();
                    dst[4] = _mm_setzero_si128();
                    dst[3] = _mm_setzero_si128();
                    dst[2] = _mm_setzero_si128();
                    dst[1] = _mm_setzero_si128();
                    dst[0] = _mm_insert_epi16
                        (_mm_setzero_si128(), *currDcComp[comp]++, 0);
                }
#else  /* IMF_HAVE_SSE2 */

                memset (halfZigBlock[comp]._buffer, 0, 64 * 2);
                halfZigBlock[comp]._buffer[0] = *currDcComp[comp]++;

#endif /* IMF_HAVE_SSE2 */

                _packedDcCount++;

                //
                // UnRLE the AC. This will modify currAcComp
                //

                lastNonZero = unRleAc (currAcComp, halfZigBlock[comp]._buffer);

                //
                // Convert from XDR to NATIVE
                //

                if (!_isNativeXdr)
                {
                    for (int i = 0; i < 64; ++i)
                    {
                        tmpShortXdr = halfZigBlock[comp]._buffer[i];
                        tmpConstCharPtr = (const char *)&tmpShortXdr;

                        Xdr::read<CharPtrIO> (tmpConstCharPtr, tmpShortNative);

                        halfZigBlock[comp]._buffer[i] = tmpShortNative;
                    }
                }

                if (lastNonZero == 0)
                {
                    //
                    // DC only case - AC components are all 0
                    //

                    half h;

                    h.setBits (halfZigBlock[comp]._buffer[0]);
                    _dctData[comp]._buffer[0] = (float)h;

                    dctInverse8x8DcOnly (_dctData[comp]._buffer);
                }
                else
                {
                    //
                    // We have some AC components that are non-zero.
                    // Can't use the 'constant block' optimization
                    //

                    blockIsConstant = false;

                    //
                    // Un-Zig zag
                    //

                    (*fromHalfZigZag)
                        (halfZigBlock[comp]._buffer, _dctData[comp]._buffer);

                    //
                    // Zig-Zag indices in normal layout are as follows:
                    //
                    // 0   1   5   6   14  15  27  28
                    // 2   4   7   13  16  26  29  42
                    // 3   8   12  17  25  30  41  43
                    // 9   11  18  24  31  40  44  53
                    // 10  19  23  32  39  45  52  54
                    // 20  22  33  38  46  51  55  60
                    // 21  34  37  47  50  56  59  61
                    // 35  36  48  49  57  58  62  63
                    //
                    // If lastNonZero is less than the first item on
                    // each row, we know that the whole row is zero and
                    // can be skipped in the row-oriented part of the
                    // iDCT.
                    //
                    // The unrolled logic here is:
                    //
                    //    if lastNonZero < rowStartIdx[i],
                    //        zeroedRows = rowsEmpty[i]
                    //
                    // where:
                    //
                    //    const int rowStartIdx[] = {2, 3, 9, 10, 20, 21, 35};
                    //    const int rowsEmpty[]   = {7, 6, 5,  4,  3,  2,  1};
                    //

                    if (lastNonZero < 2)
                        dctInverse8x8_7(_dctData[comp]._buffer);
                    else if (lastNonZero < 3)
                        dctInverse8x8_6(_dctData[comp]._buffer);
                    else if (lastNonZero < 9)
                        dctInverse8x8_5(_dctData[comp]._buffer);
                    else if (lastNonZero < 10)
                        dctInverse8x8_4(_dctData[comp]._buffer);
                    else if (lastNonZero < 20)
                        dctInverse8x8_3(_dctData[comp]._buffer);
                    else if (lastNonZero < 21)
                        dctInverse8x8_2(_dctData[comp]._buffer);
                    else if (lastNonZero < 35)
                        dctInverse8x8_1(_dctData[comp]._buffer);
                    else
                        dctInverse8x8_0(_dctData[comp]._buffer);
                }
            }

            //
            // Perform the CSC (3-channel case only)
            //

            if (numComp == 3)
            {
                if (!blockIsConstant)
                {
                    csc709Inverse64 (_dctData[0]._buffer,
                                     _dctData[1]._buffer,
                                     _dctData[2]._buffer);

                }
                else
                {
                    // Constant block: converting pixel 0 is enough

                    csc709Inverse (_dctData[0]._buffer[0],
                                   _dctData[1]._buffer[0],
                                   _dctData[2]._buffer[0]);
                }
            }

            //
            // Float -> Half conversion.
            //
            // If the block has a constant value, just convert the first pixel.
            //

            for (unsigned int comp = 0; comp < numComp; ++comp)
            {
                if (!blockIsConstant)
                {
                    (*convertFloatToHalf64)
                        (&rowBlock[comp][blockx*64], _dctData[comp]._buffer);
                }
                else
                {
#ifdef IMF_HAVE_SSE2

                    // Broadcast the single half value across all 64 slots

                    __m128i *dst = (__m128i*)&rowBlock[comp][blockx*64];

                    dst[0] = _mm_set1_epi16
                        (((half)_dctData[comp]._buffer[0]).bits());

                    dst[1] = dst[0];
                    dst[2] = dst[0];
                    dst[3] = dst[0];
                    dst[4] = dst[0];
                    dst[5] = dst[0];
                    dst[6] = dst[0];
                    dst[7] = dst[0];

#else  /* IMF_HAVE_SSE2 */

                    unsigned short *dst = &rowBlock[comp][blockx*64];

                    dst[0] = ((half)_dctData[comp]._buffer[0]).bits();

                    for (int i = 1; i < 64; ++i)
                    {
                        dst[i] = dst[0];
                    }

#endif /* IMF_HAVE_SSE2 */
                } // blockIsConstant
            } // comp
        } // blockx

        //
        // At this point, we have half-float nonlinear value blocked
        // in rowBlock[][]. We need to unblock the data, transfer
        // back to linear, and write the results in the _rowPtrs[].
        //
        // There is a fast-path for aligned rows, which helps
        // things a little. Since this fast path is only valid
        // for full 8-element wide blocks, the partial x blocks
        // are broken into a separate loop below.
        //
        // At the moment, the fast path requires:
        //   * sse support
        //   * aligned row pointers
        //   * full 8-element wide blocks
        //

        for (int comp = 0; comp < numComp; ++comp)
        {
            //
            // Test if we can use the fast path
            //

#ifdef IMF_HAVE_SSE2

            bool fastPath = true;

            for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
            {
                if ((size_t)_rowPtrs[comp][y] & _SSE_ALIGNMENT_MASK)
                    fastPath = false;
            }

            if (fastPath)
            {
                //
                // Handle all the full X blocks, in a fast path with sse2 and
                // aligned row pointers
                //

                for (int y=8*blocky; y<8*blocky+maxY; ++y)
                {
                    __m128i *dst = (__m128i *)_rowPtrs[comp][y];
                    __m128i *src = (__m128i *)&rowBlock[comp][(y & 0x7) * 8];

                    for (int blockx = 0; blockx < numFullBlocksX; ++blockx)
                    {
                        //
                        // These may need some twiddling.
                        // Run with multiples of 8
                        //

                        _mm_prefetch ((char *)(src + 16), _MM_HINT_NTA);

                        unsigned short i0 = _mm_extract_epi16 (*src, 0);
                        unsigned short i1 = _mm_extract_epi16 (*src, 1);
                        unsigned short i2 = _mm_extract_epi16 (*src, 2);
                        unsigned short i3 = _mm_extract_epi16 (*src, 3);

                        unsigned short i4 = _mm_extract_epi16 (*src, 4);
                        unsigned short i5 = _mm_extract_epi16 (*src, 5);
                        unsigned short i6 = _mm_extract_epi16 (*src, 6);
                        unsigned short i7 = _mm_extract_epi16 (*src, 7);

                        i0 = _toLinear[i0];
                        i1 = _toLinear[i1];
                        i2 = _toLinear[i2];
                        i3 = _toLinear[i3];

                        i4 = _toLinear[i4];
                        i5 = _toLinear[i5];
                        i6 = _toLinear[i6];
                        i7 = _toLinear[i7];

                        *dst = _mm_insert_epi16 (_mm_setzero_si128(), i0, 0);
                        *dst = _mm_insert_epi16 (*dst, i1, 1);
                        *dst = _mm_insert_epi16 (*dst, i2, 2);
                        *dst = _mm_insert_epi16 (*dst, i3, 3);

                        *dst = _mm_insert_epi16 (*dst, i4, 4);
                        *dst = _mm_insert_epi16 (*dst, i5, 5);
                        *dst = _mm_insert_epi16 (*dst, i6, 6);
                        *dst = _mm_insert_epi16 (*dst, i7, 7);

                        src += 8;
                        dst++;
                    }
                }
            }
            else
            {

#endif /* IMF_HAVE_SSE2 */

                //
                // Basic scalar kinda slow path for handling the full X blocks
                //

                for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
                {
                    unsigned short *dst = (unsigned short *)_rowPtrs[comp][y];

                    for (int blockx = 0; blockx < numFullBlocksX; ++blockx)
                    {
                        unsigned short *src =
                            &rowBlock[comp][blockx * 64 + ((y & 0x7) * 8)];

                        dst[0] = _toLinear[src[0]];
                        dst[1] = _toLinear[src[1]];
                        dst[2] = _toLinear[src[2]];
                        dst[3] = _toLinear[src[3]];

                        dst[4] = _toLinear[src[4]];
                        dst[5] = _toLinear[src[5]];
                        dst[6] = _toLinear[src[6]];
                        dst[7] = _toLinear[src[7]];

                        dst += 8;
                    }
                }

#ifdef IMF_HAVE_SSE2

            }

#endif /* IMF_HAVE_SSE2 */

            //
            // If we have partial X blocks, deal with all those now
            // Since this should be minimal work, there currently
            // is only one path that should work for everyone.
            //

            if (numFullBlocksX != numBlocksX)
            {
                for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
                {
                    unsigned short *src = (unsigned short *)
                        &rowBlock[comp][numFullBlocksX * 64 + ((y & 0x7) * 8)];

                    unsigned short *dst = (unsigned short *)_rowPtrs[comp][y];

                    dst += 8 * numFullBlocksX;

                    for (int x = 0; x < maxX; ++x)
                    {
                        *dst++ = _toLinear[*src++];
                    }
                }
            }
        } // comp
    } // blocky

    //
    // Walk over all the channels that are of type FLOAT.
    // Convert from HALF XDR back to FLOAT XDR.
    //

    for (unsigned int chan = 0; chan < numComp; ++chan)
    {

        if (_type[chan] != FLOAT)
            continue;

        std::vector<unsigned short> halfXdr (_width);

        for (int y=0; y<_height; ++y)
        {
            char *floatXdrPtr = _rowPtrs[chan][y];

            // Copy the row aside first: the expansion below writes
            // FLOAT data over the same bytes it would be reading

            memcpy(&halfXdr[0], floatXdrPtr, _width*sizeof(unsigned short));

            const char *halfXdrPtr = (const char *)(&halfXdr[0]);

            for (int x=0; x<_width; ++x)
            {
                half tmpHalf;

                Xdr::read<CharPtrIO> (halfXdrPtr, tmpHalf);
                Xdr::write<CharPtrIO> (floatXdrPtr, (float)tmpHalf);

                //
                // Xdr::write and Xdr::read will advance the ptrs
                //
            }
        }
    }

    delete[] rowBlockHandle;
}
|
|
|
|
|
// |
|
// Un-RLE the packed AC components into |
|
// a half buffer. The half block should |
|
// be the full 8x8 block (in zig-zag order |
|
// still), not the first AC component. |
|
// |
|
// currAcComp is advanced as bytes are decoded. |
|
// |
|
// This returns the index of the last non-zero |
|
// value in the buffer - with the index into zig zag |
|
// order data. If we return 0, we have DC only data. |
|
// |
|
// This is assuming that halfZigBlock is zeroed
|
// prior to calling |
|
// |
|
|
|
int |
|
DwaCompressor::LossyDctDecoderBase::unRleAc |
|
(unsigned short *&currAcComp, |
|
unsigned short *halfZigBlock) |
|
{ |
|
// |
|
// Un-RLE the RLE'd blocks. If we find an item whose |
|
// high byte is 0xff, then insert the number of 0's |
|
// as indicated by the low byte. |
|
// |
|
// Otherwise, just copy the number verbaitm. |
|
// |
|
|
|
int lastNonZero = 0; |
|
int dctComp = 1; |
|
|
|
// |
|
// Start with a zero'ed block, so we don't have to |
|
// write when we hit a run symbol |
|
// |
|
|
|
while (dctComp < 64) |
|
{ |
|
if (*currAcComp == 0xff00) |
|
{ |
|
// |
|
// End of block |
|
// |
|
|
|
dctComp = 64; |
|
|
|
} |
|
else if ((*currAcComp) >> 8 == 0xff) |
|
{ |
|
// |
|
// Run detected! Insert 0's. |
|
// |
|
// Since the block has been zeroed, just advance the ptr |
|
// |
|
|
|
dctComp += (*currAcComp) & 0xff; |
|
} |
|
else |
|
{ |
|
// |
|
// Not a run, just copy over the value |
|
// |
|
|
|
lastNonZero = dctComp; |
|
halfZigBlock[dctComp] = *currAcComp; |
|
|
|
dctComp++; |
|
} |
|
|
|
_packedAcCount++; |
|
currAcComp++; |
|
} |
|
|
|
return lastNonZero; |
|
} |
|
|
|
|
|
// ============================================================== |
|
// |
|
// LossyDctEncoderBase |
|
// |
|
// -------------------------------------------------------------- |
|
|
|
//
// Construct a lossy DCT encoder: set up the per-coefficient
// quantization-error tables and record the output buffers.
//
// quantBaseError  user-level quality knob; the acceptable absolute
//                 error for the most sensitive DCT coefficient
// packedAc        destination for RLE'd AC coefficients
// packedDc        destination for DC coefficients
// toNonlinear     optional linear->nonlinear LUT (may be 0)
// width, height   dimensions of the channel being encoded
//

DwaCompressor::LossyDctEncoderBase::LossyDctEncoderBase
    (float quantBaseError,
     char *packedAc,
     char *packedDc,
     const unsigned short *toNonlinear,
     int width,
     int height)
:
    _quantBaseError(quantBaseError),
    _width(width),
    _height(height),
    _toNonlinear(toNonlinear),
    _numAcComp(0),
    _numDcComp(0),
    _packedAc(packedAc),
    _packedDc(packedDc)
{
    //
    // Here, we take the generic JPEG quantization tables and
    // normalize them by the smallest component in each table.
    // This gives us a relationship amongst the DCT components,
    // in terms of how sensitive each component is to
    // error.
    //
    // A higher normalized value means we can quantize more,
    // and a small normalized value means we can quantize less.
    //
    // Eventually, we will want an acceptable quantization
    // error range for each component. We find this by
    // multiplying some user-specified level (_quantBaseError)
    // by the normalized table (_quantTableY, _quantTableCbCr) to
    // find the acceptable quantization error range.
    //
    // The quantization table is not needed for decoding, and
    // is not transmitted. So, if you want to get really fancy,
    // you could derive some content-dependent quantization
    // table, and the decoder would not need to be changed. But,
    // for now, we'll just use static quantization tables.
    //

    int jpegQuantTableY[] =
    {
        16,  11,  10,  16,   24,  40,  51,  61,
        12,  12,  14,  19,   26,  58,  60,  55,
        14,  13,  16,  24,   40,  57,  69,  56,
        14,  17,  22,  29,   51,  87,  80,  62,
        18,  22,  37,  56,   68, 109, 103,  77,
        24,  35,  55,  64,   81, 104, 113,  92,
        49,  64,  78,  87,  103, 121, 120, 101,
        72,  92,  95,  98,  112, 100, 103,  99
    };

    int jpegQuantTableYMin = 10;

    int jpegQuantTableCbCr[] =
    {
        17,  18,  24,  47,  99,  99,  99,  99,
        18,  21,  26,  66,  99,  99,  99,  99,
        24,  26,  56,  99,  99,  99,  99,  99,
        47,  66,  99,  99,  99,  99,  99,  99,
        99,  99,  99,  99,  99,  99,  99,  99,
        99,  99,  99,  99,  99,  99,  99,  99,
        99,  99,  99,  99,  99,  99,  99,  99,
        99,  99,  99,  99,  99,  99,  99,  99
    };

    int jpegQuantTableCbCrMin = 17;

    for (int idx = 0; idx < 64; ++idx)
    {
        _quantTableY[idx] = static_cast<float> (jpegQuantTableY[idx]) /
                            static_cast<float> (jpegQuantTableYMin);

        _quantTableCbCr[idx] = static_cast<float> (jpegQuantTableCbCr[idx]) /
                               static_cast<float> (jpegQuantTableCbCrMin);
    }

    //
    // Clamp a negative error tolerance to zero.  (Bug fix: the
    // original code assigned to the constructor parameter
    // `quantBaseError` here instead of the member, so a negative
    // value was silently kept in _quantBaseError.)
    //

    if (_quantBaseError < 0)
        _quantBaseError = 0;
}
|
|
|
|
|
DwaCompressor::LossyDctEncoderBase::~LossyDctEncoderBase ()
{
    // Nothing to release: the packed AC/DC buffers and the
    // nonlinear LUT are owned by the caller.
}
|
|
|
|
|
// |
|
// Given three channels of source data, encode them by first applying
|
// a color space conversion to a YCbCr space. Otherwise, if we only |
|
// have one channel, just encode it as is. |
|
// |
|
// Other numbers of channels are somewhat unexpected at this point, |
|
// and will throw an exception. |
|
// |
|
|
|
void
DwaCompressor::LossyDctEncoderBase::execute ()
{
    //
    // Pipeline: (1) convert FLOAT channels in place to HALF,
    // (2) walk the image in 8x8 blocks, optionally CSC'ing
    // RGB -> YCbCr, (3) forward DCT, quantize, zig-zag,
    // (4) store DC coefficients in _packedDc and RLE the AC
    // coefficients into _packedAc.
    //

    int numBlocksX = (int)ceil ((float)_width / 8.0f);
    int numBlocksY = (int)ceil ((float)_height/ 8.0f);

    half halfZigCoef[64];       // quantized coefficients, zig-zag order
    half halfCoef[64];          // quantized coefficients, row-major order

    std::vector<unsigned short *> currDcComp (_rowPtrs.size());
    unsigned short *currAcComp = (unsigned short *)_packedAc;

    _dctData.resize (_rowPtrs.size());
    _numAcComp = 0;
    _numDcComp = 0;

    // Only 1 channel (luma only) or 3 channels (CSC set) are supported.
    assert (_type.size() == _rowPtrs.size());
    assert ((_rowPtrs.size() == 3) || (_rowPtrs.size() == 1));

    //
    // Allocate a temp half buffer to quantize into for
    // any FLOAT source channels.
    //

    int tmpHalfBufferElements = 0;

    for (unsigned int chan = 0; chan < _rowPtrs.size(); ++chan)
        if (_type[chan] == FLOAT)
            tmpHalfBufferElements += _width * _height;

    std::vector<unsigned short> tmpHalfBuffer (tmpHalfBufferElements);

    char *tmpHalfBufferPtr = 0;

    if (tmpHalfBufferElements)
        tmpHalfBufferPtr = (char *)&tmpHalfBuffer[0];

    //
    // Run over all the float scanlines, quantizing,
    // and re-assigning _rowPtr[y]. We need to translate
    // FLOAT XDR to HALF XDR.
    //

    for (unsigned int chan = 0; chan < _rowPtrs.size(); ++chan)
    {
        if (_type[chan] != FLOAT)
            continue;

        for (int y = 0; y < _height; ++y)
        {
            float src = 0;
            const char *srcXdr = _rowPtrs[chan][y];
            char *dstXdr = tmpHalfBufferPtr;

            for (int x = 0; x < _width; ++x)
            {

                Xdr::read<CharPtrIO> (srcXdr, src);

                //
                // Clamp to half ranges, instead of just casting. This
                // avoids introducing Infs which end up getting zeroed later
                //
                src = std::max (
                    std::min ((float) std::numeric_limits<half>::max(), src),
                    (float)-std::numeric_limits<half>::max());

                Xdr::write<CharPtrIO> (dstXdr, ((half)src).bits());

                //
                // Xdr::read and Xdr::write will advance the ptr
                //
            }

            // Point this scanline at its HALF copy from now on.
            _rowPtrs[chan][y] = (const char *)tmpHalfBufferPtr;
            tmpHalfBufferPtr += _width * sizeof (unsigned short);
        }
    }

    //
    // Pack DC components together by common plane, so we can get
    // a little more out of differencing them. We'll always have
    // one component per block, so we can compute the offsets.
    //

    currDcComp[0] = (unsigned short *)_packedDc;

    for (unsigned int chan = 1; chan < _rowPtrs.size(); ++chan)
        currDcComp[chan] = currDcComp[chan-1] + numBlocksX * numBlocksY;

    for (int blocky = 0; blocky < numBlocksY; ++blocky)
    {
        for (int blockx = 0; blockx < numBlocksX; ++blockx)
        {
            half h;
            unsigned short tmpShortXdr, tmpShortNative;
            char *tmpCharPtr;

            for (unsigned int chan = 0; chan < _rowPtrs.size(); ++chan)
            {
                //
                // Break the source into 8x8 blocks. If we don't
                // fit at the edges, mirror.
                //
                // Also, convert from linear to nonlinear representation.
                // Our source is assumed to be XDR, and we need to convert
                // to NATIVE prior to converting to float.
                //
                // If we're converting linear -> nonlinear, assume that the
                // XDR -> NATIVE conversion is built into the lookup. Otherwise,
                // we'll need to explicitly do it.
                //

                for (int y = 0; y < 8; ++y)
                {
                    for (int x = 0; x < 8; ++x)
                    {
                        int vx = 8 * blockx + x;
                        int vy = 8 * blocky + y;

                        // Mirror coordinates that fall off the right/bottom
                        // edge back into the image.
                        if (vx >= _width)
                            vx = _width - (vx - (_width - 1));

                        if (vx < 0) vx = _width-1;

                        if (vy >=_height)
                            vy = _height - (vy - (_height - 1));

                        if (vy < 0) vy = _height-1;

                        tmpShortXdr =
                            ((const unsigned short *)(_rowPtrs[chan])[vy])[vx];

                        if (_toNonlinear)
                        {
                            // LUT performs XDR->NATIVE + linear->nonlinear.
                            h.setBits (_toNonlinear[tmpShortXdr]);
                        }
                        else
                        {
                            const char *tmpConstCharPtr =
                                (const char *)(&tmpShortXdr);

                            Xdr::read<CharPtrIO>
                                (tmpConstCharPtr, tmpShortNative);

                            h.setBits(tmpShortNative);
                        }

                        _dctData[chan]._buffer[y * 8 + x] = (float)h;
                    } // x
                } // y
            } // chan

            //
            // Color space conversion (RGB -> YCbCr, Rec.709 weights)
            //

            if (_rowPtrs.size() == 3)
            {
                csc709Forward64 (_dctData[0]._buffer,
                                 _dctData[1]._buffer,
                                 _dctData[2]._buffer);
            }

            for (unsigned int chan = 0; chan < _rowPtrs.size(); ++chan)
            {
                //
                // Forward DCT
                //

                dctForward8x8(_dctData[chan]._buffer);

                //
                // Quantize to half, and zigzag.  Channel 0 (luma) uses
                // the Y table; the other channels use the chroma table.
                //

                if (chan == 0)
                {
                    for (int i = 0; i < 64; ++i)
                    {
                        halfCoef[i] =
                            quantize ((half)_dctData[chan]._buffer[i],
                                      _quantBaseError*_quantTableY[i]);
                    }
                }
                else
                {
                    for (int i = 0; i < 64; ++i)
                    {
                        halfCoef[i] =
                            quantize ((half)_dctData[chan]._buffer[i],
                                      _quantBaseError*_quantTableCbCr[i]);
                    }
                }

                toZigZag (halfZigCoef, halfCoef);

                //
                // Convert from NATIVE back to XDR, before we write out
                //

                for (int i = 0; i < 64; ++i)
                {
                    tmpCharPtr = (char *)&tmpShortXdr;
                    Xdr::write<CharPtrIO>(tmpCharPtr, halfZigCoef[i].bits());
                    halfZigCoef[i].setBits(tmpShortXdr);
                }

                //
                // Save the DC component separately, to be compressed on
                // its own.
                //

                *currDcComp[chan]++ = halfZigCoef[0].bits();
                _numDcComp++;

                //
                // Then RLE the AC components (which will record the count
                // of the resulting number of items)
                //

                rleAc (halfZigCoef, currAcComp);
            } // chan
        } // blockx
    } // blocky
}
|
|
|
|
|
// |
|
// Reorder the coefficients from normal (row-major) order into zig-zag order
|
// |
|
|
|
void |
|
DwaCompressor::LossyDctEncoderBase::toZigZag (half *dst, half *src) |
|
{ |
|
const int remap[] = |
|
{ |
|
0, |
|
1, 8, |
|
16, 9, 2, |
|
3, 10, 17, 24, |
|
32, 25, 18, 11, 4, |
|
5, 12, 19, 26, 33, 40, |
|
48, 41, 34, 27, 20, 13, 6, |
|
7, 14, 21, 28, 35, 42, 49, 56, |
|
57, 50, 43, 36, 29, 22, 15, |
|
23, 30, 37, 44, 51, 58, |
|
59, 52, 45, 38, 31, |
|
39, 46, 53, 60, |
|
61, 54, 47, |
|
55, 62, |
|
63 |
|
}; |
|
|
|
for (int i=0; i<64; ++i) |
|
dst[i] = src[remap[i]]; |
|
} |
|
|
|
|
|
// |
|
// Precomputing the bit count runs faster than using |
|
// the builtin instruction, at least in one case.. |
|
// |
|
// Precomputing 8-bits is no slower than 16-bits, |
|
// and saves a fair bit of overhead.. |
|
// |
|
|
|
int |
|
DwaCompressor::LossyDctEncoderBase::countSetBits (unsigned short src) |
|
{ |
|
static const unsigned short numBitsSet[256] = |
|
{ |
|
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, |
|
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, |
|
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, |
|
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, |
|
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, |
|
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, |
|
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, |
|
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, |
|
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, |
|
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, |
|
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, |
|
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, |
|
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, |
|
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, |
|
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, |
|
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 |
|
}; |
|
|
|
return numBitsSet[src & 0xff] + numBitsSet[src >> 8]; |
|
} |
|
|
|
|
|
// |
|
// Take a DCT coefficient, as well as an acceptable error. Search |
|
// nearby values within the error tolerance, that have fewer |
|
// bits set. |
|
// |
|
// The list of candidates has been pre-computed and sorted |
|
// in order of increasing numbers of bits set. This way, we |
|
// can stop searching as soon as we find a candidate that |
|
// is within the error tolerance. |
|
// |
|
|
|
half
DwaCompressor::LossyDctEncoderBase::quantize (half src, float errorTolerance)
{
    //
    // Try to replace src with a nearby half that has fewer bits set.
    //
    // get_dwaClosest() yields candidates pre-sorted by increasing
    // number of set bits; there is one candidate per possible
    // bit count below src's own, so we examine countSetBits(src)
    // entries and return the first that is within errorTolerance.
    // If none qualifies, src is returned unchanged.
    //

    const float           srcFloat  = (float)src;
    const unsigned short *candidate = get_dwaClosest (src.bits());

    int candidatesLeft = countSetBits (src.bits());

    half probe;

    while (candidatesLeft-- > 0)
    {
        probe.setBits (*candidate++);

        if (fabs ((float)probe - srcFloat) < errorTolerance)
            return probe;
    }

    return src;
}
|
|
|
|
|
// |
|
// RLE the zig-zag of the AC components + copy over |
|
// into another tmp buffer |
|
// |
|
// Try to do a simple RLE scheme to reduce run's of 0's. This |
|
// differs from the jpeg EOB case, since EOB just indicates that |
|
// the rest of the block is zero. In our case, we have lots of |
|
// NaN symbols, which shouldn't be allowed to occur in DCT |
|
// coefficients - so we'll use them for encoding runs.
|
// |
|
// If the high byte is 0xff, then we have a run of 0's, of length |
|
// given by the low byte. For example, 0xff03 would be a run |
|
// of 3 0's, starting at the current location. |
|
// |
|
// block is our block of 64 coefficients |
|
// acPtr is a pointer to pack the RLE'd values into.
|
// |
|
// This will advance the counter, _numAcComp. |
|
// |
|
|
|
void |
|
DwaCompressor::LossyDctEncoderBase::rleAc |
|
(half *block, |
|
unsigned short *&acPtr) |
|
{ |
|
int dctComp = 1; |
|
unsigned short rleSymbol = 0x0; |
|
|
|
while (dctComp < 64) |
|
{ |
|
int runLen = 1; |
|
|
|
// |
|
// If we don't have a 0, output verbatim |
|
// |
|
|
|
if (block[dctComp].bits() != rleSymbol) |
|
{ |
|
*acPtr++ = block[dctComp].bits(); |
|
_numAcComp++; |
|
|
|
dctComp += runLen; |
|
continue; |
|
} |
|
|
|
// |
|
// We're sitting on a 0, so see how big the run is. |
|
// |
|
|
|
while ((dctComp+runLen < 64) && |
|
(block[dctComp+runLen].bits() == rleSymbol)) |
|
{ |
|
runLen++; |
|
} |
|
|
|
// |
|
// If the run len is too small, just output verbatim |
|
// otherwise output our run token |
|
// |
|
// Originally, we wouldn't have a separate symbol for |
|
// "end of block". But in some experimentation, it looks |
|
// like using 0xff00 for "end of block" can save a bit |
|
// of space. |
|
// |
|
|
|
if (runLen == 1) |
|
{ |
|
runLen = 1; |
|
*acPtr++ = block[dctComp].bits(); |
|
_numAcComp++; |
|
|
|
// |
|
// Using 0xff00 for "end of block" |
|
// |
|
} |
|
else if (runLen + dctComp == 64) |
|
{ |
|
// |
|
// Signal EOB |
|
// |
|
|
|
*acPtr++ = 0xff00; |
|
_numAcComp++; |
|
} |
|
else |
|
{ |
|
// |
|
// Signal normal run |
|
// |
|
|
|
*acPtr++ = 0xff00 | runLen; |
|
_numAcComp++; |
|
} |
|
|
|
// |
|
// Advance by runLen |
|
// |
|
|
|
dctComp += runLen; |
|
} |
|
} |
|
|
|
|
|
// ============================================================== |
|
// |
|
// DwaCompressor |
|
// |
|
// -------------------------------------------------------------- |
|
|
|
// |
|
// DwaCompressor() |
|
// |
|
|
|
DwaCompressor::DwaCompressor
    (const Header &hdr,
     int maxScanLineSize,
     int numScanLines,
     AcCompression acCompression)
:
    Compressor(hdr),
    _acCompression(acCompression),
    _maxScanLineSize(maxScanLineSize),
    _numScanLines(numScanLines),
    _channels(hdr.channels()),
    _packedAcBuffer(0),
    _packedAcBufferSize(0),
    _packedDcBuffer(0),
    _packedDcBufferSize(0),
    _rleBuffer(0),
    _rleBufferSize(0),
    _outBuffer(0),
    _outBufferSize(0),
    _zip(0),
    _dwaCompressionLevel(45.0)      // default quality if the header has none
{
    // Cache the data window bounds from the header.
    _min[0] = hdr.dataWindow().min.x;
    _min[1] = hdr.dataWindow().min.y;
    _max[0] = hdr.dataWindow().max.x;
    _max[1] = hdr.dataWindow().max.y;

    // All per-scheme planar buffers start out unallocated; they are
    // sized lazily by initializeBuffers().
    for (int i=0; i < NUM_COMPRESSOR_SCHEMES; ++i)
    {
        _planarUncBuffer[i] = 0;
        _planarUncBufferSize[i] = 0;
    }

    //
    // Check the header for a quality attribute
    //

    if (hasDwaCompressionLevel (hdr))
        _dwaCompressionLevel = dwaCompressionLevel (hdr);
}
|
|
|
|
|
DwaCompressor::~DwaCompressor()
{
    //
    // Free every heap buffer this compressor owns.  All of these
    // pointers are either 0 or were allocated with new[] (the _zip
    // helper with plain new), so unconditional delete is safe.
    //

    delete[] _packedAcBuffer;
    delete[] _packedDcBuffer;
    delete[] _rleBuffer;
    delete[] _outBuffer;
    delete _zip;

    for (int scheme = 0; scheme < NUM_COMPRESSOR_SCHEMES; ++scheme)
        delete[] _planarUncBuffer[scheme];
}
|
|
|
|
|
int
DwaCompressor::numScanLines() const
{
    // Number of scanlines handled per compressed chunk.
    return _numScanLines;
}
|
|
|
|
|
OPENEXR_IMF_NAMESPACE::Compressor::Format
DwaCompressor::format() const
{
    //
    // On little-endian machines the uncompressed data is handed to
    // the caller in native layout; otherwise it stays XDR.
    //

    return GLOBAL_SYSTEM_LITTLE_ENDIAN ? NATIVE : XDR;
}
|
|
|
|
|
int |
|
DwaCompressor::compress |
|
(const char *inPtr, |
|
int inSize, |
|
int minY, |
|
const char *&outPtr) |
|
{ |
|
return compress |
|
(inPtr, |
|
inSize, |
|
IMATH_NAMESPACE::Box2i (IMATH_NAMESPACE::V2i (_min[0], minY), |
|
IMATH_NAMESPACE::V2i (_max[0], minY + numScanLines() - 1)), |
|
outPtr); |
|
} |
|
|
|
|
|
int
DwaCompressor::compressTile
    (const char *inPtr,
     int inSize,
     IMATH_NAMESPACE::Box2i range,
     const char *&outPtr)
{
    // Tiles use the same code path as scanline chunks, with the
    // tile's pixel range as the bounding box.
    return compress (inPtr, inSize, range, outPtr);
}
|
|
|
|
|
int
DwaCompressor::compress
    (const char *inPtr,
     int inSize,
     IMATH_NAMESPACE::Box2i range,
     const char *&outPtr)
{
    //
    // Compress one chunk: classify channels, lossy-DCT encode the
    // CSC/DCT channels, byte-plane + RLE the RLE channels, copy
    // UNKNOWN channels verbatim, then entropy-code each stream
    // (Huffman or deflate for AC, _zip for DC, deflate for RLE and
    // UNKNOWN) into _outBuffer behind a header of Int64 counters.
    //

    const char *inDataPtr = inPtr;
    char *packedAcEnd = 0;
    char *packedDcEnd = 0;
    int fileVersion = 2;  // Starting with 2, we write the channel
                          // classification rules into the file

    if (fileVersion < 2)
        initializeLegacyChannelRules();
    else
        initializeDefaultChannelRules();

    size_t outBufferSize = 0;
    initializeBuffers(outBufferSize);

    // Serialized size of the channel classification rules (version >= 2):
    // a leading unsigned short count-of-bytes plus each rule's bytes.
    unsigned short channelRuleSize = 0;
    std::vector<Classifier> channelRules;
    if (fileVersion >= 2)
    {
        relevantChannelRules(channelRules);

        channelRuleSize = Xdr::size<unsigned short>();
        for (size_t i = 0; i < channelRules.size(); ++i)
            channelRuleSize += channelRules[i].size();
    }

    //
    // Remember to allocate _outBuffer, if we haven't done so already.
    //

    outBufferSize += channelRuleSize;
    if (outBufferSize > _outBufferSize)
    {
        _outBufferSize = outBufferSize;
        if (_outBuffer != 0)
            delete[] _outBuffer;
        _outBuffer = new char[outBufferSize];
    }

    // Compressed payload starts after the counter header and the rules.
    char *outDataPtr = &_outBuffer[NUM_SIZES_SINGLE * sizeof(OPENEXR_IMF_NAMESPACE::Int64) +
                                   channelRuleSize];

    //
    // We might not be dealing with any color data, in which
    // case the AC buffer size will be 0, and dereferencing
    // a vector will not be a good thing to do.
    //

    if (_packedAcBuffer)
        packedAcEnd = _packedAcBuffer;

    if (_packedDcBuffer)
        packedDcEnd = _packedDcBuffer;

    // Pointers into the chunk-header counter slots at the front of
    // _outBuffer, indexed by the counter enum.
    #define OBIDX(x) (Int64 *)&_outBuffer[x * sizeof (Int64)]

    Int64 *version                 = OBIDX (VERSION);
    Int64 *unknownUncompressedSize = OBIDX (UNKNOWN_UNCOMPRESSED_SIZE);
    Int64 *unknownCompressedSize   = OBIDX (UNKNOWN_COMPRESSED_SIZE);
    Int64 *acCompressedSize        = OBIDX (AC_COMPRESSED_SIZE);
    Int64 *dcCompressedSize        = OBIDX (DC_COMPRESSED_SIZE);
    Int64 *rleCompressedSize       = OBIDX (RLE_COMPRESSED_SIZE);
    Int64 *rleUncompressedSize     = OBIDX (RLE_UNCOMPRESSED_SIZE);
    Int64 *rleRawSize              = OBIDX (RLE_RAW_SIZE);

    Int64 *totalAcUncompressedCount = OBIDX (AC_UNCOMPRESSED_COUNT);
    Int64 *totalDcUncompressedCount = OBIDX (DC_UNCOMPRESSED_COUNT);

    Int64 *acCompression            = OBIDX (AC_COMPRESSION);

    int minX = range.min.x;
    int maxX = std::min(range.max.x, _max[0]);
    int minY = range.min.y;
    int maxY = std::min(range.max.y, _max[1]);

    //
    // Zero all the numbers in the chunk header
    //

    memset (_outBuffer, 0, NUM_SIZES_SINGLE * sizeof (Int64));

    //
    // Setup the AC compression strategy and the version in the data block,
    // then write the relevant channel classification rules if needed
    //
    *version = fileVersion;
    *acCompression = _acCompression;

    setupChannelData (minX, minY, maxX, maxY);

    if (fileVersion >= 2)
    {
        char *writePtr = &_outBuffer[NUM_SIZES_SINGLE * sizeof(OPENEXR_IMF_NAMESPACE::Int64)];
        Xdr::write<CharPtrIO> (writePtr, channelRuleSize);

        for (size_t i = 0; i < channelRules.size(); ++i)
            channelRules[i].write(writePtr);
    }

    //
    // Determine the start of each row in the input buffer.
    // Channels are interleaved by scanline; rows absent due to
    // ySampling are skipped.
    //

    std::vector<bool> encodedChannels (_channelData.size());
    std::vector< std::vector<const char *> > rowPtrs (_channelData.size());

    for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
        encodedChannels[chan] = false;

    inDataPtr = inPtr;

    for (int y = minY; y <= maxY; ++y)
    {
        for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
        {

            ChannelData *cd = &_channelData[chan];

            if (IMATH_NAMESPACE::modp(y, cd->ySampling) != 0)
                continue;

            rowPtrs[chan].push_back(inDataPtr);
            inDataPtr += cd->width * OPENEXR_IMF_NAMESPACE::pixelTypeSize(cd->type);
        }
    }

    inDataPtr = inPtr;

    //
    // Make a pass over all our CSC sets and try to encode them first
    //

    for (unsigned int csc = 0; csc < _cscSets.size(); ++csc)
    {

        LossyDctEncoderCsc encoder
            (_dwaCompressionLevel / 100000.f,
             rowPtrs[_cscSets[csc].idx[0]],
             rowPtrs[_cscSets[csc].idx[1]],
             rowPtrs[_cscSets[csc].idx[2]],
             packedAcEnd,
             packedDcEnd,
             get_dwaCompressorToNonlinear(),
             _channelData[_cscSets[csc].idx[0]].width,
             _channelData[_cscSets[csc].idx[0]].height,
             _channelData[_cscSets[csc].idx[0]].type,
             _channelData[_cscSets[csc].idx[1]].type,
             _channelData[_cscSets[csc].idx[2]].type);

        encoder.execute();

        *totalAcUncompressedCount += encoder.numAcValuesEncoded();
        *totalDcUncompressedCount += encoder.numDcValuesEncoded();

        packedAcEnd += encoder.numAcValuesEncoded() * sizeof(unsigned short);
        packedDcEnd += encoder.numDcValuesEncoded() * sizeof(unsigned short);

        encodedChannels[_cscSets[csc].idx[0]] = true;
        encodedChannels[_cscSets[csc].idx[1]] = true;
        encodedChannels[_cscSets[csc].idx[2]] = true;
    }

    //
    // Encode the remaining (non-CSC) channels, one at a time,
    // according to their classified compression scheme.
    //

    for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
    {
        ChannelData *cd = &_channelData[chan];

        if (encodedChannels[chan])
            continue;

        switch (cd->compression)
        {
          case LOSSY_DCT:

            //
            // For LOSSY_DCT, treat this just like the CSC'd case,
            // but only operate on one channel
            //

            {
                const unsigned short *nonlinearLut = 0;

                if (!cd->pLinear)
                    nonlinearLut = get_dwaCompressorToNonlinear();

                LossyDctEncoder encoder
                    (_dwaCompressionLevel / 100000.f,
                     rowPtrs[chan],
                     packedAcEnd,
                     packedDcEnd,
                     nonlinearLut,
                     cd->width,
                     cd->height,
                     cd->type);

                encoder.execute();

                *totalAcUncompressedCount += encoder.numAcValuesEncoded();
                *totalDcUncompressedCount += encoder.numDcValuesEncoded();

                packedAcEnd +=
                    encoder.numAcValuesEncoded() * sizeof (unsigned short);

                packedDcEnd +=
                    encoder.numDcValuesEncoded() * sizeof (unsigned short);
            }

            break;

          case RLE:

            //
            // For RLE, bash the bytes up so that the first bytes of each
            // pixel are contiguous, as are the second bytes, and so on.
            //

            for (unsigned int y = 0; y < rowPtrs[chan].size(); ++y)
            {
                const char *row = rowPtrs[chan][y];

                for (int x = 0; x < cd->width; ++x)
                {
                    for (int byte = 0;
                         byte < OPENEXR_IMF_NAMESPACE::pixelTypeSize (cd->type);
                         ++byte)
                    {

                        *cd->planarUncRleEnd[byte]++ = *row++;
                    }
                }

                *rleRawSize += cd->width * OPENEXR_IMF_NAMESPACE::pixelTypeSize(cd->type);
            }

            break;

          case UNKNOWN:

            //
            // Otherwise, just copy data over verbatim
            //

            {
                int scanlineSize = cd->width * OPENEXR_IMF_NAMESPACE::pixelTypeSize(cd->type);

                for (unsigned int y = 0; y < rowPtrs[chan].size(); ++y)
                {
                    memcpy (cd->planarUncBufferEnd,
                            rowPtrs[chan][y],
                            scanlineSize);

                    cd->planarUncBufferEnd += scanlineSize;
                }

                *unknownUncompressedSize += cd->planarUncSize;
            }

            break;

          default:

            assert (false);
        }

        encodedChannels[chan] = true;
    }

    //
    // Pack the Unknown data into the output buffer first. Instead of
    // just copying it uncompressed, try zlib compression at least.
    //

    if (*unknownUncompressedSize > 0)
    {
        uLongf inSize = (uLongf)(*unknownUncompressedSize);
        uLongf outSize = compressBound (inSize);

        if (Z_OK != ::compress2 ((Bytef *)outDataPtr,
                                 &outSize,
                                 (const Bytef *)_planarUncBuffer[UNKNOWN],
                                 inSize,
                                 9))
        {
            throw IEX_NAMESPACE::BaseExc ("Data compression (zlib) failed.");
        }

        outDataPtr += outSize;
        *unknownCompressedSize = outSize;
    }

    //
    // Now, pack all the Lossy DCT coefficients into our output
    // buffer, with Huffman encoding.
    //
    // Also, record the compressed size and the number of
    // uncompressed components we have.
    //

    if (*totalAcUncompressedCount > 0)
    {
        switch (_acCompression)
        {
          case STATIC_HUFFMAN:

            *acCompressedSize = (int)
                hufCompress((unsigned short *)_packedAcBuffer,
                            (int)*totalAcUncompressedCount,
                            outDataPtr);
            break;

          case DEFLATE:

            {
                uLongf destLen = compressBound (
                    (*totalAcUncompressedCount) * sizeof (unsigned short));

                if (Z_OK != ::compress2
                        ((Bytef *)outDataPtr,
                         &destLen,
                         (Bytef *)_packedAcBuffer,
                         (uLong)(*totalAcUncompressedCount
                                 * sizeof (unsigned short)),
                         9))
                {
                    throw IEX_NAMESPACE::InputExc ("Data compression (zlib) failed.");
                }

                *acCompressedSize = destLen;
            }

            break;

          default:

            assert (false);
        }

        outDataPtr += *acCompressedSize;
    }

    //
    // Handle the DC components separately
    //

    if (*totalDcUncompressedCount > 0)
    {
        *dcCompressedSize = _zip->compress
            (_packedDcBuffer,
             (int)(*totalDcUncompressedCount) * sizeof (unsigned short),
             outDataPtr);

        outDataPtr += *dcCompressedSize;
    }

    //
    // If we have RLE data, first RLE encode it and set the uncompressed
    // size. Then, deflate the results and set the compressed size.
    //

    if (*rleRawSize > 0)
    {
        *rleUncompressedSize = rleCompress
            ((int)(*rleRawSize),
             _planarUncBuffer[RLE],
             (signed char *)_rleBuffer);

        uLongf dstLen = compressBound ((uLongf)*rleUncompressedSize);

        if (Z_OK != ::compress2
                ((Bytef *)outDataPtr,
                 &dstLen,
                 (Bytef *)_rleBuffer,
                 (uLong)(*rleUncompressedSize),
                 9))
        {
            throw IEX_NAMESPACE::BaseExc ("Error compressing RLE'd data.");
        }

        *rleCompressedSize = dstLen;
        outDataPtr += *rleCompressedSize;
    }

    //
    // Flip the counters to XDR format
    //

    for (int i = 0; i < NUM_SIZES_SINGLE; ++i)
    {
        Int64 src = *(((Int64 *)_outBuffer) + i);
        char *dst = (char *)(((Int64 *)_outBuffer) + i);

        Xdr::write<CharPtrIO> (dst, src);
    }

    //
    // We're done - compute the number of bytes we packed.
    //
    // NOTE(review): the "+ 1" reports one byte more than was written
    // between _outBuffer and outDataPtr; this looks suspicious but is
    // preserved here - confirm against the decoder/on-disk format
    // before changing it.
    //

    outPtr = _outBuffer;

    return static_cast<int>(outDataPtr - _outBuffer + 1);
}
|
|
|
|
|
int |
|
DwaCompressor::uncompress |
|
(const char *inPtr, |
|
int inSize, |
|
int minY, |
|
const char *&outPtr) |
|
{ |
|
return uncompress (inPtr, |
|
inSize, |
|
IMATH_NAMESPACE::Box2i (IMATH_NAMESPACE::V2i (_min[0], minY), |
|
IMATH_NAMESPACE::V2i (_max[0], minY + numScanLines() - 1)), |
|
outPtr); |
|
} |
|
|
|
|
|
int
DwaCompressor::uncompressTile
    (const char *inPtr,
     int inSize,
     IMATH_NAMESPACE::Box2i range,
     const char *&outPtr)
{
    // Tiles use the same code path as scanline chunks, with the
    // tile's pixel range as the bounding box.
    return uncompress (inPtr, inSize, range, outPtr);
}
|
|
|
|
|
int |
|
DwaCompressor::uncompress |
|
(const char *inPtr, |
|
int inSize, |
|
IMATH_NAMESPACE::Box2i range, |
|
const char *&outPtr) |
|
{ |
|
int minX = range.min.x; |
|
int maxX = std::min (range.max.x, _max[0]); |
|
int minY = range.min.y; |
|
int maxY = std::min (range.max.y, _max[1]); |
|
|
|
int headerSize = NUM_SIZES_SINGLE*sizeof(Int64); |
|
if (inSize < headerSize) |
|
{ |
|
throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data" |
|
"(truncated header)."); |
|
} |
|
|
|
// |
|
// Flip the counters from XDR to NATIVE |
|
// |
|
|
|
for (int i = 0; i < NUM_SIZES_SINGLE; ++i) |
|
{ |
|
Int64 *dst = (((Int64 *)inPtr) + i); |
|
const char *src = (char *)(((Int64 *)inPtr) + i); |
|
|
|
Xdr::read<CharPtrIO> (src, *dst); |
|
} |
|
|
|
// |
|
// Unwind all the counter info |
|
// |
|
|
|
const Int64 *inPtr64 = (const Int64*) inPtr; |
|
|
|
Int64 version = *(inPtr64 + VERSION); |
|
Int64 unknownUncompressedSize = *(inPtr64 + UNKNOWN_UNCOMPRESSED_SIZE); |
|
Int64 unknownCompressedSize = *(inPtr64 + UNKNOWN_COMPRESSED_SIZE); |
|
Int64 acCompressedSize = *(inPtr64 + AC_COMPRESSED_SIZE); |
|
Int64 dcCompressedSize = *(inPtr64 + DC_COMPRESSED_SIZE); |
|
Int64 rleCompressedSize = *(inPtr64 + RLE_COMPRESSED_SIZE); |
|
Int64 rleUncompressedSize = *(inPtr64 + RLE_UNCOMPRESSED_SIZE); |
|
Int64 rleRawSize = *(inPtr64 + RLE_RAW_SIZE); |
|
|
|
Int64 totalAcUncompressedCount = *(inPtr64 + AC_UNCOMPRESSED_COUNT); |
|
Int64 totalDcUncompressedCount = *(inPtr64 + DC_UNCOMPRESSED_COUNT); |
|
|
|
Int64 acCompression = *(inPtr64 + AC_COMPRESSION); |
|
|
|
Int64 compressedSize = unknownCompressedSize + |
|
acCompressedSize + |
|
dcCompressedSize + |
|
rleCompressedSize; |
|
|
|
const char *dataPtr = inPtr + NUM_SIZES_SINGLE * sizeof(Int64); |
|
|
|
/* Both the sum and individual sizes are checked in case of overflow. */ |
|
if (inSize < (headerSize + compressedSize) || |
|
inSize < unknownCompressedSize || |
|
inSize < acCompressedSize || |
|
inSize < dcCompressedSize || |
|
inSize < rleCompressedSize) |
|
{ |
|
throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data" |
|
"(truncated file)."); |
|
} |
|
|
|
if ((SInt64)unknownUncompressedSize < 0 || |
|
(SInt64)unknownCompressedSize < 0 || |
|
(SInt64)acCompressedSize < 0 || |
|
(SInt64)dcCompressedSize < 0 || |
|
(SInt64)rleCompressedSize < 0 || |
|
(SInt64)rleUncompressedSize < 0 || |
|
(SInt64)rleRawSize < 0 || |
|
(SInt64)totalAcUncompressedCount < 0 || |
|
(SInt64)totalDcUncompressedCount < 0) |
|
{ |
|
throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data" |
|
" (corrupt header)."); |
|
} |
|
|
|
if (version < 2) |
|
initializeLegacyChannelRules(); |
|
else |
|
{ |
|
unsigned short ruleSize = 0; |
|
Xdr::read<CharPtrIO>(dataPtr, ruleSize); |
|
|
|
if (ruleSize < 0) |
|
throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data" |
|
" (corrupt header file)."); |
|
|
|
headerSize += ruleSize; |
|
if (inSize < headerSize + compressedSize) |
|
throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data" |
|
" (truncated file)."); |
|
|
|
_channelRules.clear(); |
|
ruleSize -= Xdr::size<unsigned short> (); |
|
while (ruleSize > 0) |
|
{ |
|
Classifier rule(dataPtr, ruleSize); |
|
|
|
_channelRules.push_back(rule); |
|
ruleSize -= rule.size(); |
|
} |
|
} |
|
|
|
|
|
size_t outBufferSize = 0; |
|
initializeBuffers(outBufferSize); |
|
|
|
// |
|
// Allocate _outBuffer, if we haven't done so already |
|
// |
|
|
|
if (_maxScanLineSize * numScanLines() > _outBufferSize) |
|
{ |
|
_outBufferSize = _maxScanLineSize * numScanLines(); |
|
if (_outBuffer != 0) |
|
delete[] _outBuffer; |
|
_outBuffer = new char[_maxScanLineSize * numScanLines()]; |
|
} |
|
|
|
|
|
char *outBufferEnd = _outBuffer; |
|
|
|
|
|
// |
|
// Find the start of the RLE packed AC components and |
|
// the DC components for each channel. This will be handy |
|
// if you want to decode the channels in parallel later on. |
|
// |
|
|
|
char *packedAcBufferEnd = 0; |
|
|
|
if (_packedAcBuffer) |
|
packedAcBufferEnd = _packedAcBuffer; |
|
|
|
char *packedDcBufferEnd = 0; |
|
|
|
if (_packedDcBuffer) |
|
packedDcBufferEnd = _packedDcBuffer; |
|
|
|
// |
|
// UNKNOWN data is packed first, followed by the |
|
// Huffman-compressed AC, then the DC values, |
|
// and then the zlib compressed RLE data. |
|
// |
|
|
|
const char *compressedUnknownBuf = dataPtr; |
|
|
|
const char *compressedAcBuf = compressedUnknownBuf + |
|
static_cast<ptrdiff_t>(unknownCompressedSize); |
|
const char *compressedDcBuf = compressedAcBuf + |
|
static_cast<ptrdiff_t>(acCompressedSize); |
|
const char *compressedRleBuf = compressedDcBuf + |
|
static_cast<ptrdiff_t>(dcCompressedSize); |
|
|
|
// |
|
// Sanity check that the version is something we expect. Right now, |
|
// we can decode version 0, 1, and 2. v1 adds 'end of block' symbols |
|
// to the AC RLE. v2 adds channel classification rules at the |
|
// start of the data block. |
|
// |
|
|
|
if (version > 2) |
|
throw IEX_NAMESPACE::InputExc ("Invalid version of compressed data block"); |
|
|
|
setupChannelData(minX, minY, maxX, maxY); |
|
|
|
// |
|
// Uncompress the UNKNOWN data into _planarUncBuffer[UNKNOWN] |
|
// |
|
|
|
if (unknownCompressedSize > 0) |
|
{ |
|
if (unknownUncompressedSize > _planarUncBufferSize[UNKNOWN]) |
|
{ |
|
throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data" |
|
"(corrupt header)."); |
|
} |
|
|
|
uLongf outSize = (uLongf)unknownUncompressedSize; |
|
|
|
if (Z_OK != ::uncompress |
|
((Bytef *)_planarUncBuffer[UNKNOWN], |
|
&outSize, |
|
(Bytef *)compressedUnknownBuf, |
|
(uLong)unknownCompressedSize)) |
|
{ |
|
throw IEX_NAMESPACE::BaseExc("Error uncompressing UNKNOWN data."); |
|
} |
|
} |
|
|
|
// |
|
// Uncompress the AC data into _packedAcBuffer |
|
// |
|
|
|
if (acCompressedSize > 0) |
|
{ |
|
if (totalAcUncompressedCount*sizeof(unsigned short) > _packedAcBufferSize) |
|
{ |
|
throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data" |
|
"(corrupt header)."); |
|
} |
|
|
|
// |
|
// Don't trust the user to get it right, look in the file. |
|
// |
|
|
|
switch (acCompression) |
|
{ |
|
case STATIC_HUFFMAN: |
|
|
|
hufUncompress |
|
(compressedAcBuf, |
|
(int)acCompressedSize, |
|
(unsigned short *)_packedAcBuffer, |
|
(int)totalAcUncompressedCount); |
|
|
|
break; |
|
|
|
case DEFLATE: |
|
{ |
|
uLongf destLen = |
|
(int)(totalAcUncompressedCount) * sizeof (unsigned short); |
|
|
|
if (Z_OK != ::uncompress |
|
((Bytef *)_packedAcBuffer, |
|
&destLen, |
|
(Bytef *)compressedAcBuf, |
|
(uLong)acCompressedSize)) |
|
{ |
|
throw IEX_NAMESPACE::InputExc ("Data decompression (zlib) failed."); |
|
} |
|
|
|
if (totalAcUncompressedCount * sizeof (unsigned short) != |
|
destLen) |
|
{ |
|
throw IEX_NAMESPACE::InputExc ("AC data corrupt."); |
|
} |
|
} |
|
break; |
|
|
|
default: |
|
|
|
throw IEX_NAMESPACE::NoImplExc ("Unknown AC Compression"); |
|
break; |
|
} |
|
} |
|
|
|
// |
|
// Uncompress the DC data into _packedDcBuffer |
|
// |
|
|
|
if (dcCompressedSize > 0) |
|
{ |
|
if (totalDcUncompressedCount*sizeof(unsigned short) > _packedDcBufferSize) |
|
{ |
|
throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data" |
|
"(corrupt header)."); |
|
} |
|
|
|
if (_zip->uncompress |
|
(compressedDcBuf, (int)dcCompressedSize, _packedDcBuffer) |
|
!= (int)totalDcUncompressedCount * sizeof (unsigned short)) |
|
{ |
|
throw IEX_NAMESPACE::BaseExc("DC data corrupt."); |
|
} |
|
} |
|
|
|
// |
|
// Uncompress the RLE data into _rleBuffer, then unRLE the results |
|
// into _planarUncBuffer[RLE] |
|
// |
|
|
|
if (rleRawSize > 0) |
|
{ |
|
if (rleUncompressedSize > _rleBufferSize || |
|
rleRawSize > _planarUncBufferSize[RLE]) |
|
{ |
|
throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data" |
|
"(corrupt header)."); |
|
} |
|
|
|
uLongf dstLen = (uLongf)rleUncompressedSize; |
|
|
|
if (Z_OK != ::uncompress |
|
((Bytef *)_rleBuffer, |
|
&dstLen, |
|
(Bytef *)compressedRleBuf, |
|
(uLong)rleCompressedSize)) |
|
{ |
|
throw IEX_NAMESPACE::BaseExc("Error uncompressing RLE data."); |
|
} |
|
|
|
if (dstLen != rleUncompressedSize) |
|
throw IEX_NAMESPACE::BaseExc("RLE data corrupted"); |
|
|
|
if (rleUncompress |
|
((int)rleUncompressedSize, |
|
(int)rleRawSize, |
|
(signed char *)_rleBuffer, |
|
_planarUncBuffer[RLE]) != rleRawSize) |
|
{ |
|
throw IEX_NAMESPACE::BaseExc("RLE data corrupted"); |
|
} |
|
} |
|
|
|
// |
|
// Determine the start of each row in the output buffer |
|
// |
|
|
|
std::vector<bool> decodedChannels (_channelData.size()); |
|
std::vector< std::vector<char *> > rowPtrs (_channelData.size()); |
|
|
|
for (unsigned int chan = 0; chan < _channelData.size(); ++chan) |
|
decodedChannels[chan] = false; |
|
|
|
outBufferEnd = _outBuffer; |
|
|
|
for (int y = minY; y <= maxY; ++y) |
|
{ |
|
for (unsigned int chan = 0; chan < _channelData.size(); ++chan) |
|
{ |
|
ChannelData *cd = &_channelData[chan]; |
|
|
|
if (IMATH_NAMESPACE::modp (y, cd->ySampling) != 0) |
|
continue; |
|
|
|
rowPtrs[chan].push_back (outBufferEnd); |
|
outBufferEnd += cd->width * OPENEXR_IMF_NAMESPACE::pixelTypeSize (cd->type); |
|
} |
|
} |
|
|
|
// |
|
// Setup to decode each block of 3 channels that need to |
|
// be handled together |
|
// |
|
|
|
for (unsigned int csc = 0; csc < _cscSets.size(); ++csc) |
|
{ |
|
int rChan = _cscSets[csc].idx[0]; |
|
int gChan = _cscSets[csc].idx[1]; |
|
int bChan = _cscSets[csc].idx[2]; |
|
|
|
|
|
LossyDctDecoderCsc decoder |
|
(rowPtrs[rChan], |
|
rowPtrs[gChan], |
|
rowPtrs[bChan], |
|
packedAcBufferEnd, |
|
packedDcBufferEnd, |
|
get_dwaCompressorToLinear(), |
|
_channelData[rChan].width, |
|
_channelData[rChan].height, |
|
_channelData[rChan].type, |
|
_channelData[gChan].type, |
|
_channelData[bChan].type); |
|
|
|
decoder.execute(); |
|
|
|
packedAcBufferEnd += |
|
decoder.numAcValuesEncoded() * sizeof (unsigned short); |
|
|
|
packedDcBufferEnd += |
|
decoder.numDcValuesEncoded() * sizeof (unsigned short); |
|
|
|
decodedChannels[rChan] = true; |
|
decodedChannels[gChan] = true; |
|
decodedChannels[bChan] = true; |
|
} |
|
|
|
// |
|
// Setup to handle the remaining channels by themselves |
|
// |
|
|
|
for (unsigned int chan = 0; chan < _channelData.size(); ++chan) |
|
{ |
|
if (decodedChannels[chan]) |
|
continue; |
|
|
|
ChannelData *cd = &_channelData[chan]; |
|
int pixelSize = OPENEXR_IMF_NAMESPACE::pixelTypeSize (cd->type); |
|
|
|
switch (cd->compression) |
|
{ |
|
case LOSSY_DCT: |
|
|
|
// |
|
// Setup a single-channel lossy DCT decoder pointing |
|
// at the output buffer |
|
// |
|
|
|
{ |
|
const unsigned short *linearLut = 0; |
|
|
|
if (!cd->pLinear) |
|
linearLut = get_dwaCompressorToLinear(); |
|
|
|
LossyDctDecoder decoder |
|
(rowPtrs[chan], |
|
packedAcBufferEnd, |
|
packedDcBufferEnd, |
|
linearLut, |
|
cd->width, |
|
cd->height, |
|
cd->type); |
|
|
|
decoder.execute(); |
|
|
|
packedAcBufferEnd += |
|
decoder.numAcValuesEncoded() * sizeof (unsigned short); |
|
|
|
packedDcBufferEnd += |
|
decoder.numDcValuesEncoded() * sizeof (unsigned short); |
|
} |
|
|
|
break; |
|
|
|
case RLE: |
|
|
|
// |
|
// For the RLE case, the data has been un-RLE'd into |
|
// planarUncRleEnd[], but is still split out by bytes. |
|
// We need to rearrange the bytes back into the correct |
|
// order in the output buffer; |
|
// |
|
|
|
{ |
|
int row = 0; |
|
|
|
for (int y = minY; y <= maxY; ++y) |
|
{ |
|
if (IMATH_NAMESPACE::modp (y, cd->ySampling) != 0) |
|
continue; |
|
|
|
char *dst = rowPtrs[chan][row]; |
|
|
|
if (pixelSize == 2) |
|
{ |
|
interleaveByte2 (dst, |
|
cd->planarUncRleEnd[0], |
|
cd->planarUncRleEnd[1], |
|
cd->width); |
|
|
|
cd->planarUncRleEnd[0] += cd->width; |
|
cd->planarUncRleEnd[1] += cd->width; |
|
} |
|
else |
|
{ |
|
for (int x = 0; x < cd->width; ++x) |
|
{ |
|
for (int byte = 0; byte < pixelSize; ++byte) |
|
{ |
|
*dst++ = *cd->planarUncRleEnd[byte]++; |
|
} |
|
} |
|
} |
|
|
|
row++; |
|
} |
|
} |
|
|
|
break; |
|
|
|
case UNKNOWN: |
|
|
|
// |
|
// In the UNKNOWN case, data is already in planarUncBufferEnd |
|
// and just needs to copied over to the output buffer |
|
// |
|
|
|
{ |
|
int row = 0; |
|
int dstScanlineSize = cd->width * OPENEXR_IMF_NAMESPACE::pixelTypeSize (cd->type); |
|
|
|
for (int y = minY; y <= maxY; ++y) |
|
{ |
|
if (IMATH_NAMESPACE::modp (y, cd->ySampling) != 0) |
|
continue; |
|
|
|
memcpy (rowPtrs[chan][row], |
|
cd->planarUncBufferEnd, |
|
dstScanlineSize); |
|
|
|
cd->planarUncBufferEnd += dstScanlineSize; |
|
row++; |
|
} |
|
} |
|
|
|
break; |
|
|
|
default: |
|
|
|
throw IEX_NAMESPACE::NoImplExc ("Unhandled compression scheme case"); |
|
break; |
|
} |
|
|
|
decodedChannels[chan] = true; |
|
} |
|
|
|
// |
|
// Return a ptr to _outBuffer |
|
// |
|
|
|
outPtr = _outBuffer; |
|
return (int)(outBufferEnd - _outBuffer); |
|
} |
|
|
|
|
|
// static |
|
void |
|
DwaCompressor::initializeFuncs() |
|
{ |
|
convertFloatToHalf64 = convertFloatToHalf64_scalar; |
|
fromHalfZigZag = fromHalfZigZag_scalar; |
|
|
|
CpuId cpuId; |
|
|
|
// |
|
// Setup HALF <-> FLOAT conversion implementations |
|
// |
|
|
|
if (cpuId.avx && cpuId.f16c) |
|
{ |
|
convertFloatToHalf64 = convertFloatToHalf64_f16c; |
|
fromHalfZigZag = fromHalfZigZag_f16c; |
|
} |
|
|
|
// |
|
// Setup inverse DCT implementations |
|
// |
|
|
|
dctInverse8x8_0 = dctInverse8x8_scalar<0>; |
|
dctInverse8x8_1 = dctInverse8x8_scalar<1>; |
|
dctInverse8x8_2 = dctInverse8x8_scalar<2>; |
|
dctInverse8x8_3 = dctInverse8x8_scalar<3>; |
|
dctInverse8x8_4 = dctInverse8x8_scalar<4>; |
|
dctInverse8x8_5 = dctInverse8x8_scalar<5>; |
|
dctInverse8x8_6 = dctInverse8x8_scalar<6>; |
|
dctInverse8x8_7 = dctInverse8x8_scalar<7>; |
|
|
|
if (cpuId.avx) |
|
{ |
|
dctInverse8x8_0 = dctInverse8x8_avx<0>; |
|
dctInverse8x8_1 = dctInverse8x8_avx<1>; |
|
dctInverse8x8_2 = dctInverse8x8_avx<2>; |
|
dctInverse8x8_3 = dctInverse8x8_avx<3>; |
|
dctInverse8x8_4 = dctInverse8x8_avx<4>; |
|
dctInverse8x8_5 = dctInverse8x8_avx<5>; |
|
dctInverse8x8_6 = dctInverse8x8_avx<6>; |
|
dctInverse8x8_7 = dctInverse8x8_avx<7>; |
|
} |
|
else if (cpuId.sse2) |
|
{ |
|
dctInverse8x8_0 = dctInverse8x8_sse2<0>; |
|
dctInverse8x8_1 = dctInverse8x8_sse2<1>; |
|
dctInverse8x8_2 = dctInverse8x8_sse2<2>; |
|
dctInverse8x8_3 = dctInverse8x8_sse2<3>; |
|
dctInverse8x8_4 = dctInverse8x8_sse2<4>; |
|
dctInverse8x8_5 = dctInverse8x8_sse2<5>; |
|
dctInverse8x8_6 = dctInverse8x8_sse2<6>; |
|
dctInverse8x8_7 = dctInverse8x8_sse2<7>; |
|
} |
|
} |
|
|
|
|
|
// |
|
// Handle channel classification and buffer allocation once we know |
|
// how to classify channels |
|
// |
|
|
|
void
DwaCompressor::initializeBuffers (size_t &outBufferSize)
{
    classifyChannels (_channels, _channelData, _cscSets);

    //
    // _outBuffer needs to be big enough to hold all our
    // compressed data - which could vary depending on what sort
    // of channels we have.
    //
    // NOTE(review): these sizes are accumulated in (signed) int;
    // for very large data windows the products below could
    // overflow - confirm expected tile/scanline sizes keep the
    // intermediate values in range.
    //

    int maxOutBufferSize  = 0;
    int numLossyDctChans  = 0;
    int unknownBufferSize = 0;
    int rleBufferSize     = 0;

    //
    // Worst-case packed AC size for one lossy DCT channel:
    // 63 AC coefficients per 8x8 block, stored as unsigned shorts.
    //

    int maxLossyDctAcSize = (int)ceil ((float)numScanLines() / 8.0f) *
                            (int)ceil ((float)(_max[0] - _min[0] + 1) / 8.0f) *
                            63 * sizeof (unsigned short);

    //
    // Worst-case packed DC size: one coefficient per 8x8 block.
    //

    int maxLossyDctDcSize = (int)ceil ((float)numScanLines() / 8.0f) *
                            (int)ceil ((float)(_max[0] - _min[0] + 1) / 8.0f) *
                            sizeof (unsigned short);

    //
    // Accumulate worst-case sizes per channel, by compression scheme.
    //

    for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
    {
        switch (_channelData[chan].compression)
        {
          case LOSSY_DCT:

            //
            // This is the size of the number of packed
            // components, plus the requirements for
            // maximum Huffman encoding size (for STATIC_HUFFMAN)
            // or for zlib compression (for DEFLATE)
            //

            maxOutBufferSize += std::max(
                            (int)(2 * maxLossyDctAcSize + 65536),
                            (int)compressBound (maxLossyDctAcSize) );
            numLossyDctChans++;
            break;

          case RLE:
            {
                //
                // RLE, if gone horribly wrong, could double the size
                // of the source data.
                //

                int rleAmount = 2 * numScanLines() * (_max[0] - _min[0] + 1) *
                                OPENEXR_IMF_NAMESPACE::pixelTypeSize (_channelData[chan].type);

                rleBufferSize += rleAmount;
            }
            break;


          case UNKNOWN:

            //
            // UNKNOWN channels are stored raw (then zlib compressed),
            // so reserve their full uncompressed size.
            //

            unknownBufferSize += numScanLines() * (_max[0] - _min[0] + 1) *
                                 OPENEXR_IMF_NAMESPACE::pixelTypeSize (_channelData[chan].type);
            break;

          default:

            throw IEX_NAMESPACE::NoImplExc ("Unhandled compression scheme case");
            break;
        }
    }

    //
    // Also, since the results of the RLE are packed into
    // the output buffer, we need the extra room there. But
    // we're going to zlib compress() the data we pack,
    // which could take slightly more space
    //

    maxOutBufferSize += (int)compressBound ((uLongf)rleBufferSize);

    //
    // And the same goes for the UNKNOWN data
    //

    maxOutBufferSize += (int)compressBound ((uLongf)unknownBufferSize);

    //
    // Allocate a zip/deflate compressor big enough to hold the DC data
    // and include its compressed results in the size requirements
    // for our output buffer
    //

    if (_zip == 0)
        _zip = new Zip (maxLossyDctDcSize * numLossyDctChans);
    else if (_zip->maxRawSize() < maxLossyDctDcSize * numLossyDctChans)
    {
        delete _zip;
        _zip = new Zip (maxLossyDctDcSize * numLossyDctChans);
    }


    maxOutBufferSize += _zip->maxCompressedSize();

    //
    // We also need to reserve space at the head of the buffer to
    // write out the size of our various packed and compressed data.
    //

    maxOutBufferSize += NUM_SIZES_SINGLE * sizeof (Int64);


    //
    // Later, we're going to hijack outBuffer for the result of
    // both encoding and decoding. So it needs to be big enough
    // to hold either a buffers' worth of uncompressed or
    // compressed data
    //
    // For encoding, we'll need _outBuffer to hold maxOutBufferSize bytes,
    // but for decoding, we only need it to be maxScanLineSize*numScanLines.
    // Cache the max size for now, and alloc the buffer when we either
    // encode or decode.
    //

    outBufferSize = maxOutBufferSize;


    //
    // _packedAcBuffer holds the quantized DCT coefficients prior
    // to Huffman encoding
    //

    if (maxLossyDctAcSize * numLossyDctChans > _packedAcBufferSize)
    {
        _packedAcBufferSize = maxLossyDctAcSize * numLossyDctChans;
        if (_packedAcBuffer != 0)
            delete[] _packedAcBuffer;
        _packedAcBuffer = new char[_packedAcBufferSize];
    }

    //
    // _packedDcBuffer holds one quantized DCT coef per 8x8 block
    //

    if (maxLossyDctDcSize * numLossyDctChans > _packedDcBufferSize)
    {
        _packedDcBufferSize = maxLossyDctDcSize * numLossyDctChans;
        if (_packedDcBuffer != 0)
            delete[] _packedDcBuffer;
        _packedDcBuffer = new char[_packedDcBufferSize];
    }

    //
    // _rleBuffer holds the zlib-compressed form of the RLE data;
    // only grow it, never shrink.
    //

    if (rleBufferSize > _rleBufferSize)
    {
        _rleBufferSize = rleBufferSize;
        if (_rleBuffer != 0)
            delete[] _rleBuffer;
        _rleBuffer = new char[rleBufferSize];
    }

    //
    // The planar uncompressed buffer will hold float data for LOSSY_DCT
    // compressed values, and whatever the native type is for other
    // channels. We're going to use this to hold data in a planar
    // format, as opposed to the native interleaved format we take
    // into compress() and give back from uncompress().
    //
    // This also makes it easier to compress the UNKNOWN and RLE data
    // all in one swoop (for each compression scheme).
    //

    int planarUncBufferSize[NUM_COMPRESSOR_SCHEMES];
    for (int i=0; i<NUM_COMPRESSOR_SCHEMES; ++i)
        planarUncBufferSize[i] = 0;

    for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
    {
        switch (_channelData[chan].compression)
        {
          case LOSSY_DCT:
            // Lossy DCT channels don't use the planar scratch buffers.
            break;

          case RLE:
            planarUncBufferSize[RLE] +=
                numScanLines() * (_max[0] - _min[0] + 1) *
                OPENEXR_IMF_NAMESPACE::pixelTypeSize (_channelData[chan].type);
            break;

          case UNKNOWN:
            planarUncBufferSize[UNKNOWN] +=
                numScanLines() * (_max[0] - _min[0] + 1) *
                OPENEXR_IMF_NAMESPACE::pixelTypeSize (_channelData[chan].type);
            break;

          default:
            throw IEX_NAMESPACE::NoImplExc ("Unhandled compression scheme case");
            break;
        }
    }

    //
    // UNKNOWN data is going to be zlib compressed, which needs
    // a little extra headroom
    //

    if (planarUncBufferSize[UNKNOWN] > 0)
    {
        planarUncBufferSize[UNKNOWN] =
            compressBound ((uLongf)planarUncBufferSize[UNKNOWN]);
    }

    //
    // Grow (never shrink) the per-scheme planar scratch buffers.
    //

    for (int i = 0; i < NUM_COMPRESSOR_SCHEMES; ++i)
    {
        if (planarUncBufferSize[i] > _planarUncBufferSize[i])
        {
            _planarUncBufferSize[i] = planarUncBufferSize[i];
            if (_planarUncBuffer[i] != 0)
                delete[] _planarUncBuffer[i];
            _planarUncBuffer[i] = new char[planarUncBufferSize[i]];
        }
    }
}
|
|
|
|
|
// |
|
// Setup channel classification rules to use when writing files |
|
// |
|
|
|
void |
|
DwaCompressor::initializeDefaultChannelRules () |
|
{ |
|
_channelRules.clear(); |
|
|
|
_channelRules.push_back (Classifier ("R", LOSSY_DCT, HALF, 0, false)); |
|
_channelRules.push_back (Classifier ("R", LOSSY_DCT, FLOAT, 0, false)); |
|
_channelRules.push_back (Classifier ("G", LOSSY_DCT, HALF, 1, false)); |
|
_channelRules.push_back (Classifier ("G", LOSSY_DCT, FLOAT, 1, false)); |
|
_channelRules.push_back (Classifier ("B", LOSSY_DCT, HALF, 2, false)); |
|
_channelRules.push_back (Classifier ("B", LOSSY_DCT, FLOAT, 2, false)); |
|
|
|
_channelRules.push_back (Classifier ("Y", LOSSY_DCT, HALF, -1, false)); |
|
_channelRules.push_back (Classifier ("Y", LOSSY_DCT, FLOAT, -1, false)); |
|
_channelRules.push_back (Classifier ("BY", LOSSY_DCT, HALF, -1, false)); |
|
_channelRules.push_back (Classifier ("BY", LOSSY_DCT, FLOAT, -1, false)); |
|
_channelRules.push_back (Classifier ("RY", LOSSY_DCT, HALF, -1, false)); |
|
_channelRules.push_back (Classifier ("RY", LOSSY_DCT, FLOAT, -1, false)); |
|
|
|
_channelRules.push_back (Classifier ("A", RLE, UINT, -1, false)); |
|
_channelRules.push_back (Classifier ("A", RLE, HALF, -1, false)); |
|
_channelRules.push_back (Classifier ("A", RLE, FLOAT, -1, false)); |
|
} |
|
|
|
|
|
// |
|
// Setup channel classification rules when reading files with VERSION < 2 |
|
// |
|
|
|
void |
|
DwaCompressor::initializeLegacyChannelRules () |
|
{ |
|
_channelRules.clear(); |
|
|
|
_channelRules.push_back (Classifier ("r", LOSSY_DCT, HALF, 0, true)); |
|
_channelRules.push_back (Classifier ("r", LOSSY_DCT, FLOAT, 0, true)); |
|
_channelRules.push_back (Classifier ("red", LOSSY_DCT, HALF, 0, true)); |
|
_channelRules.push_back (Classifier ("red", LOSSY_DCT, FLOAT, 0, true)); |
|
_channelRules.push_back (Classifier ("g", LOSSY_DCT, HALF, 1, true)); |
|
_channelRules.push_back (Classifier ("g", LOSSY_DCT, FLOAT, 1, true)); |
|
_channelRules.push_back (Classifier ("grn", LOSSY_DCT, HALF, 1, true)); |
|
_channelRules.push_back (Classifier ("grn", LOSSY_DCT, FLOAT, 1, true)); |
|
_channelRules.push_back (Classifier ("green", LOSSY_DCT, HALF, 1, true)); |
|
_channelRules.push_back (Classifier ("green", LOSSY_DCT, FLOAT, 1, true)); |
|
_channelRules.push_back (Classifier ("b", LOSSY_DCT, HALF, 2, true)); |
|
_channelRules.push_back (Classifier ("b", LOSSY_DCT, FLOAT, 2, true)); |
|
_channelRules.push_back (Classifier ("blu", LOSSY_DCT, HALF, 2, true)); |
|
_channelRules.push_back (Classifier ("blu", LOSSY_DCT, FLOAT, 2, true)); |
|
_channelRules.push_back (Classifier ("blue", LOSSY_DCT, HALF, 2, true)); |
|
_channelRules.push_back (Classifier ("blue", LOSSY_DCT, FLOAT, 2, true)); |
|
|
|
_channelRules.push_back (Classifier ("y", LOSSY_DCT, HALF, -1, true)); |
|
_channelRules.push_back (Classifier ("y", LOSSY_DCT, FLOAT, -1, true)); |
|
_channelRules.push_back (Classifier ("by", LOSSY_DCT, HALF, -1, true)); |
|
_channelRules.push_back (Classifier ("by", LOSSY_DCT, FLOAT, -1, true)); |
|
_channelRules.push_back (Classifier ("ry", LOSSY_DCT, HALF, -1, true)); |
|
_channelRules.push_back (Classifier ("ry", LOSSY_DCT, FLOAT, -1, true)); |
|
_channelRules.push_back (Classifier ("a", RLE, UINT, -1, true)); |
|
_channelRules.push_back (Classifier ("a", RLE, HALF, -1, true)); |
|
_channelRules.push_back (Classifier ("a", RLE, FLOAT, -1, true)); |
|
} |
|
|
|
|
|
// |
|
// Given a set of rules and ChannelData, figure out which rules apply |
|
// |
|
|
|
void |
|
DwaCompressor::relevantChannelRules (std::vector<Classifier> &rules) const |
|
{ |
|
rules.clear(); |
|
|
|
std::vector<std::string> suffixes; |
|
|
|
for (size_t cd = 0; cd < _channelData.size(); ++cd) |
|
{ |
|
std::string suffix = _channelData[cd].name; |
|
size_t lastDot = suffix.find_last_of ('.'); |
|
|
|
if (lastDot != std::string::npos) |
|
suffix = suffix.substr (lastDot+1, std::string::npos); |
|
|
|
suffixes.push_back(suffix); |
|
} |
|
|
|
|
|
for (size_t i = 0; i < _channelRules.size(); ++i) |
|
{ |
|
for (size_t cd = 0; cd < _channelData.size(); ++cd) |
|
{ |
|
if (_channelRules[i].match (suffixes[cd], _channelData[cd].type )) |
|
{ |
|
rules.push_back (_channelRules[i]); |
|
break; |
|
} |
|
} |
|
} |
|
} |
|
|
|
|
|
// |
|
// Take our initial list of channels, and cache the contents. |
|
// |
|
// Determine appropriate compression schemes for each channel,
|
// and figure out which sets should potentially be CSC'ed |
|
// prior to lossy compression. |
|
// |
|
|
|
void
DwaCompressor::classifyChannels
    (ChannelList channels,
     std::vector<ChannelData> &chanData,
     std::vector<CscChannelSet> &cscData)
{
    //
    // prefixMap used to map channel name prefixes to
    // potential CSC-able sets of channels. The R/G/B slots of
    // each set are filled in as channels are classified below.
    //

    std::map<std::string, DwaCompressor::CscChannelSet> prefixMap;
    std::vector<DwaCompressor::CscChannelSet> tmpCscSet;

    //
    // Count the channels so chanData can be sized in one shot.
    //

    unsigned int numChan = 0;

    for (ChannelList::Iterator c = channels.begin(); c != channels.end(); ++c)
        numChan++;

    if (numChan)
        chanData.resize (numChan);

    //
    // Cache the relevant data from the channel structs.
    //

    unsigned int offset = 0;

    for (ChannelList::Iterator c = channels.begin(); c != channels.end(); ++c)
    {
        chanData[offset].name = std::string (c.name());
        chanData[offset].compression = UNKNOWN;   // may be refined by a rule below
        chanData[offset].xSampling = c.channel().xSampling;
        chanData[offset].ySampling = c.channel().ySampling;
        chanData[offset].type = c.channel().type;
        chanData[offset].pLinear = c.channel().pLinear;

        offset++;
    }

    //
    // Try and figure out which channels should be
    // compressed by which means.
    //

    for (offset = 0; offset<numChan; ++offset)
    {
        //
        // Split the channel name around the final '.' into a prefix
        // (the layer) and a suffix; a name with no '.' has an empty
        // prefix and the whole name as suffix.
        //

        std::string prefix = "";
        std::string suffix = chanData[offset].name;
        size_t lastDot = suffix.find_last_of ('.');

        if (lastDot != std::string::npos)
        {
            prefix = suffix.substr (0, lastDot);
            suffix = suffix.substr (lastDot+1, std::string::npos);
        }

        //
        // Make sure we have an entry in our CSC set map
        //

        std::map<std::string, DwaCompressor::CscChannelSet>::iterator
            theSet = prefixMap.find (prefix);

        if (theSet == prefixMap.end())
        {
            DwaCompressor::CscChannelSet tmpSet;

            // -1 marks an R/G/B slot that no channel has claimed yet.
            tmpSet.idx[0] =
            tmpSet.idx[1] =
            tmpSet.idx[2] = -1;

            prefixMap[prefix] = tmpSet;
        }

        //
        // Check the suffix against the list of classifications
        // we defined previously. If the _cscIdx is not negative,
        // it indicates that we should be part of a CSC group.
        //
        // Every rule is tested, so when several rules match the
        // same channel the last matching rule wins.
        //

        for (std::vector<Classifier>::iterator i = _channelRules.begin();
             i != _channelRules.end();
             ++i)
        {
            if ( i->match(suffix, chanData[offset].type) )
            {
                chanData[offset].compression = i->_scheme;

                if ( i->_cscIdx >= 0)
                    prefixMap[prefix].idx[i->_cscIdx] = offset;
            }
        }
    }

    //
    // Finally, try and find RGB sets of channels which
    // can be CSC'ed to a Y'CbCr space prior to loss, for
    // better compression.
    //
    // Walk over our set of candidates, and see who has
    // all three channels defined (and has common sampling
    // patterns, etc).
    //

    for (std::map<std::string, DwaCompressor::CscChannelSet>::iterator
         theItem = prefixMap.begin(); theItem != prefixMap.end();
         ++theItem)
    {
        int red = (*theItem).second.idx[0];
        int grn = (*theItem).second.idx[1];
        int blu = (*theItem).second.idx[2];

        //
        // Skip candidate sets that are missing any of R, G, or B.
        //

        if ((red < 0) || (grn < 0) || (blu < 0))
            continue;

        //
        // All three channels must share x and y sampling rates.
        //

        if ((chanData[red].xSampling != chanData[grn].xSampling) ||
            (chanData[red].xSampling != chanData[blu].xSampling) ||
            (chanData[grn].xSampling != chanData[blu].xSampling) ||
            (chanData[red].ySampling != chanData[grn].ySampling) ||
            (chanData[red].ySampling != chanData[blu].ySampling) ||
            (chanData[grn].ySampling != chanData[blu].ySampling))
        {
            continue;
        }

        tmpCscSet.push_back ((*theItem).second);
    }

    //
    // Copy the surviving candidates to the output vector.
    //

    size_t numCsc = tmpCscSet.size();

    if (numCsc)
        cscData.resize(numCsc);

    for (offset = 0; offset < numCsc; ++offset)
        cscData[offset] = tmpCscSet[offset];
}
|
|
|
|
|
|
|
// |
|
// Setup some buffer pointers, determine channel sizes, things |
|
// like that. |
|
// |
|
|
|
void
DwaCompressor::setupChannelData (int minX, int minY, int maxX, int maxY)
{
    //
    // Take a local copy of the per-scheme planar scratch buffer
    // pointers; they are advanced as each channel claims its region.
    //

    char *planarUncBuffer[NUM_COMPRESSOR_SCHEMES];

    for (int i=0; i<NUM_COMPRESSOR_SCHEMES; ++i)
    {
        planarUncBuffer[i] = 0;

        if (_planarUncBuffer[i])
            planarUncBuffer[i] = _planarUncBuffer[i];
    }

    for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
    {
        ChannelData *cd = &_channelData[chan];

        //
        // Channel dimensions within [minX,maxX] x [minY,maxY],
        // accounting for this channel's subsampling rates.
        //

        cd->width = OPENEXR_IMF_NAMESPACE::numSamples (cd->xSampling, minX, maxX);
        cd->height = OPENEXR_IMF_NAMESPACE::numSamples (cd->ySampling, minY, maxY);

        cd->planarUncSize =
            cd->width * cd->height * OPENEXR_IMF_NAMESPACE::pixelTypeSize (cd->type);

        //
        // This channel's region starts at the current position of
        // the scratch buffer for its compression scheme.
        //

        cd->planarUncBuffer = planarUncBuffer[cd->compression];
        cd->planarUncBufferEnd = cd->planarUncBuffer;

        //
        // For RLE, data is stored as per-byte planes of
        // width*height bytes each; set up start/end pointers for
        // every byte of this channel's pixel type.
        //

        cd->planarUncRle[0] = cd->planarUncBuffer;
        cd->planarUncRleEnd[0] = cd->planarUncRle[0];

        for (int byte = 1; byte < OPENEXR_IMF_NAMESPACE::pixelTypeSize(cd->type); ++byte)
        {
            cd->planarUncRle[byte] =
                cd->planarUncRle[byte-1] + cd->width * cd->height;

            cd->planarUncRleEnd[byte] =
                cd->planarUncRle[byte];
        }

        cd->planarUncType = cd->type;

        if (cd->compression == LOSSY_DCT)
        {
            //
            // Lossy DCT channels store their planar uncompressed
            // data as FLOAT; the shared scratch pointer is not
            // advanced for them.
            //

            cd->planarUncType = FLOAT;
        }
        else
        {
            //
            // Advance the scheme's scratch pointer past this
            // channel's planar data.
            //

            planarUncBuffer[cd->compression] +=
                cd->width * cd->height * OPENEXR_IMF_NAMESPACE::pixelTypeSize (cd->planarUncType);
        }
    }
}
|
|
|
OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_EXIT
|
|
|