///////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 2009-2014 DreamWorks Animation LLC. 
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
// *       Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// *       Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// *       Neither the name of DreamWorks Animation nor the names of
// its contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
///////////////////////////////////////////////////////////////////////////

//---------------------------------------------------
//
// class DwaCompressor -- Store lossy RGB data by quantizing
//                          DCT components.
//
// First, we try and figure out what compression strategy to take
// based in channel name. For RGB channels, we want a lossy method
// described below. But, if we have alpha, we should do something
// different (and probably using RLE). If we have depth, or velocity,
// or something else, just fall back to ZIP. The rules for deciding 
// which strategy to use are setup in initializeDefaultChannelRules().
// When writing a file, the relevant rules needed to decode are written
// into the start of the data block, making a self-contained file. 
// If initializeDefaultChannelRules() doesn't quite suite your naming
// conventions, you can adjust the rules without breaking decoder
// compatability.
//
// If we're going to lossy compress R, G, or B channels, it's easier
// to toss bits in a more perceptual uniform space. One could argue
// at length as to what constitutes perceptually uniform, expecially 
// when storing either scene/input/focal plane referred and output referred
// data. 
//
// We'll compromise. For values <= 1, we use a traditional power function
// (without any of that straight-line business at the bottom). For values > 1,
// we want something more like a log function, since power functions blow
// up. At 1, we want a smooth blend between the functions. So, we use a 
// piecewise function that does just that - see dwaLookups.cpp for 
// a little more detail.
//
// Also, if we find that we have R, G, and B channels from the same layer,
// we can get a bit more compression efficiency by transforming to a Y'CbCr
// space. We use the 709 transform, but with Cb,Cr = 0 for an input of 
// (0, 0, 0), instead of the traditional Cb,Cr = .5. Shifting the zero point
// makes no sense with large range data. Transforms are done to from 
// the perceptual space data, not the linear-light space data (R'G'B' ->
// (Y'CbCr, not RGB -> YCbCr).
//
// Next, we forward DCT the data. This is done with a floating
// point DCT, as we don't really have control over the src range. The 
// resulting values are dropped to half-float precision. 
//
// Now, we need to quantize. Quantization departs from the usual way 
// of dividing and rounding. Instead, we start with some floating 
// point "base-error" value. From this, we can derive quantization 
// error for each DCT component. Take the standard JPEG quantization
// tables and normalize them by the smallest value. Then, multiply
// the normalized quant tables by our base-error value. This gives
// a range of errors for each DCT component.
//
// For each DCT component, we want to find a quantized value that 
// is within +- the per-component error. Pick the quantized value
// that has the fewest bits set in its' binary representation. 
// Brute-forcing the search would make for extremly inefficient 
// compression. Fortunatly, we can precompute a table to assist 
// with this search. 
//
// For each 16-bit float value, there are at most 15 other values with
// fewer bits set. We can precompute these values in a compact form, since
// many source values have far fewer that 15 possible quantized values. 
// Now, instead of searching the entire range +- the component error,
// we can just search at most 15 quantization candidates. The search can
// be accelerated a bit more by sorting the candidates by the 
// number of bits set, in increasing order. Then, the search can stop
// once a candidate is found w/i the per-component quantization 
// error range.
//
// The quantization strategy has the side-benefit that there is no
// de-quantization step upon decode, so we don't bother recording
// the quantization table.
//
// Ok. So we now have quantized values. Time for entropy coding. We
// can use either static Huffman or zlib/DEFLATE. The static Huffman
// is more efficient at compacting data, but can have a greater 
// overhead, especially for smaller tile/strip sizes. 
//
// There is some additional fun, like ZIP compressing the DC components
// instead of Huffman/zlib, which helps make things slightly smaller.
//
// Compression level is controlled by setting an int/float/double attribute
// on the header named "dwaCompressionLevel". This is a thinly veiled name for 
// the "base-error" value mentioned above. The "base-error" is just
// dwaCompressionLevel / 100000. The default value of 45.0 is generally 
// pretty good at generating "visually lossless" values at reasonable
// data rates. Setting dwaCompressionLevel to 0 should result in no additional
// quantization at the quantization stage (though there may be 
// quantization in practice at the CSC/DCT steps). But if you really
// want lossless compression, there are pleanty of other choices 
// of compressors ;)
//
// When dealing with FLOAT source buffers, we first quantize the source
// to HALF and continue down as we would for HALF source.
//
//---------------------------------------------------


#include "ImfDwaCompressor.h"
#include "ImfDwaCompressorSimd.h"

#include "ImfChannelList.h"
#include "ImfStandardAttributes.h"
#include "ImfHeader.h"
#include "ImfHuf.h"
#include "ImfInt64.h"
#include "ImfIntAttribute.h"
#include "ImfIO.h"
#include "ImfMisc.h"
#include "ImfNamespace.h"
#include "ImfRle.h"
#include "ImfSimd.h"
#include "ImfSystemSpecific.h"
#include "ImfXdr.h"
#include "ImfZip.h"

#include "ImathFun.h"
#include "ImathBox.h"
#include "ImathVec.h"
#include "half.h"
#include "halfLimits.h"

#include "dwaLookups.h"

#include <vector>
#include <string>
#include <cctype>
#include <cassert>
#include <algorithm>

// Windows specific addition to prevent the indirect import of the redefined min/max macros
#if defined _WIN32 || defined _WIN64
	#ifdef NOMINMAX
		#undef NOMINMAX
	#endif
	#define NOMINMAX
#endif
#include <zlib.h>


OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_ENTER


namespace {

    //
    // Function pointer to dispatch to an approprate 
    // convertFloatToHalf64_* impl, based on runtime cpu checking.
    // Should be initialized in DwaCompressor::initializeFuncs()
    //

    void (*convertFloatToHalf64)(unsigned short*, float*) =
        convertFloatToHalf64_scalar;

    // 
    // Function pointer for dispatching a fromHalfZigZag_ impl
    //
    
    void (*fromHalfZigZag)(unsigned short*, float*) =
        fromHalfZigZag_scalar;

    //
    // Dispatch the inverse DCT on an 8x8 block, where the last
    // n rows can be all zeros. The n=0 case converts the full block.
    //
    void (*dctInverse8x8_0)(float*) = dctInverse8x8_scalar<0>;
    void (*dctInverse8x8_1)(float*) = dctInverse8x8_scalar<1>;
    void (*dctInverse8x8_2)(float*) = dctInverse8x8_scalar<2>;
    void (*dctInverse8x8_3)(float*) = dctInverse8x8_scalar<3>;
    void (*dctInverse8x8_4)(float*) = dctInverse8x8_scalar<4>;
    void (*dctInverse8x8_5)(float*) = dctInverse8x8_scalar<5>;
    void (*dctInverse8x8_6)(float*) = dctInverse8x8_scalar<6>;
    void (*dctInverse8x8_7)(float*) = dctInverse8x8_scalar<7>;
    
} // namespace


struct DwaCompressor::ChannelData
{
    std::string         name;
    CompressorScheme    compression;  
    int                 xSampling;
    int                 ySampling;
    PixelType           type;
    bool                pLinear;

    int                 width;
    int                 height;

    //
    // Incoming and outgoing data is scanline interleaved, and it's much
    // easier to operate on contiguous data.  Assuming the planare unc
    // buffer is to hold RLE data, we need to rearrange to make bytes
    // adjacent.
    //

    char               *planarUncBuffer;
    char               *planarUncBufferEnd;

    char               *planarUncRle[4];
    char               *planarUncRleEnd[4];

    PixelType           planarUncType;
    int                 planarUncSize;
};


struct DwaCompressor::CscChannelSet
{
    int idx[3];
};


struct DwaCompressor::Classifier
{
    Classifier (std::string suffix,
                CompressorScheme scheme,
                PixelType type,
                int cscIdx,
                bool caseInsensitive):
        _suffix(suffix),
        _scheme(scheme),
        _type(type),
        _cscIdx(cscIdx),
        _caseInsensitive(caseInsensitive)
    {
        if (caseInsensitive) 
            std::transform(_suffix.begin(), _suffix.end(), _suffix.begin(), tolower);
    }

    Classifier (const char *&ptr, int size)
    {
        if (size <= 0) 
            throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
                                " (truncated rule).");
            
        {
            char suffix[Name::SIZE];
            memset (suffix, 0, Name::SIZE);
            Xdr::read<CharPtrIO> (ptr, std::min(size, Name::SIZE-1), suffix);
            _suffix = std::string(suffix);
        }

        if (size < _suffix.length() + 1 + 2*Xdr::size<char>()) 
            throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
                                " (truncated rule).");

        char value;
        Xdr::read<CharPtrIO> (ptr, value);

        _cscIdx = (int)(value >> 4) - 1;
        if (_cscIdx < -1 || _cscIdx >= 3) 
            throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
                                " (corrupt cscIdx rule).");

        _scheme = (CompressorScheme)((value >> 2) & 3);
        if (_scheme < 0 || _scheme >= NUM_COMPRESSOR_SCHEMES) 
            throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
                                " (corrupt scheme rule).");

        _caseInsensitive = (value & 1 ? true : false);

        Xdr::read<CharPtrIO> (ptr, value);
        if (value < 0 || value >= NUM_PIXELTYPES) 
            throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
                                " (corrupt rule).");
        _type = (PixelType)value;
    }

    bool match (const std::string &suffix, const PixelType type) const
    {
        if (_type != type) return false;

        if (_caseInsensitive) 
        {
            std::string tmp(suffix);
            std::transform(tmp.begin(), tmp.end(), tmp.begin(), tolower);
            return tmp == _suffix;
        }

        return suffix == _suffix;
    }

    size_t size () const 
    {
        // string length + \0
        size_t sizeBytes = _suffix.length() + 1;

        // 1 byte for scheme / cscIdx / caseInsensitive, and 1 byte for type
        sizeBytes += 2 * Xdr::size<char>();

        return sizeBytes;
    }

    void write (char *&ptr) const
    {
        Xdr::write<CharPtrIO> (ptr, _suffix.c_str());

        // Encode _cscIdx (-1-3) in the upper 4 bits,
        //        _scheme (0-2)  in the next 2 bits
        //        _caseInsen     in the bottom bit
        unsigned char value = 0;
        value |= ((unsigned char)(_cscIdx+1)      & 15) << 4;
        value |= ((unsigned char)_scheme          &  3) << 2;
        value |=  (unsigned char)_caseInsensitive &  1;

        Xdr::write<CharPtrIO> (ptr, value);
        Xdr::write<CharPtrIO> (ptr, (unsigned char)_type);
    }

    std::string      _suffix;
    CompressorScheme _scheme;
    PixelType        _type;
    int              _cscIdx;
    bool             _caseInsensitive;
};


//
// Base class for the LOSSY_DCT decoder classes
//

class DwaCompressor::LossyDctDecoderBase
{
  public:

    LossyDctDecoderBase
        (char *packedAc,
         char *packedDc,
         const unsigned short *toLinear,
         int width,
         int height);

    virtual ~LossyDctDecoderBase ();

    void execute();

    //
    // These return number of items, not bytes. Each item
    // is an unsigned short
    //

    int numAcValuesEncoded() const { return _packedAcCount; }
    int numDcValuesEncoded() const { return _packedDcCount; }

  protected:

    //
    // Un-RLE the packed AC components into 
    // a half buffer. The half block should 
    // be the full 8x8 block (in zig-zag order
    // still), not the first AC component. 
    //
    // currAcComp is advanced as bytes are decoded.
    //
    // This returns the index of the last non-zero
    // value in the buffer - with the index into zig zag
    // order data. If we return 0, we have DC only data.
    // 

    int unRleAc (unsigned short *&currAcComp,
                 unsigned short  *halfZigBlock); 


    //
    // if NATIVE and XDR are really the same values, we can
    // skip some processing and speed things along
    //

    bool                  _isNativeXdr;


    //
    // Counts of how many items have been packed into the
    // AC and DC buffers
    //

    int                   _packedAcCount;
    int                   _packedDcCount;


    //
    // AC and DC buffers to pack
    //

    char                 *_packedAc;
    char                 *_packedDc;


    // 
    // half -> half LUT to transform from nonlinear to linear
    //

    const unsigned short *_toLinear;


    //
    // image dimensions
    //

    int                   _width;
    int                   _height;


    //
    // Pointers to the start of each scanlines, to be filled on decode
    // Generally, these will be filled by the subclasses.
    //

    std::vector< std::vector<char *> > _rowPtrs;


    // 
    // The type of each data that _rowPtrs[i] is referring. Layout
    // is in the same order as _rowPtrs[].
    //

    std::vector<PixelType>             _type;
    std::vector<SimdAlignedBuffer64f>  _dctData;
};


//
// Used to decode a single channel of LOSSY_DCT data.
//

class DwaCompressor::LossyDctDecoder: public LossyDctDecoderBase
{
  public:

    //
    // toLinear is a half-float LUT to convert the encoded values 
    // back to linear light. If you want to skip this step, pass
    // in NULL here.
    //

    LossyDctDecoder
        (std::vector<char *> &rowPtrs,
         char *packedAc,
         char *packedDc,
         const unsigned short *toLinear,
         int width,
         int height,
         PixelType type)
    :
        LossyDctDecoderBase(packedAc, packedDc, toLinear, width, height)
    {
        _rowPtrs.push_back(rowPtrs);
        _type.push_back(type);
    }

    virtual ~LossyDctDecoder () {}
};


//
// Used to decode 3 channels of LOSSY_DCT data that
// are grouped together and color space converted.
//

class DwaCompressor::LossyDctDecoderCsc: public LossyDctDecoderBase
{
  public:

    //
    // toLinear is a half-float LUT to convert the encoded values 
    // back to linear light. If you want to skip this step, pass
    // in NULL here.
    //

    LossyDctDecoderCsc
        (std::vector<char *> &rowPtrsR,
         std::vector<char *> &rowPtrsG,
         std::vector<char *> &rowPtrsB,
         char *packedAc,
         char *packedDc,
         const unsigned short *toLinear,
         int width,
         int height,
         PixelType typeR,
         PixelType typeG,
         PixelType typeB)
    :
        LossyDctDecoderBase(packedAc, packedDc, toLinear, width, height)
    {
        _rowPtrs.push_back(rowPtrsR);
        _rowPtrs.push_back(rowPtrsG);
        _rowPtrs.push_back(rowPtrsB);
        _type.push_back(typeR);
        _type.push_back(typeG);
        _type.push_back(typeB);
    }

    virtual ~LossyDctDecoderCsc () {}
};


// 
// Base class for encoding using the lossy DCT scheme
//

class DwaCompressor::LossyDctEncoderBase
{
  public:

    LossyDctEncoderBase
        (float quantBaseError,
         char *packedAc,
         char *packedDc,
         const unsigned short *toNonlinear,
         int width,
         int height);

    virtual ~LossyDctEncoderBase ();

    void execute ();

    //
    // These return number of items, not bytes. Each item
    // is an unsigned short
    //

    int     numAcValuesEncoded () const {return _numAcComp;}
    int     numDcValuesEncoded () const {return _numDcComp;}

  protected:

    void    toZigZag (half *dst, half *src);
    int     countSetBits (unsigned short src);
    half    quantize (half src, float errorTolerance);
    void    rleAc (half *block, unsigned short *&acPtr);

    float                      _quantBaseError;

    int                        _width,
                               _height;
    const unsigned short      *_toNonlinear;

    int                        _numAcComp,
                               _numDcComp;

    std::vector< std::vector<const char *> > _rowPtrs;
    std::vector<PixelType>                   _type;
    std::vector<SimdAlignedBuffer64f>        _dctData;


    //
    // Pointers to the buffers where AC and DC
    // DCT components should be packed for 
    // lossless compression downstream
    //

    char                      *_packedAc;
    char                      *_packedDc;


    //
    // Our "quantization tables" - the example JPEG tables, 
    // normalized so that the smallest value in each is 1.0.
    // This gives us a relationship between error in DCT 
    // components
    //

    float                      _quantTableY[64];
    float                      _quantTableCbCr[64];
};


//
// Single channel lossy DCT encoder
//

class DwaCompressor::LossyDctEncoder: public LossyDctEncoderBase
{
  public:

    LossyDctEncoder
        (float quantBaseError,
         std::vector<const char *> &rowPtrs,
         char *packedAc,
         char *packedDc,
         const unsigned short *toNonlinear,
         int width,
         int height,
         PixelType type)
    :
        LossyDctEncoderBase
            (quantBaseError, packedAc, packedDc, toNonlinear, width, height)
    {
        _rowPtrs.push_back(rowPtrs);
        _type.push_back(type);
    }

    virtual ~LossyDctEncoder () {}
};
    

//
// RGB channel lossy DCT encoder
//

class DwaCompressor::LossyDctEncoderCsc: public LossyDctEncoderBase
{
  public:

    LossyDctEncoderCsc
        (float quantBaseError,
         std::vector<const char *> &rowPtrsR,
         std::vector<const char *> &rowPtrsG,
         std::vector<const char *> &rowPtrsB,
         char *packedAc,
         char *packedDc,
         const unsigned short *toNonlinear,
         int width,
         int height,
         PixelType typeR,
         PixelType typeG,
         PixelType typeB)
    :
        LossyDctEncoderBase
            (quantBaseError, packedAc, packedDc, toNonlinear, width, height)
    {
        _type.push_back(typeR);
        _type.push_back(typeG);
        _type.push_back(typeB);

        _rowPtrs.push_back(rowPtrsR);
        _rowPtrs.push_back(rowPtrsG);
        _rowPtrs.push_back(rowPtrsB);
    }

    virtual ~LossyDctEncoderCsc () {}
};


// ==============================================================
//
//                     LossyDctDecoderBase
//
// --------------------------------------------------------------

DwaCompressor::LossyDctDecoderBase::LossyDctDecoderBase
    (char *packedAc,
     char *packedDc,
     const unsigned short *toLinear,
     int width,
     int height)
:
    _isNativeXdr(false),
    _packedAcCount(0),
    _packedDcCount(0),
    _packedAc(packedAc),
    _packedDc(packedDc),
    _toLinear(toLinear),
    _width(width),
    _height(height)
{
    if (_toLinear == 0)
        _toLinear = get_dwaCompressorNoOp();

    _isNativeXdr = GLOBAL_SYSTEM_LITTLE_ENDIAN;
}


DwaCompressor::LossyDctDecoderBase::~LossyDctDecoderBase () {}


void
DwaCompressor::LossyDctDecoderBase::execute ()
{
    int numComp        = _rowPtrs.size();
    int lastNonZero    = 0;
    int numBlocksX     = (int) ceil ((float)_width  / 8.0f);
    int numBlocksY     = (int) ceil ((float)_height / 8.0f);
    int leftoverX      = _width  - (numBlocksX-1) * 8;
    int leftoverY      = _height - (numBlocksY-1) * 8;

    int numFullBlocksX = (int)floor ((float)_width / 8.0f);

    unsigned short tmpShortNative = 0;
    unsigned short tmpShortXdr    = 0;
    const char *tmpConstCharPtr   = 0;

    unsigned short                    *currAcComp = (unsigned short *)_packedAc;
    std::vector<unsigned short *>      currDcComp (_rowPtrs.size());
    std::vector<SimdAlignedBuffer64us> halfZigBlock (_rowPtrs.size());

    if (_type.size() != _rowPtrs.size())
        throw IEX_NAMESPACE::BaseExc ("Row pointers and types mismatch in count");

    if ((_rowPtrs.size() != 3) && (_rowPtrs.size() != 1))
        throw IEX_NAMESPACE::NoImplExc ("Only 1 and 3 channel encoding is supported");

    _dctData.resize(numComp);

    //
    // Allocate a temp aligned buffer to hold a rows worth of full 
    // 8x8 half-float blocks
    //

    unsigned char *rowBlockHandle = new unsigned char
        [numComp * numBlocksX * 64 * sizeof(unsigned short) + _SSE_ALIGNMENT];

    unsigned short *rowBlock[3];

    rowBlock[0] = (unsigned short*)rowBlockHandle;

    for (int i = 0; i < _SSE_ALIGNMENT; ++i)
    {
        if (((size_t)(rowBlockHandle + i) & _SSE_ALIGNMENT_MASK) == 0)
            rowBlock[0] = (unsigned short *)(rowBlockHandle + i);
    }

    for (int comp = 1; comp < numComp; ++comp)
        rowBlock[comp] = rowBlock[comp - 1] + numBlocksX * 64;
 
    //
    // Pack DC components together by common plane, so we can get 
    // a little more out of differencing them. We'll always have 
    // one component per block, so we can computed offsets.
    //

    currDcComp[0] = (unsigned short *)_packedDc;

    for (unsigned int comp = 1; comp < numComp; ++comp)
        currDcComp[comp] = currDcComp[comp - 1] + numBlocksX * numBlocksY;

    for (int blocky = 0; blocky < numBlocksY; ++blocky)
    {
        int maxY = 8;

        if (blocky == numBlocksY-1)
            maxY = leftoverY;

        int maxX = 8;

        for (int blockx = 0; blockx < numBlocksX; ++blockx)
        {
            if (blockx == numBlocksX-1)
                maxX = leftoverX;

            //
            // If we can detect that the block is constant values
            // (all components only have DC values, and all AC is 0),
            // we can do everything only on 1 value, instead of all
            // 64. 
            //
            // This won't really help for regular images, but it is
            // meant more for layers with large swaths of black 
            //

            bool blockIsConstant = true;

            for (unsigned int comp = 0; comp < numComp; ++comp)
            {

                //
                // DC component is stored separately
                //

                #ifdef IMF_HAVE_SSE2
                    {
                        __m128i *dst = (__m128i*)halfZigBlock[comp]._buffer;

                        dst[7] = _mm_setzero_si128();
                        dst[6] = _mm_setzero_si128();
                        dst[5] = _mm_setzero_si128();
                        dst[4] = _mm_setzero_si128();
                        dst[3] = _mm_setzero_si128();
                        dst[2] = _mm_setzero_si128();
                        dst[1] = _mm_setzero_si128();
                        dst[0] = _mm_insert_epi16
                            (_mm_setzero_si128(), *currDcComp[comp]++, 0);
                    }
                #else  /* IMF_HAVE_SSE2 */

                    memset (halfZigBlock[comp]._buffer, 0, 64 * 2);
                    halfZigBlock[comp]._buffer[0] = *currDcComp[comp]++;

                #endif /* IMF_HAVE_SSE2 */

                _packedDcCount++;
                
                //
                // UnRLE the AC. This will modify currAcComp
                //

                lastNonZero = unRleAc (currAcComp, halfZigBlock[comp]._buffer);

                //
                // Convert from XDR to NATIVE
                //

                if (!_isNativeXdr)
                {
                    for (int i = 0; i < 64; ++i)
                    {
                        tmpShortXdr      = halfZigBlock[comp]._buffer[i];
                        tmpConstCharPtr  = (const char *)&tmpShortXdr;

                        Xdr::read<CharPtrIO> (tmpConstCharPtr, tmpShortNative);

                        halfZigBlock[comp]._buffer[i] = tmpShortNative;
                    }
                }

                if (lastNonZero == 0)
                {
                    //
                    // DC only case - AC components are all 0   
                    //

                    half h;

                    h.setBits (halfZigBlock[comp]._buffer[0]);
                    _dctData[comp]._buffer[0] = (float)h;

                    dctInverse8x8DcOnly (_dctData[comp]._buffer);
                }
                else
                {
                    //
                    // We have some AC components that are non-zero. 
                    // Can't use the 'constant block' optimization
                    //

                    blockIsConstant = false;

                    //
                    // Un-Zig zag 
                    //

                    (*fromHalfZigZag)
                        (halfZigBlock[comp]._buffer, _dctData[comp]._buffer);

                    //
                    // Zig-Zag indices in normal layout are as follows:
                    //
                    // 0   1   5   6   14  15  27  28
                    // 2   4   7   13  16  26  29  42
                    // 3   8   12  17  25  30  41  43
                    // 9   11  18  24  31  40  44  53
                    // 10  19  23  32  39  45  52  54
                    // 20  22  33  38  46  51  55  60
                    // 21  34  37  47  50  56  59  61
                    // 35  36  48  49  57  58  62  63
                    //
                    // If lastNonZero is less than the first item on
                    // each row, we know that the whole row is zero and 
                    // can be skipped in the row-oriented part of the
                    // iDCT. 
                    //
                    // The unrolled logic here is:
                    //
                    //    if lastNonZero < rowStartIdx[i],
                    //    zeroedRows = rowsEmpty[i]
                    //
                    // where:
                    //
                    //    const int rowStartIdx[] = {2, 3, 9, 10, 20, 21, 35};
                    //    const int rowsEmpty[]   = {7, 6, 5,  4,  3,  2,  1};
                    //

                    if (lastNonZero < 2)
                        dctInverse8x8_7(_dctData[comp]._buffer);
                    else if (lastNonZero < 3)
                        dctInverse8x8_6(_dctData[comp]._buffer);
                    else if (lastNonZero < 9)
                        dctInverse8x8_5(_dctData[comp]._buffer);
                    else if (lastNonZero < 10)
                        dctInverse8x8_4(_dctData[comp]._buffer);
                    else if (lastNonZero < 20)
                        dctInverse8x8_3(_dctData[comp]._buffer);
                    else if (lastNonZero < 21)
                        dctInverse8x8_2(_dctData[comp]._buffer);
                    else if (lastNonZero < 35)
                        dctInverse8x8_1(_dctData[comp]._buffer);
                    else
                        dctInverse8x8_0(_dctData[comp]._buffer);
                }
            }

            //
            // Perform the CSC
            //

            if (numComp == 3)
            {
                if (!blockIsConstant)
                {
                    csc709Inverse64 (_dctData[0]._buffer, 
                                     _dctData[1]._buffer, 
                                     _dctData[2]._buffer);

                }
                else
                {
                    csc709Inverse (_dctData[0]._buffer[0], 
                                   _dctData[1]._buffer[0], 
                                   _dctData[2]._buffer[0]);
                }
            }

            //
            // Float -> Half conversion. 
            //
            // If the block has a constant value, just convert the first pixel.
            //

            for (unsigned int comp = 0; comp < numComp; ++comp)
            {
                if (!blockIsConstant)
                {
                    (*convertFloatToHalf64)
                        (&rowBlock[comp][blockx*64], _dctData[comp]._buffer);
                }
                else
                {
                    #ifdef IMF_HAVE_SSE2

                        __m128i *dst = (__m128i*)&rowBlock[comp][blockx*64];

                        dst[0] = _mm_set1_epi16
                            (((half)_dctData[comp]._buffer[0]).bits());

                        dst[1] = dst[0];
                        dst[2] = dst[0];
                        dst[3] = dst[0];
                        dst[4] = dst[0];
                        dst[5] = dst[0];
                        dst[6] = dst[0];
                        dst[7] = dst[0];

                    #else  /* IMF_HAVE_SSE2 */

                        unsigned short *dst = &rowBlock[comp][blockx*64];

                        dst[0] = ((half)_dctData[comp]._buffer[0]).bits();

                        for (int i = 1; i < 64; ++i)
                        {
                            dst[i] = dst[0];
                        }

                    #endif /* IMF_HAVE_SSE2 */
                } // blockIsConstant
            } // comp
        } // blockx

        //
        // At this point, we have half-float nonlinear value blocked
        // in rowBlock[][]. We need to unblock the data, transfer
        // back to linear, and write the results in the _rowPtrs[].
        //
        // There is a fast-path for aligned rows, which helps
        // things a little. Since this fast path is only valid
        // for full 8-element wide blocks, the partial x blocks
        // are broken into a separate loop below.
        //
        // At the moment, the fast path requires:
        //   * sse support
        //   * aligned row pointers
        //   * full 8-element wide blocks
        //

        for (int comp = 0; comp < numComp; ++comp)
        {
            //
            // Test if we can use the fast path
            //

        #ifdef IMF_HAVE_SSE2

            bool fastPath = true;

            for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
            {
                if ((size_t)_rowPtrs[comp][y] & _SSE_ALIGNMENT_MASK)
                    fastPath = false;
            }

            if (fastPath)
            {
                //
                // Handle all the full X blocks, in a fast path with sse2 and
                // aligned row pointers
                //

                for (int y=8*blocky; y<8*blocky+maxY; ++y)
                {
                    __m128i *dst = (__m128i *)_rowPtrs[comp][y];
                    __m128i *src = (__m128i *)&rowBlock[comp][(y & 0x7) * 8];


                    for (int blockx = 0; blockx < numFullBlocksX; ++blockx)
                    {
                        //
                        // These may need some twiddling.
                        // Run with multiples of 8
                        //

                        _mm_prefetch ((char *)(src + 16), _MM_HINT_NTA); 

                        unsigned short i0  = _mm_extract_epi16 (*src, 0);
                        unsigned short i1  = _mm_extract_epi16 (*src, 1);
                        unsigned short i2  = _mm_extract_epi16 (*src, 2);
                        unsigned short i3  = _mm_extract_epi16 (*src, 3);

                        unsigned short i4  = _mm_extract_epi16 (*src, 4);
                        unsigned short i5  = _mm_extract_epi16 (*src, 5);
                        unsigned short i6  = _mm_extract_epi16 (*src, 6);
                        unsigned short i7  = _mm_extract_epi16 (*src, 7);

                        i0 = _toLinear[i0];
                        i1 = _toLinear[i1];
                        i2 = _toLinear[i2];
                        i3 = _toLinear[i3];

                        i4 = _toLinear[i4];
                        i5 = _toLinear[i5];
                        i6 = _toLinear[i6];
                        i7 = _toLinear[i7];

                        *dst = _mm_insert_epi16 (_mm_setzero_si128(), i0, 0);
                        *dst = _mm_insert_epi16 (*dst, i1, 1);
                        *dst = _mm_insert_epi16 (*dst, i2, 2);
                        *dst = _mm_insert_epi16 (*dst, i3, 3);

                        *dst = _mm_insert_epi16 (*dst, i4, 4);
                        *dst = _mm_insert_epi16 (*dst, i5, 5);
                        *dst = _mm_insert_epi16 (*dst, i6, 6);
                        *dst = _mm_insert_epi16 (*dst, i7, 7);

                        src += 8;
                        dst++;
                    }
                }
            }
            else
            {

        #endif /* IMF_HAVE_SSE2 */

                //
                // Basic scalar kinda slow path for handling the full X blocks
                //

                for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
                {
                    unsigned short *dst = (unsigned short *)_rowPtrs[comp][y];

                    for (int blockx = 0; blockx < numFullBlocksX; ++blockx)
                    {
                        unsigned short *src =
                            &rowBlock[comp][blockx * 64 + ((y & 0x7) * 8)];

                        dst[0] = _toLinear[src[0]];
                        dst[1] = _toLinear[src[1]];
                        dst[2] = _toLinear[src[2]];
                        dst[3] = _toLinear[src[3]];

                        dst[4] = _toLinear[src[4]];
                        dst[5] = _toLinear[src[5]];
                        dst[6] = _toLinear[src[6]];
                        dst[7] = _toLinear[src[7]];

                        dst += 8;
                    }
                }

        #ifdef IMF_HAVE_SSE2

            }

        #endif /* IMF_HAVE_SSE2 */

            //
            // If we have partial X blocks, deal with all those now
            // Since this should be minimal work, there currently
            // is only one path that should work for everyone.
            //

            if (numFullBlocksX != numBlocksX)
            {
                for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
                {
                    unsigned short *src = (unsigned short *)
                        &rowBlock[comp][numFullBlocksX * 64 + ((y & 0x7) * 8)];

                    unsigned short *dst = (unsigned short *)_rowPtrs[comp][y];

                    dst += 8 * numFullBlocksX;

                    for (int x = 0; x < maxX; ++x)
                    {
                        *dst++ = _toLinear[*src++];
                    }
                }
            }
        } // comp
    } // blocky

    //
    // Walk over all the channels that are of type FLOAT.
    // Convert from HALF XDR back to FLOAT XDR.
    //

    for (unsigned int chan = 0; chan < numComp; ++chan)
    {

        if (_type[chan] != FLOAT)
            continue;

        std::vector<unsigned short> halfXdr (_width);

        for (int y=0; y<_height; ++y)
        {
            char *floatXdrPtr = _rowPtrs[chan][y];

            memcpy(&halfXdr[0], floatXdrPtr, _width*sizeof(unsigned short));

            const char *halfXdrPtr = (const char *)(&halfXdr[0]);

            for (int x=0; x<_width; ++x)
            {
                half tmpHalf;

                Xdr::read<CharPtrIO> (halfXdrPtr, tmpHalf);
                Xdr::write<CharPtrIO> (floatXdrPtr, (float)tmpHalf);

                // 
                // Xdr::write and Xdr::read will advance the ptrs
                //
            }
        }
    }

    delete[] rowBlockHandle;
}


//
// Un-RLE the packed AC components into 
// a half buffer. The half block should 
// be the full 8x8 block (in zig-zag order
// still), not the first AC component. 
//
// currAcComp is advanced as bytes are decoded.
//
// This returns the index of the last non-zero
// value in the buffer - with the index into zig zag
// order data. If we return 0, we have DC only data.
// 
// This is assuminging that halfZigBlock is zero'ed
// prior to calling
//

int 
DwaCompressor::LossyDctDecoderBase::unRleAc
    (unsigned short *&currAcComp,
     unsigned short  *halfZigBlock) 
{
    //
    // Un-RLE the RLE'd blocks. If we find an item whose
    // high byte is 0xff, then insert the number of 0's
    // as indicated by the low byte.
    //
    // Otherwise, just copy the number verbaitm.
    //

    int lastNonZero          = 0;
    int dctComp              = 1; 

    //
    // Start with a zero'ed block, so we don't have to
    // write when we hit a run symbol
    //

    while (dctComp < 64)
    {
        if (*currAcComp == 0xff00)
        {
            // 
            // End of block
            //

            dctComp = 64;

        }
        else if ((*currAcComp) >> 8 == 0xff)
        {
            //
            // Run detected! Insert 0's.
            //
            // Since the block has been zeroed, just advance the ptr
            // 

            dctComp += (*currAcComp) & 0xff; 
        }
        else
        {
            // 
            // Not a run, just copy over the value
            //

            lastNonZero = dctComp;
            halfZigBlock[dctComp] = *currAcComp;

            dctComp++;
        }

        _packedAcCount++;
        currAcComp++;
    }

    return lastNonZero;
}


// ==============================================================
//
//                     LossyDctEncoderBase
//
// --------------------------------------------------------------

DwaCompressor::LossyDctEncoderBase::LossyDctEncoderBase
    (float quantBaseError,
     char *packedAc,
     char *packedDc,
     const unsigned short *toNonlinear,
     int width,
     int height)
:
    _quantBaseError(quantBaseError),
    _width(width),
    _height(height),
    _toNonlinear(toNonlinear),
    _numAcComp(0),
    _numDcComp(0),
    _packedAc(packedAc),
    _packedDc(packedDc)
{
    //
    // Here, we take the generic JPEG quantization tables and
    // normalize them by the smallest component in each table.
    // This gives us a relationship amongst the DCT components,
    // in terms of how sensitive each component is to
    // error.
    //
    // A higher normalized value means we can quantize more,
    // and a small normalized value means we can quantize less.
    //
    // Eventually, we will want an acceptable quantization
    // error range for each component. We find this by
    // multiplying some user-specified level (_quantBaseError)
    // by the normalized table (_quantTableY, _quantTableCbCr) to
    // find the acceptable quantization error range.
    //
    // The quantization table is not needed for decoding, and
    // is not transmitted. So, if you want to get really fancy,
    // you could derive some content-dependent quantization
    // table, and the decoder would not need to be changed. But,
    // for now, we'll just use statice quantization tables.
    //

    int jpegQuantTableY[] =
    {
        16,  11,  10,  16,   24,   40,   51,   61,
        12,  12,  14,  19,   26,   58,   60,   55,
        14,  13,  16,  24,   40,   57,   69,   56,
        14,  17,  22,  29,   51,   87,   80,   62,
        18,  22,  37,  56,   68,  109,  103,   77,
        24,  35,  55,  64,   81,  104,  113,   92,
        49,  64,  78,  87,  103,  121,  120,  101,
        72,  92,  95,  98,  112,  100,  103,   99
    };

    int jpegQuantTableYMin = 10;

    int jpegQuantTableCbCr[] =
    {
        17,  18,  24,  47,  99,  99,  99,  99,
        18,  21,  26,  66,  99,  99,  99,  99,
        24,  26,  56,  99,  99,  99,  99,  99,
        47,  66,  99,  99,  99,  99,  99,  99,
        99,  99,  99,  99,  99,  99,  99,  99,
        99,  99,  99,  99,  99,  99,  99,  99,
        99,  99,  99,  99,  99,  99,  99,  99,
        99,  99,  99,  99,  99,  99,  99,  99
    };

    int jpegQuantTableCbCrMin = 17;

    for (int idx = 0; idx < 64; ++idx)
    {
        _quantTableY[idx] = static_cast<float> (jpegQuantTableY[idx]) /
                            static_cast<float> (jpegQuantTableYMin);

        _quantTableCbCr[idx] = static_cast<float> (jpegQuantTableCbCr[idx]) /
                               static_cast<float> (jpegQuantTableCbCrMin);
    }
    
    if (_quantBaseError < 0)
        quantBaseError = 0;
}


DwaCompressor::LossyDctEncoderBase::~LossyDctEncoderBase () 
{
}


//
// Given three channels of source data, encoding by first applying
// a color space conversion to a YCbCr space.  Otherwise, if we only
// have one channel, just encode it as is. 
//
// Other numbers of channels are somewhat unexpected at this point,
// and will throw an exception.
//

void
DwaCompressor::LossyDctEncoderBase::execute ()
{
    int  numBlocksX   = (int)ceil ((float)_width / 8.0f);
    int  numBlocksY   = (int)ceil ((float)_height/ 8.0f);

    half halfZigCoef[64]; 
    half halfCoef[64];

    std::vector<unsigned short *> currDcComp (_rowPtrs.size());
    unsigned short               *currAcComp = (unsigned short *)_packedAc;

    _dctData.resize (_rowPtrs.size());
    _numAcComp = 0;
    _numDcComp = 0;
 
    assert (_type.size() == _rowPtrs.size());
    assert ((_rowPtrs.size() == 3) || (_rowPtrs.size() == 1));

    // 
    // Allocate a temp half buffer to quantize into for
    // any FLOAT source channels.
    //

    int tmpHalfBufferElements = 0;

    for (unsigned int chan = 0; chan < _rowPtrs.size(); ++chan)
        if (_type[chan] == FLOAT)
            tmpHalfBufferElements += _width * _height;

    std::vector<unsigned short> tmpHalfBuffer (tmpHalfBufferElements);

    char *tmpHalfBufferPtr = 0;

    if (tmpHalfBufferElements)
        tmpHalfBufferPtr = (char *)&tmpHalfBuffer[0];

    //
    // Run over all the float scanlines, quantizing, 
    // and re-assigning _rowPtr[y]. We need to translate
    // FLOAT XDR to HALF XDR.
    //

    for (unsigned int chan = 0; chan < _rowPtrs.size(); ++chan)
    {
        if (_type[chan] != FLOAT)
            continue;
    
        for (int y = 0; y < _height; ++y)
        {
            float       src = 0;
            const char *srcXdr = _rowPtrs[chan][y];
            char       *dstXdr = tmpHalfBufferPtr;
           
            for (int x = 0; x < _width; ++x)
            {

                Xdr::read<CharPtrIO> (srcXdr, src);

                //
                // Clamp to half ranges, instead of just casting. This
                // avoids introducing Infs which end up getting zeroed later
                //
                src = std::max (
                    std::min ((float) std::numeric_limits<half>::max(), src),
                              (float)-std::numeric_limits<half>::max());

                Xdr::write<CharPtrIO> (dstXdr, ((half)src).bits());

                //
                // Xdr::read and Xdr::write will advance the ptr
                //
            }

            _rowPtrs[chan][y] = (const char *)tmpHalfBufferPtr;
            tmpHalfBufferPtr += _width * sizeof (unsigned short);
        }
    }

    //
    // Pack DC components together by common plane, so we can get 
    // a little more out of differencing them. We'll always have 
    // one component per block, so we can computed offsets.
    //

    currDcComp[0] = (unsigned short *)_packedDc;

    for (unsigned int chan = 1; chan < _rowPtrs.size(); ++chan)
        currDcComp[chan] = currDcComp[chan-1] + numBlocksX * numBlocksY;

    for (int blocky = 0; blocky < numBlocksY; ++blocky)
    {
        for (int blockx = 0; blockx < numBlocksX; ++blockx)
        {
            half           h;
            unsigned short tmpShortXdr, tmpShortNative;
            char          *tmpCharPtr;

            for (unsigned int chan = 0; chan < _rowPtrs.size(); ++chan)
            {
                //
                // Break the source into 8x8 blocks. If we don't
                // fit at the edges, mirror.
                //
                // Also, convert from linear to nonlinear representation.
                // Our source is assumed to be XDR, and we need to convert
                // to NATIVE prior to converting to float.
                //
                // If we're converting linear -> nonlinear, assume that the
                // XDR -> NATIVE conversion is built into the lookup. Otherwise,
                // we'll need to explicitly do it.
                //

                for (int y = 0; y < 8; ++y)
                {
                    for (int x = 0; x < 8; ++x)
                    {
                        int vx = 8 * blockx + x;
                        int vy = 8 * blocky + y;

                        if (vx >= _width)
                            vx = _width - (vx - (_width - 1));
                        
                        if (vx < 0) vx = _width-1;

                        if (vy >=_height)
                            vy = _height - (vy - (_height - 1));

                        if (vy < 0) vy = _height-1;
                    
                        tmpShortXdr =
                            ((const unsigned short *)(_rowPtrs[chan])[vy])[vx];

                        if (_toNonlinear)
                        {
                            h.setBits (_toNonlinear[tmpShortXdr]);
                        }
                        else
                        {
                            const char *tmpConstCharPtr =
                                (const char *)(&tmpShortXdr);

                            Xdr::read<CharPtrIO>
                                (tmpConstCharPtr, tmpShortNative);

                            h.setBits(tmpShortNative);
                        }

                        _dctData[chan]._buffer[y * 8 + x] = (float)h;
                    } // x
                } // y
            } // chan

            //
            // Color space conversion
            //

            if (_rowPtrs.size() == 3)
            {
                csc709Forward64 (_dctData[0]._buffer, 
                                 _dctData[1]._buffer, 
                                 _dctData[2]._buffer);
            }

            for (unsigned int chan = 0; chan < _rowPtrs.size(); ++chan)
            {
                //
                // Forward DCT
                //

                dctForward8x8(_dctData[chan]._buffer);

                //
                // Quantize to half, and zigzag
                //

                if (chan == 0)
                {
                    for (int i = 0; i < 64; ++i)
                    {
                        halfCoef[i] =
                            quantize ((half)_dctData[chan]._buffer[i],
                                      _quantBaseError*_quantTableY[i]);
                    }
                }
                else
                {
                    for (int i = 0; i < 64; ++i)
                    {
                        halfCoef[i] =
                            quantize ((half)_dctData[chan]._buffer[i],
                                      _quantBaseError*_quantTableCbCr[i]);
                    }
                }

                toZigZag (halfZigCoef, halfCoef);
                
                //
                // Convert from NATIVE back to XDR, before we write out
                //

                for (int i = 0; i < 64; ++i)
                {
                    tmpCharPtr = (char *)&tmpShortXdr;
                    Xdr::write<CharPtrIO>(tmpCharPtr, halfZigCoef[i].bits());
                    halfZigCoef[i].setBits(tmpShortXdr);
                }

                //
                // Save the DC component separately, to be compressed on
                // its own.
                //

                *currDcComp[chan]++ = halfZigCoef[0].bits();
                _numDcComp++;
                
                //
                // Then RLE the AC components (which will record the count
                // of the resulting number of items)
                //

                rleAc (halfZigCoef, currAcComp);
            } // chan
        } // blockx
    } // blocky              
}


// 
// Reorder from zig-zag order to normal ordering
//

void 
DwaCompressor::LossyDctEncoderBase::toZigZag (half *dst, half *src) 
{
    const int remap[] =
    {
         0, 
         1,  8,
        16,  9,  2,
         3, 10, 17, 24,
        32, 25, 18, 11, 4,
         5, 12, 19, 26, 33, 40,
        48, 41, 34, 27, 20, 13, 6,
         7, 14, 21, 28, 35, 42, 49, 56,
            57, 50, 43, 36, 29, 22, 15,
                23, 30, 37, 44, 51, 58,
                    59, 52, 45, 38, 31,
                        39, 46, 53, 60,
                            61, 54, 47,
                                55, 62,
                                    63
    };

    for (int i=0; i<64; ++i)
        dst[i] = src[remap[i]];
}


//
// Precomputing the bit count runs faster than using
// the builtin instruction, at least in one case..
//
// Precomputing 8-bits is no slower than 16-bits,
// and saves a fair bit of overhead..
//

int
DwaCompressor::LossyDctEncoderBase::countSetBits (unsigned short src)
{
    static const unsigned short numBitsSet[256] =
    {
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
        4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
    };

    return numBitsSet[src & 0xff] + numBitsSet[src >> 8];
}


//
// Take a DCT coefficient, as well as an acceptable error. Search
// nearby values within the error tolerance, that have fewer 
// bits set.
//
// The list of candidates has been pre-computed and sorted 
// in order of increasing numbers of bits set. This way, we
// can stop searching as soon as we find a candidate that
// is within the error tolerance.
//

half
DwaCompressor::LossyDctEncoderBase::quantize (half src, float errorTolerance)
{
    half            tmp;
    float           srcFloat      = (float)src;
    int             numSetBits    = countSetBits(src.bits());
    const unsigned short *closest = get_dwaClosest(src.bits());

    for (int targetNumSetBits = numSetBits - 1;
         targetNumSetBits >= 0;
         --targetNumSetBits)
    {
        tmp.setBits (*closest);

        if (fabs ((float)tmp - srcFloat) < errorTolerance)
            return tmp;

        closest++;
    }

    return src;
}


//
// RLE the zig-zag of the AC components + copy over 
// into another tmp buffer
//
// Try to do a simple RLE scheme to reduce run's of 0's. This
// differs from the jpeg EOB case, since EOB just indicates that
// the rest of the block is zero. In our case, we have lots of
// NaN symbols, which shouldn't be allowed to occur in DCT 
// coefficents - so we'll use them for encoding runs.
//
// If the high byte is 0xff, then we have a run of 0's, of length
// given by the low byte. For example, 0xff03 would be a run
// of 3 0's, starting at the current location.
//
// block is our block of 64 coefficients
// acPtr a pointer to back the RLE'd values into.
//
// This will advance the counter, _numAcComp.
//

void
DwaCompressor::LossyDctEncoderBase::rleAc
    (half *block,
     unsigned short *&acPtr)
{
    int dctComp              = 1; 
    unsigned short rleSymbol = 0x0;

    while (dctComp < 64)
    {
        int runLen = 1;
    
        //
        // If we don't have a 0, output verbatim
        //

        if (block[dctComp].bits() != rleSymbol)
        {
            *acPtr++ =  block[dctComp].bits();
            _numAcComp++;

            dctComp += runLen;
            continue;
        }

        //
        // We're sitting on a 0, so see how big the run is.
        //

        while ((dctComp+runLen < 64) && 
               (block[dctComp+runLen].bits() == rleSymbol))
        {
            runLen++;
        }

        //
        // If the run len is too small, just output verbatim
        // otherwise output our run token
        //
        // Originally, we wouldn't have a separate symbol for
        // "end of block". But in some experimentation, it looks
        // like using 0xff00 for "end of block" can save a bit
        // of space. 
        //

        if (runLen == 1)
        {
            runLen           = 1;
            *acPtr++ = block[dctComp].bits();
            _numAcComp++;

            //
            // Using 0xff00 for "end of block"
            //
        }
        else if (runLen + dctComp == 64)
        {
            //
            // Signal EOB
            //

            *acPtr++ = 0xff00;
            _numAcComp++;
        }
        else
        {
            // 
            // Signal normal run
            //

            *acPtr++   = 0xff00 | runLen;
            _numAcComp++;
        }

        //
        // Advance by runLen
        //

        dctComp += runLen;
    }
}


// ==============================================================
//
//                     DwaCompressor
//
// --------------------------------------------------------------

// 
// DwaCompressor()
//

DwaCompressor::DwaCompressor
    (const Header &hdr,
     int maxScanLineSize,
     int numScanLines,
     AcCompression acCompression)
:
    Compressor(hdr),
    _acCompression(acCompression),
    _maxScanLineSize(maxScanLineSize),
    _numScanLines(numScanLines),
    _channels(hdr.channels()),
    _packedAcBuffer(0),
    _packedAcBufferSize(0),
    _packedDcBuffer(0),
    _packedDcBufferSize(0),
    _rleBuffer(0),
    _rleBufferSize(0),
    _outBuffer(0),
    _outBufferSize(0),
    _zip(0),
    _dwaCompressionLevel(45.0)
{
    _min[0] = hdr.dataWindow().min.x;
    _min[1] = hdr.dataWindow().min.y;
    _max[0] = hdr.dataWindow().max.x;
    _max[1] = hdr.dataWindow().max.y;

    for (int i=0; i < NUM_COMPRESSOR_SCHEMES; ++i) 
    {
        _planarUncBuffer[i] = 0;
        _planarUncBufferSize[i] = 0;
    }
    
    //
    // Check the header for a quality attribute
    //

    if (hasDwaCompressionLevel (hdr))
        _dwaCompressionLevel = dwaCompressionLevel (hdr);
}


DwaCompressor::~DwaCompressor()
{
    delete[] _packedAcBuffer;
    delete[] _packedDcBuffer;
    delete[] _rleBuffer;
    delete[] _outBuffer;
    delete _zip;

    for (int i=0; i<NUM_COMPRESSOR_SCHEMES; ++i)
        delete[] _planarUncBuffer[i];
}


int
DwaCompressor::numScanLines() const
{
    return _numScanLines;
}


OPENEXR_IMF_NAMESPACE::Compressor::Format 
DwaCompressor::format() const
{
    if (GLOBAL_SYSTEM_LITTLE_ENDIAN)
        return NATIVE;
    else
        return XDR;
}


int
DwaCompressor::compress
    (const char *inPtr,
     int inSize,
     int minY,
     const char *&outPtr)
{
    return compress
        (inPtr,
         inSize, 
         IMATH_NAMESPACE::Box2i (IMATH_NAMESPACE::V2i (_min[0], minY),
                                 IMATH_NAMESPACE::V2i (_max[0], minY + numScanLines() - 1)),
         outPtr);
}


int
DwaCompressor::compressTile
    (const char             *inPtr,
     int                    inSize,
     IMATH_NAMESPACE::Box2i range,
     const char             *&outPtr)
{
    return compress (inPtr, inSize, range, outPtr);
}


int 
DwaCompressor::compress
    (const char             *inPtr,
     int                    inSize,
     IMATH_NAMESPACE::Box2i range,
     const char             *&outPtr)
{
    const char *inDataPtr   = inPtr;
    char       *packedAcEnd = 0;
    char       *packedDcEnd = 0; 
    int         fileVersion = 2;   // Starting with 2, we write the channel
                                   // classification rules into the file

    if (fileVersion < 2) 
        initializeLegacyChannelRules();
    else 
        initializeDefaultChannelRules();

    size_t outBufferSize = 0;
    initializeBuffers(outBufferSize);

    unsigned short          channelRuleSize = 0;
    std::vector<Classifier> channelRules;
    if (fileVersion >= 2) 
    {
        relevantChannelRules(channelRules);

        channelRuleSize = Xdr::size<unsigned short>();
        for (size_t i = 0; i < channelRules.size(); ++i) 
            channelRuleSize += channelRules[i].size();
    }

    //
    // Remember to allocate _outBuffer, if we haven't done so already.
    //

    outBufferSize += channelRuleSize;
    if (outBufferSize > _outBufferSize) 
    {
        _outBufferSize = outBufferSize;
        if (_outBuffer != 0)
            delete[] _outBuffer;       
        _outBuffer = new char[outBufferSize];
    }

    char *outDataPtr = &_outBuffer[NUM_SIZES_SINGLE * sizeof(OPENEXR_IMF_NAMESPACE::Int64) +
                                   channelRuleSize];

    //
    // We might not be dealing with any color data, in which
    // case the AC buffer size will be 0, and deferencing
    // a vector will not be a good thing to do.
    //

    if (_packedAcBuffer)
        packedAcEnd = _packedAcBuffer;

    if (_packedDcBuffer)
        packedDcEnd = _packedDcBuffer;

    #define OBIDX(x) (Int64 *)&_outBuffer[x * sizeof (Int64)]

    Int64 *version                 = OBIDX (VERSION);
    Int64 *unknownUncompressedSize = OBIDX (UNKNOWN_UNCOMPRESSED_SIZE);
    Int64 *unknownCompressedSize   = OBIDX (UNKNOWN_COMPRESSED_SIZE);
    Int64 *acCompressedSize        = OBIDX (AC_COMPRESSED_SIZE);
    Int64 *dcCompressedSize        = OBIDX (DC_COMPRESSED_SIZE);
    Int64 *rleCompressedSize       = OBIDX (RLE_COMPRESSED_SIZE);
    Int64 *rleUncompressedSize     = OBIDX (RLE_UNCOMPRESSED_SIZE);
    Int64 *rleRawSize              = OBIDX (RLE_RAW_SIZE);

    Int64 *totalAcUncompressedCount = OBIDX (AC_UNCOMPRESSED_COUNT);
    Int64 *totalDcUncompressedCount = OBIDX (DC_UNCOMPRESSED_COUNT);

    Int64 *acCompression            = OBIDX (AC_COMPRESSION);

    int minX   = range.min.x;
    int maxX   = std::min(range.max.x, _max[0]);
    int minY   = range.min.y;
    int maxY   = std::min(range.max.y, _max[1]);

    //
    // Zero all the numbers in the chunk header
    //

    memset (_outBuffer, 0, NUM_SIZES_SINGLE * sizeof (Int64));

    //
    // Setup the AC compression strategy and the version in the data block,
    // then write the relevant channel classification rules if needed
    //
    *version       = fileVersion;  
    *acCompression = _acCompression;

    setupChannelData (minX, minY, maxX, maxY);

    if (fileVersion >= 2) 
    {
        char *writePtr = &_outBuffer[NUM_SIZES_SINGLE * sizeof(OPENEXR_IMF_NAMESPACE::Int64)];
        Xdr::write<CharPtrIO> (writePtr, channelRuleSize);
        
        for (size_t i = 0; i < channelRules.size(); ++i) 
            channelRules[i].write(writePtr);
    }

    //
    // Determine the start of each row in the input buffer
    // Channels are interleaved by scanline
    //

    std::vector<bool> encodedChannels (_channelData.size());
    std::vector< std::vector<const char *> > rowPtrs (_channelData.size());

    for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
        encodedChannels[chan] = false;

    inDataPtr =  inPtr;

    for (int y = minY; y <= maxY; ++y)
    {
        for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
        {

            ChannelData *cd = &_channelData[chan];

            if (IMATH_NAMESPACE::modp(y, cd->ySampling) != 0)
                continue;

            rowPtrs[chan].push_back(inDataPtr);
            inDataPtr += cd->width * OPENEXR_IMF_NAMESPACE::pixelTypeSize(cd->type);
        }
    }

    inDataPtr = inPtr;

    // 
    // Make a pass over all our CSC sets and try to encode them first
    // 

    for (unsigned int csc = 0; csc < _cscSets.size(); ++csc)
    {

        LossyDctEncoderCsc encoder
            (_dwaCompressionLevel / 100000.f,
             rowPtrs[_cscSets[csc].idx[0]],
             rowPtrs[_cscSets[csc].idx[1]],
             rowPtrs[_cscSets[csc].idx[2]],
             packedAcEnd,
             packedDcEnd,
             get_dwaCompressorToNonlinear(),
             _channelData[_cscSets[csc].idx[0]].width,
             _channelData[_cscSets[csc].idx[0]].height,
             _channelData[_cscSets[csc].idx[0]].type,
             _channelData[_cscSets[csc].idx[1]].type,
             _channelData[_cscSets[csc].idx[2]].type);

        encoder.execute();

        *totalAcUncompressedCount  += encoder.numAcValuesEncoded();
        *totalDcUncompressedCount  += encoder.numDcValuesEncoded();

        packedAcEnd += encoder.numAcValuesEncoded() * sizeof(unsigned short);
        packedDcEnd += encoder.numDcValuesEncoded() * sizeof(unsigned short);

        encodedChannels[_cscSets[csc].idx[0]] = true;
        encodedChannels[_cscSets[csc].idx[1]] = true;
        encodedChannels[_cscSets[csc].idx[2]] = true;
    }

    for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
    {
        ChannelData *cd = &_channelData[chan];

        if (encodedChannels[chan])
            continue;

        switch (cd->compression)
        {
          case LOSSY_DCT:

            //
            // For LOSSY_DCT, treat this just like the CSC'd case,
            // but only operate on one channel
            //

            {
                const unsigned short *nonlinearLut = 0;

                if (!cd->pLinear)
                    nonlinearLut = get_dwaCompressorToNonlinear(); 

                LossyDctEncoder encoder
                    (_dwaCompressionLevel / 100000.f,
                     rowPtrs[chan],
                     packedAcEnd,
                     packedDcEnd,
                     nonlinearLut,
                     cd->width,
                     cd->height,
                     cd->type);

                encoder.execute();

                *totalAcUncompressedCount  += encoder.numAcValuesEncoded();
                *totalDcUncompressedCount  += encoder.numDcValuesEncoded();

                packedAcEnd +=
                    encoder.numAcValuesEncoded() * sizeof (unsigned short);

                packedDcEnd +=
                    encoder.numDcValuesEncoded() * sizeof (unsigned short);
            }

            break;

          case RLE:

            //
            // For RLE, bash the bytes up so that the first bytes of each
            // pixel are contingous, as are the second bytes, and so on.
            //

            for (unsigned int y = 0; y < rowPtrs[chan].size(); ++y)
            {
                const char *row = rowPtrs[chan][y];

                for (int x = 0; x < cd->width; ++x)
                {
                    for (int byte = 0;
                         byte < OPENEXR_IMF_NAMESPACE::pixelTypeSize (cd->type);
                         ++byte)
                    {
                            
                        *cd->planarUncRleEnd[byte]++ = *row++;
                    }
                }

                *rleRawSize += cd->width * OPENEXR_IMF_NAMESPACE::pixelTypeSize(cd->type);
            }

            break;

          case UNKNOWN:
           
            //
            // Otherwise, just copy data over verbatim
            //

            {
                int scanlineSize = cd->width * OPENEXR_IMF_NAMESPACE::pixelTypeSize(cd->type);

                for (unsigned int y = 0; y < rowPtrs[chan].size(); ++y)
                {
                    memcpy (cd->planarUncBufferEnd,
                            rowPtrs[chan][y],
                            scanlineSize);
    
                    cd->planarUncBufferEnd += scanlineSize;
                }

                *unknownUncompressedSize += cd->planarUncSize;
            }

            break;

          default:

            assert (false);
        }

        encodedChannels[chan] = true;
    }

    //
    // Pack the Unknown data into the output buffer first. Instead of
    // just copying it uncompressed, try zlib compression at least.
    //

    if (*unknownUncompressedSize > 0)
    {
        uLongf inSize  = (uLongf)(*unknownUncompressedSize);
        uLongf outSize = compressBound (inSize);

        if (Z_OK != ::compress2 ((Bytef *)outDataPtr,
                                 &outSize,
                                 (const Bytef *)_planarUncBuffer[UNKNOWN],
                                 inSize,
                                 9))
        {
            throw IEX_NAMESPACE::BaseExc ("Data compression (zlib) failed.");
        }

        outDataPtr += outSize;
        *unknownCompressedSize = outSize;
    }

    //
    // Now, pack all the Lossy DCT coefficients into our output
    // buffer, with Huffman encoding.
    //
    // Also, record the compressed size and the number of 
    // uncompressed componentns we have.
    //

    if (*totalAcUncompressedCount > 0)
    { 
        switch (_acCompression)
        {
          case STATIC_HUFFMAN:

            *acCompressedSize = (int)
                hufCompress((unsigned short *)_packedAcBuffer,
                            (int)*totalAcUncompressedCount,
                            outDataPtr);                
            break;

          case DEFLATE:

            {
                uLongf destLen = compressBound (
                    (*totalAcUncompressedCount) * sizeof (unsigned short));

                if (Z_OK != ::compress2
                                ((Bytef *)outDataPtr,
                                 &destLen,
                                 (Bytef *)_packedAcBuffer, 
                                 (uLong)(*totalAcUncompressedCount
                                                * sizeof (unsigned short)),
                                 9))
                {
                    throw IEX_NAMESPACE::InputExc ("Data compression (zlib) failed.");
                }

                *acCompressedSize = destLen;        
            }

            break;

          default:
            
            assert (false);
        }

        outDataPtr += *acCompressedSize;
    }

    // 
    // Handle the DC components separately
    //

    if (*totalDcUncompressedCount > 0)
    {
        *dcCompressedSize = _zip->compress
            (_packedDcBuffer,
             (int)(*totalDcUncompressedCount) * sizeof (unsigned short),
             outDataPtr);

        outDataPtr += *dcCompressedSize;
    }

    // 
    // If we have RLE data, first RLE encode it and set the uncompressed
    // size. Then, deflate the results and set the compressed size.
    //    

    if (*rleRawSize > 0)
    {
        *rleUncompressedSize = rleCompress
            ((int)(*rleRawSize),
             _planarUncBuffer[RLE],
             (signed char *)_rleBuffer);

        uLongf dstLen = compressBound ((uLongf)*rleUncompressedSize);

        if (Z_OK != ::compress2
                        ((Bytef *)outDataPtr, 
                         &dstLen, 
                         (Bytef *)_rleBuffer, 
                         (uLong)(*rleUncompressedSize),
                         9))
        {
            throw IEX_NAMESPACE::BaseExc ("Error compressing RLE'd data.");
        }
        
       *rleCompressedSize = dstLen;
        outDataPtr       += *rleCompressedSize;
    }

    // 
    // Flip the counters to XDR format
    //         

    for (int i = 0; i < NUM_SIZES_SINGLE; ++i)
    {
        Int64  src = *(((Int64 *)_outBuffer) + i);
        char  *dst = (char *)(((Int64 *)_outBuffer) + i);

        Xdr::write<CharPtrIO> (dst, src);
    }

    //
    // We're done - compute the number of bytes we packed
    //

    outPtr = _outBuffer;

    return static_cast<int>(outDataPtr - _outBuffer + 1);
}


int
DwaCompressor::uncompress
    (const char *inPtr,
     int inSize,
     int minY,
     const char *&outPtr)
{
    return uncompress (inPtr,
                       inSize,
                       IMATH_NAMESPACE::Box2i (IMATH_NAMESPACE::V2i (_min[0], minY),
                       IMATH_NAMESPACE::V2i (_max[0], minY + numScanLines() - 1)),
                       outPtr);
}


int 
DwaCompressor::uncompressTile
    (const char *inPtr,
     int inSize,
     IMATH_NAMESPACE::Box2i range,
     const char *&outPtr)
{
    return uncompress (inPtr, inSize, range, outPtr);
}


int 
DwaCompressor::uncompress
    (const char *inPtr,
     int inSize,
     IMATH_NAMESPACE::Box2i range,
     const char *&outPtr)
{
    int minX = range.min.x;
    int maxX = std::min (range.max.x, _max[0]);
    int minY = range.min.y;
    int maxY = std::min (range.max.y, _max[1]);

    int headerSize = NUM_SIZES_SINGLE*sizeof(Int64);
    if (inSize < headerSize) 
    {
        throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
                            "(truncated header).");
    }

    // 
    // Flip the counters from XDR to NATIVE
    //

    for (int i = 0; i < NUM_SIZES_SINGLE; ++i)
    {
        Int64      *dst =  (((Int64 *)inPtr) + i);
        const char *src = (char *)(((Int64 *)inPtr) + i);

        Xdr::read<CharPtrIO> (src, *dst);
    }

    //
    // Unwind all the counter info
    //

    const Int64 *inPtr64 = (const Int64*) inPtr;

    Int64 version                  = *(inPtr64 + VERSION);
    Int64 unknownUncompressedSize  = *(inPtr64 + UNKNOWN_UNCOMPRESSED_SIZE);
    Int64 unknownCompressedSize    = *(inPtr64 + UNKNOWN_COMPRESSED_SIZE);
    Int64 acCompressedSize         = *(inPtr64 + AC_COMPRESSED_SIZE);
    Int64 dcCompressedSize         = *(inPtr64 + DC_COMPRESSED_SIZE);
    Int64 rleCompressedSize        = *(inPtr64 + RLE_COMPRESSED_SIZE);
    Int64 rleUncompressedSize      = *(inPtr64 + RLE_UNCOMPRESSED_SIZE);
    Int64 rleRawSize               = *(inPtr64 + RLE_RAW_SIZE);
 
    Int64 totalAcUncompressedCount = *(inPtr64 + AC_UNCOMPRESSED_COUNT); 
    Int64 totalDcUncompressedCount = *(inPtr64 + DC_UNCOMPRESSED_COUNT); 

    Int64 acCompression            = *(inPtr64 + AC_COMPRESSION); 

    Int64 compressedSize           = unknownCompressedSize + 
                                     acCompressedSize +
                                     dcCompressedSize +
                                     rleCompressedSize;

    const char *dataPtr            = inPtr + NUM_SIZES_SINGLE * sizeof(Int64);

    /* Both the sum and individual sizes are checked in case of overflow. */
    if (inSize < (headerSize + compressedSize) ||
        inSize < unknownCompressedSize ||
        inSize < acCompressedSize ||
        inSize < dcCompressedSize ||
        inSize < rleCompressedSize)
    {
        throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
                            "(truncated file).");
    }

    if ((SInt64)unknownUncompressedSize < 0  ||
        (SInt64)unknownCompressedSize < 0    ||
        (SInt64)acCompressedSize < 0         ||
        (SInt64)dcCompressedSize < 0         ||
        (SInt64)rleCompressedSize < 0        ||
        (SInt64)rleUncompressedSize < 0      ||
        (SInt64)rleRawSize < 0               ||
        (SInt64)totalAcUncompressedCount < 0 ||
        (SInt64)totalDcUncompressedCount < 0)
    {
        throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
                            " (corrupt header).");
    }

    if (version < 2) 
        initializeLegacyChannelRules();
    else
    {
        unsigned short ruleSize = 0;
        Xdr::read<CharPtrIO>(dataPtr, ruleSize);

        if (ruleSize < 0) 
            throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
                                " (corrupt header file).");

        headerSize += ruleSize;
        if (inSize < headerSize + compressedSize)
            throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
                                " (truncated file).");

        _channelRules.clear();
        ruleSize -= Xdr::size<unsigned short> ();
        while (ruleSize > 0) 
        {
            Classifier rule(dataPtr, ruleSize);
            
            _channelRules.push_back(rule);
            ruleSize -= rule.size();
        }
    }


    size_t outBufferSize = 0;
    initializeBuffers(outBufferSize);

    //
    // Allocate _outBuffer, if we haven't done so already
    //

    if (_maxScanLineSize * numScanLines() > _outBufferSize) 
    {
        _outBufferSize = _maxScanLineSize * numScanLines();
        if (_outBuffer != 0)
            delete[] _outBuffer;
        _outBuffer = new char[_maxScanLineSize * numScanLines()];
    }


    char *outBufferEnd = _outBuffer;

       
    //
    // Find the start of the RLE packed AC components and
    // the DC components for each channel. This will be handy   
    // if you want to decode the channels in parallel later on.
    //

    char *packedAcBufferEnd = 0; 

    if (_packedAcBuffer)
        packedAcBufferEnd = _packedAcBuffer;

    char *packedDcBufferEnd = 0;

    if (_packedDcBuffer)
        packedDcBufferEnd = _packedDcBuffer;

    //
    // UNKNOWN data is packed first, followed by the 
    // Huffman-compressed AC, then the DC values, 
    // and then the zlib compressed RLE data.
    //
    
    const char *compressedUnknownBuf = dataPtr;

    const char *compressedAcBuf      = compressedUnknownBuf + 
                                  static_cast<ptrdiff_t>(unknownCompressedSize);
    const char *compressedDcBuf      = compressedAcBuf +
                                  static_cast<ptrdiff_t>(acCompressedSize);
    const char *compressedRleBuf     = compressedDcBuf + 
                                  static_cast<ptrdiff_t>(dcCompressedSize);

    // 
    // Sanity check that the version is something we expect. Right now, 
    // we can decode version 0, 1, and 2. v1 adds 'end of block' symbols
    // to the AC RLE. v2 adds channel classification rules at the 
    // start of the data block.
    //

    if (version > 2)
        throw IEX_NAMESPACE::InputExc ("Invalid version of compressed data block");    

    setupChannelData(minX, minY, maxX, maxY);

    // 
    // Uncompress the UNKNOWN data into _planarUncBuffer[UNKNOWN]
    //

    if (unknownCompressedSize > 0)
    {
        if (unknownUncompressedSize > _planarUncBufferSize[UNKNOWN]) 
        {
            throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
                                "(corrupt header).");
        }

        uLongf outSize = (uLongf)unknownUncompressedSize;

        if (Z_OK != ::uncompress
                        ((Bytef *)_planarUncBuffer[UNKNOWN],
                         &outSize,
                         (Bytef *)compressedUnknownBuf,
                         (uLong)unknownCompressedSize))
        {
            throw IEX_NAMESPACE::BaseExc("Error uncompressing UNKNOWN data.");
        }
    }

    // 
    // Uncompress the AC data into _packedAcBuffer
    //

    if (acCompressedSize > 0)
    {
        if (totalAcUncompressedCount*sizeof(unsigned short) > _packedAcBufferSize)
        {
            throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
                                "(corrupt header).");
        }

        //
        // Don't trust the user to get it right, look in the file.
        //

        switch (acCompression)
        {
          case STATIC_HUFFMAN:

            hufUncompress
                (compressedAcBuf, 
                 (int)acCompressedSize, 
                 (unsigned short *)_packedAcBuffer, 
                 (int)totalAcUncompressedCount); 

            break;

          case DEFLATE:
            {
                uLongf destLen =
                    (int)(totalAcUncompressedCount) * sizeof (unsigned short);

                if (Z_OK != ::uncompress
                                ((Bytef *)_packedAcBuffer,
                                 &destLen,
                                 (Bytef *)compressedAcBuf,
                                 (uLong)acCompressedSize))
                {
                    throw IEX_NAMESPACE::InputExc ("Data decompression (zlib) failed.");
                }

                if (totalAcUncompressedCount * sizeof (unsigned short) !=
                                destLen)
                {
                    throw IEX_NAMESPACE::InputExc ("AC data corrupt.");     
                }
            }
            break;

          default:

            throw IEX_NAMESPACE::NoImplExc ("Unknown AC Compression");
            break;
        }
    }

    //
    // Uncompress the DC data into _packedDcBuffer
    //

    if (dcCompressedSize > 0)
    {
        if (totalDcUncompressedCount*sizeof(unsigned short) > _packedDcBufferSize)
        {
            throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
                                "(corrupt header).");
        }

        if (_zip->uncompress
                    (compressedDcBuf, (int)dcCompressedSize, _packedDcBuffer)
            != (int)totalDcUncompressedCount * sizeof (unsigned short))
        {
            throw IEX_NAMESPACE::BaseExc("DC data corrupt.");
        }
    }

    //
    // Uncompress the RLE data into _rleBuffer, then unRLE the results
    // into _planarUncBuffer[RLE]
    //

    if (rleRawSize > 0)
    {
        if (rleUncompressedSize > _rleBufferSize ||
            rleRawSize > _planarUncBufferSize[RLE])
        {
            throw IEX_NAMESPACE::InputExc("Error uncompressing DWA data"
                                "(corrupt header).");
        }
 
        uLongf dstLen = (uLongf)rleUncompressedSize;

        if (Z_OK != ::uncompress
                        ((Bytef *)_rleBuffer,
                         &dstLen,
                         (Bytef *)compressedRleBuf,
                         (uLong)rleCompressedSize))
        {
            throw IEX_NAMESPACE::BaseExc("Error uncompressing RLE data.");
        }

        if (dstLen != rleUncompressedSize)
            throw IEX_NAMESPACE::BaseExc("RLE data corrupted");

        if (rleUncompress
                ((int)rleUncompressedSize, 
                 (int)rleRawSize,
                 (signed char *)_rleBuffer,
                 _planarUncBuffer[RLE]) != rleRawSize)
        {        
            throw IEX_NAMESPACE::BaseExc("RLE data corrupted");
        }
    }

    //
    // Determine the start of each row in the output buffer
    //

    std::vector<bool> decodedChannels (_channelData.size());
    std::vector< std::vector<char *> > rowPtrs (_channelData.size());

    for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
        decodedChannels[chan] = false;

    outBufferEnd = _outBuffer;

    for (int y = minY; y <= maxY; ++y)
    {
        for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
        {
            ChannelData *cd = &_channelData[chan];

            if (IMATH_NAMESPACE::modp (y, cd->ySampling) != 0)
                continue;

            rowPtrs[chan].push_back (outBufferEnd);
            outBufferEnd += cd->width * OPENEXR_IMF_NAMESPACE::pixelTypeSize (cd->type);
        }
    }

    //
    // Setup to decode each block of 3 channels that need to
    // be handled together
    //

    for (unsigned int csc = 0; csc < _cscSets.size(); ++csc)
    {
        int rChan = _cscSets[csc].idx[0];    
        int gChan = _cscSets[csc].idx[1];    
        int bChan = _cscSets[csc].idx[2];    


        LossyDctDecoderCsc decoder
            (rowPtrs[rChan],
             rowPtrs[gChan],
             rowPtrs[bChan],
             packedAcBufferEnd,
             packedDcBufferEnd,
             get_dwaCompressorToLinear(),
             _channelData[rChan].width,
             _channelData[rChan].height,
             _channelData[rChan].type,
             _channelData[gChan].type,
             _channelData[bChan].type);

        decoder.execute();

        packedAcBufferEnd +=
            decoder.numAcValuesEncoded() * sizeof (unsigned short);

        packedDcBufferEnd +=
            decoder.numDcValuesEncoded() * sizeof (unsigned short);

        decodedChannels[rChan] = true;
        decodedChannels[gChan] = true;
        decodedChannels[bChan] = true;
    }

    //
    // Setup to handle the remaining channels by themselves
    //

    for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
    {
        if (decodedChannels[chan])
            continue;

        ChannelData *cd = &_channelData[chan];
        int pixelSize = OPENEXR_IMF_NAMESPACE::pixelTypeSize (cd->type);

        switch (cd->compression)
        {
          case LOSSY_DCT:

            //
            // Setup a single-channel lossy DCT decoder pointing
            // at the output buffer
            //

            {
                const unsigned short *linearLut = 0;

                if (!cd->pLinear)
                    linearLut = get_dwaCompressorToLinear();

                LossyDctDecoder decoder
                    (rowPtrs[chan],
                     packedAcBufferEnd,
                     packedDcBufferEnd,
                     linearLut,
                     cd->width,
                     cd->height,
                     cd->type);

                decoder.execute();   

                packedAcBufferEnd += 
                    decoder.numAcValuesEncoded() * sizeof (unsigned short);

                packedDcBufferEnd += 
                    decoder.numDcValuesEncoded() * sizeof (unsigned short);
            }

            break;

          case RLE:

            //
            // For the RLE case, the data has been un-RLE'd into
            // planarUncRleEnd[], but is still split out by bytes.
            // We need to rearrange the bytes back into the correct
            // order in the output buffer;
            //

            {
                int row = 0;

                for (int y = minY; y <= maxY; ++y)
                {
                    if (IMATH_NAMESPACE::modp (y, cd->ySampling) != 0)
                        continue;

                    char *dst = rowPtrs[chan][row];

                    if (pixelSize == 2)
                    {
                        interleaveByte2 (dst, 
                                         cd->planarUncRleEnd[0],
                                         cd->planarUncRleEnd[1],
                                         cd->width);
                                            
                        cd->planarUncRleEnd[0] += cd->width;
                        cd->planarUncRleEnd[1] += cd->width;
                    }
                    else
                    {
                        for (int x = 0; x < cd->width; ++x)
                        {
                            for (int byte = 0; byte < pixelSize; ++byte)
                            {
                               *dst++ = *cd->planarUncRleEnd[byte]++;
                            }
                        }
                    }

                    row++;
                }
            }

            break;

          case UNKNOWN:

            //
            // In the UNKNOWN case, data is already in planarUncBufferEnd
            // and just needs to copied over to the output buffer
            //

            {
                int row             = 0;
                int dstScanlineSize = cd->width * OPENEXR_IMF_NAMESPACE::pixelTypeSize (cd->type);

                for (int y = minY; y <= maxY; ++y)
                {
                    if (IMATH_NAMESPACE::modp (y, cd->ySampling) != 0)
                        continue;

                    memcpy (rowPtrs[chan][row],
                            cd->planarUncBufferEnd,
                            dstScanlineSize);

                    cd->planarUncBufferEnd += dstScanlineSize;
                    row++;
                }
            }

            break;

          default:

            throw IEX_NAMESPACE::NoImplExc ("Unhandled compression scheme case");
            break;
        }

        decodedChannels[chan] = true;
    }

    //
    // Return a ptr to _outBuffer
    //

    outPtr = _outBuffer;
    return (int)(outBufferEnd - _outBuffer);
}


// static
void
DwaCompressor::initializeFuncs()
{
    convertFloatToHalf64 = convertFloatToHalf64_scalar;
    fromHalfZigZag       = fromHalfZigZag_scalar;

    CpuId cpuId;

    //
    // Setup HALF <-> FLOAT conversion implementations
    //

    if (cpuId.avx && cpuId.f16c)
    {
        convertFloatToHalf64 = convertFloatToHalf64_f16c;
        fromHalfZigZag       = fromHalfZigZag_f16c;
    } 

    //
    // Setup inverse DCT implementations
    //

    dctInverse8x8_0 = dctInverse8x8_scalar<0>;
    dctInverse8x8_1 = dctInverse8x8_scalar<1>;
    dctInverse8x8_2 = dctInverse8x8_scalar<2>;
    dctInverse8x8_3 = dctInverse8x8_scalar<3>;
    dctInverse8x8_4 = dctInverse8x8_scalar<4>;
    dctInverse8x8_5 = dctInverse8x8_scalar<5>;
    dctInverse8x8_6 = dctInverse8x8_scalar<6>;
    dctInverse8x8_7 = dctInverse8x8_scalar<7>;

    if (cpuId.avx) 
    {
        dctInverse8x8_0 = dctInverse8x8_avx<0>;
        dctInverse8x8_1 = dctInverse8x8_avx<1>;
        dctInverse8x8_2 = dctInverse8x8_avx<2>;
        dctInverse8x8_3 = dctInverse8x8_avx<3>;
        dctInverse8x8_4 = dctInverse8x8_avx<4>;
        dctInverse8x8_5 = dctInverse8x8_avx<5>;
        dctInverse8x8_6 = dctInverse8x8_avx<6>;
        dctInverse8x8_7 = dctInverse8x8_avx<7>;
    } 
    else if (cpuId.sse2) 
    {
        dctInverse8x8_0 = dctInverse8x8_sse2<0>;
        dctInverse8x8_1 = dctInverse8x8_sse2<1>;
        dctInverse8x8_2 = dctInverse8x8_sse2<2>;
        dctInverse8x8_3 = dctInverse8x8_sse2<3>;
        dctInverse8x8_4 = dctInverse8x8_sse2<4>;
        dctInverse8x8_5 = dctInverse8x8_sse2<5>;
        dctInverse8x8_6 = dctInverse8x8_sse2<6>;
        dctInverse8x8_7 = dctInverse8x8_sse2<7>;
    }
}


//
// Handle channel classification and buffer allocation once we know
// how to classify channels
//

void
DwaCompressor::initializeBuffers (size_t &outBufferSize)
{
    classifyChannels (_channels, _channelData, _cscSets);

    //
    // _outBuffer needs to be big enough to hold all our 
    // compressed data - which could vary depending on what sort
    // of channels we have. 
    //

    int maxOutBufferSize  = 0;
    int numLossyDctChans  = 0;
    int unknownBufferSize = 0;
    int rleBufferSize     = 0;

    int maxLossyDctAcSize = (int)ceil ((float)numScanLines() / 8.0f) * 
                            (int)ceil ((float)(_max[0] - _min[0] + 1) / 8.0f) *
                            63 * sizeof (unsigned short);

    int maxLossyDctDcSize = (int)ceil ((float)numScanLines() / 8.0f) * 
                            (int)ceil ((float)(_max[0] - _min[0] + 1) / 8.0f) *
                            sizeof (unsigned short);

    for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
    {
        switch (_channelData[chan].compression)
        {
          case LOSSY_DCT:

            //
            // This is the size of the number of packed
            // components, plus the requirements for
            // maximum Huffman encoding size (for STATIC_HUFFMAN)
            // or for zlib compression (for DEFLATE)
            //

            maxOutBufferSize += std::max(
                            (int)(2 * maxLossyDctAcSize + 65536),
                            (int)compressBound (maxLossyDctAcSize) );
            numLossyDctChans++;
            break;

          case RLE:
            {
                //
                // RLE, if gone horribly wrong, could double the size
                // of the source data.
                //

                int rleAmount = 2 * numScanLines() * (_max[0] - _min[0] + 1) *
                                OPENEXR_IMF_NAMESPACE::pixelTypeSize (_channelData[chan].type);

                rleBufferSize += rleAmount;
            }
            break;


          case UNKNOWN:

            unknownBufferSize += numScanLines() * (_max[0] - _min[0] + 1) *
                                 OPENEXR_IMF_NAMESPACE::pixelTypeSize (_channelData[chan].type);
            break;

          default:

            throw IEX_NAMESPACE::NoImplExc ("Unhandled compression scheme case");
            break;
        }
    }

    //
    // Also, since the results of the RLE are packed into 
    // the output buffer, we need the extra room there. But
    // we're going to zlib compress() the data we pack, 
    // which could take slightly more space
    //

    maxOutBufferSize += (int)compressBound ((uLongf)rleBufferSize);
    
    //
    // And the same goes for the UNKNOWN data
    //

    maxOutBufferSize += (int)compressBound ((uLongf)unknownBufferSize);

    //
    // Allocate a zip/deflate compressor big enought to hold the DC data
    // and include it's compressed results in the size requirements
    // for our output buffer
    //

    if (_zip == 0) 
        _zip = new Zip (maxLossyDctDcSize * numLossyDctChans);
    else if (_zip->maxRawSize() < maxLossyDctDcSize * numLossyDctChans)
    {
        delete _zip;
        _zip = new Zip (maxLossyDctDcSize * numLossyDctChans);
    }


    maxOutBufferSize += _zip->maxCompressedSize();

    //
    // We also need to reserve space at the head of the buffer to 
    // write out the size of our various packed and compressed data.
    //

    maxOutBufferSize += NUM_SIZES_SINGLE * sizeof (Int64); 
                    

    //
    // Later, we're going to hijack outBuffer for the result of
    // both encoding and decoding. So it needs to be big enough
    // to hold either a buffers' worth of uncompressed or
    // compressed data
    //
    // For encoding, we'll need _outBuffer to hold maxOutBufferSize bytes,
    // but for decoding, we only need it to be maxScanLineSize*numScanLines.
    // Cache the max size for now, and alloc the buffer when we either
    // encode or decode.
    //

    outBufferSize = maxOutBufferSize;


    //
    // _packedAcBuffer holds the quantized DCT coefficients prior
    // to Huffman encoding
    //

    if (maxLossyDctAcSize * numLossyDctChans > _packedAcBufferSize)
    {
        _packedAcBufferSize = maxLossyDctAcSize * numLossyDctChans;
        if (_packedAcBuffer != 0) 
            delete[] _packedAcBuffer;
        _packedAcBuffer = new char[_packedAcBufferSize];
    }

    //
    // _packedDcBuffer holds one quantized DCT coef per 8x8 block
    //

    if (maxLossyDctDcSize * numLossyDctChans > _packedDcBufferSize)
    {
        _packedDcBufferSize = maxLossyDctDcSize * numLossyDctChans;
        if (_packedDcBuffer != 0) 
            delete[] _packedDcBuffer;
        _packedDcBuffer     = new char[_packedDcBufferSize];
    }

    if (rleBufferSize > _rleBufferSize) 
    {
        _rleBufferSize = rleBufferSize;
        if (_rleBuffer != 0) 
            delete[] _rleBuffer;
        _rleBuffer = new char[rleBufferSize];
    }

    // 
    // The planar uncompressed buffer will hold float data for LOSSY_DCT
    // compressed values, and whatever the native type is for other
    // channels. We're going to use this to hold data in a planar
    // format, as opposed to the native interleaved format we take
    // into compress() and give back from uncompress().
    //
    // This also makes it easier to compress the UNKNOWN and RLE data
    // all in one swoop (for each compression scheme).
    //

    int planarUncBufferSize[NUM_COMPRESSOR_SCHEMES];
    for (int i=0; i<NUM_COMPRESSOR_SCHEMES; ++i)
        planarUncBufferSize[i] = 0;

    for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
    {
        switch (_channelData[chan].compression)
        {
          case LOSSY_DCT:
            break;

          case RLE:
            planarUncBufferSize[RLE] +=
                     numScanLines() * (_max[0] - _min[0] + 1) *
                     OPENEXR_IMF_NAMESPACE::pixelTypeSize (_channelData[chan].type);
            break;

          case UNKNOWN: 
            planarUncBufferSize[UNKNOWN] +=
                     numScanLines() * (_max[0] - _min[0] + 1) *
                     OPENEXR_IMF_NAMESPACE::pixelTypeSize (_channelData[chan].type);
            break;

          default:
            throw IEX_NAMESPACE::NoImplExc ("Unhandled compression scheme case");
            break;
        }
    }

    //
    // UNKNOWN data is going to be zlib compressed, which needs 
    // a little extra headroom
    //

    if (planarUncBufferSize[UNKNOWN] > 0)
    {
        planarUncBufferSize[UNKNOWN] = 
            compressBound ((uLongf)planarUncBufferSize[UNKNOWN]);
    }

    for (int i = 0; i < NUM_COMPRESSOR_SCHEMES; ++i)
    {
        if (planarUncBufferSize[i] > _planarUncBufferSize[i]) 
        {
            _planarUncBufferSize[i] = planarUncBufferSize[i];
            if (_planarUncBuffer[i] != 0) 
                delete[] _planarUncBuffer[i];
            _planarUncBuffer[i] = new char[planarUncBufferSize[i]];
        }
    }
}


//
// Setup channel classification rules to use when writing files
//

void
DwaCompressor::initializeDefaultChannelRules ()
{
    _channelRules.clear();

    _channelRules.push_back (Classifier ("R",     LOSSY_DCT, HALF,   0, false));
    _channelRules.push_back (Classifier ("R",     LOSSY_DCT, FLOAT,  0, false));
    _channelRules.push_back (Classifier ("G",     LOSSY_DCT, HALF,   1, false));
    _channelRules.push_back (Classifier ("G",     LOSSY_DCT, FLOAT,  1, false));
    _channelRules.push_back (Classifier ("B",     LOSSY_DCT, HALF,   2, false));
    _channelRules.push_back (Classifier ("B",     LOSSY_DCT, FLOAT,  2, false));

    _channelRules.push_back (Classifier ("Y",     LOSSY_DCT, HALF,  -1, false));
    _channelRules.push_back (Classifier ("Y",     LOSSY_DCT, FLOAT, -1, false));
    _channelRules.push_back (Classifier ("BY",    LOSSY_DCT, HALF,  -1, false));
    _channelRules.push_back (Classifier ("BY",    LOSSY_DCT, FLOAT, -1, false));
    _channelRules.push_back (Classifier ("RY",    LOSSY_DCT, HALF,  -1, false));
    _channelRules.push_back (Classifier ("RY",    LOSSY_DCT, FLOAT, -1, false));

    _channelRules.push_back (Classifier ("A",     RLE,       UINT,  -1, false));
    _channelRules.push_back (Classifier ("A",     RLE,       HALF,  -1, false));
    _channelRules.push_back (Classifier ("A",     RLE,       FLOAT, -1, false));
}


//
// Setup channel classification rules when reading files with VERSION < 2
//

void
DwaCompressor::initializeLegacyChannelRules ()
{
    _channelRules.clear();

    _channelRules.push_back (Classifier ("r",     LOSSY_DCT, HALF,   0, true));
    _channelRules.push_back (Classifier ("r",     LOSSY_DCT, FLOAT,  0, true));
    _channelRules.push_back (Classifier ("red",   LOSSY_DCT, HALF,   0, true));
    _channelRules.push_back (Classifier ("red",   LOSSY_DCT, FLOAT,  0, true));
    _channelRules.push_back (Classifier ("g",     LOSSY_DCT, HALF,   1, true));
    _channelRules.push_back (Classifier ("g",     LOSSY_DCT, FLOAT,  1, true));
    _channelRules.push_back (Classifier ("grn",   LOSSY_DCT, HALF,   1, true));
    _channelRules.push_back (Classifier ("grn",   LOSSY_DCT, FLOAT,  1, true));
    _channelRules.push_back (Classifier ("green", LOSSY_DCT, HALF,   1, true));
    _channelRules.push_back (Classifier ("green", LOSSY_DCT, FLOAT,  1, true));
    _channelRules.push_back (Classifier ("b",     LOSSY_DCT, HALF,   2, true));
    _channelRules.push_back (Classifier ("b",     LOSSY_DCT, FLOAT,  2, true));
    _channelRules.push_back (Classifier ("blu",   LOSSY_DCT, HALF,   2, true));
    _channelRules.push_back (Classifier ("blu",   LOSSY_DCT, FLOAT,  2, true));
    _channelRules.push_back (Classifier ("blue",  LOSSY_DCT, HALF,   2, true));
    _channelRules.push_back (Classifier ("blue",  LOSSY_DCT, FLOAT,  2, true));

    _channelRules.push_back (Classifier ("y",     LOSSY_DCT, HALF,  -1, true));
    _channelRules.push_back (Classifier ("y",     LOSSY_DCT, FLOAT, -1, true));
    _channelRules.push_back (Classifier ("by",    LOSSY_DCT, HALF,  -1, true));
    _channelRules.push_back (Classifier ("by",    LOSSY_DCT, FLOAT, -1, true));
    _channelRules.push_back (Classifier ("ry",    LOSSY_DCT, HALF,  -1, true));
    _channelRules.push_back (Classifier ("ry",    LOSSY_DCT, FLOAT, -1, true));
    _channelRules.push_back (Classifier ("a",     RLE,       UINT,  -1, true));
    _channelRules.push_back (Classifier ("a",     RLE,       HALF,  -1, true));
    _channelRules.push_back (Classifier ("a",     RLE,       FLOAT, -1, true));
}


// 
// Given a set of rules and ChannelData, figure out which rules apply
//

void
DwaCompressor::relevantChannelRules (std::vector<Classifier> &rules) const 
{
    rules.clear();

    std::vector<std::string> suffixes;
    
    for (size_t cd = 0; cd < _channelData.size(); ++cd) 
    {
        std::string suffix  = _channelData[cd].name;
        size_t      lastDot = suffix.find_last_of ('.');

        if (lastDot != std::string::npos)
            suffix = suffix.substr (lastDot+1, std::string::npos);

        suffixes.push_back(suffix);
    }

    
    for (size_t i = 0; i < _channelRules.size(); ++i) 
    {
        for (size_t cd = 0; cd < _channelData.size(); ++cd) 
        {
            if (_channelRules[i].match (suffixes[cd], _channelData[cd].type ))
            {
                rules.push_back (_channelRules[i]);
                break;
            }
        }       
    }
}


//
// Take our initial list of channels, and cache the contents.
//
// Determine approprate compression schemes for each channel,
// and figure out which sets should potentially be CSC'ed 
// prior to lossy compression.
//

void
DwaCompressor::classifyChannels
    (ChannelList channels,
     std::vector<ChannelData> &chanData,
     std::vector<CscChannelSet> &cscData)
{
    //
    // prefixMap used to map channel name prefixes to 
    // potential CSC-able sets of channels.
    //

    std::map<std::string, DwaCompressor::CscChannelSet> prefixMap;
    std::vector<DwaCompressor::CscChannelSet>           tmpCscSet;

    unsigned int numChan = 0;

    for (ChannelList::Iterator c = channels.begin(); c != channels.end(); ++c)
        numChan++;
    
    if (numChan)
        chanData.resize (numChan);

    //
    // Cache the relevant data from the channel structs.
    //

    unsigned int offset = 0;

    for (ChannelList::Iterator c = channels.begin(); c != channels.end(); ++c)
    {
        chanData[offset].name        = std::string (c.name());
        chanData[offset].compression = UNKNOWN;
        chanData[offset].xSampling   = c.channel().xSampling;
        chanData[offset].ySampling   = c.channel().ySampling;
        chanData[offset].type        = c.channel().type;
        chanData[offset].pLinear     = c.channel().pLinear;

        offset++;
    }

    //
    // Try and figure out which channels should be
    // compressed by which means.
    //

    for (offset = 0; offset<numChan; ++offset)
    {
        std::string prefix  = "";
        std::string suffix  = chanData[offset].name;
        size_t      lastDot = suffix.find_last_of ('.');

        if (lastDot != std::string::npos)
        {
            prefix = suffix.substr (0,         lastDot);
            suffix = suffix.substr (lastDot+1, std::string::npos);
        } 

        //
        // Make sure we have an entry in our CSC set map 
        //

        std::map<std::string, DwaCompressor::CscChannelSet>::iterator 
            theSet = prefixMap.find (prefix);

        if (theSet == prefixMap.end())
        {
            DwaCompressor::CscChannelSet tmpSet;

            tmpSet.idx[0] = 
            tmpSet.idx[1] = 
            tmpSet.idx[2] = -1;

            prefixMap[prefix] = tmpSet;
        }

        // 
        // Check the suffix against the list of classifications
        // we defined previously. If the _cscIdx is not negative,
        // it indicates that we should be part of a CSC group.
        //

        for (std::vector<Classifier>::iterator i = _channelRules.begin();
             i != _channelRules.end();
             ++i)
        {
            if ( i->match(suffix, chanData[offset].type) )
            {
                chanData[offset].compression = i->_scheme;

                if ( i->_cscIdx >= 0)
                    prefixMap[prefix].idx[i->_cscIdx] = offset;
            }
        }
    }

    //
    // Finally, try and find RGB sets of channels which 
    // can be CSC'ed to a Y'CbCr space prior to loss, for
    // better compression.
    //
    // Walk over our set of candidates, and see who has
    // all three channels defined (and has common sampling
    // patterns, etc).
    //

    for (std::map<std::string, DwaCompressor::CscChannelSet>::iterator 
         theItem = prefixMap.begin(); theItem != prefixMap.end();
         ++theItem)
    {
        int red = (*theItem).second.idx[0];
        int grn = (*theItem).second.idx[1];
        int blu = (*theItem).second.idx[2];

        if ((red < 0) || (grn < 0) || (blu < 0))
            continue;

        if ((chanData[red].xSampling != chanData[grn].xSampling) ||
            (chanData[red].xSampling != chanData[blu].xSampling) ||
            (chanData[grn].xSampling != chanData[blu].xSampling) ||
            (chanData[red].ySampling != chanData[grn].ySampling) ||
            (chanData[red].ySampling != chanData[blu].ySampling) ||
            (chanData[grn].ySampling != chanData[blu].ySampling))
        {
            continue;
        }
        
        tmpCscSet.push_back ((*theItem).second);
    }
    
    size_t numCsc = tmpCscSet.size();

    if (numCsc)
        cscData.resize(numCsc);

    for (offset = 0; offset < numCsc; ++offset)
        cscData[offset] = tmpCscSet[offset];
}


//
// Setup some buffer pointers, determine channel sizes, things
// like that.
//

void
DwaCompressor::setupChannelData (int minX, int minY, int maxX, int maxY)
{
    char *planarUncBuffer[NUM_COMPRESSOR_SCHEMES];

    for (int i=0; i<NUM_COMPRESSOR_SCHEMES; ++i)
    {
        planarUncBuffer[i] = 0;

        if (_planarUncBuffer[i])
            planarUncBuffer[i] =  _planarUncBuffer[i];
    }

    for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
    {
        ChannelData *cd = &_channelData[chan];

        cd->width  = OPENEXR_IMF_NAMESPACE::numSamples (cd->xSampling, minX, maxX);
        cd->height = OPENEXR_IMF_NAMESPACE::numSamples (cd->ySampling, minY, maxY);
                                
        cd->planarUncSize =
            cd->width * cd->height * OPENEXR_IMF_NAMESPACE::pixelTypeSize (cd->type);
                                  
        cd->planarUncBuffer    = planarUncBuffer[cd->compression];
        cd->planarUncBufferEnd = cd->planarUncBuffer;

        cd->planarUncRle[0]    = cd->planarUncBuffer;
        cd->planarUncRleEnd[0] = cd->planarUncRle[0];

        for (int byte = 1; byte < OPENEXR_IMF_NAMESPACE::pixelTypeSize(cd->type); ++byte)
        {
            cd->planarUncRle[byte] = 
                         cd->planarUncRle[byte-1] + cd->width * cd->height;

            cd->planarUncRleEnd[byte] =
                         cd->planarUncRle[byte];
        }

        cd->planarUncType = cd->type;

        if (cd->compression == LOSSY_DCT)
        {
            cd->planarUncType = FLOAT;
        }
        else
        {
            planarUncBuffer[cd->compression] +=
                cd->width * cd->height * OPENEXR_IMF_NAMESPACE::pixelTypeSize (cd->planarUncType);
        }
    }
}

OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_EXIT